Update CHANGELOG for v0.9.7-p1

Change-Id: I5490a9cad2d6752832b6bf4ec1835c06a45eeb9b
Don't set the bmi mode when doing error concealment
2011-08-15 17:02:45 -04:00 · 2011-08-15 11:46:04 -04:00 · 2011-08-12 14:59:22 -04:00 · 2011-08-12 14:51:36 -04:00 · 2011-08-12 14:49:35 -04:00 · 2011-08-12 14:49:34 -04:00
351 changed files with 23843 additions and 32557 deletions
--- a/.mailmap
+++ b/.mailmap
@@ -1,2 +1,5 @@
 Adrian Grange <agrange@google.com>
 Johann Koenig <johannkoenig@google.com>
+Tero Rintaluoma <teror@google.com> <tero.rintaluoma@on2.com>
+Tom Finegan <tomfinegan@google.com>
+Ralph Giles <giles@xiph.org> <giles@entropywave.com>
--- a/21
+++ b/21
@@ -4,29 +4,50 @@
 Aaron Watry <awatry@gmail.com>
 Adrian Grange <agrange@google.com>
 Alex Converse <alex.converse@gmail.com>
+Alexis Ballier <aballier@gentoo.org>
+Alok Ahuja <waveletcoeff@gmail.com>
+Andoni Morales Alastruey <ylatuya@gmail.com>
 Andres Mejia <mcitadel@gmail.com>
+Aron Rosenberg <arosenberg@logitech.com>
+Attila Nagy <attilanagy@google.com>
 Fabio Pedretti <fabio.ped@libero.it>
 Frank Galligan <fgalligan@google.com>
 Fredrik Söderquist <fs@opera.com>
 Fritz Koenig <frkoenig@google.com>
+Gaute Strokkenes <gaute.strokkenes@broadcom.com>
 Giuseppe Scrivano <gscrivano@gnu.org>
 Guillermo Ballester Valor <gbvalor@gmail.com>
+Henrik Lundin <hlundin@google.com>
+James Berry <jamesberry@google.com>
 James Zern <jzern@google.com>
 Jan Kratochvil <jan.kratochvil@redhat.com>
 Jeff Muizelaar <jmuizelaar@mozilla.com>
 Jim Bankoski <jimbankoski@google.com>
 Johann Koenig <johannkoenig@google.com>
 John Koleszar <jkoleszar@google.com>
+Joshua Bleecher Snyder <josh@treelinelabs.com>
 Justin Clift <justin@salasaga.org>
 Justin Lebar <justin.lebar@gmail.com>
+Lou Quillio <louquillio@google.com>
 Luca Barbato <lu_zero@gentoo.org>
 Makoto Kato <makoto.kt@gmail.com>
 Martin Ettl <ettl.martin78@googlemail.com>
 Michael Kohler <michaelkohler@live.com>
+Mike Hommey <mhommey@mozilla.com>
+Mikhal Shemer <mikhal@google.com>
+Pascal Massimino <pascal.massimino@gmail.com>
+Patrik Westin <patrik.westin@gmail.com>
 Paul Wilkins <paulwilkins@google.com>
 Pavol Rusnak <stick@gk2.sk>
 Philip Jägenstedt <philipj@opera.com>
+Rafael Ávila de Espíndola <rafael.espindola@gmail.com>
+Ralph Giles <giles@xiph.org>
+Ronald S. Bultje <rbultje@google.com>
 Scott LaVarnway <slavarnway@google.com>
+Stefan Holmer <holmer@google.com>
+Taekhyun Kim <takim@nvidia.com>
+Tero Rintaluoma <teror@google.com>
+Thijs Vermeir <thijsvermeir@gmail.com>
 Timothy B. Terriberry <tterribe@xiph.org>
 Tom Finegan <tomfinegan@google.com>
 Yaowu Xu <yaowu@google.com>
--- a/189
+++ b/189
@@ -1,3 +1,192 @@
+2011-08-15 v0.9.7-p1 "Cayuga" patch 1
+  This is an incremental bugfix release against Cayuga. All users of that
+  release are strongly encouraged to upgrade.
+
+    - Fix potential OOB reads (cdae03a)
+
+          An unbounded out of bounds read was discovered when the
+          decoder was requested to perform error concealment (new in
+          Cayuga) given a frame with corrupt partition sizes.
+
+          A bounded out of bounds read was discovered affecting all
+          versions of libvpx. Given an multipartition input frame that
+          is truncated between the mode/mv partition and the first
+          residiual paritition (in the block of partition offsets), up
+          to 3 extra bytes could have been read from the source buffer.
+          The code will not take any action regardless of the contents
+          of these undefined bytes, as the truncated buffer is detected
+          immediately following the read based on the calculated
+          starting position of the coefficient partition.
+
+    - Fix potential error concealment crash when the very first frame
+      is missing or corrupt (a609be5)
+
+    - Fix significant artifacts in error concealment (a4c2211, 99d870a)
+
+    - Revert 1-pass CBR rate control changes (e961317)
+      Further testing showed this change produced undesirable visual
+      artifacts, rolling back for now.
+
+
+2011-08-02 v0.9.7 "Cayuga"
+  Our third named release, focused on a faster, higher quality, encoder.
+
+  - Upgrading:
+    This release is backwards compatible with Aylesbury (v0.9.5) and
+    Bali (v0.9.6). Users of older releases should refer to the Upgrading
+    notes in this document for that release.
+
+  - Enhancements:
+          Stereo 3D format support for vpxenc
+          Runtime detection of available processor cores.
+          Allow specifying --end-usage by enum name
+          vpxdec: test for frame corruption
+          vpxenc: add quantizer histogram display
+          vpxenc: add rate histogram display
+          Set VPX_FRAME_IS_DROPPABLE
+          update configure for ios sdk 4.3
+          Avoid text relocations in ARM vp8 decoder
+          Generate a vpx.pc file for pkg-config.
+          New ways of passing encoded data between encoder and decoder.
+
+  - Speed:
+      This release includes across-the-board speed improvements to the
+      encoder. On x86, these measure at approximately 11.5% in Best mode,
+      21.5% in Good mode (speed 0), and 22.5% in Realtime mode (speed 6).
+      On ARM Cortex A9 with Neon extensions, real-time encoding of video
+      telephony content is 35% faster than Bali on single core and 48%
+      faster on multi-core. On the NVidia Tegra2 platform, real time
+      encoding is 40% faster than Bali.
+
+      Decoder speed was not a priority for this release, but improved
+      approximately 8.4% on x86.
+
+          Reduce motion vector search on alt-ref frame.
+          Encoder loopfilter running in its own thread
+          Reworked loopfilter to precalculate more parameters
+          SSE2/SSSE3 optimizations for build_predictors_mbuv{,_s}().
+          Make hor UV predict ~2x faster (73 vs 132 cycles) using SSSE3.
+          Removed redundant checks
+          Reduced structure sizes
+          utilize preload in ARMv6 MC/LPF/Copy routines
+          ARM optimized quantization, dfct, variance, subtract
+          Increase chrow row alignment to 16 bytes.
+          disable trellis optimization for first pass
+          Write SSSE3 sub-pixel filter function
+          Improve SSE2 half-pixel filter funtions
+          Add vp8_sub_pixel_variance16x8_ssse3 function
+          Reduce unnecessary distortion computation
+          Use diamond search to replace full search
+          Preload reference area in sub-pixel motion search (real-time mode)
+
+  - Quality:
+      This release focused primarily on one-pass use cases, including
+      video conferencing. Low latency data rate control was significantly
+      improved, improving streamability over bandwidth constrained links.
+      Added support for error concealment, allowing frames to maintain
+      visual quality in the presence of substantial packet loss.
+
+          Add rc_max_intra_bitrate_pct control
+          Limit size of initial keyframe in one-pass.
+          Improve framerate adaptation
+          Improved 1-pass CBR rate control
+          Improved KF insertion after fades to still.
+          Improved key frame detection.
+          Improved activity masking (lower PSNR impact for same SSIM boost)
+          Improved interaction between GF and ARFs
+          Adding error-concealment to the decoder.
+          Adding support for independent partitions
+          Adjusted rate-distortion constants
+
+
+  - Bug Fixes:
+          Removed firstpass motion map
+          Fix parallel make install
+          Fix multithreaded encoding for 1 MB wide frame
+          Fixed iwalsh_neon build problems with RVDS4.1
+          Fix semaphore emulation, spin-wait intrinsics on Windows
+          Fix build with xcode4 and simplify GLOBAL.
+          Mark ARM asm objects as allowing a non-executable stack.
+          Fix vpxenc encoding incorrect webm file header on big endian
+
+
+2011-03-07 v0.9.6 "Bali"
+  Our second named release, focused on a faster, higher quality, encoder.
+
+  - Upgrading:
+    This release is backwards compatible with Aylesbury (v0.9.5). Users
+    of older releases should refer to the Upgrading notes in this
+    document for that release.
+
+  - Enhancements:
+      vpxenc --psnr shows a summary when encode completes
+      --tune=ssim option to enable activity masking
+      improved postproc visualizations for development
+      updated support for Apple iOS to SDK 4.2
+      query decoder to determine which reference frames were updated
+      implemented error tracking in the decoder
+      fix pipe support on windows
+
+  - Speed:
+      Primary focus was on good quality mode, speed 0. Average improvement
+      on x86 about 40%, up to 100% on user-generated content at that speed.
+      Best quality mode speed improved 35%, and realtime speed 10-20%. This
+      release also saw significant improvement in realtime encoding speed
+      on ARM platforms.
+
+        Improved encoder threading
+        Dont pick encoder filter level when loopfilter is disabled.
+        Avoid double copying of key frames into alt and golden buffer
+        FDCT optimizations.
+        x86 sse2 temporal filter
+        SSSE3 version of fast quantizer
+        vp8_rd_pick_best_mbsegmentation code restructure
+        Adjusted breakout RD for SPLITMV
+        Changed segmentation check order
+        Improved rd_pick_intra4x4block
+        Adds armv6 optimized variance calculation
+        ARMv6 optimized sad16x16
+        ARMv6 optimized half pixel variance calculations
+        Full search SAD function optimization in SSE4.1
+        Improve MV prediction accuracy to achieve performance gain
+        Improve MV prediction in vp8_pick_inter_mode() for speed>3
+
+  - Quality:
+      Best quality mode improved PSNR 6.3%, and SSIM 6.1%. This release
+      also includes support for "activity masking," which greatly improves
+      SSIM at the expense of PSNR. For now, this feature is available with
+      the --tune=ssim option. Further experimentation in this area
+      is ongoing. This release also introduces a new rate control mode
+      called "CQ," which changes the allocation of bits within a clip to
+      the sections where they will have the most visual impact.
+
+        Tuning for the more exact quantizer.
+        Relax rate control for last few frames
+        CQ Mode
+        Limit key frame quantizer for forced key frames.
+        KF/GF Pulsing
+        Add simple version of activity masking.
+        make rdmult adaptive for intra in quantizer RDO
+        cap the best quantizer for 2nd order DC
+        change the threshold of DC check for encode breakout
+
+  - Bug Fixes:
+      Fix crash on Sparc Solaris.
+      Fix counter of fixed keyframe distance
+      ARNR filter pointer update bug fix
+      Fixed use of motion percentage in KF/GF group calc
+      Changed condition for using RD in Intra Mode
+      Fix encoder real-time only configuration.
+      Fix ARM encoder crash with multiple token partitions
+      Fixed bug first cluster timecode of webm file is wrong.
+      Fixed various encoder bugs with odd-sized images
+      vp8e_get_preview fixed when spatial resampling enabled
+      quantizer: fix assertion in fast quantizer path
+      Allocate source buffers to be multiples of 16
+      Fix for manual Golden frame frequency
+      Fix drastic undershoot in long form content
+
+
 2010-10-28 v0.9.5 "Aylesbury"
  Our first named release, focused on a faster decoder, and a better encoder.

--- a/4
+++ b/4
@@ -45,18 +45,14 @@ COMPILING THE APPLICATIONS/LIBRARIES:
    armv5te-linux-rvct
    armv5te-linux-gcc
    armv5te-symbian-gcc
-    armv5te-wince-vs8
    armv6-darwin-gcc
    armv6-linux-rvct
    armv6-linux-gcc
    armv6-symbian-gcc
-    armv6-wince-vs8
    iwmmxt-linux-rvct
    iwmmxt-linux-gcc
-    iwmmxt-wince-vs8
    iwmmxt2-linux-rvct
    iwmmxt2-linux-gcc
-    iwmmxt2-wince-vs8
    armv7-linux-rvct
    armv7-linux-gcc
    mips32-linux-gcc
--- a/args.c
+++ b/args.c
@@ -135,6 +135,17 @@ void arg_show_usage(FILE *fp, const struct arg_def *const *defs)
                     def->long_name, long_val);

        fprintf(fp, "  %-37s\t%s\n", option_text, def->desc);
+
+        if(def->enums)
+        {
+            const struct arg_enum_list *listptr;
+
+            fprintf(fp, "  %-37s\t  ", "");
+
+            for(listptr = def->enums; listptr->name; listptr++)
+                fprintf(fp, "%s%s", listptr->name,
+                        listptr[1].name ? ", " : "\n");
+        }
    }
 }

@@ -218,3 +229,37 @@ struct vpx_rational arg_parse_rational(const struct arg *arg)

    return rat;
 }
+
+
+int arg_parse_enum(const struct arg *arg)
+{
+    const struct arg_enum_list *listptr;
+    long int                    rawval;
+    char                       *endptr;
+
+    /* First see if the value can be parsed as a raw value */
+    rawval = strtol(arg->val, &endptr, 10);
+    if (arg->val[0] != '\0' && endptr[0] == '\0')
+    {
+        /* Got a raw value, make sure it's valid */
+        for(listptr = arg->def->enums; listptr->name; listptr++)
+            if(listptr->val == rawval)
+                return rawval;
+    }
+
+    /* Next see if it can be parsed as a string */
+    for(listptr = arg->def->enums; listptr->name; listptr++)
+        if(!strcmp(arg->val, listptr->name))
+            return listptr->val;
+
+    die("Option %s: Invalid value '%s'\n", arg->name, arg->val);
+    return 0;
+}
+
+
+int arg_parse_enum_or_int(const struct arg *arg)
+{
+    if(arg->def->enums)
+        return arg_parse_enum(arg);
+    return arg_parse_int(arg);
+}
--- a/args.h
+++ b/args.h
@@ -22,14 +22,23 @@ struct arg
    const struct arg_def  *def;
 };

+struct arg_enum_list
+{
+    const char *name;
+    int         val;
+};
+#define ARG_ENUM_LIST_END {0}
+
 typedef struct arg_def
 {
    const char *short_name;
    const char *long_name;
    int         has_val;
    const char *desc;
+    const struct arg_enum_list *enums;
 } arg_def_t;
-#define ARG_DEF(s,l,v,d) {s,l,v,d}
+#define ARG_DEF(s,l,v,d) {s,l,v,d, NULL}
+#define ARG_DEF_ENUM(s,l,v,d,e) {s,l,v,d,e}
 #define ARG_DEF_LIST_END {0}

 struct arg arg_init(char **argv);
@@ -41,4 +50,5 @@ char **argv_dup(int argc, const char **argv);
 unsigned int arg_parse_uint(const struct arg *arg);
 int arg_parse_int(const struct arg *arg);
 struct vpx_rational arg_parse_rational(const struct arg *arg);
+int arg_parse_enum_or_int(const struct arg *arg);
 #endif
--- a/build/arm-wince-vs8/armasmv5.rules
+++ b/build/arm-wince-vs8/armasmv5.rules
@@ -1,20 +0,0 @@
-<?xml version="1.0" encoding="utf-8"?>
-<VisualStudioToolFile
-	Name="armasm"
-	Version="8.00"
-	>
-	<Rules>
-		<CustomBuildRule
-			Name="ARMASM"
-			DisplayName="Armasm Assembler"
-			CommandLine="armasm -o &quot;$(IntDir)\$(InputName).obj&quot; $(InputPath) -32 -ARCH 5&#x0D;&#x0A;"
-			Outputs="$(IntDir)\$(InputName).obj"
-			FileExtensions="*.asm"
-			ExecutionDescription="Assembling $(InputName).asm"
-			ShowOnlyRuleProperties="false"
-			>
-			<Properties>
-			</Properties>
-		</CustomBuildRule>
-	</Rules>
-</VisualStudioToolFile>
--- a/build/arm-wince-vs8/armasmv6.rules
+++ b/build/arm-wince-vs8/armasmv6.rules
@@ -1,20 +0,0 @@
-<?xml version="1.0" encoding="utf-8"?>
-<VisualStudioToolFile
-	Name="armasm"
-	Version="8.00"
-	>
-	<Rules>
-		<CustomBuildRule
-			Name="ARMASM"
-			DisplayName="Armasm Assembler"
-			CommandLine="armasm -o &quot;$(IntDir)\$(InputName).obj&quot; $(InputPath) -32 -ARCH 6&#x0D;&#x0A;"
-			Outputs="$(IntDir)\$(InputName).obj"
-			FileExtensions="*.asm"
-			ExecutionDescription="Assembling $(InputName).asm"
-			ShowOnlyRuleProperties="false"
-			>
-			<Properties>
-			</Properties>
-		</CustomBuildRule>
-	</Rules>
-</VisualStudioToolFile>
--- a/build/arm-wince-vs8/armasmxscale.rules
+++ b/build/arm-wince-vs8/armasmxscale.rules
@@ -1,20 +0,0 @@
-<?xml version="1.0" encoding="utf-8"?>
-<VisualStudioToolFile
-	Name="armasm"
-	Version="8.00"
-	>
-	<Rules>
-		<CustomBuildRule
-			Name="ARMASM"
-			DisplayName="Armasm Assembler"
-			CommandLine="armasm -o &quot;$(IntDir)\$(InputName).obj&quot; $(InputPath) -32 -cpu XSCALE&#x0D;&#x0A;"
-			Outputs="$(IntDir)\$(InputName).obj"
-			FileExtensions="*.asm"
-			ExecutionDescription="Assembling $(InputName).asm"
-			ShowOnlyRuleProperties="false"
-			>
-			<Properties>
-			</Properties>
-		</CustomBuildRule>
-	</Rules>
-</VisualStudioToolFile>
--- a/build/arm-wince-vs8/obj_int_extract.bat
+++ b/build/arm-wince-vs8/obj_int_extract.bat
@@ -1,13 +0,0 @@
-@echo off
-REM   Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-REM
-REM   Use of this source code is governed by a BSD-style license
-REM   that can be found in the LICENSE file in the root of the source
-REM   tree. An additional intellectual property rights grant can be found
-REM   in the file PATENTS.  All contributing project authors may
-REM   be found in the AUTHORS file in the root of the source tree.
-echo on
-
-
-cl /I ".\\" /I "..\vp6_decoder_sdk" /I "..\vp6_decoder_sdk\vpx_ports" /D "NDEBUG" /D "_WIN32_WCE=0x420" /D "UNDER_CE" /D "WIN32_PLATFORM_PSPC" /D "WINCE" /D "_LIB" /D "ARM" /D "_ARM_" /D "_UNICODE" /D "UNICODE" /FD /EHsc /MT /GS- /fp:fast /GR- /Fo"Pocket_PC_2003__ARMV4_\%1/" /Fd"Pocket_PC_2003__ARMV4_\%1/vc80.pdb" /W3 /nologo /c /TC ..\vp6_decoder_sdk\vp6_decoder\algo\common\arm\dec_asm_offsets_arm.c
-obj_int_extract.exe rvds "Pocket_PC_2003__ARMV4_\%1/dec_asm_offsets_arm.obj"
--- a/build/arm-wince-vs8/vpx.sln
+++ b/build/arm-wince-vs8/vpx.sln
@@ -1,88 +0,0 @@
-Microsoft Visual Studio Solution File, Format Version 9.00
-# Visual Studio 2005
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "example", "example.vcproj", "{BA5FE66F-38DD-E034-F542-B1578C5FB950}"
-	ProjectSection(ProjectDependencies) = postProject
-		{DCE19DAF-69AC-46DB-B14A-39F0FAA5DB74} = {DCE19DAF-69AC-46DB-B14A-39F0FAA5DB74}
-		{E1360C65-D375-4335-8057-7ED99CC3F9B2} = {E1360C65-D375-4335-8057-7ED99CC3F9B2}
-	EndProjectSection
-EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "obj_int_extract", "obj_int_extract.vcproj", "{E1360C65-D375-4335-8057-7ED99CC3F9B2}"
-EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "vpx", "vpx.vcproj", "{DCE19DAF-69AC-46DB-B14A-39F0FAA5DB74}"
-	ProjectSection(ProjectDependencies) = postProject
-		{E1360C65-D375-4335-8057-7ED99CC3F9B2} = {E1360C65-D375-4335-8057-7ED99CC3F9B2}
-	EndProjectSection
-EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "xma", "xma.vcproj", "{A955FC4A-73F1-44F7-135E-30D84D32F022}"
-	ProjectSection(ProjectDependencies) = postProject
-		{E1360C65-D375-4335-8057-7ED99CC3F9B2} = {E1360C65-D375-4335-8057-7ED99CC3F9B2}
-		{DCE19DAF-69AC-46DB-B14A-39F0FAA5DB74} = {DCE19DAF-69AC-46DB-B14A-39F0FAA5DB74}
-	EndProjectSection
-EndProject
-Global
-	GlobalSection(SolutionConfigurationPlatforms) = preSolution
-		Debug|Mixed Platforms = Debug|Mixed Platforms
-		Debug|Pocket PC 2003 (ARMV4) = Debug|Pocket PC 2003 (ARMV4)
-		Debug|Win32 = Debug|Win32
-		Release|Mixed Platforms = Release|Mixed Platforms
-		Release|Pocket PC 2003 (ARMV4) = Release|Pocket PC 2003 (ARMV4)
-		Release|Win32 = Release|Win32
-	EndGlobalSection
-	GlobalSection(ProjectConfigurationPlatforms) = postSolution
-		{BA5FE66F-38DD-E034-F542-B1578C5FB950}.Debug|Mixed Platforms.ActiveCfg = Debug|Pocket PC 2003 (ARMV4)
-		{BA5FE66F-38DD-E034-F542-B1578C5FB950}.Debug|Mixed Platforms.Build.0 = Debug|Pocket PC 2003 (ARMV4)
-		{BA5FE66F-38DD-E034-F542-B1578C5FB950}.Debug|Mixed Platforms.Deploy.0 = Debug|Pocket PC 2003 (ARMV4)
-		{BA5FE66F-38DD-E034-F542-B1578C5FB950}.Debug|Pocket PC 2003 (ARMV4).ActiveCfg = Debug|Pocket PC 2003 (ARMV4)
-		{BA5FE66F-38DD-E034-F542-B1578C5FB950}.Debug|Pocket PC 2003 (ARMV4).Build.0 = Debug|Pocket PC 2003 (ARMV4)
-		{BA5FE66F-38DD-E034-F542-B1578C5FB950}.Debug|Pocket PC 2003 (ARMV4).Deploy.0 = Debug|Pocket PC 2003 (ARMV4)
-		{BA5FE66F-38DD-E034-F542-B1578C5FB950}.Debug|Win32.ActiveCfg = Debug|Pocket PC 2003 (ARMV4)
-		{BA5FE66F-38DD-E034-F542-B1578C5FB950}.Release|Mixed Platforms.ActiveCfg = Release|Pocket PC 2003 (ARMV4)
-		{BA5FE66F-38DD-E034-F542-B1578C5FB950}.Release|Mixed Platforms.Build.0 = Release|Pocket PC 2003 (ARMV4)
-		{BA5FE66F-38DD-E034-F542-B1578C5FB950}.Release|Mixed Platforms.Deploy.0 = Release|Pocket PC 2003 (ARMV4)
-		{BA5FE66F-38DD-E034-F542-B1578C5FB950}.Release|Pocket PC 2003 (ARMV4).ActiveCfg = Release|Pocket PC 2003 (ARMV4)
-		{BA5FE66F-38DD-E034-F542-B1578C5FB950}.Release|Pocket PC 2003 (ARMV4).Build.0 = Release|Pocket PC 2003 (ARMV4)
-		{BA5FE66F-38DD-E034-F542-B1578C5FB950}.Release|Pocket PC 2003 (ARMV4).Deploy.0 = Release|Pocket PC 2003 (ARMV4)
-		{BA5FE66F-38DD-E034-F542-B1578C5FB950}.Release|Win32.ActiveCfg = Release|Pocket PC 2003 (ARMV4)
-		{E1360C65-D375-4335-8057-7ED99CC3F9B2}.Debug|Mixed Platforms.ActiveCfg = Release|Win32
-		{E1360C65-D375-4335-8057-7ED99CC3F9B2}.Debug|Mixed Platforms.Build.0 = Release|Win32
-		{E1360C65-D375-4335-8057-7ED99CC3F9B2}.Debug|Pocket PC 2003 (ARMV4).ActiveCfg = Release|Win32
-		{E1360C65-D375-4335-8057-7ED99CC3F9B2}.Debug|Win32.ActiveCfg = Release|Win32
-		{E1360C65-D375-4335-8057-7ED99CC3F9B2}.Debug|Win32.Build.0 = Release|Win32
-		{E1360C65-D375-4335-8057-7ED99CC3F9B2}.Release|Mixed Platforms.ActiveCfg = Release|Win32
-		{E1360C65-D375-4335-8057-7ED99CC3F9B2}.Release|Mixed Platforms.Build.0 = Release|Win32
-		{E1360C65-D375-4335-8057-7ED99CC3F9B2}.Release|Pocket PC 2003 (ARMV4).ActiveCfg = Release|Win32
-		{E1360C65-D375-4335-8057-7ED99CC3F9B2}.Release|Win32.ActiveCfg = Release|Win32
-		{E1360C65-D375-4335-8057-7ED99CC3F9B2}.Release|Win32.Build.0 = Release|Win32
-		{DCE19DAF-69AC-46DB-B14A-39F0FAA5DB74}.Debug|Mixed Platforms.ActiveCfg = Debug|Pocket PC 2003 (ARMV4)
-		{DCE19DAF-69AC-46DB-B14A-39F0FAA5DB74}.Debug|Mixed Platforms.Build.0 = Debug|Pocket PC 2003 (ARMV4)
-		{DCE19DAF-69AC-46DB-B14A-39F0FAA5DB74}.Debug|Mixed Platforms.Deploy.0 = Debug|Pocket PC 2003 (ARMV4)
-		{DCE19DAF-69AC-46DB-B14A-39F0FAA5DB74}.Debug|Pocket PC 2003 (ARMV4).ActiveCfg = Debug|Pocket PC 2003 (ARMV4)
-		{DCE19DAF-69AC-46DB-B14A-39F0FAA5DB74}.Debug|Pocket PC 2003 (ARMV4).Build.0 = Debug|Pocket PC 2003 (ARMV4)
-		{DCE19DAF-69AC-46DB-B14A-39F0FAA5DB74}.Debug|Pocket PC 2003 (ARMV4).Deploy.0 = Debug|Pocket PC 2003 (ARMV4)
-		{DCE19DAF-69AC-46DB-B14A-39F0FAA5DB74}.Debug|Win32.ActiveCfg = Debug|Pocket PC 2003 (ARMV4)
-		{DCE19DAF-69AC-46DB-B14A-39F0FAA5DB74}.Release|Mixed Platforms.ActiveCfg = Release|Pocket PC 2003 (ARMV4)
-		{DCE19DAF-69AC-46DB-B14A-39F0FAA5DB74}.Release|Mixed Platforms.Build.0 = Release|Pocket PC 2003 (ARMV4)
-		{DCE19DAF-69AC-46DB-B14A-39F0FAA5DB74}.Release|Mixed Platforms.Deploy.0 = Release|Pocket PC 2003 (ARMV4)
-		{DCE19DAF-69AC-46DB-B14A-39F0FAA5DB74}.Release|Pocket PC 2003 (ARMV4).ActiveCfg = Release|Pocket PC 2003 (ARMV4)
-		{DCE19DAF-69AC-46DB-B14A-39F0FAA5DB74}.Release|Pocket PC 2003 (ARMV4).Build.0 = Release|Pocket PC 2003 (ARMV4)
-		{DCE19DAF-69AC-46DB-B14A-39F0FAA5DB74}.Release|Pocket PC 2003 (ARMV4).Deploy.0 = Release|Pocket PC 2003 (ARMV4)
-		{DCE19DAF-69AC-46DB-B14A-39F0FAA5DB74}.Release|Win32.ActiveCfg = Release|Pocket PC 2003 (ARMV4)
-		{A955FC4A-73F1-44F7-135E-30D84D32F022}.Debug|Mixed Platforms.ActiveCfg = Debug|Pocket PC 2003 (ARMV4)
-		{A955FC4A-73F1-44F7-135E-30D84D32F022}.Debug|Mixed Platforms.Build.0 = Debug|Pocket PC 2003 (ARMV4)
-		{A955FC4A-73F1-44F7-135E-30D84D32F022}.Debug|Mixed Platforms.Deploy.0 = Debug|Pocket PC 2003 (ARMV4)
-		{A955FC4A-73F1-44F7-135E-30D84D32F022}.Debug|Pocket PC 2003 (ARMV4).ActiveCfg = Debug|Pocket PC 2003 (ARMV4)
-		{A955FC4A-73F1-44F7-135E-30D84D32F022}.Debug|Pocket PC 2003 (ARMV4).Build.0 = Debug|Pocket PC 2003 (ARMV4)
-		{A955FC4A-73F1-44F7-135E-30D84D32F022}.Debug|Pocket PC 2003 (ARMV4).Deploy.0 = Debug|Pocket PC 2003 (ARMV4)
-		{A955FC4A-73F1-44F7-135E-30D84D32F022}.Debug|Win32.ActiveCfg = Debug|Pocket PC 2003 (ARMV4)
-		{A955FC4A-73F1-44F7-135E-30D84D32F022}.Release|Mixed Platforms.ActiveCfg = Release|Pocket PC 2003 (ARMV4)
-		{A955FC4A-73F1-44F7-135E-30D84D32F022}.Release|Mixed Platforms.Build.0 = Release|Pocket PC 2003 (ARMV4)
-		{A955FC4A-73F1-44F7-135E-30D84D32F022}.Release|Mixed Platforms.Deploy.0 = Release|Pocket PC 2003 (ARMV4)
-		{A955FC4A-73F1-44F7-135E-30D84D32F022}.Release|Pocket PC 2003 (ARMV4).ActiveCfg = Release|Pocket PC 2003 (ARMV4)
-		{A955FC4A-73F1-44F7-135E-30D84D32F022}.Release|Pocket PC 2003 (ARMV4).Build.0 = Release|Pocket PC 2003 (ARMV4)
-		{A955FC4A-73F1-44F7-135E-30D84D32F022}.Release|Pocket PC 2003 (ARMV4).Deploy.0 = Release|Pocket PC 2003 (ARMV4)
-		{A955FC4A-73F1-44F7-135E-30D84D32F022}.Release|Win32.ActiveCfg = Release|Pocket PC 2003 (ARMV4)
-	EndGlobalSection
-	GlobalSection(SolutionProperties) = preSolution
-		HideSolutionNode = FALSE
-	EndGlobalSection
-EndGlobal
--- a/build/make/Makefile
+++ b/build/make/Makefile
@@ -98,11 +98,11 @@ install::
 $(BUILD_PFX)%.c.d: %.c
 	$(if $(quiet),@echo "    [DEP] $@")
 	$(qexec)mkdir -p $(dir $@)
-	$(qexec)$(CC) $(CFLAGS) -M $< | $(fmt_deps) > $@
+	$(qexec)$(CC) $(INTERNAL_CFLAGS) $(CFLAGS) -M $< | $(fmt_deps) > $@

 $(BUILD_PFX)%.c.o: %.c
 	$(if $(quiet),@echo "    [CC] $@")
-	$(qexec)$(CC) $(CFLAGS) -c -o $@ $<
+	$(qexec)$(CC) $(INTERNAL_CFLAGS) $(CFLAGS) -c -o $@ $<

 $(BUILD_PFX)%.asm.d: %.asm
 	$(if $(quiet),@echo "    [DEP] $@")
@@ -124,6 +124,12 @@ $(BUILD_PFX)%.s.o: %.s
 	$(if $(quiet),@echo "    [AS] $@")
 	$(qexec)$(AS) $(ASFLAGS) -o $@ $<

+.PRECIOUS: %.c.S
+%.c.S: CFLAGS += -DINLINE_ASM
+$(BUILD_PFX)%.c.S: %.c
+	$(if $(quiet),@echo "    [GEN] $@")
+	$(qexec)$(CC) -S $(CFLAGS) -o $@ $<
+
 .PRECIOUS: %.asm.s
 $(BUILD_PFX)%.asm.s: %.asm
 	$(if $(quiet),@echo "    [ASM CONVERSION] $@")
@@ -152,8 +158,8 @@ endif
 # Rule to extract assembly constants from C sources
 #
 obj_int_extract: build/make/obj_int_extract.c
-	$(if $(quiet),echo "    [HOSTCC] $@")
-	$(qexec)$(HOSTCC) -I. -o $@ $<
+	$(if $(quiet),@echo "    [HOSTCC] $@")
+	$(qexec)$(HOSTCC) -I. -I$(SRC_PATH_BARE) -o $@ $<
 CLEAN-OBJS += obj_int_extract

 #
@@ -188,7 +194,7 @@ define linker_template
 $(1): $(filter-out -%,$(2))
 $(1):
 	$(if $(quiet),@echo    "    [LD] $$@")
-	$(qexec)$$(LD) $$(strip $$(LDFLAGS) -o $$@ $(2) $(3) $$(extralibs))
+	$(qexec)$$(LD) $$(strip $$(INTERNAL_LDFLAGS) $$(LDFLAGS) -o $$@ $(2) $(3) $$(extralibs))
 endef
 # make-3.80 has a bug with expanding large input strings to the eval function,
 # which was triggered in some cases by the following component of
@@ -255,7 +261,7 @@ ifeq ($(filter clean,$(MAKECMDGOALS)),)
 endif

 #
-# Configuration dependant rules
+# Configuration dependent rules
 #
 $(call pairmap,install_map_templates,$(INSTALL_MAPS))

@@ -330,12 +336,10 @@ ifneq ($(call enabled,DIST-SRCS),)
    DIST-SRCS-$(CONFIG_MSVS)  += build/make/gen_msvs_proj.sh
    DIST-SRCS-$(CONFIG_MSVS)  += build/make/gen_msvs_sln.sh
    DIST-SRCS-$(CONFIG_MSVS)  += build/x86-msvs/yasm.rules
+    DIST-SRCS-$(CONFIG_MSVS)  += build/x86-msvs/obj_int_extract.bat
    DIST-SRCS-$(CONFIG_RVCT) += build/make/armlink_adapter.sh
-    #
-    # This isn't really ARCH_ARM dependent, it's dependant on whether we're
-    # using assembly code or not (CONFIG_OPTIMIZATIONS maybe). Just use
-    # this for now.
-    DIST-SRCS-$(ARCH_ARM)    += build/make/obj_int_extract.c
+    # Include obj_int_extract if we use offsets from asm_*_offsets
+    DIST-SRCS-$(ARCH_ARM)$(ARCH_X86)$(ARCH_X86_64)    += build/make/obj_int_extract.c
    DIST-SRCS-$(ARCH_ARM)    += build/make/ads2gas.pl
    DIST-SRCS-yes            += $(target:-$(TOOLCHAIN)=).mk
 endif
--- a/build/make/ads2gas.pl
+++ b/build/make/ads2gas.pl
@@ -21,8 +21,14 @@ print "@ This file was created from a .asm file\n";
 print "@  using the ads2gas.pl script.\n";
 print "\t.equ DO1STROUNDING, 0\n";

+# Stack of procedure names.
+@proc_stack = ();
+
 while (<STDIN>)
 {
+    # Load and store alignment
+    s/@/,:/g;
+
    # Comment character
    s/;/@/g;

@@ -79,7 +85,10 @@ while (<STDIN>)
    s/CODE([0-9][0-9])/.code $1/;

    # No AREA required
-    s/^\s*AREA.*$/.text/;
+    # But ALIGNs in AREA must be obeyed
+    s/^\s*AREA.*ALIGN=([0-9])$/.text\n.p2align $1/;
+    # If no ALIGN, strip the AREA and align to 4 bytes
+    s/^\s*AREA.*$/.text\n.p2align 2/;

    # DCD to .word
    # This one is for incoming symbols
@@ -114,8 +123,8 @@ while (<STDIN>)
    # put the colon at the end of the line in the macro
    s/^([a-zA-Z_0-9\$]+)/$1:/ if !/EQU/;

-    # Strip ALIGN
-    s/\sALIGN/@ ALIGN/g;
+    # ALIGN directive
+    s/ALIGN/.balign/g;

    # Strip ARM
    s/\sARM/@ ARM/g;
@@ -127,9 +136,23 @@ while (<STDIN>)
    # Strip PRESERVE8
    s/\sPRESERVE8/@ PRESERVE8/g;

-    # Strip PROC and ENDPROC
-    s/\sPROC/@/g;
-    s/\sENDP/@/g;
+    # Use PROC and ENDP to give the symbols a .size directive.
+    # This makes them show up properly in debugging tools like gdb and valgrind.
+    if (/\bPROC\b/)
+    {
+        my $proc;
+        /^_([\.0-9A-Z_a-z]\w+)\b/;
+        $proc = $1;
+        push(@proc_stack, $proc) if ($proc);
+        s/\bPROC\b/@ $&/;
+    }
+    if (/\bENDP\b/)
+    {
+        my $proc;
+        s/\bENDP\b/@ $&/;
+        $proc = pop(@proc_stack);
+        $_ = "\t.size $proc, .-$proc".$_ if ($proc);
+    }

    # EQU directive
    s/(.*)EQU(.*)/.equ $1, $2/;
@@ -148,3 +171,6 @@ while (<STDIN>)
    next if /^\s*END\s*$/;
    print;
 }
+
+# Mark that this object doesn't need an executable stack.
+printf ("\t.section\t.note.GNU-stack,\"\",\%\%progbits\n");
--- a/build/make/ads2gas_apple.pl
+++ b/build/make/ads2gas_apple.pl
@@ -41,6 +41,9 @@ sub trim($)

 while (<STDIN>)
 {
+    # Load and store alignment
+    s/@/,:/g;
+
    # Comment character
    s/;/@/g;

@@ -97,7 +100,10 @@ while (<STDIN>)
    s/CODE([0-9][0-9])/.code $1/;

    # No AREA required
-    s/^\s*AREA.*$/.text/;
+    # But ALIGNs in AREA must be obeyed
+    s/^\s*AREA.*ALIGN=([0-9])$/.text\n.p2align $1/;
+    # If no ALIGN, strip the AREA and align to 4 bytes
+    s/^\s*AREA.*$/.text\n.p2align 2/;

    # DCD to .word
    # This one is for incoming symbols
@@ -137,8 +143,8 @@ while (<STDIN>)
    # put the colon at the end of the line in the macro
    s/^([a-zA-Z_0-9\$]+)/$1:/ if !/EQU/;

-    # Strip ALIGN
-    s/\sALIGN/@ ALIGN/g;
+    # ALIGN directive
+    s/ALIGN/.balign/g;

    # Strip ARM
    s/\sARM/@ ARM/g;
--- a/build/make/armlink_adapter.sh
+++ b/build/make/armlink_adapter.sh
@@ -17,6 +17,8 @@ for i; do
        on_of=1
    elif [ "$i" == "-v" ]; then
        verbose=1
+    elif [ "$i" == "-g" ]; then
+        args="${args} --debug"
    elif [ "$on_of" == "1" ]; then
        outfile=$i
        on_of=0
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -78,11 +78,12 @@ Build options:
  --log=yes|no|FILE           file configure log is written to [config.err]
  --target=TARGET             target platform tuple [generic-gnu]
  --cpu=CPU                   optimize for a specific cpu rather than a family
+  --extra-cflags=ECFLAGS      add ECFLAGS to CFLAGS [$CFLAGS]
  ${toggle_extra_warnings}    emit harmless warnings (always non-fatal)
  ${toggle_werror}            treat warnings as errors, if possible
                              (not available with all compilers)
  ${toggle_optimizations}     turn on/off compiler optimization flags
-  ${toggle_pic}               turn on/off Position Independant Code
+  ${toggle_pic}               turn on/off Position Independent Code
  ${toggle_ccache}            turn on/off compiler cache
  ${toggle_debug}             enable/disable debug mode
  ${toggle_gprof}             enable/disable gprof profiling instrumentation
@@ -411,11 +412,14 @@ EOF
 write_common_target_config_h() {
    cat > ${TMP_H} << EOF
 /* This file automatically generated by configure. Do not edit! */
+#ifndef VPX_CONFIG_H
+#define VPX_CONFIG_H
 #define RESTRICT    ${RESTRICT}
 EOF
    print_config_h ARCH   "${TMP_H}" ${ARCH_LIST}
    print_config_h HAVE   "${TMP_H}" ${HAVE_LIST}
    print_config_h CONFIG "${TMP_H}" ${CONFIG_LIST}
+    echo "#endif /* VPX_CONFIG_H */" >> ${TMP_H}
    mkdir -p `dirname "$1"`
    cmp "$1" ${TMP_H} >/dev/null 2>&1 || mv ${TMP_H} "$1"
 }
@@ -442,6 +446,9 @@ process_common_cmdline() {
        ;;
        --cpu=*) tune_cpu="$optval"
        ;;
+        --extra-cflags=*)
+        extra_cflags="${optval}"
+        ;;
        --enable-?*|--disable-?*)
        eval `echo "$opt" | sed 's/--/action=/;s/-/ option=/;s/-/_/g'`
        echo "${CMDLINE_SELECT} ${ARCH_EXT_LIST}" | grep "^ *$option\$" >/dev/null || die_unknown $opt
@@ -547,6 +554,10 @@ process_common_toolchain() {
                tgt_isa=universal
                tgt_os=darwin9
                ;;
+            *darwin10*)
+                tgt_isa=x86_64
+                tgt_os=darwin10
+                ;;
            *mingw32*|*cygwin*)
                [ -z "$tgt_isa" ] && tgt_isa=x86
                tgt_os=win32
@@ -606,10 +617,20 @@ process_common_toolchain() {
            add_ldflags "-isysroot /Developer/SDKs/MacOSX10.5.sdk"
            add_ldflags "-mmacosx-version-min=10.5"
            ;;
+        *-darwin10-*)
+            add_cflags  "-isysroot /Developer/SDKs/MacOSX10.6.sdk"
+            add_cflags  "-mmacosx-version-min=10.6"
+            add_ldflags "-isysroot /Developer/SDKs/MacOSX10.6.sdk"
+            add_ldflags "-mmacosx-version-min=10.6"
+            ;;
    esac

    # Handle Solaris variants. Solaris 10 needs -lposix4
    case ${toolchain} in
+        sparc-solaris-*)
+            add_extralibs -lposix4
+            disable fast_unaligned
+            ;;
        *-solaris-*)
            add_extralibs -lposix4
            ;;
@@ -621,8 +642,8 @@ process_common_toolchain() {
    # on arm, isa versions are supersets
    enabled armv7a && soft_enable armv7 ### DEBUG
    enabled armv7 && soft_enable armv6
-    enabled armv6 && soft_enable armv5te
-    enabled armv6 && soft_enable fast_unaligned
+    enabled armv7 || enabled armv6 && soft_enable armv5te
+    enabled armv7 || enabled armv6 && soft_enable fast_unaligned
    enabled iwmmxt2 && soft_enable iwmmxt
    enabled iwmmxt && soft_enable armv5te

@@ -655,7 +676,7 @@ process_common_toolchain() {
                check_add_cflags -march=${tgt_isa}
                check_add_asflags -march=${tgt_isa}
            fi
-
+            enabled debug && add_asflags -g
            asm_conversion_cmd="${source_path}/build/make/ads2gas.pl"
            ;;
        rvct)
@@ -671,7 +692,7 @@ process_common_toolchain() {
            if enabled armv7
                then
                    check_add_cflags --cpu=Cortex-A8 --fpu=softvfp+vfpv3
-                    check_add_asflags --cpu=Cortex-A8 --fpu=none
+                    check_add_asflags --cpu=Cortex-A8 --fpu=softvfp+vfpv3
                else
                    check_add_cflags --cpu=${tgt_isa##armv}
                    check_add_asflags --cpu=${tgt_isa##armv}
@@ -680,16 +701,24 @@ process_common_toolchain() {
            arch_int=${tgt_isa##armv}
            arch_int=${arch_int%%te}
            check_add_asflags --pd "\"ARCHITECTURE SETA ${arch_int}\""
+            enabled debug && add_asflags -g
+            add_cflags --gnu
+            add_cflags --enum_is_int
+            add_cflags --wchar32
        ;;
        esac

        case ${tgt_os} in
+        none*)
+            disable multithread
+            disable os_support
+            ;;
        darwin*)
            SDK_PATH=/Developer/Platforms/iPhoneOS.platform/Developer
            TOOLCHAIN_PATH=${SDK_PATH}/usr/bin
            CC=${TOOLCHAIN_PATH}/gcc
            AR=${TOOLCHAIN_PATH}/ar
-            LD=${TOOLCHAIN_PATH}/arm-apple-darwin9-gcc-4.2.1
+            LD=${TOOLCHAIN_PATH}/arm-apple-darwin10-gcc-4.2.1
            AS=${TOOLCHAIN_PATH}/as
            STRIP=${TOOLCHAIN_PATH}/strip
            NM=${TOOLCHAIN_PATH}/nm
@@ -703,19 +732,18 @@ process_common_toolchain() {
            add_cflags -arch ${tgt_isa}
            add_ldflags -arch_only ${tgt_isa}

-            add_cflags  "-isysroot /Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS3.1.sdk"
+            add_cflags  "-isysroot ${SDK_PATH}/SDKs/iPhoneOS4.3.sdk"

            # This should be overridable
-            alt_libc=${SDK_PATH}/SDKs/iPhoneOS3.1.sdk
+            alt_libc=${SDK_PATH}/SDKs/iPhoneOS4.3.sdk

            # Add the paths for the alternate libc
-#            for d in usr/include usr/include/gcc/darwin/4.0/; do
-            for d in usr/include usr/include/gcc/darwin/4.0/ usr/lib/gcc/arm-apple-darwin9/4.0.1/include/; do
+            for d in usr/include usr/include/gcc/darwin/4.2/ usr/lib/gcc/arm-apple-darwin10/4.2.1/include/; do
                try_dir="${alt_libc}/${d}"
                [ -d "${try_dir}" ] && add_cflags -I"${try_dir}"
            done

-            for d in lib usr/lib; do
+            for d in lib usr/lib usr/lib/system; do
                try_dir="${alt_libc}/${d}"
                [ -d "${try_dir}" ] && add_ldflags -L"${try_dir}"
            done
@@ -726,45 +754,24 @@ process_common_toolchain() {
        linux*)
            enable linux
            if enabled rvct; then
-                # Compiling with RVCT requires an alternate libc (glibc) when
-                # targetting linux.
-                disabled builtin_libc \
-                    || die "Must supply --libc when targetting *-linux-rvct"
+                # Check if we have CodeSourcery GCC in PATH. Needed for
+                # libraries
+                hash arm-none-linux-gnueabi-gcc 2>&- || \
+                  die "Couldn't find CodeSourcery GCC from PATH"

-                # Set up compiler
-                add_cflags --gnu
-                add_cflags --enum_is_int
-                add_cflags --library_interface=aeabi_glibc
-                add_cflags --no_hide_all
-                add_cflags --wchar32
-                add_cflags --dwarf2
-                add_cflags --gnu
+                # Use armcc as a linker to enable translation of
+                # some gcc specific options such as -lm and -lpthread.
+                LD="armcc --translate_gcc"

-                # Set up linker
-                add_ldflags --sysv --no_startup --no_ref_cpp_init
-                add_ldflags --entry=_start
-                add_ldflags --keep '"*(.init)"' --keep '"*(.fini)"'
-                add_ldflags --keep '"*(.init_array)"' --keep '"*(.fini_array)"'
-                add_ldflags --dynamiclinker=/lib/ld-linux.so.3
-                add_extralibs libc.so.6 -lc_nonshared crt1.o crti.o crtn.o
+                # create configuration file (uses path to CodeSourcery GCC)
+                armcc --arm_linux_configure --arm_linux_config_file=arm_linux.cfg

-                # Add the paths for the alternate libc
-                for d in usr/include; do
-                    try_dir="${alt_libc}/${d}"
-                    [ -d "${try_dir}" ] && add_cflags -J"${try_dir}"
-                done
-                add_cflags -J"${RVCT31INC}"
-                for d in lib usr/lib; do
-                    try_dir="${alt_libc}/${d}"
-                    [ -d "${try_dir}" ] && add_ldflags -L"${try_dir}"
-                done
-
-
-                # glibc has some struct members named __align, which is a
-                # storage modifier in RVCT. If we need to use this modifier,
-                # we'll have to #undef it in our code. Note that this must
-                # happen AFTER all libc inclues.
-                add_cflags -D__align=x_align_x
+                add_cflags --arm_linux_paths --arm_linux_config_file=arm_linux.cfg
+                add_asflags --no_hide_all --apcs=/interwork
+                add_ldflags --arm_linux_paths --arm_linux_config_file=arm_linux.cfg
+                enabled pic && add_cflags --apcs=/fpic
+                enabled pic && add_asflags --apcs=/fpic
+                enabled shared && add_cflags --shared
            fi
        ;;

@@ -824,6 +831,7 @@ process_common_toolchain() {
        soft_enable sse2
        soft_enable sse3
        soft_enable ssse3
+        soft_enable sse4_1

        case  ${tgt_os} in
            win*)
@@ -844,7 +852,7 @@ process_common_toolchain() {
                setup_gnu_toolchain
                add_cflags -use-msasm -use-asm
                add_ldflags -i-static
-                enabled x86_64 && add_cflags -ipo -no-prec-div -static -xSSE3 -axSSE3
+                enabled x86_64 && add_cflags -ipo -no-prec-div -static -xSSE2 -axSSE2
                enabled x86_64 && AR=xiar
                case ${tune_cpu} in
                    atom*)
@@ -862,6 +870,8 @@ process_common_toolchain() {
                link_with_cc=gcc
                tune_cflags="-march="
            setup_gnu_toolchain
+                #for 32 bit x86 builds, -O3 did not turn on this flag
+                enabled optimizations && check_add_cflags -fomit-frame-pointer
                ;;
        esac

@@ -879,7 +889,7 @@ process_common_toolchain() {
        case  ${tgt_os} in
            win*)
                add_asflags -f win${bits}
-                enabled debug && add_asflags -g dwarf2
+                enabled debug && add_asflags -g cv8
            ;;
            linux*|solaris*)
                add_asflags -f elf${bits}
@@ -929,15 +939,23 @@ process_common_toolchain() {
    enabled gcov &&
        check_add_cflags -fprofile-arcs -ftest-coverage &&
        check_add_ldflags -fprofile-arcs -ftest-coverage
+
    if enabled optimizations; then
-        enabled rvct && check_add_cflags -Otime
+        if enabled rvct; then
+            enabled small && check_add_cflags -Ospace || check_add_cflags -Otime
+        else
            enabled small && check_add_cflags -O2 ||  check_add_cflags -O3
        fi
+    fi

-    # Position Independant Code (PIC) support, for building relocatable
+    # Position Independent Code (PIC) support, for building relocatable
    # shared objects
    enabled gcc && enabled pic && check_add_cflags -fPIC

+    # Work around longjmp interception on glibc >= 2.11, to improve binary
+    # compatibility. See http://code.google.com/p/webm/issues/detail?id=166
+    enabled linux && check_add_cflags -D_FORTIFY_SOURCE=0
+
    # Check for strip utility variant
    ${STRIP} -V 2>/dev/null | grep GNU >/dev/null && enable gnu_strip

@@ -956,11 +974,20 @@ EOF
        esac
    fi

+    # for sysconf(3) and friends.
+    check_header unistd.h
+
    # glibc needs these
    if enabled linux; then
        add_cflags -D_LARGEFILE_SOURCE
        add_cflags -D_FILE_OFFSET_BITS=64
    fi
+
+    # append any user defined extra cflags
+    if [ -n "${extra_cflags}" ] ; then
+        check_add_cflags ${extra_cflags} || \
+        die "Requested extra CFLAGS '${extra_cflags}' not supported by compiler"
+    fi
 }

 process_toolchain() {
--- a/build/make/gen_msvs_proj.sh
+++ b/build/make/gen_msvs_proj.sh
@@ -32,7 +32,8 @@ Options:
    --name=project_name         Name of the project (required)
    --proj-guid=GUID            GUID to use for the project
    --module-def=filename       File containing export definitions (for DLLs)
-    --ver=version               Version (7,8) of visual studio to generate for
+    --ver=version               Version (7,8,9) of visual studio to generate for
+    --src-path-bare=dir         Path to root of source tree
    -Ipath/to/include           Additional include directories
    -DFLAG[=value]              Preprocessor macros to define
    -Lpath/to/lib               Additional library search paths
@@ -132,7 +133,7 @@ generate_filter() {
    open_tag Filter \
        Name=$name \
        Filter=$pats \
-        UniqueIdentifier=`generate_uuid`
+        UniqueIdentifier=`generate_uuid` \

    file_list_sz=${#file_list[@]}
    for i in ${!file_list[@]}; do
@@ -146,29 +147,19 @@ generate_filter() {
                    for plat in "${platforms[@]}"; do
                        for cfg in Debug Release; do
                            open_tag FileConfiguration \
-                            Name="${cfg}|${plat}"
+                                Name="${cfg}|${plat}" \
+
                            tag Tool \
                                Name="VCCustomBuildTool" \
                                Description="Assembling \$(InputFileName)" \
-                                CommandLine="$(eval echo \$asm_${cfg}_cmdline)"\
-                                Outputs="\$(InputName).obj"
+                                CommandLine="$(eval echo \$asm_${cfg}_cmdline)" \
+                                Outputs="\$(InputName).obj" \
+
                            close_tag FileConfiguration
                        done
                    done
                fi

-                if [ "${f##*.}" == "cpp" ]; then
-                    for plat in "${platforms[@]}"; do
-                        for cfg in Debug Release; do
-                        open_tag FileConfiguration \
-                            Name="${cfg}|${plat}"
-                        tag Tool \
-                            Name="VCCLCompilerTool" \
-                            CompileAs="2"
-                        close_tag FileConfiguration
-                        done
-                    done
-                fi
                close_tag File

                break
@@ -195,24 +186,27 @@ for opt in "$@"; do
        ;;
        --proj-guid=*) guid="${optval}"
        ;;
-    --module-def=*)
-        link_opts="${link_opts} ModuleDefinitionFile=${optval}"
+        --module-def=*) link_opts="${link_opts} ModuleDefinitionFile=${optval}"
        ;;
        --exe) proj_kind="exe"
        ;;
        --lib) proj_kind="lib"
        ;;
+        --src-path-bare=*) src_path_bare="$optval"
+        ;;
        --static-crt) use_static_runtime=true
        ;;
-    --ver=*) vs_ver="$optval"
-             case $optval in
+        --ver=*)
+            vs_ver="$optval"
+            case "$optval" in
                [789])
                ;;
                *) die Unrecognized Visual Studio Version in $opt
                ;;
            esac
        ;;
-    -I*) opt="${opt%/}"
+        -I*)
+            opt="${opt%/}"
            incs="${incs}${incs:+;}&quot;${opt##-I}&quot;"
            yasmincs="${yasmincs} ${opt}"
        ;;
@@ -232,10 +226,13 @@ for opt in "$@"; do
        ;;
        -*) die_unknown $opt
        ;;
-    *) file_list[${#file_list[@]}]="$opt"
+        *)
+            file_list[${#file_list[@]}]="$opt"
            case "$opt" in
-       *.asm) uses_asm=true;;
+                 *.asm) uses_asm=true
+                 ;;
            esac
+        ;;
    esac
 done
 outfile=${outfile:-/dev/stdout}
@@ -278,11 +275,7 @@ done

 # List Keyword for this target
 case "$target" in
-    x86*)
-        keyword="ManagedCProj"
-    ;;
-    arm*|iwmmx*)
-        keyword="Win32Proj"
+    x86*) keyword="ManagedCProj"
    ;;
    *) die "Unsupported target $target!"
 esac
@@ -298,26 +291,7 @@ case "$target" in
        asm_Debug_cmdline="yasm -Xvc -g cv8 -f \$(PlatformName) ${yasmincs} &quot;\$(InputPath)&quot;"
        asm_Release_cmdline="yasm -Xvc -f \$(PlatformName) ${yasmincs} &quot;\$(InputPath)&quot;"
    ;;
-    arm*|iwmmx*)
-        case "${name}" in
-        obj_int_extract) platforms[0]="Win32"
-        ;;
-        *) platforms[0]="Pocket PC 2003 (ARMV4)"
-        ;;
-        esac
-    ;;
    *) die "Unsupported target $target!"
-esac
-
-# List Command-line Arguments for this target
-case "$target" in
-    arm*|iwmmx*)
-        if [ "$name" == "example" ];then
-            ARGU="--codec vp6 --flipuv --progress _bnd.vp6"
-        fi
-        if [ "$name" == "xma" ];then
-            ARGU="--codec vp6 -h 240 -w 320 -v"
-        fi
    ;;
 esac

@@ -336,7 +310,7 @@ generate_vcproj() {
        Name="${name}" \
        ProjectGUID="{${guid}}" \
        RootNamespace="${name}" \
-                  Keyword="${keyword}"
+        Keyword="${keyword}" \

    open_tag Platforms
    for plat in "${platforms[@]}"; do
@@ -348,21 +322,6 @@ generate_vcproj() {
    case "$target" in
        x86*) $uses_asm && tag ToolFile RelativePath="$self_dirname/../x86-msvs/yasm.rules"
        ;;
-        arm*|iwmmx*)
-            if [ "$name" == "vpx" ];then
-            case "$target" in
-                armv5*)
-                    tag ToolFile RelativePath="$self_dirname/../arm-wince-vs8/armasmv5.rules"
-                ;;
-                armv6*)
-                    tag ToolFile RelativePath="$self_dirname/../arm-wince-vs8/armasmv6.rules"
-                ;;
-                iwmmxt*)
-                    tag ToolFile RelativePath="$self_dirname/../arm-wince-vs8/armasmxscale.rules"
-                ;;
-            esac
-            fi
-        ;;
    esac
    close_tag ToolFiles

@@ -374,68 +333,28 @@ generate_vcproj() {
            OutputDirectory="\$(SolutionDir)$plat_no_ws/\$(ConfigurationName)" \
            IntermediateDirectory="$plat_no_ws/\$(ConfigurationName)/${name}" \
            ConfigurationType="$vs_ConfigurationType" \
-                      CharacterSet="1"
-
-        if [ "$target" == "armv6-wince-vs8" ] || [ "$target" == "armv5te-wince-vs8" ] || [ "$target" == "iwmmxt-wince-vs8" ] || [ "$target" == "iwmmxt2-wince-vs8" ];then
-            case "$name" in
-                vpx)         tag Tool \
-                             Name="VCPreBuildEventTool" \
-                             CommandLine="call obj_int_extract.bat \$(ConfigurationName)"
-                             tag Tool \
-                             Name="VCMIDLTool" \
-                             TargetEnvironment="1"
-                             tag Tool \
-                             Name="VCCLCompilerTool" \
-                             ExecutionBucket="7" \
-                             Optimization="0" \
-                             AdditionalIncludeDirectories="$incs" \
-                             PreprocessorDefinitions="_DEBUG;_WIN32_WCE=\$(CEVER);UNDER_CE;\$(PLATFORMDEFINES);WINCE;DEBUG;_LIB;\$(ARCHFAM);\$(_ARCHFAM_);_UNICODE;UNICODE;" \
-                             MinimalRebuild="true" \
-                             RuntimeLibrary="1" \
-                             BufferSecurityCheck="false" \
-                             UsePrecompiledHeader="0" \
-                             WarningLevel="3" \
-                             DebugInformationFormat="1" \
-                             CompileAs="1"
-                             tag Tool \
-                             Name="VCResourceCompilerTool" \
-                             PreprocessorDefinitions="_DEBUG;_WIN32_WCE=\$(CEVER);UNDER_CE;\$(PLATFORMDEFINES)" \
-                             Culture="1033" \
-                             AdditionalIncludeDirectories="\$(IntDir)" \
-                ;;
-                example|xma) tag Tool \
-                             Name="VCCLCompilerTool" \
-                             ExecutionBucket="7" \
-                             Optimization="0" \
-                             AdditionalIncludeDirectories="$incs" \
-                             PreprocessorDefinitions="_DEBUG;_WIN32_WCE=\$(CEVER);UNDER_CE;\$(PLATFORMDEFINES);WINCE;DEBUG;_CONSOLE;\$(ARCHFAM);\$(_ARCHFAM_);_UNICODE;UNICODE;" \
-                             MinimalRebuild="true" \
-                             RuntimeLibrary="1" \
-                             BufferSecurityCheck="false" \
-                             UsePrecompiledHeader="0" \
-                             WarningLevel="3" \
-                             DebugInformationFormat="1" \
-                             CompileAs="1"
-                             tag Tool \
-                             Name="VCResourceCompilerTool" \
-                             PreprocessorDefinitions="_DEBUG;_WIN32_WCE=\$(CEVER);UNDER_CE;\$(PLATFORMDEFINES)" \
-                             Culture="1033" \
-                             AdditionalIncludeDirectories="\$(IntDir)" \
-                ;;
-                obj_int_extract) tag Tool \
-                             Name="VCCLCompilerTool" \
-                             Optimization="0" \
-                             AdditionalIncludeDirectories="$incs" \
-                             PreprocessorDefinitions="WIN32;DEBUG;_CONSOLE" \
-                             RuntimeLibrary="1" \
-                             WarningLevel="3" \
-                             DebugInformationFormat="1" \
-                ;;
-            esac
-        fi
+            CharacterSet="1" \

        case "$target" in
-            x86*) tag Tool \
+            x86*)
+                case "$name" in
+                    obj_int_extract)
+                        tag Tool \
+                            Name="VCCLCompilerTool" \
+                            Optimization="0" \
+                            AdditionalIncludeDirectories="$incs" \
+                            PreprocessorDefinitions="WIN32;DEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_NO_DEPRECATE" \
+                            RuntimeLibrary="$debug_runtime" \
+                            WarningLevel="3" \
+                            Detect64BitPortabilityProblems="true" \
+                            DebugInformationFormat="1" \
+                    ;;
+                    vpx)
+                        tag Tool \
+                            Name="VCPreBuildEventTool" \
+                            CommandLine="call obj_int_extract.bat $src_path_bare" \
+
+                        tag Tool \
                            Name="VCCLCompilerTool" \
                            Optimization="0" \
                            AdditionalIncludeDirectories="$incs" \
@@ -446,42 +365,44 @@ generate_vcproj() {
                            DebugInformationFormat="1" \
                            Detect64BitPortabilityProblems="true" \

-                $uses_asm && tag Tool Name="YASM"  IncludePaths="$incs" Debug="1"
+                        $uses_asm && tag Tool Name="YASM"  IncludePaths="$incs" Debug="true"
+                    ;;
+                    *)
+                        tag Tool \
+                            Name="VCCLCompilerTool" \
+                            Optimization="0" \
+                            AdditionalIncludeDirectories="$incs" \
+                            PreprocessorDefinitions="WIN32;_DEBUG;_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_NO_DEPRECATE;$defines" \
+                            RuntimeLibrary="$debug_runtime" \
+                            UsePrecompiledHeader="0" \
+                            WarningLevel="3" \
+                            DebugInformationFormat="1" \
+                            Detect64BitPortabilityProblems="true" \
+
+                        $uses_asm && tag Tool Name="YASM"  IncludePaths="$incs" Debug="true"
+                    ;;
+                esac
            ;;
        esac

        case "$proj_kind" in
            exe)
                case "$target" in
-                    x86*) tag Tool \
+                    x86*)
+                        case "$name" in
+                            obj_int_extract)
+                                tag Tool \
+                                    Name="VCLinkerTool" \
+                                    OutputFile="${name}.exe" \
+                                    GenerateDebugInformation="true" \
+                            ;;
+                            *)
+                                tag Tool \
                                    Name="VCLinkerTool" \
                                    AdditionalDependencies="$debug_libs \$(NoInherit)" \
                                    AdditionalLibraryDirectories="$libdirs" \
                                    GenerateDebugInformation="true" \
                                    ProgramDatabaseFile="\$(OutDir)/${name}.pdb" \
-
-                    ;;
-                    arm*|iwmmx*)
-                        case "$name" in
-                            obj_int_extract) tag Tool \
-                                Name="VCLinkerTool" \
-                                OutputFile="${name}.exe" \
-                                GenerateDebugInformation="true"
-                            ;;
-                            *) tag Tool \
-                                Name="VCLinkerTool" \
-                                AdditionalDependencies="$debug_libs" \
-                                OutputFile="\$(OutDir)/${name}.exe" \
-                                LinkIncremental="2" \
-                                AdditionalLibraryDirectories="${libdirs};&quot;..\lib/$plat_no_ws&quot;" \
-                                DelayLoadDLLs="\$(NOINHERIT)" \
-                                GenerateDebugInformation="true" \
-                                ProgramDatabaseFile="\$(OutDir)/${name}.pdb" \
-                                SubSystem="9" \
-                                StackReserveSize="65536" \
-                                StackCommitSize="4096" \
-                                EntryPointSymbol="mainWCRTStartup" \
-                                TargetMachine="3"
                            ;;
                        esac
                    ;;
@@ -489,41 +410,27 @@ generate_vcproj() {
            ;;
            lib)
                case "$target" in
-                      arm*|iwmmx*) tag Tool \
-                                    Name="VCLibrarianTool" \
-                                    AdditionalOptions=" /subsystem:windowsce,4.20 /machine:ARM" \
-                                    OutputFile="\$(OutDir)/${name}.lib" \
-                                ;;
-                                *) tag Tool \
+                    x86*)
+                        tag Tool \
                            Name="VCLibrarianTool" \
                            OutputFile="\$(OutDir)/${name}${lib_sfx}d.lib" \
+
                    ;;
                esac
            ;;
-            dll) tag Tool \
+            dll)
+                tag Tool \
                    Name="VCLinkerTool" \
                    AdditionalDependencies="\$(NoInherit)" \
                    LinkIncremental="2" \
                    GenerateDebugInformation="true" \
                    AssemblyDebug="1" \
                    TargetMachine="1" \
-                      $link_opts
+                    $link_opts \
+
+            ;;
        esac

-        if [ "$target" == "armv6-wince-vs8" ] || [ "$target" == "armv5te-wince-vs8" ] || [ "$target" == "iwmmxt-wince-vs8" ] || [ "$target" == "iwmmxt2-wince-vs8" ];then
-            case "$name" in
-                vpx)         tag DeploymentTool \
-                             ForceDirty="-1" \
-                             RegisterOutput="0"
-                                ;;
-                example|xma) tag DeploymentTool \
-                             ForceDirty="-1" \
-                             RegisterOutput="0"
-                             tag DebuggerTool \
-                             Arguments="${ARGU}"
-                                ;;
-            esac
-        fi
        close_tag Configuration

        open_tag Configuration \
@@ -532,118 +439,79 @@ generate_vcproj() {
            IntermediateDirectory="$plat_no_ws/\$(ConfigurationName)/${name}" \
            ConfigurationType="$vs_ConfigurationType" \
            CharacterSet="1" \
-                      WholeProgramOptimization="0"
+            WholeProgramOptimization="0" \

-        if [ "$target" == "armv6-wince-vs8" ] || [ "$target" == "armv5te-wince-vs8" ] || [ "$target" == "iwmmxt-wince-vs8" ] || [ "$target" == "iwmmxt2-wince-vs8" ];then
+        case "$target" in
+            x86*)
                case "$name" in
-                vpx)         tag Tool \
-                                     Name="VCPreBuildEventTool" \
-                                     CommandLine="call obj_int_extract.bat \$(ConfigurationName)"
-                             tag Tool \
-                                     Name="VCMIDLTool" \
-                                     TargetEnvironment="1"
+                    obj_int_extract)
                        tag Tool \
                            Name="VCCLCompilerTool" \
-                                             ExecutionBucket="7" \
                            Optimization="2" \
-                                             FavorSizeOrSpeed="1" \
+                            FavorSizeorSpeed="1" \
                            AdditionalIncludeDirectories="$incs" \
-                                             PreprocessorDefinitions="NDEBUG;_WIN32_WCE=\$(CEVER);UNDER_CE;\$(PLATFORMDEFINES);WINCE;_LIB;\$(ARCHFAM);\$(_ARCHFAM_);_UNICODE;UNICODE;" \
-                                             RuntimeLibrary="0" \
-                                             BufferSecurityCheck="false" \
-                                             UsePrecompiledHeader="0" \
-                                             WarningLevel="3" \
-                                             DebugInformationFormat="0" \
-                                             CompileAs="1"
-                             tag Tool \
-                                             Name="VCResourceCompilerTool" \
-                                             PreprocessorDefinitions="NDEBUG;_WIN32_WCE=\$(CEVER);UNDER_CE;\$(PLATFORMDEFINES)" \
-                                             Culture="1033" \
-                                             AdditionalIncludeDirectories="\$(IntDir)" \
-                ;;
-                example|xma) tag Tool \
-                             Name="VCCLCompilerTool" \
-                             ExecutionBucket="7" \
-                             Optimization="2" \
-                             FavorSizeOrSpeed="1" \
-                             AdditionalIncludeDirectories="$incs" \
-                             PreprocessorDefinitions="NDEBUG;_WIN32_WCE=\$(CEVER);UNDER_CE;\$(PLATFORMDEFINES);WINCE;_CONSOLE;\$(ARCHFAM);\$(_ARCHFAM_);_UNICODE;UNICODE;" \
-                             RuntimeLibrary="0" \
-                             BufferSecurityCheck="false" \
-                             UsePrecompiledHeader="0" \
-                             WarningLevel="3" \
-                             DebugInformationFormat="0" \
-                             CompileAs="1"
-                             tag Tool \
-                             Name="VCResourceCompilerTool" \
-                             PreprocessorDefinitions="NDEBUG;_WIN32_WCE=\$(CEVER);UNDER_CE;\$(PLATFORMDEFINES)" \
-                             Culture="1033" \
-                             AdditionalIncludeDirectories="\$(IntDir)" \
-                ;;
-                obj_int_extract) tag Tool \
-                             Name="VCCLCompilerTool" \
-                             AdditionalIncludeDirectories="$incs" \
-                             PreprocessorDefinitions="WIN32;NDEBUG;_CONSOLE" \
-                             RuntimeLibrary="0" \
+                            PreprocessorDefinitions="WIN32;NDEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_NO_DEPRECATE" \
+                            RuntimeLibrary="$release_runtime" \
                            UsePrecompiledHeader="0" \
                            WarningLevel="3" \
                            Detect64BitPortabilityProblems="true" \
                            DebugInformationFormat="0" \
                    ;;
-            esac
-        fi
+                    vpx)
+                        tag Tool \
+                            Name="VCPreBuildEventTool" \
+                            CommandLine="call obj_int_extract.bat $src_path_bare" \

-    case "$target" in
-        x86*) tag       Tool \
+                        tag Tool \
                            Name="VCCLCompilerTool" \
+                            Optimization="2" \
+                            FavorSizeorSpeed="1" \
                            AdditionalIncludeDirectories="$incs" \
                            PreprocessorDefinitions="WIN32;NDEBUG;_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_NO_DEPRECATE;$defines" \
                            RuntimeLibrary="$release_runtime" \
                            UsePrecompiledHeader="0" \
                            WarningLevel="3" \
                            DebugInformationFormat="0" \
-                      Detect64BitPortabilityProblems="true"
+                            Detect64BitPortabilityProblems="true" \

                        $uses_asm && tag Tool Name="YASM"  IncludePaths="$incs"
                    ;;
+                    *)
+                        tag Tool \
+                            Name="VCCLCompilerTool" \
+                            AdditionalIncludeDirectories="$incs" \
+                            Optimization="2" \
+                            FavorSizeorSpeed="1" \
+                            PreprocessorDefinitions="WIN32;NDEBUG;_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_NO_DEPRECATE;$defines" \
+                            RuntimeLibrary="$release_runtime" \
+                            UsePrecompiledHeader="0" \
+                            WarningLevel="3" \
+                            DebugInformationFormat="0" \
+                            Detect64BitPortabilityProblems="true" \
+
+                        $uses_asm && tag Tool Name="YASM"  IncludePaths="$incs"
+                    ;;
+                esac
+            ;;
        esac

        case "$proj_kind" in
            exe)
                case "$target" in
-                    x86*) tag Tool \
+                    x86*)
+                        case "$name" in
+                            obj_int_extract)
+                                tag Tool \
+                                    Name="VCLinkerTool" \
+                                    OutputFile="${name}.exe" \
+                                    GenerateDebugInformation="true" \
+                            ;;
+                            *)
+                                tag Tool \
                                    Name="VCLinkerTool" \
                                    AdditionalDependencies="$libs \$(NoInherit)" \
                                    AdditionalLibraryDirectories="$libdirs" \
-                    ;;
-                    arm*|iwmmx*)
-                        case "$name" in
-                            obj_int_extract) tag Tool \
-                                Name="VCLinkerTool" \
-                                OutputFile="${name}.exe" \
-                                LinkIncremental="1" \
-                                GenerateDebugInformation="false" \
-                                SubSystem="0" \
-                                OptimizeReferences="0" \
-                                EnableCOMDATFolding="0" \
-                                TargetMachine="0"
-                            ;;
-                            *) tag Tool \
-                                Name="VCLinkerTool" \
-                                AdditionalDependencies="$libs" \
-                                OutputFile="\$(OutDir)/${name}.exe" \
-                                LinkIncremental="1" \
-                                AdditionalLibraryDirectories="${libdirs};&quot;..\lib/$plat_no_ws&quot;" \
-                                DelayLoadDLLs="\$(NOINHERIT)" \
-                                GenerateDebugInformation="true" \
-                                ProgramDatabaseFile="\$(OutDir)/${name}.pdb" \
-                                SubSystem="9" \
-                                StackReserveSize="65536" \
-                                StackCommitSize="4096" \
-                                OptimizeReferences="2" \
-                                EnableCOMDATFolding="2" \
-                                EntryPointSymbol="mainWCRTStartup" \
-                                TargetMachine="3"
+
                            ;;
                        esac
                    ;;
@@ -651,14 +519,11 @@ generate_vcproj() {
            ;;
            lib)
                case "$target" in
-                      arm*|iwmmx*) tag Tool \
-                                    Name="VCLibrarianTool" \
-                                    AdditionalOptions=" /subsystem:windowsce,4.20 /machine:ARM" \
-                                    OutputFile="\$(OutDir)/${name}.lib" \
-                                ;;
-                                *) tag Tool \
+                    x86*)
+                        tag Tool \
                            Name="VCLibrarianTool" \
                            OutputFile="\$(OutDir)/${name}${lib_sfx}.lib" \
+
                    ;;
                esac
            ;;
@@ -669,31 +534,18 @@ generate_vcproj() {
                    LinkIncremental="1" \
                    GenerateDebugInformation="true" \
                    TargetMachine="1" \
-                      $link_opts
-        esac
+                    $link_opts \

-        if [ "$target" == "armv6-wince-vs8" ] || [ "$target" == "armv5te-wince-vs8" ] || [ "$target" == "iwmmxt-wince-vs8" ] || [ "$target" == "iwmmxt2-wince-vs8" ];then
-            case "$name" in
-                vpx)         tag DeploymentTool \
-                             ForceDirty="-1" \
-                             RegisterOutput="0"
-                ;;
-                example|xma) tag DeploymentTool \
-                             ForceDirty="-1" \
-                             RegisterOutput="0"
-                             tag DebuggerTool \
-                             Arguments="${ARGU}"
            ;;
        esac
-        fi

        close_tag Configuration
    done
    close_tag Configurations

    open_tag Files
-    generate_filter srcs   "Source Files"   "cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx"
-    generate_filter hdrs   "Header Files"   "h;hpp;hxx;hm;inl;inc;xsd"
+    generate_filter srcs   "Source Files"   "c;def;odl;idl;hpj;bat;asm;asmx"
+    generate_filter hdrs   "Header Files"   "h;hm;inl;inc;xsd"
    generate_filter resrcs "Resource Files" "rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav"
    generate_filter resrcs "Build Files"    "mk"
    close_tag Files
--- a/build/make/gen_msvs_sln.sh
+++ b/build/make/gen_msvs_sln.sh
@@ -139,9 +139,6 @@ process_global() {
            echo "${indent}${proj_guid}.${config}.ActiveCfg = ${config}"
            echo "${indent}${proj_guid}.${config}.Build.0 = ${config}"

-            if [ "$target" == "armv6-wince-vs8" ] || [ "$target" == "armv5te-wince-vs8" ] || [ "$target" == "iwmmxt-wince-vs8" ] || [ "$target" == "iwmmxt2-wince-vs8" ];then
-                echo "${indent}${proj_guid}.${config}.Deploy.0 = ${config}"
-            fi
        done
        IFS=${IFS_bak}
    done
--- a/build/make/obj_int_extract.c
+++ b/build/make/obj_int_extract.c
--- a/build/x86-msvs/obj_int_extract.bat
+++ b/build/x86-msvs/obj_int_extract.bat
@@ -0,0 +1,15 @@
+REM   Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+REM
+REM   Use of this source code is governed by a BSD-style license
+REM   that can be found in the LICENSE file in the root of the source
+REM   tree. An additional intellectual property rights grant can be found
+REM   in the file PATENTS.  All contributing project authors may
+REM   be found in the AUTHORS file in the root of the source tree.
+echo on
+
+cl /I "./" /I "%1" /nologo /c "%1/vp8/common/asm_com_offsets.c"
+cl /I "./" /I "%1" /nologo /c "%1/vp8/decoder/asm_dec_offsets.c"
+cl /I "./" /I "%1" /nologo /c "%1/vp8/encoder/asm_enc_offsets.c"
+obj_int_extract.exe rvds "asm_com_offsets.obj" > "asm_com_offsets.asm"
+obj_int_extract.exe rvds "asm_dec_offsets.obj" > "asm_dec_offsets.asm"
+obj_int_extract.exe rvds "asm_enc_offsets.obj" > "asm_enc_offsets.asm"
--- a/46
+++ b/46
@@ -31,16 +31,18 @@ Advanced options:
  ${toggle_md5}                   support for output of checksum data
  ${toggle_static_msvcrt}         use static MSVCRT (VS builds only)
  ${toggle_vp8}                   VP8 codec support
-  ${toggle_psnr}                  output of PSNR data, if supported (encoders)
+  ${toggle_internal_stats}        output of encoder internal stats for debug, if supported (encoders)
  ${toggle_mem_tracker}           track memory usage
  ${toggle_postproc}              postprocessing
  ${toggle_multithread}           multithreaded encoding and decoding.
  ${toggle_spatial_resampling}    spatial sampling (scaling) support
  ${toggle_realtime_only}         enable this option while building for real-time encoding
+  ${toggle_error_concealment}     enable this option to get a decoder which is able to conceal losses
  ${toggle_runtime_cpu_detect}    runtime cpu detection
  ${toggle_shared}                shared library support
+  ${toggle_static}                static library support
  ${toggle_small}                 favor smaller size over speed
-  ${toggle_arm_asm_detok}         assembly version of the detokenizer (ARM platforms only)
+  ${toggle_postproc_visualizer}   macro block / block level visualizers

 Codecs:
  Codecs can be selectively enabled or disabled individually, or by family:
@@ -78,22 +80,21 @@ EOF
 # alphabetically by architecture, generic-gnu last.
 all_platforms="${all_platforms} armv5te-linux-rvct"
 all_platforms="${all_platforms} armv5te-linux-gcc"
+all_platforms="${all_platforms} armv5te-none-rvct"
 all_platforms="${all_platforms} armv5te-symbian-gcc"
-all_platforms="${all_platforms} armv5te-wince-vs8"
 all_platforms="${all_platforms} armv6-darwin-gcc"
 all_platforms="${all_platforms} armv6-linux-rvct"
 all_platforms="${all_platforms} armv6-linux-gcc"
+all_platforms="${all_platforms} armv6-none-rvct"
 all_platforms="${all_platforms} armv6-symbian-gcc"
-all_platforms="${all_platforms} armv6-wince-vs8"
 all_platforms="${all_platforms} iwmmxt-linux-rvct"
 all_platforms="${all_platforms} iwmmxt-linux-gcc"
-all_platforms="${all_platforms} iwmmxt-wince-vs8"
 all_platforms="${all_platforms} iwmmxt2-linux-rvct"
 all_platforms="${all_platforms} iwmmxt2-linux-gcc"
-all_platforms="${all_platforms} iwmmxt2-wince-vs8"
 all_platforms="${all_platforms} armv7-darwin-gcc"    #neon Cortex-A8
 all_platforms="${all_platforms} armv7-linux-rvct"    #neon Cortex-A8
 all_platforms="${all_platforms} armv7-linux-gcc"     #neon Cortex-A8
+all_platforms="${all_platforms} armv7-none-rvct"     #neon Cortex-A8
 all_platforms="${all_platforms} mips32-linux-gcc"
 all_platforms="${all_platforms} ppc32-darwin8-gcc"
 all_platforms="${all_platforms} ppc32-darwin9-gcc"
@@ -114,6 +115,7 @@ all_platforms="${all_platforms} x86-win32-vs7"
 all_platforms="${all_platforms} x86-win32-vs8"
 all_platforms="${all_platforms} x86-win32-vs9"
 all_platforms="${all_platforms} x86_64-darwin9-gcc"
+all_platforms="${all_platforms} x86_64-darwin10-gcc"
 all_platforms="${all_platforms} x86_64-linux-gcc"
 all_platforms="${all_platforms} x86_64-linux-icc"
 all_platforms="${all_platforms} x86_64-solaris-gcc"
@@ -152,11 +154,13 @@ enabled doxygen && php -v >/dev/null 2>&1 && enable install_docs
 enable install_bins
 enable install_libs

+enable static
 enable optimizations
 enable fast_unaligned #allow unaligned accesses, if supported by hw
 enable md5
 enable spatial_resampling
 enable multithread
+enable os_support

 [ -d ${source_path}/../include ] && enable alt_tree_layout
 for d in vp8; do
@@ -199,6 +203,7 @@ ARCH_EXT_LIST="
    sse2
    sse3
    ssse3
+    sse4_1

    altivec
 "
@@ -209,6 +214,7 @@ HAVE_LIST="
    alt_tree_layout
    pthread_h
    sys_mman_h
+    unistd_h
 "
 CONFIG_LIST="
    external_build
@@ -238,7 +244,7 @@ CONFIG_LIST="
    runtime_cpu_detect
    postproc
    multithread
-    psnr
+    internal_stats
    ${CODECS}
    ${CODEC_FAMILIES}
    encoders
@@ -246,9 +252,12 @@ CONFIG_LIST="
    static_msvcrt
    spatial_resampling
    realtime_only
+    error_concealment
    shared
+    static
    small
-    arm_asm_detok
+    postproc_visualizer
+    os_support
 "
 CMDLINE_SELECT="
    extra_warnings
@@ -278,16 +287,18 @@ CMDLINE_SELECT="
    dc_recon
    postproc
    multithread
-    psnr
+    internal_stats
    ${CODECS}
    ${CODEC_FAMILIES}
    static_msvcrt
    mem_tracker
    spatial_resampling
    realtime_only
+    error_concealment
    shared
+    static
    small
-    arm_asm_detok
+    postproc_visualizer
 "

 process_cmdline() {
@@ -295,7 +306,7 @@ process_cmdline() {
        optval="${opt#*=}"
        case "$opt" in
        --disable-codecs) for c in ${CODECS}; do disable $c; done ;;
-        *) process_common_cmdline $opt
+        *) process_common_cmdline "$opt"
        ;;
        esac
    done
@@ -324,8 +335,6 @@ post_process_cmdline() {
    for c in ${CODECS}; do
        enabled ${c} && enable ${c##*_}s
    done
-
-
 }


@@ -376,6 +385,7 @@ process_targets() {
    if [ -f "${source_path}/build/make/version.sh" ]; then
        local ver=`"$source_path/build/make/version.sh" --bare $source_path`
        DIST_DIR="${DIST_DIR}-${ver}"
+        VERSION_STRING=${ver}
        ver=${ver%%-*}
        VERSION_PATCH=${ver##*.}
        ver=${ver%.*}
@@ -384,6 +394,8 @@ process_targets() {
        VERSION_MAJOR=${ver%.*}
    fi
    enabled child || cat <<EOF >> config.mk
+
+PREFIX=${prefix}
 ifeq (\$(MAKECMDGOALS),dist)
 DIST_DIR?=${DIST_DIR}
 else
@@ -391,6 +403,8 @@ DIST_DIR?=\$(DESTDIR)${prefix}
 endif
 LIBSUBDIR=${libdir##${prefix}/}

+VERSION_STRING=${VERSION_STRING}
+
 VERSION_MAJOR=${VERSION_MAJOR}
 VERSION_MINOR=${VERSION_MINOR}
 VERSION_PATCH=${VERSION_PATCH}
@@ -485,7 +499,7 @@ process_toolchain() {
        check_add_cflags -Wpointer-arith
        check_add_cflags -Wtype-limits
        check_add_cflags -Wcast-qual
-        enabled extra_warnings || check_add_cflags -Wno-unused
+        enabled extra_warnings || check_add_cflags -Wno-unused-function
    fi

    if enabled icc; then
@@ -535,6 +549,10 @@ process_toolchain() {

    # Other toolchain specific defaults
    case $toolchain in x86*|ppc*|universal*) soft_enable postproc;; esac
+
+    if enabled postproc_visualizer; then
+        enabled postproc || die "postproc_visualizer requires postproc to be enabled"
+    fi
 }


--- a/docs.mk
+++ b/docs.mk
@@ -34,7 +34,8 @@ TXT_DOX = $(call enabled,TXT_DOX)

 EXAMPLE_PATH += $(SRC_PATH_BARE) #for CHANGELOG, README, etc

-doxyfile: libs.doxy_template libs.doxy examples.doxy
+doxyfile: $(if $(findstring examples, $(ALL_TARGETS)),examples.doxy)
+doxyfile: libs.doxy_template libs.doxy
 	@echo "    [CREATE] $@"
 	@cat $^ > $@
 	@echo "STRIP_FROM_PATH += $(SRC_PATH_BARE) $(BUILD_ROOT)" >> $@
--- a/examples.mk
+++ b/examples.mk
@@ -17,6 +17,7 @@ vpxdec.SRCS                 += md5_utils.c md5_utils.h
 vpxdec.SRCS                 += vpx_ports/vpx_timer.h
 vpxdec.SRCS                 += vpx/vpx_integer.h
 vpxdec.SRCS                 += args.c args.h vpx_ports/config.h
+vpxdec.SRCS                 += tools_common.c tools_common.h
 vpxdec.SRCS                 += nestegg/halloc/halloc.h
 vpxdec.SRCS                 += nestegg/halloc/src/align.h
 vpxdec.SRCS                 += nestegg/halloc/src/halloc.c
@@ -28,6 +29,7 @@ vpxdec.GUID                  = BA5FE66F-38DD-E034-F542-B1578C5FB950
 vpxdec.DESCRIPTION           = Full featured decoder
 UTILS-$(CONFIG_ENCODERS)    += vpxenc.c
 vpxenc.SRCS                 += args.c args.h y4minput.c y4minput.h
+vpxenc.SRCS                 += tools_common.c tools_common.h
 vpxenc.SRCS                 += vpx_ports/config.h vpx_ports/mem_ops.h
 vpxenc.SRCS                 += vpx_ports/mem_ops_aligned.h
 vpxenc.SRCS                 += libmkv/EbmlIDs.h
@@ -75,6 +77,11 @@ GEN_EXAMPLES-$(CONFIG_ENCODERS) += decode_with_drops.c
 endif
 decode_with_drops.GUID           = CE5C53C4-8DDA-438A-86ED-0DDD3CDB8D26
 decode_with_drops.DESCRIPTION    = Drops frames while decoding
+ifeq ($(CONFIG_DECODERS),yes)
+GEN_EXAMPLES-$(CONFIG_ERROR_CONCEALMENT) += decode_with_partial_drops.c
+endif
+decode_with_partial_drops.GUID           = 61C2D026-5754-46AC-916F-1343ECC5537E
+decode_with_partial_drops.DESCRIPTION    = Drops parts of frames while decoding
 GEN_EXAMPLES-$(CONFIG_ENCODERS) += error_resilient.c
 error_resilient.GUID             = DF5837B9-4145-4F92-A031-44E4F832E00C
 error_resilient.DESCRIPTION      = Error Resiliency Feature
@@ -91,8 +98,16 @@ vp8cx_set_ref.DESCRIPTION           = VP8 set encoder reference frame


 # Handle extra library flags depending on codec configuration
-CODEC_EXTRA_LIBS-$(CONFIG_VP8)         += m

+# We should not link to math library (libm) on RVCT
+# when building for bare-metal targets
+ifeq ($(CONFIG_OS_SUPPORT), yes)
+CODEC_EXTRA_LIBS-$(CONFIG_VP8)         += m
+else
+    ifeq ($(CONFIG_GCC), yes)
+    CODEC_EXTRA_LIBS-$(CONFIG_VP8)         += m
+    endif
+endif
 #
 # End of specified files. The rest of the build rules should happen
 # automagically from here.
@@ -112,8 +127,8 @@ else
    LIB_PATH := $(call enabled,LIB_PATH)
    INC_PATH := $(call enabled,INC_PATH)
 endif
-CFLAGS += $(addprefix -I,$(INC_PATH))
-LDFLAGS += $(addprefix -L,$(LIB_PATH))
+INTERNAL_CFLAGS = $(addprefix -I,$(INC_PATH))
+INTERNAL_LDFLAGS += $(addprefix -L,$(LIB_PATH))


 # Expand list of selected examples to build (as specified above)
@@ -152,8 +167,10 @@ BINS-$(NOT_MSVS)           += $(addprefix $(BUILD_PFX),$(ALL_EXAMPLES:.c=))

 # Instantiate linker template for all examples.
 CODEC_LIB=$(if $(CONFIG_DEBUG_LIBS),vpx_g,vpx)
+CODEC_LIB_SUF=$(if $(CONFIG_SHARED),.so,.a)
 $(foreach bin,$(BINS-yes),\
-    $(if $(BUILD_OBJS),$(eval $(bin): $(LIB_PATH)/lib$(CODEC_LIB).a))\
+    $(if $(BUILD_OBJS),$(eval $(bin):\
+        $(LIB_PATH)/lib$(CODEC_LIB)$(CODEC_LIB_SUF)))\
    $(if $(BUILD_OBJS),$(eval $(call linker_template,$(bin),\
        $(call objs,$($(notdir $(bin)).SRCS)) \
        -l$(CODEC_LIB) $(addprefix -l,$(CODEC_EXTRA_LIBS))\
@@ -204,7 +221,8 @@ $(1): $($(1:.vcproj=).SRCS)
            --ver=$$(CONFIG_VS_VERSION)\
            --proj-guid=$$($$(@:.vcproj=).GUID)\
            $$(if $$(CONFIG_STATIC_MSVCRT),--static-crt) \
-            --out=$$@ $$(CFLAGS) $$(LDFLAGS) -l$$(CODEC_LIB) -lwinmm $$^
+            --out=$$@ $$(INTERNAL_CFLAGS) $$(CFLAGS) \
+            $$(INTERNAL_LDFLAGS) $$(LDFLAGS) -l$$(CODEC_LIB) -lwinmm $$^
 endef
 PROJECTS-$(CONFIG_MSVS) += $(ALL_EXAMPLES:.c=.vcproj)
 INSTALL-BINS-$(CONFIG_MSVS) += $(foreach p,$(VS_PLATFORMS),\
--- a/examples/decode_with_partial_drops.txt
+++ b/examples/decode_with_partial_drops.txt
@@ -0,0 +1,238 @@
+@TEMPLATE decoder_tmpl.c
+Decode With Partial Drops Example
+=========================
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ INTRODUCTION
+This is an example utility which drops a series of frames (or parts of frames),
+as specified on the command line. This is useful for observing the error
+recovery features of the codec.
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ INTRODUCTION
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ EXTRA_INCLUDES
+#include <time.h>
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ EXTRA_INCLUDES
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ HELPERS
+struct parsed_header
+{
+    char key_frame;
+    int version;
+    char show_frame;
+    int first_part_size;
+};
+
+int next_packet(struct parsed_header* hdr, int pos, int length, int mtu)
+{
+    int size = 0;
+    int remaining = length - pos;
+    /* Uncompressed part is 3 bytes for P frames and 10 bytes for I frames */
+    int uncomp_part_size = (hdr->key_frame ? 10 : 3);
+    /* number of bytes yet to send from header and the first partition */
+    int remainFirst = uncomp_part_size + hdr->first_part_size - pos;
+    if (remainFirst > 0)
+    {
+        if (remainFirst <= mtu)
+        {
+            size = remainFirst;
+        }
+        else
+        {
+            size = mtu;
+        }
+
+        return size;
+    }
+
+    /* second partition; just slot it up according to MTU */
+    if (remaining <= mtu)
+    {
+        size = remaining;
+        return size;
+    }
+    return mtu;
+}
+
+void throw_packets(unsigned char* frame, int* size, int loss_rate,
+                   int* thrown, int* kept)
+{
+    unsigned char loss_frame[256*1024];
+    int pkg_size = 1;
+    int pos = 0;
+    int loss_pos = 0;
+    struct parsed_header hdr;
+    unsigned int tmp;
+    int mtu = 1500;
+
+    if (*size < 3)
+    {
+        return;
+    }
+    putc('|', stdout);
+    /* parse uncompressed 3 bytes */
+    tmp = (frame[2] << 16) | (frame[1] << 8) | frame[0];
+    hdr.key_frame = !(tmp & 0x1); /* inverse logic */
+    hdr.version = (tmp >> 1) & 0x7;
+    hdr.show_frame = (tmp >> 4) & 0x1;
+    hdr.first_part_size = (tmp >> 5) & 0x7FFFF;
+
+    /* don't drop key frames */
+    if (hdr.key_frame)
+    {
+        int i;
+        *kept = *size/mtu + ((*size % mtu > 0) ? 1 : 0); /* approximate */
+        for (i=0; i < *kept; i++)
+            putc('.', stdout);
+        return;
+    }
+
+    while ((pkg_size = next_packet(&hdr, pos, *size, mtu)) > 0)
+    {
+        int loss_event = ((rand() + 1.0)/(RAND_MAX + 1.0) < loss_rate/100.0);
+        if (*thrown == 0 && !loss_event)
+        {
+            memcpy(loss_frame + loss_pos, frame + pos, pkg_size);
+            loss_pos += pkg_size;
+            (*kept)++;
+            putc('.', stdout);
+        }
+        else
+        {
+            (*thrown)++;
+            putc('X', stdout);
+        }
+        pos += pkg_size;
+    }
+    memcpy(frame, loss_frame, loss_pos);
+    memset(frame + loss_pos, 0, *size - loss_pos);
+    *size = loss_pos;
+}
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ HELPERS
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ DEC_INIT
+/* Initialize codec */
+flags = VPX_CODEC_USE_ERROR_CONCEALMENT;
+res = vpx_codec_dec_init(&codec, interface, &dec_cfg, flags);
+if(res)
+    die_codec(&codec, "Failed to initialize decoder");
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ DEC_INIT
+
+Usage
+-----
+This example adds a single argument to the `simple_decoder` example,
+which specifies the range or pattern of frames to drop. The parameter is
+parsed as follows:
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ USAGE
+if(argc < 4 || argc > 6)
+    die("Usage: %s <infile> <outfile> [-t <num threads>] <N-M|N/M|L,S>\n",
+        argv[0]);
+{
+    char *nptr;
+    int arg_num = 3;
+    if (argc == 6 && strncmp(argv[arg_num++], "-t", 2) == 0)
+        dec_cfg.threads = strtol(argv[arg_num++], NULL, 0);
+    n = strtol(argv[arg_num], &nptr, 0);
+    mode = (*nptr == '\0' || *nptr == ',') ? 2 : (*nptr == '-') ? 1 : 0;
+
+    m = strtol(nptr+1, NULL, 0);
+    if((!n && !m) || (*nptr != '-' && *nptr != '/' &&
+        *nptr != '\0' && *nptr != ','))
+        die("Couldn't parse pattern %s\n", argv[3]);
+}
+seed = (m > 0) ? m : (unsigned int)time(NULL);
+srand(seed);thrown_frame = 0;
+printf("Seed: %u\n", seed);
+printf("Threads: %d\n", dec_cfg.threads);
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ USAGE
+
+
+Dropping A Range Of Frames
+--------------------------
+To drop a range of frames, specify the starting frame and the ending
+frame to drop, separated by a dash. The following command will drop
+frames 5 through 10 (base 1).
+
+  $ ./decode_with_partial_drops in.ivf out.i420 5-10
+
+
+Dropping A Pattern Of Frames
+----------------------------
+To drop a pattern of frames, specify the number of frames to drop and
+the number of frames after which to repeat the pattern, separated by
+a forward-slash. The following command will drop 3 of 7 frames.
+Specifically, it will decode 4 frames, then drop 3 frames, and then
+repeat.
+
+  $ ./decode_with_partial_drops in.ivf out.i420 3/7
+
+Dropping Random Parts Of Frames
+-------------------------------
+A third argument tuple is available to split the frame into 1500 bytes pieces
+and randomly drop pieces rather than frames. The frame will be split at
+partition boundaries where possible. The following example will seed the RNG
+with the seed 123 and drop approximately 5% of the pieces. Pieces which
+are depending on an already dropped piece will also be dropped.
+
+  $ ./decode_with_partial_drops in.ivf out.i420 5,123
+
+
+Extra Variables
+---------------
+This example maintains the pattern passed on the command line in the
+`n`, `m`, and `is_range` variables:
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ EXTRA_VARS
+int              n, m, mode;
+unsigned int     seed;
+int              thrown=0, kept=0;
+int              thrown_frame=0, kept_frame=0;
+vpx_codec_dec_cfg_t  dec_cfg = {0};
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ EXTRA_VARS
+
+
+Making The Drop Decision
+------------------------
+The example decides whether to drop the frame based on the current
+frame number, immediately before decoding the frame.
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ PRE_DECODE
+/* Decide whether to throw parts of the frame or the whole frame
+   depending on the drop mode */
+thrown_frame = 0;
+kept_frame = 0;
+switch (mode)
+{
+case 0:
+    if (m - (frame_cnt-1)%m <= n)
+    {
+        frame_sz = 0;
+    }
+    break;
+case 1:
+    if (frame_cnt >= n && frame_cnt <= m)
+    {
+        frame_sz = 0;
+    }
+    break;
+case 2:
+    throw_packets(frame, &frame_sz, n, &thrown_frame, &kept_frame);
+    break;
+default: break;
+}
+if (mode < 2)
+{
+    if (frame_sz == 0)
+    {
+        putc('X', stdout);
+        thrown_frame++;
+    }
+    else
+    {
+        putc('.', stdout);
+        kept_frame++;
+    }
+}
+thrown += thrown_frame;
+kept += kept_frame;
+fflush(stdout);
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ PRE_DECODE
--- a/examples/decoder_tmpl.c
+++ b/examples/decoder_tmpl.c
@@ -19,7 +19,7 @@
 #define VPX_CODEC_DISABLE_COMPAT 1
 #include "vpx/vpx_decoder.h"
 #include "vpx/vp8dx.h"
-#define interface (&vpx_codec_vp8_dx_algo)
+#define interface (vpx_codec_vp8_dx())
@EXTRA_INCLUDES


@@ -42,6 +42,8 @@ static void die(const char *fmt, ...) {

@DIE_CODEC

+@HELPERS
+
 int main(int argc, char **argv) {
    FILE            *infile, *outfile;
    vpx_codec_ctx_t  codec;
--- a/examples/decoder_tmpl.txt
+++ b/examples/decoder_tmpl.txt
@@ -2,7 +2,7 @@
 #define VPX_CODEC_DISABLE_COMPAT 1
 #include "vpx/vpx_decoder.h"
 #include "vpx/vp8dx.h"
-#define interface (&vpx_codec_vp8_dx_algo)
+#define interface (vpx_codec_vp8_dx())
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ DEC_INCLUDES


--- a/examples/encoder_tmpl.c
+++ b/examples/encoder_tmpl.c
@@ -19,7 +19,7 @@
 #define VPX_CODEC_DISABLE_COMPAT 1
 #include "vpx/vpx_encoder.h"
 #include "vpx/vp8cx.h"
-#define interface (&vpx_codec_vp8_cx_algo)
+#define interface (vpx_codec_vp8_cx())
 #define fourcc    0x30385056
@EXTRA_INCLUDES

@@ -111,8 +111,6 @@ int main(int argc, char **argv) {
    vpx_codec_ctx_t      codec;
    vpx_codec_enc_cfg_t  cfg;
    int                  frame_cnt = 0;
-    unsigned char        file_hdr[IVF_FILE_HDR_SZ];
-    unsigned char        frame_hdr[IVF_FRAME_HDR_SZ];
    vpx_image_t          raw;
    vpx_codec_err_t      res;
    long                 width;
--- a/examples/encoder_tmpl.txt
+++ b/examples/encoder_tmpl.txt
@@ -2,7 +2,7 @@
 #define VPX_CODEC_DISABLE_COMPAT 1
 #include "vpx/vpx_encoder.h"
 #include "vpx/vp8cx.h"
-#define interface (&vpx_codec_vp8_cx_algo)
+#define interface (vpx_codec_vp8_cx())
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ENC_INCLUDES


--- a/examples/postproc.txt
+++ b/examples/postproc.txt
@@ -21,7 +21,7 @@ res = vpx_codec_dec_init(&codec, interface, NULL,
 if(res == VPX_CODEC_INCAPABLE) {
    printf("NOTICE: Postproc not supported by %s\n",
           vpx_codec_iface_name(interface));
-    res = vpx_codec_dec_init(&codec, interface, NULL, 0);
+    res = vpx_codec_dec_init(&codec, interface, NULL, flags);
 }
 if(res)
    die_codec(&codec, "Failed to initialize decoder");
--- a/examples/simple_decoder.txt
+++ b/examples/simple_decoder.txt
@@ -33,7 +33,7 @@ Initializing The Codec
 ----------------------
 The decoder is initialized by the following code. This is an example for
 the VP8 decoder, but the code is analogous for all algorithms. Replace
-`&vpx_codec_vp8_dx_algo` with a pointer to the interface exposed by the
+`vpx_codec_vp8_dx()` with a pointer to the interface exposed by the
 algorithm you want to use. The `cfg` argument is left as NULL in this
 example, because we want the algorithm to determine the stream
 configuration (width/height) and allocate memory automatically. This
--- a/examples/vp8_set_maps.txt
+++ b/examples/vp8_set_maps.txt
@@ -78,8 +78,8 @@ if(frame_cnt + 1 == 22) {
 } else if(frame_cnt + 1 == 44) {
    vpx_active_map_t  active;

-    active.rows = 240/16;
-    active.cols = 320/16;
+    active.rows = cfg.g_h/16;
+    active.cols = cfg.g_w/16;

    /* pass in null map to disable active_map*/
    active.active_map = NULL;
--- a/libmkv/EbmlIDs.h
+++ b/libmkv/EbmlIDs.h
@@ -120,7 +120,7 @@ enum mkv
    //video
    Video = 0xE0,
    FlagInterlaced = 0x9A,
-//  StereoMode = 0x53B8,
+    StereoMode = 0x53B8,
    PixelWidth = 0xB0,
    PixelHeight = 0xBA,
    PixelCropBottom = 0x54AA,
--- a/libmkv/EbmlWriter.c
+++ b/libmkv/EbmlWriter.c
@@ -11,6 +11,7 @@
 #include <stdlib.h>
 #include <wchar.h>
 #include <string.h>
+#include <limits.h>
 #if defined(_MSC_VER)
 #define LITERALU64(n) n
 #else
@@ -33,7 +34,7 @@ void Ebml_WriteLen(EbmlGlobal *glob, long long val)

    val |= (LITERALU64(0x000000000000080) << ((size - 1) * 7));

-    Ebml_Serialize(glob, (void *) &val, size);
+    Ebml_Serialize(glob, (void *) &val, sizeof(val), size);
 }

 void Ebml_WriteString(EbmlGlobal *glob, const char *str)
@@ -60,21 +61,26 @@ void Ebml_WriteUTF8(EbmlGlobal *glob, const wchar_t *wstr)

 void Ebml_WriteID(EbmlGlobal *glob, unsigned long class_id)
 {
+    int len;
+
    if (class_id >= 0x01000000)
-        Ebml_Serialize(glob, (void *)&class_id, 4);
+        len = 4;
    else if (class_id >= 0x00010000)
-        Ebml_Serialize(glob, (void *)&class_id, 3);
+        len = 3;
    else if (class_id >= 0x00000100)
-        Ebml_Serialize(glob, (void *)&class_id, 2);
+        len = 2;
    else
-        Ebml_Serialize(glob, (void *)&class_id, 1);
+        len = 1;
+
+    Ebml_Serialize(glob, (void *)&class_id, sizeof(class_id), len);
 }
+
 void Ebml_SerializeUnsigned64(EbmlGlobal *glob, unsigned long class_id, uint64_t ui)
 {
    unsigned char sizeSerialized = 8 | 0x80;
    Ebml_WriteID(glob, class_id);
-    Ebml_Serialize(glob, &sizeSerialized, 1);
-    Ebml_Serialize(glob, &ui, 8);
+    Ebml_Serialize(glob, &sizeSerialized, sizeof(sizeSerialized), 1);
+    Ebml_Serialize(glob, &ui, sizeof(ui), 8);
 }

 void Ebml_SerializeUnsigned(EbmlGlobal *glob, unsigned long class_id, unsigned long ui)
@@ -97,8 +103,8 @@ void Ebml_SerializeUnsigned(EbmlGlobal *glob, unsigned long class_id, unsigned l
    }

    sizeSerialized = 0x80 | size;
-    Ebml_Serialize(glob, &sizeSerialized, 1);
-    Ebml_Serialize(glob, &ui, size);
+    Ebml_Serialize(glob, &sizeSerialized, sizeof(sizeSerialized), 1);
+    Ebml_Serialize(glob, &ui, sizeof(ui), size);
 }
 //TODO: perhaps this is a poor name for this id serializer helper function
 void Ebml_SerializeBinary(EbmlGlobal *glob, unsigned long class_id, unsigned long bin)
@@ -119,14 +125,14 @@ void Ebml_SerializeFloat(EbmlGlobal *glob, unsigned long class_id, double d)
    unsigned char len = 0x88;

    Ebml_WriteID(glob, class_id);
-    Ebml_Serialize(glob, &len, 1);
-    Ebml_Serialize(glob,  &d, 8);
+    Ebml_Serialize(glob, &len, sizeof(len), 1);
+    Ebml_Serialize(glob,  &d, sizeof(d), 8);
 }

 void Ebml_WriteSigned16(EbmlGlobal *glob, short val)
 {
    signed long out = ((val & 0x003FFFFF) | 0x00200000) << 8;
-    Ebml_Serialize(glob, &out, 3);
+    Ebml_Serialize(glob, &out, sizeof(out), 3);
 }

 void Ebml_SerializeString(EbmlGlobal *glob, unsigned long class_id, const char *s)
@@ -143,7 +149,6 @@ void Ebml_SerializeUTF8(EbmlGlobal *glob, unsigned long class_id, wchar_t *s)

 void Ebml_SerializeData(EbmlGlobal *glob, unsigned long class_id, unsigned char *data, unsigned long data_length)
 {
-    unsigned char size = 4;
    Ebml_WriteID(glob, class_id);
    Ebml_WriteLen(glob, data_length);
    Ebml_Write(glob,  data, data_length);
--- a/libmkv/EbmlWriter.h
+++ b/libmkv/EbmlWriter.h
@@ -15,7 +15,7 @@
 #include "vpx/vpx_integer.h"

 typedef struct EbmlGlobal EbmlGlobal;
-void  Ebml_Serialize(EbmlGlobal *glob, const void *, unsigned long);
+void  Ebml_Serialize(EbmlGlobal *glob, const void *, int, unsigned long);
 void  Ebml_Write(EbmlGlobal *glob, const void *, unsigned long);
 /////

--- a/libmkv/WebMElement.c
+++ b/libmkv/WebMElement.c
@@ -35,11 +35,11 @@ void writeSimpleBlock(EbmlGlobal *glob, unsigned char trackNumber, short timeCod
    Ebml_WriteID(glob, SimpleBlock);
    unsigned long blockLength = 4 + dataLength;
    blockLength |= 0x10000000; //TODO check length < 0x0FFFFFFFF
-    Ebml_Serialize(glob, &blockLength, 4);
+    Ebml_Serialize(glob, &blockLength, sizeof(blockLength), 4);
    trackNumber |= 0x80;  //TODO check track nubmer < 128
    Ebml_Write(glob, &trackNumber, 1);
    //Ebml_WriteSigned16(glob, timeCode,2); //this is 3 bytes
-    Ebml_Serialize(glob, &timeCode, 2);
+    Ebml_Serialize(glob, &timeCode, sizeof(timeCode), 2);
    unsigned char flags = 0x00 | (isKeyframe ? 0x80 : 0x00) | (lacingFlag << 1) | discardable;
    Ebml_Write(glob, &flags, 1);
    Ebml_Write(glob, data, dataLength);
--- a/libs.mk
+++ b/libs.mk
@@ -9,7 +9,13 @@
 ##


-ASM:=$(if $(filter yes,$(CONFIG_GCC)),.asm.s,.asm)
+# ARM assembly files are written in RVCT-style. We use some make magic to
+# filter those files to allow GCC compilation
+ifeq ($(ARCH_ARM),yes)
+  ASM:=$(if $(filter yes,$(CONFIG_GCC)),.asm.s,.asm)
+else
+  ASM:=.asm
+endif

 CODEC_SRCS-yes += libs.mk

@@ -29,6 +35,7 @@ ifeq ($(CONFIG_VP8_ENCODER),yes)
  CODEC_SRCS-yes += $(addprefix $(VP8_PREFIX),$(call enabled,VP8_CX_SRCS))
  CODEC_EXPORTS-yes += $(addprefix $(VP8_PREFIX),$(VP8_CX_EXPORTS))
  CODEC_SRCS-yes += $(VP8_PREFIX)vp8cx.mk vpx/vp8.h vpx/vp8cx.h vpx/vp8e.h
+  CODEC_SRCS-$(ARCH_ARM) += $(VP8_PREFIX)vp8cx_arm.mk
  INSTALL-LIBS-yes += include/vpx/vp8.h include/vpx/vp8e.h include/vpx/vp8cx.h
  INSTALL_MAPS += include/vpx/% $(SRC_PATH_BARE)/$(VP8_PREFIX)/%
  CODEC_DOC_SRCS += vpx/vp8.h vpx/vp8cx.h
@@ -41,6 +48,7 @@ ifeq ($(CONFIG_VP8_DECODER),yes)
  CODEC_SRCS-yes += $(addprefix $(VP8_PREFIX),$(call enabled,VP8_DX_SRCS))
  CODEC_EXPORTS-yes += $(addprefix $(VP8_PREFIX),$(VP8_DX_EXPORTS))
  CODEC_SRCS-yes += $(VP8_PREFIX)vp8dx.mk vpx/vp8.h vpx/vp8dx.h
+  CODEC_SRCS-$(ARCH_ARM) += $(VP8_PREFIX)vp8dx_arm.mk
  INSTALL-LIBS-yes += include/vpx/vp8.h include/vpx/vp8dx.h
  INSTALL_MAPS += include/vpx/% $(SRC_PATH_BARE)/$(VP8_PREFIX)/%
  CODEC_DOC_SRCS += vpx/vp8.h vpx/vp8dx.h
@@ -83,6 +91,7 @@ $(eval $(if $(filter universal%,$(TOOLCHAIN)),LIPO_LIBVPX,BUILD_LIBVPX):=yes)

 CODEC_SRCS-$(BUILD_LIBVPX) += build/make/version.sh
 CODEC_SRCS-$(BUILD_LIBVPX) += vpx/vpx_integer.h
+CODEC_SRCS-$(BUILD_LIBVPX) += vpx_ports/asm_offsets.h
 CODEC_SRCS-$(BUILD_LIBVPX) += vpx_ports/vpx_timer.h
 CODEC_SRCS-$(BUILD_LIBVPX) += vpx_ports/mem.h
 CODEC_SRCS-$(BUILD_LIBVPX) += $(BUILD_PFX)vpx_config.c
@@ -94,7 +103,7 @@ CODEC_SRCS-$(BUILD_LIBVPX) += vpx_ports/x86_abi_support.asm
 CODEC_SRCS-$(BUILD_LIBVPX) += vpx_ports/x86_cpuid.c
 endif
 CODEC_SRCS-$(ARCH_ARM) += vpx_ports/arm_cpudetect.c
-CODEC_SRCS-$(ARCH_ARM) += $(BUILD_PFX)vpx_config.asm
+CODEC_SRCS-$(ARCH_ARM) += vpx_ports/arm.h
 CODEC_EXPORTS-$(BUILD_LIBVPX) += vpx/exports_com
 CODEC_EXPORTS-$(CONFIG_ENCODERS) += vpx/exports_enc
 CODEC_EXPORTS-$(CONFIG_DECODERS) += vpx/exports_dec
@@ -115,7 +124,7 @@ INSTALL-LIBS-$(CONFIG_SHARED) += $(foreach p,$(VS_PLATFORMS),$(LIBSUBDIR)/$(p)/v
 INSTALL-LIBS-$(CONFIG_SHARED) += $(foreach p,$(VS_PLATFORMS),$(LIBSUBDIR)/$(p)/vpx.exp)
 endif
 else
-INSTALL-LIBS-yes += $(LIBSUBDIR)/libvpx.a
+INSTALL-LIBS-$(CONFIG_STATIC) += $(LIBSUBDIR)/libvpx.a
 INSTALL-LIBS-$(CONFIG_DEBUG_LIBS) += $(LIBSUBDIR)/libvpx_g.a
 endif

@@ -123,31 +132,33 @@ CODEC_SRCS=$(call enabled,CODEC_SRCS)
 INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(CODEC_SRCS)
 INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(call enabled,CODEC_EXPORTS)

+
+# Generate a list of all enabled sources, in particular for exporting to gyp
+# based build systems.
+libvpx_srcs.txt:
+	@echo "    [CREATE] $@"
+	@echo $(CODEC_SRCS) | xargs -n1 echo | sort -u > $@
+
+
 ifeq ($(CONFIG_EXTERNAL_BUILD),yes)
 ifeq ($(CONFIG_MSVS),yes)

-ifeq ($(ARCH_ARM),yes)
-ifeq ($(HAVE_ARMV5TE),yes)
-ARM_ARCH=v5
-endif
-ifeq ($(HAVE_ARMV6),yes)
-ARM_ARCH=v6
-endif
 obj_int_extract.vcproj: $(SRC_PATH_BARE)/build/make/obj_int_extract.c
-	@cp $(SRC_PATH_BARE)/build/arm-wince-vs8/obj_int_extract.bat .
+	@cp $(SRC_PATH_BARE)/build/x86-msvs/obj_int_extract.bat .
 	@echo "    [CREATE] $@"
-	$(SRC_PATH_BARE)/build/make/gen_msvs_proj.sh\
-			--exe\
-			--target=$(TOOLCHAIN)\
+	$(SRC_PATH_BARE)/build/make/gen_msvs_proj.sh \
+    --exe \
+    --target=$(TOOLCHAIN) \
+    --name=obj_int_extract \
+    --ver=$(CONFIG_VS_VERSION) \
+    --proj-guid=E1360C65-D375-4335-8057-7ED99CC3F9B2 \
    $(if $(CONFIG_STATIC_MSVCRT),--static-crt) \
-            --name=obj_int_extract\
-            --proj-guid=E1360C65-D375-4335-8057-7ED99CC3F9B2\
-            --out=$@ $^\
-            -I".&quot;;&quot;$(SRC_PATH_BARE)"
+    --out=$@ $^ \
+    -I. \
+    -I"$(SRC_PATH_BARE)" \

 PROJECTS-$(BUILD_LIBVPX) += obj_int_extract.vcproj
 PROJECTS-$(BUILD_LIBVPX) += obj_int_extract.bat
-endif

 vpx.def: $(call enabled,CODEC_EXPORTS)
 	@echo "    [CREATE] $@"
@@ -158,15 +169,16 @@ CLEAN-OBJS += vpx.def

 vpx.vcproj: $(CODEC_SRCS) vpx.def
 	@echo "    [CREATE] $@"
-	$(SRC_PATH_BARE)/build/make/gen_msvs_proj.sh\
-			--lib\
-			--target=$(TOOLCHAIN)\
+	$(SRC_PATH_BARE)/build/make/gen_msvs_proj.sh \
+			--lib \
+			--target=$(TOOLCHAIN) \
            $(if $(CONFIG_STATIC_MSVCRT),--static-crt) \
-            --name=vpx\
-            --proj-guid=DCE19DAF-69AC-46DB-B14A-39F0FAA5DB74\
-            --module-def=vpx.def\
-            --ver=$(CONFIG_VS_VERSION)\
-            --out=$@ $(CFLAGS) $^\
+            --name=vpx \
+            --proj-guid=DCE19DAF-69AC-46DB-B14A-39F0FAA5DB74 \
+            --module-def=vpx.def \
+            --ver=$(CONFIG_VS_VERSION) \
+            --out=$@ $(CFLAGS) $^ \
+            --src-path-bare="$(SRC_PATH_BARE)" \

 PROJECTS-$(BUILD_LIBVPX) += vpx.vcproj

@@ -176,14 +188,15 @@ endif
 else
 LIBVPX_OBJS=$(call objs,$(CODEC_SRCS))
 OBJS-$(BUILD_LIBVPX) += $(LIBVPX_OBJS)
-LIBS-$(BUILD_LIBVPX) += $(BUILD_PFX)libvpx.a $(BUILD_PFX)libvpx_g.a
+LIBS-$(if $(BUILD_LIBVPX),$(CONFIG_STATIC)) += $(BUILD_PFX)libvpx.a $(BUILD_PFX)libvpx_g.a
 $(BUILD_PFX)libvpx_g.a: $(LIBVPX_OBJS)

 BUILD_LIBVPX_SO         := $(if $(BUILD_LIBVPX),$(CONFIG_SHARED))
 LIBVPX_SO               := libvpx.so.$(VERSION_MAJOR).$(VERSION_MINOR).$(VERSION_PATCH)
-LIBS-$(BUILD_LIBVPX_SO) += $(BUILD_PFX)$(LIBVPX_SO)
+LIBS-$(BUILD_LIBVPX_SO) += $(BUILD_PFX)$(LIBVPX_SO)\
+                           $(notdir $(LIBVPX_SO_SYMLINKS))
 $(BUILD_PFX)$(LIBVPX_SO): $(LIBVPX_OBJS) libvpx.ver
-$(BUILD_PFX)$(LIBVPX_SO): extralibs += -lm -pthread
+$(BUILD_PFX)$(LIBVPX_SO): extralibs += -lm
 $(BUILD_PFX)$(LIBVPX_SO): SONAME = libvpx.so.$(VERSION_MAJOR)
 $(BUILD_PFX)$(LIBVPX_SO): SO_VERSION_SCRIPT = libvpx.ver
 LIBVPX_SO_SYMLINKS      := $(addprefix $(LIBSUBDIR)/, \
@@ -197,12 +210,41 @@ libvpx.ver: $(call enabled,CODEC_EXPORTS)
 	$(qexec)echo "local: *; };" >> $@
 CLEAN-OBJS += libvpx.ver

-$(addprefix $(DIST_DIR)/,$(LIBVPX_SO_SYMLINKS)):
-	@echo "    [LN]      $@"
-	$(qexec)ln -sf $(LIBVPX_SO) $@
+define libvpx_symlink_template
+$(1): $(2)
+	@echo "    [LN]      $$@"
+	$(qexec)ln -sf $(LIBVPX_SO) $$@
+endef
+
+$(eval $(call libvpx_symlink_template,\
+    $(addprefix $(BUILD_PFX),$(notdir $(LIBVPX_SO_SYMLINKS))),\
+    $(BUILD_PFX)$(LIBVPX_SO)))
+$(eval $(call libvpx_symlink_template,\
+    $(addprefix $(DIST_DIR)/,$(LIBVPX_SO_SYMLINKS)),\
+    $(DIST_DIR)/$(LIBSUBDIR)/$(LIBVPX_SO)))

 INSTALL-LIBS-$(CONFIG_SHARED) += $(LIBVPX_SO_SYMLINKS)
 INSTALL-LIBS-$(CONFIG_SHARED) += $(LIBSUBDIR)/$(LIBVPX_SO)
+
+LIBS-$(BUILD_LIBVPX) += vpx.pc
+vpx.pc: config.mk libs.mk
+	@echo "    [CREATE] $@"
+	$(qexec)echo '# pkg-config file from libvpx $(VERSION_STRING)' > $@
+	$(qexec)echo 'prefix=$(PREFIX)' >> $@
+	$(qexec)echo 'exec_prefix=$${prefix}' >> $@
+	$(qexec)echo 'libdir=$${prefix}/lib' >> $@
+	$(qexec)echo 'includedir=$${prefix}/include' >> $@
+	$(qexec)echo '' >> $@
+	$(qexec)echo 'Name: vpx' >> $@
+	$(qexec)echo 'Description: WebM Project VPx codec implementation' >> $@
+	$(qexec)echo 'Version: $(VERSION_MAJOR).$(VERSION_MINOR).$(VERSION_PATCH)' >> $@
+	$(qexec)echo 'Requires:' >> $@
+	$(qexec)echo 'Conflicts:' >> $@
+	$(qexec)echo 'Libs: -L$${libdir} -lvpx' >> $@
+	$(qexec)echo 'Cflags: -I$${includedir}' >> $@
+INSTALL-LIBS-yes += $(LIBSUBDIR)/pkgconfig/vpx.pc
+INSTALL_MAPS += $(LIBSUBDIR)/pkgconfig/%.pc %.pc
+CLEAN-OBJS += vpx.pc
 endif

 LIBS-$(LIPO_LIBVPX) += libvpx.a
@@ -230,9 +272,52 @@ endif
 #
 # Add assembler dependencies for configuration and offsets
 #
-#$(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)vpx_config.asm $(BUILD_PFX)vpx_asm_offsets.asm
 $(filter %.s.o,$(OBJS-yes)):     $(BUILD_PFX)vpx_config.asm
-$(filter %.asm.o,$(OBJS-yes)): $(BUILD_PFX)vpx_config.asm
+$(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)vpx_config.asm
+
+#
+# Calculate platform- and compiler-specific offsets for hand coded assembly
+#
+
+ifeq ($(filter icc gcc,$(TGT_CC)), $(TGT_CC))
+    $(BUILD_PFX)asm_com_offsets.asm: $(BUILD_PFX)$(VP8_PREFIX)common/asm_com_offsets.c.S
+	grep EQU $< | tr -d '$$\#' $(ADS2GAS) > $@
+    $(BUILD_PFX)$(VP8_PREFIX)common/asm_com_offsets.c.S: $(VP8_PREFIX)common/asm_com_offsets.c
+    CLEAN-OBJS += $(BUILD_PFX)asm_com_offsets.asm $(BUILD_PFX)$(VP8_PREFIX)common/asm_com_offsets.c.S
+
+    $(BUILD_PFX)asm_enc_offsets.asm: $(BUILD_PFX)$(VP8_PREFIX)encoder/asm_enc_offsets.c.S
+	grep EQU $< | tr -d '$$\#' $(ADS2GAS) > $@
+    $(BUILD_PFX)$(VP8_PREFIX)encoder/asm_enc_offsets.c.S: $(VP8_PREFIX)encoder/asm_enc_offsets.c
+    CLEAN-OBJS += $(BUILD_PFX)asm_enc_offsets.asm $(BUILD_PFX)$(VP8_PREFIX)encoder/asm_enc_offsets.c.S
+
+    $(BUILD_PFX)asm_dec_offsets.asm: $(BUILD_PFX)$(VP8_PREFIX)decoder/asm_dec_offsets.c.S
+	grep EQU $< | tr -d '$$\#' $(ADS2GAS) > $@
+    $(BUILD_PFX)$(VP8_PREFIX)decoder/asm_dec_offsets.c.S: $(VP8_PREFIX)decoder/asm_dec_offsets.c
+    CLEAN-OBJS += $(BUILD_PFX)asm_dec_offsets.asm $(BUILD_PFX)$(VP8_PREFIX)decoder/asm_dec_offsets.c.S
+else
+  ifeq ($(filter rvct,$(TGT_CC)), $(TGT_CC))
+    asm_com_offsets.asm: obj_int_extract
+    asm_com_offsets.asm: $(VP8_PREFIX)common/asm_com_offsets.c.o
+	./obj_int_extract rvds $< $(ADS2GAS) > $@
+    OBJS-yes += $(VP8_PREFIX)common/asm_com_offsets.c.o
+    CLEAN-OBJS += asm_com_offsets.asm
+    $(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)asm_com_offsets.asm
+
+    asm_enc_offsets.asm: obj_int_extract
+    asm_enc_offsets.asm: $(VP8_PREFIX)encoder/asm_enc_offsets.c.o
+	./obj_int_extract rvds $< $(ADS2GAS) > $@
+    OBJS-yes += $(VP8_PREFIX)encoder/asm_enc_offsets.c.o
+    CLEAN-OBJS += asm_enc_offsets.asm
+    $(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)asm_enc_offsets.asm
+
+    asm_dec_offsets.asm: obj_int_extract
+    asm_dec_offsets.asm: $(VP8_PREFIX)decoder/asm_dec_offsets.c.o
+	./obj_int_extract rvds $< $(ADS2GAS) > $@
+    OBJS-yes += $(VP8_PREFIX)decoder/asm_dec_offsets.c.o
+    CLEAN-OBJS += asm_dec_offsets.asm
+    $(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)asm_dec_offsets.asm
+  endif
+endif

 $(shell $(SRC_PATH_BARE)/build/make/version.sh "$(SRC_PATH_BARE)" $(BUILD_PFX)vpx_version.h)
 CLEAN-OBJS += $(BUILD_PFX)vpx_version.h
--- a/mainpage.dox
+++ b/mainpage.dox
@@ -31,7 +31,7 @@
  The WebM project is an open source project supported by its community. For
  questions about this SDK, please mail the apps-devel@webmproject.org list.
  To contribute, see http://www.webmproject.org/code/contribute and mail
-  vpx-devel@webmproject.org.
+  codec-devel@webmproject.org.
 */

 /*!\page changelog CHANGELOG
--- a/md5_utils.c
+++ b/md5_utils.c
@@ -20,8 +20,6 @@
 * Still in the public domain.
 */

-#include <sys/types.h>    /* for stupid systems */
-
 #include <string.h>   /* for memcpy() */

 #include "md5_utils.h"
--- a/solution.mk
+++ b/solution.mk
@@ -9,38 +9,13 @@
 ##


-ifeq ($(ARCH_ARM),yes)
-ARM_DEVELOP=no
-ARM_DEVELOP:=$(if $(filter %vpx.vcproj,$(wildcard *.vcproj)),yes)
-
-ifeq ($(ARM_DEVELOP),yes)
-vpx.sln:
-	@echo "    [COPY] $@"
-	@cp $(SRC_PATH_BARE)/build/arm-wince-vs8/vpx.sln .
-PROJECTS-yes += vpx.sln
-else
-vpx.sln: $(wildcard *.vcproj)
-	@echo "    [CREATE] $@"
-	$(SRC_PATH_BARE)/build/make/gen_msvs_sln.sh \
-            $(if $(filter %vpx.vcproj,$^),--dep=vpxdec:vpx) \
-            $(if $(filter %vpx.vcproj,$^),--dep=xma:vpx) \
-            --ver=$(CONFIG_VS_VERSION)\
-            --target=$(TOOLCHAIN)\
-            --out=$@ $^
-vpx.sln.mk: vpx.sln
-	@true
-
-PROJECTS-yes += vpx.sln vpx.sln.mk
-include vpx.sln.mk
-endif
-
-else
 vpx.sln: $(wildcard *.vcproj)
 	@echo "    [CREATE] $@"
 	$(SRC_PATH_BARE)/build/make/gen_msvs_sln.sh \
            $(if $(filter %vpx.vcproj,$^),\
-                $(foreach vcp,$(filter-out %vpx.vcproj,$^),\
+                $(foreach vcp,$(filter-out %vpx.vcproj %obj_int_extract.vcproj,$^),\
                  --dep=$(vcp:.vcproj=):vpx)) \
+            --dep=vpx:obj_int_extract \
            --ver=$(CONFIG_VS_VERSION)\
            --out=$@ $^
 vpx.sln.mk: vpx.sln
@@ -48,7 +23,6 @@ vpx.sln.mk: vpx.sln

 PROJECTS-yes += vpx.sln vpx.sln.mk
 -include vpx.sln.mk
-endif

 # Always install this file, as it is an unconditional post-build rule.
 INSTALL_MAPS += src/%     $(SRC_PATH_BARE)/%
--- a/vp8/common/vpx_ref_build_prefix.h
+++ b/vp8/common/vpx_ref_build_prefix.h
@@ -7,18 +7,18 @@
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
-
-
-#ifndef _VPX_REF_BUILD_PREFIX_h
-#define _VPX_REF_BUILD_PREFIX_h
-
-#if defined(__cplusplus)
-extern "C" {
+#include <stdio.h>
+#include "tools_common.h"
+#ifdef _WIN32
+#include <io.h>
+#include <fcntl.h>
 #endif

-
-#if defined(__cplusplus)
+FILE* set_binary_mode(FILE *stream)
+{
+    (void)stream;
+#ifdef _WIN32
+    _setmode(_fileno(stream), _O_BINARY);
+#endif
+    return stream;
 }
-#endif
-
-#endif /* include guards */
--- a/vp8/common/vpxerrors.h
+++ b/vp8/common/vpxerrors.h
@@ -7,7 +7,10 @@
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
+#ifndef TOOLS_COMMON_H
+#define TOOLS_COMMON_H

+/* Sets a stdio stream into binary mode */
+FILE* set_binary_mode(FILE *stream);

-
-#define ALLOC_FAILURE -2
+#endif
--- a/usage.dox
+++ b/usage.dox
@@ -25,7 +25,7 @@
    codec may write into to store details about a single instance of that codec.
    Most of the context is implementation specific, and thus opaque to the
    application. The context structure as seen by the application is of fixed
-    size, and thus can be allocated eith with automatic storage or dynamically
+    size, and thus can be allocated with automatic storage or dynamically
    on the heap.

    Most operations require an initialized codec context. Codec context
@@ -74,7 +74,7 @@
    the ABI is versioned. The ABI version number must be passed at
    initialization time to ensure the application is using a header file that
    matches the library. The current ABI version number is stored in the
-    prepropcessor macros #VPX_CODEC_ABI_VERSION, #VPX_ENCODER_ABI_VERSION, and
+    preprocessor macros #VPX_CODEC_ABI_VERSION, #VPX_ENCODER_ABI_VERSION, and
    #VPX_DECODER_ABI_VERSION. For convenience, each initialization function has
    a wrapper macro that inserts the correct version number. These macros are
    named like the initialization methods, but without the _ver suffix.
@@ -125,7 +125,7 @@

    The special value <code>0</code> is reserved to represent an infinite
    deadline. In this case, the codec will perform as much processing as
-    possible to yeild the highest quality frame.
+    possible to yield the highest quality frame.

    By convention, the value <code>1</code> is used to mean "return as fast as
    possible."
@@ -135,7 +135,7 @@

 /*! \page usage_xma External Memory Allocation
    Applications that wish to have fine grained control over how and where
-    decoders allocate memory \ref MAY make use of the e_xternal Memory Allocation
+    decoders allocate memory \ref MAY make use of the eXternal Memory Allocation
    (XMA) interface. Not all codecs support the XMA \ref usage_features.

    To use a decoder in XMA mode, the decoder \ref MUST be initialized with the
@@ -143,7 +143,7 @@
    allocate is heavily dependent on the size of the encoded video frames. The
    size of the video must be known before requesting the decoder's memory map.
    This stream information can be obtained with the vpx_codec_peek_stream_info()
-    function, which does not require a contructed decoder context. If the exact
+    function, which does not require a constructed decoder context. If the exact
    stream is not known, a stream info structure can be created that reflects
    the maximum size that the decoder instance is required to support.

@@ -175,7 +175,7 @@
    \section usage_xma_seg_szalign Segment Size and Alignment
    The sz (size) and align (alignment) parameters describe the required size
    and alignment of the requested segment. Alignment will always be a power of
-    two. Applications \ref MUST honor the aligment requested. Failure to do so
+    two. Applications \ref MUST honor the alignment requested. Failure to do so
    could result in program crashes or may incur a speed penalty.

    \section usage_xma_seg_flags Segment Flags
--- a/vp8/common/alloccommon.c
+++ b/vp8/common/alloccommon.c
@@ -16,18 +16,20 @@
 #include "findnearmv.h"
 #include "entropymode.h"
 #include "systemdependent.h"
-#include "vpxerrors.h"


 extern  void vp8_init_scan_order_mask();

-void vp8_update_mode_info_border(MODE_INFO *mi, int rows, int cols)
+static void update_mode_info_border(MODE_INFO *mi, int rows, int cols)
 {
    int i;
    vpx_memset(mi - cols - 2, 0, sizeof(MODE_INFO) * (cols + 1));

    for (i = 0; i < rows; i++)
    {
+        /* TODO(holmer): Bug? This updates the last element of each row
+         * rather than the border element!
+         */
        vpx_memset(&mi[i*cols-1], 0, sizeof(MODE_INFO));
    }
 }
@@ -44,9 +46,11 @@ void vp8_de_alloc_frame_buffers(VP8_COMMON *oci)

    vpx_free(oci->above_context);
    vpx_free(oci->mip);
+    vpx_free(oci->prev_mip);

    oci->above_context = 0;
    oci->mip = 0;
+    oci->prev_mip = 0;

 }

@@ -66,12 +70,12 @@ int vp8_alloc_frame_buffers(VP8_COMMON *oci, int width, int height)

    for (i = 0; i < NUM_YV12_BUFFERS; i++)
    {
-      oci->fb_idx_ref_cnt[0] = 0;
-
+        oci->fb_idx_ref_cnt[i] = 0;
+        oci->yv12_fb[i].flags = 0;
        if (vp8_yv12_alloc_frame_buffer(&oci->yv12_fb[i], width, height, VP8BORDERINPIXELS) < 0)
        {
            vp8_de_alloc_frame_buffers(oci);
-            return ALLOC_FAILURE;
+            return 1;
        }
    }

@@ -88,13 +92,13 @@ int vp8_alloc_frame_buffers(VP8_COMMON *oci, int width, int height)
    if (vp8_yv12_alloc_frame_buffer(&oci->temp_scale_frame,   width, 16, VP8BORDERINPIXELS) < 0)
    {
        vp8_de_alloc_frame_buffers(oci);
-        return ALLOC_FAILURE;
+        return 1;
    }

    if (vp8_yv12_alloc_frame_buffer(&oci->post_proc_buffer, width, height, VP8BORDERINPIXELS) < 0)
    {
        vp8_de_alloc_frame_buffers(oci);
-        return ALLOC_FAILURE;
+        return 1;
    }

    oci->mb_rows = height >> 4;
@@ -106,21 +110,39 @@ int vp8_alloc_frame_buffers(VP8_COMMON *oci, int width, int height)
    if (!oci->mip)
    {
        vp8_de_alloc_frame_buffers(oci);
-        return ALLOC_FAILURE;
+        return 1;
    }

    oci->mi = oci->mip + oci->mode_info_stride + 1;

+    /* allocate memory for last frame MODE_INFO array */
+#if CONFIG_ERROR_CONCEALMENT
+    oci->prev_mip = vpx_calloc((oci->mb_cols + 1) * (oci->mb_rows + 1), sizeof(MODE_INFO));
+
+    if (!oci->prev_mip)
+    {
+        vp8_de_alloc_frame_buffers(oci);
+        return 1;
+    }
+
+    oci->prev_mi = oci->prev_mip + oci->mode_info_stride + 1;
+#else
+    oci->prev_mip = NULL;
+    oci->prev_mi = NULL;
+#endif

    oci->above_context = vpx_calloc(sizeof(ENTROPY_CONTEXT_PLANES) * oci->mb_cols, 1);

    if (!oci->above_context)
    {
        vp8_de_alloc_frame_buffers(oci);
-        return ALLOC_FAILURE;
+        return 1;
    }

-    vp8_update_mode_info_border(oci->mi, oci->mb_rows, oci->mb_cols);
+    update_mode_info_border(oci->mi, oci->mb_rows, oci->mb_cols);
+#if CONFIG_ERROR_CONCEALMENT
+    update_mode_info_border(oci->prev_mi, oci->mb_rows, oci->mb_cols);
+#endif

    return 0;
 }
@@ -130,32 +152,32 @@ void vp8_setup_version(VP8_COMMON *cm)
    {
    case 0:
        cm->no_lpf = 0;
-        cm->simpler_lpf = 0;
+        cm->filter_type = NORMAL_LOOPFILTER;
        cm->use_bilinear_mc_filter = 0;
        cm->full_pixel = 0;
        break;
    case 1:
        cm->no_lpf = 0;
-        cm->simpler_lpf = 1;
+        cm->filter_type = SIMPLE_LOOPFILTER;
        cm->use_bilinear_mc_filter = 1;
        cm->full_pixel = 0;
        break;
    case 2:
        cm->no_lpf = 1;
-        cm->simpler_lpf = 0;
+        cm->filter_type = NORMAL_LOOPFILTER;
        cm->use_bilinear_mc_filter = 1;
        cm->full_pixel = 0;
        break;
    case 3:
        cm->no_lpf = 1;
-        cm->simpler_lpf = 1;
+        cm->filter_type = SIMPLE_LOOPFILTER;
        cm->use_bilinear_mc_filter = 1;
        cm->full_pixel = 1;
        break;
    default:
        /*4,5,6,7 are reserved for future use*/
        cm->no_lpf = 0;
-        cm->simpler_lpf = 0;
+        cm->filter_type = NORMAL_LOOPFILTER;
        cm->use_bilinear_mc_filter = 0;
        cm->full_pixel = 0;
        break;
@@ -170,7 +192,7 @@ void vp8_create_common(VP8_COMMON *oci)

    oci->mb_no_coeff_skip = 1;
    oci->no_lpf = 0;
-    oci->simpler_lpf = 0;
+    oci->filter_type = NORMAL_LOOPFILTER;
    oci->use_bilinear_mc_filter = 0;
    oci->full_pixel = 0;
    oci->multi_token_partition = ONE_PARTITION;
--- a/vp8/common/arm/arm_systemdependent.c
+++ b/vp8/common/arm/arm_systemdependent.c
@@ -11,35 +11,30 @@

 #include "vpx_ports/config.h"
 #include "vpx_ports/arm.h"
-#include "g_common.h"
-#include "pragmas.h"
-#include "subpixel.h"
-#include "loopfilter.h"
-#include "recon.h"
-#include "idct.h"
-#include "onyxc_int.h"
-
-extern void (*vp8_build_intra_predictors_mby_ptr)(MACROBLOCKD *x);
-extern void vp8_build_intra_predictors_mby(MACROBLOCKD *x);
-extern void vp8_build_intra_predictors_mby_neon(MACROBLOCKD *x);
-
-extern void (*vp8_build_intra_predictors_mby_s_ptr)(MACROBLOCKD *x);
-extern void vp8_build_intra_predictors_mby_s(MACROBLOCKD *x);
-extern void vp8_build_intra_predictors_mby_s_neon(MACROBLOCKD *x);
+#include "vp8/common/g_common.h"
+#include "vp8/common/pragmas.h"
+#include "vp8/common/subpixel.h"
+#include "vp8/common/loopfilter.h"
+#include "vp8/common/recon.h"
+#include "vp8/common/idct.h"
+#include "vp8/common/onyxc_int.h"

 void vp8_arch_arm_common_init(VP8_COMMON *ctx)
 {
 #if CONFIG_RUNTIME_CPU_DETECT
    VP8_COMMON_RTCD *rtcd = &ctx->rtcd;
    int flags = arm_cpu_caps();
-    int has_edsp = flags & HAS_EDSP;
-    int has_media = flags & HAS_MEDIA;
-    int has_neon = flags & HAS_NEON;
    rtcd->flags = flags;

    /* Override default functions with fastest ones for this CPU. */
+#if HAVE_ARMV5TE
+    if (flags & HAS_EDSP)
+    {
+    }
+#endif
+
 #if HAVE_ARMV6
-    if (has_media)
+    if (flags & HAS_MEDIA)
    {
        rtcd->subpix.sixtap16x16   = vp8_sixtap_predict16x16_armv6;
        rtcd->subpix.sixtap8x8     = vp8_sixtap_predict8x8_armv6;
@@ -59,9 +54,11 @@ void vp8_arch_arm_common_init(VP8_COMMON *ctx)
        rtcd->loopfilter.normal_b_v  = vp8_loop_filter_bv_armv6;
        rtcd->loopfilter.normal_mb_h = vp8_loop_filter_mbh_armv6;
        rtcd->loopfilter.normal_b_h  = vp8_loop_filter_bh_armv6;
-        rtcd->loopfilter.simple_mb_v = vp8_loop_filter_mbvs_armv6;
+        rtcd->loopfilter.simple_mb_v =
+                vp8_loop_filter_simple_vertical_edge_armv6;
        rtcd->loopfilter.simple_b_v  = vp8_loop_filter_bvs_armv6;
-        rtcd->loopfilter.simple_mb_h = vp8_loop_filter_mbhs_armv6;
+        rtcd->loopfilter.simple_mb_h =
+                vp8_loop_filter_simple_horizontal_edge_armv6;
        rtcd->loopfilter.simple_b_h  = vp8_loop_filter_bhs_armv6;

        rtcd->recon.copy16x16   = vp8_copy_mem16x16_v6;
@@ -74,7 +71,7 @@ void vp8_arch_arm_common_init(VP8_COMMON *ctx)
 #endif

 #if HAVE_ARMV7
-    if (has_neon)
+    if (flags & HAS_NEON)
    {
        rtcd->subpix.sixtap16x16   = vp8_sixtap_predict16x16_neon;
        rtcd->subpix.sixtap8x8     = vp8_sixtap_predict8x8_neon;
@@ -106,31 +103,12 @@ void vp8_arch_arm_common_init(VP8_COMMON *ctx)
        rtcd->recon.recon2      = vp8_recon2b_neon;
        rtcd->recon.recon4      = vp8_recon4b_neon;
        rtcd->recon.recon_mb    = vp8_recon_mb_neon;
-
-    }
-#endif
-
-#endif
-
-#if HAVE_ARMV6
-#if CONFIG_RUNTIME_CPU_DETECT
-    if (has_media)
-#endif
-    {
-        vp8_build_intra_predictors_mby_ptr = vp8_build_intra_predictors_mby;
-        vp8_build_intra_predictors_mby_s_ptr = vp8_build_intra_predictors_mby_s;
-    }
-#endif
-
-#if HAVE_ARMV7
-#if CONFIG_RUNTIME_CPU_DETECT
-    if (has_neon)
-#endif
-    {
-        vp8_build_intra_predictors_mby_ptr =
+        rtcd->recon.build_intra_predictors_mby =
            vp8_build_intra_predictors_mby_neon;
-        vp8_build_intra_predictors_mby_s_ptr =
+        rtcd->recon.build_intra_predictors_mby_s =
            vp8_build_intra_predictors_mby_s_neon;
    }
 #endif
+
+#endif
 }
--- a/vp8/common/arm/armv6/bilinearfilter_v6.asm
+++ b/vp8/common/arm/armv6/bilinearfilter_v6.asm
@@ -16,10 +16,10 @@

 ;-------------------------------------
 ; r0    unsigned char  *src_ptr,
-; r1    unsigned short *output_ptr,
-; r2    unsigned int src_pixels_per_line,
-; r3    unsigned int output_height,
-; stack    unsigned int output_width,
+; r1    unsigned short *dst_ptr,
+; r2    unsigned int    src_pitch,
+; r3    unsigned int    height,
+; stack unsigned int    width,
 ; stack const short    *vp8_filter
 ;-------------------------------------
 ; The output is transposed stroed in output array to make it easy for second pass filtering.
@@ -27,21 +27,21 @@
    stmdb   sp!, {r4 - r11, lr}

    ldr     r11, [sp, #40]                  ; vp8_filter address
-    ldr     r4, [sp, #36]                   ; output width
+    ldr     r4, [sp, #36]                   ; width

    mov     r12, r3                         ; outer-loop counter
-    sub     r2, r2, r4                      ; src increment for height loop

-    ;;IF ARCHITECTURE=6
-    pld     [r0]
-    ;;ENDIF
+    add     r7, r2, r4                      ; preload next row
+    pld     [r0, r7]
+
+    sub     r2, r2, r4                      ; src increment for height loop

    ldr     r5, [r11]                       ; load up filter coefficients

-    mov     r3, r3, lsl #1                  ; output_height*2
+    mov     r3, r3, lsl #1                  ; height*2
    add     r3, r3, #2                      ; plus 2 to make output buffer 4-bit aligned since height is actually (height+1)

-    mov     r11, r1                         ; save output_ptr for each row
+    mov     r11, r1                         ; save dst_ptr for each row

    cmp     r5, #128                        ; if filter coef = 128, then skip the filter
    beq     bil_null_1st_filter
@@ -96,9 +96,8 @@
    add     r0, r0, r2                      ; move to next input row
    subs    r12, r12, #1

-    ;;IF ARCHITECTURE=6
-    pld     [r0]
-    ;;ENDIF
+    add     r9, r2, r4, lsl #1              ; adding back block width
+    pld     [r0, r9]                        ; preload next row

    add     r11, r11, #2                    ; move over to next column
    mov     r1, r11
@@ -140,17 +139,17 @@

 ;---------------------------------
 ; r0    unsigned short *src_ptr,
-; r1    unsigned char *output_ptr,
-; r2    int output_pitch,
-; r3    unsigned int  output_height,
-; stack unsigned int  output_width,
+; r1    unsigned char  *dst_ptr,
+; r2    int             dst_pitch,
+; r3    unsigned int    height,
+; stack unsigned int    width,
 ; stack const short    *vp8_filter
 ;---------------------------------
 |vp8_filter_block2d_bil_second_pass_armv6| PROC
    stmdb   sp!, {r4 - r11, lr}

    ldr     r11, [sp, #40]                  ; vp8_filter address
-    ldr     r4, [sp, #36]                   ; output width
+    ldr     r4, [sp, #36]                   ; width

    ldr     r5, [r11]                       ; load up filter coefficients
    mov     r12, r4                         ; outer-loop counter = width, since we work on transposed data matrix
--- a/vp8/common/arm/armv6/copymem16x16_v6.asm
+++ b/vp8/common/arm/armv6/copymem16x16_v6.asm
@@ -22,9 +22,7 @@
    ;push   {r4-r7}

    ;preload
-    pld     [r0]
-    pld     [r0, r1]
-    pld     [r0, r1, lsl #1]
+    pld     [r0, #31]                ; preload for next 16x16 block

    ands    r4, r0, #15
    beq     copy_mem16x16_fast
@@ -90,6 +88,8 @@ copy_mem16x16_1_loop
    ldrneb  r6, [r0, #2]
    ldrneb  r7, [r0, #3]

+    pld     [r0, #31]               ; preload for next 16x16 block
+
    bne     copy_mem16x16_1_loop

    ldmia       sp!, {r4 - r7}
@@ -121,6 +121,8 @@ copy_mem16x16_4_loop
    ldrne   r6, [r0, #8]
    ldrne   r7, [r0, #12]

+    pld     [r0, #31]               ; preload for next 16x16 block
+
    bne     copy_mem16x16_4_loop

    ldmia       sp!, {r4 - r7}
@@ -148,6 +150,7 @@ copy_mem16x16_8_loop

    add     r2, r2, r3

+    pld     [r0, #31]               ; preload for next 16x16 block
    bne     copy_mem16x16_8_loop

    ldmia       sp!, {r4 - r7}
@@ -171,6 +174,7 @@ copy_mem16x16_fast_loop
    ;stm        r2, {r4-r7}
    add     r2, r2, r3

+    pld     [r0, #31]               ; preload for next 16x16 block
    bne     copy_mem16x16_fast_loop

    ldmia       sp!, {r4 - r7}
--- a/vp8/common/arm/armv6/filter_v6.asm
+++ b/vp8/common/arm/armv6/filter_v6.asm
@@ -10,6 +10,8 @@


    EXPORT  |vp8_filter_block2d_first_pass_armv6|
+    EXPORT  |vp8_filter_block2d_first_pass_16x16_armv6|
+    EXPORT  |vp8_filter_block2d_first_pass_8x8_armv6|
    EXPORT  |vp8_filter_block2d_second_pass_armv6|
    EXPORT  |vp8_filter4_block2d_second_pass_armv6|
    EXPORT  |vp8_filter_block2d_first_pass_only_armv6|
@@ -40,11 +42,6 @@
    add     r12, r3, #16                    ; square off the output
    sub     sp, sp, #4

-    ;;IF ARCHITECTURE=6
-    ;pld        [r0, #-2]
-    ;;pld       [r0, #30]
-    ;;ENDIF
-
    ldr     r4, [r11]                       ; load up packed filter coefficients
    ldr     r5, [r11, #4]
    ldr     r6, [r11, #8]
@@ -101,15 +98,10 @@

    bne     width_loop_1st_6

-    ;;add       r9, r2, #30                 ; attempt to load 2 adjacent cache lines
-    ;;IF ARCHITECTURE=6
-    ;pld        [r0, r2]
-    ;;pld       [r0, r9]
-    ;;ENDIF
-
    ldr     r1, [sp]                        ; load and update dst address
    subs    r7, r7, #0x10000
    add     r0, r0, r2                      ; move to next input line
+
    add     r1, r1, #2                      ; move over to next column
    str     r1, [sp]

@@ -120,6 +112,192 @@

    ENDP

+; --------------------------
+; 16x16 version
+; -----------------------------
+|vp8_filter_block2d_first_pass_16x16_armv6| PROC
+    stmdb   sp!, {r4 - r11, lr}
+
+    ldr     r11, [sp, #40]                  ; vp8_filter address
+    ldr     r7, [sp, #36]                   ; output height
+
+    add     r4, r2, #18                     ; preload next low
+    pld     [r0, r4]
+
+    sub     r2, r2, r3                      ; inside loop increments input array,
+                                            ; so the height loop only needs to add
+                                            ; r2 - width to the input pointer
+
+    mov     r3, r3, lsl #1                  ; multiply width by 2 because using shorts
+    add     r12, r3, #16                    ; square off the output
+    sub     sp, sp, #4
+
+    ldr     r4, [r11]                       ; load up packed filter coefficients
+    ldr     r5, [r11, #4]
+    ldr     r6, [r11, #8]
+
+    str     r1, [sp]                        ; push destination to stack
+    mov     r7, r7, lsl #16                 ; height is top part of counter
+
+; six tap filter
+|height_loop_1st_16_6|
+    ldrb    r8, [r0, #-2]                   ; load source data
+    ldrb    r9, [r0, #-1]
+    ldrb    r10, [r0], #2
+    orr     r7, r7, r3, lsr #2              ; construct loop counter
+
+|width_loop_1st_16_6|
+    ldrb    r11, [r0, #-1]
+
+    pkhbt   lr, r8, r9, lsl #16             ; r9 | r8
+    pkhbt   r8, r9, r10, lsl #16            ; r10 | r9
+
+    ldrb    r9, [r0]
+
+    smuad   lr, lr, r4                      ; apply the filter
+    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
+    smuad   r8, r8, r4
+    pkhbt   r11, r11, r9, lsl #16           ; r9 | r11
+
+    smlad   lr, r10, r5, lr
+    ldrb    r10, [r0, #1]
+    smlad   r8, r11, r5, r8
+    ldrb    r11, [r0, #2]
+
+    sub     r7, r7, #1
+
+    pkhbt   r9, r9, r10, lsl #16            ; r10 | r9
+    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
+
+    smlad   lr, r9, r6, lr
+    smlad   r11, r10, r6, r8
+
+    ands    r10, r7, #0xff                  ; test loop counter
+
+    add     lr, lr, #0x40                   ; round_shift_and_clamp
+    ldrneb  r8, [r0, #-2]                   ; load data for next loop
+    usat    lr, #8, lr, asr #7
+    add     r11, r11, #0x40
+    ldrneb  r9, [r0, #-1]
+    usat    r11, #8, r11, asr #7
+
+    strh    lr, [r1], r12                   ; result is transposed and stored, which
+                                            ; will make second pass filtering easier.
+    ldrneb  r10, [r0], #2
+    strh    r11, [r1], r12
+
+    bne     width_loop_1st_16_6
+
+    ldr     r1, [sp]                        ; load and update dst address
+    subs    r7, r7, #0x10000
+    add     r0, r0, r2                      ; move to next input line
+
+    add     r11, r2, #34                    ; adding back block width(=16)
+    pld     [r0, r11]                       ; preload next low
+
+    add     r1, r1, #2                      ; move over to next column
+    str     r1, [sp]
+
+    bne     height_loop_1st_16_6
+
+    add     sp, sp, #4
+    ldmia   sp!, {r4 - r11, pc}
+
+    ENDP
+
+; --------------------------
+; 8x8 version
+; -----------------------------
+|vp8_filter_block2d_first_pass_8x8_armv6| PROC
+    stmdb   sp!, {r4 - r11, lr}
+
+    ldr     r11, [sp, #40]                  ; vp8_filter address
+    ldr     r7, [sp, #36]                   ; output height
+
+    add     r4, r2, #10                     ; preload next low
+    pld     [r0, r4]
+
+    sub     r2, r2, r3                      ; inside loop increments input array,
+                                            ; so the height loop only needs to add
+                                            ; r2 - width to the input pointer
+
+    mov     r3, r3, lsl #1                  ; multiply width by 2 because using shorts
+    add     r12, r3, #16                    ; square off the output
+    sub     sp, sp, #4
+
+    ldr     r4, [r11]                       ; load up packed filter coefficients
+    ldr     r5, [r11, #4]
+    ldr     r6, [r11, #8]
+
+    str     r1, [sp]                        ; push destination to stack
+    mov     r7, r7, lsl #16                 ; height is top part of counter
+
+; six tap filter
+|height_loop_1st_8_6|
+    ldrb    r8, [r0, #-2]                   ; load source data
+    ldrb    r9, [r0, #-1]
+    ldrb    r10, [r0], #2
+    orr     r7, r7, r3, lsr #2              ; construct loop counter
+
+|width_loop_1st_8_6|
+    ldrb    r11, [r0, #-1]
+
+    pkhbt   lr, r8, r9, lsl #16             ; r9 | r8
+    pkhbt   r8, r9, r10, lsl #16            ; r10 | r9
+
+    ldrb    r9, [r0]
+
+    smuad   lr, lr, r4                      ; apply the filter
+    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
+    smuad   r8, r8, r4
+    pkhbt   r11, r11, r9, lsl #16           ; r9 | r11
+
+    smlad   lr, r10, r5, lr
+    ldrb    r10, [r0, #1]
+    smlad   r8, r11, r5, r8
+    ldrb    r11, [r0, #2]
+
+    sub     r7, r7, #1
+
+    pkhbt   r9, r9, r10, lsl #16            ; r10 | r9
+    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
+
+    smlad   lr, r9, r6, lr
+    smlad   r11, r10, r6, r8
+
+    ands    r10, r7, #0xff                  ; test loop counter
+
+    add     lr, lr, #0x40                   ; round_shift_and_clamp
+    ldrneb  r8, [r0, #-2]                   ; load data for next loop
+    usat    lr, #8, lr, asr #7
+    add     r11, r11, #0x40
+    ldrneb  r9, [r0, #-1]
+    usat    r11, #8, r11, asr #7
+
+    strh    lr, [r1], r12                   ; result is transposed and stored, which
+                                            ; will make second pass filtering easier.
+    ldrneb  r10, [r0], #2
+    strh    r11, [r1], r12
+
+    bne     width_loop_1st_8_6
+
+    ldr     r1, [sp]                        ; load and update dst address
+    subs    r7, r7, #0x10000
+    add     r0, r0, r2                      ; move to next input line
+
+    add     r11, r2, #18                    ; adding back block width(=8)
+    pld     [r0, r11]                       ; preload next low
+
+    add     r1, r1, #2                      ; move over to next column
+    str     r1, [sp]
+
+    bne     height_loop_1st_8_6
+
+    add     sp, sp, #4
+    ldmia   sp!, {r4 - r11, pc}
+
+    ENDP
+
 ;---------------------------------
 ; r0    short         *src_ptr,
 ; r1    unsigned char *output_ptr,
@@ -262,6 +440,10 @@
 |vp8_filter_block2d_first_pass_only_armv6| PROC
    stmdb   sp!, {r4 - r11, lr}

+    add     r7, r2, r3                      ; preload next low
+    add     r7, r7, #2
+    pld     [r0, r7]
+
    ldr     r4, [sp, #36]                   ; output pitch
    ldr     r11, [sp, #40]                  ; HFilter address
    sub     sp, sp, #8
@@ -330,16 +512,15 @@

    bne     width_loop_1st_only_6

-    ;;add       r9, r2, #30                 ; attempt to load 2 adjacent cache lines
-    ;;IF ARCHITECTURE=6
-    ;pld        [r0, r2]
-    ;;pld       [r0, r9]
-    ;;ENDIF
-
    ldr     lr, [sp]                        ; load back output pitch
    ldr     r12, [sp, #4]                   ; load back output pitch
    subs    r7, r7, #1
    add     r0, r0, r12                     ; updata src for next loop
+
+    add     r11, r12, r3                    ; preload next low
+    add     r11, r11, #2
+    pld     [r0, r11]
+
    add     r1, r1, lr                      ; update dst for next loop

    bne     height_loop_1st_only_6
--- a/vp8/common/arm/armv6/loopfilter_v6.asm
+++ b/vp8/common/arm/armv6/loopfilter_v6.asm
@@ -53,14 +53,11 @@ count       RN  r5

 ;r0     unsigned char *src_ptr,
 ;r1     int src_pixel_step,
-;r2     const char *flimit,
+;r2     const char *blimit,
 ;r3     const char *limit,
 ;stack  const char *thresh,
 ;stack  int  count

-;Note: All 16 elements in flimit are equal. So, in the code, only one load is needed
-;for flimit. Same way applies to limit and thresh.
-
 ;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
 |vp8_loop_filter_horizontal_edge_armv6| PROC
 ;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
@@ -72,14 +69,18 @@ count       RN  r5
    sub         sp, sp, #16                 ; create temp buffer

    ldr         r9, [src], pstep            ; p3
-    ldr         r4, [r2], #4                ; flimit
+    ldrb        r4, [r2]                    ; blimit
    ldr         r10, [src], pstep           ; p2
-    ldr         r2, [r3], #4                ; limit
+    ldrb        r2, [r3]                    ; limit
    ldr         r11, [src], pstep           ; p1
-    uadd8       r4, r4, r4                  ; flimit * 2
-    ldr         r3, [r6], #4                ; thresh
+    orr         r4, r4, r4, lsl #8
+    ldrb        r3, [r6]                    ; thresh
+    orr         r2, r2, r2, lsl #8
    mov         count, count, lsl #1        ; 4-in-parallel
-    uadd8       r4, r4, r2                  ; flimit * 2 + limit
+    orr         r4, r4, r4, lsl #16
+    orr         r3, r3, r3, lsl #8
+    orr         r2, r2, r2, lsl #16
+    orr         r3, r3, r3, lsl #16

 |Hnext8|
    ; vp8_filter_mask() function
@@ -253,12 +254,6 @@ count       RN  r5

    subs        count, count, #1

-    ;pld            [src]
-    ;pld            [src, pstep]
-    ;pld            [src, pstep, lsl #1]
-    ;pld            [src, pstep, lsl #2]
-    ;pld            [src, pstep, lsl #3]
-
    ldrne       r9, [src], pstep            ; p3
    ldrne       r10, [src], pstep           ; p2
    ldrne       r11, [src], pstep           ; p1
@@ -281,14 +276,18 @@ count       RN  r5
    sub         sp, sp, #16                 ; create temp buffer

    ldr         r9, [src], pstep            ; p3
-    ldr         r4, [r2], #4                ; flimit
+    ldrb        r4, [r2]                    ; blimit
    ldr         r10, [src], pstep           ; p2
-    ldr         r2, [r3], #4                ; limit
+    ldrb        r2, [r3]                    ; limit
    ldr         r11, [src], pstep           ; p1
-    uadd8       r4, r4, r4                  ; flimit * 2
-    ldr         r3, [r6], #4                ; thresh
+    orr         r4, r4, r4, lsl #8
+    ldrb        r3, [r6]                    ; thresh
+    orr         r2, r2, r2, lsl #8
    mov         count, count, lsl #1        ; 4-in-parallel
-    uadd8       r4, r4, r2                  ; flimit * 2 + limit
+    orr         r4, r4, r4, lsl #16
+    orr         r3, r3, r3, lsl #8
+    orr         r2, r2, r2, lsl #16
+    orr         r3, r3, r3, lsl #16

 |MBHnext8|

@@ -590,15 +589,19 @@ count       RN  r5
    sub         sp, sp, #16                 ; create temp buffer

    ldr         r6, [src], pstep            ; load source data
-    ldr         r4, [r2], #4                ; flimit
+    ldrb        r4, [r2]                    ; blimit
    ldr         r7, [src], pstep
-    ldr         r2, [r3], #4                ; limit
+    ldrb        r2, [r3]                    ; limit
    ldr         r8, [src], pstep
-    uadd8       r4, r4, r4                  ; flimit * 2
-    ldr         r3, [r12], #4               ; thresh
+    orr         r4, r4, r4, lsl #8
+    ldrb        r3, [r12]                   ; thresh
+    orr         r2, r2, r2, lsl #8
    ldr         lr, [src], pstep
    mov         count, count, lsl #1        ; 4-in-parallel
-    uadd8       r4, r4, r2                  ; flimit * 2 + limit
+    orr         r4, r4, r4, lsl #16
+    orr         r3, r3, r3, lsl #8
+    orr         r2, r2, r2, lsl #16
+    orr         r3, r3, r3, lsl #16

 |Vnext8|

@@ -857,18 +860,26 @@ count       RN  r5
    sub         src, src, #4                ; move src pointer down by 4
    ldr         count, [sp, #40]            ; count for 8-in-parallel
    ldr         r12, [sp, #36]              ; load thresh address
+    pld         [src, #23]                  ; preload for next block
    sub         sp, sp, #16                 ; create temp buffer

    ldr         r6, [src], pstep            ; load source data
-    ldr         r4, [r2], #4                ; flimit
+    ldrb        r4, [r2]                    ; blimit
+    pld         [src, #23]
    ldr         r7, [src], pstep
-    ldr         r2, [r3], #4                ; limit
+    ldrb        r2, [r3]                    ; limit
+    pld         [src, #23]
    ldr         r8, [src], pstep
-    uadd8       r4, r4, r4                  ; flimit * 2
-    ldr         r3, [r12], #4               ; thresh
+    orr         r4, r4, r4, lsl #8
+    ldrb        r3, [r12]                   ; thresh
+    orr         r2, r2, r2, lsl #8
+    pld         [src, #23]
    ldr         lr, [src], pstep
    mov         count, count, lsl #1        ; 4-in-parallel
-    uadd8       r4, r4, r2                  ; flimit * 2 + limit
+    orr         r4, r4, r4, lsl #16
+    orr         r3, r3, r3, lsl #8
+    orr         r2, r2, r2, lsl #16
+    orr         r3, r3, r3, lsl #16

 |MBVnext8|
    ; vp8_filter_mask() function
@@ -908,6 +919,7 @@ count       RN  r5
    str         lr, [sp, #8]
    ldr         lr, [src], pstep

+
    TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12

    ldr         lr, [sp, #8]                ; load back (f)limit accumulator
@@ -956,6 +968,7 @@ count       RN  r5
    beq         mbvskip_filter               ; skip filtering


+
    ;vp8_hevmask() function
    ;calculate high edge variance

@@ -1123,6 +1136,7 @@ count       RN  r5
    smlabb      r8, r6, lr, r7
    smlatb      r6, r6, lr, r7
    smlabb      r9, r10, lr, r7
+
    smlatb      r10, r10, lr, r7
    ssat        r8, #8, r8, asr #7
    ssat        r6, #8, r6, asr #7
@@ -1242,9 +1256,13 @@ count       RN  r5
    sub         src, src, #4
    subs        count, count, #1

+    pld         [src, #23]                  ; preload for next block
    ldrne       r6, [src], pstep            ; load source data
+    pld         [src, #23]
    ldrne       r7, [src], pstep
+    pld         [src, #23]
    ldrne       r8, [src], pstep
+    pld         [src, #23]
    ldrne       lr, [src], pstep

    bne         MBVnext8
--- a/vp8/common/arm/armv6/simpleloopfilter_v6.asm
+++ b/vp8/common/arm/armv6/simpleloopfilter_v6.asm
@@ -45,35 +45,28 @@
    MEND


+
 src         RN  r0
 pstep       RN  r1

 ;r0     unsigned char *src_ptr,
 ;r1     int src_pixel_step,
-;r2     const char *flimit,
-;r3     const char *limit,
-;stack  const char *thresh,
-;stack  int  count
-
-; All 16 elements in flimit are equal. So, in the code, only one load is needed
-; for flimit. Same applies to limit. thresh is not used in simple looopfilter
+;r2     const char *blimit

 ;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
 |vp8_loop_filter_simple_horizontal_edge_armv6| PROC
 ;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
    stmdb       sp!, {r4 - r11, lr}

-    ldr         r12, [r3]                   ; limit
+    ldrb        r12, [r2]                   ; blimit
    ldr         r3, [src, -pstep, lsl #1]   ; p1
    ldr         r4, [src, -pstep]           ; p0
    ldr         r5, [src]                   ; q0
    ldr         r6, [src, pstep]            ; q1
-    ldr         r7, [r2]                    ; flimit
+    orr         r12, r12, r12, lsl #8       ; blimit
    ldr         r2, c0x80808080
-    ldr         r9, [sp, #40]               ; count for 8-in-parallel
-    uadd8       r7, r7, r7                  ; flimit * 2
-    mov         r9, r9, lsl #1              ; double the count. we're doing 4 at a time
-    uadd8       r12, r7, r12                ; flimit * 2 + limit
+    orr         r12, r12, r12, lsl #16      ; blimit
+    mov         r9, #4                      ; double the count. we're doing 4 at a time
    mov         lr, #0                      ; need 0 in a couple places

 |simple_hnext8|
@@ -148,30 +141,32 @@ pstep       RN  r1
 ;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
    stmdb       sp!, {r4 - r11, lr}

-    ldr         r12, [r2]                   ; r12: flimit
+    ldrb        r12, [r2]                   ; r12: blimit
    ldr         r2, c0x80808080
-    ldr         r7, [r3]                    ; limit
+    orr         r12, r12, r12, lsl #8

    ; load soure data to r7, r8, r9, r10
    ldrh        r3, [src, #-2]
+    pld         [src, #23]                  ; preload for next block
    ldrh        r4, [src], pstep
-    uadd8       r12, r12, r12               ; flimit * 2
+    orr         r12, r12, r12, lsl #16

    ldrh        r5, [src, #-2]
+    pld         [src, #23]
    ldrh        r6, [src], pstep
-    uadd8       r12, r12, r7                ; flimit * 2 + limit

    pkhbt       r7, r3, r4, lsl #16

    ldrh        r3, [src, #-2]
+    pld         [src, #23]
    ldrh        r4, [src], pstep
-    ldr         r11, [sp, #40]              ; count (r11) for 8-in-parallel

    pkhbt       r8, r5, r6, lsl #16

    ldrh        r5, [src, #-2]
+    pld         [src, #23]
    ldrh        r6, [src], pstep
-    mov         r11, r11, lsl #1            ; 4-in-parallel
+    mov         r11, #4                     ; double the count. we're doing 4 at a time

 |simple_vnext8|
    ; vp8_simple_filter_mask() function
@@ -259,19 +254,23 @@ pstep       RN  r1

    ; load soure data to r7, r8, r9, r10
    ldrneh      r3, [src, #-2]
+    pld         [src, #23]                  ; preload for next block
    ldrneh      r4, [src], pstep

    ldrneh      r5, [src, #-2]
+    pld         [src, #23]
    ldrneh      r6, [src], pstep

    pkhbt       r7, r3, r4, lsl #16

    ldrneh      r3, [src, #-2]
+    pld         [src, #23]
    ldrneh      r4, [src], pstep

    pkhbt       r8, r5, r6, lsl #16

    ldrneh      r5, [src, #-2]
+    pld         [src, #23]
    ldrneh      r6, [src], pstep

    bne         simple_vnext8
--- a/vp8/common/arm/armv6/sixtappredict8x4_v6.asm
+++ b/vp8/common/arm/armv6/sixtappredict8x4_v6.asm
@@ -32,9 +32,12 @@
    beq         skip_firstpass_filter

 ;first-pass filter
-    ldr         r12, _filter8_coeff_
+    adr         r12, filter8_coeff
    sub         r0, r0, r1, lsl #1

+    add         r3, r1, #10                 ; preload next low
+    pld         [r0, r3]
+
    add         r2, r12, r2, lsl #4         ;calculate filter location
    add         r0, r0, #3                  ;adjust src only for loading convinience

@@ -110,6 +113,9 @@

    add         r0, r0, r1                  ; move to next input line

+    add         r11, r1, #18                ; preload next low. adding back block width(=8), which is subtracted earlier
+    pld         [r0, r11]
+
    bne         first_pass_hloop_v6

 ;second pass filter
@@ -121,7 +127,7 @@ secondpass_filter
    cmp         r3, #0
    beq         skip_secondpass_filter

-    ldr         r12, _filter8_coeff_
+    adr         r12, filter8_coeff
    add         lr, r12, r3, lsl #4         ;calculate filter location

    mov         r2, #0x00080000
@@ -243,12 +249,8 @@ skip_secondpass_hloop
    ENDP

 ;-----------------
-    AREA    subpelfilters8_dat, DATA, READWRITE         ;read/write by default
-;Data section with name data_area is specified. DCD reserves space in memory for 48 data.
 ;One word each is reserved. Label filter_coeff can be used to access the data.
 ;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
-_filter8_coeff_
-    DCD     filter8_coeff
 filter8_coeff
    DCD     0x00000000,     0x00000080,     0x00000000,     0x00000000
    DCD     0xfffa0000,     0x000c007b,     0x0000ffff,     0x00000000
--- a/vp8/common/arm/bilinearfilter_arm.c
+++ b/vp8/common/arm/bilinearfilter_arm.c
@@ -10,112 +10,15 @@


 #include <math.h>
-#include "subpixel.h"
-
-#define BLOCK_HEIGHT_WIDTH 4
-#define VP8_FILTER_WEIGHT 128
-#define VP8_FILTER_SHIFT  7
-
-static const short bilinear_filters[8][2] =
-{
-    { 128,   0 },
-    { 112,  16 },
-    {  96,  32 },
-    {  80,  48 },
-    {  64,  64 },
-    {  48,  80 },
-    {  32,  96 },
-    {  16, 112 }
-};
-
-
-extern void vp8_filter_block2d_bil_first_pass_armv6
-(
-    unsigned char *src_ptr,
-    unsigned short *output_ptr,
-    unsigned int src_pixels_per_line,
-    unsigned int output_height,
-    unsigned int output_width,
-    const short *vp8_filter
-);
-
-extern void vp8_filter_block2d_bil_second_pass_armv6
-(
-    unsigned short *src_ptr,
-    unsigned char  *output_ptr,
-    int output_pitch,
-    unsigned int  output_height,
-    unsigned int  output_width,
-    const short *vp8_filter
-);
-
-#if 0
-void vp8_filter_block2d_bil_first_pass_6
-(
-    unsigned char *src_ptr,
-    unsigned short *output_ptr,
-    unsigned int src_pixels_per_line,
-    unsigned int output_height,
-    unsigned int output_width,
-    const short *vp8_filter
-)
-{
-    unsigned int i, j;
-
-    for ( i=0; i<output_height; i++ )
-    {
-        for ( j=0; j<output_width; j++ )
-        {
-            /* Apply bilinear filter */
-            output_ptr[j] = ( ( (int)src_ptr[0]          * vp8_filter[0]) +
-                               ((int)src_ptr[1] * vp8_filter[1]) +
-                                (VP8_FILTER_WEIGHT/2) ) >> VP8_FILTER_SHIFT;
-            src_ptr++;
-        }
-
-        /* Next row... */
-        src_ptr    += src_pixels_per_line - output_width;
-        output_ptr += output_width;
-    }
-}
-
-void vp8_filter_block2d_bil_second_pass_6
-(
-    unsigned short *src_ptr,
-    unsigned char  *output_ptr,
-    int output_pitch,
-    unsigned int  output_height,
-    unsigned int  output_width,
-    const short *vp8_filter
-)
-{
-    unsigned int  i,j;
-    int  Temp;
-
-    for ( i=0; i<output_height; i++ )
-    {
-        for ( j=0; j<output_width; j++ )
-        {
-            /* Apply filter */
-            Temp =  ((int)src_ptr[0]         * vp8_filter[0]) +
-                    ((int)src_ptr[output_width] * vp8_filter[1]) +
-                    (VP8_FILTER_WEIGHT/2);
-            output_ptr[j] = (unsigned int)(Temp >> VP8_FILTER_SHIFT);
-            src_ptr++;
-        }
-
-        /* Next row... */
-        /*src_ptr    += src_pixels_per_line - output_width;*/
-        output_ptr += output_pitch;
-    }
-}
-#endif
+#include "vp8/common/filter.h"
+#include "vp8/common/subpixel.h"
+#include "bilinearfilter_arm.h"

 void vp8_filter_block2d_bil_armv6
 (
    unsigned char *src_ptr,
-    unsigned char *output_ptr,
-    unsigned int   src_pixels_per_line,
+    unsigned char *dst_ptr,
+    unsigned int   src_pitch,
    unsigned int   dst_pitch,
    const short   *HFilter,
    const short   *VFilter,
@@ -123,15 +26,13 @@ void vp8_filter_block2d_bil_armv6
    int            Height
 )
 {
-
-    unsigned short FData[36*16]; /* Temp data bufffer used in filtering */
+    unsigned short FData[36*16]; /* Temp data buffer used in filtering */

    /* First filter 1-D horizontally... */
-    /* pixel_step = 1; */
-    vp8_filter_block2d_bil_first_pass_armv6(src_ptr, FData, src_pixels_per_line, Height + 1, Width, HFilter);
+    vp8_filter_block2d_bil_first_pass_armv6(src_ptr, FData, src_pitch, Height + 1, Width, HFilter);

    /* then 1-D vertically... */
-    vp8_filter_block2d_bil_second_pass_armv6(FData, output_ptr, dst_pitch, Height, Width, VFilter);
+    vp8_filter_block2d_bil_second_pass_armv6(FData, dst_ptr, dst_pitch, Height, Width, VFilter);
 }


@@ -148,8 +49,8 @@ void vp8_bilinear_predict4x4_armv6
    const short  *HFilter;
    const short  *VFilter;

-    HFilter = bilinear_filters[xoffset];
-    VFilter = bilinear_filters[yoffset];
+    HFilter = vp8_bilinear_filters[xoffset];
+    VFilter = vp8_bilinear_filters[yoffset];

    vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 4, 4);
 }
@@ -167,8 +68,8 @@ void vp8_bilinear_predict8x8_armv6
    const short  *HFilter;
    const short  *VFilter;

-    HFilter = bilinear_filters[xoffset];
-    VFilter = bilinear_filters[yoffset];
+    HFilter = vp8_bilinear_filters[xoffset];
+    VFilter = vp8_bilinear_filters[yoffset];

    vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 8);
 }
@@ -186,8 +87,8 @@ void vp8_bilinear_predict8x4_armv6
    const short  *HFilter;
    const short  *VFilter;

-    HFilter = bilinear_filters[xoffset];
-    VFilter = bilinear_filters[yoffset];
+    HFilter = vp8_bilinear_filters[xoffset];
+    VFilter = vp8_bilinear_filters[yoffset];

    vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 4);
 }
@@ -205,8 +106,8 @@ void vp8_bilinear_predict16x16_armv6
    const short  *HFilter;
    const short  *VFilter;

-    HFilter = bilinear_filters[xoffset];
-    VFilter = bilinear_filters[yoffset];
+    HFilter = vp8_bilinear_filters[xoffset];
+    VFilter = vp8_bilinear_filters[yoffset];

    vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 16, 16);
 }
--- a/vp8/common/arm/bilinearfilter_arm.h
+++ b/vp8/common/arm/bilinearfilter_arm.h
@@ -0,0 +1,35 @@
+/*
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef BILINEARFILTER_ARM_H
+#define BILINEARFILTER_ARM_H
+
+extern void vp8_filter_block2d_bil_first_pass_armv6
+(
+    const unsigned char  *src_ptr,
+    unsigned short       *dst_ptr,
+    unsigned int          src_pitch,
+    unsigned int          height,
+    unsigned int          width,
+    const short          *vp8_filter
+);
+
+extern void vp8_filter_block2d_bil_second_pass_armv6
+(
+    const unsigned short *src_ptr,
+    unsigned char        *dst_ptr,
+    int                   dst_pitch,
+    unsigned int          height,
+    unsigned int          width,
+    const short         *vp8_filter
+);
+
+#endif /* BILINEARFILTER_ARM_H */
--- a/vp8/common/arm/filter_arm.c
+++ b/vp8/common/arm/filter_arm.c
@@ -11,26 +11,10 @@

 #include "vpx_ports/config.h"
 #include <math.h>
-#include "subpixel.h"
+#include "vp8/common/filter.h"
+#include "vp8/common/subpixel.h"
 #include "vpx_ports/mem.h"

-#define BLOCK_HEIGHT_WIDTH 4
-#define VP8_FILTER_WEIGHT 128
-#define VP8_FILTER_SHIFT  7
-
-DECLARE_ALIGNED(16, static const short, sub_pel_filters[8][6]) =
-{
-    { 0,  0,  128,    0,   0,  0 },         /* note that 1/8 pel positions are just as per alpha -0.5 bicubic */
-    { 0, -6,  123,   12,  -1,  0 },
-    { 2, -11, 108,   36,  -8,  1 },         /* New 1/4 pel 6 tap filter */
-    { 0, -9,   93,   50,  -6,  0 },
-    { 3, -16,  77,   77, -16,  3 },         /* New 1/2 pel 6 tap filter */
-    { 0, -6,   50,   93,  -9,  0 },
-    { 1, -8,   36,  108, -11,  2 },         /* New 1/4 pel 6 tap filter */
-    { 0, -1,   12,  123,  -6,  0 },
-};
-
-
 extern void vp8_filter_block2d_first_pass_armv6
 (
    unsigned char *src_ptr,
@@ -41,6 +25,28 @@ extern void vp8_filter_block2d_first_pass_armv6
    const short *vp8_filter
 );

+// 8x8
+extern void vp8_filter_block2d_first_pass_8x8_armv6
+(
+    unsigned char *src_ptr,
+    short         *output_ptr,
+    unsigned int src_pixels_per_line,
+    unsigned int output_width,
+    unsigned int output_height,
+    const short *vp8_filter
+);
+
+// 16x16
+extern void vp8_filter_block2d_first_pass_16x16_armv6
+(
+    unsigned char *src_ptr,
+    short         *output_ptr,
+    unsigned int src_pixels_per_line,
+    unsigned int output_width,
+    unsigned int output_height,
+    const short *vp8_filter
+);
+
 extern void vp8_filter_block2d_second_pass_armv6
 (
    short         *src_ptr,
@@ -93,11 +99,11 @@ void vp8_sixtap_predict_armv6
 {
    const short  *HFilter;
    const short  *VFilter;
-    DECLARE_ALIGNED_ARRAY(4, short, FData, 12*4); /* Temp data bufffer used in filtering */
+    DECLARE_ALIGNED_ARRAY(4, short, FData, 12*4); /* Temp data buffer used in filtering */


-    HFilter = sub_pel_filters[xoffset];   /* 6 tap */
-    VFilter = sub_pel_filters[yoffset];       /* 6 tap */
+    HFilter = vp8_sub_pel_filters[xoffset];   /* 6 tap */
+    VFilter = vp8_sub_pel_filters[yoffset];   /* 6 tap */

    /* Vfilter is null. First pass only */
    if (xoffset && !yoffset)
@@ -129,47 +135,6 @@ void vp8_sixtap_predict_armv6
    }
 }

-#if 0
-void vp8_sixtap_predict8x4_armv6
-(
-    unsigned char  *src_ptr,
-    int  src_pixels_per_line,
-    int  xoffset,
-    int  yoffset,
-    unsigned char *dst_ptr,
-    int  dst_pitch
-)
-{
-    const short  *HFilter;
-    const short  *VFilter;
-    DECLARE_ALIGNED_ARRAY(4, short, FData, 16*8); /* Temp data bufffer used in filtering */
-
-    HFilter = sub_pel_filters[xoffset];   /* 6 tap */
-    VFilter = sub_pel_filters[yoffset];       /* 6 tap */
-
-
-    /*if (xoffset && !yoffset)
-    {
-        vp8_filter_block2d_first_pass_only_armv6 (  src_ptr, dst_ptr, src_pixels_per_line, 8, dst_pitch, HFilter );
-    }*/
-    /* Hfilter is null. Second pass only */
-    /*else if (!xoffset && yoffset)
-    {
-        vp8_filter_block2d_second_pass_only_armv6 ( src_ptr, dst_ptr, src_pixels_per_line, 8, dst_pitch, VFilter );
-    }
-    else
-    {
-        if (yoffset & 0x1)
-            vp8_filter_block2d_first_pass_armv6 ( src_ptr-src_pixels_per_line, FData+1, src_pixels_per_line, 8, 7, HFilter );
-        else*/
-
-        vp8_filter_block2d_first_pass_armv6 ( src_ptr-(2*src_pixels_per_line), FData, src_pixels_per_line, 8, 9, HFilter );
-
-        vp8_filter_block2d_second_pass_armv6 ( FData+2, dst_ptr, dst_pitch, 4, 8, VFilter );
-    /*}*/
-}
-#endif
-
 void vp8_sixtap_predict8x8_armv6
 (
    unsigned char  *src_ptr,
@@ -182,10 +147,10 @@ void vp8_sixtap_predict8x8_armv6
 {
    const short  *HFilter;
    const short  *VFilter;
-    DECLARE_ALIGNED_ARRAY(4, short, FData, 16*8); /* Temp data bufffer used in filtering */
+    DECLARE_ALIGNED_ARRAY(4, short, FData, 16*8); /* Temp data buffer used in filtering */

-    HFilter = sub_pel_filters[xoffset];   /* 6 tap */
-    VFilter = sub_pel_filters[yoffset];       /* 6 tap */
+    HFilter = vp8_sub_pel_filters[xoffset];   /* 6 tap */
+    VFilter = vp8_sub_pel_filters[yoffset];   /* 6 tap */

    if (xoffset && !yoffset)
    {
@@ -200,12 +165,12 @@ void vp8_sixtap_predict8x8_armv6
    {
        if (yoffset & 0x1)
        {
-            vp8_filter_block2d_first_pass_armv6(src_ptr - src_pixels_per_line, FData + 1, src_pixels_per_line, 8, 11, HFilter);
+            vp8_filter_block2d_first_pass_8x8_armv6(src_ptr - src_pixels_per_line, FData + 1, src_pixels_per_line, 8, 11, HFilter);
            vp8_filter4_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 8, VFilter);
        }
        else
        {
-            vp8_filter_block2d_first_pass_armv6(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 8, 13, HFilter);
+            vp8_filter_block2d_first_pass_8x8_armv6(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 8, 13, HFilter);
            vp8_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 8, VFilter);
        }
    }
@@ -224,10 +189,10 @@ void vp8_sixtap_predict16x16_armv6
 {
    const short  *HFilter;
    const short  *VFilter;
-    DECLARE_ALIGNED_ARRAY(4, short, FData, 24*16);    /* Temp data bufffer used in filtering */
+    DECLARE_ALIGNED_ARRAY(4, short, FData, 24*16);    /* Temp data buffer used in filtering */

-    HFilter = sub_pel_filters[xoffset];   /* 6 tap */
-    VFilter = sub_pel_filters[yoffset];       /* 6 tap */
+    HFilter = vp8_sub_pel_filters[xoffset];   /* 6 tap */
+    VFilter = vp8_sub_pel_filters[yoffset];   /* 6 tap */

    if (xoffset && !yoffset)
    {
@@ -242,12 +207,12 @@ void vp8_sixtap_predict16x16_armv6
    {
        if (yoffset & 0x1)
        {
-            vp8_filter_block2d_first_pass_armv6(src_ptr - src_pixels_per_line, FData + 1, src_pixels_per_line, 16, 19, HFilter);
+            vp8_filter_block2d_first_pass_16x16_armv6(src_ptr - src_pixels_per_line, FData + 1, src_pixels_per_line, 16, 19, HFilter);
            vp8_filter4_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 16, VFilter);
        }
        else
        {
-            vp8_filter_block2d_first_pass_armv6(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 16, 21, HFilter);
+            vp8_filter_block2d_first_pass_16x16_armv6(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 16, 21, HFilter);
            vp8_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 16, VFilter);
        }
    }
--- a/vp8/common/arm/loopfilter_arm.c
+++ b/vp8/common/arm/loopfilter_arm.c
@@ -9,135 +9,107 @@
 */


-#include "vpx_ports/config.h"
-#include <math.h>
-#include "loopfilter.h"
-#include "onyxc_int.h"
+#include "vpx_config.h"
+#include "vp8/common/loopfilter.h"
+#include "vp8/common/onyxc_int.h"

+#if HAVE_ARMV6
 extern prototype_loopfilter(vp8_loop_filter_horizontal_edge_armv6);
 extern prototype_loopfilter(vp8_loop_filter_vertical_edge_armv6);
 extern prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_armv6);
 extern prototype_loopfilter(vp8_mbloop_filter_vertical_edge_armv6);
-extern prototype_loopfilter(vp8_loop_filter_simple_horizontal_edge_armv6);
-extern prototype_loopfilter(vp8_loop_filter_simple_vertical_edge_armv6);
+#endif

-extern prototype_loopfilter(vp8_loop_filter_horizontal_edge_y_neon);
-extern prototype_loopfilter(vp8_loop_filter_vertical_edge_y_neon);
-extern prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_y_neon);
-extern prototype_loopfilter(vp8_mbloop_filter_vertical_edge_y_neon);
-extern prototype_loopfilter(vp8_loop_filter_simple_horizontal_edge_neon);
-extern prototype_loopfilter(vp8_loop_filter_simple_vertical_edge_neon);
+#if HAVE_ARMV7
+typedef void loopfilter_y_neon(unsigned char *src, int pitch,
+        unsigned char blimit, unsigned char limit, unsigned char thresh);
+typedef void loopfilter_uv_neon(unsigned char *u, int pitch,
+        unsigned char blimit, unsigned char limit, unsigned char thresh,
+        unsigned char *v);

-extern loop_filter_uvfunction vp8_loop_filter_horizontal_edge_uv_neon;
-extern loop_filter_uvfunction vp8_loop_filter_vertical_edge_uv_neon;
-extern loop_filter_uvfunction vp8_mbloop_filter_horizontal_edge_uv_neon;
-extern loop_filter_uvfunction vp8_mbloop_filter_vertical_edge_uv_neon;
+extern loopfilter_y_neon vp8_loop_filter_horizontal_edge_y_neon;
+extern loopfilter_y_neon vp8_loop_filter_vertical_edge_y_neon;
+extern loopfilter_y_neon vp8_mbloop_filter_horizontal_edge_y_neon;
+extern loopfilter_y_neon vp8_mbloop_filter_vertical_edge_y_neon;

+extern loopfilter_uv_neon vp8_loop_filter_horizontal_edge_uv_neon;
+extern loopfilter_uv_neon vp8_loop_filter_vertical_edge_uv_neon;
+extern loopfilter_uv_neon vp8_mbloop_filter_horizontal_edge_uv_neon;
+extern loopfilter_uv_neon vp8_mbloop_filter_vertical_edge_uv_neon;
+#endif

 #if HAVE_ARMV6
 /*ARMV6 loopfilter functions*/
 /* Horizontal MB filtering */
 void vp8_loop_filter_mbh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                               int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                               int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    (void) simpler_lpf;
-    vp8_mbloop_filter_horizontal_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+    vp8_mbloop_filter_horizontal_edge_armv6(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);

    if (u_ptr)
-        vp8_mbloop_filter_horizontal_edge_armv6(u_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1);
+        vp8_mbloop_filter_horizontal_edge_armv6(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);

    if (v_ptr)
-        vp8_mbloop_filter_horizontal_edge_armv6(v_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1);
-}
-
-void vp8_loop_filter_mbhs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                                int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
-{
-    (void) u_ptr;
-    (void) v_ptr;
-    (void) uv_stride;
-    (void) simpler_lpf;
-    vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+        vp8_mbloop_filter_horizontal_edge_armv6(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
 }

 /* Vertical MB Filtering */
 void vp8_loop_filter_mbv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                               int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                               int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    (void) simpler_lpf;
-    vp8_mbloop_filter_vertical_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+    vp8_mbloop_filter_vertical_edge_armv6(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);

    if (u_ptr)
-        vp8_mbloop_filter_vertical_edge_armv6(u_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1);
+        vp8_mbloop_filter_vertical_edge_armv6(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);

    if (v_ptr)
-        vp8_mbloop_filter_vertical_edge_armv6(v_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1);
-}
-
-void vp8_loop_filter_mbvs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                                int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
-{
-    (void) u_ptr;
-    (void) v_ptr;
-    (void) uv_stride;
-    (void) simpler_lpf;
-    vp8_loop_filter_simple_vertical_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+        vp8_mbloop_filter_vertical_edge_armv6(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
 }

 /* Horizontal B Filtering */
 void vp8_loop_filter_bh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                              int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                              int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    (void) simpler_lpf;
-    vp8_loop_filter_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+    vp8_loop_filter_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+    vp8_loop_filter_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+    vp8_loop_filter_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);

    if (u_ptr)
-        vp8_loop_filter_horizontal_edge_armv6(u_ptr + 4 * uv_stride, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1);
+        vp8_loop_filter_horizontal_edge_armv6(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);

    if (v_ptr)
-        vp8_loop_filter_horizontal_edge_armv6(v_ptr + 4 * uv_stride, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1);
+        vp8_loop_filter_horizontal_edge_armv6(v_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
 }

-void vp8_loop_filter_bhs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                               int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+void vp8_loop_filter_bhs_armv6(unsigned char *y_ptr, int y_stride,
+                               const unsigned char *blimit)
 {
-    (void) u_ptr;
-    (void) v_ptr;
-    (void) uv_stride;
-    (void) simpler_lpf;
-    vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+    vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, blimit);
+    vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, blimit);
+    vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, blimit);
 }

 /* Vertical B Filtering */
 void vp8_loop_filter_bv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                              int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                              int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    (void) simpler_lpf;
-    vp8_loop_filter_vertical_edge_armv6(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_vertical_edge_armv6(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_vertical_edge_armv6(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+    vp8_loop_filter_vertical_edge_armv6(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+    vp8_loop_filter_vertical_edge_armv6(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+    vp8_loop_filter_vertical_edge_armv6(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);

    if (u_ptr)
-        vp8_loop_filter_vertical_edge_armv6(u_ptr + 4, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1);
+        vp8_loop_filter_vertical_edge_armv6(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);

    if (v_ptr)
-        vp8_loop_filter_vertical_edge_armv6(v_ptr + 4, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1);
+        vp8_loop_filter_vertical_edge_armv6(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
 }

-void vp8_loop_filter_bvs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                               int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+void vp8_loop_filter_bvs_armv6(unsigned char *y_ptr, int y_stride,
+                               const unsigned char *blimit)
 {
-    (void) u_ptr;
-    (void) v_ptr;
-    (void) uv_stride;
-    (void) simpler_lpf;
-    vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+    vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 4, y_stride, blimit);
+    vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 8, y_stride, blimit);
+    vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 12, y_stride, blimit);
 }
 #endif

@@ -145,93 +117,60 @@ void vp8_loop_filter_bvs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsig
 /* NEON loopfilter functions */
 /* Horizontal MB filtering */
 void vp8_loop_filter_mbh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                              int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                              int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    (void) simpler_lpf;
-    vp8_mbloop_filter_horizontal_edge_y_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+    unsigned char mblim = *lfi->mblim;
+    unsigned char lim = *lfi->lim;
+    unsigned char hev_thr = *lfi->hev_thr;
+    vp8_mbloop_filter_horizontal_edge_y_neon(y_ptr, y_stride, mblim, lim, hev_thr);

    if (u_ptr)
-        vp8_mbloop_filter_horizontal_edge_uv_neon(u_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, v_ptr);
-}
-
-void vp8_loop_filter_mbhs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                               int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
-{
-    (void) u_ptr;
-    (void) v_ptr;
-    (void) uv_stride;
-    (void) simpler_lpf;
-    vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+        vp8_mbloop_filter_horizontal_edge_uv_neon(u_ptr, uv_stride, mblim, lim, hev_thr, v_ptr);
 }

 /* Vertical MB Filtering */
 void vp8_loop_filter_mbv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                              int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                              int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    (void) simpler_lpf;
-    vp8_mbloop_filter_vertical_edge_y_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+    unsigned char mblim = *lfi->mblim;
+    unsigned char lim = *lfi->lim;
+    unsigned char hev_thr = *lfi->hev_thr;
+
+    vp8_mbloop_filter_vertical_edge_y_neon(y_ptr, y_stride, mblim, lim, hev_thr);

    if (u_ptr)
-        vp8_mbloop_filter_vertical_edge_uv_neon(u_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, v_ptr);
-}
-
-void vp8_loop_filter_mbvs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                               int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
-{
-    (void) u_ptr;
-    (void) v_ptr;
-    (void) uv_stride;
-    (void) simpler_lpf;
-    vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+        vp8_mbloop_filter_vertical_edge_uv_neon(u_ptr, uv_stride, mblim, lim, hev_thr, v_ptr);
 }

 /* Horizontal B Filtering */
 void vp8_loop_filter_bh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                             int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                             int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    (void) simpler_lpf;
-    vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+    unsigned char blim = *lfi->blim;
+    unsigned char lim = *lfi->lim;
+    unsigned char hev_thr = *lfi->hev_thr;
+
+    vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 4 * y_stride, y_stride, blim, lim, hev_thr);
+    vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 8 * y_stride, y_stride, blim, lim, hev_thr);
+    vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 12 * y_stride, y_stride, blim, lim, hev_thr);

    if (u_ptr)
-        vp8_loop_filter_horizontal_edge_uv_neon(u_ptr + 4 * uv_stride, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, v_ptr + 4 * uv_stride);
-}
-
-void vp8_loop_filter_bhs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                              int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
-{
-    (void) u_ptr;
-    (void) v_ptr;
-    (void) uv_stride;
-    (void) simpler_lpf;
-    vp8_loop_filter_simple_horizontal_edge_neon(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_simple_horizontal_edge_neon(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_simple_horizontal_edge_neon(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+        vp8_loop_filter_horizontal_edge_uv_neon(u_ptr + 4 * uv_stride, uv_stride, blim, lim, hev_thr, v_ptr + 4 * uv_stride);
 }

 /* Vertical B Filtering */
 void vp8_loop_filter_bv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                             int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                             int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    (void) simpler_lpf;
-    vp8_loop_filter_vertical_edge_y_neon(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_vertical_edge_y_neon(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_vertical_edge_y_neon(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+    unsigned char blim = *lfi->blim;
+    unsigned char lim = *lfi->lim;
+    unsigned char hev_thr = *lfi->hev_thr;
+
+    vp8_loop_filter_vertical_edge_y_neon(y_ptr + 4, y_stride, blim, lim, hev_thr);
+    vp8_loop_filter_vertical_edge_y_neon(y_ptr + 8, y_stride, blim, lim, hev_thr);
+    vp8_loop_filter_vertical_edge_y_neon(y_ptr + 12, y_stride, blim, lim, hev_thr);

    if (u_ptr)
-        vp8_loop_filter_vertical_edge_uv_neon(u_ptr + 4, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, v_ptr + 4);
-}
-
-void vp8_loop_filter_bvs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                              int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
-{
-    (void) u_ptr;
-    (void) v_ptr;
-    (void) uv_stride;
-    (void) simpler_lpf;
-    vp8_loop_filter_simple_vertical_edge_neon(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_simple_vertical_edge_neon(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_simple_vertical_edge_neon(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+        vp8_loop_filter_vertical_edge_uv_neon(u_ptr + 4, uv_stride, blim, lim, hev_thr, v_ptr + 4);
 }
 #endif
--- a/vp8/common/arm/loopfilter_arm.h
+++ b/vp8/common/arm/loopfilter_arm.h
@@ -12,15 +12,17 @@
 #ifndef LOOPFILTER_ARM_H
 #define LOOPFILTER_ARM_H

+#include "vpx_config.h"
+
 #if HAVE_ARMV6
 extern prototype_loopfilter_block(vp8_loop_filter_mbv_armv6);
 extern prototype_loopfilter_block(vp8_loop_filter_bv_armv6);
 extern prototype_loopfilter_block(vp8_loop_filter_mbh_armv6);
 extern prototype_loopfilter_block(vp8_loop_filter_bh_armv6);
-extern prototype_loopfilter_block(vp8_loop_filter_mbvs_armv6);
-extern prototype_loopfilter_block(vp8_loop_filter_bvs_armv6);
-extern prototype_loopfilter_block(vp8_loop_filter_mbhs_armv6);
-extern prototype_loopfilter_block(vp8_loop_filter_bhs_armv6);
+extern prototype_simple_loopfilter(vp8_loop_filter_bvs_armv6);
+extern prototype_simple_loopfilter(vp8_loop_filter_bhs_armv6);
+extern prototype_simple_loopfilter(vp8_loop_filter_simple_horizontal_edge_armv6);
+extern prototype_simple_loopfilter(vp8_loop_filter_simple_vertical_edge_armv6);

 #if !CONFIG_RUNTIME_CPU_DETECT
 #undef  vp8_lf_normal_mb_v
@@ -36,28 +38,29 @@ extern prototype_loopfilter_block(vp8_loop_filter_bhs_armv6);
 #define vp8_lf_normal_b_h vp8_loop_filter_bh_armv6

 #undef  vp8_lf_simple_mb_v
-#define vp8_lf_simple_mb_v vp8_loop_filter_mbvs_armv6
+#define vp8_lf_simple_mb_v vp8_loop_filter_simple_vertical_edge_armv6

 #undef  vp8_lf_simple_b_v
 #define vp8_lf_simple_b_v vp8_loop_filter_bvs_armv6

 #undef  vp8_lf_simple_mb_h
-#define vp8_lf_simple_mb_h vp8_loop_filter_mbhs_armv6
+#define vp8_lf_simple_mb_h vp8_loop_filter_simple_horizontal_edge_armv6

 #undef  vp8_lf_simple_b_h
 #define vp8_lf_simple_b_h vp8_loop_filter_bhs_armv6
-#endif
-#endif
+#endif /* !CONFIG_RUNTIME_CPU_DETECT */
+
+#endif /* HAVE_ARMV6 */

 #if HAVE_ARMV7
 extern prototype_loopfilter_block(vp8_loop_filter_mbv_neon);
 extern prototype_loopfilter_block(vp8_loop_filter_bv_neon);
 extern prototype_loopfilter_block(vp8_loop_filter_mbh_neon);
 extern prototype_loopfilter_block(vp8_loop_filter_bh_neon);
-extern prototype_loopfilter_block(vp8_loop_filter_mbvs_neon);
-extern prototype_loopfilter_block(vp8_loop_filter_bvs_neon);
-extern prototype_loopfilter_block(vp8_loop_filter_mbhs_neon);
-extern prototype_loopfilter_block(vp8_loop_filter_bhs_neon);
+extern prototype_simple_loopfilter(vp8_loop_filter_mbvs_neon);
+extern prototype_simple_loopfilter(vp8_loop_filter_bvs_neon);
+extern prototype_simple_loopfilter(vp8_loop_filter_mbhs_neon);
+extern prototype_simple_loopfilter(vp8_loop_filter_bhs_neon);

 #if !CONFIG_RUNTIME_CPU_DETECT
 #undef  vp8_lf_normal_mb_v
@@ -83,7 +86,8 @@ extern prototype_loopfilter_block(vp8_loop_filter_bhs_neon);

 #undef  vp8_lf_simple_b_h
 #define vp8_lf_simple_b_h vp8_loop_filter_bhs_neon
-#endif
-#endif
+#endif /* !CONFIG_RUNTIME_CPU_DETECT */

-#endif
+#endif /* HAVE_ARMV7 */
+
+#endif /* LOOPFILTER_ARM_H */
--- a/vp8/common/arm/neon/bilinearpredict16x16_neon.asm
+++ b/vp8/common/arm/neon/bilinearpredict16x16_neon.asm
@@ -25,7 +25,7 @@
 |vp8_bilinear_predict16x16_neon| PROC
    push            {r4-r5, lr}

-    ldr             r12, _bifilter16_coeff_
+    adr             r12, bifilter16_coeff
    ldr             r4, [sp, #12]           ;load parameters from stack
    ldr             r5, [sp, #16]           ;load parameters from stack

@@ -350,12 +350,7 @@ filt_blk2d_spo16x16_loop_neon
    ENDP

 ;-----------------
-    AREA    bifilters16_dat, DATA, READWRITE            ;read/write by default
-;Data section with name data_area is specified. DCD reserves space in memory for 48 data.
-;One word each is reserved. Label filter_coeff can be used to access the data.
-;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
-_bifilter16_coeff_
-    DCD     bifilter16_coeff
+
 bifilter16_coeff
    DCD     128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112

--- a/vp8/common/arm/neon/bilinearpredict4x4_neon.asm
+++ b/vp8/common/arm/neon/bilinearpredict4x4_neon.asm
@@ -25,7 +25,7 @@
 |vp8_bilinear_predict4x4_neon| PROC
    push            {r4, lr}

-    ldr             r12, _bifilter4_coeff_
+    adr             r12, bifilter4_coeff
    ldr             r4, [sp, #8]            ;load parameters from stack
    ldr             lr, [sp, #12]           ;load parameters from stack

@@ -123,12 +123,7 @@ skip_secondpass_filter
    ENDP

 ;-----------------
-    AREA    bilinearfilters4_dat, DATA, READWRITE           ;read/write by default
-;Data section with name data_area is specified. DCD reserves space in memory for 48 data.
-;One word each is reserved. Label filter_coeff can be used to access the data.
-;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
-_bifilter4_coeff_
-    DCD     bifilter4_coeff
+
 bifilter4_coeff
    DCD     128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112

--- a/vp8/common/arm/neon/bilinearpredict8x4_neon.asm
+++ b/vp8/common/arm/neon/bilinearpredict8x4_neon.asm
@@ -25,7 +25,7 @@
 |vp8_bilinear_predict8x4_neon| PROC
    push            {r4, lr}

-    ldr             r12, _bifilter8x4_coeff_
+    adr             r12, bifilter8x4_coeff
    ldr             r4, [sp, #8]            ;load parameters from stack
    ldr             lr, [sp, #12]           ;load parameters from stack

@@ -128,12 +128,7 @@ skip_secondpass_filter
    ENDP

 ;-----------------
-    AREA    bifilters8x4_dat, DATA, READWRITE           ;read/write by default
-;Data section with name data_area is specified. DCD reserves space in memory for 48 data.
-;One word each is reserved. Label filter_coeff can be used to access the data.
-;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
-_bifilter8x4_coeff_
-    DCD     bifilter8x4_coeff
+
 bifilter8x4_coeff
    DCD     128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112

--- a/vp8/common/arm/neon/bilinearpredict8x8_neon.asm
+++ b/vp8/common/arm/neon/bilinearpredict8x8_neon.asm
@@ -25,7 +25,7 @@
 |vp8_bilinear_predict8x8_neon| PROC
    push            {r4, lr}

-    ldr             r12, _bifilter8_coeff_
+    adr             r12, bifilter8_coeff
    ldr             r4, [sp, #8]            ;load parameters from stack
    ldr             lr, [sp, #12]           ;load parameters from stack

@@ -176,12 +176,7 @@ skip_secondpass_filter
    ENDP

 ;-----------------
-    AREA    bifilters8_dat, DATA, READWRITE         ;read/write by default
-;Data section with name data_area is specified. DCD reserves space in memory for 48 data.
-;One word each is reserved. Label filter_coeff can be used to access the data.
-;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
-_bifilter8_coeff_
-    DCD     bifilter8_coeff
+
 bifilter8_coeff
    DCD     128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112

--- a/vp8/common/arm/neon/iwalsh_neon.asm
+++ b/vp8/common/arm/neon/iwalsh_neon.asm
@@ -20,19 +20,16 @@
 |vp8_short_inv_walsh4x4_neon| PROC

    ; read in all four lines of values: d0->d3
-    vldm.64 r0, {q0, q1}
+    vld1.i16 {q0-q1}, [r0@128]

    ; first for loop
-
    vadd.s16 d4, d0, d3 ;a = [0] + [12]
-    vadd.s16 d5, d1, d2 ;b = [4] + [8]
-    vsub.s16 d6, d1, d2 ;c = [4] - [8]
-    vsub.s16 d7, d0, d3 ;d = [0] - [12]
+    vadd.s16 d6, d1, d2 ;b = [4] + [8]
+    vsub.s16 d5, d0, d3 ;d = [0] - [12]
+    vsub.s16 d7, d1, d2 ;c = [4] - [8]

-    vadd.s16 d0, d4, d5 ;a + b
-    vadd.s16 d1, d6, d7 ;c + d
-    vsub.s16 d2, d4, d5 ;a - b
-    vsub.s16 d3, d7, d6 ;d - c
+    vadd.s16 q0, q2, q3 ; a+b d+c
+    vsub.s16 q1, q2, q3 ; a-b d-c

    vtrn.32 d0, d2 ;d0:  0  1  8  9
                   ;d2:  2  3 10 11
@@ -47,29 +44,22 @@
    ; second for loop

    vadd.s16 d4, d0, d3 ;a = [0] + [3]
-    vadd.s16 d5, d1, d2 ;b = [1] + [2]
-    vsub.s16 d6, d1, d2 ;c = [1] - [2]
-    vsub.s16 d7, d0, d3 ;d = [0] - [3]
+    vadd.s16 d6, d1, d2 ;b = [1] + [2]
+    vsub.s16 d5, d0, d3 ;d = [0] - [3]
+    vsub.s16 d7, d1, d2 ;c = [1] - [2]

-    vadd.s16 d0, d4, d5 ;e = a + b
-    vadd.s16 d1, d6, d7 ;f = c + d
-    vsub.s16 d2, d4, d5 ;g = a - b
-    vsub.s16 d3, d7, d6 ;h = d - c
+    vmov.i16 q8, #3

-    vmov.i16 q2, #3
-    vadd.i16 q0, q0, q2 ;e/f += 3
-    vadd.i16 q1, q1, q2 ;g/h += 3
+    vadd.s16 q0, q2, q3 ; a+b d+c
+    vsub.s16 q1, q2, q3 ; a-b d-c
+
+    vadd.i16 q0, q0, q8 ;e/f += 3
+    vadd.i16 q1, q1, q8 ;g/h += 3

    vshr.s16 q0, q0, #3 ;e/f >> 3
    vshr.s16 q1, q1, #3 ;g/h >> 3

-    vtrn.32 d0, d2
-    vtrn.32 d1, d3
-    vtrn.16 d0, d1
-    vtrn.16 d2, d3
-
-    vstmia.16 r1!, {q0}
-    vstmia.16 r1!, {q1}
+    vst4.i16 {d0,d1,d2,d3}, [r1@128]

    bx lr
    ENDP    ; |vp8_short_inv_walsh4x4_neon|
@@ -77,19 +67,13 @@

 ;short vp8_short_inv_walsh4x4_1_neon(short *input, short *output)
 |vp8_short_inv_walsh4x4_1_neon| PROC
-    ; load a full line into a neon register
-    vld1.16  {q0}, [r0]
-    ; extract first element and replicate
-    vdup.16 q1, d0[0]
-    ; add 3 to all values
-    vmov.i16 q2, #3
-    vadd.i16 q3, q1, q2
-    ; right shift
-    vshr.s16 q3, q3, #3
-    ; write it back
-    vstmia.16 r1!, {q3}
-    vstmia.16 r1!, {q3}
-
+    ldrsh r2, [r0]          ; load input[0]
+    add r3, r2, #3          ; add 3
+    add r2, r1, #16         ; base for last 8 output
+    asr r0, r3, #3          ; right shift 3
+    vdup.16 q0, r0          ; load and duplicate
+    vst1.16 {q0}, [r1@128]  ; write back 8
+    vst1.16 {q0}, [r2@128]  ; write back last 8
    bx lr
    ENDP    ; |vp8_short_inv_walsh4x4_1_neon|

--- a/vp8/common/arm/neon/loopfilter_neon.asm
+++ b/vp8/common/arm/neon/loopfilter_neon.asm
@@ -14,109 +14,97 @@
    EXPORT  |vp8_loop_filter_vertical_edge_y_neon|
    EXPORT  |vp8_loop_filter_vertical_edge_uv_neon|
    ARM
-    REQUIRE8
-    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2

-; flimit, limit, and thresh should be positive numbers.
-; All 16 elements in these variables are equal.
-
-; void vp8_loop_filter_horizontal_edge_y_neon(unsigned char *src, int pitch,
-;                                             const signed char *flimit,
-;                                             const signed char *limit,
-;                                             const signed char *thresh,
-;                                             int count)
 ; r0    unsigned char *src
 ; r1    int pitch
-; r2    const signed char *flimit
-; r3    const signed char *limit
-; sp    const signed char *thresh,
-; sp+4  int count (unused)
+; r2    unsigned char blimit
+; r3    unsigned char limit
+; sp    unsigned char thresh,
 |vp8_loop_filter_horizontal_edge_y_neon| PROC
-    stmdb       sp!, {lr}
-    vld1.s8     {d0[], d1[]}, [r2]          ; flimit
-    vld1.s8     {d2[], d3[]}, [r3]          ; limit
+    push        {lr}
+    vdup.u8     q0, r2                     ; duplicate blimit
+    vdup.u8     q1, r3                     ; duplicate limit
    sub         r2, r0, r1, lsl #2         ; move src pointer down by 4 lines
-    ldr         r12, [sp, #4]               ; load thresh pointer
+    ldr         r3, [sp, #4]               ; load thresh
+    add         r12, r2, r1
+    add         r1, r1, r1

-    vld1.u8     {q3}, [r2], r1              ; p3
-    vld1.u8     {q4}, [r2], r1              ; p2
-    vld1.u8     {q5}, [r2], r1              ; p1
-    vld1.u8     {q6}, [r2], r1              ; p0
-    vld1.u8     {q7}, [r2], r1              ; q0
-    vld1.u8     {q8}, [r2], r1              ; q1
-    vld1.u8     {q9}, [r2], r1              ; q2
-    vld1.u8     {q10}, [r2]                 ; q3
-    vld1.s8     {d4[], d5[]}, [r12]         ; thresh
-    sub         r0, r0, r1, lsl #1
+    vdup.u8     q2, r3                     ; duplicate thresh
+
+    vld1.u8     {q3}, [r2@128], r1              ; p3
+    vld1.u8     {q4}, [r12@128], r1             ; p2
+    vld1.u8     {q5}, [r2@128], r1              ; p1
+    vld1.u8     {q6}, [r12@128], r1             ; p0
+    vld1.u8     {q7}, [r2@128], r1              ; q0
+    vld1.u8     {q8}, [r12@128], r1             ; q1
+    vld1.u8     {q9}, [r2@128]                  ; q2
+    vld1.u8     {q10}, [r12@128]                ; q3
+
+    sub         r2, r2, r1, lsl #1
+    sub         r12, r12, r1, lsl #1

    bl          vp8_loop_filter_neon

-    vst1.u8     {q5}, [r0], r1              ; store op1
-    vst1.u8     {q6}, [r0], r1              ; store op0
-    vst1.u8     {q7}, [r0], r1              ; store oq0
-    vst1.u8     {q8}, [r0], r1              ; store oq1
+    vst1.u8     {q5}, [r2@128], r1              ; store op1
+    vst1.u8     {q6}, [r12@128], r1             ; store op0
+    vst1.u8     {q7}, [r2@128], r1              ; store oq0
+    vst1.u8     {q8}, [r12@128], r1             ; store oq1

-    ldmia       sp!, {pc}
+    pop         {pc}
    ENDP        ; |vp8_loop_filter_horizontal_edge_y_neon|

-; void vp8_loop_filter_horizontal_edge_uv_neon(unsigned char *u, int pitch
-;                                              const signed char *flimit,
-;                                              const signed char *limit,
-;                                              const signed char *thresh,
-;                                              unsigned char *v)
+
 ; r0    unsigned char *u,
 ; r1    int pitch,
-; r2    const signed char *flimit,
-; r3    const signed char *limit,
-; sp    const signed char *thresh,
+; r2    unsigned char blimit
+; r3    unsigned char limit
+; sp    unsigned char thresh,
 ; sp+4  unsigned char *v
 |vp8_loop_filter_horizontal_edge_uv_neon| PROC
-    stmdb       sp!, {lr}
-    vld1.s8     {d0[], d1[]}, [r2]          ; flimit
-    vld1.s8     {d2[], d3[]}, [r3]          ; limit
+    push        {lr}
+    vdup.u8     q0, r2                      ; duplicate blimit
+    vdup.u8     q1, r3                      ; duplicate limit
+    ldr         r12, [sp, #4]               ; load thresh
    ldr         r2, [sp, #8]                ; load v ptr
+    vdup.u8     q2, r12                     ; duplicate thresh

    sub         r3, r0, r1, lsl #2          ; move u pointer down by 4 lines
-    vld1.u8     {d6}, [r3], r1              ; p3
-    vld1.u8     {d8}, [r3], r1              ; p2
-    vld1.u8     {d10}, [r3], r1             ; p1
-    vld1.u8     {d12}, [r3], r1             ; p0
-    vld1.u8     {d14}, [r3], r1             ; q0
-    vld1.u8     {d16}, [r3], r1             ; q1
-    vld1.u8     {d18}, [r3], r1             ; q2
-    vld1.u8     {d20}, [r3]                 ; q3
-
-    ldr         r3, [sp, #4]                ; load thresh pointer
-
    sub         r12, r2, r1, lsl #2         ; move v pointer down by 4 lines
-    vld1.u8     {d7}, [r12], r1             ; p3
-    vld1.u8     {d9}, [r12], r1             ; p2
-    vld1.u8     {d11}, [r12], r1            ; p1
-    vld1.u8     {d13}, [r12], r1            ; p0
-    vld1.u8     {d15}, [r12], r1            ; q0
-    vld1.u8     {d17}, [r12], r1            ; q1
-    vld1.u8     {d19}, [r12], r1            ; q2
-    vld1.u8     {d21}, [r12]                ; q3

-    vld1.s8     {d4[], d5[]}, [r3]          ; thresh
+    vld1.u8     {d6}, [r3@64], r1              ; p3
+    vld1.u8     {d7}, [r12@64], r1             ; p3
+    vld1.u8     {d8}, [r3@64], r1              ; p2
+    vld1.u8     {d9}, [r12@64], r1             ; p2
+    vld1.u8     {d10}, [r3@64], r1             ; p1
+    vld1.u8     {d11}, [r12@64], r1            ; p1
+    vld1.u8     {d12}, [r3@64], r1             ; p0
+    vld1.u8     {d13}, [r12@64], r1            ; p0
+    vld1.u8     {d14}, [r3@64], r1             ; q0
+    vld1.u8     {d15}, [r12@64], r1            ; q0
+    vld1.u8     {d16}, [r3@64], r1             ; q1
+    vld1.u8     {d17}, [r12@64], r1            ; q1
+    vld1.u8     {d18}, [r3@64], r1             ; q2
+    vld1.u8     {d19}, [r12@64], r1            ; q2
+    vld1.u8     {d20}, [r3@64]                 ; q3
+    vld1.u8     {d21}, [r12@64]                ; q3

    bl          vp8_loop_filter_neon

    sub         r0, r0, r1, lsl #1
    sub         r2, r2, r1, lsl #1

-    vst1.u8     {d10}, [r0], r1             ; store u op1
-    vst1.u8     {d11}, [r2], r1             ; store v op1
-    vst1.u8     {d12}, [r0], r1             ; store u op0
-    vst1.u8     {d13}, [r2], r1             ; store v op0
-    vst1.u8     {d14}, [r0], r1             ; store u oq0
-    vst1.u8     {d15}, [r2], r1             ; store v oq0
-    vst1.u8     {d16}, [r0]                 ; store u oq1
-    vst1.u8     {d17}, [r2]                 ; store v oq1
+    vst1.u8     {d10}, [r0@64], r1             ; store u op1
+    vst1.u8     {d11}, [r2@64], r1             ; store v op1
+    vst1.u8     {d12}, [r0@64], r1             ; store u op0
+    vst1.u8     {d13}, [r2@64], r1             ; store v op0
+    vst1.u8     {d14}, [r0@64], r1             ; store u oq0
+    vst1.u8     {d15}, [r2@64], r1             ; store v oq0
+    vst1.u8     {d16}, [r0@64]                 ; store u oq1
+    vst1.u8     {d17}, [r2@64]                 ; store v oq1

-    ldmia       sp!, {pc}
+    pop         {pc}
    ENDP        ; |vp8_loop_filter_horizontal_edge_uv_neon|

 ; void vp8_loop_filter_vertical_edge_y_neon(unsigned char *src, int pitch,
@@ -124,39 +112,38 @@
 ;                                           const signed char *limit,
 ;                                           const signed char *thresh,
 ;                                           int count)
-; r0    unsigned char *src,
-; r1    int pitch,
-; r2    const signed char *flimit,
-; r3    const signed char *limit,
-; sp    const signed char *thresh,
-; sp+4  int count (unused)
+; r0    unsigned char *src
+; r1    int pitch
+; r2    unsigned char blimit
+; r3    unsigned char limit
+; sp    unsigned char thresh,
+
 |vp8_loop_filter_vertical_edge_y_neon| PROC
-    stmdb       sp!, {lr}
-    vld1.s8     {d0[], d1[]}, [r2]          ; flimit
-    vld1.s8     {d2[], d3[]}, [r3]          ; limit
+    push        {lr}
+    vdup.u8     q0, r2                     ; duplicate blimit
+    vdup.u8     q1, r3                     ; duplicate limit
    sub         r2, r0, #4                 ; src ptr down by 4 columns
-    sub         r0, r0, #2                  ; dst ptr
-    ldr         r12, [sp, #4]               ; load thresh pointer
+    add         r1, r1, r1
+    ldr         r3, [sp, #4]               ; load thresh
+    add         r12, r2, r1, asr #1

-    vld1.u8     {d6}, [r2], r1              ; load first 8-line src data
-    vld1.u8     {d8}, [r2], r1
+    vld1.u8     {d6}, [r2], r1
+    vld1.u8     {d8}, [r12], r1
    vld1.u8     {d10}, [r2], r1
-    vld1.u8     {d12}, [r2], r1
+    vld1.u8     {d12}, [r12], r1
    vld1.u8     {d14}, [r2], r1
-    vld1.u8     {d16}, [r2], r1
+    vld1.u8     {d16}, [r12], r1
    vld1.u8     {d18}, [r2], r1
-    vld1.u8     {d20}, [r2], r1
-
-    vld1.s8     {d4[], d5[]}, [r12]         ; thresh
+    vld1.u8     {d20}, [r12], r1

    vld1.u8     {d7}, [r2], r1              ; load second 8-line src data
-    vld1.u8     {d9}, [r2], r1
+    vld1.u8     {d9}, [r12], r1
    vld1.u8     {d11}, [r2], r1
-    vld1.u8     {d13}, [r2], r1
+    vld1.u8     {d13}, [r12], r1
    vld1.u8     {d15}, [r2], r1
-    vld1.u8     {d17}, [r2], r1
-    vld1.u8     {d19}, [r2], r1
-    vld1.u8     {d21}, [r2]
+    vld1.u8     {d17}, [r12], r1
+    vld1.u8     {d19}, [r2]
+    vld1.u8     {d21}, [r12]

    ;transpose to 8x16 matrix
    vtrn.32     q3, q7
@@ -164,6 +151,8 @@
    vtrn.32     q5, q9
    vtrn.32     q6, q10

+    vdup.u8     q2, r3                     ; duplicate thresh
+
    vtrn.16     q3, q5
    vtrn.16     q4, q6
    vtrn.16     q7, q9
@@ -178,28 +167,34 @@

    vswp        d12, d11
    vswp        d16, d13
+
+    sub         r0, r0, #2                 ; dst ptr
+
    vswp        d14, d12
    vswp        d16, d15

+    add         r12, r0, r1, asr #1
+
    ;store op1, op0, oq0, oq1
    vst4.8      {d10[0], d11[0], d12[0], d13[0]}, [r0], r1
-    vst4.8      {d10[1], d11[1], d12[1], d13[1]}, [r0], r1
+    vst4.8      {d10[1], d11[1], d12[1], d13[1]}, [r12], r1
    vst4.8      {d10[2], d11[2], d12[2], d13[2]}, [r0], r1
-    vst4.8      {d10[3], d11[3], d12[3], d13[3]}, [r0], r1
+    vst4.8      {d10[3], d11[3], d12[3], d13[3]}, [r12], r1
    vst4.8      {d10[4], d11[4], d12[4], d13[4]}, [r0], r1
-    vst4.8      {d10[5], d11[5], d12[5], d13[5]}, [r0], r1
+    vst4.8      {d10[5], d11[5], d12[5], d13[5]}, [r12], r1
    vst4.8      {d10[6], d11[6], d12[6], d13[6]}, [r0], r1
-    vst4.8      {d10[7], d11[7], d12[7], d13[7]}, [r0], r1
-    vst4.8      {d14[0], d15[0], d16[0], d17[0]}, [r0], r1
-    vst4.8      {d14[1], d15[1], d16[1], d17[1]}, [r0], r1
-    vst4.8      {d14[2], d15[2], d16[2], d17[2]}, [r0], r1
-    vst4.8      {d14[3], d15[3], d16[3], d17[3]}, [r0], r1
-    vst4.8      {d14[4], d15[4], d16[4], d17[4]}, [r0], r1
-    vst4.8      {d14[5], d15[5], d16[5], d17[5]}, [r0], r1
-    vst4.8      {d14[6], d15[6], d16[6], d17[6]}, [r0], r1
-    vst4.8      {d14[7], d15[7], d16[7], d17[7]}, [r0]
+    vst4.8      {d10[7], d11[7], d12[7], d13[7]}, [r12], r1

-    ldmia       sp!, {pc}
+    vst4.8      {d14[0], d15[0], d16[0], d17[0]}, [r0], r1
+    vst4.8      {d14[1], d15[1], d16[1], d17[1]}, [r12], r1
+    vst4.8      {d14[2], d15[2], d16[2], d17[2]}, [r0], r1
+    vst4.8      {d14[3], d15[3], d16[3], d17[3]}, [r12], r1
+    vst4.8      {d14[4], d15[4], d16[4], d17[4]}, [r0], r1
+    vst4.8      {d14[5], d15[5], d16[5], d17[5]}, [r12], r1
+    vst4.8      {d14[6], d15[6], d16[6], d17[6]}, [r0]
+    vst4.8      {d14[7], d15[7], d16[7], d17[7]}, [r12]
+
+    pop         {pc}
    ENDP        ; |vp8_loop_filter_vertical_edge_y_neon|

 ; void vp8_loop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch
@@ -209,38 +204,36 @@
 ;                                            unsigned char *v)
 ; r0    unsigned char *u,
 ; r1    int pitch,
-; r2    const signed char *flimit,
-; r3    const signed char *limit,
-; sp    const signed char *thresh,
+; r2    unsigned char blimit
+; r3    unsigned char limit
+; sp    unsigned char thresh,
 ; sp+4  unsigned char *v
 |vp8_loop_filter_vertical_edge_uv_neon| PROC
-    stmdb       sp!, {lr}
+    push        {lr}
+    vdup.u8     q0, r2                      ; duplicate blimit
    sub         r12, r0, #4                 ; move u pointer down by 4 columns
-    vld1.s8     {d0[], d1[]}, [r2]          ; flimit
-    vld1.s8     {d2[], d3[]}, [r3]          ; limit
-
    ldr         r2, [sp, #8]                ; load v ptr
+    vdup.u8     q1, r3                      ; duplicate limit
+    sub         r3, r2, #4                  ; move v pointer down by 4 columns

    vld1.u8     {d6}, [r12], r1             ;load u data
-    vld1.u8     {d8}, [r12], r1
-    vld1.u8     {d10}, [r12], r1
-    vld1.u8     {d12}, [r12], r1
-    vld1.u8     {d14}, [r12], r1
-    vld1.u8     {d16}, [r12], r1
-    vld1.u8     {d18}, [r12], r1
-    vld1.u8     {d20}, [r12]
-
-    sub         r3, r2, #4                  ; move v pointer down by 4 columns
    vld1.u8     {d7}, [r3], r1              ;load v data
+    vld1.u8     {d8}, [r12], r1
    vld1.u8     {d9}, [r3], r1
+    vld1.u8     {d10}, [r12], r1
    vld1.u8     {d11}, [r3], r1
+    vld1.u8     {d12}, [r12], r1
    vld1.u8     {d13}, [r3], r1
+    vld1.u8     {d14}, [r12], r1
    vld1.u8     {d15}, [r3], r1
+    vld1.u8     {d16}, [r12], r1
    vld1.u8     {d17}, [r3], r1
+    vld1.u8     {d18}, [r12], r1
    vld1.u8     {d19}, [r3], r1
+    vld1.u8     {d20}, [r12]
    vld1.u8     {d21}, [r3]

-    ldr         r12, [sp, #4]               ; load thresh pointer
+    ldr        r12, [sp, #4]               ; load thresh

    ;transpose to 8x16 matrix
    vtrn.32     q3, q7
@@ -248,6 +241,8 @@
    vtrn.32     q5, q9
    vtrn.32     q6, q10

+    vdup.u8     q2, r12                     ; duplicate thresh
+
    vtrn.16     q3, q5
    vtrn.16     q4, q6
    vtrn.16     q7, q9
@@ -258,18 +253,16 @@
    vtrn.8      q7, q8
    vtrn.8      q9, q10

-    vld1.s8     {d4[], d5[]}, [r12]         ; thresh
-
    bl          vp8_loop_filter_neon

-    sub         r0, r0, #2
-    sub         r2, r2, #2
-
    vswp        d12, d11
    vswp        d16, d13
    vswp        d14, d12
    vswp        d16, d15

+    sub         r0, r0, #2
+    sub         r2, r2, #2
+
    ;store op1, op0, oq0, oq1
    vst4.8      {d10[0], d11[0], d12[0], d13[0]}, [r0], r1
    vst4.8      {d14[0], d15[0], d16[0], d17[0]}, [r2], r1
@@ -288,7 +281,7 @@
    vst4.8      {d10[7], d11[7], d12[7], d13[7]}, [r0]
    vst4.8      {d14[7], d15[7], d16[7], d17[7]}, [r2]

-    ldmia       sp!, {pc}
+    pop         {pc}
    ENDP        ; |vp8_loop_filter_vertical_edge_uv_neon|

 ; void vp8_loop_filter_neon();
@@ -308,7 +301,6 @@
 ; q9    q2
 ; q10   q3
 |vp8_loop_filter_neon| PROC
-    ldr         r12, _lf_coeff_

    ; vp8_filter_mask
    vabd.u8     q11, q3, q4                 ; abs(p3 - p2)
@@ -317,42 +309,44 @@
    vabd.u8     q14, q8, q7                 ; abs(q1 - q0)
    vabd.u8     q3, q9, q8                  ; abs(q2 - q1)
    vabd.u8     q4, q10, q9                 ; abs(q3 - q2)
-    vabd.u8     q9, q6, q7                  ; abs(p0 - q0)

    vmax.u8     q11, q11, q12
    vmax.u8     q12, q13, q14
    vmax.u8     q3, q3, q4
    vmax.u8     q15, q11, q12

+    vabd.u8     q9, q6, q7                  ; abs(p0 - q0)
+
    ; vp8_hevmask
    vcgt.u8     q13, q13, q2                ; (abs(p1 - p0) > thresh)*-1
    vcgt.u8     q14, q14, q2                ; (abs(q1 - q0) > thresh)*-1
    vmax.u8     q15, q15, q3

-    vadd.u8     q0, q0, q0                  ; flimit * 2
-    vadd.u8     q0, q0, q1                  ; flimit * 2 + limit
-    vcge.u8     q15, q1, q15
+    vmov.u8     q10, #0x80                   ; 0x80

    vabd.u8     q2, q5, q8                  ; a = abs(p1 - q1)
    vqadd.u8    q9, q9, q9                  ; b = abs(p0 - q0) * 2
-    vshr.u8     q2, q2, #1                  ; a = a / 2
-    vqadd.u8    q9, q9, q2                  ; a = b + a
-    vcge.u8     q9, q0, q9                  ; (a > flimit * 2 + limit) * -1

-    vld1.u8     {q0}, [r12]!
+    vcge.u8     q15, q1, q15

    ; vp8_filter() function
    ; convert to signed
-    veor        q7, q7, q0                  ; qs0
-    veor        q6, q6, q0                  ; ps0
-    veor        q5, q5, q0                  ; ps1
-    veor        q8, q8, q0                  ; qs1
+    veor        q7, q7, q10                 ; qs0
+    vshr.u8     q2, q2, #1                  ; a = a / 2
+    veor        q6, q6, q10                 ; ps0

-    vld1.u8     {q10}, [r12]!
+    veor        q5, q5, q10                 ; ps1
+    vqadd.u8    q9, q9, q2                  ; a = b + a
+
+    veor        q8, q8, q10                 ; qs1
+
+    vmov.u8     q10, #3                     ; #3

    vsubl.s8    q2, d14, d12                ; ( qs0 - ps0)
    vsubl.s8    q11, d15, d13

+    vcge.u8     q9, q0, q9                  ; (a > flimit * 2 + limit) * -1
+
    vmovl.u8    q4, d20

    vqsub.s8    q1, q5, q8                  ; vp8_filter = clamp(ps1-qs1)
@@ -367,7 +361,7 @@
    vaddw.s8    q2, q2, d2
    vaddw.s8    q11, q11, d3

-    vld1.u8     {q9}, [r12]!
+    vmov.u8     q9, #4                      ; #4

    ; vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
    vqmovn.s16  d2, q2
@@ -379,31 +373,25 @@
    vshr.s8     q2, q2, #3                  ; Filter2 >>= 3
    vshr.s8     q1, q1, #3                  ; Filter1 >>= 3

+
    vqadd.s8    q11, q6, q2                 ; u = clamp(ps0 + Filter2)
    vqsub.s8    q10, q7, q1                 ; u = clamp(qs0 - Filter1)

    ; outer tap adjustments: ++vp8_filter >> 1
    vrshr.s8    q1, q1, #1
    vbic        q1, q1, q14                 ; vp8_filter &= ~hev
-
+    vmov.u8     q0, #0x80                   ; 0x80
    vqadd.s8    q13, q5, q1                 ; u = clamp(ps1 + vp8_filter)
    vqsub.s8    q12, q8, q1                 ; u = clamp(qs1 - vp8_filter)

-    veor        q5, q13, q0                 ; *op1 = u^0x80
    veor        q6, q11, q0                 ; *op0 = u^0x80
    veor        q7, q10, q0                 ; *oq0 = u^0x80
+    veor        q5, q13, q0                 ; *op1 = u^0x80
    veor        q8, q12, q0                 ; *oq1 = u^0x80

    bx          lr
    ENDP        ; |vp8_loop_filter_horizontal_edge_y_neon|

-    AREA    loopfilter_dat, DATA, READONLY
-_lf_coeff_
-    DCD     lf_coeff
-lf_coeff
-    DCD     0x80808080, 0x80808080, 0x80808080, 0x80808080
-    DCD     0x03030303, 0x03030303, 0x03030303, 0x03030303
-    DCD     0x04040404, 0x04040404, 0x04040404, 0x04040404
-    DCD     0x01010101, 0x01010101, 0x01010101, 0x01010101
+;-----------------

    END
--- a/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm
+++ b/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm
@@ -9,110 +9,109 @@
 ;


-    EXPORT  |vp8_loop_filter_simple_horizontal_edge_neon|
+    ;EXPORT  |vp8_loop_filter_simple_horizontal_edge_neon|
+    EXPORT  |vp8_loop_filter_bhs_neon|
+    EXPORT  |vp8_loop_filter_mbhs_neon|
    ARM
-    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2
-;Note: flimit, limit, and thresh shpuld be positive numbers. All 16 elements in flimit
-;are equal. So, in the code, only one load is needed
-;for flimit. Same way applies to limit and thresh.
-; r0    unsigned char *s,
-; r1    int p, //pitch
-; r2    const signed char *flimit,
-; r3    const signed char *limit,
-; stack(r4) const signed char *thresh,
-; //stack(r5)   int count --unused
+
+; r0    unsigned char *s, PRESERVE
+; r1    int p, PRESERVE
+; q1    limit, PRESERVE

 |vp8_loop_filter_simple_horizontal_edge_neon| PROC
-    sub         r0, r0, r1, lsl #1          ; move src pointer down by 2 lines

-    ldr         r12, _lfhy_coeff_
-    vld1.u8     {q5}, [r0], r1              ; p1
-    vld1.s8     {d2[], d3[]}, [r2]          ; flimit
-    vld1.s8     {d26[], d27[]}, [r3]        ; limit -> q13
-    vld1.u8     {q6}, [r0], r1              ; p0
-    vld1.u8     {q0}, [r12]!                ; 0x80
-    vld1.u8     {q7}, [r0], r1              ; q0
-    vld1.u8     {q10}, [r12]!               ; 0x03
-    vld1.u8     {q8}, [r0]                  ; q1
+    sub         r3, r0, r1, lsl #1          ; move src pointer down by 2 lines
+
+    vld1.u8     {q7}, [r0@128], r1          ; q0
+    vld1.u8     {q5}, [r3@128], r1          ; p0
+    vld1.u8     {q8}, [r0@128]              ; q1
+    vld1.u8     {q6}, [r3@128]              ; p1

-    ;vp8_filter_mask() function
    vabd.u8     q15, q6, q7                 ; abs(p0 - q0)
    vabd.u8     q14, q5, q8                 ; abs(p1 - q1)
+
    vqadd.u8    q15, q15, q15               ; abs(p0 - q0) * 2
    vshr.u8     q14, q14, #1                ; abs(p1 - q1) / 2
+    vmov.u8     q0, #0x80                   ; 0x80
+    vmov.s16    q13, #3
    vqadd.u8    q15, q15, q14               ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2

-    ;vp8_filter() function
    veor        q7, q7, q0                  ; qs0: q0 offset to convert to a signed value
    veor        q6, q6, q0                  ; ps0: p0 offset to convert to a signed value
    veor        q5, q5, q0                  ; ps1: p1 offset to convert to a signed value
    veor        q8, q8, q0                  ; qs1: q1 offset to convert to a signed value

-    vadd.u8     q1, q1, q1                  ; flimit * 2
-    vadd.u8     q1, q1, q13                 ; flimit * 2 + limit
-    vcge.u8     q15, q1, q15                ; (abs(p0 - q0)*2 + abs(p1-q1)/2 > flimit*2 + limit)*-1
+    vcge.u8     q15, q1, q15                ; (abs(p0 - q0)*2 + abs(p1-q1)/2 > limit)*-1

-;;;;;;;;;;
-    ;vqsub.s8   q2, q7, q6                  ; ( qs0 - ps0)
    vsubl.s8    q2, d14, d12                ; ( qs0 - ps0)
    vsubl.s8    q3, d15, d13

    vqsub.s8    q4, q5, q8                  ; q4: vp8_filter = vp8_signed_char_clamp(ps1-qs1)

-    ;vmul.i8    q2, q2, q10                 ;  3 * ( qs0 - ps0)
-    vadd.s16    q11, q2, q2                 ;  3 * ( qs0 - ps0)
-    vadd.s16    q12, q3, q3
+    vmul.s16    q2, q2, q13                 ;  3 * ( qs0 - ps0)
+    vmul.s16    q3, q3, q13

-    vld1.u8     {q9}, [r12]!                ; 0x04
-
-    vadd.s16    q2, q2, q11
-    vadd.s16    q3, q3, q12
+    vmov.u8     q10, #0x03                  ; 0x03
+    vmov.u8     q9, #0x04                   ; 0x04

    vaddw.s8    q2, q2, d8                  ; vp8_filter + 3 * ( qs0 - ps0)
    vaddw.s8    q3, q3, d9

-    ;vqadd.s8   q4, q4, q2                  ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
    vqmovn.s16  d8, q2                      ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
    vqmovn.s16  d9, q3
-;;;;;;;;;;;;;

-    vand        q4, q4, q15                 ; vp8_filter &= mask
+    vand        q14, q4, q15                ; vp8_filter &= mask

-    vqadd.s8    q2, q4, q10                 ; Filter2 = vp8_signed_char_clamp(vp8_filter+3)
-    vqadd.s8    q4, q4, q9                  ; Filter1 = vp8_signed_char_clamp(vp8_filter+4)
+    vqadd.s8    q2, q14, q10                ; Filter2 = vp8_signed_char_clamp(vp8_filter+3)
+    vqadd.s8    q3, q14, q9                 ; Filter1 = vp8_signed_char_clamp(vp8_filter+4)
    vshr.s8     q2, q2, #3                  ; Filter2 >>= 3
-    vshr.s8     q4, q4, #3                  ; Filter1 >>= 3
+    vshr.s8     q4, q3, #3                  ; Filter1 >>= 3

-    sub         r0, r0, r1, lsl #1
+    sub         r0, r0, r1

    ;calculate output
    vqadd.s8    q11, q6, q2                 ; u = vp8_signed_char_clamp(ps0 + Filter2)
    vqsub.s8    q10, q7, q4                 ; u = vp8_signed_char_clamp(qs0 - Filter1)

-    add         r3, r0, r1
-
    veor        q6, q11, q0                 ; *op0 = u^0x80
    veor        q7, q10, q0                 ; *oq0 = u^0x80

-    vst1.u8     {q6}, [r0]                  ; store op0
-    vst1.u8     {q7}, [r3]                  ; store oq0
+    vst1.u8     {q6}, [r3@128]              ; store op0
+    vst1.u8     {q7}, [r0@128]              ; store oq0

    bx          lr
    ENDP        ; |vp8_loop_filter_simple_horizontal_edge_neon|

-;-----------------
-    AREA    hloopfiltery_dat, DATA, READWRITE           ;read/write by default
-;Data section with name data_area is specified. DCD reserves space in memory for 16 data.
-;One word each is reserved. Label filter_coeff can be used to access the data.
-;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
-_lfhy_coeff_
-    DCD     lfhy_coeff
-lfhy_coeff
-    DCD     0x80808080, 0x80808080, 0x80808080, 0x80808080
-    DCD     0x03030303, 0x03030303, 0x03030303, 0x03030303
-    DCD     0x04040404, 0x04040404, 0x04040404, 0x04040404
+; r0    unsigned char *y
+; r1    int ystride
+; r2    const unsigned char *blimit
+
+|vp8_loop_filter_bhs_neon| PROC
+    push        {r4, lr}
+    ldrb        r3, [r2]                    ; load blim from mem
+    vdup.s8     q1, r3                      ; duplicate blim
+
+    add         r0, r0, r1, lsl #2          ; src = y_ptr + 4 * y_stride
+    bl          vp8_loop_filter_simple_horizontal_edge_neon
+    ; vp8_loop_filter_simple_horizontal_edge_neon preserves r0, r1 and q1
+    add         r0, r0, r1, lsl #2          ; src = y_ptr + 8* y_stride
+    bl          vp8_loop_filter_simple_horizontal_edge_neon
+    add         r0, r0, r1, lsl #2          ; src = y_ptr + 12 * y_stride
+    pop         {r4, lr}
+    b           vp8_loop_filter_simple_horizontal_edge_neon
+    ENDP        ;|vp8_loop_filter_bhs_neon|
+
+; r0    unsigned char *y
+; r1    int ystride
+; r2    const unsigned char *blimit
+
+|vp8_loop_filter_mbhs_neon| PROC
+    ldrb        r3, [r2]                   ; load blim from mem
+    vdup.s8     q1, r3                     ; duplicate mblim
+    b           vp8_loop_filter_simple_horizontal_edge_neon
+    ENDP        ;|vp8_loop_filter_bhs_neon|

    END
--- a/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm
+++ b/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm
@@ -9,60 +9,54 @@
 ;


-    EXPORT  |vp8_loop_filter_simple_vertical_edge_neon|
+    ;EXPORT  |vp8_loop_filter_simple_vertical_edge_neon|
+    EXPORT |vp8_loop_filter_bvs_neon|
+    EXPORT |vp8_loop_filter_mbvs_neon|
    ARM
-    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2
-;Note: flimit, limit, and thresh should be positive numbers. All 16 elements in flimit
-;are equal. So, in the code, only one load is needed
-;for flimit. Same way applies to limit and thresh.
-; r0    unsigned char *s,
-; r1    int p, //pitch
-; r2    const signed char *flimit,
-; r3    const signed char *limit,
-; stack(r4) const signed char *thresh,
-; //stack(r5)   int count --unused
+
+; r0    unsigned char *s, PRESERVE
+; r1    int p, PRESERVE
+; q1    limit, PRESERVE

 |vp8_loop_filter_simple_vertical_edge_neon| PROC
    sub         r0, r0, #2                  ; move src pointer down by 2 columns
+    add         r12, r1, r1
+    add         r3, r0, r1

-    vld4.8      {d6[0], d7[0], d8[0], d9[0]}, [r0], r1
-    vld1.s8     {d2[], d3[]}, [r2]          ; flimit
-    vld1.s8     {d26[], d27[]}, [r3]        ; limit -> q13
-    vld4.8      {d6[1], d7[1], d8[1], d9[1]}, [r0], r1
-    ldr         r12, _vlfy_coeff_
-    vld4.8      {d6[2], d7[2], d8[2], d9[2]}, [r0], r1
-    vld4.8      {d6[3], d7[3], d8[3], d9[3]}, [r0], r1
-    vld4.8      {d6[4], d7[4], d8[4], d9[4]}, [r0], r1
-    vld4.8      {d6[5], d7[5], d8[5], d9[5]}, [r0], r1
-    vld4.8      {d6[6], d7[6], d8[6], d9[6]}, [r0], r1
-    vld4.8      {d6[7], d7[7], d8[7], d9[7]}, [r0], r1
+    vld4.8      {d6[0], d7[0], d8[0], d9[0]}, [r0], r12
+    vld4.8      {d6[1], d7[1], d8[1], d9[1]}, [r3], r12
+    vld4.8      {d6[2], d7[2], d8[2], d9[2]}, [r0], r12
+    vld4.8      {d6[3], d7[3], d8[3], d9[3]}, [r3], r12
+    vld4.8      {d6[4], d7[4], d8[4], d9[4]}, [r0], r12
+    vld4.8      {d6[5], d7[5], d8[5], d9[5]}, [r3], r12
+    vld4.8      {d6[6], d7[6], d8[6], d9[6]}, [r0], r12
+    vld4.8      {d6[7], d7[7], d8[7], d9[7]}, [r3], r12

-    vld4.8      {d10[0], d11[0], d12[0], d13[0]}, [r0], r1
-    vld1.u8     {q0}, [r12]!                ; 0x80
-    vld4.8      {d10[1], d11[1], d12[1], d13[1]}, [r0], r1
-    vld1.u8     {q11}, [r12]!               ; 0x03
-    vld4.8      {d10[2], d11[2], d12[2], d13[2]}, [r0], r1
-    vld1.u8     {q12}, [r12]!               ; 0x04
-    vld4.8      {d10[3], d11[3], d12[3], d13[3]}, [r0], r1
-    vld4.8      {d10[4], d11[4], d12[4], d13[4]}, [r0], r1
-    vld4.8      {d10[5], d11[5], d12[5], d13[5]}, [r0], r1
-    vld4.8      {d10[6], d11[6], d12[6], d13[6]}, [r0], r1
-    vld4.8      {d10[7], d11[7], d12[7], d13[7]}, [r0], r1
+    vld4.8      {d10[0], d11[0], d12[0], d13[0]}, [r0], r12
+    vld4.8      {d10[1], d11[1], d12[1], d13[1]}, [r3], r12
+    vld4.8      {d10[2], d11[2], d12[2], d13[2]}, [r0], r12
+    vld4.8      {d10[3], d11[3], d12[3], d13[3]}, [r3], r12
+    vld4.8      {d10[4], d11[4], d12[4], d13[4]}, [r0], r12
+    vld4.8      {d10[5], d11[5], d12[5], d13[5]}, [r3], r12
+    vld4.8      {d10[6], d11[6], d12[6], d13[6]}, [r0], r12
+    vld4.8      {d10[7], d11[7], d12[7], d13[7]}, [r3]

    vswp        d7, d10
    vswp        d12, d9
-    ;vswp       q4, q5                      ; p1:q3, p0:q5, q0:q4, q1:q6

    ;vp8_filter_mask() function
    ;vp8_hevmask() function
    sub         r0, r0, r1, lsl #4
    vabd.u8     q15, q5, q4                 ; abs(p0 - q0)
    vabd.u8     q14, q3, q6                 ; abs(p1 - q1)
+
    vqadd.u8    q15, q15, q15               ; abs(p0 - q0) * 2
    vshr.u8     q14, q14, #1                ; abs(p1 - q1) / 2
+    vmov.u8     q0, #0x80                   ; 0x80
+    vmov.s16    q11, #3
    vqadd.u8    q15, q15, q14               ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2

    veor        q4, q4, q0                  ; qs0: q0 offset to convert to a signed value
@@ -70,90 +64,91 @@
    veor        q3, q3, q0                  ; ps1: p1 offset to convert to a signed value
    veor        q6, q6, q0                  ; qs1: q1 offset to convert to a signed value

-    vadd.u8     q1, q1, q1                  ; flimit * 2
-    vadd.u8     q1, q1, q13                 ; flimit * 2 + limit
    vcge.u8     q15, q1, q15                ; abs(p0 - q0)*2 + abs(p1-q1)/2 > flimit*2 + limit)*-1

-    ;vp8_filter() function
-;;;;;;;;;;
-    ;vqsub.s8   q2, q5, q4                  ; ( qs0 - ps0)
    vsubl.s8    q2, d8, d10                 ; ( qs0 - ps0)
    vsubl.s8    q13, d9, d11

-    vqsub.s8    q1, q3, q6                  ; vp8_filter = vp8_signed_char_clamp(ps1-qs1)
+    vqsub.s8    q14, q3, q6                  ; vp8_filter = vp8_signed_char_clamp(ps1-qs1)

-    ;vmul.i8    q2, q2, q11                 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
-    vadd.s16    q10, q2, q2                 ;  3 * ( qs0 - ps0)
-    vadd.s16    q14, q13, q13
-    vadd.s16    q2, q2, q10
-    vadd.s16    q13, q13, q14
+    vmul.s16    q2, q2, q11                 ;  3 * ( qs0 - ps0)
+    vmul.s16    q13, q13, q11

-    ;vqadd.s8   q1, q1, q2
-    vaddw.s8    q2, q2, d2                  ; vp8_filter + 3 * ( qs0 - ps0)
-    vaddw.s8    q13, q13, d3
+    vmov.u8     q11, #0x03                  ; 0x03
+    vmov.u8     q12, #0x04                  ; 0x04

-    vqmovn.s16  d2, q2                      ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
-    vqmovn.s16  d3, q13
+    vaddw.s8    q2, q2, d28                  ; vp8_filter + 3 * ( qs0 - ps0)
+    vaddw.s8    q13, q13, d29
+
+    vqmovn.s16  d28, q2                      ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
+    vqmovn.s16  d29, q13

    add         r0, r0, #1
-    add         r2, r0, r1
-;;;;;;;;;;;
+    add         r3, r0, r1

-    vand        q1, q1, q15                 ; vp8_filter &= mask
+    vand        q14, q14, q15                 ; vp8_filter &= mask

-    vqadd.s8    q2, q1, q11                 ; Filter2 = vp8_signed_char_clamp(vp8_filter+3)
-    vqadd.s8    q1, q1, q12                 ; Filter1 = vp8_signed_char_clamp(vp8_filter+4)
+    vqadd.s8    q2, q14, q11                 ; Filter2 = vp8_signed_char_clamp(vp8_filter+3)
+    vqadd.s8    q3, q14, q12                 ; Filter1 = vp8_signed_char_clamp(vp8_filter+4)
    vshr.s8     q2, q2, #3                  ; Filter2 >>= 3
-    vshr.s8     q1, q1, #3                  ; Filter1 >>= 3
+    vshr.s8     q14, q3, #3                  ; Filter1 >>= 3

    ;calculate output
-    vqsub.s8    q10, q4, q1                 ; u = vp8_signed_char_clamp(qs0 - Filter1)
    vqadd.s8    q11, q5, q2                 ; u = vp8_signed_char_clamp(ps0 + Filter2)
+    vqsub.s8    q10, q4, q14                 ; u = vp8_signed_char_clamp(qs0 - Filter1)

-    veor        q7, q10, q0                 ; *oq0 = u^0x80
    veor        q6, q11, q0                 ; *op0 = u^0x80
-
-    add         r3, r2, r1
+    veor        q7, q10, q0                 ; *oq0 = u^0x80
+    add         r12, r1, r1
    vswp        d13, d14
-    add         r12, r3, r1

    ;store op1, op0, oq0, oq1
-    vst2.8      {d12[0], d13[0]}, [r0]
-    vst2.8      {d12[1], d13[1]}, [r2]
-    vst2.8      {d12[2], d13[2]}, [r3]
-    vst2.8      {d12[3], d13[3]}, [r12], r1
-    add         r0, r12, r1
-    vst2.8      {d12[4], d13[4]}, [r12]
-    vst2.8      {d12[5], d13[5]}, [r0], r1
-    add         r2, r0, r1
-    vst2.8      {d12[6], d13[6]}, [r0]
-    vst2.8      {d12[7], d13[7]}, [r2], r1
-    add         r3, r2, r1
-    vst2.8      {d14[0], d15[0]}, [r2]
-    vst2.8      {d14[1], d15[1]}, [r3], r1
-    add         r12, r3, r1
-    vst2.8      {d14[2], d15[2]}, [r3]
-    vst2.8      {d14[3], d15[3]}, [r12], r1
-    add         r0, r12, r1
-    vst2.8      {d14[4], d15[4]}, [r12]
-    vst2.8      {d14[5], d15[5]}, [r0], r1
-    add         r2, r0, r1
-    vst2.8      {d14[6], d15[6]}, [r0]
-    vst2.8      {d14[7], d15[7]}, [r2]
+    vst2.8      {d12[0], d13[0]}, [r0], r12
+    vst2.8      {d12[1], d13[1]}, [r3], r12
+    vst2.8      {d12[2], d13[2]}, [r0], r12
+    vst2.8      {d12[3], d13[3]}, [r3], r12
+    vst2.8      {d12[4], d13[4]}, [r0], r12
+    vst2.8      {d12[5], d13[5]}, [r3], r12
+    vst2.8      {d12[6], d13[6]}, [r0], r12
+    vst2.8      {d12[7], d13[7]}, [r3], r12
+    vst2.8      {d14[0], d15[0]}, [r0], r12
+    vst2.8      {d14[1], d15[1]}, [r3], r12
+    vst2.8      {d14[2], d15[2]}, [r0], r12
+    vst2.8      {d14[3], d15[3]}, [r3], r12
+    vst2.8      {d14[4], d15[4]}, [r0], r12
+    vst2.8      {d14[5], d15[5]}, [r3], r12
+    vst2.8      {d14[6], d15[6]}, [r0], r12
+    vst2.8      {d14[7], d15[7]}, [r3]

    bx          lr
    ENDP        ; |vp8_loop_filter_simple_vertical_edge_neon|

-;-----------------
-    AREA    vloopfiltery_dat, DATA, READWRITE           ;read/write by default
-;Data section with name data_area is specified. DCD reserves space in memory for 16 data.
-;One word each is reserved. Label filter_coeff can be used to access the data.
-;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
-_vlfy_coeff_
-    DCD     vlfy_coeff
-vlfy_coeff
-    DCD     0x80808080, 0x80808080, 0x80808080, 0x80808080
-    DCD     0x03030303, 0x03030303, 0x03030303, 0x03030303
-    DCD     0x04040404, 0x04040404, 0x04040404, 0x04040404
+; r0    unsigned char *y
+; r1    int ystride
+; r2    const unsigned char *blimit

+|vp8_loop_filter_bvs_neon| PROC
+    push        {r4, lr}
+    ldrb        r3, [r2]                   ; load blim from mem
+    mov         r4, r0
+    add         r0, r0, #4
+    vdup.s8     q1, r3                     ; duplicate blim
+    bl          vp8_loop_filter_simple_vertical_edge_neon
+    ; vp8_loop_filter_simple_vertical_edge_neon preserves  r1 and q1
+    add         r0, r4, #8
+    bl          vp8_loop_filter_simple_vertical_edge_neon
+    add         r0, r4, #12
+    pop         {r4, lr}
+    b           vp8_loop_filter_simple_vertical_edge_neon
+    ENDP        ;|vp8_loop_filter_bvs_neon|
+
+; r0    unsigned char *y
+; r1    int ystride
+; r2    const unsigned char *blimit
+
+|vp8_loop_filter_mbvs_neon| PROC
+    ldrb        r3, [r2]                   ; load mblim from mem
+    vdup.s8     q1, r3                     ; duplicate mblim
+    b           vp8_loop_filter_simple_vertical_edge_neon
+    ENDP        ;|vp8_loop_filter_bvs_neon|
    END
--- a/vp8/common/arm/neon/mbloopfilter_neon.asm
+++ b/vp8/common/arm/neon/mbloopfilter_neon.asm
@@ -14,155 +14,143 @@
    EXPORT  |vp8_mbloop_filter_vertical_edge_y_neon|
    EXPORT  |vp8_mbloop_filter_vertical_edge_uv_neon|
    ARM
-    REQUIRE8
-    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2

-; flimit, limit, and thresh should be positive numbers.
-; All 16 elements in these variables are equal.
-
 ; void vp8_mbloop_filter_horizontal_edge_y_neon(unsigned char *src, int pitch,
-;                                               const signed char *flimit,
-;                                               const signed char *limit,
-;                                               const signed char *thresh,
-;                                               int count)
+;                                               const unsigned char *blimit,
+;                                               const unsigned char *limit,
+;                                               const unsigned char *thresh)
 ; r0    unsigned char *src,
 ; r1    int pitch,
-; r2    const signed char *flimit,
-; r3    const signed char *limit,
-; sp    const signed char *thresh,
-; sp+4  int count (unused)
+; r2    unsigned char blimit
+; r3    unsigned char limit
+; sp    unsigned char thresh,
 |vp8_mbloop_filter_horizontal_edge_y_neon| PROC
-    stmdb       sp!, {lr}
-    sub         r0, r0, r1, lsl #2          ; move src pointer down by 4 lines
-    ldr         r12, [sp, #4]               ; load thresh pointer
+    push        {lr}
+    add         r1, r1, r1                  ; double stride
+    ldr         r12, [sp, #4]               ; load thresh
+    sub         r0, r0, r1, lsl #1          ; move src pointer down by 4 lines
+    vdup.u8     q2, r12                     ; thresh
+    add         r12, r0, r1,  lsr #1        ; move src pointer up by 1 line

-    vld1.u8     {q3}, [r0], r1              ; p3
-    vld1.s8     {d2[], d3[]}, [r3]          ; limit
-    vld1.u8     {q4}, [r0], r1              ; p2
-    vld1.s8     {d4[], d5[]}, [r12]         ; thresh
-    vld1.u8     {q5}, [r0], r1              ; p1
-    vld1.u8     {q6}, [r0], r1              ; p0
-    vld1.u8     {q7}, [r0], r1              ; q0
-    vld1.u8     {q8}, [r0], r1              ; q1
-    vld1.u8     {q9}, [r0], r1              ; q2
-    vld1.u8     {q10}, [r0], r1             ; q3
+    vld1.u8     {q3}, [r0@128], r1              ; p3
+    vld1.u8     {q4}, [r12@128], r1             ; p2
+    vld1.u8     {q5}, [r0@128], r1              ; p1
+    vld1.u8     {q6}, [r12@128], r1             ; p0
+    vld1.u8     {q7}, [r0@128], r1              ; q0
+    vld1.u8     {q8}, [r12@128], r1             ; q1
+    vld1.u8     {q9}, [r0@128], r1              ; q2
+    vld1.u8     {q10}, [r12@128], r1            ; q3

    bl          vp8_mbloop_filter_neon

-    sub         r0, r0, r1, lsl #3
-    add         r0, r0, r1
-    add         r2, r0, r1
-    add         r3, r2, r1
+    sub         r12, r12, r1, lsl #2
+    add         r0, r12, r1, lsr #1

-    vst1.u8     {q4}, [r0]                  ; store op2
-    vst1.u8     {q5}, [r2]                  ; store op1
-    vst1.u8     {q6}, [r3], r1              ; store op0
-    add         r12, r3, r1
-    vst1.u8     {q7}, [r3]                  ; store oq0
-    vst1.u8     {q8}, [r12], r1             ; store oq1
-    vst1.u8     {q9}, [r12]             ; store oq2
+    vst1.u8     {q4}, [r12@128],r1         ; store op2
+    vst1.u8     {q5}, [r0@128],r1          ; store op1
+    vst1.u8     {q6}, [r12@128], r1        ; store op0
+    vst1.u8     {q7}, [r0@128],r1          ; store oq0
+    vst1.u8     {q8}, [r12@128]            ; store oq1
+    vst1.u8     {q9}, [r0@128]             ; store oq2

-    ldmia       sp!, {pc}
+    pop         {pc}
    ENDP        ; |vp8_mbloop_filter_horizontal_edge_y_neon|

 ; void vp8_mbloop_filter_horizontal_edge_uv_neon(unsigned char *u, int pitch,
-;                                                const signed char *flimit,
-;                                                const signed char *limit,
-;                                                const signed char *thresh,
+;                                                const unsigned char *blimit,
+;                                                const unsigned char *limit,
+;                                                const unsigned char *thresh,
 ;                                                unsigned char *v)
 ; r0    unsigned char *u,
 ; r1    int pitch,
-; r2    const signed char *flimit,
-; r3    const signed char *limit,
-; sp    const signed char *thresh,
+; r2    unsigned char blimit
+; r3    unsigned char limit
+; sp    unsigned char thresh,
 ; sp+4  unsigned char *v
+
 |vp8_mbloop_filter_horizontal_edge_uv_neon| PROC
-    stmdb       sp!, {lr}
+    push        {lr}
+    ldr         r12, [sp, #4]                 ; load thresh
    sub         r0, r0, r1, lsl #2            ; move u pointer down by 4 lines
-    vld1.s8     {d2[], d3[]}, [r3]          ; limit
-    ldr         r3, [sp, #8]                ; load v ptr
-    ldr         r12, [sp, #4]               ; load thresh pointer
-    sub         r3, r3, r1, lsl #2          ; move v pointer down by 4 lines
+    vdup.u8     q2, r12                       ; thresh
+    ldr         r12, [sp, #8]                 ; load v ptr
+    sub         r12, r12, r1, lsl #2          ; move v pointer down by 4 lines

-    vld1.u8     {d6}, [r0], r1              ; p3
-    vld1.u8     {d7}, [r3], r1              ; p3
-    vld1.u8     {d8}, [r0], r1              ; p2
-    vld1.u8     {d9}, [r3], r1              ; p2
-    vld1.u8     {d10}, [r0], r1             ; p1
-    vld1.u8     {d11}, [r3], r1             ; p1
-    vld1.u8     {d12}, [r0], r1             ; p0
-    vld1.u8     {d13}, [r3], r1             ; p0
-    vld1.u8     {d14}, [r0], r1             ; q0
-    vld1.u8     {d15}, [r3], r1             ; q0
-    vld1.u8     {d16}, [r0], r1             ; q1
-    vld1.u8     {d17}, [r3], r1             ; q1
-    vld1.u8     {d18}, [r0], r1             ; q2
-    vld1.u8     {d19}, [r3], r1             ; q2
-    vld1.u8     {d20}, [r0], r1             ; q3
-    vld1.u8     {d21}, [r3], r1             ; q3
-
-    vld1.s8     {d4[], d5[]}, [r12]         ; thresh
+    vld1.u8     {d6}, [r0@64], r1              ; p3
+    vld1.u8     {d7}, [r12@64], r1              ; p3
+    vld1.u8     {d8}, [r0@64], r1              ; p2
+    vld1.u8     {d9}, [r12@64], r1              ; p2
+    vld1.u8     {d10}, [r0@64], r1             ; p1
+    vld1.u8     {d11}, [r12@64], r1             ; p1
+    vld1.u8     {d12}, [r0@64], r1             ; p0
+    vld1.u8     {d13}, [r12@64], r1             ; p0
+    vld1.u8     {d14}, [r0@64], r1             ; q0
+    vld1.u8     {d15}, [r12@64], r1             ; q0
+    vld1.u8     {d16}, [r0@64], r1             ; q1
+    vld1.u8     {d17}, [r12@64], r1             ; q1
+    vld1.u8     {d18}, [r0@64], r1             ; q2
+    vld1.u8     {d19}, [r12@64], r1             ; q2
+    vld1.u8     {d20}, [r0@64], r1             ; q3
+    vld1.u8     {d21}, [r12@64], r1             ; q3

    bl          vp8_mbloop_filter_neon

    sub         r0, r0, r1, lsl #3
-    sub         r3, r3, r1, lsl #3
+    sub         r12, r12, r1, lsl #3

    add         r0, r0, r1
-    add         r3, r3, r1
+    add         r12, r12, r1

-    vst1.u8     {d8}, [r0], r1              ; store u op2
-    vst1.u8     {d9}, [r3], r1              ; store v op2
-    vst1.u8     {d10}, [r0], r1             ; store u op1
-    vst1.u8     {d11}, [r3], r1             ; store v op1
-    vst1.u8     {d12}, [r0], r1             ; store u op0
-    vst1.u8     {d13}, [r3], r1             ; store v op0
-    vst1.u8     {d14}, [r0], r1             ; store u oq0
-    vst1.u8     {d15}, [r3], r1             ; store v oq0
-    vst1.u8     {d16}, [r0], r1             ; store u oq1
-    vst1.u8     {d17}, [r3], r1             ; store v oq1
-    vst1.u8     {d18}, [r0], r1             ; store u oq2
-    vst1.u8     {d19}, [r3], r1             ; store v oq2
+    vst1.u8     {d8}, [r0@64], r1              ; store u op2
+    vst1.u8     {d9}, [r12@64], r1              ; store v op2
+    vst1.u8     {d10}, [r0@64], r1             ; store u op1
+    vst1.u8     {d11}, [r12@64], r1             ; store v op1
+    vst1.u8     {d12}, [r0@64], r1             ; store u op0
+    vst1.u8     {d13}, [r12@64], r1             ; store v op0
+    vst1.u8     {d14}, [r0@64], r1             ; store u oq0
+    vst1.u8     {d15}, [r12@64], r1             ; store v oq0
+    vst1.u8     {d16}, [r0@64], r1             ; store u oq1
+    vst1.u8     {d17}, [r12@64], r1             ; store v oq1
+    vst1.u8     {d18}, [r0@64], r1             ; store u oq2
+    vst1.u8     {d19}, [r12@64], r1             ; store v oq2

-    ldmia       sp!, {pc}
+    pop         {pc}
    ENDP        ; |vp8_mbloop_filter_horizontal_edge_uv_neon|

 ; void vp8_mbloop_filter_vertical_edge_y_neon(unsigned char *src, int pitch,
-;                                             const signed char *flimit,
-;                                             const signed char *limit,
-;                                             const signed char *thresh,
-;                                             int count)
+;                                             const unsigned char *blimit,
+;                                             const unsigned char *limit,
+;                                             const unsigned char *thresh)
 ; r0    unsigned char *src,
 ; r1    int pitch,
-; r2    const signed char *flimit,
-; r3    const signed char *limit,
-; sp    const signed char *thresh,
-; sp+4  int count (unused)
+; r2    unsigned char blimit
+; r3    unsigned char limit
+; sp    unsigned char thresh,
 |vp8_mbloop_filter_vertical_edge_y_neon| PROC
-    stmdb       sp!, {lr}
+    push        {lr}
+    ldr         r12, [sp, #4]               ; load thresh
    sub         r0, r0, #4                  ; move src pointer down by 4 columns
+    vdup.s8     q2, r12                     ; thresh
+    add         r12, r0, r1, lsl #3         ; move src pointer down by 8 lines

    vld1.u8     {d6}, [r0], r1              ; load first 8-line src data
-    ldr         r12, [sp, #4]               ; load thresh pointer
+    vld1.u8     {d7}, [r12], r1             ; load second 8-line src data
    vld1.u8     {d8}, [r0], r1
-    sub         sp, sp, #32
+    vld1.u8     {d9}, [r12], r1
    vld1.u8     {d10}, [r0], r1
+    vld1.u8     {d11}, [r12], r1
    vld1.u8     {d12}, [r0], r1
+    vld1.u8     {d13}, [r12], r1
    vld1.u8     {d14}, [r0], r1
+    vld1.u8     {d15}, [r12], r1
    vld1.u8     {d16}, [r0], r1
+    vld1.u8     {d17}, [r12], r1
    vld1.u8     {d18}, [r0], r1
+    vld1.u8     {d19}, [r12], r1
    vld1.u8     {d20}, [r0], r1
-
-    vld1.u8     {d7}, [r0], r1              ; load second 8-line src data
-    vld1.u8     {d9}, [r0], r1
-    vld1.u8     {d11}, [r0], r1
-    vld1.u8     {d13}, [r0], r1
-    vld1.u8     {d15}, [r0], r1
-    vld1.u8     {d17}, [r0], r1
-    vld1.u8     {d19}, [r0], r1
-    vld1.u8     {d21}, [r0], r1
+    vld1.u8     {d21}, [r12], r1

    ;transpose to 8x16 matrix
    vtrn.32     q3, q7
@@ -180,133 +168,11 @@
    vtrn.8      q7, q8
    vtrn.8      q9, q10

-    vld1.s8     {d4[], d5[]}, [r12]         ; thresh
-    vld1.s8     {d2[], d3[]}, [r3]          ; limit
-    mov         r12, sp
-    vst1.u8     {q3}, [r12]!
-    vst1.u8     {q10}, [r12]!
-
-    bl          vp8_mbloop_filter_neon
-
-    sub         r0, r0, r1, lsl #4
-
-    add         r2, r0, r1
-
-    add         r3, r2, r1
-
-    vld1.u8     {q3}, [sp]!
-    vld1.u8     {q10}, [sp]!
-
-    ;transpose to 16x8 matrix
-    vtrn.32     q3, q7
-    vtrn.32     q4, q8
-    vtrn.32     q5, q9
-    vtrn.32     q6, q10
-    add         r12, r3, r1
-
-    vtrn.16     q3, q5
-    vtrn.16     q4, q6
-    vtrn.16     q7, q9
-    vtrn.16     q8, q10
-
-    vtrn.8      q3, q4
-    vtrn.8      q5, q6
-    vtrn.8      q7, q8
-    vtrn.8      q9, q10
-
-    ;store op2, op1, op0, oq0, oq1, oq2
-    vst1.8      {d6}, [r0]
-    vst1.8      {d8}, [r2]
-    vst1.8      {d10}, [r3]
-    vst1.8      {d12}, [r12], r1
-    add         r0, r12, r1
-    vst1.8      {d14}, [r12]
-    vst1.8      {d16}, [r0], r1
-    add         r2, r0, r1
-    vst1.8      {d18}, [r0]
-    vst1.8      {d20}, [r2], r1
-    add         r3, r2, r1
-    vst1.8      {d7}, [r2]
-    vst1.8      {d9}, [r3], r1
-    add         r12, r3, r1
-    vst1.8      {d11}, [r3]
-    vst1.8      {d13}, [r12], r1
-    add         r0, r12, r1
-    vst1.8      {d15}, [r12]
-    vst1.8      {d17}, [r0], r1
-    add         r2, r0, r1
-    vst1.8      {d19}, [r0]
-    vst1.8      {d21}, [r2]
-
-    ldmia       sp!, {pc}
-    ENDP        ; |vp8_mbloop_filter_vertical_edge_y_neon|
-
-; void vp8_mbloop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch,
-;                                              const signed char *flimit,
-;                                              const signed char *limit,
-;                                              const signed char *thresh,
-;                                              unsigned char *v)
-; r0    unsigned char *u,
-; r1    int pitch,
-; r2    const signed char *flimit,
-; r3    const signed char *limit,
-; sp    const signed char *thresh,
-; sp+4  unsigned char *v
-|vp8_mbloop_filter_vertical_edge_uv_neon| PROC
-    stmdb       sp!, {lr}
-    sub         r0, r0, #4                  ; move src pointer down by 4 columns
-    vld1.s8     {d2[], d3[]}, [r3]          ; limit
-    ldr         r3, [sp, #8]                ; load v ptr
-    ldr         r12, [sp, #4]               ; load thresh pointer
-
-    sub         r3, r3, #4                  ; move v pointer down by 4 columns
-
-    vld1.u8     {d6}, [r0], r1              ;load u data
-    vld1.u8     {d7}, [r3], r1              ;load v data
-    vld1.u8     {d8}, [r0], r1
-    vld1.u8     {d9}, [r3], r1
-    vld1.u8     {d10}, [r0], r1
-    vld1.u8     {d11}, [r3], r1
-    vld1.u8     {d12}, [r0], r1
-    vld1.u8     {d13}, [r3], r1
-    vld1.u8     {d14}, [r0], r1
-    vld1.u8     {d15}, [r3], r1
-    vld1.u8     {d16}, [r0], r1
-    vld1.u8     {d17}, [r3], r1
-    vld1.u8     {d18}, [r0], r1
-    vld1.u8     {d19}, [r3], r1
-    vld1.u8     {d20}, [r0], r1
-    vld1.u8     {d21}, [r3], r1
-
-    ;transpose to 8x16 matrix
-    vtrn.32     q3, q7
-    vtrn.32     q4, q8
-    vtrn.32     q5, q9
-    vtrn.32     q6, q10
-
-    vtrn.16     q3, q5
-    vtrn.16     q4, q6
-    vtrn.16     q7, q9
-    vtrn.16     q8, q10
-
-    vtrn.8      q3, q4
-    vtrn.8      q5, q6
-    vtrn.8      q7, q8
-    vtrn.8      q9, q10
-
-    sub         sp, sp, #32
-    vld1.s8     {d4[], d5[]}, [r12]         ; thresh
-    mov         r12, sp
-    vst1.u8     {q3}, [r12]!
-    vst1.u8     {q10}, [r12]!
-
-    bl          vp8_mbloop_filter_neon
-
    sub         r0, r0, r1, lsl #3
-    sub         r3, r3, r1, lsl #3

-    vld1.u8     {q3}, [sp]!
-    vld1.u8     {q10}, [sp]!
+    bl          vp8_mbloop_filter_neon
+
+    sub         r12, r12, r1, lsl #3

    ;transpose to 16x8 matrix
    vtrn.32     q3, q7
@@ -326,23 +192,118 @@

    ;store op2, op1, op0, oq0, oq1, oq2
    vst1.8      {d6}, [r0], r1
-    vst1.8      {d7}, [r3], r1
+    vst1.8      {d7}, [r12], r1
    vst1.8      {d8}, [r0], r1
-    vst1.8      {d9}, [r3], r1
+    vst1.8      {d9}, [r12], r1
    vst1.8      {d10}, [r0], r1
-    vst1.8      {d11}, [r3], r1
+    vst1.8      {d11}, [r12], r1
    vst1.8      {d12}, [r0], r1
-    vst1.8      {d13}, [r3], r1
+    vst1.8      {d13}, [r12], r1
    vst1.8      {d14}, [r0], r1
-    vst1.8      {d15}, [r3], r1
+    vst1.8      {d15}, [r12], r1
    vst1.8      {d16}, [r0], r1
-    vst1.8      {d17}, [r3], r1
+    vst1.8      {d17}, [r12], r1
    vst1.8      {d18}, [r0], r1
-    vst1.8      {d19}, [r3], r1
-    vst1.8      {d20}, [r0], r1
-    vst1.8      {d21}, [r3], r1
+    vst1.8      {d19}, [r12], r1
+    vst1.8      {d20}, [r0]
+    vst1.8      {d21}, [r12]

-    ldmia       sp!, {pc}
+    pop         {pc}
+    ENDP        ; |vp8_mbloop_filter_vertical_edge_y_neon|
+
+; void vp8_mbloop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch,
+;                                              const unsigned char *blimit,
+;                                              const unsigned char *limit,
+;                                              const unsigned char *thresh,
+;                                              unsigned char *v)
+; r0    unsigned char *u,
+; r1    int pitch,
+; r2    const signed char *flimit,
+; r3    const signed char *limit,
+; sp    const signed char *thresh,
+; sp+4  unsigned char *v
+|vp8_mbloop_filter_vertical_edge_uv_neon| PROC
+    push        {lr}
+    ldr         r12, [sp, #4]               ; load thresh
+    sub         r0, r0, #4                  ; move u pointer down by 4 columns
+    vdup.u8     q2, r12                     ; thresh
+    ldr         r12, [sp, #8]               ; load v ptr
+    sub         r12, r12, #4                ; move v pointer down by 4 columns
+
+    vld1.u8     {d6}, [r0], r1              ;load u data
+    vld1.u8     {d7}, [r12], r1             ;load v data
+    vld1.u8     {d8}, [r0], r1
+    vld1.u8     {d9}, [r12], r1
+    vld1.u8     {d10}, [r0], r1
+    vld1.u8     {d11}, [r12], r1
+    vld1.u8     {d12}, [r0], r1
+    vld1.u8     {d13}, [r12], r1
+    vld1.u8     {d14}, [r0], r1
+    vld1.u8     {d15}, [r12], r1
+    vld1.u8     {d16}, [r0], r1
+    vld1.u8     {d17}, [r12], r1
+    vld1.u8     {d18}, [r0], r1
+    vld1.u8     {d19}, [r12], r1
+    vld1.u8     {d20}, [r0], r1
+    vld1.u8     {d21}, [r12], r1
+
+    ;transpose to 8x16 matrix
+    vtrn.32     q3, q7
+    vtrn.32     q4, q8
+    vtrn.32     q5, q9
+    vtrn.32     q6, q10
+
+    vtrn.16     q3, q5
+    vtrn.16     q4, q6
+    vtrn.16     q7, q9
+    vtrn.16     q8, q10
+
+    vtrn.8      q3, q4
+    vtrn.8      q5, q6
+    vtrn.8      q7, q8
+    vtrn.8      q9, q10
+
+    sub         r0, r0, r1, lsl #3
+
+    bl          vp8_mbloop_filter_neon
+
+    sub         r12, r12, r1, lsl #3
+
+    ;transpose to 16x8 matrix
+    vtrn.32     q3, q7
+    vtrn.32     q4, q8
+    vtrn.32     q5, q9
+    vtrn.32     q6, q10
+
+    vtrn.16     q3, q5
+    vtrn.16     q4, q6
+    vtrn.16     q7, q9
+    vtrn.16     q8, q10
+
+    vtrn.8      q3, q4
+    vtrn.8      q5, q6
+    vtrn.8      q7, q8
+    vtrn.8      q9, q10
+
+    ;store op2, op1, op0, oq0, oq1, oq2
+    vst1.8      {d6}, [r0], r1
+    vst1.8      {d7}, [r12], r1
+    vst1.8      {d8}, [r0], r1
+    vst1.8      {d9}, [r12], r1
+    vst1.8      {d10}, [r0], r1
+    vst1.8      {d11}, [r12], r1
+    vst1.8      {d12}, [r0], r1
+    vst1.8      {d13}, [r12], r1
+    vst1.8      {d14}, [r0], r1
+    vst1.8      {d15}, [r12], r1
+    vst1.8      {d16}, [r0], r1
+    vst1.8      {d17}, [r12], r1
+    vst1.8      {d18}, [r0], r1
+    vst1.8      {d19}, [r12], r1
+    vst1.8      {d20}, [r0]
+    vst1.8      {d21}, [r12]
+
+    pop         {pc}
    ENDP        ; |vp8_mbloop_filter_vertical_edge_uv_neon|

 ; void vp8_mbloop_filter_neon()
@@ -350,41 +311,33 @@
 ; functions do the necessary load, transpose (if necessary), preserve (if
 ; necessary) and store.

-; TODO:
-; The vertical filter writes p3/q3 back out because two 4 element writes are
-; much simpler than ordering and writing two 3 element sets (or three 2 elements
-; sets, or whichever other combinations are possible).
-; If we can preserve q3 and q10, the vertical filter will be able to avoid
-; storing those values on the stack and reading them back after the filter.
-
 ; r0,r1 PRESERVE
-; r2    flimit
-; r3    PRESERVE
-; q1    limit
+; r2    mblimit
+; r3    limit
+
 ; q2    thresh
-; q3    p3
+; q3    p3 PRESERVE
 ; q4    p2
 ; q5    p1
 ; q6    p0
 ; q7    q0
 ; q8    q1
 ; q9    q2
-; q10   q3
+; q10   q3 PRESERVE

 |vp8_mbloop_filter_neon| PROC
-    ldr         r12, _mblf_coeff_

    ; vp8_filter_mask
    vabd.u8     q11, q3, q4                 ; abs(p3 - p2)
    vabd.u8     q12, q4, q5                 ; abs(p2 - p1)
    vabd.u8     q13, q5, q6                 ; abs(p1 - p0)
    vabd.u8     q14, q8, q7                 ; abs(q1 - q0)
-    vabd.u8     q3, q9, q8                  ; abs(q2 - q1)
+    vabd.u8     q1, q9, q8                  ; abs(q2 - q1)
    vabd.u8     q0, q10, q9                 ; abs(q3 - q2)

    vmax.u8     q11, q11, q12
    vmax.u8     q12, q13, q14
-    vmax.u8     q3, q3, q0
+    vmax.u8     q1, q1, q0
    vmax.u8     q15, q11, q12

    vabd.u8     q12, q6, q7                 ; abs(p0 - q0)
@@ -392,51 +345,53 @@
    ; vp8_hevmask
    vcgt.u8     q13, q13, q2                ; (abs(p1 - p0) > thresh) * -1
    vcgt.u8     q14, q14, q2                ; (abs(q1 - q0) > thresh) * -1
-    vmax.u8     q15, q15, q3
+    vmax.u8     q15, q15, q1

-    vld1.s8     {d4[], d5[]}, [r2]          ; flimit
+    vdup.u8     q1, r3                      ; limit
+    vdup.u8     q2, r2                      ; mblimit

-    vld1.u8     {q0}, [r12]!
+    vmov.u8     q0, #0x80                   ; 0x80

-    vadd.u8     q2, q2, q2                  ; flimit * 2
-    vadd.u8     q2, q2, q1                  ; flimit * 2 +  limit
    vcge.u8     q15, q1, q15

    vabd.u8     q1, q5, q8                  ; a = abs(p1 - q1)
    vqadd.u8    q12, q12, q12               ; b = abs(p0 - q0) * 2
-    vshr.u8     q1, q1, #1                  ; a = a / 2
-    vqadd.u8    q12, q12, q1                ; a = b + a
-    vcge.u8     q12, q2, q12                ; (a > flimit * 2 + limit) * -1
+    vmov.u16    q11, #3                     ; #3

    ; vp8_filter
    ; convert to signed
    veor        q7, q7, q0                  ; qs0
+    vshr.u8     q1, q1, #1                  ; a = a / 2
    veor        q6, q6, q0                  ; ps0
    veor        q5, q5, q0                  ; ps1
+
+    vqadd.u8    q12, q12, q1                ; a = b + a
+
    veor        q8, q8, q0                  ; qs1
    veor        q4, q4, q0                  ; ps2
    veor        q9, q9, q0                  ; qs2

    vorr        q14, q13, q14               ; vp8_hevmask

+    vcge.u8     q12, q2, q12                ; (a > flimit * 2 + limit) * -1
+
    vsubl.s8    q2, d14, d12                ; qs0 - ps0
    vsubl.s8    q13, d15, d13

    vqsub.s8    q1, q5, q8                  ; vp8_filter = clamp(ps1-qs1)

-    vadd.s16    q10, q2, q2                 ; 3 * (qs0 - ps0)
-    vadd.s16    q11, q13, q13
+    vmul.i16    q2, q2, q11                 ; 3 * ( qs0 - ps0)
+
    vand        q15, q15, q12               ; vp8_filter_mask

-    vadd.s16    q2, q2, q10
-    vadd.s16    q13, q13, q11
+    vmul.i16    q13, q13, q11

-    vld1.u8     {q12}, [r12]!               ; #3
+    vmov.u8     q12, #3                     ; #3

    vaddw.s8    q2, q2, d2                  ; vp8_filter + 3 * ( qs0 - ps0)
    vaddw.s8    q13, q13, d3

-    vld1.u8     {q11}, [r12]!               ; #4
+    vmov.u8     q11, #4                     ; #4

    ; vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
    vqmovn.s16  d2, q2
@@ -444,27 +399,23 @@

    vand        q1, q1, q15                 ; vp8_filter &= mask

-    vld1.u8     {q15}, [r12]!               ; #63
-    ;
-    vand        q13, q1, q14                ; Filter2 &= hev
+    vmov.u16    q15, #63                    ; #63

-    vld1.u8     {d7}, [r12]!                ; #9
+    vand        q13, q1, q14                ; Filter2 &= hev

    vqadd.s8    q2, q13, q11                ; Filter1 = clamp(Filter2+4)
    vqadd.s8    q13, q13, q12               ; Filter2 = clamp(Filter2+3)

-    vld1.u8     {d6}, [r12]!                ; #18
+    vmov        q0, q15

    vshr.s8     q2, q2, #3                  ; Filter1 >>= 3
    vshr.s8     q13, q13, #3                ; Filter2 >>= 3

-    vmov        q10, q15
+    vmov        q11, q15
    vmov        q12, q15

    vqsub.s8    q7, q7, q2                  ; qs0 = clamp(qs0 - Filter1)

-    vld1.u8     {d5}, [r12]!                ; #27
-
    vqadd.s8    q6, q6, q13                 ; ps0 = clamp(ps0 + Filter2)

    vbic        q1, q1, q14                 ; vp8_filter &= ~hev
@@ -472,48 +423,47 @@
    ; roughly 1/7th difference across boundary
    ; roughly 2/7th difference across boundary
    ; roughly 3/7th difference across boundary
-    vmov        q11, q15
+
+    vmov.u8     d5, #9                      ; #9
+    vmov.u8     d4, #18                     ; #18
+
    vmov        q13, q15
    vmov        q14, q15

-    vmlal.s8    q10, d2, d7                 ; Filter2 * 9
-    vmlal.s8    q11, d3, d7
-    vmlal.s8    q12, d2, d6                 ; Filter2 * 18
-    vmlal.s8    q13, d3, d6
-    vmlal.s8    q14, d2, d5                 ; Filter2 * 27
+    vmlal.s8    q0, d2, d5                  ; 63 + Filter2 * 9
+    vmlal.s8    q11, d3, d5
+    vmov.u8     d5, #27                     ; #27
+    vmlal.s8    q12, d2, d4                 ; 63 + Filter2 * 18
+    vmlal.s8    q13, d3, d4
+    vmlal.s8    q14, d2, d5                 ; 63 + Filter2 * 27
    vmlal.s8    q15, d3, d5
-    vqshrn.s16  d20, q10, #7                ; u = clamp((63 + Filter2 * 9)>>7)
-    vqshrn.s16  d21, q11, #7
+
+    vqshrn.s16  d0, q0, #7                  ; u = clamp((63 + Filter2 * 9)>>7)
+    vqshrn.s16  d1, q11, #7
    vqshrn.s16  d24, q12, #7                ; u = clamp((63 + Filter2 * 18)>>7)
    vqshrn.s16  d25, q13, #7
    vqshrn.s16  d28, q14, #7                ; u = clamp((63 + Filter2 * 27)>>7)
    vqshrn.s16  d29, q15, #7

-    vqsub.s8    q11, q9, q10                ; s = clamp(qs2 - u)
-    vqadd.s8    q10, q4, q10                ; s = clamp(ps2 + u)
+    vmov.u8     q1, #0x80                   ; 0x80
+
+    vqsub.s8    q11, q9, q0                 ; s = clamp(qs2 - u)
+    vqadd.s8    q0, q4, q0                  ; s = clamp(ps2 + u)
    vqsub.s8    q13, q8, q12                ; s = clamp(qs1 - u)
    vqadd.s8    q12, q5, q12                ; s = clamp(ps1 + u)
    vqsub.s8    q15, q7, q14                ; s = clamp(qs0 - u)
    vqadd.s8    q14, q6, q14                ; s = clamp(ps0 + u)
-    veor        q9, q11, q0                 ; *oq2 = s^0x80
-    veor        q4, q10, q0                 ; *op2 = s^0x80
-    veor        q8, q13, q0                 ; *oq1 = s^0x80
-    veor        q5, q12, q0                 ; *op2 = s^0x80
-    veor        q7, q15, q0                 ; *oq0 = s^0x80
-    veor        q6, q14, q0                 ; *op0 = s^0x80
+
+    veor        q9, q11, q1                 ; *oq2 = s^0x80
+    veor        q4, q0, q1                  ; *op2 = s^0x80
+    veor        q8, q13, q1                 ; *oq1 = s^0x80
+    veor        q5, q12, q1                 ; *op2 = s^0x80
+    veor        q7, q15, q1                 ; *oq0 = s^0x80
+    veor        q6, q14, q1                 ; *op0 = s^0x80

    bx          lr
    ENDP        ; |vp8_mbloop_filter_neon|

-    AREA    mbloopfilter_dat, DATA, READONLY
-_mblf_coeff_
-    DCD     mblf_coeff
-mblf_coeff
-    DCD     0x80808080, 0x80808080, 0x80808080, 0x80808080
-    DCD     0x03030303, 0x03030303, 0x03030303, 0x03030303
-    DCD     0x04040404, 0x04040404, 0x04040404, 0x04040404
-    DCD     0x003f003f, 0x003f003f, 0x003f003f, 0x003f003f
-    DCD     0x09090909, 0x09090909, 0x12121212, 0x12121212
-    DCD     0x1b1b1b1b, 0x1b1b1b1b
+;-----------------

    END
--- a/vp8/common/arm/neon/recon_neon.c
+++ b/vp8/common/arm/neon/recon_neon.c
@@ -10,8 +10,8 @@


 #include "vpx_ports/config.h"
-#include "recon.h"
-#include "blockd.h"
+#include "vp8/common/recon.h"
+#include "vp8/common/blockd.h"

 extern void vp8_recon16x16mb_neon(unsigned char *pred_ptr, short *diff_ptr, unsigned char *dst_ptr, int ystride, unsigned char *udst_ptr, unsigned char *vdst_ptr);

--- a/vp8/common/arm/neon/shortidct4x4llm_neon.asm
+++ b/vp8/common/arm/neon/shortidct4x4llm_neon.asm
@@ -31,7 +31,7 @@
 ;result of the multiplication that is needed in IDCT.

 |vp8_short_idct4x4llm_neon| PROC
-    ldr             r12, _idct_coeff_
+    adr             r12, idct_coeff
    vld1.16         {q1, q2}, [r0]
    vld1.16         {d0}, [r12]

@@ -113,12 +113,7 @@
    ENDP

 ;-----------------
-    AREA    idct4x4_dat, DATA, READWRITE            ;read/write by default
-;Data section with name data_area is specified. DCD reserves space in memory for 48 data.
-;One word each is reserved. Label filter_coeff can be used to access the data.
-;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
-_idct_coeff_
-    DCD     idct_coeff
+
 idct_coeff
    DCD     0x4e7b4e7b, 0x8a8c8a8c

--- a/vp8/common/arm/neon/sixtappredict16x16_neon.asm
+++ b/vp8/common/arm/neon/sixtappredict16x16_neon.asm
@@ -15,6 +15,17 @@
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+filter16_coeff
+    DCD     0,  0,  128,    0,   0,  0,   0,  0
+    DCD     0, -6,  123,   12,  -1,  0,   0,  0
+    DCD     2, -11, 108,   36,  -8,  1,   0,  0
+    DCD     0, -9,   93,   50,  -6,  0,   0,  0
+    DCD     3, -16,  77,   77, -16,  3,   0,  0
+    DCD     0, -6,   50,   93,  -9,  0,   0,  0
+    DCD     1, -8,   36,  108, -11,  2,   0,  0
+    DCD     0, -1,   12,  123,  -6,   0,  0,  0
+
 ; r0    unsigned char  *src_ptr,
 ; r1    int  src_pixels_per_line,
 ; r2    int  xoffset,
@@ -33,7 +44,7 @@
 |vp8_sixtap_predict16x16_neon| PROC
    push            {r4-r5, lr}

-    ldr             r12, _filter16_coeff_
+    adr             r12, filter16_coeff
    ldr             r4, [sp, #12]           ;load parameters from stack
    ldr             r5, [sp, #16]           ;load parameters from stack

@@ -476,20 +487,4 @@ secondpass_only_inner_loop_neon
    ENDP

 ;-----------------
-    AREA    subpelfilters16_dat, DATA, READWRITE            ;read/write by default
-;Data section with name data_area is specified. DCD reserves space in memory for 48 data.
-;One word each is reserved. Label filter_coeff can be used to access the data.
-;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
-_filter16_coeff_
-    DCD     filter16_coeff
-filter16_coeff
-    DCD     0,  0,  128,    0,   0,  0,   0,  0
-    DCD     0, -6,  123,   12,  -1,  0,   0,  0
-    DCD     2, -11, 108,   36,  -8,  1,   0,  0
-    DCD     0, -9,   93,   50,  -6,  0,   0,  0
-    DCD     3, -16,  77,   77, -16,  3,   0,  0
-    DCD     0, -6,   50,   93,  -9,  0,   0,  0
-    DCD     1, -8,   36,  108, -11,  2,   0,  0
-    DCD     0, -1,   12,  123,  -6,   0,  0,  0
-
    END
--- a/vp8/common/arm/neon/sixtappredict4x4_neon.asm
+++ b/vp8/common/arm/neon/sixtappredict4x4_neon.asm
@@ -15,6 +15,17 @@
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+filter4_coeff
+    DCD     0,  0,  128,    0,   0,  0,   0,  0
+    DCD     0, -6,  123,   12,  -1,  0,   0,  0
+    DCD     2, -11, 108,   36,  -8,  1,   0,  0
+    DCD     0, -9,   93,   50,  -6,  0,   0,  0
+    DCD     3, -16,  77,   77, -16,  3,   0,  0
+    DCD     0, -6,   50,   93,  -9,  0,   0,  0
+    DCD     1, -8,   36,  108, -11,  2,   0,  0
+    DCD     0, -1,   12,  123,  -6,   0,  0,  0
+
 ; r0    unsigned char  *src_ptr,
 ; r1    int  src_pixels_per_line,
 ; r2    int  xoffset,
@@ -25,7 +36,7 @@
 |vp8_sixtap_predict_neon| PROC
    push            {r4, lr}

-    ldr             r12, _filter4_coeff_
+    adr             r12, filter4_coeff
    ldr             r4, [sp, #8]            ;load parameters from stack
    ldr             lr, [sp, #12]           ;load parameters from stack

@@ -407,20 +418,5 @@ secondpass_filter4x4_only
    ENDP

 ;-----------------
-    AREA    subpelfilters4_dat, DATA, READWRITE         ;read/write by default
-;Data section with name data_area is specified. DCD reserves space in memory for 48 data.
-;One word each is reserved. Label filter_coeff can be used to access the data.
-;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
-_filter4_coeff_
-    DCD     filter4_coeff
-filter4_coeff
-    DCD     0,  0,  128,    0,   0,  0,   0,  0
-    DCD     0, -6,  123,   12,  -1,  0,   0,  0
-    DCD     2, -11, 108,   36,  -8,  1,   0,  0
-    DCD     0, -9,   93,   50,  -6,  0,   0,  0
-    DCD     3, -16,  77,   77, -16,  3,   0,  0
-    DCD     0, -6,   50,   93,  -9,  0,   0,  0
-    DCD     1, -8,   36,  108, -11,  2,   0,  0
-    DCD     0, -1,   12,  123,  -6,   0,  0,  0

    END
--- a/vp8/common/arm/neon/sixtappredict8x4_neon.asm
+++ b/vp8/common/arm/neon/sixtappredict8x4_neon.asm
@@ -15,6 +15,17 @@
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+filter8_coeff
+    DCD     0,  0,  128,    0,   0,  0,   0,  0
+    DCD     0, -6,  123,   12,  -1,  0,   0,  0
+    DCD     2, -11, 108,   36,  -8,  1,   0,  0
+    DCD     0, -9,   93,   50,  -6,  0,   0,  0
+    DCD     3, -16,  77,   77, -16,  3,   0,  0
+    DCD     0, -6,   50,   93,  -9,  0,   0,  0
+    DCD     1, -8,   36,  108, -11,  2,   0,  0
+    DCD     0, -1,   12,  123,  -6,   0,  0,  0
+
 ; r0    unsigned char  *src_ptr,
 ; r1    int  src_pixels_per_line,
 ; r2    int  xoffset,
@@ -25,7 +36,7 @@
 |vp8_sixtap_predict8x4_neon| PROC
    push            {r4-r5, lr}

-    ldr             r12, _filter8_coeff_
+    adr             r12, filter8_coeff
    ldr             r4, [sp, #12]           ;load parameters from stack
    ldr             r5, [sp, #16]           ;load parameters from stack

@@ -458,20 +469,5 @@ secondpass_filter8x4_only
    ENDP

 ;-----------------
-    AREA    subpelfilters8_dat, DATA, READWRITE         ;read/write by default
-;Data section with name data_area is specified. DCD reserves space in memory for 48 data.
-;One word each is reserved. Label filter_coeff can be used to access the data.
-;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
-_filter8_coeff_
-    DCD     filter8_coeff
-filter8_coeff
-    DCD     0,  0,  128,    0,   0,  0,   0,  0
-    DCD     0, -6,  123,   12,  -1,  0,   0,  0
-    DCD     2, -11, 108,   36,  -8,  1,   0,  0
-    DCD     0, -9,   93,   50,  -6,  0,   0,  0
-    DCD     3, -16,  77,   77, -16,  3,   0,  0
-    DCD     0, -6,   50,   93,  -9,  0,   0,  0
-    DCD     1, -8,   36,  108, -11,  2,   0,  0
-    DCD     0, -1,   12,  123,  -6,   0,  0,  0

    END
--- a/vp8/common/arm/neon/sixtappredict8x8_neon.asm
+++ b/vp8/common/arm/neon/sixtappredict8x8_neon.asm
@@ -15,6 +15,17 @@
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+filter8_coeff
+    DCD     0,  0,  128,    0,   0,  0,   0,  0
+    DCD     0, -6,  123,   12,  -1,  0,   0,  0
+    DCD     2, -11, 108,   36,  -8,  1,   0,  0
+    DCD     0, -9,   93,   50,  -6,  0,   0,  0
+    DCD     3, -16,  77,   77, -16,  3,   0,  0
+    DCD     0, -6,   50,   93,  -9,  0,   0,  0
+    DCD     1, -8,   36,  108, -11,  2,   0,  0
+    DCD     0, -1,   12,  123,  -6,   0,  0,  0
+
 ; r0    unsigned char  *src_ptr,
 ; r1    int  src_pixels_per_line,
 ; r2    int  xoffset,
@@ -25,7 +36,7 @@
 |vp8_sixtap_predict8x8_neon| PROC
    push            {r4-r5, lr}

-    ldr             r12, _filter8_coeff_
+    adr             r12, filter8_coeff

    ldr             r4, [sp, #12]           ;load parameters from stack
    ldr             r5, [sp, #16]           ;load parameters from stack
@@ -509,20 +520,5 @@ filt_blk2d_spo8x8_loop_neon
    ENDP

 ;-----------------
-    AREA    subpelfilters8_dat, DATA, READWRITE         ;read/write by default
-;Data section with name data_area is specified. DCD reserves space in memory for 48 data.
-;One word each is reserved. Label filter_coeff can be used to access the data.
-;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
-_filter8_coeff_
-    DCD     filter8_coeff
-filter8_coeff
-    DCD     0,  0,  128,    0,   0,  0,   0,  0
-    DCD     0, -6,  123,   12,  -1,  0,   0,  0
-    DCD     2, -11, 108,   36,  -8,  1,   0,  0
-    DCD     0, -9,   93,   50,  -6,  0,   0,  0
-    DCD     3, -16,  77,   77, -16,  3,   0,  0
-    DCD     0, -6,   50,   93,  -9,  0,   0,  0
-    DCD     1, -8,   36,  108, -11,  2,   0,  0
-    DCD     0, -1,   12,  123,  -6,   0,  0,  0

    END
--- a/vp8/common/arm/recon_arm.h
+++ b/vp8/common/arm/recon_arm.h
@@ -53,6 +53,9 @@ extern prototype_copy_block(vp8_copy_mem16x16_neon);

 extern prototype_recon_macroblock(vp8_recon_mb_neon);

+extern prototype_build_intra_predictors(vp8_build_intra_predictors_mby_neon);
+extern prototype_build_intra_predictors(vp8_build_intra_predictors_mby_s_neon);
+
 #if !CONFIG_RUNTIME_CPU_DETECT
 #undef  vp8_recon_recon
 #define vp8_recon_recon vp8_recon_b_neon
@@ -74,6 +77,13 @@ extern prototype_recon_macroblock(vp8_recon_mb_neon);

 #undef  vp8_recon_recon_mb
 #define vp8_recon_recon_mb vp8_recon_mb_neon
+
+#undef  vp8_recon_build_intra_predictors_mby
+#define vp8_recon_build_intra_predictors_mby vp8_build_intra_predictors_mby_neon
+
+#undef  vp8_recon_build_intra_predictors_mby_s
+#define vp8_recon_build_intra_predictors_mby_s vp8_build_intra_predictors_mby_s_neon
+
 #endif
 #endif

--- a/vp8/common/arm/reconintra_arm.c
+++ b/vp8/common/arm/reconintra_arm.c
@@ -10,10 +10,10 @@


 #include "vpx_ports/config.h"
-#include "blockd.h"
-#include "reconintra.h"
+#include "vp8/common/blockd.h"
+#include "vp8/common/reconintra.h"
 #include "vpx_mem/vpx_mem.h"
-#include "recon.h"
+#include "vp8/common/recon.h"

 #if HAVE_ARMV7
 extern void vp8_build_intra_predictors_mby_neon_func(
--- a/vp8/common/arm/vpx_asm_offsets.c
+++ b/vp8/common/arm/vpx_asm_offsets.c
@@ -1,87 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_ports/config.h"
-#include <stddef.h>
-
-#if CONFIG_VP8_ENCODER
-#include "vpx_scale/yv12config.h"
-#endif
-
-#if CONFIG_VP8_DECODER
-#include "onyxd_int.h"
-#endif
-
-#define DEFINE(sym, val) int sym = val;
-
-/*
-#define BLANK() asm volatile("\n->" : : )
-*/
-
-/*
- * int main(void)
- * {
- */
-
-#if CONFIG_VP8_DECODER || CONFIG_VP8_ENCODER
-DEFINE(yv12_buffer_config_y_width,              offsetof(YV12_BUFFER_CONFIG, y_width));
-DEFINE(yv12_buffer_config_y_height,             offsetof(YV12_BUFFER_CONFIG, y_height));
-DEFINE(yv12_buffer_config_y_stride,             offsetof(YV12_BUFFER_CONFIG, y_stride));
-DEFINE(yv12_buffer_config_uv_width,             offsetof(YV12_BUFFER_CONFIG, uv_width));
-DEFINE(yv12_buffer_config_uv_height,            offsetof(YV12_BUFFER_CONFIG, uv_height));
-DEFINE(yv12_buffer_config_uv_stride,            offsetof(YV12_BUFFER_CONFIG, uv_stride));
-DEFINE(yv12_buffer_config_y_buffer,             offsetof(YV12_BUFFER_CONFIG, y_buffer));
-DEFINE(yv12_buffer_config_u_buffer,             offsetof(YV12_BUFFER_CONFIG, u_buffer));
-DEFINE(yv12_buffer_config_v_buffer,             offsetof(YV12_BUFFER_CONFIG, v_buffer));
-DEFINE(yv12_buffer_config_border,               offsetof(YV12_BUFFER_CONFIG, border));
-#endif
-
-#if CONFIG_VP8_DECODER
-DEFINE(mb_diff,                                 offsetof(MACROBLOCKD, diff));
-DEFINE(mb_predictor,                            offsetof(MACROBLOCKD, predictor));
-DEFINE(mb_dst_y_stride,                         offsetof(MACROBLOCKD, dst.y_stride));
-DEFINE(mb_dst_y_buffer,                         offsetof(MACROBLOCKD, dst.y_buffer));
-DEFINE(mb_dst_u_buffer,                         offsetof(MACROBLOCKD, dst.u_buffer));
-DEFINE(mb_dst_v_buffer,                         offsetof(MACROBLOCKD, dst.v_buffer));
-DEFINE(mb_up_available,                         offsetof(MACROBLOCKD, up_available));
-DEFINE(mb_left_available,                       offsetof(MACROBLOCKD, left_available));
-
-DEFINE(detok_scan,                              offsetof(DETOK, scan));
-DEFINE(detok_ptr_block2leftabove,               offsetof(DETOK, ptr_block2leftabove));
-DEFINE(detok_coef_tree_ptr,                     offsetof(DETOK, vp8_coef_tree_ptr));
-DEFINE(detok_teb_base_ptr,                      offsetof(DETOK, teb_base_ptr));
-DEFINE(detok_norm_ptr,                          offsetof(DETOK, norm_ptr));
-DEFINE(detok_ptr_coef_bands_x,                  offsetof(DETOK, ptr_coef_bands_x));
-
-DEFINE(detok_A,                                 offsetof(DETOK, A));
-DEFINE(detok_L,                                 offsetof(DETOK, L));
-
-DEFINE(detok_qcoeff_start_ptr,                  offsetof(DETOK, qcoeff_start_ptr));
-DEFINE(detok_current_bc,                        offsetof(DETOK, current_bc));
-DEFINE(detok_coef_probs,                        offsetof(DETOK, coef_probs));
-DEFINE(detok_eob,                               offsetof(DETOK, eob));
-
-DEFINE(bool_decoder_user_buffer_end,            offsetof(BOOL_DECODER, user_buffer_end));
-DEFINE(bool_decoder_user_buffer,                offsetof(BOOL_DECODER, user_buffer));
-DEFINE(bool_decoder_value,                      offsetof(BOOL_DECODER, value));
-DEFINE(bool_decoder_count,                      offsetof(BOOL_DECODER, count));
-DEFINE(bool_decoder_range,                      offsetof(BOOL_DECODER, range));
-
-DEFINE(tokenextrabits_min_val,                  offsetof(TOKENEXTRABITS, min_val));
-DEFINE(tokenextrabits_length,                   offsetof(TOKENEXTRABITS, Length));
-#endif
-
-//add asserts for any offset that is not supported by assembly code
-//add asserts for any size that is not supported by assembly code
-/*
- * return 0;
- * }
- */
--- a/vp8/common/asm_com_offsets.c
+++ b/vp8/common/asm_com_offsets.c
@@ -0,0 +1,40 @@
+/*
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vpx/vpx_codec.h"
+#include "vpx_ports/asm_offsets.h"
+#include "vpx_scale/yv12config.h"
+
+BEGIN
+
+/* vpx_scale */
+DEFINE(yv12_buffer_config_y_width,              offsetof(YV12_BUFFER_CONFIG, y_width));
+DEFINE(yv12_buffer_config_y_height,             offsetof(YV12_BUFFER_CONFIG, y_height));
+DEFINE(yv12_buffer_config_y_stride,             offsetof(YV12_BUFFER_CONFIG, y_stride));
+DEFINE(yv12_buffer_config_uv_width,             offsetof(YV12_BUFFER_CONFIG, uv_width));
+DEFINE(yv12_buffer_config_uv_height,            offsetof(YV12_BUFFER_CONFIG, uv_height));
+DEFINE(yv12_buffer_config_uv_stride,            offsetof(YV12_BUFFER_CONFIG, uv_stride));
+DEFINE(yv12_buffer_config_y_buffer,             offsetof(YV12_BUFFER_CONFIG, y_buffer));
+DEFINE(yv12_buffer_config_u_buffer,             offsetof(YV12_BUFFER_CONFIG, u_buffer));
+DEFINE(yv12_buffer_config_v_buffer,             offsetof(YV12_BUFFER_CONFIG, v_buffer));
+DEFINE(yv12_buffer_config_border,               offsetof(YV12_BUFFER_CONFIG, border));
+DEFINE(VP8BORDERINPIXELS_VAL,                   VP8BORDERINPIXELS);
+
+END
+
+/* add asserts for any offset that is not supported by assembly code */
+/* add asserts for any size that is not supported by assembly code */
+
+#if HAVE_ARMV7
+/* vp8_yv12_extend_frame_borders_neon makes several assumptions based on this */
+ct_assert(VP8BORDERINPIXELS_VAL, VP8BORDERINPIXELS == 32)
+#endif
--- a/vp8/common/blockd.c
+++ b/vp8/common/blockd.c
@@ -12,8 +12,6 @@
 #include "blockd.h"
 #include "vpx_mem/vpx_mem.h"

-const int vp8_block2type[25] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 1};
-
 const unsigned char vp8_block2left[25] =
 {
    0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
--- a/vp8/common/blockd.h
+++ b/vp8/common/blockd.h
@@ -28,11 +28,6 @@ void vpx_log(const char *format, ...);
 #define DCPREDSIMTHRESH 0
 #define DCPREDCNTTHRESH 3

-#define Y1CONTEXT 0
-#define UCONTEXT 1
-#define VCONTEXT 2
-#define Y2CONTEXT 3
-
 #define MB_FEATURE_TREE_PROBS   3
 #define MAX_MB_SEGMENTS         4

@@ -48,6 +43,11 @@ typedef struct
    int r, c;
 } POS;

+#define PLANE_TYPE_Y_NO_DC    0
+#define PLANE_TYPE_Y2         1
+#define PLANE_TYPE_UV         2
+#define PLANE_TYPE_Y_WITH_DC  3
+

 typedef char ENTROPY_CONTEXT;
 typedef struct
@@ -58,8 +58,6 @@ typedef struct
    ENTROPY_CONTEXT y2;
 } ENTROPY_CONTEXT_PLANES;

-extern const int vp8_block2type[25];
-
 extern const unsigned char vp8_block2left[25];
 extern const unsigned char vp8_block2above[25];

@@ -139,16 +137,11 @@ typedef enum
   modes for the Y blocks to the left and above us; for interframes, there
   is a single probability table. */

-typedef struct
+union b_mode_info
 {
-    B_PREDICTION_MODE mode;
-    union
-    {
-        int as_int;
-        MV  as_mv;
-    } mv;
-} B_MODE_INFO;
-
+    B_PREDICTION_MODE as_mode;
+    int_mv mv;
+};

 typedef enum
 {
@@ -163,38 +156,26 @@ typedef struct
 {
    MB_PREDICTION_MODE mode, uv_mode;
    MV_REFERENCE_FRAME ref_frame;
-    union
-    {
-        int as_int;
-        MV  as_mv;
-    } mv;
+    int_mv mv;

    unsigned char partitioning;
    unsigned char mb_skip_coeff;                                /* does this mb has coefficients at all, 1=no coefficients, 0=need decode tokens */
-    unsigned char dc_diff;
    unsigned char need_to_clamp_mvs;
-
    unsigned char segment_id;                  /* Which set of segmentation parameters should be used for this MB */
-
-    unsigned char force_no_skip; /* encoder only */
 } MB_MODE_INFO;

-
 typedef struct
 {
    MB_MODE_INFO mbmi;
-    B_MODE_INFO bmi[16];
+    union b_mode_info bmi[16];
 } MODE_INFO;

-
 typedef struct
 {
    short *qcoeff;
    short *dqcoeff;
    unsigned char  *predictor;
    short *diff;
-    short *reference;
-
    short *dequant;

    /* 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries */
@@ -208,15 +189,13 @@ typedef struct

    int eob;

-    B_MODE_INFO bmi;
-
+    union b_mode_info bmi;
 } BLOCKD;

-typedef struct
+typedef struct MacroBlockD
 {
    DECLARE_ALIGNED(16, short, diff[400]);      /* from idct diff */
    DECLARE_ALIGNED(16, unsigned char,  predictor[384]);
-/* not used    DECLARE_ALIGNED(16, short, reference[384]); */
    DECLARE_ALIGNED(16, short, qcoeff[400]);
    DECLARE_ALIGNED(16, short, dqcoeff[400]);
    DECLARE_ALIGNED(16, char,  eobs[25]);
@@ -273,6 +252,9 @@ typedef struct
    int mb_to_top_edge;
    int mb_to_bottom_edge;

+    int ref_frame_cost[MAX_REF_FRAMES];
+
+
    unsigned int frames_since_golden;
    unsigned int frames_till_alt_ref_frame;
    vp8_subpix_fn_t  subpixel_predict;
@@ -282,6 +264,16 @@ typedef struct

    void *current_bc;

+    int corrupted;
+
+#if ARCH_X86 || ARCH_X86_64
+    /* This is an intermediate buffer currently used in sub-pixel motion search
+     * to keep a copy of the reference area. This buffer can be used for other
+     * purpose.
+     */
+    DECLARE_ALIGNED(32, unsigned char, y_buf[22*32]);
+#endif
+
 #if CONFIG_RUNTIME_CPU_DETECT
    struct VP8_COMMON_RTCD  *rtcd;
 #endif
@@ -291,4 +283,20 @@ typedef struct
 extern void vp8_build_block_doffsets(MACROBLOCKD *x);
 extern void vp8_setup_block_dptrs(MACROBLOCKD *x);

+static void update_blockd_bmi(MACROBLOCKD *xd)
+{
+    int i;
+    int is_4x4;
+    is_4x4 = (xd->mode_info_context->mbmi.mode == SPLITMV) ||
+              (xd->mode_info_context->mbmi.mode == B_PRED);
+
+    if (is_4x4)
+    {
+        for (i = 0; i < 16; i++)
+        {
+            xd->block[i].bmi = xd->mode_info_context->bmi[i];
+        }
+    }
+}
+
 #endif  /* __INC_BLOCKD_H */
--- a/vp8/common/boolcoder.h
+++ b/vp8/common/boolcoder.h
@@ -1,570 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef bool_coder_h
-#define bool_coder_h 1
-
-/* Arithmetic bool coder with largish probability range.
-   Timothy S Murphy  6 August 2004 */
-
-/* So as not to force users to drag in too much of my idiosyncratic C++ world,
-   I avoid fancy storage management. */
-
-#include <assert.h>
-
-#include <stddef.h>
-#include <stdio.h>
-
-typedef unsigned char vp8bc_index_t; // probability index
-
-/* There are a couple of slight variants in the details of finite-precision
-   arithmetic coding.  May be safely ignored by most users. */
-
-enum vp8bc_rounding
-{
-    vp8bc_down = 0,     // just like VP8
-    vp8bc_down_full = 1, // handles minimum probability correctly
-    vp8bc_up = 2
-};
-
-#if _MSC_VER
-
-/* Note that msvc by default does not inline _anything_ (regardless of the
-   setting of inline_depth) and that a command-line option (-Ob1 or -Ob2)
-   is required to inline even the smallest functions. */
-
-#   pragma inline_depth( 255)           // I mean it when I inline something
-#   pragma warning( disable : 4099)     // No class vs. struct harassment
-#   pragma warning( disable : 4250)     // dominance complaints
-#   pragma warning( disable : 4284)     // operator-> in templates
-#   pragma warning( disable : 4800)     // bool conversion
-
-// don't let prefix ++,-- stand in for postfix, disaster would ensue
-
-#   pragma warning( error : 4620 4621)
-
-#endif  // _MSC_VER
-
-
-#if __cplusplus
-
-// Sometimes one wishes to be definite about integer lengths.
-
-struct int_types
-{
-    typedef const bool cbool;
-    typedef const signed char cchar;
-    typedef const short cshort;
-    typedef const int cint;
-    typedef const int clong;
-
-    typedef const double cdouble;
-    typedef const size_t csize_t;
-
-    typedef unsigned char uchar;    // 8 bits
-    typedef const uchar cuchar;
-
-    typedef short int16;
-    typedef unsigned short uint16;
-    typedef const int16 cint16;
-    typedef const uint16 cuint16;
-
-    typedef int int32;
-    typedef unsigned int uint32;
-    typedef const int32 cint32;
-    typedef const uint32 cuint32;
-
-    typedef unsigned int uint;
-    typedef unsigned int ulong;
-    typedef const uint cuint;
-    typedef const ulong culong;
-
-
-    // All structs consume space, may as well have a vptr.
-
-    virtual ~int_types();
-};
-
-
-struct bool_coder_spec;
-struct bool_coder;
-struct bool_writer;
-struct bool_reader;
-
-
-struct bool_coder_namespace : int_types
-{
-    typedef vp8bc_index_t Index;
-    typedef bool_coder_spec Spec;
-    typedef const Spec c_spec;
-
-    enum Rounding
-    {
-        Down = vp8bc_down,
-        down_full = vp8bc_down_full,
-        Up = vp8bc_up
-    };
-};
-
-
-// Archivable specification of a bool coder includes rounding spec
-// and probability mapping table.  The latter replaces a uchar j
-// (0 <= j < 256) with an arbitrary uint16 tbl[j] = p.
-// p/65536 is then the probability of a zero.
-
-struct bool_coder_spec : bool_coder_namespace
-{
-    friend struct bool_coder;
-    friend struct bool_writer;
-    friend struct bool_reader;
-    friend struct bool_coder_spec_float;
-    friend struct bool_coder_spec_explicit_table;
-    friend struct bool_coder_spec_exponential_table;
-    friend struct BPsrc;
-private:
-    uint w;                 // precision
-    Rounding r;
-
-    uint ebits, mbits, ebias;
-    uint32 mmask;
-
-    Index max_index, half_index;
-
-    uint32 mantissa(Index i) const
-    {
-        assert(i < half_index);
-        return (1 << mbits) + (i & mmask);
-    }
-    uint exponent(Index i) const
-    {
-        assert(i < half_index);
-        return ebias - (i >> mbits);
-    }
-
-    uint16 Ptbl[256];       // kinda clunky, but so is storage management.
-
-    /* Cost in bits of encoding a zero at every probability, scaled by 2^20.
-       Assumes that index is at most 8 bits wide. */
-
-    uint32 Ctbl[256];
-
-    uint32 split(Index i, uint32 R) const    // 1 <= split <= max( 1, R-1)
-    {
-        if (!ebias)
-            return 1 + (((R - 1) * Ptbl[i]) >> 16);
-
-        if (i >= half_index)
-            return R - split(max_index - i, R);
-
-        return 1 + (((R - 1) * mantissa(i)) >> exponent(i));
-    }
-
-    uint32 max_range() const
-    {
-        return (1 << w) - (r == down_full ? 0 : 1);
-    }
-    uint32 min_range() const
-    {
-        return (1 << (w - 1)) + (r == down_full ? 1 : 0);
-    }
-    uint32 Rinc() const
-    {
-        return r == Up ? 1 : 0;
-    }
-
-    void check_prec() const;
-
-    bool float_init(uint Ebits, uint Mbits);
-
-    void cost_init();
-
-    bool_coder_spec(
-        uint prec, Rounding rr, uint Ebits = 0, uint Mbits = 0
-    )
-        : w(prec), r(rr)
-    {
-        float_init(Ebits, Mbits);
-    }
-public:
-    // Read complete spec from file.
-    bool_coder_spec(FILE *);
-
-    // Write spec to file.
-    void dump(FILE *) const;
-
-    // return probability index best approximating prob.
-    Index operator()(double prob) const;
-
-    // probability corresponding to index
-    double operator()(Index i) const;
-
-    Index complement(Index i) const
-    {
-        return max_index - i;
-    }
-
-    Index max_index() const
-    {
-        return max_index;
-    }
-    Index half_index() const
-    {
-        return half_index;
-    }
-
-    uint32 cost_zero(Index i) const
-    {
-        return Ctbl[i];
-    }
-    uint32 cost_one(Index i) const
-    {
-        return Ctbl[ max_index - i];
-    }
-    uint32 cost_bit(Index i, bool b) const
-    {
-        return Ctbl[b? max_index-i:i];
-    }
-};
-
-
-/* Pseudo floating-point probability specification.
-
-   At least one of Ebits and Mbits must be nonzero.
-
-   Since all arithmetic is done at 32 bits, Ebits is at most 5.
-
-   Total significant bits in index is Ebits + Mbits + 1.
-
-   Below the halfway point (i.e. when the top significant bit is 0),
-   the index is (e << Mbits) + m.
-
-   The exponent e is between 0 and (2**Ebits) - 1,
-   the mantissa m is between 0 and (2**Mbits) - 1.
-
-   Prepending an implicit 1 to the mantissa, the probability is then
-
-        (2**Mbits + m) >> (e - 2**Ebits - 1 - Mbits),
-
-   which has (1/2)**(2**Ebits + 1) as a minimum
-   and (1/2) * [1 - 2**(Mbits + 1)] as a maximum.
-
-   When the index is above the halfway point, the probability is the
-   complement of the probability associated to the complement of the index.
-
-   Note that the probability increases with the index and that, because of
-   the symmetry, we cannot encode probability exactly 1/2; though we
-   can get as close to 1/2 as we like, provided we have enough Mbits.
-
-   The latter is of course not a problem in practice, one never has
-   exact probabilities and entropy errors are second order, that is, the
-   "overcoding" of a zero will be largely compensated for by the
-   "undercoding" of a one (or vice-versa).
-
-   Compared to arithmetic probability specs (a la VP8), this will do better
-   at very high and low probabilities and worse at probabilities near 1/2,
-   as well as facilitating the usage of wider or narrower probability indices.
-*/
-
-struct bool_coder_spec_float : bool_coder_spec
-{
-    bool_coder_spec_float(
-        uint Ebits = 3, uint Mbits = 4, Rounding rr = down_full, uint prec = 12
-    )
-        : bool_coder_spec(prec, rr, Ebits, Mbits)
-    {
-        cost_init();
-    }
-};
-
-
-struct bool_coder_spec_explicit_table : bool_coder_spec
-{
-    bool_coder_spec_explicit_table(
-        cuint16 probability_table[256] = 0,  // default is tbl[i] = i << 8.
-        Rounding = down_full,
-        uint precision = 16
-    );
-};
-
-// Contruct table via multiplicative interpolation between
-// p[128] = 1/2  and p[0] = (1/2)^x.
-// Since we are working with 16-bit precision, x is at most 16.
-// For probabilities to increase with i, we must have x > 1.
-// For 0 <= i <= 128, p[i] = (1/2)^{ 1 + [1 - (i/128)]*[x-1] }.
-// Finally, p[128+i] = 1 - p[128 - i].
-
-struct bool_coder_spec_exponential_table : bool_coder_spec
-{
-    bool_coder_spec_exponential_table(uint x, Rounding = down_full, uint prec = 16);
-};
-
-
-// Commonalities between writer and reader.
-
-struct bool_coder : bool_coder_namespace
-{
-    friend struct bool_writer;
-    friend struct bool_reader;
-    friend struct BPsrc;
-private:
-    uint32 Low, Range;
-    cuint32 min_range;
-    cuint32 rinc;
-    c_spec spec;
-
-    void _reset()
-    {
-        Low = 0;
-        Range = spec.max_range();
-    }
-
-    bool_coder(c_spec &s)
-        :  min_range(s.min_range()),
-           rinc(s.Rinc()),
-           spec(s)
-    {
-        _reset();
-    }
-
-    uint32 half() const
-    {
-        return 1 + ((Range - 1) >> 1);
-    }
-public:
-    c_spec &Spec() const
-    {
-        return spec;
-    }
-};
-
-
-struct bool_writer : bool_coder
-{
-    friend struct BPsrc;
-private:
-    uchar *Bstart, *Bend, *B;
-    int bit_lag;
-    bool is_toast;
-    void carry();
-    void reset()
-    {
-        _reset();
-        bit_lag = 32 - spec.w;
-        is_toast = 0;
-    }
-    void raw(bool value, uint32 split);
-public:
-    bool_writer(c_spec &, uchar *Dest, size_t Len);
-    virtual ~bool_writer();
-
-    void operator()(Index p, bool v)
-    {
-        raw(v, spec.split(p, Range));
-    }
-
-    uchar *buf() const
-    {
-        return Bstart;
-    }
-    size_t bytes_written() const
-    {
-        return B - Bstart;
-    }
-
-    // Call when done with input, flushes internal state.
-    // DO NOT write any more data after calling this.
-
-    bool_writer &flush();
-
-    void write_bits(int n, uint val)
-    {
-        if (n)
-        {
-            uint m = 1 << (n - 1);
-
-            do
-            {
-                raw((bool)(val & m), half());
-            }
-            while (m >>= 1);
-        }
-    }
-
-#   if 0
-    // We are agnostic about storage management.
-    // By default, overflows throw an assert but user can
-    // override to provide an expanding buffer using ...
-
-    virtual void overflow(uint Len) const;
-
-    // ... this function copies already-written data into new buffer
-    // and retains new buffer location.
-
-    void new_buffer(uchar *dest, uint Len);
-
-    // Note that storage management is the user's responsibility.
-#   endif
-};
-
-
-// This could be adjusted to use a little less lookahead.
-
-struct bool_reader : bool_coder
-{
-    friend struct BPsrc;
-private:
-    cuchar *const Bstart;   // for debugging
-    cuchar *B;
-    cuchar *const Bend;
-    cuint shf;
-    uint bct;
-    bool raw(uint32 split);
-public:
-    bool_reader(c_spec &s, cuchar *src, size_t Len);
-
-    bool operator()(Index p)
-    {
-        return raw(spec.split(p, Range));
-    }
-
-    uint read_bits(int num_bits)
-    {
-        uint v = 0;
-
-        while (--num_bits >= 0)
-            v += v + (raw(half()) ? 1 : 0);
-
-        return v;
-    }
-};
-
-extern "C" {
-
-#endif /*  __cplusplus */
-
-
-    /* C interface */
-
-    typedef struct bool_coder_spec bool_coder_spec;
-    typedef struct bool_writer bool_writer;
-    typedef struct bool_reader bool_reader;
-
-    typedef const bool_coder_spec c_bool_coder_spec;
-    typedef const bool_writer c_bool_writer;
-    typedef const bool_reader c_bool_reader;
-
-
-    /* Optionally override default precision when constructing coder_specs.
-       Just pass a zero pointer if you don't care.
-       Precision is at most 16 bits for table specs, at most 23 otherwise. */
-
-    struct vp8bc_prec
-    {
-        enum vp8bc_rounding r;      /* see top header file for def */
-        unsigned int prec;          /* range precision in bits */
-    };
-
-    typedef const struct vp8bc_prec vp8bc_c_prec;
-
-    /* bool_coder_spec contains mapping of uchars to actual probabilities
-       (16 bit uints) as well as (usually immaterial) selection of
-       exact finite-precision algorithm used (for now, the latter can only
-       be overridden using the C++ interface).
-       See comments above the corresponding C++ constructors for discussion,
-       especially of exponential probability table generation. */
-
-    bool_coder_spec *vp8bc_vp8spec(); // just like vp8
-
-    bool_coder_spec *vp8bc_literal_spec(
-        const unsigned short prob_map[256],  // 0 is like vp8 w/more precision
-        vp8bc_c_prec*
-    );
-
-    bool_coder_spec *vp8bc_float_spec(
-        unsigned int exponent_bits, unsigned int mantissa_bits, vp8bc_c_prec*
-    );
-
-    bool_coder_spec *vp8bc_exponential_spec(unsigned int min_exp, vp8bc_c_prec *);
-
-    bool_coder_spec *vp8bc_spec_from_file(FILE *);
-
-
-    void vp8bc_destroy_spec(c_bool_coder_spec *);
-
-    void vp8bc_spec_to_file(c_bool_coder_spec *, FILE *);
-
-
-    /* Nearest index to supplied probability of zero, 0 <= prob <= 1. */
-
-    vp8bc_index_t vp8bc_index(c_bool_coder_spec *, double prob);
-
-    vp8bc_index_t vp8bc_index_from_counts(
-        c_bool_coder_spec *p, unsigned int zero_ct, unsigned int one_ct
-    );
-
-    /* In case you want to look */
-
-    double vp8bc_probability(c_bool_coder_spec *, vp8bc_index_t);
-
-    /* Opposite index */
-
-    vp8bc_index_t vp8bc_complement(c_bool_coder_spec *, vp8bc_index_t);
-
-    /* Cost in bits of encoding a zero at given probability, scaled by 2^20.
-       (assumes that an int holds at least 32 bits). */
-
-    unsigned int vp8bc_cost_zero(c_bool_coder_spec *, vp8bc_index_t);
-
-    unsigned int vp8bc_cost_one(c_bool_coder_spec *, vp8bc_index_t);
-    unsigned int vp8bc_cost_bit(c_bool_coder_spec *, vp8bc_index_t, int);
-
-
-    /* bool_writer interface */
-
-    /* Length = 0 disables checking for writes beyond buffer end. */
-
-    bool_writer *vp8bc_create_writer(
-        c_bool_coder_spec *, unsigned char *Destination, size_t Length
-    );
-
-    /* Flushes out any buffered data and returns total # of bytes written. */
-
-    size_t vp8bc_destroy_writer(bool_writer *);
-
-    void vp8bc_write_bool(bool_writer *, int boolean_val, vp8bc_index_t false_prob);
-
-    void vp8bc_write_bits(
-        bool_writer *, unsigned int integer_value, int number_of_bits
-    );
-
-    c_bool_coder_spec *vp8bc_writer_spec(c_bool_writer *);
-
-
-    /* bool_reader interface */
-
-    /* Length = 0 disables checking for reads beyond buffer end. */
-
-    bool_reader *vp8bc_create_reader(
-        c_bool_coder_spec *, const unsigned char *Source, size_t Length
-    );
-    void vp8bc_destroy_reader(bool_reader *);
-
-    int vp8bc_read_bool(bool_reader *, vp8bc_index_t false_prob);
-
-    unsigned int vp8bc_read_bits(bool_reader *, int number_of_bits);
-
-    c_bool_coder_spec *vp8bc_reader_spec(c_bool_reader *);
-
-#if __cplusplus
-}
-#endif
-
-#endif  /* bool_coder_h */
--- a/vp8/common/codec_common_interface.h
+++ b/vp8/common/codec_common_interface.h
@@ -1,93 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef CODEC_COMMON_INTERFACE_H
-#define CODEC_COMMON_INTERFACE_H
-
-#define __export
-#define _export
-#define dll_export   __declspec( dllexport )
-#define dll_import   __declspec( dllimport )
-
-// Playback ERROR Codes.
-#define NO_DECODER_ERROR            0
-#define REMOTE_DECODER_ERROR        -1
-
-#define DFR_BAD_DCT_COEFF           -100
-#define DFR_ZERO_LENGTH_FRAME       -101
-#define DFR_FRAME_SIZE_INVALID      -102
-#define DFR_OUTPUT_BUFFER_OVERFLOW  -103
-#define DFR_INVALID_FRAME_HEADER    -104
-#define FR_INVALID_MODE_TOKEN       -110
-#define ETR_ALLOCATION_ERROR        -200
-#define ETR_INVALID_ROOT_PTR        -201
-#define SYNCH_ERROR                 -400
-#define BUFFER_UNDERFLOW_ERROR      -500
-#define PB_IB_OVERFLOW_ERROR        -501
-
-// External error triggers
-#define PB_HEADER_CHECKSUM_ERROR    -601
-#define PB_DATA_CHECKSUM_ERROR      -602
-
-// DCT Error Codes
-#define DDCT_EXPANSION_ERROR        -700
-#define DDCT_INVALID_TOKEN_ERROR    -701
-
-// exception_errors
-#define GEN_EXCEPTIONS              -800
-#define EX_UNQUAL_ERROR             -801
-
-// Unrecoverable error codes
-#define FATAL_PLAYBACK_ERROR        -1000
-#define GEN_ERROR_CREATING_CDC      -1001
-#define GEN_THREAD_CREATION_ERROR   -1002
-#define DFR_CREATE_BMP_FAILED       -1003
-
-// YUV buffer configuration structure
-typedef struct
-{
-    int     y_width;
-    int     y_height;
-    int     y_stride;
-
-    int     uv_width;
-    int     uv_height;
-    int     uv_stride;
-
-    unsigned char   *y_buffer;
-    unsigned char   *u_buffer;
-    unsigned char   *v_buffer;
-
-} YUV_BUFFER_CONFIG;
-typedef enum
-{
-    C_SET_KEY_FRAME,
-    C_SET_FIXED_Q,
-    C_SET_FIRSTPASS_FILE,
-    C_SET_EXPERIMENTAL_MIN,
-    C_SET_EXPERIMENTAL_MAX = C_SET_EXPERIMENTAL_MIN + 255,
-    C_SET_CHECKPROTECT,
-    C_SET_TESTMODE,
-    C_SET_INTERNAL_SIZE,
-    C_SET_RECOVERY_FRAME,
-    C_SET_REFERENCEFRAME,
-    C_SET_GOLDENFRAME
-
-#ifndef VP50_COMP_INTERFACE
-    // Specialist test facilities.
-//    C_VCAP_PARAMS,              // DO NOT USE FOR NOW WITH VFW CODEC
-#endif
-
-} C_SETTING;
-
-typedef unsigned long C_SET_VALUE;
-
-
-#endif
--- a/vp8/common/coefupdateprobs.h
+++ b/vp8/common/coefupdateprobs.h
@@ -12,7 +12,7 @@
 /* Update probabilities for the nodes in the token entropy tree.
   Generated file included by entropy.c */

-const vp8_prob vp8_coef_update_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [vp8_coef_tokens-1] =
+const vp8_prob vp8_coef_update_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES] =
 {
    {
        {
--- a/vp8/common/debugmodes.c
+++ b/vp8/common/debugmodes.c
@@ -97,7 +97,7 @@ void vp8_print_modes_and_motion_vectors(MODE_INFO *mi, int rows, int cols, int f
                bindex = (b_row & 3) * 4 + (b_col & 3);

                if (mi[mb_index].mbmi.mode == B_PRED)
-                    fprintf(mvs, "%2d ", mi[mb_index].bmi[bindex].mode);
+                    fprintf(mvs, "%2d ", mi[mb_index].bmi[bindex].as_mode);
                else
                    fprintf(mvs, "xx ");

--- a/vp8/common/defaultcoefcounts.c
+++ b/vp8/common/defaultcoefcounts.c
@@ -0,0 +1,225 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "defaultcoefcounts.h"
+
+/* Generated file, included by entropy.c */
+
+const unsigned int vp8_default_coef_counts[BLOCK_TYPES]
+                                          [COEF_BANDS]
+                                          [PREV_COEF_CONTEXTS]
+                                          [MAX_ENTROPY_TOKENS] =
+{
+
+    {
+        /* Block Type ( 0 ) */
+        {
+            /* Coeff Band ( 0 ) */
+            {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,},
+            {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,},
+            {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,},
+        },
+        {
+            /* Coeff Band ( 1 ) */
+            {30190, 26544, 225,  24,   4,   0,   0,   0,   0,   0,   0, 4171593,},
+            {26846, 25157, 1241, 130,  26,   6,   1,   0,   0,   0,   0, 149987,},
+            {10484, 9538, 1006, 160,  36,  18,   0,   0,   0,   0,   0, 15104,},
+        },
+        {
+            /* Coeff Band ( 2 ) */
+            {25842, 40456, 1126,  83,  11,   2,   0,   0,   0,   0,   0,   0,},
+            {9338, 8010, 512,  73,   7,   3,   2,   0,   0,   0,   0, 43294,},
+            {1047, 751, 149,  31,  13,   6,   1,   0,   0,   0,   0, 879,},
+        },
+        {
+            /* Coeff Band ( 3 ) */
+            {26136, 9826, 252,  13,   0,   0,   0,   0,   0,   0,   0,   0,},
+            {8134, 5574, 191,  14,   2,   0,   0,   0,   0,   0,   0, 35302,},
+            { 605, 677, 116,   9,   1,   0,   0,   0,   0,   0,   0, 611,},
+        },
+        {
+            /* Coeff Band ( 4 ) */
+            {10263, 15463, 283,  17,   0,   0,   0,   0,   0,   0,   0,   0,},
+            {2773, 2191, 128,   9,   2,   2,   0,   0,   0,   0,   0, 10073,},
+            { 134, 125,  32,   4,   0,   2,   0,   0,   0,   0,   0,  50,},
+        },
+        {
+            /* Coeff Band ( 5 ) */
+            {10483, 2663,  23,   1,   0,   0,   0,   0,   0,   0,   0,   0,},
+            {2137, 1251,  27,   1,   1,   0,   0,   0,   0,   0,   0, 14362,},
+            { 116, 156,  14,   2,   1,   0,   0,   0,   0,   0,   0, 190,},
+        },
+        {
+            /* Coeff Band ( 6 ) */
+            {40977, 27614, 412,  28,   0,   0,   0,   0,   0,   0,   0,   0,},
+            {6113, 5213, 261,  22,   3,   0,   0,   0,   0,   0,   0, 26164,},
+            { 382, 312,  50,  14,   2,   0,   0,   0,   0,   0,   0, 345,},
+        },
+        {
+            /* Coeff Band ( 7 ) */
+            {   0,  26,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,},
+            {   0,  13,   0,   0,   0,   0,   0,   0,   0,   0,   0, 319,},
+            {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   8,},
+        },
+    },
+    {
+        /* Block Type ( 1 ) */
+        {
+            /* Coeff Band ( 0 ) */
+            {3268, 19382, 1043, 250,  93,  82,  49,  26,  17,   8,  25, 82289,},
+            {8758, 32110, 5436, 1832, 827, 668, 420, 153,  24,   0,   3, 52914,},
+            {9337, 23725, 8487, 3954, 2107, 1836, 1069, 399,  59,   0,   0, 18620,},
+        },
+        {
+            /* Coeff Band ( 1 ) */
+            {12419, 8420, 452,  62,   9,   1,   0,   0,   0,   0,   0,   0,},
+            {11715, 8705, 693,  92,  15,   7,   2,   0,   0,   0,   0, 53988,},
+            {7603, 8585, 2306, 778, 270, 145,  39,   5,   0,   0,   0, 9136,},
+        },
+        {
+            /* Coeff Band ( 2 ) */
+            {15938, 14335, 1207, 184,  55,  13,   4,   1,   0,   0,   0,   0,},
+            {7415, 6829, 1138, 244,  71,  26,   7,   0,   0,   0,   0, 9980,},
+            {1580, 1824, 655, 241,  89,  46,  10,   2,   0,   0,   0, 429,},
+        },
+        {
+            /* Coeff Band ( 3 ) */
+            {19453, 5260, 201,  19,   0,   0,   0,   0,   0,   0,   0,   0,},
+            {9173, 3758, 213,  22,   1,   1,   0,   0,   0,   0,   0, 9820,},
+            {1689, 1277, 276,  51,  17,   4,   0,   0,   0,   0,   0, 679,},
+        },
+        {
+            /* Coeff Band ( 4 ) */
+            {12076, 10667, 620,  85,  19,   9,   5,   0,   0,   0,   0,   0,},
+            {4665, 3625, 423,  55,  19,   9,   0,   0,   0,   0,   0, 5127,},
+            { 415, 440, 143,  34,  20,   7,   2,   0,   0,   0,   0, 101,},
+        },
+        {
+            /* Coeff Band ( 5 ) */
+            {12183, 4846, 115,  11,   1,   0,   0,   0,   0,   0,   0,   0,},
+            {4226, 3149, 177,  21,   2,   0,   0,   0,   0,   0,   0, 7157,},
+            { 375, 621, 189,  51,  11,   4,   1,   0,   0,   0,   0, 198,},
+        },
+        {
+            /* Coeff Band ( 6 ) */
+            {61658, 37743, 1203,  94,  10,   3,   0,   0,   0,   0,   0,   0,},
+            {15514, 11563, 903, 111,  14,   5,   0,   0,   0,   0,   0, 25195,},
+            { 929, 1077, 291,  78,  14,   7,   1,   0,   0,   0,   0, 507,},
+        },
+        {
+            /* Coeff Band ( 7 ) */
+            {   0, 990,  15,   3,   0,   0,   0,   0,   0,   0,   0,   0,},
+            {   0, 412,  13,   0,   0,   0,   0,   0,   0,   0,   0, 1641,},
+            {   0,  18,   7,   1,   0,   0,   0,   0,   0,   0,   0,  30,},
+        },
+    },
+    {
+        /* Block Type ( 2 ) */
+        {
+            /* Coeff Band ( 0 ) */
+            { 953, 24519, 628, 120,  28,  12,   4,   0,   0,   0,   0, 2248798,},
+            {1525, 25654, 2647, 617, 239, 143,  42,   5,   0,   0,   0, 66837,},
+            {1180, 11011, 3001, 1237, 532, 448, 239,  54,   5,   0,   0, 7122,},
+        },
+        {
+            /* Coeff Band ( 1 ) */
+            {1356, 2220,  67,  10,   4,   1,   0,   0,   0,   0,   0,   0,},
+            {1450, 2544, 102,  18,   4,   3,   0,   0,   0,   0,   0, 57063,},
+            {1182, 2110, 470, 130,  41,  21,   0,   0,   0,   0,   0, 6047,},
+        },
+        {
+            /* Coeff Band ( 2 ) */
+            { 370, 3378, 200,  30,   5,   4,   1,   0,   0,   0,   0,   0,},
+            { 293, 1006, 131,  29,  11,   0,   0,   0,   0,   0,   0, 5404,},
+            { 114, 387,  98,  23,   4,   8,   1,   0,   0,   0,   0, 236,},
+        },
+        {
+            /* Coeff Band ( 3 ) */
+            { 579, 194,   4,   0,   0,   0,   0,   0,   0,   0,   0,   0,},
+            { 395, 213,   5,   1,   0,   0,   0,   0,   0,   0,   0, 4157,},
+            { 119, 122,   4,   0,   0,   0,   0,   0,   0,   0,   0, 300,},
+        },
+        {
+            /* Coeff Band ( 4 ) */
+            {  38, 557,  19,   0,   0,   0,   0,   0,   0,   0,   0,   0,},
+            {  21, 114,  12,   1,   0,   0,   0,   0,   0,   0,   0, 427,},
+            {   0,   5,   0,   0,   0,   0,   0,   0,   0,   0,   0,   7,},
+        },
+        {
+            /* Coeff Band ( 5 ) */
+            {  52,   7,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,},
+            {  18,   6,   0,   0,   0,   0,   0,   0,   0,   0,   0, 652,},
+            {   1,   1,   0,   0,   0,   0,   0,   0,   0,   0,   0,  30,},
+        },
+        {
+            /* Coeff Band ( 6 ) */
+            { 640, 569,  10,   0,   0,   0,   0,   0,   0,   0,   0,   0,},
+            {  25,  77,   2,   0,   0,   0,   0,   0,   0,   0,   0, 517,},
+            {   4,   7,   0,   0,   0,   0,   0,   0,   0,   0,   0,   3,},
+        },
+        {
+            /* Coeff Band ( 7 ) */
+            {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,},
+            {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,},
+            {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,},
+        },
+    },
+    {
+        /* Block Type ( 3 ) */
+        {
+            /* Coeff Band ( 0 ) */
+            {2506, 20161, 2707, 767, 261, 178, 107,  30,  14,   3,   0, 100694,},
+            {8806, 36478, 8817, 3268, 1280, 850, 401, 114,  42,   0,   0, 58572,},
+            {11003, 27214, 11798, 5716, 2482, 2072, 1048, 175,  32,   0,   0, 19284,},
+        },
+        {
+            /* Coeff Band ( 1 ) */
+            {9738, 11313, 959, 205,  70,  18,  11,   1,   0,   0,   0,   0,},
+            {12628, 15085, 1507, 273,  52,  19,   9,   0,   0,   0,   0, 54280,},
+            {10701, 15846, 5561, 1926, 813, 570, 249,  36,   0,   0,   0, 6460,},
+        },
+        {
+            /* Coeff Band ( 2 ) */
+            {6781, 22539, 2784, 634, 182, 123,  20,   4,   0,   0,   0,   0,},
+            {6263, 11544, 2649, 790, 259, 168,  27,   5,   0,   0,   0, 20539,},
+            {3109, 4075, 2031, 896, 457, 386, 158,  29,   0,   0,   0, 1138,},
+        },
+        {
+            /* Coeff Band ( 3 ) */
+            {11515, 4079, 465,  73,   5,  14,   2,   0,   0,   0,   0,   0,},
+            {9361, 5834, 650,  96,  24,   8,   4,   0,   0,   0,   0, 22181,},
+            {4343, 3974, 1360, 415, 132,  96,  14,   1,   0,   0,   0, 1267,},
+        },
+        {
+            /* Coeff Band ( 4 ) */
+            {4787, 9297, 823, 168,  44,  12,   4,   0,   0,   0,   0,   0,},
+            {3619, 4472, 719, 198,  60,  31,   3,   0,   0,   0,   0, 8401,},
+            {1157, 1175, 483, 182,  88,  31,   8,   0,   0,   0,   0, 268,},
+        },
+        {
+            /* Coeff Band ( 5 ) */
+            {8299, 1226,  32,   5,   1,   0,   0,   0,   0,   0,   0,   0,},
+            {3502, 1568,  57,   4,   1,   1,   0,   0,   0,   0,   0, 9811,},
+            {1055, 1070, 166,  29,   6,   1,   0,   0,   0,   0,   0, 527,},
+        },
+        {
+            /* Coeff Band ( 6 ) */
+            {27414, 27927, 1989, 347,  69,  26,   0,   0,   0,   0,   0,   0,},
+            {5876, 10074, 1574, 341,  91,  24,   4,   0,   0,   0,   0, 21954,},
+            {1571, 2171, 778, 324, 124,  65,  16,   0,   0,   0,   0, 979,},
+        },
+        {
+            /* Coeff Band ( 7 ) */
+            {   0,  29,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,},
+            {   0,  23,   0,   0,   0,   0,   0,   0,   0,   0,   0, 459,},
+            {   0,   1,   0,   0,   0,   0,   0,   0,   0,   0,   0,  13,},
+        },
+    },
+};
--- a/vp8/common/defaultcoefcounts.h
+++ b/vp8/common/defaultcoefcounts.h
@@ -8,214 +8,14 @@
 *  be found in the AUTHORS file in the root of the source tree.
 */

+#ifndef __DEFAULTCOEFCOUNTS_H
+#define __DEFAULTCOEFCOUNTS_H

-/* Generated file, included by entropy.c */
+#include "entropy.h"

-static const unsigned int default_coef_counts [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [vp8_coef_tokens] =
-{
+extern const unsigned int vp8_default_coef_counts[BLOCK_TYPES]
+                                                 [COEF_BANDS]
+                                                 [PREV_COEF_CONTEXTS]
+                                                 [MAX_ENTROPY_TOKENS];

-    {
-        /* Block Type ( 0 ) */
-        {
-            /* Coeff Band ( 0 ) */
-            {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,},
-            {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,},
-            {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,},
-        },
-        {
-            /* Coeff Band ( 1 ) */
-            {30190, 26544, 225,  24,   4,   0,   0,   0,   0,   0,   0, 4171593,},
-            {26846, 25157, 1241, 130,  26,   6,   1,   0,   0,   0,   0, 149987,},
-            {10484, 9538, 1006, 160,  36,  18,   0,   0,   0,   0,   0, 15104,},
-        },
-        {
-            /* Coeff Band ( 2 ) */
-            {25842, 40456, 1126,  83,  11,   2,   0,   0,   0,   0,   0,   0,},
-            {9338, 8010, 512,  73,   7,   3,   2,   0,   0,   0,   0, 43294,},
-            {1047, 751, 149,  31,  13,   6,   1,   0,   0,   0,   0, 879,},
-        },
-        {
-            /* Coeff Band ( 3 ) */
-            {26136, 9826, 252,  13,   0,   0,   0,   0,   0,   0,   0,   0,},
-            {8134, 5574, 191,  14,   2,   0,   0,   0,   0,   0,   0, 35302,},
-            { 605, 677, 116,   9,   1,   0,   0,   0,   0,   0,   0, 611,},
-        },
-        {
-            /* Coeff Band ( 4 ) */
-            {10263, 15463, 283,  17,   0,   0,   0,   0,   0,   0,   0,   0,},
-            {2773, 2191, 128,   9,   2,   2,   0,   0,   0,   0,   0, 10073,},
-            { 134, 125,  32,   4,   0,   2,   0,   0,   0,   0,   0,  50,},
-        },
-        {
-            /* Coeff Band ( 5 ) */
-            {10483, 2663,  23,   1,   0,   0,   0,   0,   0,   0,   0,   0,},
-            {2137, 1251,  27,   1,   1,   0,   0,   0,   0,   0,   0, 14362,},
-            { 116, 156,  14,   2,   1,   0,   0,   0,   0,   0,   0, 190,},
-        },
-        {
-            /* Coeff Band ( 6 ) */
-            {40977, 27614, 412,  28,   0,   0,   0,   0,   0,   0,   0,   0,},
-            {6113, 5213, 261,  22,   3,   0,   0,   0,   0,   0,   0, 26164,},
-            { 382, 312,  50,  14,   2,   0,   0,   0,   0,   0,   0, 345,},
-        },
-        {
-            /* Coeff Band ( 7 ) */
-            {   0,  26,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,},
-            {   0,  13,   0,   0,   0,   0,   0,   0,   0,   0,   0, 319,},
-            {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   8,},
-        },
-    },
-    {
-        /* Block Type ( 1 ) */
-        {
-            /* Coeff Band ( 0 ) */
-            {3268, 19382, 1043, 250,  93,  82,  49,  26,  17,   8,  25, 82289,},
-            {8758, 32110, 5436, 1832, 827, 668, 420, 153,  24,   0,   3, 52914,},
-            {9337, 23725, 8487, 3954, 2107, 1836, 1069, 399,  59,   0,   0, 18620,},
-        },
-        {
-            /* Coeff Band ( 1 ) */
-            {12419, 8420, 452,  62,   9,   1,   0,   0,   0,   0,   0,   0,},
-            {11715, 8705, 693,  92,  15,   7,   2,   0,   0,   0,   0, 53988,},
-            {7603, 8585, 2306, 778, 270, 145,  39,   5,   0,   0,   0, 9136,},
-        },
-        {
-            /* Coeff Band ( 2 ) */
-            {15938, 14335, 1207, 184,  55,  13,   4,   1,   0,   0,   0,   0,},
-            {7415, 6829, 1138, 244,  71,  26,   7,   0,   0,   0,   0, 9980,},
-            {1580, 1824, 655, 241,  89,  46,  10,   2,   0,   0,   0, 429,},
-        },
-        {
-            /* Coeff Band ( 3 ) */
-            {19453, 5260, 201,  19,   0,   0,   0,   0,   0,   0,   0,   0,},
-            {9173, 3758, 213,  22,   1,   1,   0,   0,   0,   0,   0, 9820,},
-            {1689, 1277, 276,  51,  17,   4,   0,   0,   0,   0,   0, 679,},
-        },
-        {
-            /* Coeff Band ( 4 ) */
-            {12076, 10667, 620,  85,  19,   9,   5,   0,   0,   0,   0,   0,},
-            {4665, 3625, 423,  55,  19,   9,   0,   0,   0,   0,   0, 5127,},
-            { 415, 440, 143,  34,  20,   7,   2,   0,   0,   0,   0, 101,},
-        },
-        {
-            /* Coeff Band ( 5 ) */
-            {12183, 4846, 115,  11,   1,   0,   0,   0,   0,   0,   0,   0,},
-            {4226, 3149, 177,  21,   2,   0,   0,   0,   0,   0,   0, 7157,},
-            { 375, 621, 189,  51,  11,   4,   1,   0,   0,   0,   0, 198,},
-        },
-        {
-            /* Coeff Band ( 6 ) */
-            {61658, 37743, 1203,  94,  10,   3,   0,   0,   0,   0,   0,   0,},
-            {15514, 11563, 903, 111,  14,   5,   0,   0,   0,   0,   0, 25195,},
-            { 929, 1077, 291,  78,  14,   7,   1,   0,   0,   0,   0, 507,},
-        },
-        {
-            /* Coeff Band ( 7 ) */
-            {   0, 990,  15,   3,   0,   0,   0,   0,   0,   0,   0,   0,},
-            {   0, 412,  13,   0,   0,   0,   0,   0,   0,   0,   0, 1641,},
-            {   0,  18,   7,   1,   0,   0,   0,   0,   0,   0,   0,  30,},
-        },
-    },
-    {
-        /* Block Type ( 2 ) */
-        {
-            /* Coeff Band ( 0 ) */
-            { 953, 24519, 628, 120,  28,  12,   4,   0,   0,   0,   0, 2248798,},
-            {1525, 25654, 2647, 617, 239, 143,  42,   5,   0,   0,   0, 66837,},
-            {1180, 11011, 3001, 1237, 532, 448, 239,  54,   5,   0,   0, 7122,},
-        },
-        {
-            /* Coeff Band ( 1 ) */
-            {1356, 2220,  67,  10,   4,   1,   0,   0,   0,   0,   0,   0,},
-            {1450, 2544, 102,  18,   4,   3,   0,   0,   0,   0,   0, 57063,},
-            {1182, 2110, 470, 130,  41,  21,   0,   0,   0,   0,   0, 6047,},
-        },
-        {
-            /* Coeff Band ( 2 ) */
-            { 370, 3378, 200,  30,   5,   4,   1,   0,   0,   0,   0,   0,},
-            { 293, 1006, 131,  29,  11,   0,   0,   0,   0,   0,   0, 5404,},
-            { 114, 387,  98,  23,   4,   8,   1,   0,   0,   0,   0, 236,},
-        },
-        {
-            /* Coeff Band ( 3 ) */
-            { 579, 194,   4,   0,   0,   0,   0,   0,   0,   0,   0,   0,},
-            { 395, 213,   5,   1,   0,   0,   0,   0,   0,   0,   0, 4157,},
-            { 119, 122,   4,   0,   0,   0,   0,   0,   0,   0,   0, 300,},
-        },
-        {
-            /* Coeff Band ( 4 ) */
-            {  38, 557,  19,   0,   0,   0,   0,   0,   0,   0,   0,   0,},
-            {  21, 114,  12,   1,   0,   0,   0,   0,   0,   0,   0, 427,},
-            {   0,   5,   0,   0,   0,   0,   0,   0,   0,   0,   0,   7,},
-        },
-        {
-            /* Coeff Band ( 5 ) */
-            {  52,   7,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,},
-            {  18,   6,   0,   0,   0,   0,   0,   0,   0,   0,   0, 652,},
-            {   1,   1,   0,   0,   0,   0,   0,   0,   0,   0,   0,  30,},
-        },
-        {
-            /* Coeff Band ( 6 ) */
-            { 640, 569,  10,   0,   0,   0,   0,   0,   0,   0,   0,   0,},
-            {  25,  77,   2,   0,   0,   0,   0,   0,   0,   0,   0, 517,},
-            {   4,   7,   0,   0,   0,   0,   0,   0,   0,   0,   0,   3,},
-        },
-        {
-            /* Coeff Band ( 7 ) */
-            {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,},
-            {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,},
-            {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,},
-        },
-    },
-    {
-        /* Block Type ( 3 ) */
-        {
-            /* Coeff Band ( 0 ) */
-            {2506, 20161, 2707, 767, 261, 178, 107,  30,  14,   3,   0, 100694,},
-            {8806, 36478, 8817, 3268, 1280, 850, 401, 114,  42,   0,   0, 58572,},
-            {11003, 27214, 11798, 5716, 2482, 2072, 1048, 175,  32,   0,   0, 19284,},
-        },
-        {
-            /* Coeff Band ( 1 ) */
-            {9738, 11313, 959, 205,  70,  18,  11,   1,   0,   0,   0,   0,},
-            {12628, 15085, 1507, 273,  52,  19,   9,   0,   0,   0,   0, 54280,},
-            {10701, 15846, 5561, 1926, 813, 570, 249,  36,   0,   0,   0, 6460,},
-        },
-        {
-            /* Coeff Band ( 2 ) */
-            {6781, 22539, 2784, 634, 182, 123,  20,   4,   0,   0,   0,   0,},
-            {6263, 11544, 2649, 790, 259, 168,  27,   5,   0,   0,   0, 20539,},
-            {3109, 4075, 2031, 896, 457, 386, 158,  29,   0,   0,   0, 1138,},
-        },
-        {
-            /* Coeff Band ( 3 ) */
-            {11515, 4079, 465,  73,   5,  14,   2,   0,   0,   0,   0,   0,},
-            {9361, 5834, 650,  96,  24,   8,   4,   0,   0,   0,   0, 22181,},
-            {4343, 3974, 1360, 415, 132,  96,  14,   1,   0,   0,   0, 1267,},
-        },
-        {
-            /* Coeff Band ( 4 ) */
-            {4787, 9297, 823, 168,  44,  12,   4,   0,   0,   0,   0,   0,},
-            {3619, 4472, 719, 198,  60,  31,   3,   0,   0,   0,   0, 8401,},
-            {1157, 1175, 483, 182,  88,  31,   8,   0,   0,   0,   0, 268,},
-        },
-        {
-            /* Coeff Band ( 5 ) */
-            {8299, 1226,  32,   5,   1,   0,   0,   0,   0,   0,   0,   0,},
-            {3502, 1568,  57,   4,   1,   1,   0,   0,   0,   0,   0, 9811,},
-            {1055, 1070, 166,  29,   6,   1,   0,   0,   0,   0,   0, 527,},
-        },
-        {
-            /* Coeff Band ( 6 ) */
-            {27414, 27927, 1989, 347,  69,  26,   0,   0,   0,   0,   0,   0,},
-            {5876, 10074, 1574, 341,  91,  24,   4,   0,   0,   0,   0, 21954,},
-            {1571, 2171, 778, 324, 124,  65,  16,   0,   0,   0,   0, 979,},
-        },
-        {
-            /* Coeff Band ( 7 ) */
-            {   0,  29,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,},
-            {   0,  23,   0,   0,   0,   0,   0,   0,   0,   0,   0, 459,},
-            {   0,   1,   0,   0,   0,   0,   0,   0,   0,   0,   0,  13,},
-        },
-    },
-};
+#endif //__DEFAULTCOEFCOUNTS_H
--- a/vp8/common/entropy.c
+++ b/vp8/common/entropy.c
@@ -26,8 +26,32 @@ typedef vp8_prob Prob;

 #include "coefupdateprobs.h"

-DECLARE_ALIGNED(16, cuchar, vp8_coef_bands[16]) = { 0, 1, 2, 3, 6, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7};
-DECLARE_ALIGNED(16, cuchar, vp8_prev_token_class[MAX_ENTROPY_TOKENS]) = { 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0};
+DECLARE_ALIGNED(16, const unsigned char, vp8_norm[256]) =
+{
+    0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
+    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+};
+
+DECLARE_ALIGNED(16, cuchar, vp8_coef_bands[16]) =
+{ 0, 1, 2, 3, 6, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7};
+
+DECLARE_ALIGNED(16, cuchar, vp8_prev_token_class[MAX_ENTROPY_TOKENS]) =
+{ 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0};
+
 DECLARE_ALIGNED(16, const int, vp8_default_zig_zag1d[16]) =
 {
    0,  1,  4,  8,
@@ -36,6 +60,14 @@ DECLARE_ALIGNED(16, const int, vp8_default_zig_zag1d[16]) =
    7, 11, 14, 15,
 };

+DECLARE_ALIGNED(16, const short, vp8_default_inv_zig_zag[16]) =
+{
+    1,  2,  6,  7,
+    3,  5,  8, 13,
+    4,  9, 12, 14,
+   10, 11, 15, 16
+};
+
 DECLARE_ALIGNED(16, short, vp8_default_zig_zag_mask[16]);

 const int vp8_mb_feature_data_bits[MB_LVL_MAX] = {7, 6};
@@ -57,7 +89,7 @@ const vp8_tree_index vp8_coef_tree[ 22] =     /* corresponding _CONTEXT_NODEs */
    -DCT_VAL_CATEGORY5, -DCT_VAL_CATEGORY6   /* 10 = CAT_FIVE */
 };

-struct vp8_token_struct vp8_coef_encodings[vp8_coef_tokens];
+struct vp8_token_struct vp8_coef_encodings[MAX_ENTROPY_TOKENS];

 /* Trees for extra bits.  Probabilities are constant and
   do not depend on previously encoded bits */
@@ -106,23 +138,20 @@ static void init_bit_trees()
    init_bit_tree(cat6, 11);
 }

-
-static vp8bc_index_t bcc1[1], bcc2[2], bcc3[3], bcc4[4], bcc5[5], bcc6[11];
-
 vp8_extra_bit_struct vp8_extra_bits[12] =
 {
-    { 0, 0, 0, 0, 0},
-    { 0, 0, 0, 0, 1},
-    { 0, 0, 0, 0, 2},
-    { 0, 0, 0, 0, 3},
-    { 0, 0, 0, 0, 4},
-    { cat1, Pcat1, bcc1, 1, 5},
-    { cat2, Pcat2, bcc2, 2, 7},
-    { cat3, Pcat3, bcc3, 3, 11},
-    { cat4, Pcat4, bcc4, 4, 19},
-    { cat5, Pcat5, bcc5, 5, 35},
-    { cat6, Pcat6, bcc6, 11, 67},
-    { 0, 0, 0, 0, 0}
+    { 0, 0, 0, 0},
+    { 0, 0, 0, 1},
+    { 0, 0, 0, 2},
+    { 0, 0, 0, 3},
+    { 0, 0, 0, 4},
+    { cat1, Pcat1, 1, 5},
+    { cat2, Pcat2, 2, 7},
+    { cat3, Pcat3, 3, 11},
+    { cat4, Pcat4, 4, 19},
+    { cat5, Pcat5, 5, 35},
+    { cat6, Pcat6, 11, 67},
+    { 0, 0, 0, 0}
 };
 #include "defaultcoefcounts.h"

@@ -140,10 +169,12 @@ void vp8_default_coef_probs(VP8_COMMON *pc)

            do
            {
-                unsigned int branch_ct [vp8_coef_tokens-1] [2];
+                unsigned int branch_ct [ENTROPY_NODES] [2];
                vp8_tree_probs_from_distribution(
-                    vp8_coef_tokens, vp8_coef_encodings, vp8_coef_tree,
-                    pc->fc.coef_probs [h][i][k], branch_ct, default_coef_counts [h][i][k],
+                    MAX_ENTROPY_TOKENS, vp8_coef_encodings, vp8_coef_tree,
+                    pc->fc.coef_probs[h][i][k],
+                    branch_ct,
+                    vp8_default_coef_counts[h][i][k],
                    256, 1);

            }
--- a/vp8/common/entropy.h
+++ b/vp8/common/entropy.h
@@ -24,25 +24,23 @@
 #define FOUR_TOKEN              4       /* 4         Extra Bits 0+1 */
 #define DCT_VAL_CATEGORY1       5       /* 5-6       Extra Bits 1+1 */
 #define DCT_VAL_CATEGORY2       6       /* 7-10      Extra Bits 2+1 */
-#define DCT_VAL_CATEGORY3       7       /* 11-26     Extra Bits 4+1 */
-#define DCT_VAL_CATEGORY4       8       /* 11-26     Extra Bits 5+1 */
-#define DCT_VAL_CATEGORY5       9       /* 27-58     Extra Bits 5+1 */
-#define DCT_VAL_CATEGORY6       10      /* 59+       Extra Bits 11+1 */
+#define DCT_VAL_CATEGORY3       7       /* 11-18     Extra Bits 3+1 */
+#define DCT_VAL_CATEGORY4       8       /* 19-34     Extra Bits 4+1 */
+#define DCT_VAL_CATEGORY5       9       /* 35-66     Extra Bits 5+1 */
+#define DCT_VAL_CATEGORY6       10      /* 67+       Extra Bits 11+1 */
 #define DCT_EOB_TOKEN           11      /* EOB       Extra Bits 0+0 */

-#define vp8_coef_tokens 12
-#define MAX_ENTROPY_TOKENS vp8_coef_tokens
+#define MAX_ENTROPY_TOKENS 12
 #define ENTROPY_NODES 11

 extern const vp8_tree_index vp8_coef_tree[];

-extern struct vp8_token_struct vp8_coef_encodings[vp8_coef_tokens];
+extern struct vp8_token_struct vp8_coef_encodings[MAX_ENTROPY_TOKENS];

 typedef struct
 {
    vp8_tree_p tree;
    const vp8_prob *prob;
-    vp8bc_index_t *prob_bc;
    int Len;
    int base_val;
 } vp8_extra_bit_struct;
@@ -86,15 +84,16 @@ extern DECLARE_ALIGNED(16, const unsigned char, vp8_coef_bands[16]);
 /*# define DC_TOKEN_CONTEXTS        3*/ /* 00, 0!0, !0!0 */
 #   define PREV_COEF_CONTEXTS       3

-extern DECLARE_ALIGNED(16, const unsigned char, vp8_prev_token_class[vp8_coef_tokens]);
+extern DECLARE_ALIGNED(16, const unsigned char, vp8_prev_token_class[MAX_ENTROPY_TOKENS]);

-extern const vp8_prob vp8_coef_update_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [vp8_coef_tokens-1];
+extern const vp8_prob vp8_coef_update_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];


 struct VP8Common;
 void vp8_default_coef_probs(struct VP8Common *);

 extern DECLARE_ALIGNED(16, const int, vp8_default_zig_zag1d[16]);
+extern DECLARE_ALIGNED(16, const short, vp8_default_inv_zig_zag[16]);
 extern short vp8_default_zig_zag_mask[16];
 extern const int vp8_mb_feature_data_bits[MB_LVL_MAX];

--- a/vp8/common/entropymode.c
+++ b/vp8/common/entropymode.c
@@ -33,11 +33,11 @@ typedef enum
    SUBMVREF_LEFT_ABOVE_ZED
 } sumvfref_t;

-int vp8_mv_cont(const MV *l, const MV *a)
+int vp8_mv_cont(const int_mv *l, const int_mv *a)
 {
-    int lez = (l->row == 0 && l->col == 0);
-    int aez = (a->row == 0 && a->col == 0);
-    int lea = (l->row == a->row && l->col == a->col);
+    int lez = (l->as_int == 0);
+    int aez = (a->as_int == 0);
+    int lea = (l->as_int == a->as_int);

    if (lea && lez)
        return SUBMVREF_LEFT_ABOVE_ZED;
--- a/vp8/common/entropymode.h
+++ b/vp8/common/entropymode.h
@@ -25,7 +25,7 @@ extern const int vp8_mbsplit_count [VP8_NUMMBSPLITS];    /* # of subsets */

 extern const vp8_prob vp8_mbsplit_probs [VP8_NUMMBSPLITS-1];

-extern int vp8_mv_cont(const MV *l, const MV *a);
+extern int vp8_mv_cont(const int_mv *l, const int_mv *a);
 #define SUBMVREF_COUNT 5
 extern const vp8_prob vp8_sub_mv_ref_prob2 [SUBMVREF_COUNT][VP8_SUBMVREFS-1];

--- a/vp8/common/entropymv.h
+++ b/vp8/common/entropymv.h
@@ -18,6 +18,8 @@ enum
 {
    mv_max  = 1023,              /* max absolute value of a MV component */
    MVvals = (2 * mv_max) + 1,   /* # possible values "" */
+    mvfp_max  = 255,              /* max absolute value of a full pixel MV component */
+    MVfpvals = (2 * mvfp_max) +1, /* # possible full pixel MV values */

    mvlong_width = 10,       /* Large MVs have 9 bit magnitudes */
    mvnum_short = 8,         /* magnitudes 0 through 7 */
--- a/vp8/common/extend.c
+++ b/vp8/common/extend.c
@@ -13,10 +13,12 @@
 #include "vpx_mem/vpx_mem.h"


-static void extend_plane_borders
+static void copy_and_extend_plane
 (
    unsigned char *s, /* source */
-    int sp,           /* pitch */
+    int sp,           /* source pitch */
+    unsigned char *d, /* destination */
+    int dp,           /* destination pitch */
    int h,            /* height */
    int w,            /* width */
    int et,           /* extend top border */
@@ -25,7 +27,6 @@ static void extend_plane_borders
    int er            /* extend right border */
 )
 {
-
    int i;
    unsigned char *src_ptr1, *src_ptr2;
    unsigned char *dest_ptr1, *dest_ptr2;
@@ -34,68 +35,73 @@ static void extend_plane_borders
    /* copy the left and right most columns out */
    src_ptr1 = s;
    src_ptr2 = s + w - 1;
-    dest_ptr1 = s - el;
-    dest_ptr2 = s + w;
+    dest_ptr1 = d - el;
+    dest_ptr2 = d + w;

-    for (i = 0; i < h - 0 + 1; i++)
+    for (i = 0; i < h; i++)
    {
-        /* Some linkers will complain if we call vpx_memset with el set to a
-         * constant 0.
-         */
-        if (el)
        vpx_memset(dest_ptr1, src_ptr1[0], el);
+        vpx_memcpy(dest_ptr1 + el, src_ptr1, w);
        vpx_memset(dest_ptr2, src_ptr2[0], er);
        src_ptr1  += sp;
        src_ptr2  += sp;
-        dest_ptr1 += sp;
-        dest_ptr2 += sp;
+        dest_ptr1 += dp;
+        dest_ptr2 += dp;
    }

-    /* Now copy the top and bottom source lines into each line of the respective borders */
-    src_ptr1 = s - el;
-    src_ptr2 = s + sp * (h - 1) - el;
-    dest_ptr1 = s + sp * (-et) - el;
-    dest_ptr2 = s + sp * (h) - el;
-    linesize = el + er + w + 1;
+    /* Now copy the top and bottom lines into each line of the respective
+     * borders
+     */
+    src_ptr1 = d - el;
+    src_ptr2 = d + dp * (h - 1) - el;
+    dest_ptr1 = d + dp * (-et) - el;
+    dest_ptr2 = d + dp * (h) - el;
+    linesize = el + er + w;

-    for (i = 0; i < (int)et; i++)
+    for (i = 0; i < et; i++)
    {
        vpx_memcpy(dest_ptr1, src_ptr1, linesize);
-        dest_ptr1 += sp;
+        dest_ptr1 += dp;
    }

-    for (i = 0; i < (int)eb; i++)
+    for (i = 0; i < eb; i++)
    {
        vpx_memcpy(dest_ptr2, src_ptr2, linesize);
-        dest_ptr2 += sp;
+        dest_ptr2 += dp;
    }
 }


-void vp8_extend_to_multiple_of16(YV12_BUFFER_CONFIG *ybf, int width, int height)
+void vp8_copy_and_extend_frame(YV12_BUFFER_CONFIG *src,
+                               YV12_BUFFER_CONFIG *dst)
 {
-    int er = 0xf & (16 - (width & 0xf));
-    int eb = 0xf & (16 - (height & 0xf));
+    int et = dst->border;
+    int el = dst->border;
+    int eb = dst->border + dst->y_height - src->y_height;
+    int er = dst->border + dst->y_width - src->y_width;

-    /* check for non multiples of 16 */
-    if (er != 0 || eb != 0)
-    {
-        extend_plane_borders(ybf->y_buffer, ybf->y_stride, height, width, 0, 0, eb, er);
+    copy_and_extend_plane(src->y_buffer, src->y_stride,
+                          dst->y_buffer, dst->y_stride,
+                          src->y_height, src->y_width,
+                          et, el, eb, er);

-        /* adjust for uv */
-        height = (height + 1) >> 1;
-        width  = (width  + 1) >> 1;
-        er = 0x7 & (8 - (width  & 0x7));
-        eb = 0x7 & (8 - (height & 0x7));
+    et = dst->border >> 1;
+    el = dst->border >> 1;
+    eb = (dst->border >> 1) + dst->uv_height - src->uv_height;
+    er = (dst->border >> 1) + dst->uv_width - src->uv_width;

-        if (er || eb)
-        {
-            extend_plane_borders(ybf->u_buffer, ybf->uv_stride, height, width, 0, 0, eb, er);
-            extend_plane_borders(ybf->v_buffer, ybf->uv_stride, height, width, 0, 0, eb, er);
-        }
-    }
+    copy_and_extend_plane(src->u_buffer, src->uv_stride,
+                          dst->u_buffer, dst->uv_stride,
+                          src->uv_height, src->uv_width,
+                          et, el, eb, er);
+
+    copy_and_extend_plane(src->v_buffer, src->uv_stride,
+                          dst->v_buffer, dst->uv_stride,
+                          src->uv_height, src->uv_width,
+                          et, el, eb, er);
 }

+
 /* note the extension is only for the last row, for intra prediction purpose */
 void vp8_extend_mb_row(YV12_BUFFER_CONFIG *ybf, unsigned char *YPtr, unsigned char *UPtr, unsigned char *VPtr)
 {
--- a/vp8/common/extend.h
+++ b/vp8/common/extend.h
@@ -14,8 +14,8 @@

 #include "vpx_scale/yv12config.h"

-void Extend(YV12_BUFFER_CONFIG *ybf);
 void vp8_extend_mb_row(YV12_BUFFER_CONFIG *ybf, unsigned char *YPtr, unsigned char *UPtr, unsigned char *VPtr);
-void vp8_extend_to_multiple_of16(YV12_BUFFER_CONFIG *ybf, int width, int height);
+void vp8_copy_and_extend_frame(YV12_BUFFER_CONFIG *src,
+                               YV12_BUFFER_CONFIG *dst);

 #endif
--- a/vp8/common/filter_c.c
+++ b/vp8/common/filter_c.c
@@ -10,13 +10,10 @@


 #include <stdlib.h>
+#include "filter.h"
+#include "vpx_ports/mem.h"

-#define BLOCK_HEIGHT_WIDTH 4
-#define VP8_FILTER_WEIGHT 128
-#define VP8_FILTER_SHIFT  7
-
-
-static const int bilinear_filters[8][2] =
+DECLARE_ALIGNED(16, const short, vp8_bilinear_filters[8][2]) =
 {
    { 128,   0 },
    { 112,  16 },
@@ -28,8 +25,7 @@ static const int bilinear_filters[8][2] =
    {  16, 112 }
 };

-
-static const short sub_pel_filters[8][6] =
+DECLARE_ALIGNED(16, const short, vp8_sub_pel_filters[8][6]) =
 {

    { 0,  0,  128,    0,   0,  0 },         /* note that 1/8 pel positions are just as per alpha -0.5 bicubic */
@@ -40,12 +36,9 @@ static const short sub_pel_filters[8][6] =
    { 0, -6,   50,   93,  -9,  0 },
    { 1, -8,   36,  108, -11,  2 },         /* New 1/4 pel 6 tap filter */
    { 0, -1,   12,  123,  -6,  0 },
-
-
-
 };

-void vp8_filter_block2d_first_pass
+static void filter_block2d_first_pass
 (
    unsigned char *src_ptr,
    int *output_ptr,
@@ -89,7 +82,7 @@ void vp8_filter_block2d_first_pass
    }
 }

-void vp8_filter_block2d_second_pass
+static void filter_block2d_second_pass
 (
    int *src_ptr,
    unsigned char *output_ptr,
@@ -136,7 +129,7 @@ void vp8_filter_block2d_second_pass
 }


-void vp8_filter_block2d
+static void filter_block2d
 (
    unsigned char  *src_ptr,
    unsigned char  *output_ptr,
@@ -146,42 +139,16 @@ void vp8_filter_block2d
    const short  *VFilter
 )
 {
-    int FData[9*4]; /* Temp data bufffer used in filtering */
+    int FData[9*4]; /* Temp data buffer used in filtering */

    /* First filter 1-D horizontally... */
-    vp8_filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 9, 4, HFilter);
+    filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 9, 4, HFilter);

    /* then filter verticaly... */
-    vp8_filter_block2d_second_pass(FData + 8, output_ptr, output_pitch, 4, 4, 4, 4, VFilter);
+    filter_block2d_second_pass(FData + 8, output_ptr, output_pitch, 4, 4, 4, 4, VFilter);
 }


-void vp8_block_variation_c
-(
-    unsigned char  *src_ptr,
-    int   src_pixels_per_line,
-    int *HVar,
-    int *VVar
-)
-{
-    int i, j;
-    unsigned char *Ptr = src_ptr;
-
-    for (i = 0; i < 4; i++)
-    {
-        for (j = 0; j < 4; j++)
-        {
-            *HVar += abs((int)Ptr[j] - (int)Ptr[j+1]);
-            *VVar += abs((int)Ptr[j] - (int)Ptr[j+src_pixels_per_line]);
-        }
-
-        Ptr += src_pixels_per_line;
-    }
-}
-
-
-
-
 void vp8_sixtap_predict_c
 (
    unsigned char  *src_ptr,
@@ -195,10 +162,10 @@ void vp8_sixtap_predict_c
    const short  *HFilter;
    const short  *VFilter;

-    HFilter = sub_pel_filters[xoffset];   /* 6 tap */
-    VFilter = sub_pel_filters[yoffset];   /* 6 tap */
+    HFilter = vp8_sub_pel_filters[xoffset];   /* 6 tap */
+    VFilter = vp8_sub_pel_filters[yoffset];   /* 6 tap */

-    vp8_filter_block2d(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter);
+    filter_block2d(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter);
 }
 void vp8_sixtap_predict8x8_c
 (
@@ -212,17 +179,17 @@ void vp8_sixtap_predict8x8_c
 {
    const short  *HFilter;
    const short  *VFilter;
-    int FData[13*16];   /* Temp data bufffer used in filtering */
+    int FData[13*16];   /* Temp data buffer used in filtering */

-    HFilter = sub_pel_filters[xoffset];   /* 6 tap */
-    VFilter = sub_pel_filters[yoffset];   /* 6 tap */
+    HFilter = vp8_sub_pel_filters[xoffset];   /* 6 tap */
+    VFilter = vp8_sub_pel_filters[yoffset];   /* 6 tap */

    /* First filter 1-D horizontally... */
-    vp8_filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 13, 8, HFilter);
+    filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 13, 8, HFilter);


    /* then filter verticaly... */
-    vp8_filter_block2d_second_pass(FData + 16, dst_ptr, dst_pitch, 8, 8, 8, 8, VFilter);
+    filter_block2d_second_pass(FData + 16, dst_ptr, dst_pitch, 8, 8, 8, 8, VFilter);

 }

@@ -238,17 +205,17 @@ void vp8_sixtap_predict8x4_c
 {
    const short  *HFilter;
    const short  *VFilter;
-    int FData[13*16];   /* Temp data bufffer used in filtering */
+    int FData[13*16];   /* Temp data buffer used in filtering */

-    HFilter = sub_pel_filters[xoffset];   /* 6 tap */
-    VFilter = sub_pel_filters[yoffset];   /* 6 tap */
+    HFilter = vp8_sub_pel_filters[xoffset];   /* 6 tap */
+    VFilter = vp8_sub_pel_filters[yoffset];   /* 6 tap */

    /* First filter 1-D horizontally... */
-    vp8_filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 9, 8, HFilter);
+    filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 9, 8, HFilter);


    /* then filter verticaly... */
-    vp8_filter_block2d_second_pass(FData + 16, dst_ptr, dst_pitch, 8, 8, 4, 8, VFilter);
+    filter_block2d_second_pass(FData + 16, dst_ptr, dst_pitch, 8, 8, 4, 8, VFilter);

 }

@@ -264,17 +231,17 @@ void vp8_sixtap_predict16x16_c
 {
    const short  *HFilter;
    const short  *VFilter;
-    int FData[21*24];   /* Temp data bufffer used in filtering */
+    int FData[21*24];   /* Temp data buffer used in filtering */


-    HFilter = sub_pel_filters[xoffset];   /* 6 tap */
-    VFilter = sub_pel_filters[yoffset];   /* 6 tap */
+    HFilter = vp8_sub_pel_filters[xoffset];   /* 6 tap */
+    VFilter = vp8_sub_pel_filters[yoffset];   /* 6 tap */

    /* First filter 1-D horizontally... */
-    vp8_filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 21, 16, HFilter);
+    filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 21, 16, HFilter);

    /* then filter verticaly... */
-    vp8_filter_block2d_second_pass(FData + 32, dst_ptr, dst_pitch, 16, 16, 16, 16, VFilter);
+    filter_block2d_second_pass(FData + 32, dst_ptr, dst_pitch, 16, 16, 16, 16, VFilter);

 }

@@ -284,56 +251,49 @@ void vp8_sixtap_predict16x16_c
 *  ROUTINE       : filter_block2d_bil_first_pass
 *
 *  INPUTS        : UINT8  *src_ptr    : Pointer to source block.
- *                  UINT32 src_pixels_per_line : Stride of input block.
- *                  UINT32 pixel_step        : Offset between filter input samples (see notes).
- *                  UINT32 output_height     : Input block height.
- *                  UINT32 output_width      : Input block width.
+ *                  UINT32  src_stride : Stride of source block.
+ *                  UINT32  height     : Block height.
+ *                  UINT32  width      : Block width.
 *                  INT32  *vp8_filter : Array of 2 bi-linear filter taps.
 *
- *  OUTPUTS       : INT32 *output_ptr        : Pointer to filtered block.
+ *  OUTPUTS       : INT32  *dst_ptr    : Pointer to filtered block.
 *
 *  RETURNS       : void
 *
- *  FUNCTION      : Applies a 1-D 2-tap bi-linear filter to the source block in
- *                  either horizontal or vertical direction to produce the
- *                  filtered output block. Used to implement first-pass
- *                  of 2-D separable filter.
+ *  FUNCTION      : Applies a 1-D 2-tap bi-linear filter to the source block
+ *                  in the horizontal direction to produce the filtered output
+ *                  block. Used to implement first-pass of 2-D separable filter.
 *
 *  SPECIAL NOTES : Produces INT32 output to retain precision for next pass.
 *                  Two filter taps should sum to VP8_FILTER_WEIGHT.
- *                  pixel_step defines whether the filter is applied
- *                  horizontally (pixel_step=1) or vertically (pixel_step=stride).
- *                  It defines the offset required to move from one input
- *                  to the next.
 *
 ****************************************************************************/
-void vp8_filter_block2d_bil_first_pass
+static void filter_block2d_bil_first_pass
 (
    unsigned char  *src_ptr,
-    unsigned short *output_ptr,
-    unsigned int src_pixels_per_line,
-    int pixel_step,
-    unsigned int output_height,
-    unsigned int output_width,
-    const int *vp8_filter
+    unsigned short *dst_ptr,
+    unsigned int    src_stride,
+    unsigned int    height,
+    unsigned int    width,
+    const short    *vp8_filter
 )
 {
    unsigned int i, j;

-    for (i = 0; i < output_height; i++)
+    for (i = 0; i < height; i++)
    {
-        for (j = 0; j < output_width; j++)
+        for (j = 0; j < width; j++)
        {
            /* Apply bilinear filter */
-            output_ptr[j] = (((int)src_ptr[0]          * vp8_filter[0]) +
-                             ((int)src_ptr[pixel_step] * vp8_filter[1]) +
+            dst_ptr[j] = (((int)src_ptr[0] * vp8_filter[0]) +
+                          ((int)src_ptr[1] * vp8_filter[1]) +
                          (VP8_FILTER_WEIGHT / 2)) >> VP8_FILTER_SHIFT;
            src_ptr++;
        }

        /* Next row... */
-        src_ptr    += src_pixels_per_line - output_width;
-        output_ptr += output_width;
+        src_ptr += src_stride - width;
+        dst_ptr += width;
    }
 }

@@ -342,59 +302,50 @@ void vp8_filter_block2d_bil_first_pass
 *  ROUTINE       : filter_block2d_bil_second_pass
 *
 *  INPUTS        : INT32  *src_ptr    : Pointer to source block.
- *                  UINT32 src_pixels_per_line : Stride of input block.
- *                  UINT32 pixel_step        : Offset between filter input samples (see notes).
- *                  UINT32 output_height     : Input block height.
- *                  UINT32 output_width      : Input block width.
+ *                  UINT32  dst_pitch  : Destination block pitch.
+ *                  UINT32  height     : Block height.
+ *                  UINT32  width      : Block width.
 *                  INT32  *vp8_filter : Array of 2 bi-linear filter taps.
 *
- *  OUTPUTS       : UINT16 *output_ptr       : Pointer to filtered block.
+ *  OUTPUTS       : UINT16 *dst_ptr    : Pointer to filtered block.
 *
 *  RETURNS       : void
 *
- *  FUNCTION      : Applies a 1-D 2-tap bi-linear filter to the source block in
- *                  either horizontal or vertical direction to produce the
- *                  filtered output block. Used to implement second-pass
- *                  of 2-D separable filter.
+ *  FUNCTION      : Applies a 1-D 2-tap bi-linear filter to the source block
+ *                  in the vertical direction to produce the filtered output
+ *                  block. Used to implement second-pass of 2-D separable filter.
 *
 *  SPECIAL NOTES : Requires 32-bit input as produced by filter_block2d_bil_first_pass.
 *                  Two filter taps should sum to VP8_FILTER_WEIGHT.
- *                  pixel_step defines whether the filter is applied
- *                  horizontally (pixel_step=1) or vertically (pixel_step=stride).
- *                  It defines the offset required to move from one input
- *                  to the next.
 *
 ****************************************************************************/
-void vp8_filter_block2d_bil_second_pass
+static void filter_block2d_bil_second_pass
 (
    unsigned short *src_ptr,
-    unsigned char  *output_ptr,
-    int output_pitch,
-    unsigned int  src_pixels_per_line,
-    unsigned int  pixel_step,
-    unsigned int  output_height,
-    unsigned int  output_width,
-    const int *vp8_filter
+    unsigned char  *dst_ptr,
+    int             dst_pitch,
+    unsigned int    height,
+    unsigned int    width,
+    const short    *vp8_filter
 )
 {
    unsigned int  i, j;
    int  Temp;

-    for (i = 0; i < output_height; i++)
+    for (i = 0; i < height; i++)
    {
-        for (j = 0; j < output_width; j++)
+        for (j = 0; j < width; j++)
        {
            /* Apply filter */
            Temp = ((int)src_ptr[0]     * vp8_filter[0]) +
-                   ((int)src_ptr[pixel_step] * vp8_filter[1]) +
+                   ((int)src_ptr[width] * vp8_filter[1]) +
                   (VP8_FILTER_WEIGHT / 2);
-            output_ptr[j] = (unsigned int)(Temp >> VP8_FILTER_SHIFT);
+            dst_ptr[j] = (unsigned int)(Temp >> VP8_FILTER_SHIFT);
            src_ptr++;
        }

        /* Next row... */
-        src_ptr    += src_pixels_per_line - output_width;
-        output_ptr += output_pitch;
+        dst_ptr += dst_pitch;
    }
 }

@@ -404,11 +355,14 @@ void vp8_filter_block2d_bil_second_pass
 *  ROUTINE       : filter_block2d_bil
 *
 *  INPUTS        : UINT8  *src_ptr          : Pointer to source block.
- *                  UINT32 src_pixels_per_line : Stride of input block.
+ *                  UINT32  src_pitch        : Stride of source block.
+ *                  UINT32  dst_pitch        : Stride of destination block.
 *                  INT32  *HFilter          : Array of 2 horizontal filter taps.
 *                  INT32  *VFilter          : Array of 2 vertical filter taps.
+ *                  INT32  Width             : Block width
+ *                  INT32  Height            : Block height
 *
- *  OUTPUTS       : UINT16 *output_ptr       : Pointer to filtered block.
+ *  OUTPUTS       : UINT16 *dst_ptr       : Pointer to filtered block.
 *
 *  RETURNS       : void
 *
@@ -419,26 +373,26 @@ void vp8_filter_block2d_bil_second_pass
 *  SPECIAL NOTES : The largest block size can be handled here is 16x16
 *
 ****************************************************************************/
-void vp8_filter_block2d_bil
+static void filter_block2d_bil
 (
    unsigned char *src_ptr,
-    unsigned char *output_ptr,
-    unsigned int   src_pixels_per_line,
+    unsigned char *dst_ptr,
+    unsigned int   src_pitch,
    unsigned int   dst_pitch,
-    const int      *HFilter,
-    const int      *VFilter,
+    const short   *HFilter,
+    const short   *VFilter,
    int            Width,
    int            Height
 )
 {

-    unsigned short FData[17*16];    /* Temp data bufffer used in filtering */
+    unsigned short FData[17*16];    /* Temp data buffer used in filtering */

    /* First filter 1-D horizontally... */
-    vp8_filter_block2d_bil_first_pass(src_ptr, FData, src_pixels_per_line, 1, Height + 1, Width, HFilter);
+    filter_block2d_bil_first_pass(src_ptr, FData, src_pitch, Height + 1, Width, HFilter);

    /* then 1-D vertically... */
-    vp8_filter_block2d_bil_second_pass(FData, output_ptr, dst_pitch, Width, Width, Height, Width, VFilter);
+    filter_block2d_bil_second_pass(FData, dst_ptr, dst_pitch, Height, Width, VFilter);
 }


@@ -452,11 +406,11 @@ void vp8_bilinear_predict4x4_c
    int dst_pitch
 )
 {
-    const int  *HFilter;
-    const int  *VFilter;
+    const short *HFilter;
+    const short *VFilter;

-    HFilter = bilinear_filters[xoffset];
-    VFilter = bilinear_filters[yoffset];
+    HFilter = vp8_bilinear_filters[xoffset];
+    VFilter = vp8_bilinear_filters[yoffset];
 #if 0
    {
        int i;
@@ -464,19 +418,19 @@ void vp8_bilinear_predict4x4_c
        unsigned char temp2[16];

        bilinear_predict4x4_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, temp1, 4);
-        vp8_filter_block2d_bil(src_ptr, temp2, src_pixels_per_line, 4, HFilter, VFilter, 4, 4);
+        filter_block2d_bil(src_ptr, temp2, src_pixels_per_line, 4, HFilter, VFilter, 4, 4);

        for (i = 0; i < 16; i++)
        {
            if (temp1[i] != temp2[i])
            {
                bilinear_predict4x4_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, temp1, 4);
-                vp8_filter_block2d_bil(src_ptr, temp2, src_pixels_per_line, 4, HFilter, VFilter, 4, 4);
+                filter_block2d_bil(src_ptr, temp2, src_pixels_per_line, 4, HFilter, VFilter, 4, 4);
            }
        }
    }
 #endif
-    vp8_filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 4, 4);
+    filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 4, 4);

 }

@@ -490,13 +444,13 @@ void vp8_bilinear_predict8x8_c
    int  dst_pitch
 )
 {
-    const int  *HFilter;
-    const int  *VFilter;
+    const short *HFilter;
+    const short *VFilter;

-    HFilter = bilinear_filters[xoffset];
-    VFilter = bilinear_filters[yoffset];
+    HFilter = vp8_bilinear_filters[xoffset];
+    VFilter = vp8_bilinear_filters[yoffset];

-    vp8_filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 8);
+    filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 8);

 }

@@ -510,13 +464,13 @@ void vp8_bilinear_predict8x4_c
    int  dst_pitch
 )
 {
-    const int  *HFilter;
-    const int  *VFilter;
+    const short *HFilter;
+    const short *VFilter;

-    HFilter = bilinear_filters[xoffset];
-    VFilter = bilinear_filters[yoffset];
+    HFilter = vp8_bilinear_filters[xoffset];
+    VFilter = vp8_bilinear_filters[yoffset];

-    vp8_filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 4);
+    filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 4);

 }

@@ -530,11 +484,11 @@ void vp8_bilinear_predict16x16_c
    int  dst_pitch
 )
 {
-    const int  *HFilter;
-    const int  *VFilter;
+    const short *HFilter;
+    const short *VFilter;

-    HFilter = bilinear_filters[xoffset];
-    VFilter = bilinear_filters[yoffset];
+    HFilter = vp8_bilinear_filters[xoffset];
+    VFilter = vp8_bilinear_filters[yoffset];

-    vp8_filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 16, 16);
+    filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 16, 16);
 }
--- a/vp8/common/partialgfupdate.h
+++ b/vp8/common/partialgfupdate.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
@@ -9,11 +9,14 @@
 */


-#ifndef __INC_PARTIALGFUPDATE_H
-#define __INC_PARTIALGFUPDATE_H
+#ifndef FILTER_H
+#define FILTER_H

-#include "onyxc_int.h"
+#define BLOCK_HEIGHT_WIDTH 4
+#define VP8_FILTER_WEIGHT 128
+#define VP8_FILTER_SHIFT  7

-extern void update_gf_selective(ONYX_COMMON *cm, MACROBLOCKD *x);
+extern const short vp8_bilinear_filters[8][2];
+extern const short vp8_sub_pel_filters[8][6];

-#endif
+#endif //FILTER_H
--- a/vp8/common/findnearmv.c
+++ b/vp8/common/findnearmv.c
@@ -11,54 +11,23 @@

 #include "findnearmv.h"

-#define FINDNEAR_SEARCH_SITES   3
+const unsigned char vp8_mbsplit_offset[4][16] = {
+    { 0,  8,  0,  0,  0,  0,  0,  0,  0,  0,   0,  0,  0,  0,  0,  0},
+    { 0,  2,  0,  0,  0,  0,  0,  0,  0,  0,   0,  0,  0,  0,  0,  0},
+    { 0,  2,  8, 10,  0,  0,  0,  0,  0,  0,   0,  0,  0,  0,  0,  0},
+    { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15}
+};

 /* Predict motion vectors using those from already-decoded nearby blocks.
   Note that we only consider one 4x4 subblock from each candidate 16x16
   macroblock.   */
-
-typedef union
-{
-    unsigned int as_int;
-    MV           as_mv;
-} int_mv;        /* facilitates rapid equality tests */
-
-static void mv_bias(const MODE_INFO *x, int refframe, int_mv *mvp, const int *ref_frame_sign_bias)
-{
-    MV xmv;
-    xmv = x->mbmi.mv.as_mv;
-
-    if (ref_frame_sign_bias[x->mbmi.ref_frame] != ref_frame_sign_bias[refframe])
-    {
-        xmv.row *= -1;
-        xmv.col *= -1;
-    }
-
-    mvp->as_mv = xmv;
-}
-
-
-void vp8_clamp_mv(MV *mv, const MACROBLOCKD *xd)
-{
-    if (mv->col < (xd->mb_to_left_edge - LEFT_TOP_MARGIN))
-        mv->col = xd->mb_to_left_edge - LEFT_TOP_MARGIN;
-    else if (mv->col > xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN)
-        mv->col = xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN;
-
-    if (mv->row < (xd->mb_to_top_edge - LEFT_TOP_MARGIN))
-        mv->row = xd->mb_to_top_edge - LEFT_TOP_MARGIN;
-    else if (mv->row > xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN)
-        mv->row = xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN;
-}
-
-
 void vp8_find_near_mvs
 (
    MACROBLOCKD *xd,
    const MODE_INFO *here,
-    MV *nearest,
-    MV *nearby,
-    MV *best_mv,
+    int_mv *nearest,
+    int_mv *nearby,
+    int_mv *best_mv,
    int cnt[4],
    int refframe,
    int *ref_frame_sign_bias
@@ -82,7 +51,7 @@ void vp8_find_near_mvs
        if (above->mbmi.mv.as_int)
        {
            (++mv)->as_int = above->mbmi.mv.as_int;
-            mv_bias(above, refframe, mv, ref_frame_sign_bias);
+            mv_bias(ref_frame_sign_bias[above->mbmi.ref_frame], refframe, mv, ref_frame_sign_bias);
            ++cntx;
        }

@@ -97,7 +66,7 @@ void vp8_find_near_mvs
            int_mv this_mv;

            this_mv.as_int = left->mbmi.mv.as_int;
-            mv_bias(left, refframe, &this_mv, ref_frame_sign_bias);
+            mv_bias(ref_frame_sign_bias[left->mbmi.ref_frame], refframe, &this_mv, ref_frame_sign_bias);

            if (this_mv.as_int != mv->as_int)
            {
@@ -119,7 +88,7 @@ void vp8_find_near_mvs
            int_mv this_mv;

            this_mv.as_int = aboveleft->mbmi.mv.as_int;
-            mv_bias(aboveleft, refframe, &this_mv, ref_frame_sign_bias);
+            mv_bias(ref_frame_sign_bias[aboveleft->mbmi.ref_frame], refframe, &this_mv, ref_frame_sign_bias);

            if (this_mv.as_int != mv->as_int)
            {
@@ -162,13 +131,14 @@ void vp8_find_near_mvs
        near_mvs[CNT_INTRA] = near_mvs[CNT_NEAREST];

    /* Set up return values */
-    *best_mv = near_mvs[0].as_mv;
-    *nearest = near_mvs[CNT_NEAREST].as_mv;
-    *nearby = near_mvs[CNT_NEAR].as_mv;
+    best_mv->as_int = near_mvs[0].as_int;
+    nearest->as_int = near_mvs[CNT_NEAREST].as_int;
+    nearby->as_int = near_mvs[CNT_NEAR].as_int;

-    vp8_clamp_mv(nearest, xd);
-    vp8_clamp_mv(nearby, xd);
-    vp8_clamp_mv(best_mv, xd); /*TODO: move this up before the copy*/
+    //TODO: move clamp outside findnearmv
+    vp8_clamp_mv2(nearest, xd);
+    vp8_clamp_mv2(nearby, xd);
+    vp8_clamp_mv2(best_mv, xd);
 }

 vp8_prob *vp8_mv_ref_probs(
@@ -183,26 +153,3 @@ vp8_prob *vp8_mv_ref_probs(
    return p;
 }

-const B_MODE_INFO *vp8_left_bmi(const MODE_INFO *cur_mb, int b)
-{
-    if (!(b & 3))
-    {
-        /* On L edge, get from MB to left of us */
-        --cur_mb;
-        b += 4;
-    }
-
-    return cur_mb->bmi + b - 1;
-}
-
-const B_MODE_INFO *vp8_above_bmi(const MODE_INFO *cur_mb, int b, int mi_stride)
-{
-    if (!(b >> 2))
-    {
-        /* On top edge, get from MB above us */
-        cur_mb -= mi_stride;
-        b += 16;
-    }
-
-    return cur_mb->bmi + b - 4;
-}
--- a/vp8/common/findnearmv.h
+++ b/vp8/common/findnearmv.h
@@ -17,11 +17,65 @@
 #include "modecont.h"
 #include "treecoder.h"

+
+static void mv_bias(int refmb_ref_frame_sign_bias, int refframe, int_mv *mvp, const int *ref_frame_sign_bias)
+{
+    MV xmv;
+    xmv = mvp->as_mv;
+
+    if (refmb_ref_frame_sign_bias != ref_frame_sign_bias[refframe])
+    {
+        xmv.row *= -1;
+        xmv.col *= -1;
+    }
+
+    mvp->as_mv = xmv;
+}
+
+#define LEFT_TOP_MARGIN (16 << 3)
+#define RIGHT_BOTTOM_MARGIN (16 << 3)
+static void vp8_clamp_mv2(int_mv *mv, const MACROBLOCKD *xd)
+{
+    if (mv->as_mv.col < (xd->mb_to_left_edge - LEFT_TOP_MARGIN))
+        mv->as_mv.col = xd->mb_to_left_edge - LEFT_TOP_MARGIN;
+    else if (mv->as_mv.col > xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN)
+        mv->as_mv.col = xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN;
+
+    if (mv->as_mv.row < (xd->mb_to_top_edge - LEFT_TOP_MARGIN))
+        mv->as_mv.row = xd->mb_to_top_edge - LEFT_TOP_MARGIN;
+    else if (mv->as_mv.row > xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN)
+        mv->as_mv.row = xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN;
+}
+
+static void vp8_clamp_mv(int_mv *mv, int mb_to_left_edge, int mb_to_right_edge,
+                         int mb_to_top_edge, int mb_to_bottom_edge)
+{
+    mv->as_mv.col = (mv->as_mv.col < mb_to_left_edge) ?
+        mb_to_left_edge : mv->as_mv.col;
+    mv->as_mv.col = (mv->as_mv.col > mb_to_right_edge) ?
+        mb_to_right_edge : mv->as_mv.col;
+    mv->as_mv.row = (mv->as_mv.row < mb_to_top_edge) ?
+        mb_to_top_edge : mv->as_mv.row;
+    mv->as_mv.row = (mv->as_mv.row > mb_to_bottom_edge) ?
+        mb_to_bottom_edge : mv->as_mv.row;
+}
+static unsigned int vp8_check_mv_bounds(int_mv *mv, int mb_to_left_edge,
+                                int mb_to_right_edge, int mb_to_top_edge,
+                                int mb_to_bottom_edge)
+{
+    unsigned int need_to_clamp;
+    need_to_clamp = (mv->as_mv.col < mb_to_left_edge) ? 1 : 0;
+    need_to_clamp |= (mv->as_mv.col > mb_to_right_edge) ? 1 : 0;
+    need_to_clamp |= (mv->as_mv.row < mb_to_top_edge) ? 1 : 0;
+    need_to_clamp |= (mv->as_mv.row > mb_to_bottom_edge) ? 1 : 0;
+    return need_to_clamp;
+}
+
 void vp8_find_near_mvs
 (
    MACROBLOCKD *xd,
    const MODE_INFO *here,
-    MV *nearest, MV *nearby, MV *best,
+    int_mv *nearest, int_mv *nearby, int_mv *best,
    int near_mv_ref_cts[4],
    int refframe,
    int *ref_frame_sign_bias
@@ -31,12 +85,89 @@ vp8_prob *vp8_mv_ref_probs(
    vp8_prob p[VP8_MVREFS-1], const int near_mv_ref_ct[4]
 );

-const B_MODE_INFO *vp8_left_bmi(const MODE_INFO *cur_mb, int b);
+extern const unsigned char vp8_mbsplit_offset[4][16];

-const B_MODE_INFO *vp8_above_bmi(const MODE_INFO *cur_mb, int b, int mi_stride);

-#define LEFT_TOP_MARGIN (16 << 3)
-#define RIGHT_BOTTOM_MARGIN (16 << 3)
+static int left_block_mv(const MODE_INFO *cur_mb, int b)
+{
+    if (!(b & 3))
+    {
+        /* On L edge, get from MB to left of us */
+        --cur_mb;

+        if(cur_mb->mbmi.mode != SPLITMV)
+            return cur_mb->mbmi.mv.as_int;
+        b += 4;
+    }
+
+    return (cur_mb->bmi + b - 1)->mv.as_int;
+}
+
+static int above_block_mv(const MODE_INFO *cur_mb, int b, int mi_stride)
+{
+    if (!(b >> 2))
+    {
+        /* On top edge, get from MB above us */
+        cur_mb -= mi_stride;
+
+        if(cur_mb->mbmi.mode != SPLITMV)
+            return cur_mb->mbmi.mv.as_int;
+        b += 16;
+    }
+
+    return (cur_mb->bmi + b - 4)->mv.as_int;
+}
+static B_PREDICTION_MODE left_block_mode(const MODE_INFO *cur_mb, int b)
+{
+    if (!(b & 3))
+    {
+        /* On L edge, get from MB to left of us */
+        --cur_mb;
+        switch (cur_mb->mbmi.mode)
+        {
+            case B_PRED:
+              return (cur_mb->bmi + b + 3)->as_mode;
+            case DC_PRED:
+                return B_DC_PRED;
+            case V_PRED:
+                return B_VE_PRED;
+            case H_PRED:
+                return B_HE_PRED;
+            case TM_PRED:
+                return B_TM_PRED;
+            default:
+                return B_DC_PRED;
+        }
+    }
+
+    return (cur_mb->bmi + b - 1)->as_mode;
+}
+
+static B_PREDICTION_MODE above_block_mode(const MODE_INFO *cur_mb, int b, int mi_stride)
+{
+    if (!(b >> 2))
+    {
+        /* On top edge, get from MB above us */
+        cur_mb -= mi_stride;
+
+        switch (cur_mb->mbmi.mode)
+        {
+            case B_PRED:
+              return (cur_mb->bmi + b + 12)->as_mode;
+            case DC_PRED:
+                return B_DC_PRED;
+            case V_PRED:
+                return B_VE_PRED;
+            case H_PRED:
+                return B_HE_PRED;
+            case TM_PRED:
+                return B_TM_PRED;
+            default:
+                return B_DC_PRED;
+        }
+    }
+
+    return (cur_mb->bmi + b - 4)->as_mode;
+}

 #endif
--- a/vp8/common/fourcc.hpp
+++ b/vp8/common/fourcc.hpp
@@ -1,121 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef FOURCC_HPP
-#define FOURCC_HPP
-
-#include <iosfwd>
-#include <cstring>
-
-
-#if defined(__POWERPC__) || defined(__APPLE__) || defined(__MERKS__)
-using namespace std;
-#endif
-
-class four_cc
-{
-public:
-
-    four_cc();
-    four_cc(const char*);
-    explicit four_cc(unsigned long);
-
-    bool operator==(const four_cc&) const;
-    bool operator!=(const four_cc&) const;
-
-    bool operator==(const char*) const;
-    bool operator!=(const char*) const;
-
-    operator unsigned long() const;
-    unsigned long as_long() const;
-
-    four_cc& operator=(unsigned long);
-
-    char operator[](int) const;
-
-    std::ostream& put(std::ostream&) const;
-
-    bool printable() const;
-
-private:
-
-    union
-    {
-        char code[4];
-        unsigned long code_as_long;
-    };
-
-};
-
-
-inline four_cc::four_cc()
-{
-}
-
-inline four_cc::four_cc(unsigned long x)
-    : code_as_long(x)
-{
-}
-
-inline four_cc::four_cc(const char* str)
-{
-    memcpy(code, str, 4);
-}
-
-
-inline bool four_cc::operator==(const four_cc& rhs) const
-{
-    return code_as_long == rhs.code_as_long;
-}
-
-inline bool four_cc::operator!=(const four_cc& rhs) const
-{
-    return !operator==(rhs);
-}
-
-inline bool four_cc::operator==(const char* rhs) const
-{
-    return (memcmp(code, rhs, 4) == 0);
-}
-
-inline bool four_cc::operator!=(const char* rhs) const
-{
-    return !operator==(rhs);
-}
-
-
-inline four_cc::operator unsigned long() const
-{
-    return code_as_long;
-}
-
-inline unsigned long four_cc::as_long() const
-{
-    return code_as_long;
-}
-
-inline char four_cc::operator[](int i) const
-{
-    return code[i];
-}
-
-inline four_cc& four_cc::operator=(unsigned long val)
-{
-    code_as_long = val;
-    return *this;
-}
-
-inline std::ostream& operator<<(std::ostream& os, const four_cc& rhs)
-{
-    return rhs.put(os);
-}
-
-#endif
--- a/vp8/common/generic/systemdependent.c
+++ b/vp8/common/generic/systemdependent.c
@@ -10,21 +10,60 @@


 #include "vpx_ports/config.h"
-#include "g_common.h"
-#include "subpixel.h"
-#include "loopfilter.h"
-#include "recon.h"
-#include "idct.h"
-#include "onyxc_int.h"
+#include "vp8/common/g_common.h"
+#include "vp8/common/subpixel.h"
+#include "vp8/common/loopfilter.h"
+#include "vp8/common/recon.h"
+#include "vp8/common/idct.h"
+#include "vp8/common/onyxc_int.h"
+
+#if CONFIG_MULTITHREAD
+#if HAVE_UNISTD_H
+#include <unistd.h>
+#elif defined(_WIN32)
+#include <windows.h>
+typedef void (WINAPI *PGNSI)(LPSYSTEM_INFO);
+#endif
+#endif

 extern void vp8_arch_x86_common_init(VP8_COMMON *ctx);
 extern void vp8_arch_arm_common_init(VP8_COMMON *ctx);

-void (*vp8_build_intra_predictors_mby_ptr)(MACROBLOCKD *x);
-extern void vp8_build_intra_predictors_mby(MACROBLOCKD *x);
+#if CONFIG_MULTITHREAD
+static int get_cpu_count()
+{
+    int core_count = 16;

-void (*vp8_build_intra_predictors_mby_s_ptr)(MACROBLOCKD *x);
-extern void vp8_build_intra_predictors_mby_s(MACROBLOCKD *x);
+#if HAVE_UNISTD_H
+#if defined(_SC_NPROCESSORS_ONLN)
+    core_count = sysconf(_SC_NPROCESSORS_ONLN);
+#elif defined(_SC_NPROC_ONLN)
+    core_count = sysconf(_SC_NPROC_ONLN);
+#endif
+#elif defined(_WIN32)
+    {
+        PGNSI pGNSI;
+        SYSTEM_INFO sysinfo;
+
+        /* Call GetNativeSystemInfo if supported or
+         * GetSystemInfo otherwise. */
+
+        pGNSI = (PGNSI) GetProcAddress(
+                GetModuleHandle(TEXT("kernel32.dll")), "GetNativeSystemInfo");
+        if (pGNSI != NULL)
+            pGNSI(&sysinfo);
+        else
+            GetSystemInfo(&sysinfo);
+
+        core_count = sysinfo.dwNumberOfProcessors;
+    }
+#else
+    /* other platforms */
+#endif
+
+    return core_count > 0 ? core_count : 1;
+}
+#endif

 void vp8_machine_specific_config(VP8_COMMON *ctx)
 {
@@ -45,6 +84,16 @@ void vp8_machine_specific_config(VP8_COMMON *ctx)
    rtcd->recon.recon4      = vp8_recon4b_c;
    rtcd->recon.recon_mb    = vp8_recon_mb_c;
    rtcd->recon.recon_mby   = vp8_recon_mby_c;
+    rtcd->recon.build_intra_predictors_mby =
+        vp8_build_intra_predictors_mby;
+    rtcd->recon.build_intra_predictors_mby_s =
+        vp8_build_intra_predictors_mby_s;
+    rtcd->recon.build_intra_predictors_mbuv =
+        vp8_build_intra_predictors_mbuv;
+    rtcd->recon.build_intra_predictors_mbuv_s =
+        vp8_build_intra_predictors_mbuv_s;
+    rtcd->recon.intra4x4_predict =
+        vp8_intra4x4_predict;

    rtcd->subpix.sixtap16x16   = vp8_sixtap_predict16x16_c;
    rtcd->subpix.sixtap8x8     = vp8_sixtap_predict8x8_c;
@@ -59,23 +108,22 @@ void vp8_machine_specific_config(VP8_COMMON *ctx)
    rtcd->loopfilter.normal_b_v  = vp8_loop_filter_bv_c;
    rtcd->loopfilter.normal_mb_h = vp8_loop_filter_mbh_c;
    rtcd->loopfilter.normal_b_h  = vp8_loop_filter_bh_c;
-    rtcd->loopfilter.simple_mb_v = vp8_loop_filter_mbvs_c;
+    rtcd->loopfilter.simple_mb_v = vp8_loop_filter_simple_vertical_edge_c;
    rtcd->loopfilter.simple_b_v  = vp8_loop_filter_bvs_c;
-    rtcd->loopfilter.simple_mb_h = vp8_loop_filter_mbhs_c;
+    rtcd->loopfilter.simple_mb_h = vp8_loop_filter_simple_horizontal_edge_c;
    rtcd->loopfilter.simple_b_h  = vp8_loop_filter_bhs_c;

-#if CONFIG_POSTPROC || (CONFIG_VP8_ENCODER && CONFIG_PSNR)
+#if CONFIG_POSTPROC || (CONFIG_VP8_ENCODER && CONFIG_INTERNAL_STATS)
    rtcd->postproc.down             = vp8_mbpost_proc_down_c;
    rtcd->postproc.across           = vp8_mbpost_proc_across_ip_c;
    rtcd->postproc.downacross       = vp8_post_proc_down_and_across_c;
    rtcd->postproc.addnoise         = vp8_plane_add_noise_c;
-    rtcd->postproc.blend_mb    = vp8_blend_mb_c;
+    rtcd->postproc.blend_mb_inner   = vp8_blend_mb_inner_c;
+    rtcd->postproc.blend_mb_outer   = vp8_blend_mb_outer_c;
+    rtcd->postproc.blend_b          = vp8_blend_b_c;
 #endif

 #endif
-    /* Pure C: */
-    vp8_build_intra_predictors_mby_ptr = vp8_build_intra_predictors_mby;
-    vp8_build_intra_predictors_mby_s_ptr = vp8_build_intra_predictors_mby_s;

 #if ARCH_X86 || ARCH_X86_64
    vp8_arch_x86_common_init(ctx);
@@ -85,4 +133,7 @@ void vp8_machine_specific_config(VP8_COMMON *ctx)
    vp8_arch_arm_common_init(ctx);
 #endif

+#if CONFIG_MULTITHREAD
+    ctx->processor_core_count = get_cpu_count();
+#endif /* CONFIG_MULTITHREAD */
 }
--- a/vp8/common/loopfilter.c
+++ b/vp8/common/loopfilter.c
@@ -9,162 +9,149 @@
 */


-#include "vpx_ports/config.h"
+#include "vpx_config.h"
 #include "loopfilter.h"
 #include "onyxc_int.h"
+#include "vpx_mem/vpx_mem.h"

 typedef unsigned char uc;

-
 prototype_loopfilter(vp8_loop_filter_horizontal_edge_c);
 prototype_loopfilter(vp8_loop_filter_vertical_edge_c);
 prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_c);
 prototype_loopfilter(vp8_mbloop_filter_vertical_edge_c);
-prototype_loopfilter(vp8_loop_filter_simple_horizontal_edge_c);
-prototype_loopfilter(vp8_loop_filter_simple_vertical_edge_c);
+
+prototype_simple_loopfilter(vp8_loop_filter_simple_horizontal_edge_c);
+prototype_simple_loopfilter(vp8_loop_filter_simple_vertical_edge_c);

 /* Horizontal MB filtering */
-void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                           int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr,
+                           unsigned char *v_ptr, int y_stride, int uv_stride,
+                           loop_filter_info *lfi)
 {
-    (void) simpler_lpf;
-    vp8_mbloop_filter_horizontal_edge_c(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+    vp8_mbloop_filter_horizontal_edge_c(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);

    if (u_ptr)
-        vp8_mbloop_filter_horizontal_edge_c(u_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1);
+        vp8_mbloop_filter_horizontal_edge_c(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);

    if (v_ptr)
-        vp8_mbloop_filter_horizontal_edge_c(v_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1);
-}
-
-void vp8_loop_filter_mbhs_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                            int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
-{
-    (void) u_ptr;
-    (void) v_ptr;
-    (void) uv_stride;
-    (void) simpler_lpf;
-    vp8_loop_filter_simple_horizontal_edge_c(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+        vp8_mbloop_filter_horizontal_edge_c(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
 }

 /* Vertical MB Filtering */
-void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                           int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr,
+                           unsigned char *v_ptr, int y_stride, int uv_stride,
+                           loop_filter_info *lfi)
 {
-    (void) simpler_lpf;
-    vp8_mbloop_filter_vertical_edge_c(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+    vp8_mbloop_filter_vertical_edge_c(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);

    if (u_ptr)
-        vp8_mbloop_filter_vertical_edge_c(u_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1);
+        vp8_mbloop_filter_vertical_edge_c(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);

    if (v_ptr)
-        vp8_mbloop_filter_vertical_edge_c(v_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1);
-}
-
-void vp8_loop_filter_mbvs_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                            int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
-{
-    (void) u_ptr;
-    (void) v_ptr;
-    (void) uv_stride;
-    (void) simpler_lpf;
-    vp8_loop_filter_simple_vertical_edge_c(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+        vp8_mbloop_filter_vertical_edge_c(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
 }

 /* Horizontal B Filtering */
-void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                          int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr,
+                          unsigned char *v_ptr, int y_stride, int uv_stride,
+                          loop_filter_info *lfi)
 {
-    (void) simpler_lpf;
-    vp8_loop_filter_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+    vp8_loop_filter_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+    vp8_loop_filter_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+    vp8_loop_filter_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);

    if (u_ptr)
-        vp8_loop_filter_horizontal_edge_c(u_ptr + 4 * uv_stride, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1);
+        vp8_loop_filter_horizontal_edge_c(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);

    if (v_ptr)
-        vp8_loop_filter_horizontal_edge_c(v_ptr + 4 * uv_stride, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1);
+        vp8_loop_filter_horizontal_edge_c(v_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
 }

-void vp8_loop_filter_bhs_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                           int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+void vp8_loop_filter_bhs_c(unsigned char *y_ptr, int y_stride,
+                           const unsigned char *blimit)
 {
-    (void) u_ptr;
-    (void) v_ptr;
-    (void) uv_stride;
-    (void) simpler_lpf;
-    vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+    vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride, blimit);
+    vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride, blimit);
+    vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride, blimit);
 }

 /* Vertical B Filtering */
-void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                          int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr,
+                          unsigned char *v_ptr, int y_stride, int uv_stride,
+                          loop_filter_info *lfi)
 {
-    (void) simpler_lpf;
-    vp8_loop_filter_vertical_edge_c(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_vertical_edge_c(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_vertical_edge_c(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+    vp8_loop_filter_vertical_edge_c(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+    vp8_loop_filter_vertical_edge_c(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+    vp8_loop_filter_vertical_edge_c(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);

    if (u_ptr)
-        vp8_loop_filter_vertical_edge_c(u_ptr + 4, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1);
+        vp8_loop_filter_vertical_edge_c(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);

    if (v_ptr)
-        vp8_loop_filter_vertical_edge_c(v_ptr + 4, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1);
+        vp8_loop_filter_vertical_edge_c(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
 }

-void vp8_loop_filter_bvs_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                           int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+void vp8_loop_filter_bvs_c(unsigned char *y_ptr, int y_stride,
+                           const unsigned char *blimit)
 {
-    (void) u_ptr;
-    (void) v_ptr;
-    (void) uv_stride;
-    (void) simpler_lpf;
-    vp8_loop_filter_simple_vertical_edge_c(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_simple_vertical_edge_c(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_simple_vertical_edge_c(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+    vp8_loop_filter_simple_vertical_edge_c(y_ptr + 4, y_stride, blimit);
+    vp8_loop_filter_simple_vertical_edge_c(y_ptr + 8, y_stride, blimit);
+    vp8_loop_filter_simple_vertical_edge_c(y_ptr + 12, y_stride, blimit);
 }

-void vp8_init_loop_filter(VP8_COMMON *cm)
+static void lf_init_lut(loop_filter_info_n *lfi)
 {
-    loop_filter_info *lfi = cm->lf_info;
-    LOOPFILTERTYPE lft = cm->filter_type;
-    int sharpness_lvl = cm->sharpness_level;
-    int frame_type = cm->frame_type;
-    int i, j;
+    int filt_lvl;

-    int block_inside_limit = 0;
-    int HEVThresh;
-    const int yhedge_boost  = 2;
-    const int uvhedge_boost = 2;
+    for (filt_lvl = 0; filt_lvl <= MAX_LOOP_FILTER; filt_lvl++)
+    {
+        if (filt_lvl >= 40)
+        {
+            lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 2;
+            lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 3;
+        }
+        else if (filt_lvl >= 20)
+        {
+            lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 1;
+            lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 2;
+        }
+        else if (filt_lvl >= 15)
+        {
+            lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 1;
+            lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 1;
+        }
+        else
+        {
+            lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 0;
+            lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 0;
+        }
+    }

-    /* For each possible value for the loop filter fill out a "loop_filter_info" entry. */
+    lfi->mode_lf_lut[DC_PRED] = 1;
+    lfi->mode_lf_lut[V_PRED] = 1;
+    lfi->mode_lf_lut[H_PRED] = 1;
+    lfi->mode_lf_lut[TM_PRED] = 1;
+    lfi->mode_lf_lut[B_PRED]  = 0;
+
+    lfi->mode_lf_lut[ZEROMV]  = 1;
+    lfi->mode_lf_lut[NEARESTMV] = 2;
+    lfi->mode_lf_lut[NEARMV] = 2;
+    lfi->mode_lf_lut[NEWMV] = 2;
+    lfi->mode_lf_lut[SPLITMV] = 3;
+
+}
+
+void vp8_loop_filter_update_sharpness(loop_filter_info_n *lfi,
+                                      int sharpness_lvl)
+{
+    int i;
+
+    /* For each possible value for the loop filter fill out limits */
    for (i = 0; i <= MAX_LOOP_FILTER; i++)
    {
        int filt_lvl = i;
-
-        if (frame_type == KEY_FRAME)
-        {
-            if (filt_lvl >= 40)
-                HEVThresh = 2;
-            else if (filt_lvl >= 15)
-                HEVThresh = 1;
-            else
-                HEVThresh = 0;
-        }
-        else
-        {
-            if (filt_lvl >= 40)
-                HEVThresh = 3;
-            else if (filt_lvl >= 20)
-                HEVThresh = 2;
-            else if (filt_lvl >= 15)
-                HEVThresh = 1;
-            else
-                HEVThresh = 0;
-        }
+        int block_inside_limit = 0;

        /* Set loop filter paramaeters that control sharpness. */
        block_inside_limit = filt_lvl >> (sharpness_lvl > 0);
@@ -179,181 +166,143 @@ void vp8_init_loop_filter(VP8_COMMON *cm)
        if (block_inside_limit < 1)
            block_inside_limit = 1;

-        for (j = 0; j < 16; j++)
-        {
-            lfi[i].lim[j] = block_inside_limit;
-            lfi[i].mbflim[j] = filt_lvl + yhedge_boost;
-            lfi[i].mbthr[j] = HEVThresh;
-            lfi[i].flim[j] = filt_lvl;
-            lfi[i].thr[j] = HEVThresh;
-            lfi[i].uvlim[j] = block_inside_limit;
-            lfi[i].uvmbflim[j] = filt_lvl + uvhedge_boost;
-            lfi[i].uvmbthr[j] = HEVThresh;
-            lfi[i].uvflim[j] = filt_lvl;
-            lfi[i].uvthr[j] = HEVThresh;
-        }
-
-    }
-
-    /* Set up the function pointers depending on the type of loop filtering selected */
-    if (lft == NORMAL_LOOPFILTER)
-    {
-        cm->lf_mbv = LF_INVOKE(&cm->rtcd.loopfilter, normal_mb_v);
-        cm->lf_bv  = LF_INVOKE(&cm->rtcd.loopfilter, normal_b_v);
-        cm->lf_mbh = LF_INVOKE(&cm->rtcd.loopfilter, normal_mb_h);
-        cm->lf_bh  = LF_INVOKE(&cm->rtcd.loopfilter, normal_b_h);
-    }
-    else
-    {
-        cm->lf_mbv = LF_INVOKE(&cm->rtcd.loopfilter, simple_mb_v);
-        cm->lf_bv  = LF_INVOKE(&cm->rtcd.loopfilter, simple_b_v);
-        cm->lf_mbh = LF_INVOKE(&cm->rtcd.loopfilter, simple_mb_h);
-        cm->lf_bh  = LF_INVOKE(&cm->rtcd.loopfilter, simple_b_h);
+        vpx_memset(lfi->lim[i], block_inside_limit, SIMD_WIDTH);
+        vpx_memset(lfi->blim[i], (2 * filt_lvl + block_inside_limit),
+                SIMD_WIDTH);
+        vpx_memset(lfi->mblim[i], (2 * (filt_lvl + 2) + block_inside_limit),
+                SIMD_WIDTH);
    }
 }

-/* Put vp8_init_loop_filter() in vp8dx_create_decompressor(). Only call vp8_frame_init_loop_filter() while decoding
- * each frame. Check last_frame_type to skip the function most of times.
+void vp8_loop_filter_init(VP8_COMMON *cm)
+{
+    loop_filter_info_n *lfi = &cm->lf_info;
+    int i;
+
+    /* init limits for given sharpness*/
+    vp8_loop_filter_update_sharpness(lfi, cm->sharpness_level);
+    cm->last_sharpness_level = cm->sharpness_level;
+
+    /* init LUT for lvl  and hev thr picking */
+    lf_init_lut(lfi);
+
+    /* init hev threshold const vectors */
+    for(i = 0; i < 4 ; i++)
+    {
+        vpx_memset(lfi->hev_thr[i], i, SIMD_WIDTH);
+    }
+}
+
+void vp8_loop_filter_frame_init(VP8_COMMON *cm,
+                                MACROBLOCKD *mbd,
+                                int default_filt_lvl)
+{
+    int seg,  /* segment number */
+        ref,  /* index in ref_lf_deltas */
+        mode; /* index in mode_lf_deltas */
+
+    loop_filter_info_n *lfi = &cm->lf_info;
+
+    /* update limits if sharpness has changed */
+    if(cm->last_sharpness_level != cm->sharpness_level)
+    {
+        vp8_loop_filter_update_sharpness(lfi, cm->sharpness_level);
+        cm->last_sharpness_level = cm->sharpness_level;
+    }
+
+    for(seg = 0; seg < MAX_MB_SEGMENTS; seg++)
+    {
+        int lvl_seg = default_filt_lvl;
+        int lvl_ref, lvl_mode;
+
+        /* Note the baseline filter values for each segment */
+        if (mbd->segmentation_enabled)
+        {
+            /* Abs value */
+            if (mbd->mb_segement_abs_delta == SEGMENT_ABSDATA)
+            {
+                lvl_seg = mbd->segment_feature_data[MB_LVL_ALT_LF][seg];
+            }
+            else  /* Delta Value */
+            {
+                lvl_seg += mbd->segment_feature_data[MB_LVL_ALT_LF][seg];
+                lvl_seg = (lvl_seg > 0) ? ((lvl_seg > 63) ? 63: lvl_seg) : 0;
+            }
+        }
+
+        if (!mbd->mode_ref_lf_delta_enabled)
+        {
+            /* we could get rid of this if we assume that deltas are set to
+             * zero when not in use; encoder always uses deltas
             */
-void vp8_frame_init_loop_filter(loop_filter_info *lfi, int frame_type)
-{
-    int HEVThresh;
-    int i, j;
-
-    /* For each possible value for the loop filter fill out a "loop_filter_info" entry. */
-    for (i = 0; i <= MAX_LOOP_FILTER; i++)
-    {
-        int filt_lvl = i;
-
-        if (frame_type == KEY_FRAME)
-        {
-            if (filt_lvl >= 40)
-                HEVThresh = 2;
-            else if (filt_lvl >= 15)
-                HEVThresh = 1;
-            else
-                HEVThresh = 0;
-        }
-        else
-        {
-            if (filt_lvl >= 40)
-                HEVThresh = 3;
-            else if (filt_lvl >= 20)
-                HEVThresh = 2;
-            else if (filt_lvl >= 15)
-                HEVThresh = 1;
-            else
-                HEVThresh = 0;
+            vpx_memset(lfi->lvl[seg][0], lvl_seg, 4 * 4 );
+            continue;
        }

-        for (j = 0; j < 16; j++)
-        {
-            /*lfi[i].lim[j] = block_inside_limit;
-            lfi[i].mbflim[j] = filt_lvl+yhedge_boost;*/
-            lfi[i].mbthr[j] = HEVThresh;
-            /*lfi[i].flim[j] = filt_lvl;*/
-            lfi[i].thr[j] = HEVThresh;
-            /*lfi[i].uvlim[j] = block_inside_limit;
-            lfi[i].uvmbflim[j] = filt_lvl+uvhedge_boost;*/
-            lfi[i].uvmbthr[j] = HEVThresh;
-            /*lfi[i].uvflim[j] = filt_lvl;*/
-            lfi[i].uvthr[j] = HEVThresh;
-        }
-    }
-}
+        lvl_ref = lvl_seg;

+        /* INTRA_FRAME */
+        ref = INTRA_FRAME;

-void vp8_adjust_mb_lf_value(MACROBLOCKD *mbd, int *filter_level)
-{
-    MB_MODE_INFO *mbmi = &mbd->mode_info_context->mbmi;
-
-    if (mbd->mode_ref_lf_delta_enabled)
-    {
        /* Apply delta for reference frame */
-        *filter_level += mbd->ref_lf_deltas[mbmi->ref_frame];
+        lvl_ref += mbd->ref_lf_deltas[ref];

-        /* Apply delta for mode */
-        if (mbmi->ref_frame == INTRA_FRAME)
-        {
+        /* Apply delta for Intra modes */
+        mode = 0; /* B_PRED */
        /* Only the split mode BPRED has a further special case */
-            if (mbmi->mode == B_PRED)
-                *filter_level +=  mbd->mode_lf_deltas[0];
-        }
-        else
+        lvl_mode = lvl_ref +  mbd->mode_lf_deltas[mode];
+        lvl_mode = (lvl_mode > 0) ? (lvl_mode > 63 ? 63 : lvl_mode) : 0; /* clamp */
+
+        lfi->lvl[seg][ref][mode] = lvl_mode;
+
+        mode = 1; /* all the rest of Intra modes */
+        lvl_mode = (lvl_ref > 0) ? (lvl_ref > 63 ? 63 : lvl_ref)  : 0; /* clamp */
+        lfi->lvl[seg][ref][mode] = lvl_mode;
+
+        /* LAST, GOLDEN, ALT */
+        for(ref = 1; ref < MAX_REF_FRAMES; ref++)
        {
-            /* Zero motion mode */
-            if (mbmi->mode == ZEROMV)
-                *filter_level +=  mbd->mode_lf_deltas[1];
+            int lvl_ref = lvl_seg;

-            /* Split MB motion mode */
-            else if (mbmi->mode == SPLITMV)
-                *filter_level +=  mbd->mode_lf_deltas[3];
+            /* Apply delta for reference frame */
+            lvl_ref += mbd->ref_lf_deltas[ref];

-            /* All other inter motion modes (Nearest, Near, New) */
-            else
-                *filter_level +=  mbd->mode_lf_deltas[2];
+            /* Apply delta for Inter modes */
+            for (mode = 1; mode < 4; mode++)
+            {
+                lvl_mode = lvl_ref + mbd->mode_lf_deltas[mode];
+                lvl_mode = (lvl_mode > 0) ? (lvl_mode > 63 ? 63 : lvl_mode) : 0; /* clamp */
+
+                lfi->lvl[seg][ref][mode] = lvl_mode;
+            }
        }
-
-        /* Range check */
-        if (*filter_level > MAX_LOOP_FILTER)
-            *filter_level = MAX_LOOP_FILTER;
-        else if (*filter_level < 0)
-            *filter_level = 0;
    }
 }

-
 void vp8_loop_filter_frame
 (
    VP8_COMMON *cm,
-    MACROBLOCKD *mbd,
-    int default_filt_lvl
+    MACROBLOCKD *mbd
 )
 {
    YV12_BUFFER_CONFIG *post = cm->frame_to_show;
-    loop_filter_info *lfi = cm->lf_info;
+    loop_filter_info_n *lfi_n = &cm->lf_info;
+    loop_filter_info lfi;
+
    FRAME_TYPE frame_type = cm->frame_type;

    int mb_row;
    int mb_col;

-
-    int baseline_filter_level[MAX_MB_SEGMENTS];
    int filter_level;
-    int alt_flt_enabled = mbd->segmentation_enabled;

-    int i;
    unsigned char *y_ptr, *u_ptr, *v_ptr;

-    mbd->mode_info_context = cm->mi;          /* Point at base of Mb MODE_INFO list */
-
-    /* Note the baseline filter values for each segment */
-    if (alt_flt_enabled)
-    {
-        for (i = 0; i < MAX_MB_SEGMENTS; i++)
-        {
-            /* Abs value */
-            if (mbd->mb_segement_abs_delta == SEGMENT_ABSDATA)
-                baseline_filter_level[i] = mbd->segment_feature_data[MB_LVL_ALT_LF][i];
-            /* Delta Value */
-            else
-            {
-                baseline_filter_level[i] = default_filt_lvl + mbd->segment_feature_data[MB_LVL_ALT_LF][i];
-                baseline_filter_level[i] = (baseline_filter_level[i] >= 0) ? ((baseline_filter_level[i] <= MAX_LOOP_FILTER) ? baseline_filter_level[i] : MAX_LOOP_FILTER) : 0;  /* Clamp to valid range */
-            }
-        }
-    }
-    else
-    {
-        for (i = 0; i < MAX_MB_SEGMENTS; i++)
-            baseline_filter_level[i] = default_filt_lvl;
-    }
+    /* Point at base of Mb MODE_INFO list */
+    const MODE_INFO *mode_info_context = cm->mi;

    /* Initialize the loop filter for this frame. */
-    if ((cm->last_filter_type != cm->filter_type) || (cm->last_sharpness_level != cm->sharpness_level))
-        vp8_init_loop_filter(cm);
-    else if (frame_type != cm->last_frame_type)
-        vp8_frame_init_loop_filter(lfi, frame_type);
+    vp8_loop_filter_frame_init(cm, mbd, cm->filter_level);

    /* Set up the buffer pointers */
    y_ptr = post->y_buffer;
@@ -365,101 +314,108 @@ void vp8_loop_filter_frame
    {
        for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
        {
-            int Segment = (alt_flt_enabled) ? mbd->mode_info_context->mbmi.segment_id : 0;
+            int skip_lf = (mode_info_context->mbmi.mode != B_PRED &&
+                            mode_info_context->mbmi.mode != SPLITMV &&
+                            mode_info_context->mbmi.mb_skip_coeff);

-            filter_level = baseline_filter_level[Segment];
+            const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode];
+            const int seg = mode_info_context->mbmi.segment_id;
+            const int ref_frame = mode_info_context->mbmi.ref_frame;

-            /* Distance of Mb to the various image edges.
-             * These specified to 8th pel as they are always compared to values that are in 1/8th pel units
-             * Apply any context driven MB level adjustment
-             */
-            vp8_adjust_mb_lf_value(mbd, &filter_level);
+            filter_level = lfi_n->lvl[seg][ref_frame][mode_index];

            if (filter_level)
            {
-                if (mb_col > 0)
-                    cm->lf_mbv(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi[filter_level], cm->simpler_lpf);
+                if (cm->filter_type == NORMAL_LOOPFILTER)
+                {
+                    const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level];
+                    lfi.mblim = lfi_n->mblim[filter_level];
+                    lfi.blim = lfi_n->blim[filter_level];
+                    lfi.lim = lfi_n->lim[filter_level];
+                    lfi.hev_thr = lfi_n->hev_thr[hev_index];

-                if (mbd->mode_info_context->mbmi.dc_diff > 0)
-                    cm->lf_bv(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi[filter_level], cm->simpler_lpf);
+                    if (mb_col > 0)
+                        LF_INVOKE(&cm->rtcd.loopfilter, normal_mb_v)
+                        (y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi);
+
+                    if (!skip_lf)
+                        LF_INVOKE(&cm->rtcd.loopfilter, normal_b_v)
+                        (y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi);

                    /* don't apply across umv border */
                    if (mb_row > 0)
-                    cm->lf_mbh(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi[filter_level], cm->simpler_lpf);
+                        LF_INVOKE(&cm->rtcd.loopfilter, normal_mb_h)
+                        (y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi);

-                if (mbd->mode_info_context->mbmi.dc_diff > 0)
-                    cm->lf_bh(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi[filter_level], cm->simpler_lpf);
+                    if (!skip_lf)
+                        LF_INVOKE(&cm->rtcd.loopfilter, normal_b_h)
+                        (y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi);
+                }
+                else
+                {
+                    if (mb_col > 0)
+                        LF_INVOKE(&cm->rtcd.loopfilter, simple_mb_v)
+                        (y_ptr, post->y_stride, lfi_n->mblim[filter_level]);
+
+                    if (!skip_lf)
+                        LF_INVOKE(&cm->rtcd.loopfilter, simple_b_v)
+                        (y_ptr, post->y_stride, lfi_n->blim[filter_level]);
+
+                    /* don't apply across umv border */
+                    if (mb_row > 0)
+                        LF_INVOKE(&cm->rtcd.loopfilter, simple_mb_h)
+                        (y_ptr, post->y_stride, lfi_n->mblim[filter_level]);
+
+                    if (!skip_lf)
+                        LF_INVOKE(&cm->rtcd.loopfilter, simple_b_h)
+                        (y_ptr, post->y_stride, lfi_n->blim[filter_level]);
+                }
            }

            y_ptr += 16;
            u_ptr += 8;
            v_ptr += 8;

-            mbd->mode_info_context++;     /* step to next MB */
+            mode_info_context++;     /* step to next MB */
        }

        y_ptr += post->y_stride  * 16 - post->y_width;
        u_ptr += post->uv_stride *  8 - post->uv_width;
        v_ptr += post->uv_stride *  8 - post->uv_width;

-        mbd->mode_info_context++;         /* Skip border mb */
+        mode_info_context++;         /* Skip border mb */
    }
 }

-
 void vp8_loop_filter_frame_yonly
 (
    VP8_COMMON *cm,
    MACROBLOCKD *mbd,
-    int default_filt_lvl,
-    int sharpness_lvl
+    int default_filt_lvl
 )
 {
    YV12_BUFFER_CONFIG *post = cm->frame_to_show;

-    int i;
    unsigned char *y_ptr;
    int mb_row;
    int mb_col;

-    loop_filter_info *lfi = cm->lf_info;
-    int baseline_filter_level[MAX_MB_SEGMENTS];
+    loop_filter_info_n *lfi_n = &cm->lf_info;
+    loop_filter_info lfi;
+
    int filter_level;
-    int alt_flt_enabled = mbd->segmentation_enabled;
    FRAME_TYPE frame_type = cm->frame_type;

-    (void) sharpness_lvl;
+    /* Point at base of Mb MODE_INFO list */
+    const MODE_INFO *mode_info_context = cm->mi;

-    /*MODE_INFO * this_mb_mode_info = cm->mi;*/ /* Point at base of Mb MODE_INFO list */
-    mbd->mode_info_context = cm->mi;          /* Point at base of Mb MODE_INFO list */
-
-    /* Note the baseline filter values for each segment */
-    if (alt_flt_enabled)
-    {
-        for (i = 0; i < MAX_MB_SEGMENTS; i++)
-        {
-            /* Abs value */
-            if (mbd->mb_segement_abs_delta == SEGMENT_ABSDATA)
-                baseline_filter_level[i] = mbd->segment_feature_data[MB_LVL_ALT_LF][i];
-            /* Delta Value */
-            else
-            {
-                baseline_filter_level[i] = default_filt_lvl + mbd->segment_feature_data[MB_LVL_ALT_LF][i];
-                baseline_filter_level[i] = (baseline_filter_level[i] >= 0) ? ((baseline_filter_level[i] <= MAX_LOOP_FILTER) ? baseline_filter_level[i] : MAX_LOOP_FILTER) : 0;  /* Clamp to valid range */
-            }
-        }
-    }
-    else
-    {
-        for (i = 0; i < MAX_MB_SEGMENTS; i++)
-            baseline_filter_level[i] = default_filt_lvl;
-    }
+#if 0
+    if(default_filt_lvl == 0) /* no filter applied */
+        return;
+#endif

    /* Initialize the loop filter for this frame. */
-    if ((cm->last_filter_type != cm->filter_type) || (cm->last_sharpness_level != cm->sharpness_level))
-        vp8_init_loop_filter(cm);
-    else if (frame_type != cm->last_frame_type)
-        vp8_frame_init_loop_filter(lfi, frame_type);
+    vp8_loop_filter_frame_init( cm, mbd, default_filt_lvl);

    /* Set up the buffer pointers */
    y_ptr = post->y_buffer;
@@ -469,72 +425,106 @@ void vp8_loop_filter_frame_yonly
    {
        for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
        {
-            int Segment = (alt_flt_enabled) ? mbd->mode_info_context->mbmi.segment_id : 0;
-            filter_level = baseline_filter_level[Segment];
+            int skip_lf = (mode_info_context->mbmi.mode != B_PRED &&
+                            mode_info_context->mbmi.mode != SPLITMV &&
+                            mode_info_context->mbmi.mb_skip_coeff);

-            /* Apply any context driven MB level adjustment */
-            vp8_adjust_mb_lf_value(mbd, &filter_level);
+            const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode];
+            const int seg = mode_info_context->mbmi.segment_id;
+            const int ref_frame = mode_info_context->mbmi.ref_frame;
+
+            filter_level = lfi_n->lvl[seg][ref_frame][mode_index];

            if (filter_level)
            {
-                if (mb_col > 0)
-                    cm->lf_mbv(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level], 0);
+                if (cm->filter_type == NORMAL_LOOPFILTER)
+                {
+                    const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level];
+                    lfi.mblim = lfi_n->mblim[filter_level];
+                    lfi.blim = lfi_n->blim[filter_level];
+                    lfi.lim = lfi_n->lim[filter_level];
+                    lfi.hev_thr = lfi_n->hev_thr[hev_index];

-                if (mbd->mode_info_context->mbmi.dc_diff > 0)
-                    cm->lf_bv(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level], 0);
+                    if (mb_col > 0)
+                        LF_INVOKE(&cm->rtcd.loopfilter, normal_mb_v)
+                        (y_ptr, 0, 0, post->y_stride, 0, &lfi);
+
+                    if (!skip_lf)
+                        LF_INVOKE(&cm->rtcd.loopfilter, normal_b_v)
+                        (y_ptr, 0, 0, post->y_stride, 0, &lfi);

                    /* don't apply across umv border */
                    if (mb_row > 0)
-                    cm->lf_mbh(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level], 0);
+                        LF_INVOKE(&cm->rtcd.loopfilter, normal_mb_h)
+                        (y_ptr, 0, 0, post->y_stride, 0, &lfi);

-                if (mbd->mode_info_context->mbmi.dc_diff > 0)
-                    cm->lf_bh(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level], 0);
+                    if (!skip_lf)
+                        LF_INVOKE(&cm->rtcd.loopfilter, normal_b_h)
+                        (y_ptr, 0, 0, post->y_stride, 0, &lfi);
+                }
+                else
+                {
+                    if (mb_col > 0)
+                        LF_INVOKE(&cm->rtcd.loopfilter, simple_mb_v)
+                        (y_ptr, post->y_stride, lfi_n->mblim[filter_level]);
+
+                    if (!skip_lf)
+                        LF_INVOKE(&cm->rtcd.loopfilter, simple_b_v)
+                        (y_ptr, post->y_stride, lfi_n->blim[filter_level]);
+
+                    /* don't apply across umv border */
+                    if (mb_row > 0)
+                        LF_INVOKE(&cm->rtcd.loopfilter, simple_mb_h)
+                        (y_ptr, post->y_stride, lfi_n->mblim[filter_level]);
+
+                    if (!skip_lf)
+                        LF_INVOKE(&cm->rtcd.loopfilter, simple_b_h)
+                        (y_ptr, post->y_stride, lfi_n->blim[filter_level]);
+                }
            }

            y_ptr += 16;
-            mbd->mode_info_context ++;        /* step to next MB */
+            mode_info_context ++;        /* step to next MB */

        }

        y_ptr += post->y_stride  * 16 - post->y_width;
-        mbd->mode_info_context ++;            /* Skip border mb */
+        mode_info_context ++;            /* Skip border mb */
    }

 }

-
 void vp8_loop_filter_partial_frame
 (
    VP8_COMMON *cm,
    MACROBLOCKD *mbd,
-    int default_filt_lvl,
-    int sharpness_lvl,
-    int Fraction
+    int default_filt_lvl
 )
 {
    YV12_BUFFER_CONFIG *post = cm->frame_to_show;

-    int i;
    unsigned char *y_ptr;
    int mb_row;
    int mb_col;
-    /*int mb_rows = post->y_height >> 4;*/
    int mb_cols = post->y_width  >> 4;

-    int linestocopy;
+    int linestocopy, i;
+
+    loop_filter_info_n *lfi_n = &cm->lf_info;
+    loop_filter_info lfi;

-    loop_filter_info *lfi = cm->lf_info;
-    int baseline_filter_level[MAX_MB_SEGMENTS];
    int filter_level;
    int alt_flt_enabled = mbd->segmentation_enabled;
    FRAME_TYPE frame_type = cm->frame_type;

-    (void) sharpness_lvl;
+    const MODE_INFO *mode_info_context;

-    /*MODE_INFO * this_mb_mode_info = cm->mi + (post->y_height>>5) * (mb_cols + 1);*/ /* Point at base of Mb MODE_INFO list */
-    mbd->mode_info_context = cm->mi + (post->y_height >> 5) * (mb_cols + 1);        /* Point at base of Mb MODE_INFO list */
+    int lvl_seg[MAX_MB_SEGMENTS];

-    linestocopy = (post->y_height >> (4 + Fraction));
+    mode_info_context = cm->mi + (post->y_height >> 5) * (mb_cols + 1);
+
+    /* 3 is a magic number. 4 is probably magic too */
+    linestocopy = (post->y_height >> (4 + 3));

    if (linestocopy < 1)
        linestocopy = 1;
@@ -542,32 +532,27 @@ void vp8_loop_filter_partial_frame
    linestocopy <<= 4;

    /* Note the baseline filter values for each segment */
+    /* See vp8_loop_filter_frame_init. Rather than call that for each change
+     * to default_filt_lvl, copy the relevant calculation here.
+     */
    if (alt_flt_enabled)
    {
        for (i = 0; i < MAX_MB_SEGMENTS; i++)
-        {
-            /* Abs value */
+        {    /* Abs value */
            if (mbd->mb_segement_abs_delta == SEGMENT_ABSDATA)
-                baseline_filter_level[i] = mbd->segment_feature_data[MB_LVL_ALT_LF][i];
+            {
+                lvl_seg[i] = mbd->segment_feature_data[MB_LVL_ALT_LF][i];
+            }
            /* Delta Value */
            else
            {
-                baseline_filter_level[i] = default_filt_lvl + mbd->segment_feature_data[MB_LVL_ALT_LF][i];
-                baseline_filter_level[i] = (baseline_filter_level[i] >= 0) ? ((baseline_filter_level[i] <= MAX_LOOP_FILTER) ? baseline_filter_level[i] : MAX_LOOP_FILTER) : 0;  /* Clamp to valid range */
+                lvl_seg[i] = default_filt_lvl
+                        + mbd->segment_feature_data[MB_LVL_ALT_LF][i];
+                lvl_seg[i] = (lvl_seg[i] > 0) ?
+                        ((lvl_seg[i] > 63) ? 63: lvl_seg[i]) : 0;
            }
        }
    }
-    else
-    {
-        for (i = 0; i < MAX_MB_SEGMENTS; i++)
-            baseline_filter_level[i] = default_filt_lvl;
-    }
-
-    /* Initialize the loop filter for this frame. */
-    if ((cm->last_filter_type != cm->filter_type) || (cm->last_sharpness_level != cm->sharpness_level))
-        vp8_init_loop_filter(cm);
-    else if (frame_type != cm->last_frame_type)
-        vp8_frame_init_loop_filter(lfi, frame_type);

    /* Set up the buffer pointers */
    y_ptr = post->y_buffer + (post->y_height >> 5) * 16 * post->y_stride;
@@ -577,28 +562,64 @@ void vp8_loop_filter_partial_frame
    {
        for (mb_col = 0; mb_col < mb_cols; mb_col++)
        {
-            int Segment = (alt_flt_enabled) ? mbd->mode_info_context->mbmi.segment_id : 0;
-            filter_level = baseline_filter_level[Segment];
+            int skip_lf = (mode_info_context->mbmi.mode != B_PRED &&
+                           mode_info_context->mbmi.mode != SPLITMV &&
+                           mode_info_context->mbmi.mb_skip_coeff);
+
+            if (alt_flt_enabled)
+                filter_level = lvl_seg[mode_info_context->mbmi.segment_id];
+            else
+                filter_level = default_filt_lvl;

            if (filter_level)
            {
+                if (cm->filter_type == NORMAL_LOOPFILTER)
+                {
+                    const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level];
+                    lfi.mblim = lfi_n->mblim[filter_level];
+                    lfi.blim = lfi_n->blim[filter_level];
+                    lfi.lim = lfi_n->lim[filter_level];
+                    lfi.hev_thr = lfi_n->hev_thr[hev_index];
+
                    if (mb_col > 0)
-                    cm->lf_mbv(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level], 0);
+                        LF_INVOKE(&cm->rtcd.loopfilter, normal_mb_v)
+                        (y_ptr, 0, 0, post->y_stride, 0, &lfi);

-                if (mbd->mode_info_context->mbmi.dc_diff > 0)
-                    cm->lf_bv(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level], 0);
+                    if (!skip_lf)
+                        LF_INVOKE(&cm->rtcd.loopfilter, normal_b_v)
+                        (y_ptr, 0, 0, post->y_stride, 0, &lfi);

-                cm->lf_mbh(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level], 0);
+                    LF_INVOKE(&cm->rtcd.loopfilter, normal_mb_h)
+                        (y_ptr, 0, 0, post->y_stride, 0, &lfi);

-                if (mbd->mode_info_context->mbmi.dc_diff > 0)
-                    cm->lf_bh(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level], 0);
+                    if (!skip_lf)
+                        LF_INVOKE(&cm->rtcd.loopfilter, normal_b_h)
+                        (y_ptr, 0, 0, post->y_stride, 0, &lfi);
+                }
+                else
+                {
+                    if (mb_col > 0)
+                        LF_INVOKE(&cm->rtcd.loopfilter, simple_mb_v)
+                        (y_ptr, post->y_stride, lfi_n->mblim[filter_level]);
+
+                    if (!skip_lf)
+                        LF_INVOKE(&cm->rtcd.loopfilter, simple_b_v)
+                        (y_ptr, post->y_stride, lfi_n->blim[filter_level]);
+
+                    LF_INVOKE(&cm->rtcd.loopfilter, simple_mb_h)
+                        (y_ptr, post->y_stride, lfi_n->mblim[filter_level]);
+
+                    if (!skip_lf)
+                        LF_INVOKE(&cm->rtcd.loopfilter, simple_b_h)
+                        (y_ptr, post->y_stride, lfi_n->blim[filter_level]);
+                }
            }

            y_ptr += 16;
-            mbd->mode_info_context += 1;      /* step to next MB */
+            mode_info_context += 1;      /* step to next MB */
        }

        y_ptr += post->y_stride  * 16 - post->y_width;
-        mbd->mode_info_context += 1;          /* Skip border mb */
+        mode_info_context += 1;          /* Skip border mb */
    }
 }
--- a/vp8/common/loopfilter.h
+++ b/vp8/common/loopfilter.h
@@ -13,6 +13,7 @@
 #define loopfilter_h

 #include "vpx_ports/mem.h"
+#include "vpx_config.h"

 #define MAX_LOOP_FILTER 63

@@ -22,32 +23,45 @@ typedef enum
    SIMPLE_LOOPFILTER = 1
 } LOOPFILTERTYPE;

-/* FRK
- * Need to align this structure so when it is declared and
+#if ARCH_ARM
+#define SIMD_WIDTH 1
+#else
+#define SIMD_WIDTH 16
+#endif
+
+/* Need to align this structure so when it is declared and
 * passed it can be loaded into vector registers.
 */
 typedef struct
 {
-    DECLARE_ALIGNED(16, signed char, lim[16]);
-    DECLARE_ALIGNED(16, signed char, flim[16]);
-    DECLARE_ALIGNED(16, signed char, thr[16]);
-    DECLARE_ALIGNED(16, signed char, mbflim[16]);
-    DECLARE_ALIGNED(16, signed char, mbthr[16]);
-    DECLARE_ALIGNED(16, signed char, uvlim[16]);
-    DECLARE_ALIGNED(16, signed char, uvflim[16]);
-    DECLARE_ALIGNED(16, signed char, uvthr[16]);
-    DECLARE_ALIGNED(16, signed char, uvmbflim[16]);
-    DECLARE_ALIGNED(16, signed char, uvmbthr[16]);
+    DECLARE_ALIGNED(SIMD_WIDTH, unsigned char, mblim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]);
+    DECLARE_ALIGNED(SIMD_WIDTH, unsigned char, blim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]);
+    DECLARE_ALIGNED(SIMD_WIDTH, unsigned char, lim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]);
+    DECLARE_ALIGNED(SIMD_WIDTH, unsigned char, hev_thr[4][SIMD_WIDTH]);
+    unsigned char lvl[4][4][4];
+    unsigned char hev_thr_lut[2][MAX_LOOP_FILTER + 1];
+    unsigned char mode_lf_lut[10];
+} loop_filter_info_n;
+
+typedef struct
+{
+    const unsigned char * mblim;
+    const unsigned char * blim;
+    const unsigned char * lim;
+    const unsigned char * hev_thr;
 } loop_filter_info;


 #define prototype_loopfilter(sym) \
-    void sym(unsigned char *src, int pitch, const signed char *flimit,\
-             const signed char *limit, const signed char *thresh, int count)
+    void sym(unsigned char *src, int pitch, const unsigned char *blimit,\
+             const unsigned char *limit, const unsigned char *thresh, int count)

 #define prototype_loopfilter_block(sym) \
-    void sym(unsigned char *y, unsigned char *u, unsigned char *v,\
-             int ystride, int uv_stride, loop_filter_info *lfi, int simpler)
+    void sym(unsigned char *y, unsigned char *u, unsigned char *v, \
+             int ystride, int uv_stride, loop_filter_info *lfi)
+
+#define prototype_simple_loopfilter(sym) \
+    void sym(unsigned char *y, int ystride, const unsigned char *blimit)

 #if ARCH_X86 || ARCH_X86_64
 #include "x86/loopfilter_x86.h"
@@ -77,38 +91,39 @@ extern prototype_loopfilter_block(vp8_lf_normal_mb_h);
 #endif
 extern prototype_loopfilter_block(vp8_lf_normal_b_h);

-
 #ifndef vp8_lf_simple_mb_v
-#define vp8_lf_simple_mb_v vp8_loop_filter_mbvs_c
+#define vp8_lf_simple_mb_v vp8_loop_filter_simple_vertical_edge_c
 #endif
-extern prototype_loopfilter_block(vp8_lf_simple_mb_v);
+extern prototype_simple_loopfilter(vp8_lf_simple_mb_v);

 #ifndef vp8_lf_simple_b_v
 #define vp8_lf_simple_b_v vp8_loop_filter_bvs_c
 #endif
-extern prototype_loopfilter_block(vp8_lf_simple_b_v);
+extern prototype_simple_loopfilter(vp8_lf_simple_b_v);

 #ifndef vp8_lf_simple_mb_h
-#define vp8_lf_simple_mb_h vp8_loop_filter_mbhs_c
+#define vp8_lf_simple_mb_h vp8_loop_filter_simple_horizontal_edge_c
 #endif
-extern prototype_loopfilter_block(vp8_lf_simple_mb_h);
+extern prototype_simple_loopfilter(vp8_lf_simple_mb_h);

 #ifndef vp8_lf_simple_b_h
 #define vp8_lf_simple_b_h vp8_loop_filter_bhs_c
 #endif
-extern prototype_loopfilter_block(vp8_lf_simple_b_h);
+extern prototype_simple_loopfilter(vp8_lf_simple_b_h);

 typedef prototype_loopfilter_block((*vp8_lf_block_fn_t));
+typedef prototype_simple_loopfilter((*vp8_slf_block_fn_t));
+
 typedef struct
 {
    vp8_lf_block_fn_t  normal_mb_v;
    vp8_lf_block_fn_t  normal_b_v;
    vp8_lf_block_fn_t  normal_mb_h;
    vp8_lf_block_fn_t  normal_b_h;
-    vp8_lf_block_fn_t  simple_mb_v;
-    vp8_lf_block_fn_t  simple_b_v;
-    vp8_lf_block_fn_t  simple_mb_h;
-    vp8_lf_block_fn_t  simple_b_h;
+    vp8_slf_block_fn_t  simple_mb_v;
+    vp8_slf_block_fn_t  simple_b_v;
+    vp8_slf_block_fn_t  simple_mb_h;
+    vp8_slf_block_fn_t  simple_b_h;
 } vp8_loopfilter_rtcd_vtable_t;

 #if CONFIG_RUNTIME_CPU_DETECT
@@ -121,10 +136,33 @@ typedef void loop_filter_uvfunction
 (
    unsigned char *u,   /* source pointer */
    int p,              /* pitch */
-    const signed char *flimit,
-    const signed char *limit,
-    const signed char *thresh,
+    const unsigned char *blimit,
+    const unsigned char *limit,
+    const unsigned char *thresh,
    unsigned char *v
 );

+/* assorted loopfilter functions which get used elsewhere */
+struct VP8Common;
+struct MacroBlockD;
+
+void vp8_loop_filter_init(struct VP8Common *cm);
+
+void vp8_loop_filter_frame_init(struct VP8Common *cm,
+                                struct MacroBlockD *mbd,
+                                int default_filt_lvl);
+
+void vp8_loop_filter_frame(struct VP8Common *cm, struct MacroBlockD *mbd);
+
+void vp8_loop_filter_partial_frame(struct VP8Common *cm,
+                                   struct MacroBlockD *mbd,
+                                   int default_filt_lvl);
+
+void vp8_loop_filter_frame_yonly(struct VP8Common *cm,
+                                 struct MacroBlockD *mbd,
+                                 int default_filt_lvl);
+
+void vp8_loop_filter_update_sharpness(loop_filter_info_n *lfi,
+                                      int sharpness_lvl);
+
 #endif
--- a/vp8/common/loopfilter_filters.c
+++ b/vp8/common/loopfilter_filters.c
@@ -24,8 +24,9 @@ static __inline signed char vp8_signed_char_clamp(int t)


 /* should we apply any filter at all ( 11111111 yes, 00000000 no) */
-static __inline signed char vp8_filter_mask(signed char limit, signed char flimit,
-                                     uc p3, uc p2, uc p1, uc p0, uc q0, uc q1, uc q2, uc q3)
+static __inline signed char vp8_filter_mask(uc limit, uc blimit,
+                                     uc p3, uc p2, uc p1, uc p0,
+                                     uc q0, uc q1, uc q2, uc q3)
 {
    signed char mask = 0;
    mask |= (abs(p3 - p2) > limit) * -1;
@@ -34,13 +35,13 @@ static __inline signed char vp8_filter_mask(signed char limit, signed char flimi
    mask |= (abs(q1 - q0) > limit) * -1;
    mask |= (abs(q2 - q1) > limit) * -1;
    mask |= (abs(q3 - q2) > limit) * -1;
-    mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > flimit * 2 + limit) * -1;
+    mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
    mask = ~mask;
    return mask;
 }

 /* is there high variance internal edge ( 11111111 yes, 00000000 no) */
-static __inline signed char vp8_hevmask(signed char thresh, uc p1, uc p0, uc q0, uc q1)
+static __inline signed char vp8_hevmask(uc thresh, uc p1, uc p0, uc q0, uc q1)
 {
    signed char hev = 0;
    hev  |= (abs(p1 - p0) > thresh) * -1;
@@ -48,7 +49,8 @@ static __inline signed char vp8_hevmask(signed char thresh, uc p1, uc p0, uc q0,
    return hev;
 }

-static __inline void vp8_filter(signed char mask, signed char hev, uc *op1, uc *op0, uc *oq0, uc *oq1)
+static __inline void vp8_filter(signed char mask, uc hev, uc *op1,
+        uc *op0, uc *oq0, uc *oq1)

 {
    signed char ps0, qs0;
@@ -98,9 +100,9 @@ void vp8_loop_filter_horizontal_edge_c
 (
    unsigned char *s,
    int p, /* pitch */
-    const signed char *flimit,
-    const signed char *limit,
-    const signed char *thresh,
+    const unsigned char *blimit,
+    const unsigned char *limit,
+    const unsigned char *thresh,
    int count
 )
 {
@@ -113,11 +115,11 @@ void vp8_loop_filter_horizontal_edge_c
     */
    do
    {
-        mask = vp8_filter_mask(limit[i], flimit[i],
+        mask = vp8_filter_mask(limit[0], blimit[0],
                               s[-4*p], s[-3*p], s[-2*p], s[-1*p],
                               s[0*p], s[1*p], s[2*p], s[3*p]);

-        hev = vp8_hevmask(thresh[i], s[-2*p], s[-1*p], s[0*p], s[1*p]);
+        hev = vp8_hevmask(thresh[0], s[-2*p], s[-1*p], s[0*p], s[1*p]);

        vp8_filter(mask, hev, s - 2 * p, s - 1 * p, s, s + 1 * p);

@@ -130,9 +132,9 @@ void vp8_loop_filter_vertical_edge_c
 (
    unsigned char *s,
    int p,
-    const signed char *flimit,
-    const signed char *limit,
-    const signed char *thresh,
+    const unsigned char *blimit,
+    const unsigned char *limit,
+    const unsigned char *thresh,
    int count
 )
 {
@@ -145,10 +147,10 @@ void vp8_loop_filter_vertical_edge_c
     */
    do
    {
-        mask = vp8_filter_mask(limit[i], flimit[i],
+        mask = vp8_filter_mask(limit[0], blimit[0],
                               s[-4], s[-3], s[-2], s[-1], s[0], s[1], s[2], s[3]);

-        hev = vp8_hevmask(thresh[i], s[-2], s[-1], s[0], s[1]);
+        hev = vp8_hevmask(thresh[0], s[-2], s[-1], s[0], s[1]);

        vp8_filter(mask, hev, s - 2, s - 1, s, s + 1);

@@ -157,7 +159,7 @@ void vp8_loop_filter_vertical_edge_c
    while (++i < count * 8);
 }

-static __inline void vp8_mbfilter(signed char mask, signed char hev,
+static __inline void vp8_mbfilter(signed char mask, uc hev,
                           uc *op2, uc *op1, uc *op0, uc *oq0, uc *oq1, uc *oq2)
 {
    signed char s, u;
@@ -216,9 +218,9 @@ void vp8_mbloop_filter_horizontal_edge_c
 (
    unsigned char *s,
    int p,
-    const signed char *flimit,
-    const signed char *limit,
-    const signed char *thresh,
+    const unsigned char *blimit,
+    const unsigned char *limit,
+    const unsigned char *thresh,
    int count
 )
 {
@@ -232,11 +234,11 @@ void vp8_mbloop_filter_horizontal_edge_c
    do
    {

-        mask = vp8_filter_mask(limit[i], flimit[i],
+        mask = vp8_filter_mask(limit[0], blimit[0],
                               s[-4*p], s[-3*p], s[-2*p], s[-1*p],
                               s[0*p], s[1*p], s[2*p], s[3*p]);

-        hev = vp8_hevmask(thresh[i], s[-2*p], s[-1*p], s[0*p], s[1*p]);
+        hev = vp8_hevmask(thresh[0], s[-2*p], s[-1*p], s[0*p], s[1*p]);

        vp8_mbfilter(mask, hev, s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p, s + 2 * p);

@@ -251,9 +253,9 @@ void vp8_mbloop_filter_vertical_edge_c
 (
    unsigned char *s,
    int p,
-    const signed char *flimit,
-    const signed char *limit,
-    const signed char *thresh,
+    const unsigned char *blimit,
+    const unsigned char *limit,
+    const unsigned char *thresh,
    int count
 )
 {
@@ -264,10 +266,10 @@ void vp8_mbloop_filter_vertical_edge_c
    do
    {

-        mask = vp8_filter_mask(limit[i], flimit[i],
+        mask = vp8_filter_mask(limit[0], blimit[0],
                               s[-4], s[-3], s[-2], s[-1], s[0], s[1], s[2], s[3]);

-        hev = vp8_hevmask(thresh[i], s[-2], s[-1], s[0], s[1]);
+        hev = vp8_hevmask(thresh[0], s[-2], s[-1], s[0], s[1]);

        vp8_mbfilter(mask, hev, s - 3, s - 2, s - 1, s, s + 1, s + 2);

@@ -278,13 +280,13 @@ void vp8_mbloop_filter_vertical_edge_c
 }

 /* should we apply any filter at all ( 11111111 yes, 00000000 no) */
-static __inline signed char vp8_simple_filter_mask(signed char limit, signed char flimit, uc p1, uc p0, uc q0, uc q1)
+static __inline signed char vp8_simple_filter_mask(uc blimit, uc p1, uc p0, uc q0, uc q1)
 {
 /* Why does this cause problems for win32?
 * error C2143: syntax error : missing ';' before 'type'
 *  (void) limit;
 */
-    signed char mask = (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  <= flimit * 2 + limit) * -1;
+    signed char mask = (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  <= blimit) * -1;
    return mask;
 }

@@ -317,47 +319,37 @@ void vp8_loop_filter_simple_horizontal_edge_c
 (
    unsigned char *s,
    int p,
-    const signed char *flimit,
-    const signed char *limit,
-    const signed char *thresh,
-    int count
+    const unsigned char *blimit
 )
 {
    signed char mask = 0;
    int i = 0;
-    (void) thresh;

    do
    {
-        /*mask = vp8_simple_filter_mask( limit[i], flimit[i],s[-1*p],s[0*p]);*/
-        mask = vp8_simple_filter_mask(limit[i], flimit[i], s[-2*p], s[-1*p], s[0*p], s[1*p]);
+        mask = vp8_simple_filter_mask(blimit[0], s[-2*p], s[-1*p], s[0*p], s[1*p]);
        vp8_simple_filter(mask, s - 2 * p, s - 1 * p, s, s + 1 * p);
        ++s;
    }
-    while (++i < count * 8);
+    while (++i < 16);
 }

 void vp8_loop_filter_simple_vertical_edge_c
 (
    unsigned char *s,
    int p,
-    const signed char *flimit,
-    const signed char *limit,
-    const signed char *thresh,
-    int count
+    const unsigned char *blimit
 )
 {
    signed char mask = 0;
    int i = 0;
-    (void) thresh;

    do
    {
-        /*mask = vp8_simple_filter_mask( limit[i], flimit[i],s[-1],s[0]);*/
-        mask = vp8_simple_filter_mask(limit[i], flimit[i], s[-2], s[-1], s[0], s[1]);
+        mask = vp8_simple_filter_mask(blimit[0], s[-2], s[-1], s[0], s[1]);
        vp8_simple_filter(mask, s - 2, s - 1, s, s + 1);
        s += p;
    }
-    while (++i < count * 8);
+    while (++i < 16);

 }
--- a/vp8/common/mbpitch.c
+++ b/vp8/common/mbpitch.c
@@ -17,7 +17,7 @@ typedef enum
    DEST = 1
 } BLOCKSET;

-void vp8_setup_block
+static void setup_block
 (
    BLOCKD *b,
    int mv_stride,
@@ -43,7 +43,8 @@ void vp8_setup_block

 }

-void vp8_setup_macroblock(MACROBLOCKD *x, BLOCKSET bs)
+
+static void setup_macroblock(MACROBLOCKD *x, BLOCKSET bs)
 {
    int block;

@@ -64,16 +65,16 @@ void vp8_setup_macroblock(MACROBLOCKD *x, BLOCKSET bs)

    for (block = 0; block < 16; block++) /* y blocks */
    {
-        vp8_setup_block(&x->block[block], x->dst.y_stride, y, x->dst.y_stride,
+        setup_block(&x->block[block], x->dst.y_stride, y, x->dst.y_stride,
                        (block >> 2) * 4 * x->dst.y_stride + (block & 3) * 4, bs);
    }

    for (block = 16; block < 20; block++) /* U and V blocks */
    {
-        vp8_setup_block(&x->block[block], x->dst.uv_stride, u, x->dst.uv_stride,
+        setup_block(&x->block[block], x->dst.uv_stride, u, x->dst.uv_stride,
                        ((block - 16) >> 1) * 4 * x->dst.uv_stride + (block & 1) * 4, bs);

-        vp8_setup_block(&x->block[block+4], x->dst.uv_stride, v, x->dst.uv_stride,
+        setup_block(&x->block[block+4], x->dst.uv_stride, v, x->dst.uv_stride,
                        ((block - 16) >> 1) * 4 * x->dst.uv_stride + (block & 1) * 4, bs);
    }
 }
@@ -124,6 +125,6 @@ void vp8_build_block_doffsets(MACROBLOCKD *x)
 {

    /* handle the destination pitch features */
-    vp8_setup_macroblock(x, DEST);
-    vp8_setup_macroblock(x, PRED);
+    setup_macroblock(x, DEST);
+    setup_macroblock(x, PRED);
 }
--- a/vp8/common/mv.h
+++ b/vp8/common/mv.h
@@ -11,6 +11,7 @@

 #ifndef __INC_MV_H
 #define __INC_MV_H
+#include "vpx/vpx_integer.h"

 typedef struct
 {
@@ -18,4 +19,10 @@ typedef struct
    short col;
 } MV;

+typedef union
+{
+    uint32_t  as_int;
+    MV        as_mv;
+} int_mv;        /* facilitates faster equality tests and copies */
+
 #endif
--- a/Show More
+++ b/Show More