Merge "Add min_tx_size variable to recursive transform block partition system" into nextgenv2

Merge "Fix the bug that PVQ commit broke dering" into nextgenv2
Merge changes Ib9428dc9,Ide04717a,If1dba7d8,I6da97880 into nextgenv2
2016-11-08 19:14:33 +00:00 · 2016-11-08 18:00:53 +00:00 · 2016-11-08 17:42:04 +00:00 · 2016-11-08 09:36:54 -08:00 · 2016-11-08 08:15:57 -08:00 · 2016-11-07 21:11:31 -08:00
1506 changed files with 259029 additions and 271987 deletions
--- a/.clang-format
+++ b/.clang-format
@@ -1,12 +1,12 @@
 ---
 Language:        Cpp
 # BasedOnStyle:  Google
-# Generated with clang-format 5.0.0
+# Generated with clang-format 3.8.1
 AccessModifierOffset: -1
 AlignAfterOpenBracket: Align
 AlignConsecutiveAssignments: false
 AlignConsecutiveDeclarations: false
-AlignEscapedNewlines: Left
+AlignEscapedNewlinesLeft: true
 AlignOperands:   true
 AlignTrailingComments: true
 AllowAllParametersOfDeclarationOnNextLine: true
@@ -33,20 +33,12 @@ BraceWrapping:
  BeforeCatch:     false
  BeforeElse:      false
  IndentBraces:    false
-  SplitEmptyFunction: true
-  SplitEmptyRecord: true
-  SplitEmptyNamespace: true
 BreakBeforeBinaryOperators: None
 BreakBeforeBraces: Attach
-BreakBeforeInheritanceComma: false
 BreakBeforeTernaryOperators: true
 BreakConstructorInitializersBeforeComma: false
-BreakConstructorInitializers: BeforeColon
-BreakAfterJavaFieldAnnotations: false
-BreakStringLiterals: true
 ColumnLimit:     80
 CommentPragmas:  '^ IWYU pragma:'
-CompactNamespaces: false
 ConstructorInitializerAllOnOneLineOrOnePerLine: false
 ConstructorInitializerIndentWidth: 4
 ContinuationIndentWidth: 4
@@ -54,11 +46,7 @@ Cpp11BracedListStyle: false
 DerivePointerAlignment: false
 DisableFormat:   false
 ExperimentalAutoDetectBinPacking: false
-FixNamespaceComments: true
-ForEachMacros:
-  - foreach
-  - Q_FOREACH
-  - BOOST_FOREACH
+ForEachMacros:   [ foreach, Q_FOREACH, BOOST_FOREACH ]
 IncludeCategories:
  - Regex:           '^<.*\.h>'
    Priority:        1
@@ -66,12 +54,9 @@ IncludeCategories:
    Priority:        2
  - Regex:           '.*'
    Priority:        3
-IncludeIsMainRegex: '([-_](test|unittest))?$'
 IndentCaseLabels: true
 IndentWidth:     2
 IndentWrappedFunctionNames: false
-JavaScriptQuotes: Leave
-JavaScriptWrapImports: true
 KeepEmptyLinesAtTheStartOfBlocks: false
 MacroBlockBegin: ''
 MacroBlockEnd:   ''
@@ -80,7 +65,6 @@ NamespaceIndentation: None
 ObjCBlockIndentWidth: 2
 ObjCSpaceAfterProperty: false
 ObjCSpaceBeforeProtocolList: false
-PenaltyBreakAssignment: 2
 PenaltyBreakBeforeFirstCallParameter: 1
 PenaltyBreakComment: 300
 PenaltyBreakFirstLessLess: 120
@@ -90,9 +74,7 @@ PenaltyReturnTypeOnItsOwnLine: 200
 PointerAlignment: Right
 ReflowComments:  true
 SortIncludes:    false
-SortUsingDeclarations: true
 SpaceAfterCStyleCast: false
-SpaceAfterTemplateKeyword: true
 SpaceBeforeAssignmentOperators: true
 SpaceBeforeParens: ControlStatements
 SpaceInEmptyParentheses: false
--- a/.gitignore
+++ b/.gitignore
@@ -29,40 +29,36 @@
 /examples/decode_with_drops
 /examples/decode_with_partial_drops
 /examples/example_xma
+/examples/lossless_encoder
 /examples/postproc
 /examples/resize_util
 /examples/set_maps
 /examples/simple_decoder
 /examples/simple_encoder
 /examples/twopass_encoder
-/examples/vp8_multi_resolution_encoder
-/examples/vp8cx_set_ref
-/examples/vp9cx_set_ref
-/examples/vp9_lossless_encoder
-/examples/vp9_spatial_svc_encoder
-/examples/vpx_temporal_svc_encoder
+/examples/aom_cx_set_ref
+/examples/av1_spatial_scalable_encoder
+/examples/aom_temporal_scalable_patterns
+/examples/aom_temporal_svc_encoder
 /ivfdec
 /ivfdec.dox
 /ivfenc
 /ivfenc.dox
-/libvpx.so*
-/libvpx.ver
+/libaom.so*
+/libaom.ver
 /samples.dox
 /test_intra_pred_speed
-/test_libvpx
-/tools.dox
-/tools/*.dox
-/tools/tiny_ssim
-/vp8_api1_migration.dox
-/vp[89x]_rtcd.h
-/vpx.pc
-/vpx_config.c
-/vpx_config.h
-/vpx_dsp_rtcd.h
-/vpx_scale_rtcd.h
-/vpx_version.h
-/vpxdec
-/vpxdec.dox
-/vpxenc
-/vpxenc.dox
+/test_libaom
+/aom_api1_migration.dox
+/av1_rtcd.h
+/aom.pc
+/aom_config.c
+/aom_config.h
+/aom_dsp_rtcd.h
+/aom_scale_rtcd.h
+/aom_version.h
+/aomdec
+/aomdec.dox
+/aomenc
+/aomenc.dox
 TAGS
--- a/.mailmap
+++ b/.mailmap
@@ -3,8 +3,6 @@ Aℓex Converse <aconverse@google.com>
 Aℓex Converse <aconverse@google.com> <alex.converse@gmail.com>
 Alexis Ballier <aballier@gentoo.org> <alexis.ballier@gmail.com>
 Alpha Lam <hclam@google.com> <hclam@chromium.org>
-Chris Cunningham <chcunningham@chromium.org>
-Daniele Castagna <dcastagna@chromium.org> <dcastagna@google.com>
 Deb Mukherjee <debargha@google.com>
 Erik Niemeyer <erik.a.niemeyer@intel.com> <erik.a.niemeyer@gmail.com>
 Guillaume Martres <gmartres@google.com> <smarter3@gmail.com>
@@ -15,28 +13,20 @@ Jim Bankoski <jimbankoski@google.com>
 Johann Koenig <johannkoenig@google.com>
 Johann Koenig <johannkoenig@google.com> <johann.koenig@duck.com>
 Johann Koenig <johannkoenig@google.com> <johann.koenig@gmail.com>
-Johann Koenig <johannkoenig@google.com> <johannkoenig@chromium.org>
 John Koleszar <jkoleszar@google.com>
 Joshua Litt <joshualitt@google.com> <joshualitt@chromium.org>
 Marco Paniconi <marpan@google.com>
 Marco Paniconi <marpan@google.com> <marpan@chromium.org>
 Pascal Massimino <pascal.massimino@gmail.com>
 Paul Wilkins <paulwilkins@google.com>
-Peter Boström <pbos@chromium.org> <pbos@google.com>
-Peter de Rivaz <peter.derivaz@gmail.com>
-Peter de Rivaz <peter.derivaz@gmail.com> <peter.derivaz@argondesign.com>
 Ralph Giles <giles@xiph.org> <giles@entropywave.com>
 Ralph Giles <giles@xiph.org> <giles@mozilla.com>
 Ronald S. Bultje <rsbultje@gmail.com> <rbultje@google.com>
 Sami Pietilä <samipietila@google.com>
-Shiyou Yin <yinshiyou-hf@loongson.cn>
 Tamar Levy <tamar.levy@intel.com>
 Tamar Levy <tamar.levy@intel.com> <levytamar82@gmail.com>
 Tero Rintaluoma <teror@google.com> <tero.rintaluoma@on2.com>
-Timothy B. Terriberry <tterribe@xiph.org> <tterriberry@mozilla.com>
+Timothy B. Terriberry <tterribe@xiph.org> Tim Terriberry <tterriberry@mozilla.com>
 Tom Finegan <tomfinegan@google.com>
 Tom Finegan <tomfinegan@google.com> <tomfinegan@chromium.org>
-Urvang Joshi <urvang@google.com> <urvang@chromium.org>
-Yaowu Xu <yaowu@google.com> <adam@xuyaowu.com>
 Yaowu Xu <yaowu@google.com> <yaowu@xuyaowu.com>
-Yaowu Xu <yaowu@google.com> <Yaowu Xu>
--- a/46
+++ b/46
@@ -3,13 +3,11 @@

 Aaron Watry <awatry@gmail.com>
 Abo Talib Mahfoodh <ab.mahfoodh@gmail.com>
+Adam Xu <adam@xuyaowu.com>
 Adrian Grange <agrange@google.com>
 Aℓex Converse <aconverse@google.com>
 Ahmad Sharif <asharif@google.com>
-Aleksey Vasenev <margtu-fivt@ya.ru>
-Alexander Potapenko <glider@google.com>
 Alexander Voronov <avoronov@graphics.cs.msu.ru>
-Alexandra Hájková <alexandra.khirnova@gmail.com>
 Alexis Ballier <aballier@gentoo.org>
 Alok Ahuja <waveletcoeff@gmail.com>
 Alpha Lam <hclam@google.com>
@@ -17,7 +15,6 @@ A.Mahfoodh <ab.mahfoodh@gmail.com>
 Ami Fischman <fischman@chromium.org>
 Andoni Morales Alastruey <ylatuya@gmail.com>
 Andres Mejia <mcitadel@gmail.com>
-Andrew Lewis <andrewlewis@google.com>
 Andrew Russell <anrussell@google.com>
 Angie Chiang <angiebird@google.com>
 Aron Rosenberg <arosenberg@logitech.com>
@@ -25,14 +22,10 @@ Attila Nagy <attilanagy@google.com>
 Brion Vibber <bvibber@wikimedia.org>
 changjun.yang <changjun.yang@intel.com>
 Charles 'Buck' Krasic <ckrasic@google.com>
-Cheng Chen <chengchen@google.com>
 chm <chm@rock-chips.com>
-Chris Cunningham <chcunningham@chromium.org>
 Christian Duvivier <cduvivier@google.com>
-Daniele Castagna <dcastagna@chromium.org>
 Daniel Kang <ddkang@google.com>
 Deb Mukherjee <debargha@google.com>
-Deepa K G <deepa.kg@ittiam.com>
 Dim Temp <dimtemp0@gmail.com>
 Dmitry Kovalev <dkovalev@google.com>
 Dragan Mrdjan <dmrdjan@mips.com>
@@ -43,21 +36,17 @@ Fabio Pedretti <fabio.ped@libero.it>
 Frank Galligan <fgalligan@google.com>
 Fredrik Söderquist <fs@opera.com>
 Fritz Koenig <frkoenig@google.com>
-Gabriel Marin <gmx@chromium.org>
 Gaute Strokkenes <gaute.strokkenes@broadcom.com>
 Geza Lore <gezalore@gmail.com>
 Ghislain MARY <ghislainmary2@gmail.com>
 Giuseppe Scrivano <gscrivano@gnu.org>
 Gordana Cmiljanovic <gordana.cmiljanovic@imgtec.com>
-Gregor Jasny <gjasny@gmail.com>
 Guillaume Martres <gmartres@google.com>
 Guillermo Ballester Valor <gbvalor@gmail.com>
 Hangyu Kuang <hkuang@google.com>
 Hanno Böck <hanno@hboeck.de>
-Han Shen <shenhan@google.com>
 Henrik Lundin <hlundin@google.com>
 Hui Su <huisu@google.com>
-Ivan Krasin <krasin@chromium.org>
 Ivan Maltz <ivanmaltz@google.com>
 Jacek Caban <cjacek@gmail.com>
 Jacky Chen <jackychen@google.com>
@@ -67,16 +56,16 @@ James Zern <jzern@google.com>
 Jan Gerber <j@mailb.org>
 Jan Kratochvil <jan.kratochvil@redhat.com>
 Janne Salonen <jsalonen@google.com>
-Jean-Yves Avenard <jyavenard@mozilla.com>
+Jean-Marc Valin <jmvalin@jmvalin.ca>
 Jeff Faust <jfaust@google.com>
 Jeff Muizelaar <jmuizelaar@mozilla.com>
 Jeff Petkau <jpet@chromium.org>
-Jerome Jiang <jianj@google.com>
 Jia Jia <jia.jia@linaro.org>
 Jian Zhou <zhoujian@google.com>
 Jim Bankoski <jimbankoski@google.com>
 Jingning Han <jingning@google.com>
 Joey Parrish <joeyparrish@google.com>
+Johann Koenig <johannkoenig@chromium.org>
 Johann Koenig <johannkoenig@google.com>
 John Koleszar <jkoleszar@google.com>
 Johnny Klonaris <google@jawknee.com>
@@ -86,11 +75,8 @@ Joshua Litt <joshualitt@google.com>
 Julia Robson <juliamrobson@gmail.com>
 Justin Clift <justin@salasaga.org>
 Justin Lebar <justin.lebar@gmail.com>
-Kaustubh Raste <kaustubh.raste@imgtec.com>
 KO Myung-Hun <komh@chollian.net>
-Kyle Siefring <kylesiefring@gmail.com>
 Lawrence Velázquez <larryv@macports.org>
-Linfeng Zhang <linfengz@google.com>
 Lou Quillio <louquillio@google.com>
 Luca Barbato <lu_zero@gentoo.org>
 Makoto Kato <makoto.kt@gmail.com>
@@ -104,12 +90,9 @@ Michael Kohler <michaelkohler@live.com>
 Mike Frysinger <vapier@chromium.org>
 Mike Hommey <mhommey@mozilla.com>
 Mikhal Shemer <mikhal@google.com>
-Min Chen <chenm003@gmail.com>
 Minghai Shang <minghai@google.com>
-Min Ye <yeemmi@google.com>
-Moriyoshi Koizumi <mozo@mozo.jp>
 Morton Jonuschat <yabawock@gmail.com>
-Nathan E. Egge <negge@mozilla.com>
+Nathan E. Egge <negge@dgql.org>
 Nico Weber <thakis@chromium.org>
 Parag Salasakar <img.mips1@gmail.com>
 Pascal Massimino <pascal.massimino@gmail.com>
@@ -118,22 +101,17 @@ Paul Wilkins <paulwilkins@google.com>
 Pavol Rusnak <stick@gk2.sk>
 Paweł Hajdan <phajdan@google.com>
 Pengchong Jin <pengchong@google.com>
-Peter Boström <pbos@chromium.org>
-Peter Collingbourne <pcc@chromium.org>
+Peter de Rivaz <peter.derivaz@argondesign.com>
 Peter de Rivaz <peter.derivaz@gmail.com>
 Philip Jägenstedt <philipj@opera.com>
 Priit Laes <plaes@plaes.org>
 Rafael Ávila de Espíndola <rafael.espindola@gmail.com>
 Rafaël Carré <funman@videolan.org>
-Rafael de Lucena Valle <rafaeldelucena@gmail.com>
-Rahul Chaudhry <rahulchaudhry@google.com>
 Ralph Giles <giles@xiph.org>
-Ranjit Kumar Tulabandu <ranjit.tulabandu@ittiam.com>
 Rob Bradford <rob@linux.intel.com>
 Ronald S. Bultje <rsbultje@gmail.com>
 Rui Ueyama <ruiu@google.com>
 Sami Pietilä <samipietila@google.com>
-Sarah Parker <sarahparker@google.com>
 Sasi Inguva <isasi@google.com>
 Scott Graham <scottmg@chromium.org>
 Scott LaVarnway <slavarnway@google.com>
@@ -141,30 +119,26 @@ Sean McGovern <gseanmcg@gmail.com>
 Sergey Kolomenkin <kolomenkin@gmail.com>
 Sergey Ulanov <sergeyu@chromium.org>
 Shimon Doodkin <helpmepro1@gmail.com>
-Shiyou Yin <yinshiyou-hf@loongson.cn>
 Shunyao Li <shunyaoli@google.com>
 Stefan Holmer <holmer@google.com>
+Steinar Midtskogen <stemidts@cisco.com>
 Suman Sunkara <sunkaras@google.com>
-Sylvestre Ledru <sylvestre@mozilla.com>
 Taekhyun Kim <takim@nvidia.com>
 Takanori MATSUURA <t.matsuu@gmail.com>
 Tamar Levy <tamar.levy@intel.com>
 Tao Bai <michaelbai@chromium.org>
 Tero Rintaluoma <teror@google.com>
 Thijs Vermeir <thijsvermeir@gmail.com>
+Thomas Daede <tdaede@mozilla.com>
+Thomas Davies <thdavies@cisco.com>
+Thomas <thdavies@cisco.com>
 Tim Kopp <tkopp@google.com>
 Timothy B. Terriberry <tterribe@xiph.org>
 Tom Finegan <tomfinegan@google.com>
 Tristan Matthews <le.businessman@gmail.com>
-Urvang Joshi <urvang@google.com>
+Tristan Matthews <tmatth@videolan.org>
 Vignesh Venkatasubramanian <vigneshv@google.com>
-Vlad Tsyrklevich <vtsyrklevich@chromium.org>
 Yaowu Xu <yaowu@google.com>
-Yi Luo <luoyi@google.com>
 Yongzhe Wang <yongzhe@google.com>
 Yunqing Wang <yunqingwang@google.com>
-Yury Gitman <yuryg@google.com>
 Zoe Liu <zoeliu@google.com>
-Google Inc.
-The Mozilla Foundation
-The Xiph.Org Foundation
--- a/75
+++ b/75
@@ -1,74 +1,9 @@
-2017-01-04 v1.7.0 "Mandarin Duck"
-  This release focused on high bit depth performance (10/12 bit) and vp9
-  encoding improvements.
-
-  - Upgrading:
-    This release is ABI incompatible due to new vp9 encoder features.
-
-    Frame parallel decoding for vp9 has been removed.
-
-  - Enhancements:
-    vp9 encoding supports additional threads with --row-mt. This can be greater
-    than the number of tiles.
-
-    Two new vp9 encoder options have been added:
-      --corpus-complexity
-      --tune-content=film
-
-    Additional tooling for respecting the vp9 "level" profiles has been added.
-
-  - Bug fixes:
-    A variety of fuzzing issues.
-    vp8 threading fix for ARM.
-    Codec control VP9_SET_SKIP_LOOP_FILTER fixed.
-    Reject invalid multi resolution configurations.
-
-2017-01-09 v1.6.1 "Long Tailed Duck"
-  This release improves upon the VP9 encoder and speeds up the encoding and
-  decoding processes.
-
-  - Upgrading:
-    This release is ABI compatible with 1.6.0.
-
-  - Enhancements:
-    Faster VP9 encoding and decoding.
-    High bit depth builds now provide similar speed for 8 bit encode and decode
-    for x86 targets. Other platforms and higher bit depth improvements are in
-    progress.
-
-  - Bug Fixes:
-    A variety of fuzzing issues.
-
-2016-07-20 v1.6.0 "Khaki Campbell Duck"
-  This release improves upon the VP9 encoder and speeds up the encoding and
-  decoding processes.
-
-  - Upgrading:
-    This release is ABI incompatible with 1.5.0 due to a new 'color_range' enum
-    in vpx_image and some minor changes to the VP8_COMP structure.
-
-    The default key frame interval for VP9 has changed from 128 to 9999.
-
-  - Enhancement:
-    A core focus has been performance for low end Intel processors. SSSE3
-    instructions such as 'pshufb' have been avoided and instructions have been
-    reordered to better accommodate the more constrained pipelines.
-
-    As a result, devices based on Celeron processors have seen substantial
-    decoding improvements. From Indian Runner Duck to Javan Whistling Duck,
-    decoding speed improved between 10 and 30%. Between Javan Whistling Duck
-    and Khaki Campbell Duck, it improved another 10 to 15%.
-
-    While Celeron benefited most, Core-i5 also improved 5% and 10% between the
-    respective releases.
-
-    Realtime performance for WebRTC for both speed and quality has received a
-    lot of attention.
-
-  - Bug Fixes:
-    A number of fuzzing issues, found variously by Mozilla, Chromium and others,
-    have been fixed and we strongly recommend updating.
+Next Release
+  - Incompatible changes:
+    The AV1 encoder's default keyframe interval changed to 128 from 9999.

+2016-04-07 v0.1.0 "AOMedia Codec 1"
+  This release is the first Alliance for Open Media codec.
 2015-11-09 v1.5.0 "Javan Whistling Duck"
  This release improves upon the VP9 encoder and speeds up the encoding and
  decoding processes.
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -0,0 +1,270 @@
+##
+## Copyright (c) 2016, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+##
+cmake_minimum_required(VERSION 3.2)
+project(AOM C CXX)
+
+set(AOM_ROOT "${CMAKE_CURRENT_SOURCE_DIR}")
+set(AOM_CONFIG_DIR "${CMAKE_CURRENT_BINARY_DIR}")
+include("${AOM_ROOT}/build/cmake/aom_configure.cmake")
+
+set(AOM_SRCS
+    "${AOM_CONFIG_DIR}/aom_config.c"
+    "${AOM_CONFIG_DIR}/aom_config.h"
+    "${AOM_ROOT}/aom/aom.h"
+    "${AOM_ROOT}/aom/aom_codec.h"
+    "${AOM_ROOT}/aom/aom_decoder.h"
+    "${AOM_ROOT}/aom/aom_encoder.h"
+    "${AOM_ROOT}/aom/aom_frame_buffer.h"
+    "${AOM_ROOT}/aom/aom_image.h"
+    "${AOM_ROOT}/aom/aom_integer.h"
+    "${AOM_ROOT}/aom/aomcx.h"
+    "${AOM_ROOT}/aom/aomdx.h"
+    "${AOM_ROOT}/aom/internal/aom_codec_internal.h"
+    "${AOM_ROOT}/aom/src/aom_codec.c"
+    "${AOM_ROOT}/aom/src/aom_decoder.c"
+    "${AOM_ROOT}/aom/src/aom_encoder.c"
+    "${AOM_ROOT}/aom/src/aom_image.c")
+
+set(AOM_DSP_SRCS
+    "${AOM_ROOT}/aom_dsp/aom_convolve.c"
+    "${AOM_ROOT}/aom_dsp/aom_convolve.h"
+    "${AOM_ROOT}/aom_dsp/aom_dsp_common.h"
+    "${AOM_ROOT}/aom_dsp/aom_dsp_rtcd.c"
+    "${AOM_ROOT}/aom_dsp/aom_filter.h"
+    "${AOM_ROOT}/aom_dsp/aom_simd.c"
+    "${AOM_ROOT}/aom_dsp/aom_simd.h"
+    "${AOM_ROOT}/aom_dsp/aom_simd_inline.h"
+    "${AOM_ROOT}/aom_dsp/avg.c"
+    "${AOM_ROOT}/aom_dsp/bitreader.h"
+    "${AOM_ROOT}/aom_dsp/bitreader_buffer.c"
+    "${AOM_ROOT}/aom_dsp/bitreader_buffer.h"
+    "${AOM_ROOT}/aom_dsp/bitwriter.h"
+    "${AOM_ROOT}/aom_dsp/bitwriter_buffer.c"
+    "${AOM_ROOT}/aom_dsp/bitwriter_buffer.h"
+    "${AOM_ROOT}/aom_dsp/blend.h"
+    "${AOM_ROOT}/aom_dsp/blend_a64_hmask.c"
+    "${AOM_ROOT}/aom_dsp/blend_a64_mask.c"
+    "${AOM_ROOT}/aom_dsp/blend_a64_vmask.c"
+    "${AOM_ROOT}/aom_dsp/dkboolreader.c"
+    "${AOM_ROOT}/aom_dsp/dkboolreader.h"
+    "${AOM_ROOT}/aom_dsp/dkboolwriter.c"
+    "${AOM_ROOT}/aom_dsp/dkboolwriter.h"
+    "${AOM_ROOT}/aom_dsp/fwd_txfm.c"
+    "${AOM_ROOT}/aom_dsp/fwd_txfm.h"
+    "${AOM_ROOT}/aom_dsp/intrapred.c"
+    "${AOM_ROOT}/aom_dsp/inv_txfm.c"
+    "${AOM_ROOT}/aom_dsp/inv_txfm.h"
+    "${AOM_ROOT}/aom_dsp/loopfilter.c"
+    "${AOM_ROOT}/aom_dsp/prob.c"
+    "${AOM_ROOT}/aom_dsp/prob.h"
+    "${AOM_ROOT}/aom_dsp/psnr.c"
+    "${AOM_ROOT}/aom_dsp/psnr.h"
+    "${AOM_ROOT}/aom_dsp/quantize.c"
+    "${AOM_ROOT}/aom_dsp/quantize.h"
+    "${AOM_ROOT}/aom_dsp/sad.c"
+    "${AOM_ROOT}/aom_dsp/simd/v128_intrinsics.h"
+    "${AOM_ROOT}/aom_dsp/simd/v128_intrinsics_c.h"
+    "${AOM_ROOT}/aom_dsp/simd/v256_intrinsics.h"
+    "${AOM_ROOT}/aom_dsp/simd/v256_intrinsics_c.h"
+    "${AOM_ROOT}/aom_dsp/simd/v64_intrinsics.h"
+    "${AOM_ROOT}/aom_dsp/simd/v64_intrinsics_c.h"
+    "${AOM_ROOT}/aom_dsp/subtract.c"
+    "${AOM_ROOT}/aom_dsp/txfm_common.h"
+    "${AOM_ROOT}/aom_dsp/variance.c"
+    "${AOM_ROOT}/aom_dsp/variance.h")
+
+set(AOM_MEM_SRCS
+    "${AOM_ROOT}/aom_mem/aom_mem.c"
+    "${AOM_ROOT}/aom_mem/aom_mem.h"
+    "${AOM_ROOT}/aom_mem/include/aom_mem_intrnl.h")
+
+set(AOM_SCALE_SRCS
+    "${AOM_ROOT}/aom_scale/aom_scale.h"
+    "${AOM_ROOT}/aom_scale/aom_scale_rtcd.c"
+    "${AOM_ROOT}/aom_scale/generic/aom_scale.c"
+    "${AOM_ROOT}/aom_scale/generic/gen_scalers.c"
+    "${AOM_ROOT}/aom_scale/generic/yv12config.c"
+    "${AOM_ROOT}/aom_scale/generic/yv12extend.c"
+    "${AOM_ROOT}/aom_scale/yv12config.h")
+
+# TODO(tomfinegan): Extract aom_ports from aom_util if possible.
+set(AOM_UTIL_SRCS
+    "${AOM_ROOT}/aom_ports/aom_once.h"
+    "${AOM_ROOT}/aom_ports/aom_timer.h"
+    "${AOM_ROOT}/aom_ports/bitops.h"
+    "${AOM_ROOT}/aom_ports/emmintrin_compat.h"
+    "${AOM_ROOT}/aom_ports/mem.h"
+    "${AOM_ROOT}/aom_ports/mem_ops.h"
+    "${AOM_ROOT}/aom_ports/mem_ops_aligned.h"
+    "${AOM_ROOT}/aom_ports/msvc.h"
+    "${AOM_ROOT}/aom_ports/system_state.h"
+    "${AOM_ROOT}/aom_util/aom_thread.c"
+    "${AOM_ROOT}/aom_util/aom_thread.h"
+    "${AOM_ROOT}/aom_util/endian_inl.h")
+
+set(AOM_AV1_COMMON_SRCS
+    "${AOM_ROOT}/av1/av1_iface_common.h"
+    "${AOM_ROOT}/av1/common/alloccommon.c"
+    "${AOM_ROOT}/av1/common/alloccommon.h"
+    "${AOM_ROOT}/av1/common/av1_fwd_txfm.c"
+    "${AOM_ROOT}/av1/common/av1_fwd_txfm.h"
+    "${AOM_ROOT}/av1/common/av1_inv_txfm.c"
+    "${AOM_ROOT}/av1/common/av1_inv_txfm.h"
+    "${AOM_ROOT}/av1/common/av1_rtcd.c"
+    "${AOM_ROOT}/av1/common/blockd.c"
+    "${AOM_ROOT}/av1/common/blockd.h"
+    "${AOM_ROOT}/av1/common/common.h"
+    "${AOM_ROOT}/av1/common/common_data.h"
+    "${AOM_ROOT}/av1/common/convolve.c"
+    "${AOM_ROOT}/av1/common/convolve.h"
+    "${AOM_ROOT}/av1/common/debugmodes.c"
+    "${AOM_ROOT}/av1/common/entropy.c"
+    "${AOM_ROOT}/av1/common/entropy.h"
+    "${AOM_ROOT}/av1/common/entropymode.c"
+    "${AOM_ROOT}/av1/common/entropymode.h"
+    "${AOM_ROOT}/av1/common/entropymv.c"
+    "${AOM_ROOT}/av1/common/entropymv.h"
+    "${AOM_ROOT}/av1/common/enums.h"
+    "${AOM_ROOT}/av1/common/filter.c"
+    "${AOM_ROOT}/av1/common/filter.h"
+    "${AOM_ROOT}/av1/common/frame_buffers.c"
+    "${AOM_ROOT}/av1/common/frame_buffers.h"
+    "${AOM_ROOT}/av1/common/idct.c"
+    "${AOM_ROOT}/av1/common/idct.h"
+    "${AOM_ROOT}/av1/common/loopfilter.c"
+    "${AOM_ROOT}/av1/common/loopfilter.h"
+    "${AOM_ROOT}/av1/common/mv.h"
+    "${AOM_ROOT}/av1/common/mvref_common.c"
+    "${AOM_ROOT}/av1/common/mvref_common.h"
+    "${AOM_ROOT}/av1/common/odintrin.c"
+    "${AOM_ROOT}/av1/common/odintrin.h"
+    "${AOM_ROOT}/av1/common/onyxc_int.h"
+    "${AOM_ROOT}/av1/common/pred_common.c"
+    "${AOM_ROOT}/av1/common/pred_common.h"
+    "${AOM_ROOT}/av1/common/quant_common.c"
+    "${AOM_ROOT}/av1/common/quant_common.h"
+    "${AOM_ROOT}/av1/common/reconinter.c"
+    "${AOM_ROOT}/av1/common/reconinter.h"
+    "${AOM_ROOT}/av1/common/reconintra.c"
+    "${AOM_ROOT}/av1/common/reconintra.h"
+    "${AOM_ROOT}/av1/common/scale.c"
+    "${AOM_ROOT}/av1/common/scale.h"
+    "${AOM_ROOT}/av1/common/scan.c"
+    "${AOM_ROOT}/av1/common/scan.h"
+    "${AOM_ROOT}/av1/common/seg_common.c"
+    "${AOM_ROOT}/av1/common/seg_common.h"
+    "${AOM_ROOT}/av1/common/thread_common.c"
+    "${AOM_ROOT}/av1/common/thread_common.h"
+    "${AOM_ROOT}/av1/common/tile_common.c"
+    "${AOM_ROOT}/av1/common/tile_common.h")
+
+set(AOM_AV1_DECODER_SRCS
+    "${AOM_ROOT}/av1/av1_dx_iface.c"
+    "${AOM_ROOT}/av1/decoder/decodeframe.c"
+    "${AOM_ROOT}/av1/decoder/decodeframe.h"
+    "${AOM_ROOT}/av1/decoder/decodemv.c"
+    "${AOM_ROOT}/av1/decoder/decodemv.h"
+    "${AOM_ROOT}/av1/decoder/decoder.c"
+    "${AOM_ROOT}/av1/decoder/decoder.h"
+    "${AOM_ROOT}/av1/decoder/detokenize.c"
+    "${AOM_ROOT}/av1/decoder/detokenize.h"
+    "${AOM_ROOT}/av1/decoder/dsubexp.c"
+    "${AOM_ROOT}/av1/decoder/dsubexp.h"
+    "${AOM_ROOT}/av1/decoder/dthread.c"
+    "${AOM_ROOT}/av1/decoder/dthread.h")
+
+set(AOM_AV1_ENCODER_SRCS
+    "${AOM_ROOT}/av1/av1_cx_iface.c"
+    "${AOM_ROOT}/av1/encoder/aq_complexity.c"
+    "${AOM_ROOT}/av1/encoder/aq_complexity.h"
+    "${AOM_ROOT}/av1/encoder/aq_cyclicrefresh.c"
+    "${AOM_ROOT}/av1/encoder/aq_cyclicrefresh.h"
+    "${AOM_ROOT}/av1/encoder/aq_variance.c"
+    "${AOM_ROOT}/av1/encoder/aq_variance.h"
+    "${AOM_ROOT}/av1/encoder/bitstream.c"
+    "${AOM_ROOT}/av1/encoder/bitstream.h"
+    "${AOM_ROOT}/av1/encoder/block.h"
+    "${AOM_ROOT}/av1/encoder/context_tree.c"
+    "${AOM_ROOT}/av1/encoder/context_tree.h"
+    "${AOM_ROOT}/av1/encoder/cost.c"
+    "${AOM_ROOT}/av1/encoder/cost.h"
+    "${AOM_ROOT}/av1/encoder/dct.c"
+    "${AOM_ROOT}/av1/encoder/encodeframe.c"
+    "${AOM_ROOT}/av1/encoder/encodeframe.h"
+    "${AOM_ROOT}/av1/encoder/encodemb.c"
+    "${AOM_ROOT}/av1/encoder/encodemb.h"
+    "${AOM_ROOT}/av1/encoder/encodemv.c"
+    "${AOM_ROOT}/av1/encoder/encodemv.h"
+    "${AOM_ROOT}/av1/encoder/encoder.c"
+    "${AOM_ROOT}/av1/encoder/encoder.h"
+    "${AOM_ROOT}/av1/encoder/ethread.c"
+    "${AOM_ROOT}/av1/encoder/ethread.h"
+    "${AOM_ROOT}/av1/encoder/extend.c"
+    "${AOM_ROOT}/av1/encoder/extend.h"
+    "${AOM_ROOT}/av1/encoder/firstpass.c"
+    "${AOM_ROOT}/av1/encoder/firstpass.h"
+    "${AOM_ROOT}/av1/encoder/hybrid_fwd_txfm.c"
+    "${AOM_ROOT}/av1/encoder/hybrid_fwd_txfm.h"
+    "${AOM_ROOT}/av1/encoder/lookahead.c"
+    "${AOM_ROOT}/av1/encoder/lookahead.h"
+    "${AOM_ROOT}/av1/encoder/mbgraph.c"
+    "${AOM_ROOT}/av1/encoder/mbgraph.h"
+    "${AOM_ROOT}/av1/encoder/mcomp.c"
+    "${AOM_ROOT}/av1/encoder/mcomp.h"
+    "${AOM_ROOT}/av1/encoder/picklpf.c"
+    "${AOM_ROOT}/av1/encoder/picklpf.h"
+    "${AOM_ROOT}/av1/encoder/quantize.c"
+    "${AOM_ROOT}/av1/encoder/quantize.h"
+    "${AOM_ROOT}/av1/encoder/ratectrl.c"
+    "${AOM_ROOT}/av1/encoder/ratectrl.h"
+    "${AOM_ROOT}/av1/encoder/rd.c"
+    "${AOM_ROOT}/av1/encoder/rd.h"
+    "${AOM_ROOT}/av1/encoder/rdopt.c"
+    "${AOM_ROOT}/av1/encoder/rdopt.h"
+    "${AOM_ROOT}/av1/encoder/resize.c"
+    "${AOM_ROOT}/av1/encoder/resize.h"
+    "${AOM_ROOT}/av1/encoder/segmentation.c"
+    "${AOM_ROOT}/av1/encoder/segmentation.h"
+    "${AOM_ROOT}/av1/encoder/speed_features.c"
+    "${AOM_ROOT}/av1/encoder/speed_features.h"
+    "${AOM_ROOT}/av1/encoder/subexp.c"
+    "${AOM_ROOT}/av1/encoder/subexp.h"
+    "${AOM_ROOT}/av1/encoder/temporal_filter.c"
+    "${AOM_ROOT}/av1/encoder/temporal_filter.h"
+    "${AOM_ROOT}/av1/encoder/tokenize.c"
+    "${AOM_ROOT}/av1/encoder/tokenize.h"
+    "${AOM_ROOT}/av1/encoder/treewriter.c"
+    "${AOM_ROOT}/av1/encoder/treewriter.h")
+
+# Targets
+add_library(aom_dsp ${AOM_DSP_SRCS})
+include_directories(${AOM_ROOT} ${AOM_CONFIG_DIR})
+add_library(aom_mem ${AOM_MEM_SRCS})
+add_library(aom_scale ${AOM_SCALE_SRCS})
+include_directories(${AOM_ROOT} ${AOM_CONFIG_DIR})
+add_library(aom_util ${AOM_UTIL_SRCS})
+add_library(aom_av1_decoder ${AOM_AV1_DECODER_SRCS})
+add_library(aom_av1_encoder ${AOM_AV1_ENCODER_SRCS})
+add_library(aom ${AOM_SRCS})
+target_link_libraries(aom LINK_PUBLIC
+                      aom_dsp
+                      aom_mem
+                      aom_scale
+                      aom_util
+                      aom_av1_decoder
+                      aom_av1_encoder)
+add_executable(simple_decoder examples/simple_decoder.c)
+include_directories(${AOM_ROOT})
+target_link_libraries(simple_decoder LINK_PUBLIC aom)
+add_executable(simple_encoder examples/simple_encoder.c)
+include_directories(${AOM_ROOT})
+target_link_libraries(simple_encoder LINK_PUBLIC aom)
+
--- a/34
+++ b/34
@@ -1,31 +1,27 @@
-Copyright (c) 2010, The WebM Project authors. All rights reserved.
+Copyright (c) 2016, Alliance for Open Media. All rights reserved.

 Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
+modification, are permitted provided that the following conditions
+are met:

-  * Redistributions of source code must retain the above copyright
+1. Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.

-  * Redistributions in binary form must reproduce the above copyright
+2. Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in
   the documentation and/or other materials provided with the
   distribution.

-  * Neither the name of Google, nor the WebM Project, nor the names
-    of its contributors may be used to endorse or promote products
-    derived from this software without specific prior written
-    permission.
-
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.

--- a/127
+++ b/127
@@ -1,23 +1,108 @@
-Additional IP Rights Grant (Patents)
------------------------------------
+Alliance for Open Media Patent License 1.0

-"These implementations" means the copyrightable works that implement the WebM
-codecs distributed by Google as part of the WebM Project.
+1. License Terms.
+
+1.1. Patent License. Subject to the terms and conditions of this License, each
+     Licensor, on behalf of itself and successors in interest and assigns,
+     grants Licensee a non-sublicensable, perpetual, worldwide, non-exclusive,
+     no-charge, royalty-free, irrevocable (except as expressly stated in this
+     License) patent license to its Necessary Claims to make, use, sell, offer
+     for sale, import or distribute any Implementation.
+
+1.2. Conditions.
+
+1.2.1. Availability. As a condition to the grant of rights to Licensee to make,
+       sell, offer for sale, import or distribute an Implementation under
+       Section 1.1, Licensee must make its Necessary Claims available under
+       this License, and must reproduce this License with any Implementation
+       as follows:
+
+       a. For distribution in source code, by including this License in the
+          root directory of the source code with its Implementation.
+
+       b. For distribution in any other form (including binary, object form,
+          and/or hardware description code (e.g., HDL, RTL, Gate Level Netlist,
+          GDSII, etc.)), by including this License in the documentation, legal
+          notices, and/or other written materials provided with the
+          Implementation.
+
+1.2.2. Additional Conditions. This license is directly from Licensor to
+       Licensee.  Licensee acknowledges as a condition of benefiting from it
+       that no rights from Licensor are received from suppliers, distributors,
+       or otherwise in connection with this License.
+
+1.3. Defensive Termination. If any Licensee, its Affiliates, or its agents
+     initiates patent litigation or files, maintains, or voluntarily
+     participates in a lawsuit against another entity or any person asserting
+     that any Implementation infringes Necessary Claims, any patent licenses
+     granted under this License directly to the Licensee are immediately
+     terminated as of the date of the initiation of action unless 1) that suit
+     was in response to a corresponding suit regarding an Implementation first
+     brought against an initiating entity, or 2) that suit was brought to
+     enforce the terms of this License (including intervention in a third-party
+     action by a Licensee).
+
+1.4. Disclaimers. The Reference Implementation and Specification are provided
+     "AS IS" and without warranty. The entire risk as to implementing or
+     otherwise using the Reference Implementation or Specification is assumed
+     by the implementer and user. Licensor expressly disclaims any warranties
+     (express, implied, or otherwise), including implied warranties of
+     merchantability, non-infringement, fitness for a particular purpose, or
+     title, related to the material. IN NO EVENT WILL LICENSOR BE LIABLE TO
+     ANY OTHER PARTY FOR LOST PROFITS OR ANY FORM OF INDIRECT, SPECIAL,
+     INCIDENTAL, OR CONSEQUENTIAL DAMAGES OF ANY CHARACTER FROM ANY CAUSES OF
+     ACTION OF ANY KIND WITH RESPECT TO THIS LICENSE, WHETHER BASED ON BREACH
+     OF CONTRACT, TORT (INCLUDING NEGLIGENCE), OR OTHERWISE, AND WHETHER OR
+     NOT THE OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+2. Definitions.
+
+2.1. Affiliate.  "Affiliate" means an entity that directly or indirectly
+     Controls, is Controlled by, or is under common Control of that party.
+
+2.2. Control. "Control" means direct or indirect control of more than 50% of
+     the voting power to elect directors of that corporation, or for any other
+     entity, the power to direct management of such entity.
+
+2.3. Decoder.  "Decoder" means any decoder that conforms fully with all
+     non-optional portions of the Specification.
+
+2.4. Encoder.  "Encoder" means any encoder that produces a bitstream that can
+     be decoded by a Decoder only to the extent it produces such a bitstream.
+
+2.5. Final Deliverable.  "Final Deliverable" means the final version of a
+     deliverable approved by the Alliance for Open Media as a Final
+     Deliverable.
+
+2.6. Implementation.  "Implementation" means any implementation, including the
+     Reference Implementation, that is an Encoder and/or a Decoder. An
+     Implementation also includes components of an Implementation only to the
+     extent they are used as part of an Implementation.
+
+2.7. License. "License" means this license.
+
+2.8. Licensee. "Licensee" means any person or entity who exercises patent
+     rights granted under this License.
+
+2.9. Licensor.  "Licensor" means (i) any Licensee that makes, sells, offers
+     for sale, imports or distributes any Implementation, or (ii) a person
+     or entity that has a licensing obligation to the Implementation as a
+     result of its membership and/or participation in the Alliance for Open
+     Media working group that developed the Specification.
+
+2.10. Necessary Claims.  "Necessary Claims" means all claims of patents or
+      patent applications, (a) that currently or at any time in the future,
+      are owned or controlled by the Licensor, and (b) (i) would be an
+      Essential Claim as defined by the W3C Policy as of February 5, 2004
+      (https://www.w3.org/Consortium/Patent-Policy-20040205/#def-essential)
+      as if the Specification was a W3C Recommendation; or (ii) are infringed
+      by the Reference Implementation.
+
+2.11. Reference Implementation. "Reference Implementation" means an Encoder
+      and/or Decoder released by the Alliance for Open Media as a Final
+      Deliverable.
+
+2.12. Specification. "Specification" means the specification designated by
+      the Alliance for Open Media as a Final Deliverable for which this
+      License was issued.

-Google hereby grants to you a perpetual, worldwide, non-exclusive, no-charge,
-royalty-free, irrevocable (except as stated in this section) patent license to
-make, have made, use, offer to sell, sell, import, transfer, and otherwise
-run, modify and propagate the contents of these implementations of WebM, where
-such license applies only to those patent claims, both currently owned by
-Google and acquired in the future, licensable by Google that are necessarily
-infringed by these implementations of WebM. This grant does not include claims
-that would be infringed only as a consequence of further modification of these
-implementations. If you or your agent or exclusive licensee institute or order
-or agree to the institution of patent litigation or any other patent
-enforcement activity against any entity (including a cross-claim or
-counterclaim in a lawsuit) alleging that any of these implementations of WebM
-or any code incorporated within any of these implementations of WebM
-constitute direct or contributory patent infringement, or inducement of
-patent infringement, then any patent rights granted to you under this License
-for these implementations of WebM shall terminate as of the date such
-litigation is filed.
--- a/68
+++ b/68
@@ -1,6 +1,6 @@
-README - 24 January 2018
+README - 23 March 2015

-Welcome to the WebM VP8/VP9 Codec SDK!
+Welcome to the WebM VP8/AV1 Codec SDK!

 COMPILING THE APPLICATIONS/LIBRARIES:
  The build system used is similar to autotools. Building generally consists of
@@ -9,26 +9,22 @@ COMPILING THE APPLICATIONS/LIBRARIES:

  1. Prerequisites

-    * All x86 targets require the Yasm[1] assembler be installed[2].
-    * All Windows builds require that Cygwin[3] be installed.
-    * Building the documentation requires Doxygen[4]. If you do not
+    * All x86 targets require the Yasm[1] assembler be installed.
+    * All Windows builds require that Cygwin[2] be installed.
+    * Building the documentation requires Doxygen[3]. If you do not
      have this package, the install-docs option will be disabled.
-    * Downloading the data for the unit tests requires curl[5] and sha1sum.
+    * Downloading the data for the unit tests requires curl[4] and sha1sum.
      sha1sum is provided via the GNU coreutils, installed by default on
      many *nix platforms, as well as MinGW and Cygwin. If coreutils is not
      available, a compatible version of sha1sum can be built from
-      source[6]. These requirements are optional if not running the unit
+      source[5]. These requirements are optional if not running the unit
      tests.

    [1]: http://www.tortall.net/projects/yasm
-    [2]: For Visual Studio the base yasm binary (not vsyasm) should be in the
-         PATH for Visual Studio. For VS2017 it is sufficient to rename
-         yasm-<version>-<arch>.exe to yasm.exe and place it in:
-         Program Files (x86)/Microsoft Visual Studio/2017/<level>/Common7/Tools/
-    [3]: http://www.cygwin.com
-    [4]: http://www.doxygen.org
-    [5]: http://curl.haxx.se
-    [6]: http://www.microbrew.org/tools/md5sha1sum/
+    [2]: http://www.cygwin.com
+    [3]: http://www.doxygen.org
+    [4]: http://curl.haxx.se
+    [5]: http://www.microbrew.org/tools/md5sha1sum/

  2. Out-of-tree builds
  Out of tree builds are a supported method of building the application. For
@@ -37,32 +33,24 @@ COMPILING THE APPLICATIONS/LIBRARIES:

    $ mkdir build
    $ cd build
-    $ ../libvpx/configure <options>
+    $ ../libaom/configure <options>
    $ make

  3. Configuration options
  The 'configure' script supports a number of options. The --help option can be
  used to get a list of supported options:
-    $ ../libvpx/configure --help
+    $ ../libaom/configure --help

-  4. Compiler analyzers
-  Compilers have added sanitizers which instrument binaries with information
-  about address calculation, memory usage, threading, undefined behavior, and
-  other common errors. To simplify building libvpx with some of these features
-  use tools/set_analyzer_env.sh before running configure. It will set the
-  compiler and necessary flags for building as well as environment variables
-  read by the analyzer when testing the binaries.
-    $ source ../libvpx/tools/set_analyzer_env.sh address
-
-  5. Cross development
+  4. Cross development
  For cross development, the most notable option is the --target option. The
  most up-to-date list of supported targets can be found at the bottom of the
  --help output of the configure script. As of this writing, the list of
  available targets is:

-    arm64-android-gcc
+    armv6-linux-rvct
+    armv6-linux-gcc
+    armv6-none-rvct
    arm64-darwin-gcc
-    arm64-linux-gcc
    armv7-android-gcc
    armv7-darwin-gcc
    armv7-linux-rvct
@@ -71,13 +59,9 @@ COMPILING THE APPLICATIONS/LIBRARIES:
    armv7-win32-vs11
    armv7-win32-vs12
    armv7-win32-vs14
-    armv7-win32-vs15
    armv7s-darwin-gcc
-    armv8-linux-gcc
    mips32-linux-gcc
    mips64-linux-gcc
-    ppc64-linux-gcc
-    ppc64le-linux-gcc
    sparc-solaris-gcc
    x86-android-gcc
    x86-darwin8-gcc
@@ -89,8 +73,6 @@ COMPILING THE APPLICATIONS/LIBRARIES:
    x86-darwin12-gcc
    x86-darwin13-gcc
    x86-darwin14-gcc
-    x86-darwin15-gcc
-    x86-darwin16-gcc
    x86-iphonesimulator-gcc
    x86-linux-gcc
    x86-linux-icc
@@ -101,7 +83,6 @@ COMPILING THE APPLICATIONS/LIBRARIES:
    x86-win32-vs11
    x86-win32-vs12
    x86-win32-vs14
-    x86-win32-vs15
    x86_64-android-gcc
    x86_64-darwin9-gcc
    x86_64-darwin10-gcc
@@ -109,8 +90,6 @@ COMPILING THE APPLICATIONS/LIBRARIES:
    x86_64-darwin12-gcc
    x86_64-darwin13-gcc
    x86_64-darwin14-gcc
-    x86_64-darwin15-gcc
-    x86_64-darwin16-gcc
    x86_64-iphonesimulator-gcc
    x86_64-linux-gcc
    x86_64-linux-icc
@@ -120,7 +99,6 @@ COMPILING THE APPLICATIONS/LIBRARIES:
    x86_64-win64-vs11
    x86_64-win64-vs12
    x86_64-win64-vs14
-    x86_64-win64-vs15
    generic-gnu

  The generic-gnu target, in conjunction with the CROSS environment variable,
@@ -130,24 +108,24 @@ COMPILING THE APPLICATIONS/LIBRARIES:
  toolchain, the following command could be used (note, POSIX SH syntax, adapt
  to your shell as necessary):

-    $ CROSS=mipsel-linux-uclibc- ../libvpx/configure
+    $ CROSS=mipsel-linux-uclibc- ../libaom/configure

  In addition, the executables to be invoked can be overridden by specifying the
  environment variables: CC, AR, LD, AS, STRIP, NM. Additional flags can be
  passed to these executables with CFLAGS, LDFLAGS, and ASFLAGS.

-  6. Configuration errors
+  5. Configuration errors
  If the configuration step fails, the first step is to look in the error log.
  This defaults to config.log. This should give a good indication of what went
  wrong. If not, contact us for support.

-VP8/VP9 TEST VECTORS:
+VP8/AV1 TEST VECTORS:
  The test vectors can be downloaded and verified using the build system after
  running configure. To specify an alternate directory the
-  LIBVPX_TEST_DATA_PATH environment variable can be used.
+  LIBAOM_TEST_DATA_PATH environment variable can be used.

  $ ./configure --enable-unit-tests
-  $ LIBVPX_TEST_DATA_PATH=../libvpx-test-data make testdata
+  $ LIBAOM_TEST_DATA_PATH=../libaom-test-data make testdata

 CODE STYLE:
  The coding style used by this project is enforced with clang-format using the
@@ -166,5 +144,5 @@ CODE STYLE:

 SUPPORT
  This library is an open source project supported by its community. Please
-  email webm-discuss@webmproject.org for help.
+  email webm-discuss@webmproject.org for help.

--- a/aom/aom.h
+++ b/aom/aom.h
@@ -0,0 +1,160 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*!\defgroup aom AOM
+ * \ingroup codecs
+ * AOM is the Alliance for Open Media's newest video compression algorithm. It
+ * uses motion compensated prediction, Discrete Cosine Transform (DCT) coding
+ * of the prediction error signal and context dependent entropy coding
+ * techniques based on arithmetic principles. It features:
+ *  - YUV 4:2:0 image format
+ *  - Macro-block based coding (16x16 luma plus two 8x8 chroma)
+ *  - 1/4 (1/8) pixel accuracy motion compensated prediction
+ *  - 4x4 DCT transform
+ *  - 128 level linear quantizer
+ *  - In loop deblocking filter
+ *  - Context-based entropy coding
+ *
+ * @{
+ */
+/*!\file
+ * \brief Provides controls common to both the AOM encoder and decoder.
+ */
+#ifndef AOM_AOM_H_
+#define AOM_AOM_H_
+
+#include "./aom_codec.h"
+#include "./aom_image.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!\brief Control functions
+ *
+ * This set of enumerators defines the control functions of the AOM interface.
+ */
+enum aom_com_control_id {
+  /*!\brief pass in an external frame into decoder to be used as reference frame
+   */
+  AOM_SET_REFERENCE = 1,
+  AOM_COPY_REFERENCE = 2, /**< get a copy of reference frame from the decoder */
+  AOM_SET_POSTPROC = 3,   /**< set the decoder's post processing settings  */
+  AOM_SET_DBG_COLOR_REF_FRAME =
+      4, /**< set the reference frames to color for each macroblock */
+  AOM_SET_DBG_COLOR_MB_MODES = 5, /**< set which macro block modes to color */
+  AOM_SET_DBG_COLOR_B_MODES = 6,  /**< set which blocks modes to color */
+  AOM_SET_DBG_DISPLAY_MV = 7,     /**< set which motion vector modes to draw */
+
+  /* TODO(jkoleszar): The encoder incorrectly reuses some of these values (5+)
+   * for its control ids. These should be migrated to something like the
+   * AOM_DECODER_CTRL_ID_START range next time we're ready to break the ABI.
+   */
+  AV1_GET_REFERENCE = 128, /**< get a pointer to a reference frame */
+  AOM_COMMON_CTRL_ID_MAX,
+
+  AV1_GET_NEW_FRAME_IMAGE = 192, /**< get a pointer to the new frame */
+
+  AOM_DECODER_CTRL_ID_START = 256
+};
+
+/*!\brief post process flags
+ *
+ * This set of enumerators defines the AOM decoder post processing flags.
+ */
+enum aom_postproc_level {
+  AOM_NOFILTERING = 0,
+  AOM_DEBLOCK = 1 << 0,
+  AOM_DEMACROBLOCK = 1 << 1,
+  AOM_ADDNOISE = 1 << 2,
+  AOM_DEBUG_TXT_FRAME_INFO = 1 << 3, /**< print frame information */
+  AOM_DEBUG_TXT_MBLK_MODES =
+      1 << 4, /**< print macro block modes over each macro block */
+  AOM_DEBUG_TXT_DC_DIFF = 1 << 5,   /**< print dc diff for each macro block */
+  AOM_DEBUG_TXT_RATE_INFO = 1 << 6, /**< print video rate info (encoder only) */
+  AOM_MFQE = 1 << 10
+};
+
+/*!\brief post process settings
+ *
+ * This defines a structure that describes the post processing settings. For
+ * the best objective measure (using the PSNR metric) set post_proc_flag
+ * to AOM_DEBLOCK and deblocking_level to 1.
+ */
+
+typedef struct aom_postproc_cfg {
+  /*!\brief the types of post processing to be done, should be combination of
+   * "aom_postproc_level" */
+  int post_proc_flag;
+  int deblocking_level; /**< the strength of deblocking, valid range [0, 16] */
+  int noise_level; /**< the strength of additive noise, valid range [0, 16] */
+} aom_postproc_cfg_t;
+
+/*!\brief reference frame type
+ *
+ * This set of enumerators defines the types of AOM reference frames.
+ */
+typedef enum aom_ref_frame_type {
+  AOM_LAST_FRAME = 1,
+  AOM_GOLD_FRAME = 2,
+  AOM_ALTR_FRAME = 4
+} aom_ref_frame_type_t;
+
+/*!\brief reference frame data struct
+ *
+ * Define the data struct to access aom reference frames.
+ */
+typedef struct aom_ref_frame {
+  aom_ref_frame_type_t frame_type; /**< which reference frame */
+  aom_image_t img;                 /**< reference frame data in image format */
+} aom_ref_frame_t;
+
+/*!\brief AV1 specific reference frame data struct
+ *
+ * Define the data struct to access av1 reference frames.
+ */
+typedef struct av1_ref_frame {
+  int idx;         /**< frame index to get (input) */
+  aom_image_t img; /**< img structure to populate (output) */
+} av1_ref_frame_t;
+
+/*!\cond */
+/*!\brief aom decoder control function parameter type
+ *
+ * Defines the data type that each AOM decoder control function requires.
+ */
+AOM_CTRL_USE_TYPE(AOM_SET_REFERENCE, aom_ref_frame_t *)
+#define AOM_CTRL_AOM_SET_REFERENCE
+AOM_CTRL_USE_TYPE(AOM_COPY_REFERENCE, aom_ref_frame_t *)
+#define AOM_CTRL_AOM_COPY_REFERENCE
+AOM_CTRL_USE_TYPE(AOM_SET_POSTPROC, aom_postproc_cfg_t *)
+#define AOM_CTRL_AOM_SET_POSTPROC
+AOM_CTRL_USE_TYPE(AOM_SET_DBG_COLOR_REF_FRAME, int)
+#define AOM_CTRL_AOM_SET_DBG_COLOR_REF_FRAME
+AOM_CTRL_USE_TYPE(AOM_SET_DBG_COLOR_MB_MODES, int)
+#define AOM_CTRL_AOM_SET_DBG_COLOR_MB_MODES
+AOM_CTRL_USE_TYPE(AOM_SET_DBG_COLOR_B_MODES, int)
+#define AOM_CTRL_AOM_SET_DBG_COLOR_B_MODES
+AOM_CTRL_USE_TYPE(AOM_SET_DBG_DISPLAY_MV, int)
+#define AOM_CTRL_AOM_SET_DBG_DISPLAY_MV
+AOM_CTRL_USE_TYPE(AV1_GET_REFERENCE, av1_ref_frame_t *)
+#define AOM_CTRL_AV1_GET_REFERENCE
+AOM_CTRL_USE_TYPE(AV1_GET_NEW_FRAME_IMAGE, aom_image_t *)
+#define AOM_CTRL_AV1_GET_NEW_FRAME_IMAGE
+
+/*!\endcond */
+/*! @} - end defgroup aom */
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AOM_H_
--- a/aom/aom_codec.h
+++ b/aom/aom_codec.h
@@ -1,11 +1,12 @@
 /*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

 /*!\defgroup codec Common Algorithm Interface
@@ -22,59 +23,69 @@
 * video codec algorithm.
 *
 * An application instantiates a specific codec instance by using
- * vpx_codec_init() and a pointer to the algorithm's interface structure:
+ * aom_codec_init() and a pointer to the algorithm's interface structure:
 *     <pre>
 *     my_app.c:
- *       extern vpx_codec_iface_t my_codec;
+ *       extern aom_codec_iface_t my_codec;
 *       {
- *           vpx_codec_ctx_t algo;
- *           res = vpx_codec_init(&algo, &my_codec);
+ *           aom_codec_ctx_t algo;
+ *           res = aom_codec_init(&algo, &my_codec);
 *       }
 *     </pre>
 *
 * Once initialized, the instance is manged using other functions from
- * the vpx_codec_* family.
+ * the aom_codec_* family.
 */
-#ifndef VPX_VPX_CODEC_H_
-#define VPX_VPX_CODEC_H_
+#ifndef AOM_AOM_CODEC_H_
+#define AOM_AOM_CODEC_H_

 #ifdef __cplusplus
 extern "C" {
 #endif

-#include "./vpx_image.h"
-#include "./vpx_integer.h"
+#include "./aom_integer.h"
+#include "./aom_image.h"

 /*!\brief Decorator indicating a function is deprecated */
-#ifndef VPX_DEPRECATED
+#ifndef DEPRECATED
 #if defined(__GNUC__) && __GNUC__
-#define VPX_DEPRECATED __attribute__((deprecated))
+#define DEPRECATED __attribute__((deprecated))
 #elif defined(_MSC_VER)
-#define VPX_DEPRECATED
+#define DEPRECATED
 #else
-#define VPX_DEPRECATED
+#define DEPRECATED
 #endif
-#endif /* VPX_DEPRECATED */
+#endif /* DEPRECATED */

-#ifndef VPX_DECLSPEC_DEPRECATED
+#ifndef DECLSPEC_DEPRECATED
 #if defined(__GNUC__) && __GNUC__
-#define VPX_DECLSPEC_DEPRECATED /**< \copydoc #VPX_DEPRECATED */
+#define DECLSPEC_DEPRECATED /**< \copydoc #DEPRECATED */
 #elif defined(_MSC_VER)
-/*!\brief \copydoc #VPX_DEPRECATED */
-#define VPX_DECLSPEC_DEPRECATED __declspec(deprecated)
+/*!\brief \copydoc #DEPRECATED */
+#define DECLSPEC_DEPRECATED __declspec(deprecated)
 #else
-#define VPX_DECLSPEC_DEPRECATED /**< \copydoc #VPX_DEPRECATED */
+#define DECLSPEC_DEPRECATED /**< \copydoc #DEPRECATED */
 #endif
-#endif /* VPX_DECLSPEC_DEPRECATED */
+#endif /* DECLSPEC_DEPRECATED */

 /*!\brief Decorator indicating a function is potentially unused */
-#ifndef VPX_UNUSED
-#if defined(__GNUC__) || defined(__clang__)
-#define VPX_UNUSED __attribute__((unused))
+#ifdef UNUSED
+#elif defined(__GNUC__) || defined(__clang__)
+#define UNUSED __attribute__((unused))
 #else
-#define VPX_UNUSED
+#define UNUSED
 #endif
-#endif /* VPX_UNUSED */
+
+/*!\brief Decorator indicating that given struct/union/enum is packed */
+#ifndef ATTRIBUTE_PACKED
+#if defined(__GNUC__) && __GNUC__
+#define ATTRIBUTE_PACKED __attribute__((packed))
+#elif defined(_MSC_VER)
+#define ATTRIBUTE_PACKED
+#else
+#define ATTRIBUTE_PACKED
+#endif
+#endif /* ATTRIBUTE_PACKED */

 /*!\brief Current ABI version number
 *
@@ -84,31 +95,31 @@ extern "C" {
 * types, removing or reassigning enums, adding/removing/rearranging
 * fields to structures
 */
-#define VPX_CODEC_ABI_VERSION (4 + VPX_IMAGE_ABI_VERSION) /**<\hideinitializer*/
+#define AOM_CODEC_ABI_VERSION (3 + AOM_IMAGE_ABI_VERSION) /**<\hideinitializer*/

 /*!\brief Algorithm return codes */
 typedef enum {
  /*!\brief Operation completed without error */
-  VPX_CODEC_OK,
+  AOM_CODEC_OK,

  /*!\brief Unspecified error */
-  VPX_CODEC_ERROR,
+  AOM_CODEC_ERROR,

  /*!\brief Memory operation failed */
-  VPX_CODEC_MEM_ERROR,
+  AOM_CODEC_MEM_ERROR,

  /*!\brief ABI version mismatch */
-  VPX_CODEC_ABI_MISMATCH,
+  AOM_CODEC_ABI_MISMATCH,

  /*!\brief Algorithm does not have required capability */
-  VPX_CODEC_INCAPABLE,
+  AOM_CODEC_INCAPABLE,

  /*!\brief The given bitstream is not supported.
   *
   * The bitstream was unable to be parsed at the highest level. The decoder
   * is unable to proceed. This error \ref SHOULD be treated as fatal to the
   * stream. */
-  VPX_CODEC_UNSUP_BITSTREAM,
+  AOM_CODEC_UNSUP_BITSTREAM,

  /*!\brief Encoded bitstream uses an unsupported feature
   *
@@ -117,7 +128,7 @@ typedef enum {
   * pictures from being properly decoded. This error \ref MAY be treated as
   * fatal to the stream or \ref MAY be treated as fatal to the current GOP.
   */
-  VPX_CODEC_UNSUP_FEATURE,
+  AOM_CODEC_UNSUP_FEATURE,

  /*!\brief The coded data for this stream is corrupt or incomplete
   *
@@ -127,64 +138,60 @@ typedef enum {
   * stream or \ref MAY be treated as fatal to the current GOP. If decoding
   * is continued for the current GOP, artifacts may be present.
   */
-  VPX_CODEC_CORRUPT_FRAME,
+  AOM_CODEC_CORRUPT_FRAME,

  /*!\brief An application-supplied parameter is not valid.
   *
   */
-  VPX_CODEC_INVALID_PARAM,
+  AOM_CODEC_INVALID_PARAM,

  /*!\brief An iterator reached the end of list.
   *
   */
-  VPX_CODEC_LIST_END
+  AOM_CODEC_LIST_END

-} vpx_codec_err_t;
+} aom_codec_err_t;

 /*! \brief Codec capabilities bitfield
 *
 *  Each codec advertises the capabilities it supports as part of its
- *  ::vpx_codec_iface_t interface structure. Capabilities are extra interfaces
+ *  ::aom_codec_iface_t interface structure. Capabilities are extra interfaces
 *  or functionality, and are not required to be supported.
 *
- *  The available flags are specified by VPX_CODEC_CAP_* defines.
+ *  The available flags are specified by AOM_CODEC_CAP_* defines.
 */
-typedef long vpx_codec_caps_t;
-#define VPX_CODEC_CAP_DECODER 0x1 /**< Is a decoder */
-#define VPX_CODEC_CAP_ENCODER 0x2 /**< Is an encoder */
-
-/*! Can support images at greater than 8 bitdepth.
- */
-#define VPX_CODEC_CAP_HIGHBITDEPTH 0x4
+typedef long aom_codec_caps_t;
+#define AOM_CODEC_CAP_DECODER 0x1 /**< Is a decoder */
+#define AOM_CODEC_CAP_ENCODER 0x2 /**< Is an encoder */

 /*! \brief Initialization-time Feature Enabling
 *
 *  Certain codec features must be known at initialization time, to allow for
 *  proper memory allocation.
 *
- *  The available flags are specified by VPX_CODEC_USE_* defines.
+ *  The available flags are specified by AOM_CODEC_USE_* defines.
 */
-typedef long vpx_codec_flags_t;
+typedef long aom_codec_flags_t;

 /*!\brief Codec interface structure.
 *
 * Contains function pointers and other data private to the codec
 * implementation. This structure is opaque to the application.
 */
-typedef const struct vpx_codec_iface vpx_codec_iface_t;
+typedef const struct aom_codec_iface aom_codec_iface_t;

 /*!\brief Codec private data structure.
 *
 * Contains data private to the codec implementation. This structure is opaque
 * to the application.
 */
-typedef struct vpx_codec_priv vpx_codec_priv_t;
+typedef struct aom_codec_priv aom_codec_priv_t;

 /*!\brief Iterator
 *
 * Opaque storage used for iterating over lists.
 */
-typedef const void *vpx_codec_iter_t;
+typedef const void *aom_codec_iter_t;

 /*!\brief Codec context structure
 *
@@ -194,39 +201,51 @@ typedef const void *vpx_codec_iter_t;
 * may reference the 'name' member to get a printable description of the
 * algorithm.
 */
-typedef struct vpx_codec_ctx {
+typedef struct aom_codec_ctx {
  const char *name;             /**< Printable interface name */
-  vpx_codec_iface_t *iface;     /**< Interface pointers */
-  vpx_codec_err_t err;          /**< Last returned error */
+  aom_codec_iface_t *iface;     /**< Interface pointers */
+  aom_codec_err_t err;          /**< Last returned error */
  const char *err_detail;       /**< Detailed info, if available */
-  vpx_codec_flags_t init_flags; /**< Flags passed at init time */
+  aom_codec_flags_t init_flags; /**< Flags passed at init time */
  union {
    /**< Decoder Configuration Pointer */
-    const struct vpx_codec_dec_cfg *dec;
+    const struct aom_codec_dec_cfg *dec;
    /**< Encoder Configuration Pointer */
-    const struct vpx_codec_enc_cfg *enc;
+    const struct aom_codec_enc_cfg *enc;
    const void *raw;
  } config;               /**< Configuration pointer aliasing union */
-  vpx_codec_priv_t *priv; /**< Algorithm private storage */
-} vpx_codec_ctx_t;
+  aom_codec_priv_t *priv; /**< Algorithm private storage */
+} aom_codec_ctx_t;

 /*!\brief Bit depth for codec
 * *
 * This enumeration determines the bit depth of the codec.
 */
-typedef enum vpx_bit_depth {
-  VPX_BITS_8 = 8,   /**<  8 bits */
-  VPX_BITS_10 = 10, /**< 10 bits */
-  VPX_BITS_12 = 12, /**< 12 bits */
-} vpx_bit_depth_t;
+typedef enum aom_bit_depth {
+  AOM_BITS_8 = 8,   /**<  8 bits */
+  AOM_BITS_10 = 10, /**< 10 bits */
+  AOM_BITS_12 = 12, /**< 12 bits */
+} aom_bit_depth_t;
+
+/*!\brief Superblock size selection.
+ *
+ * Defines the superblock size used for encoding. The superblock size can
+ * either be fixed at 64x64 or 128x128 pixels, or it can be dynamically
+ * selected by the encoder for each frame.
+ */
+typedef enum aom_superblock_size {
+  AOM_SUPERBLOCK_SIZE_64X64,   /**< Always use 64x64 superblocks. */
+  AOM_SUPERBLOCK_SIZE_128X128, /**< Always use 128x128 superblocks. */
+  AOM_SUPERBLOCK_SIZE_DYNAMIC  /**< Select superblock size dynamically. */
+} aom_superblock_size_t;

 /*
 * Library Version Number Interface
 *
 * For example, see the following sample return values:
- *     vpx_codec_version()           (1<<16 | 2<<8 | 3)
- *     vpx_codec_version_str()       "v1.2.3-rc1-16-gec6a1ba"
- *     vpx_codec_version_extra_str() "rc1-16-gec6a1ba"
+ *     aom_codec_version()           (1<<16 | 2<<8 | 3)
+ *     aom_codec_version_str()       "v1.2.3-rc1-16-gec6a1ba"
+ *     aom_codec_version_extra_str() "rc1-16-gec6a1ba"
 */

 /*!\brief Return the version information (as an integer)
@@ -239,22 +258,22 @@ typedef enum vpx_bit_depth {
 * in the future.
 *
 */
-int vpx_codec_version(void);
-#define VPX_VERSION_MAJOR(v) \
+int aom_codec_version(void);
+#define AOM_VERSION_MAJOR(v) \
  ((v >> 16) & 0xff) /**< extract major from packed version */
-#define VPX_VERSION_MINOR(v) \
+#define AOM_VERSION_MINOR(v) \
  ((v >> 8) & 0xff) /**< extract minor from packed version */
-#define VPX_VERSION_PATCH(v) \
+#define AOM_VERSION_PATCH(v) \
  ((v >> 0) & 0xff) /**< extract patch from packed version */

 /*!\brief Return the version major number */
-#define vpx_codec_version_major() ((vpx_codec_version() >> 16) & 0xff)
+#define aom_codec_version_major() ((aom_codec_version() >> 16) & 0xff)

 /*!\brief Return the version minor number */
-#define vpx_codec_version_minor() ((vpx_codec_version() >> 8) & 0xff)
+#define aom_codec_version_minor() ((aom_codec_version() >> 8) & 0xff)

 /*!\brief Return the version patch number */
-#define vpx_codec_version_patch() ((vpx_codec_version() >> 0) & 0xff)
+#define aom_codec_version_patch() ((aom_codec_version() >> 0) & 0xff)

 /*!\brief Return the version information (as a string)
 *
@@ -265,24 +284,24 @@ int vpx_codec_version(void);
 * release candidates, prerelease versions, etc.
 *
 */
-const char *vpx_codec_version_str(void);
+const char *aom_codec_version_str(void);

 /*!\brief Return the version information (as a string)
 *
 * Returns a printable "extra string". This is the component of the string
 * returned
- * by vpx_codec_version_str() following the three digit version number.
+ * by aom_codec_version_str() following the three digit version number.
 *
 */
-const char *vpx_codec_version_extra_str(void);
+const char *aom_codec_version_extra_str(void);

 /*!\brief Return the build configuration
 *
 * Returns a printable string containing an encoded version of the build
- * configuration. This may be useful to vpx support.
+ * configuration. This may be useful to aom support.
 *
 */
-const char *vpx_codec_build_config(void);
+const char *aom_codec_build_config(void);

 /*!\brief Return the name for a given interface
 *
@@ -291,7 +310,7 @@ const char *vpx_codec_build_config(void);
 * \param[in]    iface     Interface pointer
 *
 */
-const char *vpx_codec_iface_name(vpx_codec_iface_t *iface);
+const char *aom_codec_iface_name(aom_codec_iface_t *iface);

 /*!\brief Convert error number to printable string
 *
@@ -303,7 +322,7 @@ const char *vpx_codec_iface_name(vpx_codec_iface_t *iface);
 * \param[in]    err     Error number.
 *
 */
-const char *vpx_codec_err_to_string(vpx_codec_err_t err);
+const char *aom_codec_err_to_string(aom_codec_err_t err);

 /*!\brief Retrieve error synopsis for codec context
 *
@@ -315,7 +334,7 @@ const char *vpx_codec_err_to_string(vpx_codec_err_t err);
 * \param[in]    ctx     Pointer to this instance's context.
 *
 */
-const char *vpx_codec_error(vpx_codec_ctx_t *ctx);
+const char *aom_codec_error(aom_codec_ctx_t *ctx);

 /*!\brief Retrieve detailed error information for codec context
 *
@@ -327,7 +346,7 @@ const char *vpx_codec_error(vpx_codec_ctx_t *ctx);
 * \retval NULL
 *     No detailed information is available.
 */
-const char *vpx_codec_error_detail(vpx_codec_ctx_t *ctx);
+const char *aom_codec_error_detail(aom_codec_ctx_t *ctx);

 /* REQUIRED FUNCTIONS
 *
@@ -341,12 +360,12 @@ const char *vpx_codec_error_detail(vpx_codec_ctx_t *ctx);
 *
 * \param[in] ctx   Pointer to this instance's context
 *
- * \retval #VPX_CODEC_OK
+ * \retval #AOM_CODEC_OK
 *     The codec algorithm initialized.
- * \retval #VPX_CODEC_MEM_ERROR
+ * \retval #AOM_CODEC_MEM_ERROR
 *     Memory allocation failed.
 */
-vpx_codec_err_t vpx_codec_destroy(vpx_codec_ctx_t *ctx);
+aom_codec_err_t aom_codec_destroy(aom_codec_ctx_t *ctx);

 /*!\brief Get the capabilities of an algorithm.
 *
@@ -355,7 +374,7 @@ vpx_codec_err_t vpx_codec_destroy(vpx_codec_ctx_t *ctx);
 * \param[in] iface   Pointer to the algorithm interface
 *
 */
-vpx_codec_caps_t vpx_codec_get_caps(vpx_codec_iface_t *iface);
+aom_codec_caps_t aom_codec_get_caps(aom_codec_iface_t *iface);

 /*!\brief Control algorithm
 *
@@ -365,46 +384,46 @@ vpx_codec_caps_t vpx_codec_get_caps(vpx_codec_iface_t *iface);
 *
 * This wrapper function dispatches the request to the helper function
 * associated with the given ctrl_id. It tries to call this function
- * transparently, but will return #VPX_CODEC_ERROR if the request could not
+ * transparently, but will return #AOM_CODEC_ERROR if the request could not
 * be dispatched.
 *
 * Note that this function should not be used directly. Call the
- * #vpx_codec_control wrapper macro instead.
+ * #aom_codec_control wrapper macro instead.
 *
 * \param[in]     ctx              Pointer to this instance's context
 * \param[in]     ctrl_id          Algorithm specific control identifier
 *
- * \retval #VPX_CODEC_OK
+ * \retval #AOM_CODEC_OK
 *     The control request was processed.
- * \retval #VPX_CODEC_ERROR
+ * \retval #AOM_CODEC_ERROR
 *     The control request was not processed.
- * \retval #VPX_CODEC_INVALID_PARAM
+ * \retval #AOM_CODEC_INVALID_PARAM
 *     The data was not valid.
 */
-vpx_codec_err_t vpx_codec_control_(vpx_codec_ctx_t *ctx, int ctrl_id, ...);
-#if defined(VPX_DISABLE_CTRL_TYPECHECKS) && VPX_DISABLE_CTRL_TYPECHECKS
-#define vpx_codec_control(ctx, id, data) vpx_codec_control_(ctx, id, data)
-#define VPX_CTRL_USE_TYPE(id, typ)
-#define VPX_CTRL_USE_TYPE_DEPRECATED(id, typ)
-#define VPX_CTRL_VOID(id, typ)
+aom_codec_err_t aom_codec_control_(aom_codec_ctx_t *ctx, int ctrl_id, ...);
+#if defined(AOM_DISABLE_CTRL_TYPECHECKS) && AOM_DISABLE_CTRL_TYPECHECKS
+#define aom_codec_control(ctx, id, data) aom_codec_control_(ctx, id, data)
+#define AOM_CTRL_USE_TYPE(id, typ)
+#define AOM_CTRL_USE_TYPE_DEPRECATED(id, typ)
+#define AOM_CTRL_VOID(id) /* no-op; same arity as the type-checked variant */

 #else
-/*!\brief vpx_codec_control wrapper macro
+/*!\brief aom_codec_control wrapper macro
 *
 * This macro allows for type safe conversions across the variadic parameter
- * to vpx_codec_control_().
+ * to aom_codec_control_().
 *
 * \internal
 * It works by dispatching the call to the control function through a wrapper
 * function named with the id parameter.
 */
-#define vpx_codec_control(ctx, id, data) \
-  vpx_codec_control_##id(ctx, id, data) /**<\hideinitializer*/
+#define aom_codec_control(ctx, id, data) \
+  aom_codec_control_##id(ctx, id, data) /**<\hideinitializer*/

-/*!\brief vpx_codec_control type definition macro
+/*!\brief aom_codec_control type definition macro
 *
 * This macro allows for type safe conversions across the variadic parameter
- * to vpx_codec_control_(). It defines the type of the argument for a given
+ * to aom_codec_control_(). It defines the type of the argument for a given
 * control identifier.
 *
 * \internal
@@ -412,18 +431,18 @@ vpx_codec_err_t vpx_codec_control_(vpx_codec_ctx_t *ctx, int ctrl_id, ...);
 * the correctly typed arguments as a wrapper to the type-unsafe internal
 * function.
 */
-#define VPX_CTRL_USE_TYPE(id, typ)                                           \
-  static vpx_codec_err_t vpx_codec_control_##id(vpx_codec_ctx_t *, int, typ) \
-      VPX_UNUSED;                                                            \
+#define AOM_CTRL_USE_TYPE(id, typ)                                           \
+  static aom_codec_err_t aom_codec_control_##id(aom_codec_ctx_t *, int, typ) \
+      UNUSED;                                                                \
                                                                             \
-  static vpx_codec_err_t vpx_codec_control_##id(vpx_codec_ctx_t *ctx,        \
+  static aom_codec_err_t aom_codec_control_##id(aom_codec_ctx_t *ctx,        \
                                                int ctrl_id, typ data) {     \
-    return vpx_codec_control_(ctx, ctrl_id, data);                           \
+    return aom_codec_control_(ctx, ctrl_id, data);                           \
  } /**<\hideinitializer*/

-/*!\brief vpx_codec_control deprecated type definition macro
+/*!\brief aom_codec_control deprecated type definition macro
 *
- * Like #VPX_CTRL_USE_TYPE, but indicates that the specified control is
+ * Like #AOM_CTRL_USE_TYPE, but indicates that the specified control is
 * deprecated and should not be used. Consult the documentation for your
 * codec for more information.
 *
@@ -431,32 +450,32 @@ vpx_codec_err_t vpx_codec_control_(vpx_codec_ctx_t *ctx, int ctrl_id, ...);
 * It defines a static function with the correctly typed arguments as a
 * wrapper to the type-unsafe internal function.
 */
-#define VPX_CTRL_USE_TYPE_DEPRECATED(id, typ)                            \
-  VPX_DECLSPEC_DEPRECATED static vpx_codec_err_t vpx_codec_control_##id( \
-      vpx_codec_ctx_t *, int, typ) VPX_DEPRECATED VPX_UNUSED;            \
+#define AOM_CTRL_USE_TYPE_DEPRECATED(id, typ)                        \
+  DECLSPEC_DEPRECATED static aom_codec_err_t aom_codec_control_##id( \
+      aom_codec_ctx_t *, int, typ) DEPRECATED UNUSED;                \
                                                                     \
-  VPX_DECLSPEC_DEPRECATED static vpx_codec_err_t vpx_codec_control_##id( \
-      vpx_codec_ctx_t *ctx, int ctrl_id, typ data) {                     \
-    return vpx_codec_control_(ctx, ctrl_id, data);                       \
+  DECLSPEC_DEPRECATED static aom_codec_err_t aom_codec_control_##id( \
+      aom_codec_ctx_t *ctx, int ctrl_id, typ data) {                 \
+    return aom_codec_control_(ctx, ctrl_id, data);                   \
  } /**<\hideinitializer*/

-/*!\brief vpx_codec_control void type definition macro
+/*!\brief aom_codec_control void type definition macro
 *
 * This macro allows for type safe conversions across the variadic parameter
- * to vpx_codec_control_(). It indicates that a given control identifier takes
+ * to aom_codec_control_(). It indicates that a given control identifier takes
 * no argument.
 *
 * \internal
 * It defines a static function without a data argument as a wrapper to the
 * type-unsafe internal function.
 */
-#define VPX_CTRL_VOID(id)                                               \
-  static vpx_codec_err_t vpx_codec_control_##id(vpx_codec_ctx_t *, int) \
-      VPX_UNUSED;                                                       \
+#define AOM_CTRL_VOID(id)                                               \
+  static aom_codec_err_t aom_codec_control_##id(aom_codec_ctx_t *, int) \
+      UNUSED;                                                           \
                                                                        \
-  static vpx_codec_err_t vpx_codec_control_##id(vpx_codec_ctx_t *ctx,   \
+  static aom_codec_err_t aom_codec_control_##id(aom_codec_ctx_t *ctx,   \
                                                int ctrl_id) {          \
-    return vpx_codec_control_(ctx, ctrl_id);                            \
+    return aom_codec_control_(ctx, ctrl_id);                            \
  } /**<\hideinitializer*/

 #endif
@@ -465,4 +484,4 @@ vpx_codec_err_t vpx_codec_control_(vpx_codec_ctx_t *ctx, int ctrl_id, ...);
 #ifdef __cplusplus
 }
 #endif
-#endif  // VPX_VPX_CODEC_H_
+#endif  // AOM_AOM_CODEC_H_
--- a/aom/aom_codec.mk
+++ b/aom/aom_codec.mk
@@ -0,0 +1,42 @@
+##
+## Copyright (c) 2016, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+##
+
+
+API_EXPORTS += exports
+
+API_SRCS-$(CONFIG_AV1_ENCODER) += aom.h
+API_SRCS-$(CONFIG_AV1_ENCODER) += aomcx.h
+API_DOC_SRCS-$(CONFIG_AV1_ENCODER) += aom.h
+API_DOC_SRCS-$(CONFIG_AV1_ENCODER) += aomcx.h
+
+API_SRCS-$(CONFIG_AV1_DECODER) += aom.h
+API_SRCS-$(CONFIG_AV1_DECODER) += aomdx.h
+API_DOC_SRCS-$(CONFIG_AV1_DECODER) += aom.h
+API_DOC_SRCS-$(CONFIG_AV1_DECODER) += aomdx.h
+
+API_DOC_SRCS-yes += aom_codec.h
+API_DOC_SRCS-yes += aom_decoder.h
+API_DOC_SRCS-yes += aom_encoder.h
+API_DOC_SRCS-yes += aom_frame_buffer.h
+API_DOC_SRCS-yes += aom_image.h
+
+API_SRCS-yes += src/aom_decoder.c
+API_SRCS-yes += aom_decoder.h
+API_SRCS-yes += src/aom_encoder.c
+API_SRCS-yes += aom_encoder.h
+API_SRCS-yes += internal/aom_codec_internal.h
+API_SRCS-yes += src/aom_codec.c
+API_SRCS-yes += src/aom_image.c
+API_SRCS-yes += aom_codec.h
+API_SRCS-yes += aom_codec.mk
+API_SRCS-yes += aom_frame_buffer.h
+API_SRCS-yes += aom_image.h
+API_SRCS-yes += aom_integer.h
--- a/aom/aom_decoder.h
+++ b/aom/aom_decoder.h
@@ -1,14 +1,15 @@
 /*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
-#ifndef VPX_VPX_DECODER_H_
-#define VPX_VPX_DECODER_H_
+#ifndef AOM_AOM_DECODER_H_
+#define AOM_AOM_DECODER_H_

 /*!\defgroup decoder Decoder Algorithm Interface
 * \ingroup codec
@@ -29,8 +30,8 @@
 extern "C" {
 #endif

-#include "./vpx_codec.h"
-#include "./vpx_frame_buffer.h"
+#include "./aom_codec.h"
+#include "./aom_frame_buffer.h"

 /*!\brief Current ABI version number
 *
@@ -40,45 +41,45 @@ extern "C" {
 * types, removing or reassigning enums, adding/removing/rearranging
 * fields to structures
 */
-#define VPX_DECODER_ABI_VERSION \
-  (3 + VPX_CODEC_ABI_VERSION) /**<\hideinitializer*/
+#define AOM_DECODER_ABI_VERSION \
+  (3 + AOM_CODEC_ABI_VERSION) /**<\hideinitializer*/

 /*! \brief Decoder capabilities bitfield
 *
 *  Each decoder advertises the capabilities it supports as part of its
- *  ::vpx_codec_iface_t interface structure. Capabilities are extra interfaces
+ *  ::aom_codec_iface_t interface structure. Capabilities are extra interfaces
 *  or functionality, and are not required to be supported by a decoder.
 *
- *  The available flags are specified by VPX_CODEC_CAP_* defines.
+ *  The available flags are specified by AOM_CODEC_CAP_* defines.
 */
-#define VPX_CODEC_CAP_PUT_SLICE 0x10000 /**< Will issue put_slice callbacks */
-#define VPX_CODEC_CAP_PUT_FRAME 0x20000 /**< Will issue put_frame callbacks */
-#define VPX_CODEC_CAP_POSTPROC 0x40000  /**< Can postprocess decoded frame */
+#define AOM_CODEC_CAP_PUT_SLICE 0x10000 /**< Will issue put_slice callbacks */
+#define AOM_CODEC_CAP_PUT_FRAME 0x20000 /**< Will issue put_frame callbacks */
+#define AOM_CODEC_CAP_POSTPROC 0x40000  /**< Can postprocess decoded frame */
 /*!\brief Can conceal errors due to packet loss */
-#define VPX_CODEC_CAP_ERROR_CONCEALMENT 0x80000
+#define AOM_CODEC_CAP_ERROR_CONCEALMENT 0x80000
 /*!\brief Can receive encoded frames one fragment at a time */
-#define VPX_CODEC_CAP_INPUT_FRAGMENTS 0x100000
+#define AOM_CODEC_CAP_INPUT_FRAGMENTS 0x100000

 /*! \brief Initialization-time Feature Enabling
 *
 *  Certain codec features must be known at initialization time, to allow for
 *  proper memory allocation.
 *
- *  The available flags are specified by VPX_CODEC_USE_* defines.
+ *  The available flags are specified by AOM_CODEC_USE_* defines.
 */
 /*!\brief Can support frame-based multi-threading */
-#define VPX_CODEC_CAP_FRAME_THREADING 0x200000
+#define AOM_CODEC_CAP_FRAME_THREADING 0x200000
 /*!\brief Can support external frame buffers */
-#define VPX_CODEC_CAP_EXTERNAL_FRAME_BUFFER 0x400000
+#define AOM_CODEC_CAP_EXTERNAL_FRAME_BUFFER 0x400000

-#define VPX_CODEC_USE_POSTPROC 0x10000 /**< Postprocess decoded frame */
+#define AOM_CODEC_USE_POSTPROC 0x10000 /**< Postprocess decoded frame */
 /*!\brief Conceal errors in decoded frames */
-#define VPX_CODEC_USE_ERROR_CONCEALMENT 0x20000
+#define AOM_CODEC_USE_ERROR_CONCEALMENT 0x20000
 /*!\brief The input frame should be passed to the decoder one fragment at a
 * time */
-#define VPX_CODEC_USE_INPUT_FRAGMENTS 0x40000
+#define AOM_CODEC_USE_INPUT_FRAGMENTS 0x40000
 /*!\brief Enable frame-based multi-threading */
-#define VPX_CODEC_USE_FRAME_THREADING 0x80000
+#define AOM_CODEC_USE_FRAME_THREADING 0x80000

 /*!\brief Stream properties
 *
@@ -86,12 +87,12 @@ extern "C" {
 * stream. Algorithms may extend this structure with data specific
 * to their bitstream by setting the sz member appropriately.
 */
-typedef struct vpx_codec_stream_info {
+typedef struct aom_codec_stream_info {
  unsigned int sz;    /**< Size of this structure */
  unsigned int w;     /**< Width (or 0 for unknown/default) */
  unsigned int h;     /**< Height (or 0 for unknown/default) */
  unsigned int is_kf; /**< Current frame is a keyframe */
-} vpx_codec_stream_info_t;
+} aom_codec_stream_info_t;

 /* REQUIRED FUNCTIONS
 *
@@ -104,16 +105,16 @@ typedef struct vpx_codec_stream_info {
 * This structure is used to pass init time configuration options to the
 * decoder.
 */
-typedef struct vpx_codec_dec_cfg {
+typedef struct aom_codec_dec_cfg {
  unsigned int threads; /**< Maximum number of threads to use, default 1 */
  unsigned int w;       /**< Width */
  unsigned int h;       /**< Height */
-} vpx_codec_dec_cfg_t;  /**< alias for struct vpx_codec_dec_cfg */
+} aom_codec_dec_cfg_t;  /**< alias for struct aom_codec_dec_cfg */

 /*!\brief Initialize a decoder instance
 *
 * Initializes a decoder context using the given interface. Applications
- * should call the vpx_codec_dec_init convenience macro instead of this
+ * should call the aom_codec_dec_init convenience macro instead of this
 * function directly, to ensure that the ABI version number parameter
 * is properly initialized.
 *
@@ -124,25 +125,25 @@ typedef struct vpx_codec_dec_cfg {
 * \param[in]    ctx     Pointer to this instance's context.
 * \param[in]    iface   Pointer to the algorithm interface to use.
 * \param[in]    cfg     Configuration to use, if known. May be NULL.
- * \param[in]    flags   Bitfield of VPX_CODEC_USE_* flags
+ * \param[in]    flags   Bitfield of AOM_CODEC_USE_* flags
 * \param[in]    ver     ABI version number. Must be set to
- *                       VPX_DECODER_ABI_VERSION
- * \retval #VPX_CODEC_OK
+ *                       AOM_DECODER_ABI_VERSION
+ * \retval #AOM_CODEC_OK
 *     The decoder algorithm initialized.
- * \retval #VPX_CODEC_MEM_ERROR
+ * \retval #AOM_CODEC_MEM_ERROR
 *     Memory allocation failed.
 */
-vpx_codec_err_t vpx_codec_dec_init_ver(vpx_codec_ctx_t *ctx,
-                                       vpx_codec_iface_t *iface,
-                                       const vpx_codec_dec_cfg_t *cfg,
-                                       vpx_codec_flags_t flags, int ver);
+aom_codec_err_t aom_codec_dec_init_ver(aom_codec_ctx_t *ctx,
+                                       aom_codec_iface_t *iface,
+                                       const aom_codec_dec_cfg_t *cfg,
+                                       aom_codec_flags_t flags, int ver);

-/*!\brief Convenience macro for vpx_codec_dec_init_ver()
+/*!\brief Convenience macro for aom_codec_dec_init_ver()
 *
 * Ensures the ABI version parameter is properly set.
 */
-#define vpx_codec_dec_init(ctx, iface, cfg, flags) \
-  vpx_codec_dec_init_ver(ctx, iface, cfg, flags, VPX_DECODER_ABI_VERSION)
+#define aom_codec_dec_init(ctx, iface, cfg, flags) \
+  aom_codec_dec_init_ver(ctx, iface, cfg, flags, AOM_DECODER_ABI_VERSION)

 /*!\brief Parse stream info from a buffer
 *
@@ -158,13 +159,13 @@ vpx_codec_err_t vpx_codec_dec_init_ver(vpx_codec_ctx_t *ctx,
 *                         clobbered by the algorithm. This parameter \ref MAY
 *                         be NULL.
 *
- * \retval #VPX_CODEC_OK
+ * \retval #AOM_CODEC_OK
 *     Bitstream is parsable and stream information updated
 */
-vpx_codec_err_t vpx_codec_peek_stream_info(vpx_codec_iface_t *iface,
+aom_codec_err_t aom_codec_peek_stream_info(aom_codec_iface_t *iface,
                                           const uint8_t *data,
                                           unsigned int data_sz,
-                                           vpx_codec_stream_info_t *si);
+                                           aom_codec_stream_info_t *si);

 /*!\brief Return information about the current stream.
 *
@@ -176,11 +177,11 @@ vpx_codec_err_t vpx_codec_peek_stream_info(vpx_codec_iface_t *iface,
 *                         clobbered by the algorithm. This parameter \ref MAY
 *                         be NULL.
 *
- * \retval #VPX_CODEC_OK
+ * \retval #AOM_CODEC_OK
 *     Bitstream is parsable and stream information updated
 */
-vpx_codec_err_t vpx_codec_get_stream_info(vpx_codec_ctx_t *ctx,
-                                          vpx_codec_stream_info_t *si);
+aom_codec_err_t aom_codec_get_stream_info(aom_codec_ctx_t *ctx,
+                                          aom_codec_stream_info_t *si);

 /*!\brief Decode data
 *
@@ -189,7 +190,7 @@ vpx_codec_err_t vpx_codec_get_stream_info(vpx_codec_ctx_t *ctx,
 * generated, as appropriate. Encoded data \ref MUST be passed in DTS (decode
 * time stamp) order. Frames produced will always be in PTS (presentation
 * time stamp) order.
- * If the decoder is configured with VPX_CODEC_USE_INPUT_FRAGMENTS enabled,
+ * If the decoder is configured with AOM_CODEC_USE_INPUT_FRAGMENTS enabled,
 * data and data_sz can contain a fragment of the encoded frame. Fragment
 * \#n must contain at least partition \#n, but can also contain subsequent
 * partitions (\#n+1 - \#n+i), and if so, fragments \#n+1, .., \#n+i must
@@ -199,7 +200,7 @@ vpx_codec_err_t vpx_codec_get_stream_info(vpx_codec_ctx_t *ctx,
 *
 * \param[in] ctx          Pointer to this instance's context
 * \param[in] data         Pointer to this block of new coded data. If
- *                         NULL, a VPX_CODEC_CB_PUT_FRAME event is posted
+ *                         NULL, an AOM_CODEC_CB_PUT_FRAME event is posted
 *                         for the previously decoded frame.
 * \param[in] data_sz      Size of the coded data, in bytes.
 * \param[in] user_priv    Application specific data to associate with
@@ -207,12 +208,12 @@ vpx_codec_err_t vpx_codec_get_stream_info(vpx_codec_ctx_t *ctx,
 * \param[in] deadline     Soft deadline the decoder should attempt to meet,
 *                         in us. Set to zero for unlimited.
 *
- * \return Returns #VPX_CODEC_OK if the coded data was processed completely
+ * \return Returns #AOM_CODEC_OK if the coded data was processed completely
 *         and future pictures can be decoded without error. Otherwise,
- *         see the descriptions of the other error codes in ::vpx_codec_err_t
+ *         see the descriptions of the other error codes in ::aom_codec_err_t
 *         for recoverability capabilities.
 */
-vpx_codec_err_t vpx_codec_decode(vpx_codec_ctx_t *ctx, const uint8_t *data,
+aom_codec_err_t aom_codec_decode(aom_codec_ctx_t *ctx, const uint8_t *data,
                                 unsigned int data_sz, void *user_priv,
                                 long deadline);

@@ -223,8 +224,8 @@ vpx_codec_err_t vpx_codec_decode(vpx_codec_ctx_t *ctx, const uint8_t *data,
 * complete when this function returns NULL.
 *
 * The list of available frames becomes valid upon completion of the
- * vpx_codec_decode call, and remains valid until the next call to
- * vpx_codec_decode.
+ * aom_codec_decode call, and remains valid until the next call to
+ * aom_codec_decode.
 *
 * \param[in]     ctx      Pointer to this instance's context
 * \param[in,out] iter     Iterator storage, initialized to NULL
@@ -232,15 +233,15 @@ vpx_codec_err_t vpx_codec_decode(vpx_codec_ctx_t *ctx, const uint8_t *data,
 * \return Returns a pointer to an image, if one is ready for display. Frames
 *         produced will always be in PTS (presentation time stamp) order.
 */
-vpx_image_t *vpx_codec_get_frame(vpx_codec_ctx_t *ctx, vpx_codec_iter_t *iter);
+aom_image_t *aom_codec_get_frame(aom_codec_ctx_t *ctx, aom_codec_iter_t *iter);

 /*!\defgroup cap_put_frame Frame-Based Decoding Functions
 *
 * The following functions are required to be implemented for all decoders
- * that advertise the VPX_CODEC_CAP_PUT_FRAME capability. Calling these
+ * that advertise the AOM_CODEC_CAP_PUT_FRAME capability. Calling these
 * functions
 * for codecs that don't advertise this capability will result in an error
- * code being returned, usually VPX_CODEC_ERROR
+ * code being returned, usually AOM_CODEC_ERROR
 * @{
 */

@@ -249,8 +250,8 @@ vpx_image_t *vpx_codec_get_frame(vpx_codec_ctx_t *ctx, vpx_codec_iter_t *iter);
 * This callback is invoked by the decoder to notify the application of
 * the availability of decoded image data.
 */
-typedef void (*vpx_codec_put_frame_cb_fn_t)(void *user_priv,
-                                            const vpx_image_t *img);
+typedef void (*aom_codec_put_frame_cb_fn_t)(void *user_priv,
+                                            const aom_image_t *img);

 /*!\brief Register for notification of frame completion.
 *
@@ -261,14 +262,14 @@ typedef void (*vpx_codec_put_frame_cb_fn_t)(void *user_priv,
 * \param[in] cb           Pointer to the callback function
 * \param[in] user_priv    User's private data
 *
- * \retval #VPX_CODEC_OK
+ * \retval #AOM_CODEC_OK
 *     Callback successfully registered.
- * \retval #VPX_CODEC_ERROR
+ * \retval #AOM_CODEC_ERROR
 *     Decoder context not initialized, or algorithm not capable of
 *     posting frame completion.
 */
-vpx_codec_err_t vpx_codec_register_put_frame_cb(vpx_codec_ctx_t *ctx,
-                                                vpx_codec_put_frame_cb_fn_t cb,
+aom_codec_err_t aom_codec_register_put_frame_cb(aom_codec_ctx_t *ctx,
+                                                aom_codec_put_frame_cb_fn_t cb,
                                                void *user_priv);

 /*!@} - end defgroup cap_put_frame */
@@ -276,10 +277,10 @@ vpx_codec_err_t vpx_codec_register_put_frame_cb(vpx_codec_ctx_t *ctx,
 /*!\defgroup cap_put_slice Slice-Based Decoding Functions
 *
 * The following functions are required to be implemented for all decoders
- * that advertise the VPX_CODEC_CAP_PUT_SLICE capability. Calling these
+ * that advertise the AOM_CODEC_CAP_PUT_SLICE capability. Calling these
 * functions
 * for codecs that don't advertise this capability will result in an error
- * code being returned, usually VPX_CODEC_ERROR
+ * code being returned, usually AOM_CODEC_ERROR
 * @{
 */

@@ -288,10 +289,10 @@ vpx_codec_err_t vpx_codec_register_put_frame_cb(vpx_codec_ctx_t *ctx,
 * This callback is invoked by the decoder to notify the application of
 * the availability of partially decoded image data.
 */
-typedef void (*vpx_codec_put_slice_cb_fn_t)(void *user_priv,
-                                            const vpx_image_t *img,
-                                            const vpx_image_rect_t *valid,
-                                            const vpx_image_rect_t *update);
+typedef void (*aom_codec_put_slice_cb_fn_t)(void *user_priv,
+                                            const aom_image_t *img,
+                                            const aom_image_rect_t *valid,
+                                            const aom_image_rect_t *update);

 /*!\brief Register for notification of slice completion.
 *
@@ -302,14 +303,14 @@ typedef void (*vpx_codec_put_slice_cb_fn_t)(void *user_priv,
 * \param[in] cb           Pointer to the callback function
 * \param[in] user_priv    User's private data
 *
- * \retval #VPX_CODEC_OK
+ * \retval #AOM_CODEC_OK
 *     Callback successfully registered.
- * \retval #VPX_CODEC_ERROR
+ * \retval #AOM_CODEC_ERROR
 *     Decoder context not initialized, or algorithm not capable of
 *     posting slice completion.
 */
-vpx_codec_err_t vpx_codec_register_put_slice_cb(vpx_codec_ctx_t *ctx,
-                                                vpx_codec_put_slice_cb_fn_t cb,
+aom_codec_err_t aom_codec_register_put_slice_cb(aom_codec_ctx_t *ctx,
+                                                aom_codec_put_slice_cb_fn_t cb,
                                                void *user_priv);

 /*!@} - end defgroup cap_put_slice*/
@@ -317,21 +318,21 @@ vpx_codec_err_t vpx_codec_register_put_slice_cb(vpx_codec_ctx_t *ctx,
 /*!\defgroup cap_external_frame_buffer External Frame Buffer Functions
 *
 * The following section is required to be implemented for all decoders
- * that advertise the VPX_CODEC_CAP_EXTERNAL_FRAME_BUFFER capability.
+ * that advertise the AOM_CODEC_CAP_EXTERNAL_FRAME_BUFFER capability.
 * Calling this function for codecs that don't advertise this capability
- * will result in an error code being returned, usually VPX_CODEC_ERROR.
+ * will result in an error code being returned, usually AOM_CODEC_ERROR.
 *
 * \note
- * Currently this only works with VP9.
+ * Currently this only works with AV1.
 * @{
 */

 /*!\brief Pass in external frame buffers for the decoder to use.
 *
- * Registers functions to be called when libvpx needs a frame buffer
- * to decode the current frame and a function to be called when libvpx does
+ * Registers functions to be called when libaom needs a frame buffer
+ * to decode the current frame and a function to be called when libaom does
 * not internally reference the frame buffer. This set function must
- * be called before the first call to decode or libvpx will assume the
+ * be called before the first call to decode or libaom will assume the
 * default behavior of allocating frame buffers internally.
 *
 * \param[in] ctx          Pointer to this instance's context
@@ -339,22 +340,22 @@ vpx_codec_err_t vpx_codec_register_put_slice_cb(vpx_codec_ctx_t *ctx,
 * \param[in] cb_release   Pointer to the release callback function
 * \param[in] cb_priv      Callback's private data
 *
- * \retval #VPX_CODEC_OK
- *     External frame buffers will be used by libvpx.
- * \retval #VPX_CODEC_INVALID_PARAM
+ * \retval #AOM_CODEC_OK
+ *     External frame buffers will be used by libaom.
+ * \retval #AOM_CODEC_INVALID_PARAM
 *     One or more of the callbacks were NULL.
- * \retval #VPX_CODEC_ERROR
+ * \retval #AOM_CODEC_ERROR
 *     Decoder context not initialized, or algorithm not capable of
 *     using external frame buffers.
 *
 * \note
- * When decoding VP9, the application may be required to pass in at least
- * #VP9_MAXIMUM_REF_BUFFERS + #VPX_MAXIMUM_WORK_BUFFERS external frame
+ * When decoding AV1, the application may be required to pass in at least
+ * #AOM_MAXIMUM_WORK_BUFFERS external frame
 * buffers.
 */
-vpx_codec_err_t vpx_codec_set_frame_buffer_functions(
-    vpx_codec_ctx_t *ctx, vpx_get_frame_buffer_cb_fn_t cb_get,
-    vpx_release_frame_buffer_cb_fn_t cb_release, void *cb_priv);
+aom_codec_err_t aom_codec_set_frame_buffer_functions(
+    aom_codec_ctx_t *ctx, aom_get_frame_buffer_cb_fn_t cb_get,
+    aom_release_frame_buffer_cb_fn_t cb_release, void *cb_priv);

 /*!@} - end defgroup cap_external_frame_buffer */

@@ -362,4 +363,4 @@ vpx_codec_err_t vpx_codec_set_frame_buffer_functions(
 #ifdef __cplusplus
 }
 #endif
-#endif  // VPX_VPX_DECODER_H_
+#endif  // AOM_AOM_DECODER_H_
--- a/aom/aom_encoder.h
+++ b/aom/aom_encoder.h
@@ -1,14 +1,15 @@
 /*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
-#ifndef VPX_VPX_ENCODER_H_
-#define VPX_VPX_ENCODER_H_
+#ifndef AOM_AOM_ENCODER_H_
+#define AOM_AOM_ENCODER_H_

 /*!\defgroup encoder Encoder Algorithm Interface
 * \ingroup codec
@@ -29,30 +30,7 @@
 extern "C" {
 #endif

-#include "./vpx_codec.h"
-
-/*! Temporal Scalability: Maximum length of the sequence defining frame
- * layer membership
- */
-#define VPX_TS_MAX_PERIODICITY 16
-
-/*! Temporal Scalability: Maximum number of coding layers */
-#define VPX_TS_MAX_LAYERS 5
-
-/*!\deprecated Use #VPX_TS_MAX_PERIODICITY instead. */
-#define MAX_PERIODICITY VPX_TS_MAX_PERIODICITY
-
-/*! Temporal+Spatial Scalability: Maximum number of coding layers */
-#define VPX_MAX_LAYERS 12  // 3 temporal + 4 spatial layers are allowed.
-
-/*!\deprecated Use #VPX_MAX_LAYERS instead. */
-#define MAX_LAYERS VPX_MAX_LAYERS  // 3 temporal + 4 spatial layers allowed.
-
-/*! Spatial Scalability: Maximum number of coding layers */
-#define VPX_SS_MAX_LAYERS 5
-
-/*! Spatial Scalability: Default number of coding layers */
-#define VPX_SS_DEFAULT_LAYERS 1
+#include "./aom_codec.h"

 /*!\brief Current ABI version number
 *
@@ -62,54 +40,58 @@ extern "C" {
 * types, removing or reassigning enums, adding/removing/rearranging
 * fields to structures
 */
-#define VPX_ENCODER_ABI_VERSION \
-  (11 + VPX_CODEC_ABI_VERSION) /**<\hideinitializer*/
+#define AOM_ENCODER_ABI_VERSION \
+  (5 + AOM_CODEC_ABI_VERSION) /**<\hideinitializer*/

 /*! \brief Encoder capabilities bitfield
 *
 *  Each encoder advertises the capabilities it supports as part of its
- *  ::vpx_codec_iface_t interface structure. Capabilities are extra
+ *  ::aom_codec_iface_t interface structure. Capabilities are extra
 *  interfaces or functionality, and are not required to be supported
 *  by an encoder.
 *
- *  The available flags are specified by VPX_CODEC_CAP_* defines.
+ *  The available flags are specified by AOM_CODEC_CAP_* defines.
 */
-#define VPX_CODEC_CAP_PSNR 0x10000 /**< Can issue PSNR packets */
+#define AOM_CODEC_CAP_PSNR 0x10000 /**< Can issue PSNR packets */

 /*! Can output one partition at a time. Each partition is returned in its
- *  own VPX_CODEC_CX_FRAME_PKT, with the FRAME_IS_FRAGMENT flag set for
+ *  own AOM_CODEC_CX_FRAME_PKT, with the FRAME_IS_FRAGMENT flag set for
 *  every partition but the last. In this mode all frames are always
 *  returned partition by partition.
 */
-#define VPX_CODEC_CAP_OUTPUT_PARTITION 0x20000
+#define AOM_CODEC_CAP_OUTPUT_PARTITION 0x20000
+
+/*! Can support input images with a bit depth greater than 8.
+ */
+#define AOM_CODEC_CAP_HIGHBITDEPTH 0x40000

 /*! \brief Initialization-time Feature Enabling
 *
 *  Certain codec features must be known at initialization time, to allow
 *  for proper memory allocation.
 *
- *  The available flags are specified by VPX_CODEC_USE_* defines.
+ *  The available flags are specified by AOM_CODEC_USE_* defines.
 */
-#define VPX_CODEC_USE_PSNR 0x10000 /**< Calculate PSNR on each frame */
+#define AOM_CODEC_USE_PSNR 0x10000 /**< Calculate PSNR on each frame */
 /*!\brief Make the encoder output one  partition at a time. */
-#define VPX_CODEC_USE_OUTPUT_PARTITION 0x20000
-#define VPX_CODEC_USE_HIGHBITDEPTH 0x40000 /**< Use high bitdepth */
+#define AOM_CODEC_USE_OUTPUT_PARTITION 0x20000
+#define AOM_CODEC_USE_HIGHBITDEPTH 0x40000 /**< Use high bitdepth */

 /*!\brief Generic fixed size buffer structure
 *
 * This structure is able to hold a reference to any fixed size buffer.
 */
-typedef struct vpx_fixed_buf {
+typedef struct aom_fixed_buf {
  void *buf;       /**< Pointer to the data */
  size_t sz;       /**< Length of the buffer, in chars */
-} vpx_fixed_buf_t; /**< alias for struct vpx_fixed_buf */
+} aom_fixed_buf_t; /**< alias for struct aom_fixed_buf */

 /*!\brief Time Stamp Type
 *
 * An integer, which when multiplied by the stream's time base, provides
 * the absolute time of a sample.
 */
-typedef int64_t vpx_codec_pts_t;
+typedef int64_t aom_codec_pts_t;

 /*!\brief Compressed Frame Flags
 *
@@ -118,43 +100,43 @@ typedef int64_t vpx_codec_pts_t;
 * can be used by an algorithm to provide additional detail, for example to
 * support frame types that are codec specific (MPEG-1 D-frames for example)
 */
-typedef uint32_t vpx_codec_frame_flags_t;
-#define VPX_FRAME_IS_KEY 0x1 /**< frame is the start of a GOP */
+typedef uint32_t aom_codec_frame_flags_t;
+#define AOM_FRAME_IS_KEY 0x1 /**< frame is the start of a GOP */
 /*!\brief frame can be dropped without affecting the stream (no future frame
 * depends on this one) */
-#define VPX_FRAME_IS_DROPPABLE 0x2
+#define AOM_FRAME_IS_DROPPABLE 0x2
 /*!\brief frame should be decoded but will not be shown */
-#define VPX_FRAME_IS_INVISIBLE 0x4
+#define AOM_FRAME_IS_INVISIBLE 0x4
 /*!\brief this is a fragment of the encoded frame */
-#define VPX_FRAME_IS_FRAGMENT 0x8
+#define AOM_FRAME_IS_FRAGMENT 0x8

 /*!\brief Error Resilient flags
 *
 * These flags define which error resilient features to enable in the
 * encoder. The flags are specified through the
- * vpx_codec_enc_cfg::g_error_resilient variable.
+ * aom_codec_enc_cfg::g_error_resilient variable.
 */
-typedef uint32_t vpx_codec_er_flags_t;
+typedef uint32_t aom_codec_er_flags_t;
 /*!\brief Improve resiliency against losses of whole frames */
-#define VPX_ERROR_RESILIENT_DEFAULT 0x1
+#define AOM_ERROR_RESILIENT_DEFAULT 0x1
 /*!\brief The frame partitions are independently decodable by the bool decoder,
 * meaning that partitions can be decoded even though earlier partitions have
 * been lost. Note that intra prediction is still done over the partition
 * boundary. */
-#define VPX_ERROR_RESILIENT_PARTITIONS 0x2
+#define AOM_ERROR_RESILIENT_PARTITIONS 0x2

 /*!\brief Encoder output packet variants
 *
 * This enumeration lists the different kinds of data packets that can be
- * returned by calls to vpx_codec_get_cx_data(). Algorithms \ref MAY
+ * returned by calls to aom_codec_get_cx_data(). Algorithms \ref MAY
 * extend this list to provide additional functionality.
 */
-enum vpx_codec_cx_pkt_kind {
-  VPX_CODEC_CX_FRAME_PKT,    /**< Compressed video frame */
-  VPX_CODEC_STATS_PKT,       /**< Two-pass statistics for this frame */
-  VPX_CODEC_FPMB_STATS_PKT,  /**< first pass mb statistics for this frame */
-  VPX_CODEC_PSNR_PKT,        /**< PSNR statistics for this frame */
-  VPX_CODEC_CUSTOM_PKT = 256 /**< Algorithm extensions  */
+enum aom_codec_cx_pkt_kind {
+  AOM_CODEC_CX_FRAME_PKT,    /**< Compressed video frame */
+  AOM_CODEC_STATS_PKT,       /**< Two-pass statistics for this frame */
+  AOM_CODEC_FPMB_STATS_PKT,  /**< first pass mb statistics for this frame */
+  AOM_CODEC_PSNR_PKT,        /**< PSNR statistics for this frame */
+  AOM_CODEC_CUSTOM_PKT = 256 /**< Algorithm extensions  */
 };

 /*!\brief Encoder output packet
@@ -162,110 +144,87 @@ enum vpx_codec_cx_pkt_kind {
 * This structure contains the different kinds of output data the encoder
 * may produce while compressing a frame.
 */
-typedef struct vpx_codec_cx_pkt {
-  enum vpx_codec_cx_pkt_kind kind; /**< packet variant */
+typedef struct aom_codec_cx_pkt {
+  enum aom_codec_cx_pkt_kind kind; /**< packet variant */
  union {
    struct {
      void *buf; /**< compressed data buffer */
      size_t sz; /**< length of compressed data */
      /*!\brief time stamp to show frame (in timebase units) */
-      vpx_codec_pts_t pts;
+      aom_codec_pts_t pts;
      /*!\brief duration to show frame (in timebase units) */
      unsigned long duration;
-      vpx_codec_frame_flags_t flags; /**< flags for this frame */
+      aom_codec_frame_flags_t flags; /**< flags for this frame */
      /*!\brief the partition id defines the decoding order of the partitions.
       * Only applicable when "output partition" mode is enabled. First
       * partition has id 0.*/
      int partition_id;
-      /*!\brief Width and height of frames in this packet. VP8 will only use the
-       * first one.*/
-      unsigned int width[VPX_SS_MAX_LAYERS];  /**< frame width */
-      unsigned int height[VPX_SS_MAX_LAYERS]; /**< frame height */
-      /*!\brief Flag to indicate if spatial layer frame in this packet is
-       * encoded or dropped. VP8 will always be set to 1.*/
-      uint8_t spatial_layer_encoded[VPX_SS_MAX_LAYERS];
    } frame;                            /**< data for compressed frame packet */
-    vpx_fixed_buf_t twopass_stats;      /**< data for two-pass packet */
-    vpx_fixed_buf_t firstpass_mb_stats; /**< first pass mb packet */
-    struct vpx_psnr_pkt {
+    aom_fixed_buf_t twopass_stats;      /**< data for two-pass packet */
+    aom_fixed_buf_t firstpass_mb_stats; /**< first pass mb packet */
+    struct aom_psnr_pkt {
      unsigned int samples[4]; /**< Number of samples, total/y/u/v */
      uint64_t sse[4];         /**< sum squared error, total/y/u/v */
      double psnr[4];          /**< PSNR, total/y/u/v */
    } psnr;                    /**< data for PSNR packet */
-    vpx_fixed_buf_t raw;       /**< data for arbitrary packets */
+    aom_fixed_buf_t raw;       /**< data for arbitrary packets */

    /* This packet size is fixed to allow codecs to extend this
     * interface without having to manage storage for raw packets,
     * i.e., if it's smaller than 128 bytes, you can store in the
     * packet list directly.
     */
-    char pad[128 - sizeof(enum vpx_codec_cx_pkt_kind)]; /**< fixed sz */
+    char pad[128 - sizeof(enum aom_codec_cx_pkt_kind)]; /**< fixed sz */
  } data;                                               /**< packet data */
-} vpx_codec_cx_pkt_t; /**< alias for struct vpx_codec_cx_pkt */
-
-/*!\brief Encoder return output buffer callback
- *
- * This callback function, when registered, returns with packets when each
- * spatial layer is encoded.
- */
-// putting the definitions here for now. (agrange: find if there
-// is a better place for this)
-typedef void (*vpx_codec_enc_output_cx_pkt_cb_fn_t)(vpx_codec_cx_pkt_t *pkt,
-                                                    void *user_data);
-
-/*!\brief Callback function pointer / user data pair storage */
-typedef struct vpx_codec_enc_output_cx_cb_pair {
-  vpx_codec_enc_output_cx_pkt_cb_fn_t output_cx_pkt; /**< Callback function */
-  void *user_priv; /**< Pointer to private data */
-} vpx_codec_priv_output_cx_pkt_cb_pair_t;
+} aom_codec_cx_pkt_t; /**< alias for struct aom_codec_cx_pkt */

 /*!\brief Rational Number
 *
 * This structure holds a fractional value.
 */
-typedef struct vpx_rational {
+typedef struct aom_rational {
  int num;        /**< fraction numerator */
  int den;        /**< fraction denominator */
-} vpx_rational_t; /**< alias for struct vpx_rational */
+} aom_rational_t; /**< alias for struct aom_rational */

 /*!\brief Multi-pass Encoding Pass */
-enum vpx_enc_pass {
-  VPX_RC_ONE_PASS,   /**< Single pass mode */
-  VPX_RC_FIRST_PASS, /**< First pass of multi-pass mode */
-  VPX_RC_LAST_PASS   /**< Final pass of multi-pass mode */
+enum aom_enc_pass {
+  AOM_RC_ONE_PASS,   /**< Single pass mode */
+  AOM_RC_FIRST_PASS, /**< First pass of multi-pass mode */
+  AOM_RC_LAST_PASS   /**< Final pass of multi-pass mode */
 };

 /*!\brief Rate control mode */
-enum vpx_rc_mode {
-  VPX_VBR, /**< Variable Bit Rate (VBR) mode */
-  VPX_CBR, /**< Constant Bit Rate (CBR) mode */
-  VPX_CQ,  /**< Constrained Quality (CQ)  mode */
-  VPX_Q,   /**< Constant Quality (Q) mode */
+enum aom_rc_mode {
+  AOM_VBR, /**< Variable Bit Rate (VBR) mode */
+  AOM_CBR, /**< Constant Bit Rate (CBR) mode */
+  AOM_CQ,  /**< Constrained Quality (CQ)  mode */
+  AOM_Q,   /**< Constant Quality (Q) mode */
 };

 /*!\brief Keyframe placement mode.
 *
 * This enumeration determines whether keyframes are placed automatically by
 * the encoder or whether this behavior is disabled. Older releases of this
- * SDK were implemented such that VPX_KF_FIXED meant keyframes were disabled.
+ * SDK were implemented such that AOM_KF_FIXED meant keyframes were disabled.
 * This name is confusing for this behavior, so the new symbols to be used
- * are VPX_KF_AUTO and VPX_KF_DISABLED.
+ * are AOM_KF_AUTO and AOM_KF_DISABLED.
 */
-enum vpx_kf_mode {
-  VPX_KF_FIXED,       /**< deprecated, implies VPX_KF_DISABLED */
-  VPX_KF_AUTO,        /**< Encoder determines optimal placement automatically */
-  VPX_KF_DISABLED = 0 /**< Encoder does not place keyframes. */
+enum aom_kf_mode {
+  AOM_KF_FIXED,       /**< deprecated, implies AOM_KF_DISABLED */
+  AOM_KF_AUTO,        /**< Encoder determines optimal placement automatically */
+  AOM_KF_DISABLED = 0 /**< Encoder does not place keyframes. */
 };

 /*!\brief Encoded Frame Flags
 *
- * This type indicates a bitfield to be passed to vpx_codec_encode(), defining
+ * This type indicates a bitfield to be passed to aom_codec_encode(), defining
 * per-frame boolean values. By convention, bits common to all codecs will be
- * named VPX_EFLAG_*, and bits specific to an algorithm will be named
+ * named AOM_EFLAG_*, and bits specific to an algorithm will be named
 * /algo/_eflag_*. The lower order 16 bits are reserved for common use.
 */
-typedef long vpx_enc_frame_flags_t;
-#define VPX_EFLAG_FORCE_KF (1 << 0) /**< Force this frame to be a keyframe */
+typedef long aom_enc_frame_flags_t;
+#define AOM_EFLAG_FORCE_KF (1 << 0) /**< Force this frame to be a keyframe */

 /*!\brief Encoder configuration structure
 *
@@ -273,7 +232,7 @@ typedef long vpx_enc_frame_flags_t;
 * across all codecs. This doesn't imply that all codecs support all features,
 * however.
 */
-typedef struct vpx_codec_enc_cfg {
+typedef struct aom_codec_enc_cfg {
  /*
   * generic settings (g)
   */
@@ -327,9 +286,9 @@ typedef struct vpx_codec_enc_cfg {
   *
   * This value identifies the bit_depth of the codec,
   * Only certain bit-depths are supported as identified in the
-   * vpx_bit_depth_t enum.
+   * aom_bit_depth_t enum.
   */
-  vpx_bit_depth_t g_bit_depth;
+  aom_bit_depth_t g_bit_depth;

  /*!\brief Bit-depth of the input frames
   *
@@ -351,7 +310,7 @@ typedef struct vpx_codec_enc_cfg {
   * \ref RECOMMENDED method is to set the timebase to that of the parent
   * container or multimedia framework (ex: 1/1000 for ms, as in FLV).
   */
-  struct vpx_rational g_timebase;
+  struct aom_rational g_timebase;

  /*!\brief Enable error resilient modes.
   *
@@ -359,14 +318,14 @@ typedef struct vpx_codec_enc_cfg {
   * it should enable to take measures for streaming over lossy or noisy
   * links.
   */
-  vpx_codec_er_flags_t g_error_resilient;
+  aom_codec_er_flags_t g_error_resilient;

  /*!\brief Multi-pass Encoding Mode
   *
   * This value should be set to the current phase for multi-pass encoding.
-   * For single pass, set to #VPX_RC_ONE_PASS.
+   * For single pass, set to #AOM_RC_ONE_PASS.
   */
-  enum vpx_enc_pass g_pass;
+  enum aom_enc_pass g_pass;

  /*!\brief Allow lagged encoding
   *
@@ -394,7 +353,7 @@ typedef struct vpx_codec_enc_cfg {
   * trade-off is often acceptable, but for many applications is not. It can
   * be disabled in these cases.
   *
-   * Note that not all codecs support this feature. All vpx VPx codecs do.
+   * Note that not all codecs support this feature. All aom AVx codecs do.
   * For other codecs, consult the documentation for that algorithm.
   *
   * This threshold is described as a percentage of the target data buffer.
@@ -451,21 +410,21 @@ typedef struct vpx_codec_enc_cfg {
   * bandwidth link, as from a local disk, where higher variations in
   * bitrate are acceptable.
   */
-  enum vpx_rc_mode rc_end_usage;
+  enum aom_rc_mode rc_end_usage;

  /*!\brief Two-pass stats buffer.
   *
   * A buffer containing all of the stats packets produced in the first
   * pass, concatenated.
   */
-  vpx_fixed_buf_t rc_twopass_stats_in;
+  aom_fixed_buf_t rc_twopass_stats_in;

  /*!\brief first pass mb stats buffer.
   *
   * A buffer containing all of the first pass mb stats packets produced
   * in the first pass, concatenated.
   */
-  vpx_fixed_buf_t rc_firstpass_mb_stats_in;
+  aom_fixed_buf_t rc_firstpass_mb_stats_in;

  /*!\brief Target data rate
   *
@@ -483,7 +442,7 @@ typedef struct vpx_codec_enc_cfg {
   * encoded image. The range of valid values for the quantizer is codec
   * specific. Consult the documentation for the codec to determine the
   * values to use. To determine the range programmatically, call
-   * vpx_codec_enc_config_default() with a usage value of 0.
+   * aom_codec_enc_config_default() with a usage value of 0.
   */
  unsigned int rc_min_quantizer;

@@ -493,7 +452,7 @@ typedef struct vpx_codec_enc_cfg {
   * encoded image. The range of valid values for the quantizer is codec
   * specific. Consult the documentation for the codec to determine the
   * values to use. To determine the range programmatically, call
-   * vpx_codec_enc_config_default() with a usage value of 0.
+   * aom_codec_enc_config_default() with a usage value of 0.
   */
  unsigned int rc_max_quantizer;

@@ -503,31 +462,25 @@ typedef struct vpx_codec_enc_cfg {

  /*!\brief Rate control adaptation undershoot control
   *
-   * VP8: Expressed as a percentage of the target bitrate,
+   * This value, expressed as a percentage of the target bitrate,
   * controls the maximum allowed adaptation speed of the codec.
   * This factor controls the maximum amount of bits that can
   * be subtracted from the target bitrate in order to compensate
   * for prior overshoot.
-   * VP9: Expressed as a percentage of the target bitrate, a threshold
-   * undershoot level (current rate vs target) beyond which more agressive
-   * corrective measures are taken.
-   *   *
-   * Valid values in the range VP8:0-1000 VP9: 0-100.
+   *
+   * Valid values in the range 0-1000.
   */
  unsigned int rc_undershoot_pct;

  /*!\brief Rate control adaptation overshoot control
   *
-   * VP8: Expressed as a percentage of the target bitrate,
+   * This value, expressed as a percentage of the target bitrate,
   * controls the maximum allowed adaptation speed of the codec.
   * This factor controls the maximum amount of bits that can
   * be added to the target bitrate in order to compensate for
   * prior undershoot.
-   * VP9: Expressed as a percentage of the target bitrate, a threshold
-   * overshoot level (current rate vs target) beyond which more agressive
-   * corrective measures are taken.
   *
-   * Valid values in the range VP8:0-1000 VP9: 0-100.
+   * Valid values in the range 0-1000.
   */
  unsigned int rc_overshoot_pct;

@@ -592,13 +545,6 @@ typedef struct vpx_codec_enc_cfg {
   */
  unsigned int rc_2pass_vbr_maxsection_pct;

-  /*!\brief Two-pass corpus vbr mode complexity control
-   * Used only in VP9: A value representing the corpus midpoint complexity
-   * for corpus vbr mode. This value defaults to 0 which disables corpus vbr
-   * mode in favour of normal vbr mode.
-   */
-  unsigned int rc_2pass_vbr_corpus_complexity;
-
  /*
   * keyframing settings (kf)
   */
@@ -609,7 +555,7 @@ typedef struct vpx_codec_enc_cfg {
   * fixed interval, or determine the optimal placement automatically
   * (as governed by the #kf_min_dist and #kf_max_dist parameters)
   */
-  enum vpx_kf_mode kf_mode;
+  enum aom_kf_mode kf_mode;

  /*!\brief Keyframe minimum interval
   *
@@ -628,105 +574,12 @@ typedef struct vpx_codec_enc_cfg {
   * equal to kf_max_dist for a fixed interval.
   */
  unsigned int kf_max_dist;
-
-  /*
-   * Spatial scalability settings (ss)
-   */
-
-  /*!\brief Number of spatial coding layers.
-   *
-   * This value specifies the number of spatial coding layers to be used.
-   */
-  unsigned int ss_number_layers;
-
-  /*!\brief Enable auto alt reference flags for each spatial layer.
-   *
-   * These values specify if auto alt reference frame is enabled for each
-   * spatial layer.
-   */
-  int ss_enable_auto_alt_ref[VPX_SS_MAX_LAYERS];
-
-  /*!\brief Target bitrate for each spatial layer.
-   *
-   * These values specify the target coding bitrate to be used for each
-   * spatial layer.
-   */
-  unsigned int ss_target_bitrate[VPX_SS_MAX_LAYERS];
-
-  /*!\brief Number of temporal coding layers.
-   *
-   * This value specifies the number of temporal layers to be used.
-   */
-  unsigned int ts_number_layers;
-
-  /*!\brief Target bitrate for each temporal layer.
-   *
-   * These values specify the target coding bitrate to be used for each
-   * temporal layer.
-   */
-  unsigned int ts_target_bitrate[VPX_TS_MAX_LAYERS];
-
-  /*!\brief Frame rate decimation factor for each temporal layer.
-   *
-   * These values specify the frame rate decimation factors to apply
-   * to each temporal layer.
-   */
-  unsigned int ts_rate_decimator[VPX_TS_MAX_LAYERS];
-
-  /*!\brief Length of the sequence defining frame temporal layer membership.
-   *
-   * This value specifies the length of the sequence that defines the
-   * membership of frames to temporal layers. For example, if the
-   * ts_periodicity = 8, then the frames are assigned to coding layers with a
-   * repeated sequence of length 8.
-   */
-  unsigned int ts_periodicity;
-
-  /*!\brief Template defining the membership of frames to temporal layers.
-   *
-   * This array defines the membership of frames to temporal coding layers.
-   * For a 2-layer encoding that assigns even numbered frames to one temporal
-   * layer (0) and odd numbered frames to a second temporal layer (1) with
-   * ts_periodicity=8, then ts_layer_id = (0,1,0,1,0,1,0,1).
-   */
-  unsigned int ts_layer_id[VPX_TS_MAX_PERIODICITY];
-
-  /*!\brief Target bitrate for each spatial/temporal layer.
-   *
-   * These values specify the target coding bitrate to be used for each
-   * spatial/temporal layer.
-   *
-   */
-  unsigned int layer_target_bitrate[VPX_MAX_LAYERS];
-
-  /*!\brief Temporal layering mode indicating which temporal layering scheme to
-   * use.
-   *
-   * The value (refer to VP9E_TEMPORAL_LAYERING_MODE) specifies the
-   * temporal layering mode to use.
-   *
-   */
-  int temporal_layering_mode;
-} vpx_codec_enc_cfg_t; /**< alias for struct vpx_codec_enc_cfg */
-
-/*!\brief  vp9 svc extra configure parameters
- *
- * This defines max/min quantizers and scale factors for each layer
- *
- */
-typedef struct vpx_svc_parameters {
-  int max_quantizers[VPX_MAX_LAYERS];     /**< Max Q for each layer */
-  int min_quantizers[VPX_MAX_LAYERS];     /**< Min Q for each layer */
-  int scaling_factor_num[VPX_MAX_LAYERS]; /**< Scaling factor-numerator */
-  int scaling_factor_den[VPX_MAX_LAYERS]; /**< Scaling factor-denominator */
-  int speed_per_layer[VPX_MAX_LAYERS];    /**< Speed setting for each sl */
-  int temporal_layering_mode;             /**< Temporal layering mode */
-} vpx_svc_extra_cfg_t;
+} aom_codec_enc_cfg_t; /**< alias for struct aom_codec_enc_cfg */

 /*!\brief Initialize an encoder instance
 *
 * Initializes a encoder context using the given interface. Applications
- * should call the vpx_codec_enc_init convenience macro instead of this
+ * should call the aom_codec_enc_init convenience macro instead of this
 * function directly, to ensure that the ABI version number parameter
 * is properly initialized.
 *
@@ -737,30 +590,30 @@ typedef struct vpx_svc_parameters {
 * \param[in]    ctx     Pointer to this instance's context.
 * \param[in]    iface   Pointer to the algorithm interface to use.
 * \param[in]    cfg     Configuration to use, if known. May be NULL.
- * \param[in]    flags   Bitfield of VPX_CODEC_USE_* flags
+ * \param[in]    flags   Bitfield of AOM_CODEC_USE_* flags
 * \param[in]    ver     ABI version number. Must be set to
- *                       VPX_ENCODER_ABI_VERSION
- * \retval #VPX_CODEC_OK
+ *                       AOM_ENCODER_ABI_VERSION
+ * \retval #AOM_CODEC_OK
 *     The decoder algorithm initialized.
- * \retval #VPX_CODEC_MEM_ERROR
+ * \retval #AOM_CODEC_MEM_ERROR
 *     Memory allocation failed.
 */
-vpx_codec_err_t vpx_codec_enc_init_ver(vpx_codec_ctx_t *ctx,
-                                       vpx_codec_iface_t *iface,
-                                       const vpx_codec_enc_cfg_t *cfg,
-                                       vpx_codec_flags_t flags, int ver);
+aom_codec_err_t aom_codec_enc_init_ver(aom_codec_ctx_t *ctx,
+                                       aom_codec_iface_t *iface,
+                                       const aom_codec_enc_cfg_t *cfg,
+                                       aom_codec_flags_t flags, int ver);

-/*!\brief Convenience macro for vpx_codec_enc_init_ver()
+/*!\brief Convenience macro for aom_codec_enc_init_ver()
 *
 * Ensures the ABI version parameter is properly set.
 */
-#define vpx_codec_enc_init(ctx, iface, cfg, flags) \
-  vpx_codec_enc_init_ver(ctx, iface, cfg, flags, VPX_ENCODER_ABI_VERSION)
+#define aom_codec_enc_init(ctx, iface, cfg, flags) \
+  aom_codec_enc_init_ver(ctx, iface, cfg, flags, AOM_ENCODER_ABI_VERSION)

 /*!\brief Initialize multi-encoder instance
 *
 * Initializes multi-encoder context using the given interface.
- * Applications should call the vpx_codec_enc_init_multi convenience macro
+ * Applications should call the aom_codec_enc_init_multi convenience macro
 * instead of this function directly, to ensure that the ABI version number
 * parameter is properly initialized.
 *
@@ -768,26 +621,26 @@ vpx_codec_err_t vpx_codec_enc_init_ver(vpx_codec_ctx_t *ctx,
 * \param[in]    iface   Pointer to the algorithm interface to use.
 * \param[in]    cfg     Configuration to use, if known. May be NULL.
 * \param[in]    num_enc Total number of encoders.
- * \param[in]    flags   Bitfield of VPX_CODEC_USE_* flags
+ * \param[in]    flags   Bitfield of AOM_CODEC_USE_* flags
 * \param[in]    dsf     Pointer to down-sampling factors.
 * \param[in]    ver     ABI version number. Must be set to
- *                       VPX_ENCODER_ABI_VERSION
- * \retval #VPX_CODEC_OK
+ *                       AOM_ENCODER_ABI_VERSION
+ * \retval #AOM_CODEC_OK
 *     The decoder algorithm initialized.
- * \retval #VPX_CODEC_MEM_ERROR
+ * \retval #AOM_CODEC_MEM_ERROR
 *     Memory allocation failed.
 */
-vpx_codec_err_t vpx_codec_enc_init_multi_ver(
-    vpx_codec_ctx_t *ctx, vpx_codec_iface_t *iface, vpx_codec_enc_cfg_t *cfg,
-    int num_enc, vpx_codec_flags_t flags, vpx_rational_t *dsf, int ver);
+aom_codec_err_t aom_codec_enc_init_multi_ver(
+    aom_codec_ctx_t *ctx, aom_codec_iface_t *iface, aom_codec_enc_cfg_t *cfg,
+    int num_enc, aom_codec_flags_t flags, aom_rational_t *dsf, int ver);

-/*!\brief Convenience macro for vpx_codec_enc_init_multi_ver()
+/*!\brief Convenience macro for aom_codec_enc_init_multi_ver()
 *
 * Ensures the ABI version parameter is properly set.
 */
-#define vpx_codec_enc_init_multi(ctx, iface, cfg, num_enc, flags, dsf) \
-  vpx_codec_enc_init_multi_ver(ctx, iface, cfg, num_enc, flags, dsf,   \
-                               VPX_ENCODER_ABI_VERSION)
+#define aom_codec_enc_init_multi(ctx, iface, cfg, num_enc, flags, dsf) \
+  aom_codec_enc_init_multi_ver(ctx, iface, cfg, num_enc, flags, dsf,   \
+                               AOM_ENCODER_ABI_VERSION)

 /*!\brief Get a default configuration
 *
@@ -799,17 +652,17 @@ vpx_codec_err_t vpx_codec_enc_init_multi_ver(
 *
 * \param[in]    iface     Pointer to the algorithm interface to use.
 * \param[out]   cfg       Configuration buffer to populate.
- * \param[in]    reserved  Must set to 0 for VP8 and VP9.
+ * \param[in]    reserved  Must be set to 0 for AV1.
 *
- * \retval #VPX_CODEC_OK
+ * \retval #AOM_CODEC_OK
 *     The configuration was populated.
- * \retval #VPX_CODEC_INCAPABLE
+ * \retval #AOM_CODEC_INCAPABLE
 *     Interface is not an encoder interface.
- * \retval #VPX_CODEC_INVALID_PARAM
+ * \retval #AOM_CODEC_INVALID_PARAM
 *     A parameter was NULL, or the usage value was not recognized.
 */
-vpx_codec_err_t vpx_codec_enc_config_default(vpx_codec_iface_t *iface,
-                                             vpx_codec_enc_cfg_t *cfg,
+aom_codec_err_t aom_codec_enc_config_default(aom_codec_iface_t *iface,
+                                             aom_codec_enc_cfg_t *cfg,
                                             unsigned int reserved);

 /*!\brief Set or change configuration
@@ -819,15 +672,15 @@ vpx_codec_err_t vpx_codec_enc_config_default(vpx_codec_iface_t *iface,
 * \param[in]    ctx     Pointer to this instance's context
 * \param[in]    cfg     Configuration buffer to use
 *
- * \retval #VPX_CODEC_OK
+ * \retval #AOM_CODEC_OK
 *     The configuration was populated.
- * \retval #VPX_CODEC_INCAPABLE
+ * \retval #AOM_CODEC_INCAPABLE
 *     Interface is not an encoder interface.
- * \retval #VPX_CODEC_INVALID_PARAM
+ * \retval #AOM_CODEC_INVALID_PARAM
 *     A parameter was NULL, or the usage value was not recognized.
 */
-vpx_codec_err_t vpx_codec_enc_config_set(vpx_codec_ctx_t *ctx,
-                                         const vpx_codec_enc_cfg_t *cfg);
+aom_codec_err_t aom_codec_enc_config_set(aom_codec_ctx_t *ctx,
+                                         const aom_codec_enc_cfg_t *cfg);

 /*!\brief Get global stream headers
 *
@@ -840,14 +693,14 @@ vpx_codec_err_t vpx_codec_enc_config_set(vpx_codec_ctx_t *ctx,
 * \retval Non-NULL
 *     Pointer to buffer containing global header packet
 */
-vpx_fixed_buf_t *vpx_codec_get_global_headers(vpx_codec_ctx_t *ctx);
+aom_fixed_buf_t *aom_codec_get_global_headers(aom_codec_ctx_t *ctx);

-/*!\brief deadline parameter analogous to VPx REALTIME mode. */
-#define VPX_DL_REALTIME (1)
-/*!\brief deadline parameter analogous to  VPx GOOD QUALITY mode. */
-#define VPX_DL_GOOD_QUALITY (1000000)
-/*!\brief deadline parameter analogous to VPx BEST QUALITY mode. */
-#define VPX_DL_BEST_QUALITY (0)
+/*!\brief deadline parameter analogous to AVx REALTIME mode. */
+#define AOM_DL_REALTIME (1)
+/*!\brief deadline parameter analogous to AVx GOOD QUALITY mode. */
+#define AOM_DL_GOOD_QUALITY (1000000)
+/*!\brief deadline parameter analogous to AVx BEST QUALITY mode. */
+#define AOM_DL_BEST_QUALITY (0)
 /*!\brief Encode a frame
 *
 * Encodes a video frame at the given "presentation time." The presentation
@@ -859,16 +712,16 @@ vpx_fixed_buf_t *vpx_codec_get_global_headers(vpx_codec_ctx_t *ctx);
 * implicit that limiting the available time to encode will degrade the
 * output quality. The encoder can be given an unlimited time to produce the
 * best possible frame by specifying a deadline of '0'. This deadline
- * supercedes the VPx notion of "best quality, good quality, realtime".
+ * supersedes the AVx notion of "best quality, good quality, realtime".
 * Applications that wish to map these former settings to the new deadline
- * based system can use the symbols #VPX_DL_REALTIME, #VPX_DL_GOOD_QUALITY,
- * and #VPX_DL_BEST_QUALITY.
+ * based system can use the symbols #AOM_DL_REALTIME, #AOM_DL_GOOD_QUALITY,
+ * and #AOM_DL_BEST_QUALITY.
 *
 * When the last frame has been passed to the encoder, this function should
 * continue to be called, with the img parameter set to NULL. This will
 * signal the end-of-stream condition to the encoder and allow it to encode
- * any held buffers. Encoding is complete when vpx_codec_encode() is called
- * and vpx_codec_get_cx_data() returns no data.
+ * any held buffers. Encoding is complete when aom_codec_encode() is called
+ * and aom_codec_get_cx_data() returns no data.
 *
 * \param[in]    ctx       Pointer to this instance's context
 * \param[in]    img       Image data to encode, NULL to flush.
@@ -877,23 +730,23 @@ vpx_fixed_buf_t *vpx_codec_get_global_headers(vpx_codec_ctx_t *ctx);
 * \param[in]    flags     Flags to use for encoding this frame.
 * \param[in]    deadline  Time to spend encoding, in microseconds. (0=infinite)
 *
- * \retval #VPX_CODEC_OK
+ * \retval #AOM_CODEC_OK
 *     The configuration was populated.
- * \retval #VPX_CODEC_INCAPABLE
+ * \retval #AOM_CODEC_INCAPABLE
 *     Interface is not an encoder interface.
- * \retval #VPX_CODEC_INVALID_PARAM
+ * \retval #AOM_CODEC_INVALID_PARAM
 *     A parameter was NULL, the image format is unsupported, etc.
 */
-vpx_codec_err_t vpx_codec_encode(vpx_codec_ctx_t *ctx, const vpx_image_t *img,
-                                 vpx_codec_pts_t pts, unsigned long duration,
-                                 vpx_enc_frame_flags_t flags,
+aom_codec_err_t aom_codec_encode(aom_codec_ctx_t *ctx, const aom_image_t *img,
+                                 aom_codec_pts_t pts, unsigned long duration,
+                                 aom_enc_frame_flags_t flags,
                                 unsigned long deadline);

 /*!\brief Set compressed data output buffer
 *
 * Sets the buffer that the codec should output the compressed data
 * into. This call effectively sets the buffer pointer returned in the
- * next VPX_CODEC_CX_FRAME_PKT packet. Subsequent packets will be
+ * next AOM_CODEC_CX_FRAME_PKT packet. Subsequent packets will be
 * appended into this buffer. The buffer is preserved across frames,
 * so applications must periodically call this function after flushing
 * the accumulated compressed data to disk or to the network to reset
@@ -920,20 +773,20 @@ vpx_codec_err_t vpx_codec_encode(vpx_codec_ctx_t *ctx, const vpx_image_t *img,
 * buffer.
 *
 * Applications \ref MUSTNOT call this function during iteration of
- * vpx_codec_get_cx_data().
+ * aom_codec_get_cx_data().
 *
 * \param[in]    ctx         Pointer to this instance's context
 * \param[in]    buf         Buffer to store compressed data into
 * \param[in]    pad_before  Bytes to skip before writing compressed data
 * \param[in]    pad_after   Bytes to skip after writing compressed data
 *
- * \retval #VPX_CODEC_OK
+ * \retval #AOM_CODEC_OK
 *     The buffer was set successfully.
- * \retval #VPX_CODEC_INVALID_PARAM
+ * \retval #AOM_CODEC_INVALID_PARAM
 *     A parameter was NULL, the image format is unsupported, etc.
 */
-vpx_codec_err_t vpx_codec_set_cx_data_buf(vpx_codec_ctx_t *ctx,
-                                          const vpx_fixed_buf_t *buf,
+aom_codec_err_t aom_codec_set_cx_data_buf(aom_codec_ctx_t *ctx,
+                                          const aom_fixed_buf_t *buf,
                                          unsigned int pad_before,
                                          unsigned int pad_after);

@@ -941,17 +794,17 @@ vpx_codec_err_t vpx_codec_set_cx_data_buf(vpx_codec_ctx_t *ctx,
 *
 * Iterates over a list of data packets to be passed from the encoder to the
 * application. The different kinds of packets available are enumerated in
- * #vpx_codec_cx_pkt_kind.
+ * #aom_codec_cx_pkt_kind.
 *
- * #VPX_CODEC_CX_FRAME_PKT packets should be passed to the application's
+ * #AOM_CODEC_CX_FRAME_PKT packets should be passed to the application's
 * muxer. Multiple compressed frames may be in the list.
- * #VPX_CODEC_STATS_PKT packets should be appended to a global buffer.
+ * #AOM_CODEC_STATS_PKT packets should be appended to a global buffer.
 *
 * The application \ref MUST silently ignore any packet kinds that it does
 * not recognize or support.
 *
 * The data buffers returned from this function are only guaranteed to be
- * valid until the application makes another call to any vpx_codec_* function.
+ * valid until the application makes another call to any aom_codec_* function.
 *
 * \param[in]     ctx      Pointer to this instance's context
 * \param[in,out] iter     Iterator storage, initialized to NULL
@@ -960,8 +813,8 @@ vpx_codec_err_t vpx_codec_set_cx_data_buf(vpx_codec_ctx_t *ctx,
 *         two-pass statistics, etc.) or NULL to signal end-of-list.
 *
 */
-const vpx_codec_cx_pkt_t *vpx_codec_get_cx_data(vpx_codec_ctx_t *ctx,
-                                                vpx_codec_iter_t *iter);
+const aom_codec_cx_pkt_t *aom_codec_get_cx_data(aom_codec_ctx_t *ctx,
+                                                aom_codec_iter_t *iter);

 /*!\brief Get Preview Frame
 *
@@ -975,10 +828,10 @@ const vpx_codec_cx_pkt_t *vpx_codec_get_cx_data(vpx_codec_ctx_t *ctx,
 *         available.
 *
 */
-const vpx_image_t *vpx_codec_get_preview_frame(vpx_codec_ctx_t *ctx);
+const aom_image_t *aom_codec_get_preview_frame(aom_codec_ctx_t *ctx);

 /*!@} - end defgroup encoder*/
 #ifdef __cplusplus
 }
 #endif
-#endif  // VPX_VPX_ENCODER_H_
+#endif  // AOM_AOM_ENCODER_H_
--- a/aom/aom_frame_buffer.h
+++ b/aom/aom_frame_buffer.h
@@ -1,15 +1,16 @@
 /*
- *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

-#ifndef VPX_VPX_FRAME_BUFFER_H_
-#define VPX_VPX_FRAME_BUFFER_H_
+#ifndef AOM_AOM_FRAME_BUFFER_H_
+#define AOM_AOM_FRAME_BUFFER_H_

 /*!\file
 * \brief Describes the decoder external frame buffer interface.
@@ -19,28 +20,28 @@
 extern "C" {
 #endif

-#include "./vpx_integer.h"
+#include "./aom_integer.h"

-/*!\brief The maximum number of work buffers used by libvpx.
+/*!\brief The maximum number of work buffers used by libaom.
 *  Support maximum 4 threads to decode video in parallel.
 *  Each thread will use one work buffer.
 * TODO(hkuang): Add support to set number of worker threads dynamically.
 */
-#define VPX_MAXIMUM_WORK_BUFFERS 8
+#define AOM_MAXIMUM_WORK_BUFFERS 8

-/*!\brief The maximum number of reference buffers that a VP9 encoder may use.
+/*!\brief The maximum number of reference buffers that an AV1 encoder may use.
 */
-#define VP9_MAXIMUM_REF_BUFFERS 8
+#define AOM_MAXIMUM_REF_BUFFERS 8

 /*!\brief External frame buffer
 *
 * This structure holds allocated frame buffers used by the decoder.
 */
-typedef struct vpx_codec_frame_buffer {
+typedef struct aom_codec_frame_buffer {
  uint8_t *data; /**< Pointer to the data buffer */
  size_t size;   /**< Size of data in bytes */
  void *priv;    /**< Frame's private data */
-} vpx_codec_frame_buffer_t;
+} aom_codec_frame_buffer_t;

 /*!\brief get frame buffer callback prototype
 *
@@ -51,17 +52,17 @@ typedef struct vpx_codec_frame_buffer {
 * to the allocated size. The application does not need to align the allocated
 * data. The callback is triggered when the decoder needs a frame buffer to
 * decode a compressed image into. This function may be called more than once
- * for every call to vpx_codec_decode. The application may set fb->priv to
+ * for every call to aom_codec_decode. The application may set fb->priv to
 * some data which will be passed back in the ximage and the release function
 * call. |fb| is guaranteed to not be NULL. On success the callback must
 * return 0. Any failure the callback must return a value less than 0.
 *
 * \param[in] priv         Callback's private data
- * \param[in] min_size     Size in bytes needed by the buffer
- * \param[in,out] fb       Pointer to vpx_codec_frame_buffer_t
+ * \param[in] min_size     Size in bytes needed by the buffer
+ * \param[in,out] fb       Pointer to aom_codec_frame_buffer_t
 */
-typedef int (*vpx_get_frame_buffer_cb_fn_t)(void *priv, size_t min_size,
-                                            vpx_codec_frame_buffer_t *fb);
+typedef int (*aom_get_frame_buffer_cb_fn_t)(void *priv, size_t min_size,
+                                            aom_codec_frame_buffer_t *fb);

 /*!\brief release frame buffer callback prototype
 *
@@ -71,13 +72,13 @@ typedef int (*vpx_get_frame_buffer_cb_fn_t)(void *priv, size_t min_size,
 * a value less than 0.
 *
 * \param[in] priv         Callback's private data
- * \param[in] fb           Pointer to vpx_codec_frame_buffer_t
+ * \param[in] fb           Pointer to aom_codec_frame_buffer_t
 */
-typedef int (*vpx_release_frame_buffer_cb_fn_t)(void *priv,
-                                                vpx_codec_frame_buffer_t *fb);
+typedef int (*aom_release_frame_buffer_cb_fn_t)(void *priv,
+                                                aom_codec_frame_buffer_t *fb);

 #ifdef __cplusplus
 }  // extern "C"
 #endif

-#endif  // VPX_VPX_FRAME_BUFFER_H_
+#endif  // AOM_AOM_FRAME_BUFFER_H_
--- a/aom/aom_image.h
+++ b/aom/aom_image.h
@@ -1,19 +1,20 @@
 /*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

 /*!\file
- * \brief Describes the vpx image descriptor and associated operations
+ * \brief Describes the aom image descriptor and associated operations
 *
 */
-#ifndef VPX_VPX_IMAGE_H_
-#define VPX_VPX_IMAGE_H_
+#ifndef AOM_AOM_IMAGE_H_
+#define AOM_AOM_IMAGE_H_

 #ifdef __cplusplus
 extern "C" {
@@ -27,51 +28,68 @@ extern "C" {
 * types, removing or reassigning enums, adding/removing/rearranging
 * fields to structures
 */
-#define VPX_IMAGE_ABI_VERSION (5) /**<\hideinitializer*/
+#define AOM_IMAGE_ABI_VERSION (4) /**<\hideinitializer*/

-#define VPX_IMG_FMT_PLANAR 0x100       /**< Image is a planar format. */
-#define VPX_IMG_FMT_UV_FLIP 0x200      /**< V plane precedes U in memory. */
-#define VPX_IMG_FMT_HAS_ALPHA 0x400    /**< Image has an alpha channel. */
-#define VPX_IMG_FMT_HIGHBITDEPTH 0x800 /**< Image uses 16bit framebuffer. */
+#define AOM_IMG_FMT_PLANAR 0x100       /**< Image is a planar format. */
+#define AOM_IMG_FMT_UV_FLIP 0x200      /**< V plane precedes U in memory. */
+#define AOM_IMG_FMT_HAS_ALPHA 0x400    /**< Image has an alpha channel. */
+#define AOM_IMG_FMT_HIGHBITDEPTH 0x800 /**< Image uses 16bit framebuffer. */

 /*!\brief List of supported image formats */
-typedef enum vpx_img_fmt {
-  VPX_IMG_FMT_NONE,
-  VPX_IMG_FMT_YV12 =
-      VPX_IMG_FMT_PLANAR | VPX_IMG_FMT_UV_FLIP | 1, /**< planar YVU */
-  VPX_IMG_FMT_I420 = VPX_IMG_FMT_PLANAR | 2,
-  VPX_IMG_FMT_I422 = VPX_IMG_FMT_PLANAR | 5,
-  VPX_IMG_FMT_I444 = VPX_IMG_FMT_PLANAR | 6,
-  VPX_IMG_FMT_I440 = VPX_IMG_FMT_PLANAR | 7,
-  VPX_IMG_FMT_I42016 = VPX_IMG_FMT_I420 | VPX_IMG_FMT_HIGHBITDEPTH,
-  VPX_IMG_FMT_I42216 = VPX_IMG_FMT_I422 | VPX_IMG_FMT_HIGHBITDEPTH,
-  VPX_IMG_FMT_I44416 = VPX_IMG_FMT_I444 | VPX_IMG_FMT_HIGHBITDEPTH,
-  VPX_IMG_FMT_I44016 = VPX_IMG_FMT_I440 | VPX_IMG_FMT_HIGHBITDEPTH
-} vpx_img_fmt_t; /**< alias for enum vpx_img_fmt */
+typedef enum aom_img_fmt {
+  AOM_IMG_FMT_NONE,
+  AOM_IMG_FMT_RGB24,     /**< 24 bit per pixel packed RGB */
+  AOM_IMG_FMT_RGB32,     /**< 32 bit per pixel packed 0RGB */
+  AOM_IMG_FMT_RGB565,    /**< 16 bit per pixel, 565 */
+  AOM_IMG_FMT_RGB555,    /**< 16 bit per pixel, 555 */
+  AOM_IMG_FMT_UYVY,      /**< UYVY packed YUV */
+  AOM_IMG_FMT_YUY2,      /**< YUYV packed YUV */
+  AOM_IMG_FMT_YVYU,      /**< YVYU packed YUV */
+  AOM_IMG_FMT_BGR24,     /**< 24 bit per pixel packed BGR */
+  AOM_IMG_FMT_RGB32_LE,  /**< 32 bit packed BGR0 */
+  AOM_IMG_FMT_ARGB,      /**< 32 bit packed ARGB, alpha=255 */
+  AOM_IMG_FMT_ARGB_LE,   /**< 32 bit packed BGRA, alpha=255 */
+  AOM_IMG_FMT_RGB565_LE, /**< 16 bit per pixel, gggbbbbb rrrrrggg */
+  AOM_IMG_FMT_RGB555_LE, /**< 16 bit per pixel, gggbbbbb 0rrrrrgg */
+  AOM_IMG_FMT_YV12 =
+      AOM_IMG_FMT_PLANAR | AOM_IMG_FMT_UV_FLIP | 1, /**< planar YVU */
+  AOM_IMG_FMT_I420 = AOM_IMG_FMT_PLANAR | 2,
+  AOM_IMG_FMT_AOMYV12 = AOM_IMG_FMT_PLANAR | AOM_IMG_FMT_UV_FLIP |
+                        3, /**< planar 4:2:0 format with aom color space */
+  AOM_IMG_FMT_AOMI420 = AOM_IMG_FMT_PLANAR | 4,
+  AOM_IMG_FMT_I422 = AOM_IMG_FMT_PLANAR | 5,
+  AOM_IMG_FMT_I444 = AOM_IMG_FMT_PLANAR | 6,
+  AOM_IMG_FMT_I440 = AOM_IMG_FMT_PLANAR | 7,
+  AOM_IMG_FMT_444A = AOM_IMG_FMT_PLANAR | AOM_IMG_FMT_HAS_ALPHA | 6,
+  AOM_IMG_FMT_I42016 = AOM_IMG_FMT_I420 | AOM_IMG_FMT_HIGHBITDEPTH,
+  AOM_IMG_FMT_I42216 = AOM_IMG_FMT_I422 | AOM_IMG_FMT_HIGHBITDEPTH,
+  AOM_IMG_FMT_I44416 = AOM_IMG_FMT_I444 | AOM_IMG_FMT_HIGHBITDEPTH,
+  AOM_IMG_FMT_I44016 = AOM_IMG_FMT_I440 | AOM_IMG_FMT_HIGHBITDEPTH
+} aom_img_fmt_t; /**< alias for enum aom_img_fmt */

 /*!\brief List of supported color spaces */
-typedef enum vpx_color_space {
-  VPX_CS_UNKNOWN = 0,   /**< Unknown */
-  VPX_CS_BT_601 = 1,    /**< BT.601 */
-  VPX_CS_BT_709 = 2,    /**< BT.709 */
-  VPX_CS_SMPTE_170 = 3, /**< SMPTE.170 */
-  VPX_CS_SMPTE_240 = 4, /**< SMPTE.240 */
-  VPX_CS_BT_2020 = 5,   /**< BT.2020 */
-  VPX_CS_RESERVED = 6,  /**< Reserved */
-  VPX_CS_SRGB = 7       /**< sRGB */
-} vpx_color_space_t;    /**< alias for enum vpx_color_space */
+typedef enum aom_color_space {
+  AOM_CS_UNKNOWN = 0,   /**< Unknown */
+  AOM_CS_BT_601 = 1,    /**< BT.601 */
+  AOM_CS_BT_709 = 2,    /**< BT.709 */
+  AOM_CS_SMPTE_170 = 3, /**< SMPTE.170 */
+  AOM_CS_SMPTE_240 = 4, /**< SMPTE.240 */
+  AOM_CS_BT_2020 = 5,   /**< BT.2020 */
+  AOM_CS_RESERVED = 6,  /**< Reserved */
+  AOM_CS_SRGB = 7       /**< sRGB */
+} aom_color_space_t;    /**< alias for enum aom_color_space */

 /*!\brief List of supported color range */
-typedef enum vpx_color_range {
-  VPX_CR_STUDIO_RANGE = 0, /**< Y [16..235], UV [16..240] */
-  VPX_CR_FULL_RANGE = 1    /**< YUV/RGB [0..255] */
-} vpx_color_range_t;       /**< alias for enum vpx_color_range */
+typedef enum aom_color_range {
+  AOM_CR_STUDIO_RANGE = 0, /**< Y [16..235], UV [16..240] */
+  AOM_CR_FULL_RANGE = 1    /**< YUV/RGB [0..255] */
+} aom_color_range_t;       /**< alias for enum aom_color_range */

 /**\brief Image Descriptor */
-typedef struct vpx_image {
-  vpx_img_fmt_t fmt;       /**< Image Format */
-  vpx_color_space_t cs;    /**< Color Space */
-  vpx_color_range_t range; /**< Color Range */
+typedef struct aom_image {
+  aom_img_fmt_t fmt;       /**< Image Format */
+  aom_color_space_t cs;    /**< Color Space */
+  aom_color_range_t range; /**< Color Range */

  /* Image storage dimensions */
  unsigned int w;         /**< Stored image width */
@@ -91,11 +109,11 @@ typedef struct vpx_image {
  unsigned int y_chroma_shift; /**< subsampling order, Y */

 /* Image data pointers. */
-#define VPX_PLANE_PACKED 0  /**< To be used for all packed formats */
-#define VPX_PLANE_Y 0       /**< Y (Luminance) plane */
-#define VPX_PLANE_U 1       /**< U (Chroma) plane */
-#define VPX_PLANE_V 2       /**< V (Chroma) plane */
-#define VPX_PLANE_ALPHA 3   /**< A (Transparency) plane */
+#define AOM_PLANE_PACKED 0  /**< To be used for all packed formats */
+#define AOM_PLANE_Y 0       /**< Y (Luminance) plane */
+#define AOM_PLANE_U 1       /**< U (Chroma) plane */
+#define AOM_PLANE_V 2       /**< V (Chroma) plane */
+#define AOM_PLANE_ALPHA 3   /**< A (Transparency) plane */
  unsigned char *planes[4]; /**< pointer to the top left pixel for each plane */
  int stride[4];            /**< stride between rows for each plane */

@@ -112,15 +130,15 @@ typedef struct vpx_image {
  int self_allocd;         /**< private */

  void *fb_priv; /**< Frame buffer data associated with the image. */
-} vpx_image_t;   /**< alias for struct vpx_image */
+} aom_image_t;   /**< alias for struct aom_image */

 /**\brief Representation of a rectangle on a surface */
-typedef struct vpx_image_rect {
+typedef struct aom_image_rect {
  unsigned int x;   /**< leftmost column */
  unsigned int y;   /**< topmost row */
  unsigned int w;   /**< width */
  unsigned int h;   /**< height */
-} vpx_image_rect_t; /**< alias for struct vpx_image_rect */
+} aom_image_rect_t; /**< alias for struct aom_image_rect */

 /*!\brief Open a descriptor, allocating storage for the underlying image
 *
@@ -140,7 +158,7 @@ typedef struct vpx_image_rect {
 *         parameter is non-null, the value of the img parameter will be
 *         returned.
 */
-vpx_image_t *vpx_img_alloc(vpx_image_t *img, vpx_img_fmt_t fmt,
+aom_image_t *aom_img_alloc(aom_image_t *img, aom_img_fmt_t fmt,
                           unsigned int d_w, unsigned int d_h,
                           unsigned int align);

@@ -163,7 +181,7 @@ vpx_image_t *vpx_img_alloc(vpx_image_t *img, vpx_img_fmt_t fmt,
 *         parameter is non-null, the value of the img parameter will be
 *         returned.
 */
-vpx_image_t *vpx_img_wrap(vpx_image_t *img, vpx_img_fmt_t fmt, unsigned int d_w,
+aom_image_t *aom_img_wrap(aom_image_t *img, aom_img_fmt_t fmt, unsigned int d_w,
                          unsigned int d_h, unsigned int align,
                          unsigned char *img_data);

@@ -180,7 +198,7 @@ vpx_image_t *vpx_img_wrap(vpx_image_t *img, vpx_img_fmt_t fmt, unsigned int d_w,
 *
 * \return 0 if the requested rectangle is valid, nonzero otherwise.
 */
-int vpx_img_set_rect(vpx_image_t *img, unsigned int x, unsigned int y,
+int aom_img_set_rect(aom_image_t *img, unsigned int x, unsigned int y,
                     unsigned int w, unsigned int h);

 /*!\brief Flip the image vertically (top for bottom)
@@ -190,7 +208,7 @@ int vpx_img_set_rect(vpx_image_t *img, unsigned int x, unsigned int y,
 *
 * \param[in]    img       Image descriptor
 */
-void vpx_img_flip(vpx_image_t *img);
+void aom_img_flip(aom_image_t *img);

 /*!\brief Close an image descriptor
 *
@@ -198,10 +216,10 @@ void vpx_img_flip(vpx_image_t *img);
 *
 * \param[in]    img       Image descriptor
 */
-void vpx_img_free(vpx_image_t *img);
+void aom_img_free(aom_image_t *img);

 #ifdef __cplusplus
 }  // extern "C"
 #endif

-#endif  // VPX_VPX_IMAGE_H_
+#endif  // AOM_AOM_IMAGE_H_
--- a/aom/aom_integer.h
+++ b/aom/aom_integer.h
@@ -1,29 +1,30 @@
 /*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

-#ifndef VPX_VPX_INTEGER_H_
-#define VPX_VPX_INTEGER_H_
+#ifndef AOM_AOM_INTEGER_H_
+#define AOM_AOM_INTEGER_H_

 /* get ptrdiff_t, size_t, wchar_t, NULL */
 #include <stddef.h>

 #if defined(_MSC_VER)
-#define VPX_FORCE_INLINE __forceinline
-#define VPX_INLINE __inline
+#define AOM_FORCE_INLINE __forceinline
+#define AOM_INLINE __inline
 #else
-#define VPX_FORCE_INLINE __inline__ __attribute__(always_inline)
+#define AOM_FORCE_INLINE __inline__ __attribute__((always_inline))
 // TODO(jbb): Allow a way to force inline off for older compilers.
-#define VPX_INLINE inline
+#define AOM_INLINE inline
 #endif

-#if defined(VPX_EMULATE_INTTYPES)
+#if defined(AOM_EMULATE_INTTYPES)
 typedef signed char int8_t;
 typedef signed short int16_t;
 typedef signed int int32_t;
@@ -60,4 +61,4 @@ typedef size_t uintptr_t;
 #include <inttypes.h>
 #endif

-#endif  // VPX_VPX_INTEGER_H_
+#endif  // AOM_AOM_INTEGER_H_
--- a/aom/aomcx.h
+++ b/aom/aomcx.h
@@ -0,0 +1,759 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AOMCX_H_
+#define AOM_AOMCX_H_
+
+/*!\defgroup aom_encoder AOMedia AOM/AV1 Encoder
+ * \ingroup aom
+ *
+ * @{
+ */
+#include "./aom.h"
+#include "./aom_encoder.h"
+
+/*!\file
+ * \brief Provides definitions for using AOM or AV1 encoder algorithm within the
+ *        aom Codec Interface.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!\name Algorithm interface for AV1
+ *
+ * This interface provides the capability to encode raw AV1 streams.
+ * @{
+ */
+extern aom_codec_iface_t aom_codec_av1_cx_algo;
+extern aom_codec_iface_t *aom_codec_av1_cx(void);
+/*!@} - end algorithm interface member group*/
+
+/*
+ * Algorithm Flags
+ */
+
+/*!\brief Don't reference the last frame
+ *
+ * When this flag is set, the encoder will not use the last frame as a
+ * predictor. When not set, the encoder will choose whether to use the
+ * last frame or not automatically.
+ */
+#define AOM_EFLAG_NO_REF_LAST (1 << 16)
+
+/*!\brief Don't reference the golden frame
+ *
+ * When this flag is set, the encoder will not use the golden frame as a
+ * predictor. When not set, the encoder will choose whether to use the
+ * golden frame or not automatically.
+ */
+#define AOM_EFLAG_NO_REF_GF (1 << 17)
+
+/*!\brief Don't reference the alternate reference frame
+ *
+ * When this flag is set, the encoder will not use the alt ref frame as a
+ * predictor. When not set, the encoder will choose whether to use the
+ * alt ref frame or not automatically.
+ */
+#define AOM_EFLAG_NO_REF_ARF (1 << 21)
+
+/*!\brief Don't update the last frame
+ *
+ * When this flag is set, the encoder will not update the last frame with
+ * the contents of the current frame.
+ */
+#define AOM_EFLAG_NO_UPD_LAST (1 << 18)
+
+/*!\brief Don't update the golden frame
+ *
+ * When this flag is set, the encoder will not update the golden frame with
+ * the contents of the current frame.
+ */
+#define AOM_EFLAG_NO_UPD_GF (1 << 22)
+
+/*!\brief Don't update the alternate reference frame
+ *
+ * When this flag is set, the encoder will not update the alt ref frame with
+ * the contents of the current frame.
+ */
+#define AOM_EFLAG_NO_UPD_ARF (1 << 23)
+
+/*!\brief Force golden frame update
+ *
+ * When this flag is set, the encoder copies the contents of the current frame
+ * to the golden frame buffer.
+ */
+#define AOM_EFLAG_FORCE_GF (1 << 19)
+
+/*!\brief Force alternate reference frame update
+ *
+ * When this flag is set, the encoder copies the contents of the current frame
+ * to the alternate reference frame buffer.
+ */
+#define AOM_EFLAG_FORCE_ARF (1 << 24)
+
+/*!\brief Disable entropy update
+ *
+ * When this flag is set, the encoder will not update its internal entropy
+ * model based on the entropy of this frame.
+ */
+#define AOM_EFLAG_NO_UPD_ENTROPY (1 << 20)
+
+/*!\brief AVx encoder control functions
+ *
+ * This set of macros define the control functions available for AVx
+ * encoder interface.
+ *
+ * \sa #aom_codec_control
+ */
+enum aome_enc_control_id {
+  /*!\brief Codec control function to set which reference frame encoder can use.
+   *
+   * Supported in codecs: VP8, AV1
+   */
+  AOME_USE_REFERENCE = 7,
+
+  /*!\brief Codec control function to pass an ROI map to encoder.
+   *
+   * Supported in codecs: VP8, AV1
+   */
+  AOME_SET_ROI_MAP = 8,
+
+  /*!\brief Codec control function to pass an Active map to encoder.
+   *
+   * Supported in codecs: VP8, AV1
+   */
+  AOME_SET_ACTIVEMAP,
+
+  /*!\brief Codec control function to set encoder scaling mode.
+   *
+   * Supported in codecs: VP8, AV1
+   */
+  AOME_SET_SCALEMODE = 11,
+
+  /*!\brief Codec control function to set encoder internal speed settings.
+   *
+   * Changes in this value influences, among others, the encoder's selection
+   * of motion estimation methods. Values greater than 0 will increase encoder
+   * speed at the expense of quality.
+   *
+   * \note Valid range for VP8: -16..16
+   * \note Valid range for AV1: -8..8
+   *
+   * Supported in codecs: VP8, AV1
+   */
+  AOME_SET_CPUUSED = 13,
+
+  /*!\brief Codec control function to enable automatic set and use arf frames.
+   *
+   * Supported in codecs: VP8, AV1
+   */
+  AOME_SET_ENABLEAUTOALTREF,
+
+#if CONFIG_EXT_REFS
+  /*!\brief Codec control function to enable automatic set and use
+   * bwd-pred frames.
+   *
+   * Supported in codecs: AV1
+   */
+  AOME_SET_ENABLEAUTOBWDREF,
+#endif  // CONFIG_EXT_REFS
+
+  /*!\brief control function to set noise sensitivity
+   *
+   * 0: off, 1: OnYOnly, 2: OnYUV,
+   * 3: OnYUVAggressive, 4: Adaptive
+   *
+   * Supported in codecs: VP8
+   */
+  AOME_SET_NOISE_SENSITIVITY,
+
+  /*!\brief Codec control function to set sharpness.
+   *
+   * Supported in codecs: VP8, AV1
+   */
+  AOME_SET_SHARPNESS,
+
+  /*!\brief Codec control function to set the threshold for MBs treated static.
+   *
+   * Supported in codecs: VP8, AV1
+   */
+  AOME_SET_STATIC_THRESHOLD,
+
+  /*!\brief Codec control function to set the number of token partitions.
+   *
+   * Supported in codecs: VP8
+   */
+  AOME_SET_TOKEN_PARTITIONS,
+
+  /*!\brief Codec control function to get last quantizer chosen by the encoder.
+   *
+   * Return value uses internal quantizer scale defined by the codec.
+   *
+   * Supported in codecs: VP8, AV1
+   */
+  AOME_GET_LAST_QUANTIZER,
+
+  /*!\brief Codec control function to get last quantizer chosen by the encoder.
+   *
+   * Return value uses the 0..63 scale as used by the rc_*_quantizer config
+   * parameters.
+   *
+   * Supported in codecs: VP8, AV1
+   */
+  AOME_GET_LAST_QUANTIZER_64,
+
+  /*!\brief Codec control function to set the max no of frames to create arf.
+   *
+   * Supported in codecs: VP8, AV1
+   */
+  AOME_SET_ARNR_MAXFRAMES,
+
+  /*!\brief Codec control function to set the filter strength for the arf.
+   *
+   * Supported in codecs: VP8, AV1
+   */
+  AOME_SET_ARNR_STRENGTH,
+
+  /*!\deprecated control function to set the filter type to use for the arf. */
+  AOME_SET_ARNR_TYPE,
+
+  /*!\brief Codec control function to set visual tuning.
+   *
+   * Supported in codecs: VP8, AV1
+   */
+  AOME_SET_TUNING,
+
+  /*!\brief Codec control function to set constrained quality level.
+   *
+   * \attention For this value to be used aom_codec_enc_cfg_t::rc_end_usage
+   *            must be set to #AOM_CQ.
+   * \note Valid range: 0..63
+   *
+   * Supported in codecs: VP8, AV1
+   */
+  AOME_SET_CQ_LEVEL,
+
+  /*!\brief Codec control function to set Max data rate for Intra frames.
+   *
+   * This value controls additional clamping on the maximum size of a
+   * keyframe. It is expressed as a percentage of the average
+   * per-frame bitrate, with the special (and default) value 0 meaning
+   * unlimited, or no additional clamping beyond the codec's built-in
+   * algorithm.
+   *
+   * For example, to allocate no more than 4.5 frames worth of bitrate
+   * to a keyframe, set this to 450.
+   *
+   * Supported in codecs: VP8, AV1
+   */
+  AOME_SET_MAX_INTRA_BITRATE_PCT,
+
+  /*!\brief Codec control function to set reference and update frame flags.
+   *
+   *  Supported in codecs: VP8
+   */
+  AOME_SET_FRAME_FLAGS,
+
+  /*!\brief Codec control function to set max data rate for Inter frames.
+   *
+   * This value controls additional clamping on the maximum size of an
+   * inter frame. It is expressed as a percentage of the average
+   * per-frame bitrate, with the special (and default) value 0 meaning
+   * unlimited, or no additional clamping beyond the codec's built-in
+   * algorithm.
+   *
+   * For example, to allow no more than 4.5 frames worth of bitrate
+   * to an inter frame, set this to 450.
+   *
+   * Supported in codecs: AV1
+   */
+  AV1E_SET_MAX_INTER_BITRATE_PCT,
+
+  /*!\brief Boost percentage for Golden Frame in CBR mode.
+   *
+   * This value controls the amount of boost given to Golden Frame in
+   * CBR mode. It is expressed as a percentage of the average
+   * per-frame bitrate, with the special (and default) value 0 meaning
+   * the feature is off, i.e., no golden frame boost in CBR mode and
+   * average bitrate target is used.
+   *
+   * For example, to allow 100% more bits, i.e, 2X, in a golden frame
+   * than average frame, set this to 100.
+   *
+   * Supported in codecs: AV1
+   */
+  AV1E_SET_GF_CBR_BOOST_PCT,
+
+  /*!\brief Codec control function to set encoder screen content mode.
+   *
+   * 0: off, 1: On, 2: On with more aggressive rate control.
+   *
+   * Supported in codecs: VP8
+   */
+  AOME_SET_SCREEN_CONTENT_MODE,
+
+  /*!\brief Codec control function to set lossless encoding mode.
+   *
+   * AV1 can operate in lossless encoding mode, in which the bitstream
+   * produced will be able to decode and reconstruct a perfect copy of
+   * input source. This control function provides a means to switch encoder
+   * into lossless coding mode(1) or normal coding mode(0) that may be lossy.
+   *                          0 = lossy coding mode
+   *                          1 = lossless coding mode
+   *
+   *  By default, encoder operates in normal coding mode (may be lossy).
+   *
+   * Supported in codecs: AV1
+   */
+  AV1E_SET_LOSSLESS,
+#if CONFIG_AOM_QM
+  /*!\brief Codec control function to encode with quantisation matrices.
+   *
+   * AOM can operate with default quantisation matrices dependent on
+   * quantisation level and block type.
+   *                          0 = do not use quantisation matrices
+   *                          1 = use quantisation matrices
+   *
+   *  By default, the encoder operates without quantisation matrices.
+   *
+   * Supported in codecs: AOM
+   */
+
+  AV1E_SET_ENABLE_QM,
+
+  /*!\brief Codec control function to set the min quant matrix flatness.
+   *
+   * AOM can operate with different ranges of quantisation matrices.
+   * As quantisation levels increase, the matrices get flatter. This
+   * control sets the minimum level of flatness from which the matrices
+   * are determined.
+   *
+   *  By default, the encoder sets this minimum at half the available
+   *  range.
+   *
+   * Supported in codecs: AOM
+   */
+  AV1E_SET_QM_MIN,
+
+  /*!\brief Codec control function to set the max quant matrix flatness.
+   *
+   * AOM can operate with different ranges of quantisation matrices.
+   * As quantisation levels increase, the matrices get flatter. This
+   * control sets the maximum level of flatness possible.
+   *
+   * By default, the encoder sets this maximum at the top of the
+   * available range.
+   *
+   * Supported in codecs: AOM
+   */
+  AV1E_SET_QM_MAX,
+#endif
+
+  /*!\brief Codec control function to set number of tile columns.
+   *
+   * In encoding and decoding, AV1 allows an input image frame to be partitioned
+   * into separated vertical tile columns, which can be encoded or decoded
+   * independently. This enables easy implementation of parallel encoding and
+   * decoding. This control requests the encoder to use column tiles in
+   * encoding an input frame, with number of tile columns (in Log2 unit) as
+   * the parameter:
+   *             0 = 1 tile column
+   *             1 = 2 tile columns
+   *             2 = 4 tile columns
+   *             .....
+   *             n = 2**n tile columns
+   * The requested tile columns will be capped by encoder based on image size
+   * limitation (The minimum width of a tile column is 256 pixels, the maximum
+   * is 4096).
+   *
+   * By default, the value is 0, i.e. one single column tile for entire image.
+   *
+   * Supported in codecs: AV1
+   */
+  AV1E_SET_TILE_COLUMNS,
+
+  /*!\brief Codec control function to set number of tile rows.
+   *
+   * In encoding and decoding, AV1 allows an input image frame to be partitioned
+   * into separated horizontal tile rows. Tile rows are encoded or decoded
+   * sequentially. Even though encoding/decoding of later tile rows depends on
+   * earlier ones, this allows the encoder to output data packets for tile rows
+   * prior to completely processing all tile rows in a frame, thereby reducing
+   * the latency in processing between input and output. The parameter
+   * for this control describes the number of tile rows, which has a valid
+   * range [0, 2]:
+   *            0 = 1 tile row
+   *            1 = 2 tile rows
+   *            2 = 4 tile rows
+   *
+   * By default, the value is 0, i.e. one single row tile for entire image.
+   *
+   * Supported in codecs: AV1
+   */
+  AV1E_SET_TILE_ROWS,
+
+  /*!\brief Codec control function to enable frame parallel decoding feature.
+   *
+   * AV1 has a bitstream feature to reduce decoding dependency between frames
+   * by turning off backward update of probability context used in encoding
+   * and decoding. This allows staged parallel processing of more than one
+   * video frames in the decoder. This control function provides a means to
+   * turn this feature on or off for bitstreams produced by encoder.
+   *
+   * By default, this feature is off.
+   *
+   * Supported in codecs: AV1
+   */
+  AV1E_SET_FRAME_PARALLEL_DECODING,
+
+  /*!\brief Codec control function to set adaptive quantization mode.
+   *
+   * AV1 has a segment based feature that allows encoder to adaptively change
+   * quantization parameter for each segment within a frame to improve the
+   * subjective quality. This control makes encoder operate in one of the
+   * several AQ_modes supported.
+   *
+   * By default, encoder operates with AQ_Mode 0(adaptive quantization off).
+   *
+   * Supported in codecs: AV1
+   */
+  AV1E_SET_AQ_MODE,
+
+  /*!\brief Codec control function to enable/disable periodic Q boost.
+   *
+   * One AV1 encoder speed feature is to enable quality boost by lowering
+   * frame level Q periodically. This control function provides a means to
+   * turn on/off this feature.
+   *               0 = off
+   *               1 = on
+   *
+   * By default, the encoder is allowed to use this feature for appropriate
+   * encoding modes.
+   *
+   * Supported in codecs: AV1
+   */
+  AV1E_SET_FRAME_PERIODIC_BOOST,
+
+  /*!\brief Codec control function to set noise sensitivity.
+   *
+   *  0: off, 1: On(YOnly)
+   *
+   * Supported in codecs: AV1
+   */
+  AV1E_SET_NOISE_SENSITIVITY,
+
+  /*!\brief Codec control function to set content type.
+   * \note Valid parameter range:
+   *              AOM_CONTENT_DEFAULT = Regular video content (Default)
+   *              AOM_CONTENT_SCREEN  = Screen capture content
+   *
+   * Supported in codecs: AV1
+   */
+  AV1E_SET_TUNE_CONTENT,
+
+  /*!\brief Codec control function to set color space info.
+   * \note Valid ranges: 0..7, default is "UNKNOWN".
+   *                     0 = UNKNOWN,
+   *                     1 = BT_601
+   *                     2 = BT_709
+   *                     3 = SMPTE_170
+   *                     4 = SMPTE_240
+   *                     5 = BT_2020
+   *                     6 = RESERVED
+   *                     7 = SRGB
+   *
+   * Supported in codecs: AV1
+   */
+  AV1E_SET_COLOR_SPACE,
+
+  /*!\brief Codec control function to set minimum interval between GF/ARF frames
+   *
+   * By default the value is set as 4.
+   *
+   * Supported in codecs: AV1
+   */
+  AV1E_SET_MIN_GF_INTERVAL,
+
+  /*!\brief Codec control function to set maximum interval between GF/ARF frames
+   *
+   * By default the value is set as 16.
+   *
+   * Supported in codecs: AV1
+   */
+  AV1E_SET_MAX_GF_INTERVAL,
+
+  /*!\brief Codec control function to get an Active map back from the encoder.
+   *
+   * Supported in codecs: AV1
+   */
+  AV1E_GET_ACTIVEMAP,
+
+  /*!\brief Codec control function to set color range bit.
+   * \note Valid ranges: 0..1, default is 0
+   *                     0 = Limited range (16..235 or HBD equivalent)
+   *                     1 = Full range (0..255 or HBD equivalent)
+   *
+   * Supported in codecs: AV1
+   */
+  AV1E_SET_COLOR_RANGE,
+
+  /*!\brief Codec control function to set intended rendering image size.
+   *
+   * By default, this is identical to the image size in pixels.
+   *
+   * Supported in codecs: AV1
+   */
+  AV1E_SET_RENDER_SIZE,
+
+  /*!\brief Codec control function to set target level.
+   *
+   * 255: off (default); 0: only keep level stats; 10: target for level 1.0;
+   * 11: target for level 1.1; ... 62: target for level 6.2
+   *
+   * Supported in codecs: AV1
+   */
+  AV1E_SET_TARGET_LEVEL,
+
+  /*!\brief Codec control function to get bitstream level.
+   *
+   * Supported in codecs: AV1
+   */
+  AV1E_GET_LEVEL,
+
+  /*!\brief Codec control function to set intended superblock size.
+   *
+   * By default, the superblock size is determined separately for each
+   * frame by the encoder.
+   *
+   * Supported in codecs: AV1
+   */
+  AV1E_SET_SUPERBLOCK_SIZE,
+};
+
+/*!\brief aom 1-D scaling mode
+ *
+ * This set of constants define 1-D aom scaling modes
+ */
+typedef enum aom_scaling_mode_1d {
+  AOME_NORMAL = 0,
+  AOME_FOURFIVE = 1,
+  AOME_THREEFIVE = 2,
+  AOME_ONETWO = 3
+} AOM_SCALING_MODE;
+
+/*!\brief  aom region of interest map
+ *
+ * This defines the data structures for the region of interest map
+ *
+ */
+
+typedef struct aom_roi_map {
+  /*! An id between 0 and 3 for each 16x16 region within a frame. */
+  unsigned char *roi_map;
+  unsigned int rows; /**< Number of rows. */
+  unsigned int cols; /**< Number of columns. */
+  // TODO(paulwilkins): broken for AV1 which has 8 segments
+  // q and loop filter deltas for each segment
+  // (see MAX_MB_SEGMENTS)
+  int delta_q[4];  /**< Quantizer deltas. */
+  int delta_lf[4]; /**< Loop filter deltas. */
+  /*! Static breakout threshold for each segment. */
+  unsigned int static_threshold[4];
+} aom_roi_map_t;
+
+/*!\brief  aom active region map
+ *
+ * This defines the data structures for active region map
+ *
+ */
+
+typedef struct aom_active_map {
+  /*!\brief specify an on (1) or off (0) each 16x16 region within a frame */
+  unsigned char *active_map;
+  unsigned int rows; /**< number of rows */
+  unsigned int cols; /**< number of cols */
+} aom_active_map_t;
+
+/*!\brief  aom image scaling mode
+ *
+ * This defines the data structure for image scaling mode
+ *
+ */
+typedef struct aom_scaling_mode {
+  AOM_SCALING_MODE h_scaling_mode; /**< horizontal scaling mode */
+  AOM_SCALING_MODE v_scaling_mode; /**< vertical scaling mode   */
+} aom_scaling_mode_t;
+
+/*!\brief VP8 token partition mode
+ *
+ * This defines VP8 partitioning mode for compressed data, i.e., the number of
+ * sub-streams in the bitstream. Used for parallelized decoding.
+ *
+ */
+
+typedef enum {
+  AOM_ONE_TOKENPARTITION = 0,
+  AOM_TWO_TOKENPARTITION = 1,
+  AOM_FOUR_TOKENPARTITION = 2,
+  AOM_EIGHT_TOKENPARTITION = 3
+} aome_token_partitions;
+
+/*!\brief AV1 encoder content type */
+typedef enum {
+  AOM_CONTENT_DEFAULT,
+  AOM_CONTENT_SCREEN,
+  AOM_CONTENT_INVALID
+} aom_tune_content;
+
+/*!\brief Model tuning parameters
+ *
+ * Changes the encoder to tune for certain types of input material.
+ *
+ */
+typedef enum { AOM_TUNE_PSNR, AOM_TUNE_SSIM } aom_tune_metric;
+
+/*!\cond */
+/*!\brief AOM encoder control function parameter type
+ *
+ * Defines the data types that AOME/AV1E control functions take. Note that
+ * additional common controls are defined in aom.h
+ *
+ */
+
+AOM_CTRL_USE_TYPE_DEPRECATED(AOME_USE_REFERENCE, int)
+#define AOM_CTRL_AOME_USE_REFERENCE
+AOM_CTRL_USE_TYPE(AOME_SET_FRAME_FLAGS, int)
+#define AOM_CTRL_AOME_SET_FRAME_FLAGS
+AOM_CTRL_USE_TYPE(AOME_SET_ROI_MAP, aom_roi_map_t *)
+#define AOM_CTRL_AOME_SET_ROI_MAP
+AOM_CTRL_USE_TYPE(AOME_SET_ACTIVEMAP, aom_active_map_t *)
+#define AOM_CTRL_AOME_SET_ACTIVEMAP
+AOM_CTRL_USE_TYPE(AOME_SET_SCALEMODE, aom_scaling_mode_t *)
+#define AOM_CTRL_AOME_SET_SCALEMODE
+
+AOM_CTRL_USE_TYPE(AOME_SET_CPUUSED, int)
+#define AOM_CTRL_AOME_SET_CPUUSED
+AOM_CTRL_USE_TYPE(AOME_SET_ENABLEAUTOALTREF, unsigned int)
+#define AOM_CTRL_AOME_SET_ENABLEAUTOALTREF
+
+#if CONFIG_EXT_REFS
+AOM_CTRL_USE_TYPE(AOME_SET_ENABLEAUTOBWDREF, unsigned int)
+#define AOM_CTRL_AOME_SET_ENABLEAUTOBWDREF
+#endif  // CONFIG_EXT_REFS
+
+AOM_CTRL_USE_TYPE(AOME_SET_NOISE_SENSITIVITY, unsigned int)
+#define AOM_CTRL_AOME_SET_NOISE_SENSITIVITY
+AOM_CTRL_USE_TYPE(AOME_SET_SHARPNESS, unsigned int)
+#define AOM_CTRL_AOME_SET_SHARPNESS
+AOM_CTRL_USE_TYPE(AOME_SET_STATIC_THRESHOLD, unsigned int)
+#define AOM_CTRL_AOME_SET_STATIC_THRESHOLD
+AOM_CTRL_USE_TYPE(AOME_SET_TOKEN_PARTITIONS, int) /* aome_token_partitions */
+#define AOM_CTRL_AOME_SET_TOKEN_PARTITIONS
+
+AOM_CTRL_USE_TYPE(AOME_SET_ARNR_MAXFRAMES, unsigned int)
+#define AOM_CTRL_AOME_SET_ARNR_MAXFRAMES
+AOM_CTRL_USE_TYPE(AOME_SET_ARNR_STRENGTH, unsigned int)
+#define AOM_CTRL_AOME_SET_ARNR_STRENGTH
+AOM_CTRL_USE_TYPE_DEPRECATED(AOME_SET_ARNR_TYPE, unsigned int)
+#define AOM_CTRL_AOME_SET_ARNR_TYPE
+AOM_CTRL_USE_TYPE(AOME_SET_TUNING, int) /* aom_tune_metric */
+#define AOM_CTRL_AOME_SET_TUNING
+AOM_CTRL_USE_TYPE(AOME_SET_CQ_LEVEL, unsigned int)
+#define AOM_CTRL_AOME_SET_CQ_LEVEL
+
+AOM_CTRL_USE_TYPE(AV1E_SET_TILE_COLUMNS, int)
+#define AOM_CTRL_AV1E_SET_TILE_COLUMNS
+AOM_CTRL_USE_TYPE(AV1E_SET_TILE_ROWS, int)
+#define AOM_CTRL_AV1E_SET_TILE_ROWS
+
+AOM_CTRL_USE_TYPE(AOME_GET_LAST_QUANTIZER, int *)
+#define AOM_CTRL_AOME_GET_LAST_QUANTIZER
+AOM_CTRL_USE_TYPE(AOME_GET_LAST_QUANTIZER_64, int *)
+#define AOM_CTRL_AOME_GET_LAST_QUANTIZER_64
+
+AOM_CTRL_USE_TYPE(AOME_SET_MAX_INTRA_BITRATE_PCT, unsigned int)
+#define AOM_CTRL_AOME_SET_MAX_INTRA_BITRATE_PCT
+AOM_CTRL_USE_TYPE(AOME_SET_MAX_INTER_BITRATE_PCT, unsigned int)
+#define AOM_CTRL_AOME_SET_MAX_INTER_BITRATE_PCT
+
+AOM_CTRL_USE_TYPE(AOME_SET_SCREEN_CONTENT_MODE, unsigned int)
+#define AOM_CTRL_AOME_SET_SCREEN_CONTENT_MODE
+
+AOM_CTRL_USE_TYPE(AV1E_SET_GF_CBR_BOOST_PCT, unsigned int)
+#define AOM_CTRL_AV1E_SET_GF_CBR_BOOST_PCT
+
+AOM_CTRL_USE_TYPE(AV1E_SET_LOSSLESS, unsigned int)
+#define AOM_CTRL_AV1E_SET_LOSSLESS
+
+#if CONFIG_AOM_QM
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_QM, unsigned int)
+#define AOM_CTRL_AV1E_SET_ENABLE_QM
+
+AOM_CTRL_USE_TYPE(AV1E_SET_QM_MIN, unsigned int)
+#define AOM_CTRL_AV1E_SET_QM_MIN
+
+AOM_CTRL_USE_TYPE(AV1E_SET_QM_MAX, unsigned int)
+#define AOM_CTRL_AV1E_SET_QM_MAX
+#endif
+
+AOM_CTRL_USE_TYPE(AV1E_SET_FRAME_PARALLEL_DECODING, unsigned int)
+#define AOM_CTRL_AV1E_SET_FRAME_PARALLEL_DECODING
+
+AOM_CTRL_USE_TYPE(AV1E_SET_AQ_MODE, unsigned int)
+#define AOM_CTRL_AV1E_SET_AQ_MODE
+
+AOM_CTRL_USE_TYPE(AV1E_SET_FRAME_PERIODIC_BOOST, unsigned int)
+#define AOM_CTRL_AV1E_SET_FRAME_PERIODIC_BOOST
+
+AOM_CTRL_USE_TYPE(AV1E_SET_NOISE_SENSITIVITY, unsigned int)
+#define AOM_CTRL_AV1E_SET_NOISE_SENSITIVITY
+
+AOM_CTRL_USE_TYPE(AV1E_SET_TUNE_CONTENT, int) /* aom_tune_content */
+#define AOM_CTRL_AV1E_SET_TUNE_CONTENT
+
+AOM_CTRL_USE_TYPE(AV1E_SET_COLOR_SPACE, int)
+#define AOM_CTRL_AV1E_SET_COLOR_SPACE
+
+AOM_CTRL_USE_TYPE(AV1E_SET_MIN_GF_INTERVAL, unsigned int)
+#define AOM_CTRL_AV1E_SET_MIN_GF_INTERVAL
+
+AOM_CTRL_USE_TYPE(AV1E_SET_MAX_GF_INTERVAL, unsigned int)
+#define AOM_CTRL_AV1E_SET_MAX_GF_INTERVAL
+
+AOM_CTRL_USE_TYPE(AV1E_GET_ACTIVEMAP, aom_active_map_t *)
+#define AOM_CTRL_AV1E_GET_ACTIVEMAP
+
+AOM_CTRL_USE_TYPE(AV1E_SET_COLOR_RANGE, int)
+#define AOM_CTRL_AV1E_SET_COLOR_RANGE
+
+/*!\brief
+ *
+ * TODO(rbultje) : add support of the control in ffmpeg
+ */
+#define AOM_CTRL_AV1E_SET_RENDER_SIZE
+AOM_CTRL_USE_TYPE(AV1E_SET_RENDER_SIZE, int *)
+
+AOM_CTRL_USE_TYPE(AV1E_SET_SUPERBLOCK_SIZE, unsigned int)
+#define AOM_CTRL_AV1E_SET_SUPERBLOCK_SIZE
+
+AOM_CTRL_USE_TYPE(AV1E_SET_TARGET_LEVEL, unsigned int)
+#define AOM_CTRL_AV1E_SET_TARGET_LEVEL
+
+AOM_CTRL_USE_TYPE(AV1E_GET_LEVEL, int *)
+#define AOM_CTRL_AV1E_GET_LEVEL
+/*!\endcond */
+/*! @} - end defgroup vp8_encoder */
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AOMCX_H_
--- a/aom/aomdx.h
+++ b/aom/aomdx.h
@@ -0,0 +1,191 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*!\defgroup aom_decoder AOMedia AOM/AV1 Decoder
+ * \ingroup aom
+ *
+ * @{
+ */
+/*!\file
+ * \brief Provides definitions for using AOM or AV1 within the aom Decoder
+ *        interface.
+ */
+#ifndef AOM_AOMDX_H_
+#define AOM_AOMDX_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Include controls common to both the encoder and decoder */
+#include "./aom.h"
+
+/*!\name Algorithm interface for AV1
+ *
+ * This interface provides the capability to decode AV1 streams.
+ * @{
+ */
+extern aom_codec_iface_t aom_codec_av1_dx_algo;
+extern aom_codec_iface_t *aom_codec_av1_dx(void);
+/*!@} - end algorithm interface member group*/
+
+/** Data structure that stores bit accounting for debug
+ */
+typedef struct Accounting Accounting;
+
+/*!\enum aom_dec_control_id
+ * \brief AOM decoder control functions
+ *
+ * This set of macros define the control functions available for the AOM
+ * decoder interface.
+ *
+ * \sa #aom_codec_control
+ */
+enum aom_dec_control_id {
+  /** control function to get info on which reference frames were updated
+   *  by the last decode
+   */
+  AOMD_GET_LAST_REF_UPDATES = AOM_DECODER_CTRL_ID_START,
+
+  /** check if the indicated frame is corrupted */
+  AOMD_GET_FRAME_CORRUPTED,
+
+  /** control function to get info on which reference frames were used
+   *  by the last decode
+   */
+  AOMD_GET_LAST_REF_USED,
+
+  /** decryption function to decrypt encoded buffer data immediately
+   * before decoding. Takes an aom_decrypt_init, which contains
+   * a callback function and opaque context pointer.
+   */
+  AOMD_SET_DECRYPTOR,
+  // AOMD_SET_DECRYPTOR = AOMD_SET_DECRYPTOR,
+
+  /** control function to get the dimensions that the current frame is decoded
+   * at. This may be different to the intended display size for the frame as
+   * specified in the wrapper or frame header (see AV1D_GET_DISPLAY_SIZE). */
+  AV1D_GET_FRAME_SIZE,
+
+  /** control function to get the current frame's intended display dimensions
+   * (as specified in the wrapper or frame header). This may be different to
+   * the decoded dimensions of this frame (see AV1D_GET_FRAME_SIZE). */
+  AV1D_GET_DISPLAY_SIZE,
+
+  /** control function to get the bit depth of the stream. */
+  AV1D_GET_BIT_DEPTH,
+
+  /** control function to set the byte alignment of the planes in the reference
+   * buffers. Valid values are power of 2, from 32 to 1024. A value of 0 sets
+   * legacy alignment. I.e. Y plane is aligned to 32 bytes, U plane directly
+   * follows Y plane, and V plane directly follows U plane. Default value is 0.
+   */
+  AV1_SET_BYTE_ALIGNMENT,
+
+  /** control function to invert the decoding order to right-to-left. The
+   * function is used in a test to confirm the decoding independence of tile
+   * columns. The function may be used in application where this order
+   * of decoding is desired.
+   *
+   * TODO(yaowu): Rework the unit test that uses this control, and in a future
+   *              release, this test-only control shall be removed.
+   */
+  AV1_INVERT_TILE_DECODE_ORDER,
+
+  /** control function to set the skip loop filter flag. Valid values are
+   * integers. The decoder will skip the loop filter when its value is set to
+   * nonzero. If the loop filter is skipped the decoder may accumulate decode
+   * artifacts. The default value is 0.
+   */
+  AV1_SET_SKIP_LOOP_FILTER,
+
+  /** control function to retrieve a pointer to the Accounting struct.  When
+   * compiled without --enable-accounting, this returns AOM_CODEC_INCAPABLE.
+   * If called before a frame has been decoded, this returns AOM_CODEC_ERROR.
+   * The caller should ensure that AOM_CODEC_OK is returned before attempting
+   * to dereference the Accounting pointer.
+   */
+  AV1_GET_ACCOUNTING,
+
+  AOM_DECODER_CTRL_ID_MAX,
+
+  /** control function to set the range of tile decoding. A value that is
+   * greater than or equal to zero indicates only the specific row/column is
+   * decoded. A value that is -1 indicates the whole row/column is decoded.
+   * A special case is both values are -1 that means the whole frame is
+   * decoded.
+   */
+  AV1_SET_DECODE_TILE_ROW,
+  AV1_SET_DECODE_TILE_COL
+};
+
+/** Decrypt n bytes of data from input -> output, using the decrypt_state
+ *  passed in AOMD_SET_DECRYPTOR.
+ */
+typedef void (*aom_decrypt_cb)(void *decrypt_state, const unsigned char *input,
+                               unsigned char *output, int count);
+
+/*!\brief Structure to hold decryption state
+ *
+ * Defines a structure to hold the decryption state and access function.
+ */
+typedef struct aom_decrypt_init {
+  /*! Decrypt callback. */
+  aom_decrypt_cb decrypt_cb;
+
+  /*! Decryption state. */
+  void *decrypt_state;
+} aom_decrypt_init;
+
+/*!\brief A deprecated alias for aom_decrypt_init.
+ */
+typedef aom_decrypt_init aom_decrypt_init;
+
+/*!\cond */
+/*!\brief AOM decoder control function parameter type
+ *
+ * Defines the data types that AOMD control functions take. Note that
+ * additional common controls are defined in aom.h
+ *
+ */
+
+AOM_CTRL_USE_TYPE(AOMD_GET_LAST_REF_UPDATES, int *)
+#define AOM_CTRL_AOMD_GET_LAST_REF_UPDATES
+AOM_CTRL_USE_TYPE(AOMD_GET_FRAME_CORRUPTED, int *)
+#define AOM_CTRL_AOMD_GET_FRAME_CORRUPTED
+AOM_CTRL_USE_TYPE(AOMD_GET_LAST_REF_USED, int *)
+#define AOM_CTRL_AOMD_GET_LAST_REF_USED
+AOM_CTRL_USE_TYPE(AOMD_SET_DECRYPTOR, aom_decrypt_init *)
+#define AOM_CTRL_AOMD_SET_DECRYPTOR
+// AOM_CTRL_USE_TYPE(AOMD_SET_DECRYPTOR, aom_decrypt_init *)
+//#define AOM_CTRL_AOMD_SET_DECRYPTOR
+AOM_CTRL_USE_TYPE(AV1D_GET_DISPLAY_SIZE, int *)
+#define AOM_CTRL_AV1D_GET_DISPLAY_SIZE
+AOM_CTRL_USE_TYPE(AV1D_GET_BIT_DEPTH, unsigned int *)
+#define AOM_CTRL_AV1D_GET_BIT_DEPTH
+AOM_CTRL_USE_TYPE(AV1D_GET_FRAME_SIZE, int *)
+#define AOM_CTRL_AV1D_GET_FRAME_SIZE
+AOM_CTRL_USE_TYPE(AV1_INVERT_TILE_DECODE_ORDER, int)
+#define AOM_CTRL_AV1_INVERT_TILE_DECODE_ORDER
+AOM_CTRL_USE_TYPE(AV1_GET_ACCOUNTING, Accounting **)
+#define AOM_CTRL_AV1_GET_ACCOUNTING
+AOM_CTRL_USE_TYPE(AV1_SET_DECODE_TILE_ROW, int)
+#define AOM_CTRL_AV1_SET_DECODE_TILE_ROW
+AOM_CTRL_USE_TYPE(AV1_SET_DECODE_TILE_COL, int)
+#define AOM_CTRL_AV1_SET_DECODE_TILE_COL
+/*!\endcond */
+/*! @} - end defgroup aom_decoder */
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AOMDX_H_
--- a/aom/exports_com
+++ b/aom/exports_com
@@ -0,0 +1,16 @@
+text aom_codec_build_config
+text aom_codec_control_
+text aom_codec_destroy
+text aom_codec_err_to_string
+text aom_codec_error
+text aom_codec_error_detail
+text aom_codec_get_caps
+text aom_codec_iface_name
+text aom_codec_version
+text aom_codec_version_extra_str
+text aom_codec_version_str
+text aom_img_alloc
+text aom_img_flip
+text aom_img_free
+text aom_img_set_rect
+text aom_img_wrap
--- a/aom/exports_dec
+++ b/aom/exports_dec
@@ -0,0 +1,8 @@
+text aom_codec_dec_init_ver
+text aom_codec_decode
+text aom_codec_get_frame
+text aom_codec_get_stream_info
+text aom_codec_peek_stream_info
+text aom_codec_register_put_frame_cb
+text aom_codec_register_put_slice_cb
+text aom_codec_set_frame_buffer_functions
--- a/aom/exports_enc
+++ b/aom/exports_enc
@@ -0,0 +1,9 @@
+text aom_codec_enc_config_default
+text aom_codec_enc_config_set
+text aom_codec_enc_init_multi_ver
+text aom_codec_enc_init_ver
+text aom_codec_encode
+text aom_codec_get_cx_data
+text aom_codec_get_global_headers
+text aom_codec_get_preview_frame
+text aom_codec_set_cx_data_buf
--- a/aom/internal/aom_codec_internal.h
+++ b/aom/internal/aom_codec_internal.h
@@ -1,11 +1,12 @@
 /*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

 /*!\file
@@ -19,31 +20,32 @@
 * into the global namespace:
 *     <pre>
 *     my_codec.c:
- *       vpx_codec_iface_t my_codec = {
+ *       aom_codec_iface_t my_codec = {
 *           "My Codec v1.0",
- *           VPX_CODEC_ALG_ABI_VERSION,
+ *           AOM_CODEC_ALG_ABI_VERSION,
 *           ...
 *       };
 *     </pre>
 *
 * An application instantiates a specific decoder instance by using
- * vpx_codec_init() and a pointer to the algorithm's interface structure:
+ * aom_codec_init() and a pointer to the algorithm's interface structure:
 *     <pre>
 *     my_app.c:
- *       extern vpx_codec_iface_t my_codec;
+ *       extern aom_codec_iface_t my_codec;
 *       {
- *           vpx_codec_ctx_t algo;
- *           res = vpx_codec_init(&algo, &my_codec);
+ *           aom_codec_ctx_t algo;
+ *           res = aom_codec_init(&algo, &my_codec);
 *       }
 *     </pre>
 *
 * Once initialized, the instance is manged using other functions from
- * the vpx_codec_* family.
+ * the aom_codec_* family.
 */
-#ifndef VPX_INTERNAL_VPX_CODEC_INTERNAL_H_
-#define VPX_INTERNAL_VPX_CODEC_INTERNAL_H_
-#include "../vpx_decoder.h"
-#include "../vpx_encoder.h"
+#ifndef AOM_INTERNAL_AOM_CODEC_INTERNAL_H_
+#define AOM_INTERNAL_AOM_CODEC_INTERNAL_H_
+#include "./aom_config.h"
+#include "../aom_decoder.h"
+#include "../aom_encoder.h"
 #include <stdarg.h>

 #ifdef __cplusplus
@@ -58,46 +60,46 @@ extern "C" {
 * types, removing or reassigning enums, adding/removing/rearranging
 * fields to structures
 */
-#define VPX_CODEC_INTERNAL_ABI_VERSION (5) /**<\hideinitializer*/
+#define AOM_CODEC_INTERNAL_ABI_VERSION (5) /**<\hideinitializer*/

-typedef struct vpx_codec_alg_priv vpx_codec_alg_priv_t;
-typedef struct vpx_codec_priv_enc_mr_cfg vpx_codec_priv_enc_mr_cfg_t;
+typedef struct aom_codec_alg_priv aom_codec_alg_priv_t;
+typedef struct aom_codec_priv_enc_mr_cfg aom_codec_priv_enc_mr_cfg_t;

 /*!\brief init function pointer prototype
 *
 * Performs algorithm-specific initialization of the decoder context. This
- * function is called by the generic vpx_codec_init() wrapper function, so
+ * function is called by the generic aom_codec_init() wrapper function, so
 * plugins implementing this interface may trust the input parameters to be
 * properly initialized.
 *
 * \param[in] ctx   Pointer to this instance's context
- * \retval #VPX_CODEC_OK
+ * \retval #AOM_CODEC_OK
 *     The input stream was recognized and decoder initialized.
- * \retval #VPX_CODEC_MEM_ERROR
+ * \retval #AOM_CODEC_MEM_ERROR
 *     Memory operation failed.
 */
-typedef vpx_codec_err_t (*vpx_codec_init_fn_t)(
-    vpx_codec_ctx_t *ctx, vpx_codec_priv_enc_mr_cfg_t *data);
+typedef aom_codec_err_t (*aom_codec_init_fn_t)(
+    aom_codec_ctx_t *ctx, aom_codec_priv_enc_mr_cfg_t *data);

 /*!\brief destroy function pointer prototype
 *
 * Performs algorithm-specific destruction of the decoder context. This
- * function is called by the generic vpx_codec_destroy() wrapper function,
+ * function is called by the generic aom_codec_destroy() wrapper function,
 * so plugins implementing this interface may trust the input parameters
 * to be properly initialized.
 *
 * \param[in] ctx   Pointer to this instance's context
- * \retval #VPX_CODEC_OK
+ * \retval #AOM_CODEC_OK
 *     The input stream was recognized and decoder initialized.
- * \retval #VPX_CODEC_MEM_ERROR
+ * \retval #AOM_CODEC_MEM_ERROR
 *     Memory operation failed.
 */
-typedef vpx_codec_err_t (*vpx_codec_destroy_fn_t)(vpx_codec_alg_priv_t *ctx);
+typedef aom_codec_err_t (*aom_codec_destroy_fn_t)(aom_codec_alg_priv_t *ctx);

 /*!\brief parse stream info function pointer prototype
 *
 * Performs high level parsing of the bitstream. This function is called by the
- * generic vpx_codec_peek_stream_info() wrapper function, so plugins
+ * generic aom_codec_peek_stream_info() wrapper function, so plugins
 * implementing this interface may trust the input parameters to be properly
 * initialized.
 *
@@ -108,12 +110,12 @@ typedef vpx_codec_err_t (*vpx_codec_destroy_fn_t)(vpx_codec_alg_priv_t *ctx);
 *                         clobbered by the algorithm. This parameter \ref MAY
 *                         be NULL.
 *
- * \retval #VPX_CODEC_OK
+ * \retval #AOM_CODEC_OK
 *     Bitstream is parsable and stream information updated
 */
-typedef vpx_codec_err_t (*vpx_codec_peek_si_fn_t)(const uint8_t *data,
+typedef aom_codec_err_t (*aom_codec_peek_si_fn_t)(const uint8_t *data,
                                                  unsigned int data_sz,
-                                                  vpx_codec_stream_info_t *si);
+                                                  aom_codec_stream_info_t *si);

 /*!\brief Return information about the current stream.
 *
@@ -125,11 +127,11 @@ typedef vpx_codec_err_t (*vpx_codec_peek_si_fn_t)(const uint8_t *data,
 *                         clobbered by the algorithm. This parameter \ref MAY
 *                         be NULL.
 *
- * \retval #VPX_CODEC_OK
+ * \retval #AOM_CODEC_OK
 *     Bitstream is parsable and stream information updated
 */
-typedef vpx_codec_err_t (*vpx_codec_get_si_fn_t)(vpx_codec_alg_priv_t *ctx,
-                                                 vpx_codec_stream_info_t *si);
+typedef aom_codec_err_t (*aom_codec_get_si_fn_t)(aom_codec_alg_priv_t *ctx,
+                                                 aom_codec_stream_info_t *si);

 /*!\brief control function pointer prototype
 *
@@ -137,7 +139,7 @@ typedef vpx_codec_err_t (*vpx_codec_get_si_fn_t)(vpx_codec_alg_priv_t *ctx,
 * instance. This can be used to implement features specific to a particular
 * algorithm.
 *
- * This function is called by the generic vpx_codec_control() wrapper
+ * This function is called by the generic aom_codec_control() wrapper
 * function, so plugins implementing this interface may trust the input
 * parameters to be properly initialized. However,  this interface does not
 * provide type safety for the exchanged data or assign meanings to the
@@ -150,49 +152,49 @@ typedef vpx_codec_err_t (*vpx_codec_get_si_fn_t)(vpx_codec_alg_priv_t *ctx,
 * \param[in]     ctrl_id          Algorithm specific control identifier
 * \param[in,out] data             Data to exchange with algorithm instance.
 *
- * \retval #VPX_CODEC_OK
+ * \retval #AOM_CODEC_OK
 *     The internal state data was deserialized.
 */
-typedef vpx_codec_err_t (*vpx_codec_control_fn_t)(vpx_codec_alg_priv_t *ctx,
+typedef aom_codec_err_t (*aom_codec_control_fn_t)(aom_codec_alg_priv_t *ctx,
                                                  va_list ap);

 /*!\brief control function pointer mapping
 *
 * This structure stores the mapping between control identifiers and
 * implementing functions. Each algorithm provides a list of these
- * mappings. This list is searched by the vpx_codec_control() wrapper
+ * mappings. This list is searched by the aom_codec_control() wrapper
 * function to determine which function to invoke. The special
 * value {0, NULL} is used to indicate end-of-list, and must be
 * present. The special value {0, <non-null>} can be used as a catch-all
 * mapping. This implies that ctrl_id values chosen by the algorithm
 * \ref MUST be non-zero.
 */
-typedef const struct vpx_codec_ctrl_fn_map {
+typedef const struct aom_codec_ctrl_fn_map {
  int ctrl_id;
-  vpx_codec_control_fn_t fn;
-} vpx_codec_ctrl_fn_map_t;
+  aom_codec_control_fn_t fn;
+} aom_codec_ctrl_fn_map_t;

 /*!\brief decode data function pointer prototype
 *
 * Processes a buffer of coded data. If the processing results in a new
- * decoded frame becoming available, #VPX_CODEC_CB_PUT_SLICE and
- * #VPX_CODEC_CB_PUT_FRAME events are generated as appropriate. This
- * function is called by the generic vpx_codec_decode() wrapper function,
+ * decoded frame becoming available, #AOM_CODEC_CB_PUT_SLICE and
+ * #AOM_CODEC_CB_PUT_FRAME events are generated as appropriate. This
+ * function is called by the generic aom_codec_decode() wrapper function,
 * so plugins implementing this interface may trust the input parameters
 * to be properly initialized.
 *
 * \param[in] ctx          Pointer to this instance's context
 * \param[in] data         Pointer to this block of new coded data. If
- *                         NULL, a #VPX_CODEC_CB_PUT_FRAME event is posted
+ *                         NULL, a #AOM_CODEC_CB_PUT_FRAME event is posted
 *                         for the previously decoded frame.
 * \param[in] data_sz      Size of the coded data, in bytes.
 *
- * \return Returns #VPX_CODEC_OK if the coded data was processed completely
+ * \return Returns #AOM_CODEC_OK if the coded data was processed completely
 *         and future pictures can be decoded without error. Otherwise,
- *         see the descriptions of the other error codes in ::vpx_codec_err_t
+ *         see the descriptions of the other error codes in ::aom_codec_err_t
 *         for recoverability capabilities.
 */
-typedef vpx_codec_err_t (*vpx_codec_decode_fn_t)(vpx_codec_alg_priv_t *ctx,
+typedef aom_codec_err_t (*aom_codec_decode_fn_t)(aom_codec_alg_priv_t *ctx,
                                                 const uint8_t *data,
                                                 unsigned int data_sz,
                                                 void *user_priv,
@@ -205,8 +207,8 @@ typedef vpx_codec_err_t (*vpx_codec_decode_fn_t)(vpx_codec_alg_priv_t *ctx,
 * complete when this function returns NULL.
 *
 * The list of available frames becomes valid upon completion of the
- * vpx_codec_decode call, and remains valid until the next call to
- * vpx_codec_decode.
+ * aom_codec_decode call, and remains valid until the next call to
+ * aom_codec_decode.
 *
 * \param[in]     ctx      Pointer to this instance's context
 * \param[in out] iter     Iterator storage, initialized to NULL
@@ -214,15 +216,15 @@ typedef vpx_codec_err_t (*vpx_codec_decode_fn_t)(vpx_codec_alg_priv_t *ctx,
 * \return Returns a pointer to an image, if one is ready for display. Frames
 *         produced will always be in PTS (presentation time stamp) order.
 */
-typedef vpx_image_t *(*vpx_codec_get_frame_fn_t)(vpx_codec_alg_priv_t *ctx,
-                                                 vpx_codec_iter_t *iter);
+typedef aom_image_t *(*aom_codec_get_frame_fn_t)(aom_codec_alg_priv_t *ctx,
+                                                 aom_codec_iter_t *iter);

 /*!\brief Pass in external frame buffers for the decoder to use.
 *
- * Registers functions to be called when libvpx needs a frame buffer
- * to decode the current frame and a function to be called when libvpx does
+ * Registers functions to be called when libaom needs a frame buffer
+ * to decode the current frame and a function to be called when libaom does
 * not internally reference the frame buffer. This set function must
- * be called before the first call to decode or libvpx will assume the
+ * be called before the first call to decode or libaom will assume the
 * default behavior of allocating frame buffers internally.
 *
 * \param[in] ctx          Pointer to this instance's context
@@ -230,103 +232,103 @@ typedef vpx_image_t *(*vpx_codec_get_frame_fn_t)(vpx_codec_alg_priv_t *ctx,
 * \param[in] cb_release   Pointer to the release callback function
 * \param[in] cb_priv      Callback's private data
 *
- * \retval #VPX_CODEC_OK
- *     External frame buffers will be used by libvpx.
- * \retval #VPX_CODEC_INVALID_PARAM
+ * \retval #AOM_CODEC_OK
+ *     External frame buffers will be used by libaom.
+ * \retval #AOM_CODEC_INVALID_PARAM
 *     One or more of the callbacks were NULL.
- * \retval #VPX_CODEC_ERROR
+ * \retval #AOM_CODEC_ERROR
 *     Decoder context not initialized, or algorithm not capable of
 *     using external frame buffers.
 *
 * \note
- * When decoding VP9, the application may be required to pass in at least
- * #VP9_MAXIMUM_REF_BUFFERS + #VPX_MAXIMUM_WORK_BUFFERS external frame
+ * When decoding AV1, the application may be required to pass in at least
+ * #AOM_MAXIMUM_WORK_BUFFERS external frame
 * buffers.
 */
-typedef vpx_codec_err_t (*vpx_codec_set_fb_fn_t)(
-    vpx_codec_alg_priv_t *ctx, vpx_get_frame_buffer_cb_fn_t cb_get,
-    vpx_release_frame_buffer_cb_fn_t cb_release, void *cb_priv);
+typedef aom_codec_err_t (*aom_codec_set_fb_fn_t)(
+    aom_codec_alg_priv_t *ctx, aom_get_frame_buffer_cb_fn_t cb_get,
+    aom_release_frame_buffer_cb_fn_t cb_release, void *cb_priv);

-typedef vpx_codec_err_t (*vpx_codec_encode_fn_t)(vpx_codec_alg_priv_t *ctx,
-                                                 const vpx_image_t *img,
-                                                 vpx_codec_pts_t pts,
+typedef aom_codec_err_t (*aom_codec_encode_fn_t)(aom_codec_alg_priv_t *ctx,
+                                                 const aom_image_t *img,
+                                                 aom_codec_pts_t pts,
                                                 unsigned long duration,
-                                                 vpx_enc_frame_flags_t flags,
+                                                 aom_enc_frame_flags_t flags,
                                                 unsigned long deadline);
-typedef const vpx_codec_cx_pkt_t *(*vpx_codec_get_cx_data_fn_t)(
-    vpx_codec_alg_priv_t *ctx, vpx_codec_iter_t *iter);
+typedef const aom_codec_cx_pkt_t *(*aom_codec_get_cx_data_fn_t)(
+    aom_codec_alg_priv_t *ctx, aom_codec_iter_t *iter);

-typedef vpx_codec_err_t (*vpx_codec_enc_config_set_fn_t)(
-    vpx_codec_alg_priv_t *ctx, const vpx_codec_enc_cfg_t *cfg);
-typedef vpx_fixed_buf_t *(*vpx_codec_get_global_headers_fn_t)(
-    vpx_codec_alg_priv_t *ctx);
+typedef aom_codec_err_t (*aom_codec_enc_config_set_fn_t)(
+    aom_codec_alg_priv_t *ctx, const aom_codec_enc_cfg_t *cfg);
+typedef aom_fixed_buf_t *(*aom_codec_get_global_headers_fn_t)(
+    aom_codec_alg_priv_t *ctx);

-typedef vpx_image_t *(*vpx_codec_get_preview_frame_fn_t)(
-    vpx_codec_alg_priv_t *ctx);
+typedef aom_image_t *(*aom_codec_get_preview_frame_fn_t)(
+    aom_codec_alg_priv_t *ctx);

-typedef vpx_codec_err_t (*vpx_codec_enc_mr_get_mem_loc_fn_t)(
-    const vpx_codec_enc_cfg_t *cfg, void **mem_loc);
+typedef aom_codec_err_t (*aom_codec_enc_mr_get_mem_loc_fn_t)(
+    const aom_codec_enc_cfg_t *cfg, void **mem_loc);

 /*!\brief usage configuration mapping
 *
 * This structure stores the mapping between usage identifiers and
 * configuration structures. Each algorithm provides a list of these
- * mappings. This list is searched by the vpx_codec_enc_config_default()
+ * mappings. This list is searched by the aom_codec_enc_config_default()
 * wrapper function to determine which config to return. The special value
 * {-1, {0}} is used to indicate end-of-list, and must be present. At least
 * one mapping must be present, in addition to the end-of-list.
 *
 */
-typedef const struct vpx_codec_enc_cfg_map {
+typedef const struct aom_codec_enc_cfg_map {
  int usage;
-  vpx_codec_enc_cfg_t cfg;
-} vpx_codec_enc_cfg_map_t;
+  aom_codec_enc_cfg_t cfg;
+} aom_codec_enc_cfg_map_t;

 /*!\brief Decoder algorithm interface interface
 *
 * All decoders \ref MUST expose a variable of this type.
 */
-struct vpx_codec_iface {
+struct aom_codec_iface {
  const char *name;                   /**< Identification String  */
  int abi_version;                    /**< Implemented ABI version */
-  vpx_codec_caps_t caps;              /**< Decoder capabilities */
-  vpx_codec_init_fn_t init;           /**< \copydoc ::vpx_codec_init_fn_t */
-  vpx_codec_destroy_fn_t destroy;     /**< \copydoc ::vpx_codec_destroy_fn_t */
-  vpx_codec_ctrl_fn_map_t *ctrl_maps; /**< \copydoc ::vpx_codec_ctrl_fn_map_t */
-  struct vpx_codec_dec_iface {
-    vpx_codec_peek_si_fn_t peek_si; /**< \copydoc ::vpx_codec_peek_si_fn_t */
-    vpx_codec_get_si_fn_t get_si;   /**< \copydoc ::vpx_codec_get_si_fn_t */
-    vpx_codec_decode_fn_t decode;   /**< \copydoc ::vpx_codec_decode_fn_t */
-    vpx_codec_get_frame_fn_t
-        get_frame;                   /**< \copydoc ::vpx_codec_get_frame_fn_t */
-    vpx_codec_set_fb_fn_t set_fb_fn; /**< \copydoc ::vpx_codec_set_fb_fn_t */
+  aom_codec_caps_t caps;              /**< Decoder capabilities */
+  aom_codec_init_fn_t init;           /**< \copydoc ::aom_codec_init_fn_t */
+  aom_codec_destroy_fn_t destroy;     /**< \copydoc ::aom_codec_destroy_fn_t */
+  aom_codec_ctrl_fn_map_t *ctrl_maps; /**< \copydoc ::aom_codec_ctrl_fn_map_t */
+  struct aom_codec_dec_iface {
+    aom_codec_peek_si_fn_t peek_si; /**< \copydoc ::aom_codec_peek_si_fn_t */
+    aom_codec_get_si_fn_t get_si;   /**< \copydoc ::aom_codec_get_si_fn_t */
+    aom_codec_decode_fn_t decode;   /**< \copydoc ::aom_codec_decode_fn_t */
+    aom_codec_get_frame_fn_t
+        get_frame;                   /**< \copydoc ::aom_codec_get_frame_fn_t */
+    aom_codec_set_fb_fn_t set_fb_fn; /**< \copydoc ::aom_codec_set_fb_fn_t */
  } dec;
-  struct vpx_codec_enc_iface {
+  struct aom_codec_enc_iface {
    int cfg_map_count;
-    vpx_codec_enc_cfg_map_t
-        *cfg_maps;                /**< \copydoc ::vpx_codec_enc_cfg_map_t */
-    vpx_codec_encode_fn_t encode; /**< \copydoc ::vpx_codec_encode_fn_t */
-    vpx_codec_get_cx_data_fn_t
-        get_cx_data; /**< \copydoc ::vpx_codec_get_cx_data_fn_t */
-    vpx_codec_enc_config_set_fn_t
-        cfg_set; /**< \copydoc ::vpx_codec_enc_config_set_fn_t */
-    vpx_codec_get_global_headers_fn_t
-        get_glob_hdrs; /**< \copydoc ::vpx_codec_get_global_headers_fn_t */
-    vpx_codec_get_preview_frame_fn_t
-        get_preview; /**< \copydoc ::vpx_codec_get_preview_frame_fn_t */
-    vpx_codec_enc_mr_get_mem_loc_fn_t
-        mr_get_mem_loc; /**< \copydoc ::vpx_codec_enc_mr_get_mem_loc_fn_t */
+    aom_codec_enc_cfg_map_t
+        *cfg_maps;                /**< \copydoc ::aom_codec_enc_cfg_map_t */
+    aom_codec_encode_fn_t encode; /**< \copydoc ::aom_codec_encode_fn_t */
+    aom_codec_get_cx_data_fn_t
+        get_cx_data; /**< \copydoc ::aom_codec_get_cx_data_fn_t */
+    aom_codec_enc_config_set_fn_t
+        cfg_set; /**< \copydoc ::aom_codec_enc_config_set_fn_t */
+    aom_codec_get_global_headers_fn_t
+        get_glob_hdrs; /**< \copydoc ::aom_codec_get_global_headers_fn_t */
+    aom_codec_get_preview_frame_fn_t
+        get_preview; /**< \copydoc ::aom_codec_get_preview_frame_fn_t */
+    aom_codec_enc_mr_get_mem_loc_fn_t
+        mr_get_mem_loc; /**< \copydoc ::aom_codec_enc_mr_get_mem_loc_fn_t */
  } enc;
 };

 /*!\brief Callback function pointer / user data pair storage */
-typedef struct vpx_codec_priv_cb_pair {
+typedef struct aom_codec_priv_cb_pair {
  union {
-    vpx_codec_put_frame_cb_fn_t put_frame;
-    vpx_codec_put_slice_cb_fn_t put_slice;
+    aom_codec_put_frame_cb_fn_t put_frame;
+    aom_codec_put_slice_cb_fn_t put_slice;
  } u;
  void *user_priv;
-} vpx_codec_priv_cb_pair_t;
+} aom_codec_priv_cb_pair_t;

 /*!\brief Instance private storage
 *
@@ -336,18 +338,18 @@ typedef struct vpx_codec_priv_cb_pair {
 * structure can be made the first member of the algorithm specific structure,
 * and the pointer cast to the proper type.
 */
-struct vpx_codec_priv {
+struct aom_codec_priv {
  const char *err_detail;
-  vpx_codec_flags_t init_flags;
+  aom_codec_flags_t init_flags;
  struct {
-    vpx_codec_priv_cb_pair_t put_frame_cb;
-    vpx_codec_priv_cb_pair_t put_slice_cb;
+    aom_codec_priv_cb_pair_t put_frame_cb;
+    aom_codec_priv_cb_pair_t put_slice_cb;
  } dec;
  struct {
-    vpx_fixed_buf_t cx_data_dst_buf;
+    aom_fixed_buf_t cx_data_dst_buf;
    unsigned int cx_data_pad_before;
    unsigned int cx_data_pad_after;
-    vpx_codec_cx_pkt_t cx_data_pkt;
+    aom_codec_cx_pkt_t cx_data_pkt;
    unsigned int total_encoders;
  } enc;
 };
@@ -355,20 +357,20 @@ struct vpx_codec_priv {
 /*
 * Multi-resolution encoding internal configuration
 */
-struct vpx_codec_priv_enc_mr_cfg {
+struct aom_codec_priv_enc_mr_cfg {
  unsigned int mr_total_resolutions;
  unsigned int mr_encoder_id;
-  struct vpx_rational mr_down_sampling_factor;
+  struct aom_rational mr_down_sampling_factor;
  void *mr_low_res_mode_info;
 };

-#undef VPX_CTRL_USE_TYPE
-#define VPX_CTRL_USE_TYPE(id, typ) \
-  static VPX_INLINE typ id##__value(va_list args) { return va_arg(args, typ); }
+#undef AOM_CTRL_USE_TYPE
+#define AOM_CTRL_USE_TYPE(id, typ) \
+  static AOM_INLINE typ id##__value(va_list args) { return va_arg(args, typ); }

-#undef VPX_CTRL_USE_TYPE_DEPRECATED
-#define VPX_CTRL_USE_TYPE_DEPRECATED(id, typ) \
-  static VPX_INLINE typ id##__value(va_list args) { return va_arg(args, typ); }
+#undef AOM_CTRL_USE_TYPE_DEPRECATED
+#define AOM_CTRL_USE_TYPE_DEPRECATED(id, typ) \
+  static AOM_INLINE typ id##__value(va_list args) { return va_arg(args, typ); }

 #define CAST(id, arg) id##__value(arg)

@@ -382,44 +384,44 @@ struct vpx_codec_priv_enc_mr_cfg {
 * macro is provided to define this getter function automatically.
 */
 #define CODEC_INTERFACE(id)                          \
-  vpx_codec_iface_t *id(void) { return &id##_algo; } \
-  vpx_codec_iface_t id##_algo
+  aom_codec_iface_t *id(void) { return &id##_algo; } \
+  aom_codec_iface_t id##_algo

 /* Internal Utility Functions
 *
 * The following functions are intended to be used inside algorithms as
- * utilities for manipulating vpx_codec_* data structures.
+ * utilities for manipulating aom_codec_* data structures.
 */
-struct vpx_codec_pkt_list {
+struct aom_codec_pkt_list {
  unsigned int cnt;
  unsigned int max;
-  struct vpx_codec_cx_pkt pkts[1];
+  struct aom_codec_cx_pkt pkts[1];
 };

-#define vpx_codec_pkt_list_decl(n)     \
+#define aom_codec_pkt_list_decl(n)     \
  union {                              \
-    struct vpx_codec_pkt_list head;    \
+    struct aom_codec_pkt_list head;    \
    struct {                           \
-      struct vpx_codec_pkt_list head;  \
-      struct vpx_codec_cx_pkt pkts[n]; \
+      struct aom_codec_pkt_list head;  \
+      struct aom_codec_cx_pkt pkts[n]; \
    } alloc;                           \
  }

-#define vpx_codec_pkt_list_init(m) \
+#define aom_codec_pkt_list_init(m) \
  (m)->alloc.head.cnt = 0,         \
  (m)->alloc.head.max = sizeof((m)->alloc.pkts) / sizeof((m)->alloc.pkts[0])

-int vpx_codec_pkt_list_add(struct vpx_codec_pkt_list *,
-                           const struct vpx_codec_cx_pkt *);
+int aom_codec_pkt_list_add(struct aom_codec_pkt_list *,
+                           const struct aom_codec_cx_pkt *);

-const vpx_codec_cx_pkt_t *vpx_codec_pkt_list_get(
-    struct vpx_codec_pkt_list *list, vpx_codec_iter_t *iter);
+const aom_codec_cx_pkt_t *aom_codec_pkt_list_get(
+    struct aom_codec_pkt_list *list, aom_codec_iter_t *iter);

 #include <stdio.h>
 #include <setjmp.h>

-struct vpx_internal_error_info {
-  vpx_codec_err_t error_code;
+struct aom_internal_error_info {
+  aom_codec_err_t error_code;
  int has_detail;
  char detail[80];
  int setjmp;
@@ -434,12 +436,30 @@ struct vpx_internal_error_info {
 #endif
 #endif

-void vpx_internal_error(struct vpx_internal_error_info *info,
-                        vpx_codec_err_t error, const char *fmt,
+void aom_internal_error(struct aom_internal_error_info *info,
+                        aom_codec_err_t error, const char *fmt,
                        ...) CLANG_ANALYZER_NORETURN;

+/* Evaluates `expr`, assigns the result to `lval`, and reports
+ * AOM_CODEC_MEM_ERROR through aom_internal_error() when the result is
+ * zero/NULL. When CONFIG_DEBUG is set the message also carries the file and
+ * line of the failing call. */
+#if CONFIG_DEBUG
+#define AOM_CHECK_MEM_ERROR(error_info, lval, expr)                         \
+  do {                                                                      \
+    lval = (expr);                                                          \
+    if (!lval)                                                              \
+      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR,                   \
+                         "Failed to allocate " #lval " at %s:%d", __FILE__, \
+                         __LINE__);                                         \
+  } while (0)
+#else
+#define AOM_CHECK_MEM_ERROR(error_info, lval, expr)       \
+  do {                                                    \
+    lval = (expr);                                        \
+    if (!lval)                                            \
+      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
+                         "Failed to allocate " #lval);    \
+  } while (0)
+#endif
 #ifdef __cplusplus
 }  // extern "C"
 #endif

-#endif  // VPX_INTERNAL_VPX_CODEC_INTERNAL_H_
+#endif  // AOM_INTERNAL_AOM_CODEC_INTERNAL_H_
--- a/aom/src/aom_codec.c
+++ b/aom/src/aom_codec.c
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*!\file
+ * \brief Provides the high level interface shared by codec algorithms.
+ *
+ */
+#include <stdarg.h>
+#include <stdlib.h>
+#include "aom/aom_integer.h"
+#include "aom/internal/aom_codec_internal.h"
+#include "aom_version.h"
+
+/* Stores `var` in ctx->err when ctx is non-NULL and evaluates to the status
+ * either way. Every argument use is parenthesized so that expression
+ * arguments expand with the intended precedence. */
+#define SAVE_STATUS(ctx, var) ((ctx) ? ((ctx)->err = (var)) : (var))
+
+/* Returns the VERSION_PACKED build constant. */
+int aom_codec_version(void) {
+  return VERSION_PACKED;
+}
+
+/* Returns the VERSION_STRING_NOSP build constant. */
+const char *aom_codec_version_str(void) {
+  return VERSION_STRING_NOSP;
+}
+
+/* Returns the VERSION_EXTRA build constant. */
+const char *aom_codec_version_extra_str(void) {
+  return VERSION_EXTRA;
+}
+
+/* Returns the interface's name string, or a fixed placeholder when iface is
+ * NULL. */
+const char *aom_codec_iface_name(aom_codec_iface_t *iface) {
+  if (!iface) return "<invalid interface>";
+
+  return iface->name;
+}
+
+/* Maps an error code to a static, human readable description. Codes not
+ * listed in the switch yield "Unrecognized error code". */
+const char *aom_codec_err_to_string(aom_codec_err_t err) {
+  const char *text = "Unrecognized error code";
+
+  switch (err) {
+    case AOM_CODEC_OK: text = "Success"; break;
+    case AOM_CODEC_ERROR: text = "Unspecified internal error"; break;
+    case AOM_CODEC_MEM_ERROR: text = "Memory allocation error"; break;
+    case AOM_CODEC_ABI_MISMATCH: text = "ABI version mismatch"; break;
+    case AOM_CODEC_INCAPABLE:
+      text = "Codec does not implement requested capability";
+      break;
+    case AOM_CODEC_UNSUP_BITSTREAM:
+      text = "Bitstream not supported by this decoder";
+      break;
+    case AOM_CODEC_UNSUP_FEATURE:
+      text = "Bitstream required feature not supported by this decoder";
+      break;
+    case AOM_CODEC_CORRUPT_FRAME: text = "Corrupt frame detected"; break;
+    case AOM_CODEC_INVALID_PARAM: text = "Invalid parameter"; break;
+    case AOM_CODEC_LIST_END: text = "End of iterated list"; break;
+  }
+
+  return text;
+}
+
+/* Describes the context's last recorded status; a NULL context is reported
+ * as an invalid parameter. */
+const char *aom_codec_error(aom_codec_ctx_t *ctx) {
+  if (!ctx) return aom_codec_err_to_string(AOM_CODEC_INVALID_PARAM);
+
+  return aom_codec_err_to_string(ctx->err);
+}
+
+/* Returns the detail string for the context's current error, or NULL when
+ * ctx is NULL or no error is recorded. The private storage's detail is used
+ * when private storage exists; otherwise the context's own copy is used. */
+const char *aom_codec_error_detail(aom_codec_ctx_t *ctx) {
+  if (!ctx || !ctx->err) return NULL;
+
+  if (ctx->priv) return ctx->priv->err_detail;
+
+  return ctx->err_detail;
+}
+
+/* Invokes the interface's destroy hook on the context's private data and
+ * clears the iface/name/priv fields.
+ *
+ * Returns AOM_CODEC_INVALID_PARAM for a NULL ctx, AOM_CODEC_ERROR when the
+ * context has no interface or private data, and AOM_CODEC_OK otherwise. The
+ * result is also stored in ctx->err when ctx is non-NULL. */
+aom_codec_err_t aom_codec_destroy(aom_codec_ctx_t *ctx) {
+  aom_codec_err_t status = AOM_CODEC_OK;
+
+  if (!ctx) return AOM_CODEC_INVALID_PARAM;
+
+  if (ctx->iface && ctx->priv) {
+    ctx->iface->destroy((aom_codec_alg_priv_t *)ctx->priv);
+
+    ctx->priv = NULL;
+    ctx->name = NULL;
+    ctx->iface = NULL;
+  } else {
+    status = AOM_CODEC_ERROR;
+  }
+
+  return SAVE_STATUS(ctx, status);
+}
+
+/* Returns the interface's capability bits, or 0 for a NULL interface. */
+aom_codec_caps_t aom_codec_get_caps(aom_codec_iface_t *iface) {
+  if (!iface) return 0;
+
+  return iface->caps;
+}
+
+/* Dispatches a control request to the first handler in the interface's
+ * control map whose id matches ctrl_id. A map entry with id 0 matches any
+ * request. The map is scanned until an entry with a NULL handler is found.
+ *
+ * Returns AOM_CODEC_INVALID_PARAM for a NULL ctx or a zero ctrl_id,
+ * AOM_CODEC_ERROR when the context is not initialized, has no control map,
+ * or no entry matches; otherwise the handler's result. The result is also
+ * stored in ctx->err when ctx is non-NULL. */
+aom_codec_err_t aom_codec_control_(aom_codec_ctx_t *ctx, int ctrl_id, ...) {
+  aom_codec_ctrl_fn_map_t *map_entry;
+  aom_codec_err_t status = AOM_CODEC_ERROR;
+
+  if (!ctx || !ctrl_id) return SAVE_STATUS(ctx, AOM_CODEC_INVALID_PARAM);
+
+  if (!ctx->iface || !ctx->priv || !ctx->iface->ctrl_maps)
+    return SAVE_STATUS(ctx, AOM_CODEC_ERROR);
+
+  map_entry = ctx->iface->ctrl_maps;
+  while (map_entry && map_entry->fn) {
+    if (!map_entry->ctrl_id || map_entry->ctrl_id == ctrl_id) {
+      va_list args;
+
+      va_start(args, ctrl_id);
+      status = map_entry->fn((aom_codec_alg_priv_t *)ctx->priv, args);
+      va_end(args);
+      break;
+    }
+    ++map_entry;
+  }
+
+  return SAVE_STATUS(ctx, status);
+}
+
+/* Records an error in `info` and, if a jump target is armed, unwinds to it.
+ *
+ * info  - error record to fill in; must be non-NULL.
+ * error - code stored in info->error_code.
+ * fmt   - optional printf-style format for the detail text; when NULL no
+ *         detail is recorded and has_detail stays 0.
+ *
+ * If info->setjmp is zero the function returns normally to the caller.
+ */
+void aom_internal_error(struct aom_internal_error_info *info,
+                        aom_codec_err_t error, const char *fmt, ...) {
+  va_list ap;
+
+  info->error_code = error;
+  info->has_detail = 0;
+
+  if (fmt) {
+    size_t sz = sizeof(info->detail);
+
+    info->has_detail = 1;
+    va_start(ap, fmt);
+    /* Format into all but the last byte; the terminator is forced below. */
+    vsnprintf(info->detail, sz - 1, fmt, ap);
+    va_end(ap);
+    info->detail[sz - 1] = '\0';
+  }
+
+  /* Only jump when the flag says info->jmp has been set up. */
+  if (info->setjmp) longjmp(info->jmp, info->error_code);
+}
--- a/aom/src/aom_decoder.c
+++ b/aom/src/aom_decoder.c
@@ -0,0 +1,189 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*!\file
+ * \brief Provides the high level interface to wrap decoder algorithms.
+ *
+ */
+#include <string.h>
+#include "aom/internal/aom_codec_internal.h"
+
+/* Stores `var` in ctx->err when ctx is non-NULL and evaluates to the status
+ * either way. Every argument use is parenthesized so that expression
+ * arguments expand with the intended precedence. */
+#define SAVE_STATUS(ctx, var) ((ctx) ? ((ctx)->err = (var)) : (var))
+
+/* Views the context's private storage pointer as the algorithm's private
+ * data type. ctx must be non-NULL. */
+static aom_codec_alg_priv_t *get_alg_priv(aom_codec_ctx_t *ctx) {
+  aom_codec_alg_priv_t *const alg_priv = (aom_codec_alg_priv_t *)ctx->priv;
+
+  return alg_priv;
+}
+
+/* Initializes a decoder context for the given interface.
+ *
+ * Checks, in order: the caller's ABI version, NULL ctx/iface, the
+ * interface's internal ABI version, and that every capability requested in
+ * `flags` (plus the decoder capability itself) is advertised by the
+ * interface. On success the context is zeroed, populated, and handed to the
+ * interface's init hook; if that hook fails the error detail is copied to
+ * the context and the context is destroyed.
+ *
+ * The result is also stored in ctx->err when ctx is non-NULL. */
+aom_codec_err_t aom_codec_dec_init_ver(aom_codec_ctx_t *ctx,
+                                       aom_codec_iface_t *iface,
+                                       const aom_codec_dec_cfg_t *cfg,
+                                       aom_codec_flags_t flags, int ver) {
+  aom_codec_err_t status;
+
+  if (ver != AOM_DECODER_ABI_VERSION) {
+    status = AOM_CODEC_ABI_MISMATCH;
+  } else if (!ctx || !iface) {
+    status = AOM_CODEC_INVALID_PARAM;
+  } else if (iface->abi_version != AOM_CODEC_INTERNAL_ABI_VERSION) {
+    status = AOM_CODEC_ABI_MISMATCH;
+  } else {
+    const aom_codec_caps_t caps = iface->caps;
+    const int missing_cap =
+        ((flags & AOM_CODEC_USE_POSTPROC) &&
+         !(caps & AOM_CODEC_CAP_POSTPROC)) ||
+        ((flags & AOM_CODEC_USE_ERROR_CONCEALMENT) &&
+         !(caps & AOM_CODEC_CAP_ERROR_CONCEALMENT)) ||
+        ((flags & AOM_CODEC_USE_INPUT_FRAGMENTS) &&
+         !(caps & AOM_CODEC_CAP_INPUT_FRAGMENTS)) ||
+        !(caps & AOM_CODEC_CAP_DECODER);
+
+    if (missing_cap) {
+      status = AOM_CODEC_INCAPABLE;
+    } else {
+      memset(ctx, 0, sizeof(*ctx));
+      ctx->iface = iface;
+      ctx->name = iface->name;
+      ctx->priv = NULL;
+      ctx->init_flags = flags;
+      ctx->config.dec = cfg;
+
+      status = ctx->iface->init(ctx, NULL);
+      if (status) {
+        /* Keep the detail reachable after the private data is torn down. */
+        ctx->err_detail = ctx->priv ? ctx->priv->err_detail : NULL;
+        aom_codec_destroy(ctx);
+      }
+    }
+  }
+
+  return SAVE_STATUS(ctx, status);
+}
+
+/* Asks the interface to parse stream information from a data buffer without
+ * a decoder context.
+ *
+ * Returns AOM_CODEC_INVALID_PARAM when any pointer is NULL, data_sz is 0, or
+ * si->sz is smaller than the stream-info structure; otherwise the result of
+ * the interface's peek_si hook. Width and height are reset to 0 before the
+ * hook runs. */
+aom_codec_err_t aom_codec_peek_stream_info(aom_codec_iface_t *iface,
+                                           const uint8_t *data,
+                                           unsigned int data_sz,
+                                           aom_codec_stream_info_t *si) {
+  const int args_ok = iface && data && data_sz && si &&
+                      si->sz >= sizeof(aom_codec_stream_info_t);
+
+  if (!args_ok) return AOM_CODEC_INVALID_PARAM;
+
+  /* Set default/unknown values */
+  si->w = 0;
+  si->h = 0;
+
+  return iface->dec.peek_si(data, data_sz, si);
+}
+
+/* Queries stream information from an initialized decoder context.
+ *
+ * Returns AOM_CODEC_INVALID_PARAM for NULL arguments or an undersized
+ * si->sz, AOM_CODEC_ERROR when the context has no interface or private
+ * data, otherwise the result of the interface's get_si hook. The result is
+ * also stored in ctx->err when ctx is non-NULL. */
+aom_codec_err_t aom_codec_get_stream_info(aom_codec_ctx_t *ctx,
+                                          aom_codec_stream_info_t *si) {
+  aom_codec_err_t status;
+
+  if (!ctx || !si || si->sz < sizeof(aom_codec_stream_info_t)) {
+    status = AOM_CODEC_INVALID_PARAM;
+  } else if (ctx->iface && ctx->priv) {
+    /* Set default/unknown values */
+    si->w = 0;
+    si->h = 0;
+
+    status = ctx->iface->dec.get_si(get_alg_priv(ctx), si);
+  } else {
+    status = AOM_CODEC_ERROR;
+  }
+
+  return SAVE_STATUS(ctx, status);
+}
+
+/* Forwards a buffer of compressed data to the interface's decode hook.
+ *
+ * `data` and `data_sz` must either both be set or both be NULL/0; any other
+ * combination, or a NULL ctx, yields AOM_CODEC_INVALID_PARAM. A context
+ * without interface or private data yields AOM_CODEC_ERROR. The result is
+ * also stored in ctx->err when ctx is non-NULL. */
+aom_codec_err_t aom_codec_decode(aom_codec_ctx_t *ctx, const uint8_t *data,
+                                 unsigned int data_sz, void *user_priv,
+                                 long deadline) {
+  aom_codec_err_t status;
+  /* NULL data ptr allowed if data_sz is 0 too */
+  const int data_mismatch = (!data) != (!data_sz);
+
+  if (!ctx || data_mismatch) {
+    status = AOM_CODEC_INVALID_PARAM;
+  } else if (ctx->iface && ctx->priv) {
+    status = ctx->iface->dec.decode(get_alg_priv(ctx), data, data_sz,
+                                    user_priv, deadline);
+  } else {
+    status = AOM_CODEC_ERROR;
+  }
+
+  return SAVE_STATUS(ctx, status);
+}
+
+/* Returns the next image from the interface's get_frame hook, or NULL when
+ * ctx or iter is NULL or the context has no interface or private data. */
+aom_image_t *aom_codec_get_frame(aom_codec_ctx_t *ctx, aom_codec_iter_t *iter) {
+  if (ctx && iter && ctx->iface && ctx->priv)
+    return ctx->iface->dec.get_frame(get_alg_priv(ctx), iter);
+
+  return NULL;
+}
+
+/* Stores a put-frame callback and its user data in the context's private
+ * storage.
+ *
+ * Returns AOM_CODEC_INVALID_PARAM for a NULL ctx or cb, AOM_CODEC_ERROR when
+ * the context is not initialized or the interface lacks
+ * AOM_CODEC_CAP_PUT_FRAME, and AOM_CODEC_OK otherwise. The result is also
+ * stored in ctx->err when ctx is non-NULL. */
+aom_codec_err_t aom_codec_register_put_frame_cb(aom_codec_ctx_t *ctx,
+                                                aom_codec_put_frame_cb_fn_t cb,
+                                                void *user_priv) {
+  aom_codec_err_t status = AOM_CODEC_OK;
+
+  if (!ctx || !cb) {
+    status = AOM_CODEC_INVALID_PARAM;
+  } else if (ctx->iface && ctx->priv &&
+             (ctx->iface->caps & AOM_CODEC_CAP_PUT_FRAME)) {
+    aom_codec_priv_cb_pair_t *const slot = &ctx->priv->dec.put_frame_cb;
+
+    slot->u.put_frame = cb;
+    slot->user_priv = user_priv;
+  } else {
+    status = AOM_CODEC_ERROR;
+  }
+
+  return SAVE_STATUS(ctx, status);
+}
+
+/* Stores a put-slice callback and its user data in the context's private
+ * storage.
+ *
+ * Returns AOM_CODEC_INVALID_PARAM for a NULL ctx or cb, AOM_CODEC_ERROR when
+ * the context is not initialized or the interface lacks
+ * AOM_CODEC_CAP_PUT_SLICE, and AOM_CODEC_OK otherwise. The result is also
+ * stored in ctx->err when ctx is non-NULL. */
+aom_codec_err_t aom_codec_register_put_slice_cb(aom_codec_ctx_t *ctx,
+                                                aom_codec_put_slice_cb_fn_t cb,
+                                                void *user_priv) {
+  aom_codec_err_t status = AOM_CODEC_OK;
+
+  if (!ctx || !cb) {
+    status = AOM_CODEC_INVALID_PARAM;
+  } else if (ctx->iface && ctx->priv &&
+             (ctx->iface->caps & AOM_CODEC_CAP_PUT_SLICE)) {
+    aom_codec_priv_cb_pair_t *const slot = &ctx->priv->dec.put_slice_cb;
+
+    slot->u.put_slice = cb;
+    slot->user_priv = user_priv;
+  } else {
+    status = AOM_CODEC_ERROR;
+  }
+
+  return SAVE_STATUS(ctx, status);
+}
+
+/* Hands external frame buffer callbacks to the interface's set_fb_fn hook.
+ *
+ * Returns AOM_CODEC_INVALID_PARAM when ctx or either callback is NULL,
+ * AOM_CODEC_ERROR when the context is not initialized or the interface lacks
+ * AOM_CODEC_CAP_EXTERNAL_FRAME_BUFFER, otherwise the hook's result. The
+ * result is also stored in ctx->err when ctx is non-NULL. */
+aom_codec_err_t aom_codec_set_frame_buffer_functions(
+    aom_codec_ctx_t *ctx, aom_get_frame_buffer_cb_fn_t cb_get,
+    aom_release_frame_buffer_cb_fn_t cb_release, void *cb_priv) {
+  aom_codec_err_t status;
+
+  if (!ctx || !cb_get || !cb_release)
+    return SAVE_STATUS(ctx, AOM_CODEC_INVALID_PARAM);
+
+  if (ctx->iface && ctx->priv &&
+      (ctx->iface->caps & AOM_CODEC_CAP_EXTERNAL_FRAME_BUFFER)) {
+    status = ctx->iface->dec.set_fb_fn(get_alg_priv(ctx), cb_get, cb_release,
+                                       cb_priv);
+  } else {
+    status = AOM_CODEC_ERROR;
+  }
+
+  return SAVE_STATUS(ctx, status);
+}
--- a/aom/src/aom_encoder.c
+++ b/aom/src/aom_encoder.c
@@ -0,0 +1,380 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*!\file
+ * \brief Provides the high level interface to wrap encoder algorithms.
+ *
+ */
+#include <limits.h>
+#include <string.h>
+#include "aom_config.h"
+#include "aom/internal/aom_codec_internal.h"
+
+/* Stores `var` in ctx->err when ctx is non-NULL and evaluates to the status
+ * either way. Every argument use is parenthesized so that expression
+ * arguments expand with the intended precedence. */
+#define SAVE_STATUS(ctx, var) ((ctx) ? ((ctx)->err = (var)) : (var))
+
+/* Views the context's private storage pointer as the algorithm's private
+ * data type. ctx must be non-NULL. */
+static aom_codec_alg_priv_t *get_alg_priv(aom_codec_ctx_t *ctx) {
+  aom_codec_alg_priv_t *const alg_priv = (aom_codec_alg_priv_t *)ctx->priv;
+
+  return alg_priv;
+}
+
+/* Initializes an encoder context for the given interface and configuration.
+ *
+ * Checks, in order: the caller's ABI version, NULL ctx/iface/cfg, the
+ * interface's internal ABI version, and that the encoder capability and
+ * every capability requested in `flags` are advertised by the interface. On
+ * success the context is populated and handed to the interface's init hook;
+ * if that hook fails the error detail is copied to the context and the
+ * context is destroyed.
+ *
+ * The result is also stored in ctx->err when ctx is non-NULL. */
+aom_codec_err_t aom_codec_enc_init_ver(aom_codec_ctx_t *ctx,
+                                       aom_codec_iface_t *iface,
+                                       const aom_codec_enc_cfg_t *cfg,
+                                       aom_codec_flags_t flags, int ver) {
+  aom_codec_err_t status;
+
+  if (ver != AOM_ENCODER_ABI_VERSION) {
+    status = AOM_CODEC_ABI_MISMATCH;
+  } else if (!ctx || !iface || !cfg) {
+    status = AOM_CODEC_INVALID_PARAM;
+  } else if (iface->abi_version != AOM_CODEC_INTERNAL_ABI_VERSION) {
+    status = AOM_CODEC_ABI_MISMATCH;
+  } else {
+    const aom_codec_caps_t caps = iface->caps;
+    const int missing_cap =
+        !(caps & AOM_CODEC_CAP_ENCODER) ||
+        ((flags & AOM_CODEC_USE_PSNR) && !(caps & AOM_CODEC_CAP_PSNR)) ||
+        ((flags & AOM_CODEC_USE_OUTPUT_PARTITION) &&
+         !(caps & AOM_CODEC_CAP_OUTPUT_PARTITION));
+
+    if (missing_cap) {
+      status = AOM_CODEC_INCAPABLE;
+    } else {
+      ctx->iface = iface;
+      ctx->name = iface->name;
+      ctx->priv = NULL;
+      ctx->init_flags = flags;
+      ctx->config.enc = cfg;
+
+      status = ctx->iface->init(ctx, NULL);
+      if (status) {
+        /* Keep the detail reachable after the private data is torn down. */
+        ctx->err_detail = ctx->priv ? ctx->priv->err_detail : NULL;
+        aom_codec_destroy(ctx);
+      }
+    }
+  }
+
+  return SAVE_STATUS(ctx, status);
+}
+
+/* Initializes `num_enc` encoder contexts for multi-resolution encoding.
+ *
+ * ctx, cfg and dsf are parallel arrays of num_enc elements (1..16). Element
+ * i is initialized with encoder id (num_enc - 1 - i); every encoder with a
+ * non-zero id has its key-frame mode forced to AOM_KF_DISABLED.
+ *
+ * Returns AOM_CODEC_ABI_MISMATCH, AOM_CODEC_INVALID_PARAM or
+ * AOM_CODEC_INCAPABLE when the arguments or interface capabilities are
+ * unsuitable, or the first error reported by the interface.
+ *
+ * On any failure every context initialized so far is destroyed and the
+ * status is stored in ctx[0].err, the context the caller passed in. On
+ * success the status is stored in the last context, ctx[num_enc - 1].err.
+ *
+ * The contexts are addressed by index rather than by walking `ctx`, so the
+ * status is never written outside the caller's array.
+ */
+aom_codec_err_t aom_codec_enc_init_multi_ver(
+    aom_codec_ctx_t *ctx, aom_codec_iface_t *iface, aom_codec_enc_cfg_t *cfg,
+    int num_enc, aom_codec_flags_t flags, aom_rational_t *dsf, int ver) {
+  aom_codec_err_t res = AOM_CODEC_OK;
+  /* Context whose err field receives the final status. */
+  aom_codec_ctx_t *status_ctx = ctx;
+
+  if (ver != AOM_ENCODER_ABI_VERSION)
+    res = AOM_CODEC_ABI_MISMATCH;
+  else if (!ctx || !iface || !cfg || !dsf || (num_enc > 16 || num_enc < 1))
+    res = AOM_CODEC_INVALID_PARAM;
+  else if (iface->abi_version != AOM_CODEC_INTERNAL_ABI_VERSION)
+    res = AOM_CODEC_ABI_MISMATCH;
+  else if (!(iface->caps & AOM_CODEC_CAP_ENCODER))
+    res = AOM_CODEC_INCAPABLE;
+  else if ((flags & AOM_CODEC_USE_PSNR) && !(iface->caps & AOM_CODEC_CAP_PSNR))
+    res = AOM_CODEC_INCAPABLE;
+  else if ((flags & AOM_CODEC_USE_OUTPUT_PARTITION) &&
+           !(iface->caps & AOM_CODEC_CAP_OUTPUT_PARTITION))
+    res = AOM_CODEC_INCAPABLE;
+  else if (!iface->enc.mr_get_mem_loc)
+    /* Interface does not provide the multi-resolution hook. */
+    res = AOM_CODEC_INCAPABLE;
+  else {
+    int i;
+    void *mem_loc = NULL;
+
+    if (!(res = iface->enc.mr_get_mem_loc(cfg, &mem_loc))) {
+      for (i = 0; i < num_enc; i++) {
+        aom_codec_priv_enc_mr_cfg_t mr_cfg;
+        aom_codec_ctx_t *const cur = ctx + i;
+        const char *error_detail = NULL;
+
+        /* Validate down-sampling factor. */
+        if (dsf[i].num < 1 || dsf[i].num > 4096 || dsf[i].den < 1 ||
+            dsf[i].den > dsf[i].num) {
+          res = AOM_CODEC_INVALID_PARAM;
+        } else {
+          mr_cfg.mr_low_res_mode_info = mem_loc;
+          mr_cfg.mr_total_resolutions = num_enc;
+          mr_cfg.mr_encoder_id = num_enc - 1 - i;
+          mr_cfg.mr_down_sampling_factor.num = dsf[i].num;
+          mr_cfg.mr_down_sampling_factor.den = dsf[i].den;
+
+          /* Force Key-frame synchronization. Namely, encoder at higher
+           * resolution always use the same frame_type chosen by the
+           * lowest-resolution encoder.
+           */
+          if (mr_cfg.mr_encoder_id) cfg[i].kf_mode = AOM_KF_DISABLED;
+
+          cur->iface = iface;
+          cur->name = iface->name;
+          cur->priv = NULL;
+          cur->init_flags = flags;
+          cur->config.enc = &cfg[i];
+          res = cur->iface->init(cur, &mr_cfg);
+
+          if (res) {
+            /* Destroy current ctx, keeping its detail string reachable. */
+            error_detail = cur->priv ? cur->priv->err_detail : NULL;
+            cur->err_detail = error_detail;
+            aom_codec_destroy(cur);
+          }
+        }
+
+        if (res) {
+          /* Destroy already allocated high-level ctx */
+          while (i) {
+            i--;
+            ctx[i].err_detail = error_detail;
+            aom_codec_destroy(&ctx[i]);
+          }
+          break;
+        }
+      }
+
+      /* On success the status is recorded in the last context. */
+      if (!res) status_ctx = ctx + (num_enc - 1);
+    }
+  }
+
+  return SAVE_STATUS(status_ctx, res);
+}
+
+/* Copies the interface's default encoder configuration for `usage` into
+ * *cfg and sets cfg->g_usage.
+ *
+ * Returns AOM_CODEC_INVALID_PARAM for NULL arguments, a usage above INT_MAX,
+ * or a usage with no entry in the interface's configuration map;
+ * AOM_CODEC_INCAPABLE when the interface is not an encoder; AOM_CODEC_OK
+ * when a matching entry was copied. */
+aom_codec_err_t aom_codec_enc_config_default(aom_codec_iface_t *iface,
+                                             aom_codec_enc_cfg_t *cfg,
+                                             unsigned int usage) {
+  int idx;
+
+  if (!iface || !cfg || usage > INT_MAX) return AOM_CODEC_INVALID_PARAM;
+
+  if (!(iface->caps & AOM_CODEC_CAP_ENCODER)) return AOM_CODEC_INCAPABLE;
+
+  for (idx = 0; idx < iface->enc.cfg_map_count; ++idx) {
+    aom_codec_enc_cfg_map_t *entry = &iface->enc.cfg_maps[idx];
+
+    if (entry->usage == (int)usage) {
+      *cfg = entry->cfg;
+      cfg->g_usage = usage;
+      return AOM_CODEC_OK;
+    }
+  }
+
+  return AOM_CODEC_INVALID_PARAM;
+}
+
+#if ARCH_X86 || ARCH_X86_64
+/* On X86, disable the x87 unit's internal 80 bit precision for better
+ * consistency with the SSE unit's 64 bit precision.
+ */
+#include "aom_ports/x86.h"
+/* FLOATING_POINT_INIT() opens a `do {` block and saves the previous x87 mode
+ * in a local; FLOATING_POINT_RESTORE() restores that mode and closes the
+ * block with `} while (0)`. The two must therefore always be used as a
+ * matched pair within the same scope. */
+#define FLOATING_POINT_INIT() \
+  do {                        \
+    unsigned short x87_orig_mode = x87_set_double_precision();
+#define FLOATING_POINT_RESTORE()       \
+  x87_set_control_word(x87_orig_mode); \
+  }                                    \
+  while (0)
+
+#else
+/* Other architectures: both hooks are empty functions. */
+static void FLOATING_POINT_INIT() {}
+static void FLOATING_POINT_RESTORE() {}
+#endif
+
+/* Encodes one frame (or flushes when img is NULL) through the interface's
+ * encode hook.
+ *
+ * When the context's private data reports exactly one encoder, the hook is
+ * called once on `ctx`. Otherwise `ctx` (and `img`, when non-NULL) are
+ * treated as arrays of total_encoders elements and are encoded from the
+ * last element down to the first, stopping at the first error.
+ *
+ * Returns AOM_CODEC_INVALID_PARAM for a NULL ctx or an image with zero
+ * duration, AOM_CODEC_ERROR when the context is not initialized,
+ * AOM_CODEC_INCAPABLE when the interface is not an encoder, otherwise the
+ * hook's result. The status is always stored in the context the caller
+ * passed in (ctx[0]), so a failing level never causes a write past the end
+ * of the context array.
+ */
+aom_codec_err_t aom_codec_encode(aom_codec_ctx_t *ctx, const aom_image_t *img,
+                                 aom_codec_pts_t pts, unsigned long duration,
+                                 aom_enc_frame_flags_t flags,
+                                 unsigned long deadline) {
+  aom_codec_err_t res = AOM_CODEC_OK;
+
+  if (!ctx || (img && !duration))
+    res = AOM_CODEC_INVALID_PARAM;
+  else if (!ctx->iface || !ctx->priv)
+    res = AOM_CODEC_ERROR;
+  else if (!(ctx->iface->caps & AOM_CODEC_CAP_ENCODER))
+    res = AOM_CODEC_INCAPABLE;
+  else {
+    unsigned int num_enc = ctx->priv->enc.total_encoders;
+
+    /* Execute in a normalized floating point environment, if the platform
+     * requires it.
+     */
+    FLOATING_POINT_INIT();
+
+    if (num_enc == 1)
+      res = ctx->iface->enc.encode(get_alg_priv(ctx), img, pts, duration, flags,
+                                   deadline);
+    else {
+      /* Multi-resolution encoding:
+       * Encode multi-levels in reverse order. For example,
+       * if mr_total_resolutions = 3, first encode level 2,
+       * then encode level 1, and finally encode level 0.
+       *
+       * Levels are addressed by index so that `ctx` itself never moves; a
+       * count of zero simply encodes nothing.
+       */
+      unsigned int level;
+
+      for (level = num_enc; level > 0; level--) {
+        aom_codec_ctx_t *const cur = ctx + (level - 1);
+        const aom_image_t *const cur_img = img ? img + (level - 1) : NULL;
+
+        if ((res = cur->iface->enc.encode(get_alg_priv(cur), cur_img, pts,
+                                          duration, flags, deadline)))
+          break;
+      }
+    }
+
+    FLOATING_POINT_RESTORE();
+  }
+
+  return SAVE_STATUS(ctx, res);
+}
+
+/* Fetches the next output packet from the interface's get_cx_data hook.
+ *
+ * Returns NULL when ctx is NULL, when a validation check fails (the reason
+ * is stored in ctx->err), or when the hook returns NULL. For frame packets,
+ * if a destination buffer has been configured in the private encoder state
+ * and the packet plus its padding fits, the data is copied there and a
+ * modified packet stored in the private state is returned instead.
+ */
+const aom_codec_cx_pkt_t *aom_codec_get_cx_data(aom_codec_ctx_t *ctx,
+                                                aom_codec_iter_t *iter) {
+  const aom_codec_cx_pkt_t *pkt = NULL;
+
+  if (ctx) {
+    if (!iter)
+      ctx->err = AOM_CODEC_INVALID_PARAM;
+    else if (!ctx->iface || !ctx->priv)
+      ctx->err = AOM_CODEC_ERROR;
+    else if (!(ctx->iface->caps & AOM_CODEC_CAP_ENCODER))
+      ctx->err = AOM_CODEC_INCAPABLE;
+    else
+      pkt = ctx->iface->enc.get_cx_data(get_alg_priv(ctx), iter);
+  }
+
+  /* pkt can only be non-NULL here if ctx and ctx->priv were validated above,
+   * so the dereferences below are safe. */
+  if (pkt && pkt->kind == AOM_CODEC_CX_FRAME_PKT) {
+    // If the application has specified a destination area for the
+    // compressed data, and the codec has not placed the data there,
+    // and it fits, copy it.
+    aom_codec_priv_t *const priv = ctx->priv;
+    char *const dst_buf = (char *)priv->enc.cx_data_dst_buf.buf;
+
+    if (dst_buf && pkt->data.raw.buf != dst_buf &&
+        pkt->data.raw.sz + priv->enc.cx_data_pad_before +
+                priv->enc.cx_data_pad_after <=
+            priv->enc.cx_data_dst_buf.sz) {
+      aom_codec_cx_pkt_t *modified_pkt = &priv->enc.cx_data_pkt;
+
+      // Payload goes after the leading pad; the reported size covers both
+      // pads plus the payload.
+      memcpy(dst_buf + priv->enc.cx_data_pad_before, pkt->data.raw.buf,
+             pkt->data.raw.sz);
+      *modified_pkt = *pkt;
+      modified_pkt->data.raw.buf = dst_buf;
+      modified_pkt->data.raw.sz +=
+          priv->enc.cx_data_pad_before + priv->enc.cx_data_pad_after;
+      pkt = modified_pkt;
+    }
+
+    // When the packet lives in the destination buffer, advance the buffer
+    // past it so the next packet lands after this one.
+    if (dst_buf == pkt->data.raw.buf) {
+      priv->enc.cx_data_dst_buf.buf = dst_buf + pkt->data.raw.sz;
+      priv->enc.cx_data_dst_buf.sz -= pkt->data.raw.sz;
+    }
+  }
+
+  return pkt;
+}
+
+/* Configures (or, with a NULL buf, clears) the destination buffer and
+ * padding used for compressed output packets.
+ *
+ * Returns AOM_CODEC_INVALID_PARAM when ctx or its private data is NULL,
+ * AOM_CODEC_OK otherwise. */
+aom_codec_err_t aom_codec_set_cx_data_buf(aom_codec_ctx_t *ctx,
+                                          const aom_fixed_buf_t *buf,
+                                          unsigned int pad_before,
+                                          unsigned int pad_after) {
+  aom_codec_priv_t *priv;
+
+  if (!ctx || !ctx->priv) return AOM_CODEC_INVALID_PARAM;
+
+  priv = ctx->priv;
+
+  if (!buf) {
+    /* No buffer supplied: reset the destination and both pads. */
+    priv->enc.cx_data_dst_buf.buf = NULL;
+    priv->enc.cx_data_dst_buf.sz = 0;
+    priv->enc.cx_data_pad_before = 0;
+    priv->enc.cx_data_pad_after = 0;
+    return AOM_CODEC_OK;
+  }
+
+  priv->enc.cx_data_dst_buf = *buf;
+  priv->enc.cx_data_pad_before = pad_before;
+  priv->enc.cx_data_pad_after = pad_after;
+
+  return AOM_CODEC_OK;
+}
+
+/* Returns the image produced by the interface's get_preview hook, or NULL
+ * when ctx is NULL or a check fails. On a failed check ctx->err is set to
+ * AOM_CODEC_ERROR (context not initialized) or AOM_CODEC_INCAPABLE (not an
+ * encoder, or no preview hook). */
+const aom_image_t *aom_codec_get_preview_frame(aom_codec_ctx_t *ctx) {
+  if (!ctx) return NULL;
+
+  if (!ctx->iface || !ctx->priv) {
+    ctx->err = AOM_CODEC_ERROR;
+    return NULL;
+  }
+
+  if (!(ctx->iface->caps & AOM_CODEC_CAP_ENCODER) ||
+      !ctx->iface->enc.get_preview) {
+    ctx->err = AOM_CODEC_INCAPABLE;
+    return NULL;
+  }
+
+  return ctx->iface->enc.get_preview(get_alg_priv(ctx));
+}
+
+/* Returns the buffer produced by the interface's get_glob_hdrs hook, or NULL
+ * when ctx is NULL or a check fails. On a failed check ctx->err is set to
+ * AOM_CODEC_ERROR (context not initialized) or AOM_CODEC_INCAPABLE (not an
+ * encoder, or no global-headers hook). */
+aom_fixed_buf_t *aom_codec_get_global_headers(aom_codec_ctx_t *ctx) {
+  if (!ctx) return NULL;
+
+  if (!ctx->iface || !ctx->priv) {
+    ctx->err = AOM_CODEC_ERROR;
+    return NULL;
+  }
+
+  if (!(ctx->iface->caps & AOM_CODEC_CAP_ENCODER) ||
+      !ctx->iface->enc.get_glob_hdrs) {
+    ctx->err = AOM_CODEC_INCAPABLE;
+    return NULL;
+  }
+
+  return ctx->iface->enc.get_glob_hdrs(get_alg_priv(ctx));
+}
+
+/* Applies a new encoder configuration through the interface's cfg_set hook.
+ *
+ * Returns AOM_CODEC_INVALID_PARAM when ctx, its interface, its private data
+ * or cfg is NULL, AOM_CODEC_INCAPABLE when the interface is not an encoder,
+ * otherwise the hook's result. The result is also stored in ctx->err when
+ * ctx is non-NULL. */
+aom_codec_err_t aom_codec_enc_config_set(aom_codec_ctx_t *ctx,
+                                         const aom_codec_enc_cfg_t *cfg) {
+  aom_codec_err_t status;
+
+  if (ctx && ctx->iface && ctx->priv && cfg) {
+    if (ctx->iface->caps & AOM_CODEC_CAP_ENCODER)
+      status = ctx->iface->enc.cfg_set(get_alg_priv(ctx), cfg);
+    else
+      status = AOM_CODEC_INCAPABLE;
+  } else {
+    status = AOM_CODEC_INVALID_PARAM;
+  }
+
+  return SAVE_STATUS(ctx, status);
+}
+
+/* Appends a copy of *pkt to the list. Returns 0 on success, or 1 when the
+ * list already holds `max` packets. */
+int aom_codec_pkt_list_add(struct aom_codec_pkt_list *list,
+                           const struct aom_codec_cx_pkt *pkt) {
+  if (list->cnt >= list->max) return 1;
+
+  list->pkts[list->cnt] = *pkt;
+  list->cnt++;
+
+  return 0;
+}
+
+/* Iterates over the packets stored in the list.
+ *
+ * *iter must be NULL on the first call; it is then set to the start of the
+ * list. Each call returns the packet *iter points at and advances *iter, or
+ * returns NULL (leaving *iter unchanged) once `cnt` packets were returned. */
+const aom_codec_cx_pkt_t *aom_codec_pkt_list_get(
+    struct aom_codec_pkt_list *list, aom_codec_iter_t *iter) {
+  const aom_codec_cx_pkt_t *cur;
+
+  if (*iter == NULL) *iter = list->pkts;
+
+  cur = (const aom_codec_cx_pkt_t *)*iter;
+
+  /* Past the last stored packet: iteration is finished. */
+  if ((size_t)(cur - list->pkts) >= list->cnt) return NULL;
+
+  *iter = cur + 1;
+  return cur;
+}
--- a/aom/src/aom_image.c
+++ b/aom/src/aom_image.c
@@ -0,0 +1,240 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom/aom_image.h"
+#include "aom/aom_integer.h"
+#include "aom_mem/aom_mem.h"
+
+/* Common implementation behind aom_img_alloc() and aom_img_wrap().
+ *
+ * img          Descriptor to fill in, or NULL to calloc() one (self_allocd is
+ *              then set so aom_img_free() releases it).
+ * fmt          Pixel format; selects bits per pixel and the chroma shifts.
+ * d_w, d_h     Displayed size. The stored size is rounded up to a multiple of
+ *              the chroma subsampling factor.
+ * buf_align    Alignment handed to aom_memalign(); 0 is treated as 1 and the
+ *              value must be a power of two.
+ * stride_align Row stride alignment; 0 is treated as 1 and the value must be
+ *              a power of two.
+ * img_data     Existing pixel storage to use, or NULL to allocate it here.
+ *
+ * Returns the descriptor on success and NULL on failure.
+ */
+static aom_image_t *img_alloc_helper(aom_image_t *img, aom_img_fmt_t fmt,
+                                     unsigned int d_w, unsigned int d_h,
+                                     unsigned int buf_align,
+                                     unsigned int stride_align,
+                                     unsigned char *img_data) {
+  unsigned int h, w, s, xcs, ycs, bps;
+  unsigned int stride_in_bytes;
+  int align;
+
+  /* Clear a caller-supplied descriptor before any check can jump to `fail`:
+   * aom_img_free() inspects img_data, img_data_owner and self_allocd, which
+   * would otherwise be read uninitialised (and possibly freed). */
+  if (img) memset(img, 0, sizeof(aom_image_t));
+
+  /* Treat align==0 like align==1 */
+  if (!buf_align) buf_align = 1;
+
+  /* Validate alignment (must be power of 2) */
+  if (buf_align & (buf_align - 1)) goto fail;
+
+  /* Treat align==0 like align==1 */
+  if (!stride_align) stride_align = 1;
+
+  /* Validate alignment (must be power of 2) */
+  if (stride_align & (stride_align - 1)) goto fail;
+
+  /* Get sample size for this format */
+  switch (fmt) {
+    case AOM_IMG_FMT_RGB32:
+    case AOM_IMG_FMT_RGB32_LE:
+    case AOM_IMG_FMT_ARGB:
+    case AOM_IMG_FMT_ARGB_LE: bps = 32; break;
+    case AOM_IMG_FMT_RGB24:
+    case AOM_IMG_FMT_BGR24: bps = 24; break;
+    case AOM_IMG_FMT_RGB565:
+    case AOM_IMG_FMT_RGB565_LE:
+    case AOM_IMG_FMT_RGB555:
+    case AOM_IMG_FMT_RGB555_LE:
+    case AOM_IMG_FMT_UYVY:
+    case AOM_IMG_FMT_YUY2:
+    case AOM_IMG_FMT_YVYU: bps = 16; break;
+    case AOM_IMG_FMT_I420:
+    case AOM_IMG_FMT_YV12:
+    case AOM_IMG_FMT_AOMI420:
+    case AOM_IMG_FMT_AOMYV12: bps = 12; break;
+    case AOM_IMG_FMT_I422:
+    case AOM_IMG_FMT_I440: bps = 16; break;
+    case AOM_IMG_FMT_I444: bps = 24; break;
+    case AOM_IMG_FMT_I42016: bps = 24; break;
+    case AOM_IMG_FMT_I42216:
+    case AOM_IMG_FMT_I44016: bps = 32; break;
+    case AOM_IMG_FMT_I44416: bps = 48; break;
+    default: bps = 16; break;
+  }
+
+  /* Get chroma shift values for this format */
+  switch (fmt) {
+    case AOM_IMG_FMT_I420:
+    case AOM_IMG_FMT_YV12:
+    case AOM_IMG_FMT_AOMI420:
+    case AOM_IMG_FMT_AOMYV12:
+    case AOM_IMG_FMT_I422:
+    case AOM_IMG_FMT_I42016:
+    case AOM_IMG_FMT_I42216: xcs = 1; break;
+    default: xcs = 0; break;
+  }
+
+  switch (fmt) {
+    case AOM_IMG_FMT_I420:
+    case AOM_IMG_FMT_I440:
+    case AOM_IMG_FMT_YV12:
+    case AOM_IMG_FMT_AOMI420:
+    case AOM_IMG_FMT_AOMYV12:
+    case AOM_IMG_FMT_I42016:
+    case AOM_IMG_FMT_I44016: ycs = 1; break;
+    default: ycs = 0; break;
+  }
+
+  /* Calculate storage sizes given the chroma subsampling */
+  align = (1 << xcs) - 1;
+  w = (d_w + align) & ~align;
+  align = (1 << ycs) - 1;
+  h = (d_h + align) & ~align;
+  /* s is the row stride: samples for planar formats, bytes for packed ones. */
+  s = (fmt & AOM_IMG_FMT_PLANAR) ? w : bps * w / 8;
+  s = (s + stride_align - 1) & ~(stride_align - 1);
+  stride_in_bytes = (fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? s * 2 : s;
+
+  /* Allocate the descriptor when the caller did not supply one */
+  if (!img) {
+    img = (aom_image_t *)calloc(1, sizeof(aom_image_t));
+
+    if (!img) goto fail;
+
+    img->self_allocd = 1;
+  }
+
+  img->img_data = img_data;
+
+  if (!img_data) {
+    /* Computed in 64 bits so a size that does not fit size_t is detected. */
+    const uint64_t alloc_size = (fmt & AOM_IMG_FMT_PLANAR)
+                                    ? (uint64_t)h * s * bps / 8
+                                    : (uint64_t)h * s;
+
+    if (alloc_size != (size_t)alloc_size) goto fail;
+
+    img->img_data = (uint8_t *)aom_memalign(buf_align, (size_t)alloc_size);
+    img->img_data_owner = 1;
+  }
+
+  if (!img->img_data) goto fail;
+
+  img->fmt = fmt;
+  img->bit_depth = (fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 16 : 8;
+  img->w = w;
+  img->h = h;
+  img->x_chroma_shift = xcs;
+  img->y_chroma_shift = ycs;
+  img->bps = bps;
+
+  /* Calculate strides */
+  img->stride[AOM_PLANE_Y] = img->stride[AOM_PLANE_ALPHA] = stride_in_bytes;
+  img->stride[AOM_PLANE_U] = img->stride[AOM_PLANE_V] = stride_in_bytes >> xcs;
+
+  /* Default viewport to entire image */
+  if (!aom_img_set_rect(img, 0, 0, d_w, d_h)) return img;
+
+fail:
+  aom_img_free(img);
+  return NULL;
+}
+
+/* Creates an image whose pixel storage is allocated by the helper. The same
+ * `align` value is used for both the buffer and the row stride. */
+aom_image_t *aom_img_alloc(aom_image_t *img, aom_img_fmt_t fmt,
+                           unsigned int d_w, unsigned int d_h,
+                           unsigned int align) {
+  /* No caller-supplied pixel storage. */
+  unsigned char *const no_external_data = NULL;
+
+  return img_alloc_helper(img, fmt, d_w, d_h, align, align, no_external_data);
+}
+
+/* Describes caller-owned pixel storage (img_data) as an image without
+ * allocating a new buffer. */
+aom_image_t *aom_img_wrap(aom_image_t *img, aom_img_fmt_t fmt, unsigned int d_w,
+                          unsigned int d_h, unsigned int stride_align,
+                          unsigned char *img_data) {
+  /* A buffer alignment of 1 leaves the supplied buffer's alignment as is. */
+  const unsigned int keep_buf_align = 1;
+
+  return img_alloc_helper(img, fmt, d_w, d_h, keep_buf_align, stride_align,
+                          img_data);
+}
+
+/* Sets the displayed rectangle of an image and recomputes the plane pointers
+ * so that they address its top-left sample.
+ *
+ * x, y  Top-left corner of the rectangle, in samples.
+ * w, h  Size of the rectangle, in samples.
+ *
+ * Returns 0 on success, or -1 (leaving img untouched) when the rectangle does
+ * not fit inside the img->w by img->h storage area.
+ */
+int aom_img_set_rect(aom_image_t *img, unsigned int x, unsigned int y,
+                     unsigned int w, unsigned int h) {
+  unsigned char *data;
+
+  /* Compare without forming x + w or y + h: those sums are unsigned and can
+   * wrap around, which would let an out-of-range rectangle pass the check. */
+  if (w > img->w || x > img->w - w || h > img->h || y > img->h - h) return -1;
+
+  img->d_w = w;
+  img->d_h = h;
+
+  /* Calculate plane pointers */
+  if (!(img->fmt & AOM_IMG_FMT_PLANAR)) {
+    img->planes[AOM_PLANE_PACKED] =
+        img->img_data + x * img->bps / 8 + y * img->stride[AOM_PLANE_PACKED];
+  } else {
+    const int bytes_per_sample =
+        (img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 2 : 1;
+    data = img->img_data;
+
+    /* Planes are laid out back to back; `data` walks from one to the next. */
+    if (img->fmt & AOM_IMG_FMT_HAS_ALPHA) {
+      img->planes[AOM_PLANE_ALPHA] =
+          data + x * bytes_per_sample + y * img->stride[AOM_PLANE_ALPHA];
+      data += img->h * img->stride[AOM_PLANE_ALPHA];
+    }
+
+    img->planes[AOM_PLANE_Y] =
+        data + x * bytes_per_sample + y * img->stride[AOM_PLANE_Y];
+    data += img->h * img->stride[AOM_PLANE_Y];
+
+    /* AOM_IMG_FMT_UV_FLIP selects whether U or V is stored first. */
+    if (!(img->fmt & AOM_IMG_FMT_UV_FLIP)) {
+      img->planes[AOM_PLANE_U] =
+          data + (x >> img->x_chroma_shift) * bytes_per_sample +
+          (y >> img->y_chroma_shift) * img->stride[AOM_PLANE_U];
+      data += (img->h >> img->y_chroma_shift) * img->stride[AOM_PLANE_U];
+      img->planes[AOM_PLANE_V] =
+          data + (x >> img->x_chroma_shift) * bytes_per_sample +
+          (y >> img->y_chroma_shift) * img->stride[AOM_PLANE_V];
+    } else {
+      img->planes[AOM_PLANE_V] =
+          data + (x >> img->x_chroma_shift) * bytes_per_sample +
+          (y >> img->y_chroma_shift) * img->stride[AOM_PLANE_V];
+      data += (img->h >> img->y_chroma_shift) * img->stride[AOM_PLANE_V];
+      img->planes[AOM_PLANE_U] =
+          data + (x >> img->x_chroma_shift) * bytes_per_sample +
+          (y >> img->y_chroma_shift) * img->stride[AOM_PLANE_U];
+    }
+  }
+  return 0;
+}
+
+/* Flips the image vertically in place: each plane pointer is moved to the
+ * last displayed row and each stride is negated. */
+void aom_img_flip(aom_image_t *img) {
+  /* Note: the row offsets are held as signed values on purpose. Section
+   * 6.3.1.8 of the ISO C99 standard indicates that if the adjustment
+   * parameter is unsigned, the stride parameter will be promoted to
+   * unsigned, causing errors when the lhs is a larger type than the rhs.
+   */
+  const int last_row = (signed)(img->d_h - 1);
+  const int last_chroma_row = (signed)((img->d_h >> img->y_chroma_shift) - 1);
+
+  img->planes[AOM_PLANE_Y] += last_row * img->stride[AOM_PLANE_Y];
+  img->stride[AOM_PLANE_Y] = -img->stride[AOM_PLANE_Y];
+
+  img->planes[AOM_PLANE_U] += last_chroma_row * img->stride[AOM_PLANE_U];
+  img->stride[AOM_PLANE_U] = -img->stride[AOM_PLANE_U];
+
+  img->planes[AOM_PLANE_V] += last_chroma_row * img->stride[AOM_PLANE_V];
+  img->stride[AOM_PLANE_V] = -img->stride[AOM_PLANE_V];
+
+  img->planes[AOM_PLANE_ALPHA] += last_row * img->stride[AOM_PLANE_ALPHA];
+  img->stride[AOM_PLANE_ALPHA] = -img->stride[AOM_PLANE_ALPHA];
+}
+
+/* Releases an image: the pixel buffer is freed only when the image owns it,
+ * and the descriptor only when it was allocated by this library. A NULL img
+ * is ignored. */
+void aom_img_free(aom_image_t *img) {
+  if (!img) return;
+
+  if (img->img_data_owner && img->img_data) aom_free(img->img_data);
+
+  if (img->self_allocd) free(img);
+}
--- a/aom_dsp/add_noise.c
+++ b/aom_dsp/add_noise.c
@@ -11,27 +11,27 @@
 #include <math.h>
 #include <stdlib.h>

-#include "./vpx_config.h"
-#include "./vpx_dsp_rtcd.h"
+#include "./aom_config.h"
+#include "./aom_dsp_rtcd.h"

-#include "vpx/vpx_integer.h"
-#include "vpx_dsp/postproc.h"
-#include "vpx_ports/mem.h"
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+
+void aom_plane_add_noise_c(uint8_t *start, char *noise, char blackclamp[16],
+                           char whiteclamp[16], char bothclamp[16],
+                           unsigned int width, unsigned int height, int pitch) {
+  unsigned int i, j;

-void vpx_plane_add_noise_c(uint8_t *start, const int8_t *noise, int blackclamp,
-                           int whiteclamp, int width, int height, int pitch) {
-  int i, j;
-  int bothclamp = blackclamp + whiteclamp;
  for (i = 0; i < height; ++i) {
    uint8_t *pos = start + i * pitch;
-    const int8_t *ref = (const int8_t *)(noise + (rand() & 0xff));  // NOLINT
+    char *ref = (char *)(noise + (rand() & 0xff));  // NOLINT

    for (j = 0; j < width; ++j) {
      int v = pos[j];

-      v = clamp(v - blackclamp, 0, 255);
-      v = clamp(v + bothclamp, 0, 255);
-      v = clamp(v - whiteclamp, 0, 255);
+      v = clamp(v - blackclamp[0], 0, 255);
+      v = clamp(v + bothclamp[0], 0, 255);
+      v = clamp(v - whiteclamp[0], 0, 255);

      pos[j] = v + ref[j];
    }
@@ -43,8 +43,8 @@ static double gaussian(double sigma, double mu, double x) {
         (exp(-(x - mu) * (x - mu) / (2 * sigma * sigma)));
 }

-int vpx_setup_noise(double sigma, int8_t *noise, int size) {
-  int8_t char_dist[256];
+int aom_setup_noise(double sigma, int size, char *noise) {
+  char char_dist[256];
  int next = 0, i, j;

  // set up a 256 entry lookup that matches gaussian distribution
@@ -52,7 +52,7 @@ int vpx_setup_noise(double sigma, int8_t *noise, int size) {
    const int a_i = (int)(0.5 + 256 * gaussian(sigma, 0, i));
    if (a_i) {
      for (j = 0; j < a_i; ++j) {
-        char_dist[next + j] = (int8_t)i;
+        char_dist[next + j] = (char)i;
      }
      next = next + j;
    }
--- a/aom_dsp/ans.c
+++ b/aom_dsp/ans.c
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include "./aom_config.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/ans.h"
+#include "aom_dsp/prob.h"
+
+// Returns the index of the first largest entry in pdf_tab[0..num_syms), or -1
+// when num_syms is not positive.
+static int find_largest(const aom_cdf_prob *const pdf_tab, int num_syms) {
+  int best_idx = -1;
+  int best_val = -1;
+  int idx = 0;
+  while (idx < num_syms) {
+    const int cur = pdf_tab[idx];
+    // Strict comparison keeps the earliest index on ties.
+    if (cur > best_val) {
+      best_val = cur;
+      best_idx = idx;
+    }
+    ++idx;
+  }
+  return best_idx;
+}
+
+// Builds an (in_syms + 1)-entry pdf in out_pdf: entry 0 is node_prob scaled up
+// to RANS_PROB_BITS, and entries 1..in_syms are the src_pdf entries scaled by
+// (ANS_P8_PRECISION - node_prob) / ANS_P8_PRECISION and clamped to at least 1.
+// The result is then corrected so that the entries sum to RANS_PRECISION.
+// out_pdf must not alias src_pdf.
+void aom_rans_merge_prob8_pdf(aom_cdf_prob *const out_pdf,
+                              const AnsP8 node_prob,
+                              const aom_cdf_prob *const src_pdf, int in_syms) {
+  int sym;
+  int remaining = RANS_PRECISION;
+  const int half = ANS_P8_PRECISION >> 1;
+  const AnsP8 complement = ANS_P8_PRECISION - node_prob;
+  const int out_syms = in_syms + 1;
+  assert(src_pdf != out_pdf);
+
+  out_pdf[0] = node_prob << (RANS_PROB_BITS - ANS_P8_SHIFT);
+  remaining -= out_pdf[0];
+  for (sym = 0; sym < in_syms; ++sym) {
+    int scaled = (complement * src_pdf[sym] + half) >> ANS_P8_SHIFT;
+    scaled = AOMMIN(scaled, (int)RANS_PRECISION - in_syms);
+    scaled = AOMMAX(scaled, 1);
+    out_pdf[sym + 1] = scaled;
+    remaining -= scaled;
+  }
+
+  // Rounding left probability mass over: give all of it to the largest entry.
+  if (remaining > 0) {
+    sym = find_largest(out_pdf, out_syms);
+    out_pdf[sym] += remaining;
+    return;
+  }
+
+  // Too much mass was handed out: take it back one unit at a time, always
+  // from whichever entry is currently the largest.
+  for (; remaining < 0; ++remaining) {
+    sym = find_largest(out_pdf, out_syms);
+    --out_pdf[sym];
+    assert(out_pdf[sym] > 0);
+  }
+}
--- a/aom_dsp/ans.h
+++ b/aom_dsp/ans.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_ANS_H_
+#define AOM_DSP_ANS_H_
+// Constants, types and utilities for Asymmetric Numeral Systems
+// http://arxiv.org/abs/1311.2540v2
+
+#include <assert.h>
+#include "./aom_config.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/prob.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// 8-bit probability type taken by the uABS read/write routines.
+typedef uint8_t AnsP8;
+// Denominator of an AnsP8 probability (2^8) and the matching shift.
+#define ANS_P8_PRECISION 256u
+#define ANS_P8_SHIFT 8
+// rANS probabilities are expressed out of RANS_PRECISION == 2^RANS_PROB_BITS.
+#define RANS_PROB_BITS 15
+#define RANS_PRECISION (1u << RANS_PROB_BITS)
+
+// L_BASE % PRECISION must be 0. Increasing L_BASE beyond 2**15 will cause uabs
+// to overflow.
+#define L_BASE (RANS_PRECISION)
+// The coder state is extended or shortened one IO_BASE digit (a byte) at a
+// time.
+#define IO_BASE 256
+// Range I = { L_BASE, L_BASE + 1, ..., L_BASE * IO_BASE - 1 }
+
+// Builds an (in_syms + 1)-entry pdf in out_pdf. Entry 0 is node_prob scaled to
+// RANS_PROB_BITS; the remaining entries are the src_pdf entries scaled by
+// (ANS_P8_PRECISION - node_prob) / ANS_P8_PRECISION. The entries are adjusted
+// so that they sum to RANS_PRECISION. out_pdf must not alias src_pdf.
+void aom_rans_merge_prob8_pdf(aom_cdf_prob *const out_pdf,
+                              const AnsP8 node_prob,
+                              const aom_cdf_prob *const src_pdf, int in_syms);
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+#endif  // AOM_DSP_ANS_H_
--- a/aom_dsp/ansreader.h
+++ b/aom_dsp/ansreader.h
@@ -0,0 +1,146 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_ANSREADER_H_
+#define AOM_DSP_ANSREADER_H_
+// A uABS and rANS decoder implementation of Asymmetric Numeral Systems
+// http://arxiv.org/abs/1311.2540v2
+
+#include <assert.h>
+#include "./aom_config.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/prob.h"
+#include "aom_dsp/ans.h"
+#include "aom_ports/mem_ops.h"
+#if CONFIG_ACCOUNTING
+#include "av1/common/accounting.h"
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// State of an ANS decoder reading from a byte buffer.
+struct AnsDecoder {
+  // Input bytes. The read routines consume them from the end of the buffer
+  // towards index 0.
+  const uint8_t *buf;
+  // Number of bytes of buf that have not been consumed yet.
+  int buf_offset;
+  // Current ANS state.
+  uint32_t state;
+#if CONFIG_ACCOUNTING
+  // Reset to NULL by ans_read_init().
+  Accounting *accounting;
+#endif
+};
+
+// Decodes one binary symbol with uABS. p0 is the probability (out of
+// ANS_P8_PRECISION) of the symbol being 0. Returns the decoded bit.
+static INLINE int uabs_read(struct AnsDecoder *ans, AnsP8 p0) {
+  const AnsP8 p1 = ANS_P8_PRECISION - p0;
+  unsigned x = ans->state;
+  unsigned scaled, quot;
+  int bit;
+
+  // Renormalize: pull in bytes until the state is back in range or the
+  // buffer is exhausted.
+  while (x < L_BASE && ans->buf_offset > 0) {
+    --ans->buf_offset;
+    x = x * IO_BASE + ans->buf[ans->buf_offset];
+  }
+
+  scaled = x * p1;
+  quot = scaled / ANS_P8_PRECISION;
+  bit = (scaled & 0xFF) >= p0;
+  ans->state = bit ? quot : x - quot;
+  return bit;
+}
+
+// Decodes one bit without a probability model: the bit is the low bit of the
+// renormalized state, which is then shifted out.
+static INLINE int uabs_read_bit(struct AnsDecoder *ans) {
+  unsigned x = ans->state;
+
+  // Renormalize: pull in bytes until the state is back in range or the
+  // buffer is exhausted.
+  while (x < L_BASE && ans->buf_offset > 0) {
+    --ans->buf_offset;
+    x = x * IO_BASE + ans->buf[ans->buf_offset];
+  }
+
+  ans->state = x >> 1;
+  return (int)(x & 1);
+}
+
+// Result of looking a value up in a cdf table (filled in by fetch_sym()).
+struct rans_dec_sym {
+  // Index of the symbol in the cdf table.
+  uint8_t val;
+  // Width of the symbol's interval: its cdf entry minus cum_prob.
+  aom_cdf_prob prob;
+  // Cdf entry of the preceding symbol (0 for the first symbol).
+  aom_cdf_prob cum_prob;  // not-inclusive
+};
+
+// Finds the first cdf entry that is greater than rem and describes that symbol
+// in *out: its index, the width of its interval and the interval's start.
+static INLINE void fetch_sym(struct rans_dec_sym *out, const aom_cdf_prob *cdf,
+                             aom_cdf_prob rem) {
+  int idx = 0;
+  aom_cdf_prob lower = 0;
+  aom_cdf_prob upper = cdf[0];
+  // TODO(skal): if critical, could be a binary search.
+  // Or, better, an O(1) alias-table.
+  while (rem >= upper) {
+    lower = upper;
+    ++idx;
+    upper = cdf[idx];
+  }
+  out->val = idx;
+  out->prob = upper - lower;
+  out->cum_prob = lower;
+}
+
+// Decodes one symbol with rANS using the cdf table `tab` and returns its
+// index.
+static INLINE int rans_read(struct AnsDecoder *ans, const aom_cdf_prob *tab) {
+  struct rans_dec_sym sym;
+  unsigned quotient;
+  unsigned remainder;
+
+  // Renormalize: pull in bytes until the state is back in range or the
+  // buffer is exhausted.
+  while (ans->state < L_BASE && ans->buf_offset > 0) {
+    --ans->buf_offset;
+    ans->state = ans->state * IO_BASE + ans->buf[ans->buf_offset];
+  }
+
+  remainder = ans->state % RANS_PRECISION;
+  quotient = ans->state / RANS_PRECISION;
+  fetch_sym(&sym, tab, remainder);
+  ans->state = quotient * sym.prob + remainder - sym.cum_prob;
+  return sym.val;
+}
+
+// Prepares `ans` to decode the `offset` bytes at `buf`. The initial state is
+// stored at the end of the buffer; the top bits of the last byte select how
+// many bytes it occupies. Returns 0 on success and 1 on error.
+static INLINE int ans_read_init(struct AnsDecoder *const ans,
+                                const uint8_t *const buf, int offset) {
+  if (offset < 1) return 1;
+  ans->buf = buf;
+
+  switch (buf[offset - 1] >> 6) {
+    case 0:
+      // 00xxxxxx: state stored in 1 byte.
+      ans->buf_offset = offset - 1;
+      ans->state = buf[offset - 1] & 0x3F;
+      break;
+    case 1:
+      // 01xxxxxx: state stored in 2 bytes.
+      if (offset < 2) return 1;
+      ans->buf_offset = offset - 2;
+      ans->state = mem_get_le16(buf + offset - 2) & 0x3FFF;
+      break;
+    case 2:
+      // 10xxxxxx: state stored in 3 bytes.
+      if (offset < 3) return 1;
+      ans->buf_offset = offset - 3;
+      ans->state = mem_get_le24(buf + offset - 3) & 0x3FFFFF;
+      break;
+    default:
+      // 110xxxxx implies this byte is a superframe marker
+      if ((buf[offset - 1] & 0xE0) != 0xE0) return 1;
+      // 111xxxxx: state stored in 4 bytes.
+      if (offset < 4) return 1;
+      ans->buf_offset = offset - 4;
+      ans->state = mem_get_le32(buf + offset - 4) & 0x1FFFFFFF;
+      break;
+  }
+#if CONFIG_ACCOUNTING
+  ans->accounting = NULL;
+#endif
+  ans->state += L_BASE;
+  if (ans->state >= L_BASE * IO_BASE) return 1;
+  return 0;
+}
+
+// Returns nonzero when the decoder state equals L_BASE, the value the writer
+// starts from in ans_write_init().
+static INLINE int ans_read_end(struct AnsDecoder *const ans) {
+  const int back_at_base = (ans->state == L_BASE);
+  return back_at_base;
+}
+
+// Returns nonzero when the state is below L_BASE and no input bytes are left
+// to renormalize it with.
+static INLINE int ans_reader_has_error(const struct AnsDecoder *const ans) {
+  if (ans->buf_offset != 0) return 0;
+  return ans->state < L_BASE;
+}
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+#endif  // AOM_DSP_ANSREADER_H_
--- a/aom_dsp/answriter.h
+++ b/aom_dsp/answriter.h
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_ANSWRITER_H_
+#define AOM_DSP_ANSWRITER_H_
+// A uABS and rANS encoder implementation of Asymmetric Numeral Systems
+// http://arxiv.org/abs/1311.2540v2
+
+#include <assert.h>
+#include "./aom_config.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/ans.h"
+#include "aom_dsp/prob.h"
+#include "aom_ports/mem_ops.h"
+#include "av1/common/odintrin.h"
+
+// ANS_DIVREM(quotient, remainder, dividend, divisor) stores dividend / divisor
+// and dividend % divisor into its first two arguments. When RANS_PRECISION
+// fits within OD_DIVU_DMAX the quotient comes from OD_DIVU_SMALL and the
+// remainder is derived from it; otherwise the plain / and % operators are
+// used. Both macros below are #undef'd again at the end of this header.
+#if RANS_PRECISION <= OD_DIVU_DMAX
+#define ANS_DIVREM(quotient, remainder, dividend, divisor) \
+  do {                                                     \
+    quotient = OD_DIVU_SMALL((dividend), (divisor));       \
+    remainder = (dividend) - (quotient) * (divisor);       \
+  } while (0)
+#else
+#define ANS_DIVREM(quotient, remainder, dividend, divisor) \
+  do {                                                     \
+    quotient = (dividend) / (divisor);                     \
+    remainder = (dividend) % (divisor);                    \
+  } while (0)
+#endif
+
+// Division used by uabs_write(), where the divisor is an AnsP8 value.
+#define ANS_DIV8(dividend, divisor) OD_DIVU_SMALL((dividend), (divisor))
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// State of an ANS encoder writing to a byte buffer.
+struct AnsCoder {
+  // Output bytes; the write routines append at buf[buf_offset].
+  uint8_t *buf;
+  // Number of bytes written so far (set to 0 by ans_write_init()).
+  int buf_offset;
+  // Current ANS state (set to L_BASE by ans_write_init()).
+  uint32_t state;
+};
+
+// Starts a new encode into `buf`: nothing written yet, state at L_BASE.
+static INLINE void ans_write_init(struct AnsCoder *const ans,
+                                  uint8_t *const buf) {
+  ans->state = L_BASE;
+  ans->buf_offset = 0;
+  ans->buf = buf;
+}
+
+// Finishes the encode by appending the final state (minus L_BASE) to the
+// buffer in 1 to 4 bytes; the top bits of the last byte written tag the
+// length (00 = 1 byte, 01 = 2, 10 = 3, 111 = 4). Returns the total number of
+// bytes in the buffer. A state too large to serialize trips an assert and
+// returns the offset unchanged.
+static INLINE int ans_write_end(struct AnsCoder *const ans) {
+  uint32_t state;
+  assert(ans->state >= L_BASE);
+  assert(ans->state < L_BASE * IO_BASE);
+  state = ans->state - L_BASE;
+  if (state < (1 << 6)) {
+    ans->buf[ans->buf_offset] = (0x00 << 6) + state;
+    return ans->buf_offset + 1;
+  } else if (state < (1 << 14)) {
+    mem_put_le16(ans->buf + ans->buf_offset, (0x01 << 14) + state);
+    return ans->buf_offset + 2;
+  } else if (state < (1 << 22)) {
+    mem_put_le24(ans->buf + ans->buf_offset, (0x02 << 22) + state);
+    return ans->buf_offset + 3;
+  } else if (state < (1 << 29)) {
+    // The tag must be shifted as an unsigned value: 0x07 << 29 does not fit
+    // in a signed int, and overflowing a signed left shift is undefined.
+    mem_put_le32(ans->buf + ans->buf_offset, (0x07u << 29) + state);
+    return ans->buf_offset + 4;
+  } else {
+    assert(0 && "State is too large to be serialized");
+    return ans->buf_offset;
+  }
+}
+
+// uABS encoder step with normalization. Encodes the binary symbol `val`, where
+// p0 is the probability (out of ANS_P8_PRECISION) of the symbol being 0.
+static INLINE void uabs_write(struct AnsCoder *ans, int val, AnsP8 p0) {
+  AnsP8 p1 = ANS_P8_PRECISION - p0;
+  const unsigned l_s = val ? p1 : p0;
+  // States at or above this limit must shed a byte before encoding.
+  const unsigned flush_limit = L_BASE / ANS_P8_PRECISION * IO_BASE * l_s;
+
+  while (ans->state >= flush_limit) {
+    ans->buf[ans->buf_offset] = ans->state % IO_BASE;
+    ans->buf_offset++;
+    ans->state /= IO_BASE;
+  }
+
+  if (val)
+    ans->state = ANS_DIV8((ans->state + 1) * ANS_P8_PRECISION + p1 - 1, p1) - 1;
+  else
+    ans->state = ANS_DIV8(ans->state * ANS_P8_PRECISION, p0);
+}
+
+// Description of one symbol handed to rans_write().
+struct rans_sym {
+  // Width of the symbol's interval; rans_write() divides the state by it.
+  aom_cdf_prob prob;
+  // Start of the symbol's interval; rans_write() adds it to the new state.
+  aom_cdf_prob cum_prob;  // not-inclusive
+};
+
+// rANS encoder step with normalization.
+// sym->prob takes the place of l_s from the paper
+// RANS_PRECISION is m
+static INLINE void rans_write(struct AnsCoder *ans,
+                              const struct rans_sym *const sym) {
+  const aom_cdf_prob freq = sym->prob;
+  // States at or above this limit must shed a byte before encoding.
+  const unsigned flush_limit = L_BASE / RANS_PRECISION * IO_BASE * freq;
+  unsigned quotient, remainder;
+
+  while (ans->state >= flush_limit) {
+    ans->buf[ans->buf_offset] = ans->state % IO_BASE;
+    ans->buf_offset++;
+    ans->state /= IO_BASE;
+  }
+
+  ANS_DIVREM(quotient, remainder, ans->state, freq);
+  ans->state = quotient * RANS_PRECISION + remainder + sym->cum_prob;
+}
+
+#undef ANS_DIV8
+#undef ANS_DIVREM
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+#endif  // AOM_DSP_ANSWRITER_H_
--- a/aom_dsp/aom_convolve.c
+++ b/aom_dsp/aom_convolve.c
@@ -0,0 +1,600 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <string.h>
+
+#include "./aom_config.h"
+#include "./aom_dsp_rtcd.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_convolve.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_ports/mem.h"
+
+// Horizontal SUBPEL_TAPS-tap filter over a w x h block. The sample position
+// starts at x0_q4 on every row and advances by x_step_q4 per output pixel; its
+// low bits pick the kernel and its high bits the source column.
+static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride,
+                           uint8_t *dst, ptrdiff_t dst_stride,
+                           const InterpKernel *x_filters, int x0_q4,
+                           int x_step_q4, int w, int h) {
+  int row, col;
+  // Step back so the filter taps are centred on the source pixel.
+  src -= SUBPEL_TAPS / 2 - 1;
+  for (row = 0; row < h; ++row, src += src_stride, dst += dst_stride) {
+    int pos_q4 = x0_q4;
+    for (col = 0; col < w; ++col, pos_q4 += x_step_q4) {
+      const uint8_t *const window = &src[pos_q4 >> SUBPEL_BITS];
+      const int16_t *const kernel = x_filters[pos_q4 & SUBPEL_MASK];
+      int acc = 0;
+      int tap;
+      for (tap = 0; tap < SUBPEL_TAPS; ++tap) acc += window[tap] * kernel[tap];
+      dst[col] = clip_pixel(ROUND_POWER_OF_TWO(acc, FILTER_BITS));
+    }
+  }
+}
+
+// Same filter as convolve_horiz(), but each filtered pixel is averaged (with
+// rounding) into the value already present in dst.
+static void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride,
+                               uint8_t *dst, ptrdiff_t dst_stride,
+                               const InterpKernel *x_filters, int x0_q4,
+                               int x_step_q4, int w, int h) {
+  int row, col;
+  // Step back so the filter taps are centred on the source pixel.
+  src -= SUBPEL_TAPS / 2 - 1;
+  for (row = 0; row < h; ++row, src += src_stride, dst += dst_stride) {
+    int pos_q4 = x0_q4;
+    for (col = 0; col < w; ++col, pos_q4 += x_step_q4) {
+      const uint8_t *const window = &src[pos_q4 >> SUBPEL_BITS];
+      const int16_t *const kernel = x_filters[pos_q4 & SUBPEL_MASK];
+      int acc = 0;
+      int tap;
+      uint8_t filtered;
+      for (tap = 0; tap < SUBPEL_TAPS; ++tap) acc += window[tap] * kernel[tap];
+      filtered = clip_pixel(ROUND_POWER_OF_TWO(acc, FILTER_BITS));
+      dst[col] = ROUND_POWER_OF_TWO(dst[col] + filtered, 1);
+    }
+  }
+}
+
+// Vertical SUBPEL_TAPS-tap filter over a w x h block, processed one column at
+// a time. The sample position starts at y0_q4 in every column and advances by
+// y_step_q4 per output row; its low bits pick the kernel and its high bits the
+// source row.
+static void convolve_vert(const uint8_t *src, ptrdiff_t src_stride,
+                          uint8_t *dst, ptrdiff_t dst_stride,
+                          const InterpKernel *y_filters, int y0_q4,
+                          int y_step_q4, int w, int h) {
+  int col, row;
+  // Step up so the filter taps are centred on the source pixel.
+  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+
+  for (col = 0; col < w; ++col, ++src, ++dst) {
+    int pos_q4 = y0_q4;
+    for (row = 0; row < h; ++row, pos_q4 += y_step_q4) {
+      const unsigned char *window = &src[(pos_q4 >> SUBPEL_BITS) * src_stride];
+      const int16_t *const kernel = y_filters[pos_q4 & SUBPEL_MASK];
+      int acc = 0;
+      int tap;
+      for (tap = 0; tap < SUBPEL_TAPS; ++tap)
+        acc += window[tap * src_stride] * kernel[tap];
+      dst[row * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(acc, FILTER_BITS));
+    }
+  }
+}
+
+// Same filter as convolve_vert(), but each filtered pixel is averaged (with
+// rounding) into the value already present in dst.
+static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride,
+                              uint8_t *dst, ptrdiff_t dst_stride,
+                              const InterpKernel *y_filters, int y0_q4,
+                              int y_step_q4, int w, int h) {
+  int col, row;
+  // Step up so the filter taps are centred on the source pixel.
+  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+
+  for (col = 0; col < w; ++col, ++src, ++dst) {
+    int pos_q4 = y0_q4;
+    for (row = 0; row < h; ++row, pos_q4 += y_step_q4) {
+      const unsigned char *window = &src[(pos_q4 >> SUBPEL_BITS) * src_stride];
+      const int16_t *const kernel = y_filters[pos_q4 & SUBPEL_MASK];
+      int acc = 0;
+      int tap;
+      uint8_t filtered;
+      for (tap = 0; tap < SUBPEL_TAPS; ++tap)
+        acc += window[tap * src_stride] * kernel[tap];
+      filtered = clip_pixel(ROUND_POWER_OF_TWO(acc, FILTER_BITS));
+      dst[row * dst_stride] =
+          ROUND_POWER_OF_TWO(dst[row * dst_stride] + filtered, 1);
+    }
+  }
+}
+
+// Separable 2-D filter: a horizontal pass into a stack buffer followed by a
+// vertical pass from that buffer into dst.
+static void convolve(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+                     ptrdiff_t dst_stride, const InterpKernel *const x_filters,
+                     int x0_q4, int x_step_q4,
+                     const InterpKernel *const y_filters, int y0_q4,
+                     int y_step_q4, int w, int h) {
+  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
+  // 2d filtering proceeds in 2 steps:
+  //   (1) Interpolate horizontally into an intermediate buffer, temp.
+  //   (2) Interpolate temp vertically to derive the sub-pixel result.
+  // Deriving the maximum number of rows in the temp buffer (135):
+  // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
+  // --Largest block size is 64x64 pixels.
+  // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
+  //   original frame (in 1/16th pixel units).
+  // --Must round-up because block may be located at sub-pixel position.
+  // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
+  // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
+  uint8_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
+  // Rows above the block that the vertical filter reaches back to.
+  const int rows_above = SUBPEL_TAPS / 2 - 1;
+  // Source rows the vertical pass will touch, filter tails included.
+  const int rows_needed =
+      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
+
+  assert(w <= MAX_SB_SIZE);
+  assert(h <= MAX_SB_SIZE);
+
+  assert(y_step_q4 <= 32);
+  assert(x_step_q4 <= 32);
+
+  // Pass 1 starts rows_above rows early so the vertical taps have context.
+  convolve_horiz(src - src_stride * rows_above, src_stride, temp, MAX_SB_SIZE,
+                 x_filters, x0_q4, x_step_q4, w, rows_needed);
+  // Pass 2 reads temp from the row that corresponds to the block's first row.
+  convolve_vert(temp + MAX_SB_SIZE * rows_above, MAX_SB_SIZE, dst, dst_stride,
+                y_filters, y0_q4, y_step_q4, w, h);
+}
+
+// Recovers the start of the kernel table that `filter` points into by
+// clearing the low 8 bits of its address.
+static const InterpKernel *get_filter_base(const int16_t *filter) {
+  // NOTE: This assumes that the filter table is 256-byte aligned.
+  // TODO(agrange) Modify to make independent of table alignment.
+  const intptr_t addr = (intptr_t)filter;
+  const intptr_t table_mask = ~((intptr_t)0xFF);
+  return (const InterpKernel *)(addr & table_mask);
+}
+
+// Returns the index, counted in kernels, of `f` within the table at `base`.
+static int get_filter_offset(const int16_t *f, const InterpKernel *base) {
+  const InterpKernel *const kernel = (const InterpKernel *)(intptr_t)f;
+  return (int)(kernel - base);
+}
+
+// C implementation of the horizontal-only 8-tap convolution. filter_y and
+// y_step_q4 are accepted but not used.
+void aom_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+                           uint8_t *dst, ptrdiff_t dst_stride,
+                           const int16_t *filter_x, int x_step_q4,
+                           const int16_t *filter_y, int y_step_q4, int w,
+                           int h) {
+  // Split filter_x into its kernel table and the starting kernel index.
+  const InterpKernel *const kernel_table = get_filter_base(filter_x);
+  const int start_q4 = get_filter_offset(filter_x, kernel_table);
+
+  (void)y_step_q4;
+  (void)filter_y;
+
+  convolve_horiz(src, src_stride, dst, dst_stride, kernel_table, start_q4,
+                 x_step_q4, w, h);
+}
+
+// C implementation of the horizontal-only 8-tap convolution whose result is
+// averaged into dst. filter_y and y_step_q4 are accepted but not used.
+void aom_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+                               uint8_t *dst, ptrdiff_t dst_stride,
+                               const int16_t *filter_x, int x_step_q4,
+                               const int16_t *filter_y, int y_step_q4, int w,
+                               int h) {
+  // Split filter_x into its kernel table and the starting kernel index.
+  const InterpKernel *const kernel_table = get_filter_base(filter_x);
+  const int start_q4 = get_filter_offset(filter_x, kernel_table);
+
+  (void)y_step_q4;
+  (void)filter_y;
+
+  convolve_avg_horiz(src, src_stride, dst, dst_stride, kernel_table, start_q4,
+                     x_step_q4, w, h);
+}
+
+// C implementation of the vertical-only 8-tap convolution. filter_x and
+// x_step_q4 are accepted but not used.
+void aom_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+                          uint8_t *dst, ptrdiff_t dst_stride,
+                          const int16_t *filter_x, int x_step_q4,
+                          const int16_t *filter_y, int y_step_q4, int w,
+                          int h) {
+  // Split filter_y into its kernel table and the starting kernel index.
+  const InterpKernel *const kernel_table = get_filter_base(filter_y);
+  const int start_q4 = get_filter_offset(filter_y, kernel_table);
+
+  (void)x_step_q4;
+  (void)filter_x;
+
+  convolve_vert(src, src_stride, dst, dst_stride, kernel_table, start_q4,
+                y_step_q4, w, h);
+}
+
+// C implementation of the vertical-only 8-tap convolution whose result is
+// averaged into dst. filter_x and x_step_q4 are accepted but not used.
+void aom_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+                              uint8_t *dst, ptrdiff_t dst_stride,
+                              const int16_t *filter_x, int x_step_q4,
+                              const int16_t *filter_y, int y_step_q4, int w,
+                              int h) {
+  // Split filter_y into its kernel table and the starting kernel index.
+  const InterpKernel *const kernel_table = get_filter_base(filter_y);
+  const int start_q4 = get_filter_offset(filter_y, kernel_table);
+
+  (void)x_step_q4;
+  (void)filter_x;
+
+  convolve_avg_vert(src, src_stride, dst, dst_stride, kernel_table, start_q4,
+                    y_step_q4, w, h);
+}
+
+// C implementation of the 2-D (horizontal then vertical) 8-tap convolution.
+void aom_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+                     ptrdiff_t dst_stride, const int16_t *filter_x,
+                     int x_step_q4, const int16_t *filter_y, int y_step_q4,
+                     int w, int h) {
+  // Split each filter pointer into its kernel table and starting index.
+  const InterpKernel *const x_table = get_filter_base(filter_x);
+  const InterpKernel *const y_table = get_filter_base(filter_y);
+  const int x_start_q4 = get_filter_offset(filter_x, x_table);
+  const int y_start_q4 = get_filter_offset(filter_y, y_table);
+
+  convolve(src, src_stride, dst, dst_stride, x_table, x_start_q4, x_step_q4,
+           y_table, y_start_q4, y_step_q4, w, h);
+}
+
+// 2D convolution whose result is averaged into dst, C implementation. The
+// filtered block is first written to a MAX_SB_SIZE x MAX_SB_SIZE stack buffer
+// and then combined with dst by aom_convolve_avg_c(), so w and h must not
+// exceed MAX_SB_SIZE (asserted).
+void aom_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+                         ptrdiff_t dst_stride, const int16_t *filter_x,
+                         int x_step_q4, const int16_t *filter_y, int y_step_q4,
+                         int w, int h) {
+  /* Fixed-size scratch block; its dimensions bound w and h. */
+  DECLARE_ALIGNED(16, uint8_t, scratch[MAX_SB_SIZE * MAX_SB_SIZE]);
+  assert(w <= MAX_SB_SIZE);
+  assert(h <= MAX_SB_SIZE);
+
+  // Step 1: filter src into scratch, using MAX_SB_SIZE as the row stride.
+  aom_convolve8_c(src, src_stride, scratch, MAX_SB_SIZE, filter_x, x_step_q4,
+                  filter_y, y_step_q4, w, h);
+  // Step 2: average scratch into dst; no filter arguments are needed.
+  aom_convolve_avg_c(scratch, MAX_SB_SIZE, dst, dst_stride, NULL, 0, NULL, 0,
+                     w, h);
+}
+
+// Copies a block of h rows of w bytes from src to dst, one memcpy per row.
+// The filter arguments are ignored; they exist only so the function matches
+// the common convolve signature.
+void aom_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+                         ptrdiff_t dst_stride, const int16_t *filter_x,
+                         int filter_x_stride, const int16_t *filter_y,
+                         int filter_y_stride, int w, int h) {
+  int row;
+
+  (void)filter_y_stride;
+  (void)filter_y;
+  (void)filter_x_stride;
+  (void)filter_x;
+
+  for (row = 0; row < h; ++row) {
+    memcpy(dst, src, w);
+    dst += dst_stride;
+    src += src_stride;
+  }
+}
+
+// Replaces every pixel of the w x h block at dst with
+// ROUND_POWER_OF_TWO(dst + src, 1), i.e. the rounded mean of the two blocks.
+// The filter arguments are ignored.
+void aom_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+                        ptrdiff_t dst_stride, const int16_t *filter_x,
+                        int filter_x_stride, const int16_t *filter_y,
+                        int filter_y_stride, int w, int h) {
+  int row, col;
+
+  (void)filter_y_stride;
+  (void)filter_y;
+  (void)filter_x_stride;
+  (void)filter_x;
+
+  for (row = 0; row < h; ++row) {
+    for (col = 0; col < w; ++col) {
+      dst[col] = ROUND_POWER_OF_TWO(dst[col] + src[col], 1);
+    }
+
+    dst += dst_stride;
+    src += src_stride;
+  }
+}
+
+// C version of the "scaled" horizontal entry point: forwards every argument
+// unchanged to aom_convolve8_horiz_c().
+void aom_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+                        ptrdiff_t dst_stride, const int16_t *filter_x,
+                        int x_step_q4, const int16_t *filter_y, int y_step_q4,
+                        int w, int h) {
+  aom_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
+                        filter_y, y_step_q4, w, h);
+}
+
+// C version of the "scaled" vertical entry point: forwards every argument
+// unchanged to aom_convolve8_vert_c().
+void aom_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+                       ptrdiff_t dst_stride, const int16_t *filter_x,
+                       int x_step_q4, const int16_t *filter_y, int y_step_q4,
+                       int w, int h) {
+  aom_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
+                       filter_y, y_step_q4, w, h);
+}
+
+// C version of the "scaled" 2D entry point: forwards every argument unchanged
+// to aom_convolve8_c().
+void aom_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+                     ptrdiff_t dst_stride, const int16_t *filter_x,
+                     int x_step_q4, const int16_t *filter_y, int y_step_q4,
+                     int w, int h) {
+  aom_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
+                  filter_y, y_step_q4, w, h);
+}
+
+// C version of the "scaled" averaging horizontal entry point: forwards every
+// argument unchanged to aom_convolve8_avg_horiz_c().
+void aom_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+                            uint8_t *dst, ptrdiff_t dst_stride,
+                            const int16_t *filter_x, int x_step_q4,
+                            const int16_t *filter_y, int y_step_q4, int w,
+                            int h) {
+  aom_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x,
+                            x_step_q4, filter_y, y_step_q4, w, h);
+}
+
+// C version of the "scaled" averaging vertical entry point: forwards every
+// argument unchanged to aom_convolve8_avg_vert_c().
+void aom_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+                           uint8_t *dst, ptrdiff_t dst_stride,
+                           const int16_t *filter_x, int x_step_q4,
+                           const int16_t *filter_y, int y_step_q4, int w,
+                           int h) {
+  aom_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x,
+                           x_step_q4, filter_y, y_step_q4, w, h);
+}
+
+// C version of the "scaled" averaging 2D entry point: forwards every argument
+// unchanged to aom_convolve8_avg_c().
+void aom_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+                         ptrdiff_t dst_stride, const int16_t *filter_x,
+                         int x_step_q4, const int16_t *filter_y, int y_step_q4,
+                         int w, int h) {
+  aom_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
+                      filter_y, y_step_q4, w, h);
+}
+
+#if CONFIG_AOM_HIGHBITDEPTH
+// Horizontal sub-pixel filter for high bit-depth data. src8 / dst8 are turned
+// into uint16_t pointers with CONVERT_TO_SHORTPTR, so both strides count
+// uint16_t samples. Every output row starts at position x0_q4 and advances by
+// x_step_q4 per output sample; each SUBPEL_TAPS-tap sum is rounded by
+// FILTER_BITS and clipped by clip_pixel_highbd() for bit depth bd.
+static void highbd_convolve_horiz(const uint8_t *src8, ptrdiff_t src_stride,
+                                  uint8_t *dst8, ptrdiff_t dst_stride,
+                                  const InterpKernel *x_filters, int x0_q4,
+                                  int x_step_q4, int w, int h, int bd) {
+  uint16_t *in = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *out = CONVERT_TO_SHORTPTR(dst8);
+  int r, c, tap;
+
+  // Step back so the filter window begins SUBPEL_TAPS / 2 - 1 samples to the
+  // left of each output position.
+  in -= SUBPEL_TAPS / 2 - 1;
+
+  for (r = 0; r < h; ++r) {
+    int pos_q4 = x0_q4;
+    for (c = 0; c < w; ++c) {
+      // Integer part selects the source window, fractional part the kernel.
+      const uint16_t *const window = &in[pos_q4 >> SUBPEL_BITS];
+      const int16_t *const kernel = x_filters[pos_q4 & SUBPEL_MASK];
+      int acc = 0;
+      for (tap = 0; tap < SUBPEL_TAPS; ++tap) acc += window[tap] * kernel[tap];
+      out[c] = clip_pixel_highbd(ROUND_POWER_OF_TWO(acc, FILTER_BITS), bd);
+      pos_q4 += x_step_q4;
+    }
+    in += src_stride;
+    out += dst_stride;
+  }
+}
+
+// Horizontal sub-pixel filter for high bit-depth data that averages into dst.
+// Same walk as the non-averaging horizontal filter, but each clipped result is
+// combined with the sample already in dst via ROUND_POWER_OF_TWO(a + b, 1).
+static void highbd_convolve_avg_horiz(const uint8_t *src8, ptrdiff_t src_stride,
+                                      uint8_t *dst8, ptrdiff_t dst_stride,
+                                      const InterpKernel *x_filters, int x0_q4,
+                                      int x_step_q4, int w, int h, int bd) {
+  uint16_t *in = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *out = CONVERT_TO_SHORTPTR(dst8);
+  int r, c, tap;
+
+  // Step back so the filter window begins SUBPEL_TAPS / 2 - 1 samples to the
+  // left of each output position.
+  in -= SUBPEL_TAPS / 2 - 1;
+
+  for (r = 0; r < h; ++r) {
+    int pos_q4 = x0_q4;
+    for (c = 0; c < w; ++c) {
+      // Integer part selects the source window, fractional part the kernel.
+      const uint16_t *const window = &in[pos_q4 >> SUBPEL_BITS];
+      const int16_t *const kernel = x_filters[pos_q4 & SUBPEL_MASK];
+      int acc = 0;
+      for (tap = 0; tap < SUBPEL_TAPS; ++tap) acc += window[tap] * kernel[tap];
+      out[c] = ROUND_POWER_OF_TWO(
+          out[c] + clip_pixel_highbd(ROUND_POWER_OF_TWO(acc, FILTER_BITS), bd),
+          1);
+      pos_q4 += x_step_q4;
+    }
+    in += src_stride;
+    out += dst_stride;
+  }
+}
+
+// Vertical sub-pixel filter for high bit-depth data. Works one column at a
+// time: each column starts at position y0_q4 and advances by y_step_q4 per
+// output row. Taps are read src_stride samples apart, and each sum is rounded
+// by FILTER_BITS and clipped by clip_pixel_highbd() for bit depth bd.
+static void highbd_convolve_vert(const uint8_t *src8, ptrdiff_t src_stride,
+                                 uint8_t *dst8, ptrdiff_t dst_stride,
+                                 const InterpKernel *y_filters, int y0_q4,
+                                 int y_step_q4, int w, int h, int bd) {
+  uint16_t *in = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *out = CONVERT_TO_SHORTPTR(dst8);
+  int r, c, tap;
+
+  // Step back so the filter window begins SUBPEL_TAPS / 2 - 1 rows above each
+  // output position.
+  in -= src_stride * (SUBPEL_TAPS / 2 - 1);
+
+  for (c = 0; c < w; ++c) {
+    int pos_q4 = y0_q4;
+    for (r = 0; r < h; ++r) {
+      // Integer part selects the first source row, fractional part the kernel.
+      const uint16_t *column = &in[(pos_q4 >> SUBPEL_BITS) * src_stride];
+      const int16_t *const kernel = y_filters[pos_q4 & SUBPEL_MASK];
+      int acc = 0;
+      for (tap = 0; tap < SUBPEL_TAPS; ++tap)
+        acc += column[tap * src_stride] * kernel[tap];
+      out[r * dst_stride] =
+          clip_pixel_highbd(ROUND_POWER_OF_TWO(acc, FILTER_BITS), bd);
+      pos_q4 += y_step_q4;
+    }
+    ++in;
+    ++out;
+  }
+}
+
+// Vertical sub-pixel filter for high bit-depth data that averages into dst.
+// Same column-by-column walk as the non-averaging vertical filter, but each
+// clipped result is combined with the sample already in dst via
+// ROUND_POWER_OF_TWO(a + b, 1).
+static void highbd_convolve_avg_vert(const uint8_t *src8, ptrdiff_t src_stride,
+                                     uint8_t *dst8, ptrdiff_t dst_stride,
+                                     const InterpKernel *y_filters, int y0_q4,
+                                     int y_step_q4, int w, int h, int bd) {
+  uint16_t *in = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *out = CONVERT_TO_SHORTPTR(dst8);
+  int r, c, tap;
+
+  // Step back so the filter window begins SUBPEL_TAPS / 2 - 1 rows above each
+  // output position.
+  in -= src_stride * (SUBPEL_TAPS / 2 - 1);
+
+  for (c = 0; c < w; ++c) {
+    int pos_q4 = y0_q4;
+    for (r = 0; r < h; ++r) {
+      // Integer part selects the first source row, fractional part the kernel.
+      const uint16_t *column = &in[(pos_q4 >> SUBPEL_BITS) * src_stride];
+      const int16_t *const kernel = y_filters[pos_q4 & SUBPEL_MASK];
+      int acc = 0;
+      for (tap = 0; tap < SUBPEL_TAPS; ++tap)
+        acc += column[tap * src_stride] * kernel[tap];
+      out[r * dst_stride] = ROUND_POWER_OF_TWO(
+          out[r * dst_stride] +
+              clip_pixel_highbd(ROUND_POWER_OF_TWO(acc, FILTER_BITS), bd),
+          1);
+      pos_q4 += y_step_q4;
+    }
+    ++in;
+    ++out;
+  }
+}
+
+// 2D sub-pixel filter for high bit-depth data, done as a horizontal pass into
+// a stack buffer followed by a vertical pass from that buffer into dst.
+static void highbd_convolve(const uint8_t *src, ptrdiff_t src_stride,
+                            uint8_t *dst, ptrdiff_t dst_stride,
+                            const InterpKernel *const x_filters, int x0_q4,
+                            int x_step_q4, const InterpKernel *const y_filters,
+                            int y0_q4, int y_step_q4, int w, int h, int bd) {
+  // The intermediate buffer has a fixed size (MAX_EXT_SIZE rows of MAX_SB_SIZE
+  // samples), which limits w, h and the step sizes; see the asserts below.
+  //   (1) Interpolate horizontally into the intermediate buffer.
+  //   (2) Interpolate the buffer vertically to derive the sub-pixel result.
+  // Row-count derivation for the 64x64 case (135 rows):
+  // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
+  // --Largest block size is 64x64 pixels.
+  // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
+  //   original frame (in 1/16th pixel units).
+  // --Must round up because the block may sit at a sub-pixel position.
+  // --An additional SUBPEL_TAPS rows are needed for the 8-tap filter tails.
+  // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
+  uint16_t scratch[MAX_EXT_SIZE * MAX_SB_SIZE];
+  uint8_t *const scratch8 = CONVERT_TO_BYTEPTR(scratch);
+  // Rows above the block that the vertical filter reaches back to.
+  const int lead_rows = SUBPEL_TAPS / 2 - 1;
+  // Source rows the vertical pass will touch for h output rows.
+  int rows_needed =
+      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
+
+  assert(w <= MAX_SB_SIZE);
+  assert(h <= MAX_SB_SIZE);
+  assert(y_step_q4 <= 32);
+  assert(x_step_q4 <= 32);
+
+  // Pass 1: start lead_rows above the block so the buffer holds every row the
+  // vertical filter needs.
+  highbd_convolve_horiz(src - src_stride * lead_rows, src_stride, scratch8,
+                        MAX_SB_SIZE, x_filters, x0_q4, x_step_q4, w,
+                        rows_needed, bd);
+  // Pass 2: skip the same lead rows inside the buffer; the vertical filter
+  // steps back over them again by itself.
+  highbd_convolve_vert(scratch8 + MAX_SB_SIZE * lead_rows, MAX_SB_SIZE, dst,
+                       dst_stride, y_filters, y0_q4, y_step_q4, w, h, bd);
+}
+
+// High bit-depth horizontal-only convolution, C implementation. Resolves the
+// kernel table and starting offset for filter_x and runs
+// highbd_convolve_horiz(); filter_y and y_step_q4 are accepted but ignored.
+void aom_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+                                  uint8_t *dst, ptrdiff_t dst_stride,
+                                  const int16_t *filter_x, int x_step_q4,
+                                  const int16_t *filter_y, int y_step_q4, int w,
+                                  int h, int bd) {
+  const InterpKernel *const x_kernels = get_filter_base(filter_x);
+  const int x_start = get_filter_offset(filter_x, x_kernels);
+
+  // Not used by the horizontal pass.
+  (void)y_step_q4;
+  (void)filter_y;
+
+  highbd_convolve_horiz(src, src_stride, dst, dst_stride, x_kernels, x_start,
+                        x_step_q4, w, h, bd);
+}
+
+// High bit-depth horizontal-only convolution averaged into dst, C
+// implementation. Resolves the kernel table and starting offset for filter_x
+// and runs highbd_convolve_avg_horiz(); filter_y and y_step_q4 are ignored.
+void aom_highbd_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+                                      uint8_t *dst, ptrdiff_t dst_stride,
+                                      const int16_t *filter_x, int x_step_q4,
+                                      const int16_t *filter_y, int y_step_q4,
+                                      int w, int h, int bd) {
+  const InterpKernel *const x_kernels = get_filter_base(filter_x);
+  const int x_start = get_filter_offset(filter_x, x_kernels);
+
+  // Not used by the horizontal pass.
+  (void)y_step_q4;
+  (void)filter_y;
+
+  highbd_convolve_avg_horiz(src, src_stride, dst, dst_stride, x_kernels,
+                            x_start, x_step_q4, w, h, bd);
+}
+
+// High bit-depth vertical-only convolution, C implementation. Resolves the
+// kernel table and starting offset for filter_y and runs
+// highbd_convolve_vert(); filter_x and x_step_q4 are accepted but ignored.
+void aom_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+                                 uint8_t *dst, ptrdiff_t dst_stride,
+                                 const int16_t *filter_x, int x_step_q4,
+                                 const int16_t *filter_y, int y_step_q4, int w,
+                                 int h, int bd) {
+  const InterpKernel *const y_kernels = get_filter_base(filter_y);
+  const int y_start = get_filter_offset(filter_y, y_kernels);
+
+  // Not used by the vertical pass.
+  (void)x_step_q4;
+  (void)filter_x;
+
+  highbd_convolve_vert(src, src_stride, dst, dst_stride, y_kernels, y_start,
+                       y_step_q4, w, h, bd);
+}
+
+// High bit-depth vertical-only convolution averaged into dst, C
+// implementation. Resolves the kernel table and starting offset for filter_y
+// and runs highbd_convolve_avg_vert(); filter_x and x_step_q4 are ignored.
+void aom_highbd_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+                                     uint8_t *dst, ptrdiff_t dst_stride,
+                                     const int16_t *filter_x, int x_step_q4,
+                                     const int16_t *filter_y, int y_step_q4,
+                                     int w, int h, int bd) {
+  const InterpKernel *const y_kernels = get_filter_base(filter_y);
+  const int y_start = get_filter_offset(filter_y, y_kernels);
+
+  // Not used by the vertical pass.
+  (void)x_step_q4;
+  (void)filter_x;
+
+  highbd_convolve_avg_vert(src, src_stride, dst, dst_stride, y_kernels,
+                           y_start, y_step_q4, w, h, bd);
+}
+
+// High bit-depth 2D convolution, C implementation. Resolves the kernel table
+// and starting offset for both filters, then delegates to highbd_convolve().
+void aom_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride,
+                            uint8_t *dst, ptrdiff_t dst_stride,
+                            const int16_t *filter_x, int x_step_q4,
+                            const int16_t *filter_y, int y_step_q4, int w,
+                            int h, int bd) {
+  // Horizontal filter: table base and offset of filter_x within it.
+  const InterpKernel *const x_kernels = get_filter_base(filter_x);
+  const int x_start = get_filter_offset(filter_x, x_kernels);
+
+  // Vertical filter: table base and offset of filter_y within it.
+  const InterpKernel *const y_kernels = get_filter_base(filter_y);
+  const int y_start = get_filter_offset(filter_y, y_kernels);
+
+  highbd_convolve(src, src_stride, dst, dst_stride, x_kernels, x_start,
+                  x_step_q4, y_kernels, y_start, y_step_q4, w, h, bd);
+}
+
+// High bit-depth 2D convolution whose result is averaged into dst, C
+// implementation. The filtered block goes to a MAX_SB_SIZE x MAX_SB_SIZE
+// uint16_t stack buffer and is then merged into dst by
+// aom_highbd_convolve_avg_c(), so w and h must not exceed MAX_SB_SIZE
+// (asserted).
+void aom_highbd_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
+                                uint8_t *dst, ptrdiff_t dst_stride,
+                                const int16_t *filter_x, int x_step_q4,
+                                const int16_t *filter_y, int y_step_q4, int w,
+                                int h, int bd) {
+  // Fixed-size scratch block; its dimensions bound w and h.
+  DECLARE_ALIGNED(16, uint16_t, scratch[MAX_SB_SIZE * MAX_SB_SIZE]);
+  uint8_t *const scratch8 = CONVERT_TO_BYTEPTR(scratch);
+  assert(w <= MAX_SB_SIZE);
+  assert(h <= MAX_SB_SIZE);
+
+  // Step 1: filter src into scratch, using MAX_SB_SIZE as the row stride.
+  aom_highbd_convolve8_c(src, src_stride, scratch8, MAX_SB_SIZE, filter_x,
+                         x_step_q4, filter_y, y_step_q4, w, h, bd);
+  // Step 2: average scratch into dst; no filter arguments are needed.
+  aom_highbd_convolve_avg_c(scratch8, MAX_SB_SIZE, dst, dst_stride, NULL, 0,
+                            NULL, 0, w, h, bd);
+}
+
+// Copies a block of h rows of w uint16_t samples from src8 to dst8 (both
+// converted with CONVERT_TO_SHORTPTR), one memcpy per row. Strides count
+// uint16_t samples. The filter arguments and bd are ignored.
+void aom_highbd_convolve_copy_c(const uint8_t *src8, ptrdiff_t src_stride,
+                                uint8_t *dst8, ptrdiff_t dst_stride,
+                                const int16_t *filter_x, int filter_x_stride,
+                                const int16_t *filter_y, int filter_y_stride,
+                                int w, int h, int bd) {
+  uint16_t *in = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *out = CONVERT_TO_SHORTPTR(dst8);
+  int row;
+
+  (void)bd;
+  (void)filter_y_stride;
+  (void)filter_x_stride;
+  (void)filter_y;
+  (void)filter_x;
+
+  for (row = 0; row < h; ++row) {
+    memcpy(out, in, w * sizeof(uint16_t));
+    out += dst_stride;
+    in += src_stride;
+  }
+}
+
+// Replaces every uint16_t sample of the w x h block at dst8 with
+// ROUND_POWER_OF_TWO(dst + src, 1), i.e. the rounded mean of the two blocks.
+// Both pointers are converted with CONVERT_TO_SHORTPTR and strides count
+// uint16_t samples. The filter arguments and bd are ignored.
+void aom_highbd_convolve_avg_c(const uint8_t *src8, ptrdiff_t src_stride,
+                               uint8_t *dst8, ptrdiff_t dst_stride,
+                               const int16_t *filter_x, int filter_x_stride,
+                               const int16_t *filter_y, int filter_y_stride,
+                               int w, int h, int bd) {
+  uint16_t *in = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *out = CONVERT_TO_SHORTPTR(dst8);
+  int row, col;
+
+  (void)bd;
+  (void)filter_y_stride;
+  (void)filter_x_stride;
+  (void)filter_y;
+  (void)filter_x;
+
+  for (row = 0; row < h; ++row) {
+    for (col = 0; col < w; ++col) {
+      out[col] = ROUND_POWER_OF_TWO(out[col] + in[col], 1);
+    }
+    out += dst_stride;
+    in += src_stride;
+  }
+}
+#endif
--- a/aom_dsp/aom_convolve.h
+++ b/aom_dsp/aom_convolve.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_DSP_AOM_CONVOLVE_H_
+#define AOM_DSP_AOM_CONVOLVE_H_
+
+#include "./aom_config.h"
+#include "aom/aom_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Note: Fixed-size intermediate buffers place limits on the parameters of
+// some functions. 2D filtering proceeds in 2 steps:
+//   (1) Interpolate horizontally into an intermediate buffer, temp.
+//   (2) Interpolate temp vertically to derive the sub-pixel result.
+// Deriving the maximum number of rows in the temp buffer (135):
+// --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
+// --Largest block size is 64x64 pixels.
+// --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
+//   original frame (in 1/16th pixel units).
+// --Must round up because the block may be located at a sub-pixel position.
+// --Requires an additional SUBPEL_TAPS rows for the 8-tap filter tails.
+// --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
+// The derivation above is for 64x64 blocks; with CONFIG_AV1 &&
+// CONFIG_EXT_PARTITION the row limit below is 263 instead.
+// NOTE(review): read literally, the last step is ((63 * 32 + 15) >> 4) + 8 =
+// 134, so 135 presumably includes one extra row of margin -- confirm.
+#if CONFIG_AV1 && CONFIG_EXT_PARTITION
+#define MAX_EXT_SIZE 263
+#else
+#define MAX_EXT_SIZE 135
+#endif  // CONFIG_AV1 && CONFIG_EXT_PARTITION
+
+// Pointer to a convolve routine operating on uint8_t buffers: source and
+// destination pointers with their strides, the x and y filter pointers with
+// their q4 step sizes, and the block width and height.
+typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride,
+                              uint8_t *dst, ptrdiff_t dst_stride,
+                              const int16_t *filter_x, int x_step_q4,
+                              const int16_t *filter_y, int y_step_q4, int w,
+                              int h);
+
+#if CONFIG_AOM_HIGHBITDEPTH
+// High bit-depth counterpart of convolve_fn_t: same arguments plus the bit
+// depth bd. Only declared when CONFIG_AOM_HIGHBITDEPTH is enabled.
+typedef void (*highbd_convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride,
+                                     uint8_t *dst, ptrdiff_t dst_stride,
+                                     const int16_t *filter_x, int x_step_q4,
+                                     const int16_t *filter_y, int y_step_q4,
+                                     int w, int h, int bd);
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_DSP_AOM_CONVOLVE_H_
--- a/aom_dsp/aom_dsp.mk
+++ b/aom_dsp/aom_dsp.mk
@@ -0,0 +1,426 @@
+##
+## Copyright (c) 2016, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+##
+
+
+DSP_SRCS-yes += aom_dsp.mk
+DSP_SRCS-yes += aom_dsp_common.h
+
+DSP_SRCS-$(HAVE_MSA)    += mips/macros_msa.h
+
+DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64)   += x86/synonyms.h
+
+# bit readers and writers (prob, ANS, dkbool, bit buffers); the encoder-only
+# list below also carries the psnr/ssim quality-metric sources
+DSP_SRCS-yes += prob.h
+DSP_SRCS-yes += prob.c
+DSP_SRCS-$(CONFIG_ANS) += ans.h
+DSP_SRCS-$(CONFIG_ANS) += ans.c
+
+ifeq ($(CONFIG_ENCODERS),yes)
+DSP_SRCS-$(CONFIG_ANS) += answriter.h
+DSP_SRCS-yes += bitwriter.h
+DSP_SRCS-yes += dkboolwriter.h
+DSP_SRCS-yes += dkboolwriter.c
+DSP_SRCS-yes += bitwriter_buffer.c
+DSP_SRCS-yes += bitwriter_buffer.h
+DSP_SRCS-yes += psnr.c
+DSP_SRCS-yes += psnr.h
+DSP_SRCS-$(CONFIG_ANS) += buf_ans.h
+DSP_SRCS-$(CONFIG_ANS) += buf_ans.c
+DSP_SRCS-$(CONFIG_INTERNAL_STATS) += ssim.c
+DSP_SRCS-$(CONFIG_INTERNAL_STATS) += ssim.h
+DSP_SRCS-$(CONFIG_INTERNAL_STATS) += psnrhvs.c
+DSP_SRCS-$(CONFIG_INTERNAL_STATS) += fastssim.c
+endif  # CONFIG_ENCODERS
+
+ifeq ($(CONFIG_DECODERS),yes)
+DSP_SRCS-$(CONFIG_ANS) += ansreader.h
+DSP_SRCS-yes += bitreader.h
+DSP_SRCS-yes += dkboolreader.h
+DSP_SRCS-yes += dkboolreader.c
+DSP_SRCS-yes += bitreader_buffer.c
+DSP_SRCS-yes += bitreader_buffer.h
+endif  # CONFIG_DECODERS
+
+# intra predictions
+DSP_SRCS-yes += intrapred.c
+
+ifeq ($(CONFIG_DAALA_EC),yes)
+DSP_SRCS-yes += entenc.c
+DSP_SRCS-yes += entenc.h
+DSP_SRCS-yes += entdec.c
+DSP_SRCS-yes += entdec.h
+DSP_SRCS-yes += entcode.c
+DSP_SRCS-yes += entcode.h
+DSP_SRCS-yes += daalaboolreader.c
+DSP_SRCS-yes += daalaboolreader.h
+DSP_SRCS-yes += daalaboolwriter.c
+DSP_SRCS-yes += daalaboolwriter.h
+endif
+
+DSP_SRCS-$(HAVE_SSE) += x86/intrapred_sse2.asm
+DSP_SRCS-$(HAVE_SSE2) += x86/intrapred_sse2.asm
+DSP_SRCS-$(HAVE_SSSE3) += x86/intrapred_ssse3.asm
+DSP_SRCS-$(HAVE_SSSE3) += x86/aom_subpixel_8t_ssse3.asm
+
+ifeq ($(CONFIG_AOM_HIGHBITDEPTH),yes)
+DSP_SRCS-$(HAVE_SSE)  += x86/highbd_intrapred_sse2.asm
+DSP_SRCS-$(HAVE_SSE2) += x86/highbd_intrapred_sse2.asm
+endif  # CONFIG_AOM_HIGHBITDEPTH
+
+DSP_SRCS-$(HAVE_NEON_ASM) += arm/intrapred_neon_asm$(ASM)
+DSP_SRCS-$(HAVE_NEON) += arm/intrapred_neon.c
+DSP_SRCS-$(HAVE_MSA) += mips/intrapred_msa.c
+DSP_SRCS-$(HAVE_DSPR2)  += mips/intrapred4_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2)  += mips/intrapred8_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2)  += mips/intrapred16_dspr2.c
+
+DSP_SRCS-$(HAVE_DSPR2)  += mips/common_dspr2.h
+DSP_SRCS-$(HAVE_DSPR2)  += mips/common_dspr2.c
+
+# inter predictions
+DSP_SRCS-yes            += blend.h
+DSP_SRCS-yes            += blend_a64_mask.c
+DSP_SRCS-yes            += blend_a64_hmask.c
+DSP_SRCS-yes            += blend_a64_vmask.c
+DSP_SRCS-$(HAVE_SSE4_1) += x86/blend_sse4.h
+DSP_SRCS-$(HAVE_SSE4_1) += x86/blend_a64_mask_sse4.c
+DSP_SRCS-$(HAVE_SSE4_1) += x86/blend_a64_hmask_sse4.c
+DSP_SRCS-$(HAVE_SSE4_1) += x86/blend_a64_vmask_sse4.c
+
+# interpolation filters
+DSP_SRCS-yes += aom_convolve.c
+DSP_SRCS-yes += aom_convolve.h
+DSP_SRCS-yes += aom_filter.h
+
+DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64) += x86/convolve.h
+DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64) += x86/aom_asm_stubs.c
+DSP_SRCS-$(HAVE_SSE2)  += x86/aom_subpixel_8t_sse2.asm
+DSP_SRCS-$(HAVE_SSE2)  += x86/aom_subpixel_bilinear_sse2.asm
+DSP_SRCS-$(HAVE_SSSE3) += x86/aom_subpixel_8t_ssse3.asm
+DSP_SRCS-$(HAVE_SSSE3) += x86/aom_subpixel_bilinear_ssse3.asm
+DSP_SRCS-$(HAVE_AVX2)  += x86/aom_subpixel_8t_intrin_avx2.c
+DSP_SRCS-$(HAVE_SSSE3) += x86/aom_subpixel_8t_intrin_ssse3.c
+ifeq ($(CONFIG_AOM_HIGHBITDEPTH),yes)
+DSP_SRCS-$(HAVE_SSE2)  += x86/aom_high_subpixel_8t_sse2.asm
+DSP_SRCS-$(HAVE_SSE2)  += x86/aom_high_subpixel_bilinear_sse2.asm
+endif
+DSP_SRCS-$(HAVE_SSE2)  += x86/aom_convolve_copy_sse2.asm
+
+ifeq ($(HAVE_NEON_ASM),yes)
+DSP_SRCS-yes += arm/aom_convolve_copy_neon_asm$(ASM)
+DSP_SRCS-yes += arm/aom_convolve8_avg_neon_asm$(ASM)
+DSP_SRCS-yes += arm/aom_convolve8_neon_asm$(ASM)
+DSP_SRCS-yes += arm/aom_convolve_avg_neon_asm$(ASM)
+DSP_SRCS-yes += arm/aom_convolve_neon.c
+else
+ifeq ($(HAVE_NEON),yes)
+DSP_SRCS-yes += arm/aom_convolve_copy_neon.c
+DSP_SRCS-yes += arm/aom_convolve8_avg_neon.c
+DSP_SRCS-yes += arm/aom_convolve8_neon.c
+DSP_SRCS-yes += arm/aom_convolve_avg_neon.c
+DSP_SRCS-yes += arm/aom_convolve_neon.c
+endif  # HAVE_NEON
+endif  # HAVE_NEON_ASM
+
+# common (msa)
+DSP_SRCS-$(HAVE_MSA) += mips/aom_convolve8_avg_horiz_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/aom_convolve8_avg_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/aom_convolve8_avg_vert_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/aom_convolve8_horiz_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/aom_convolve8_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/aom_convolve8_vert_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/aom_convolve_avg_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/aom_convolve_copy_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/aom_convolve_msa.h
+
+# common (dspr2)
+DSP_SRCS-$(HAVE_DSPR2)  += mips/convolve_common_dspr2.h
+DSP_SRCS-$(HAVE_DSPR2)  += mips/convolve2_avg_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2)  += mips/convolve2_avg_horiz_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2)  += mips/convolve2_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2)  += mips/convolve2_horiz_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2)  += mips/convolve2_vert_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2)  += mips/convolve8_avg_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2)  += mips/convolve8_avg_horiz_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2)  += mips/convolve8_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2)  += mips/convolve8_horiz_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2)  += mips/convolve8_vert_dspr2.c
+
+# loop filters
+DSP_SRCS-yes += loopfilter.c
+
+DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64)   += x86/loopfilter_sse2.c
+DSP_SRCS-$(HAVE_AVX2)                += x86/loopfilter_avx2.c
+
+DSP_SRCS-$(HAVE_NEON)   += arm/loopfilter_neon.c
+ifeq ($(HAVE_NEON_ASM),yes)
+DSP_SRCS-yes  += arm/loopfilter_mb_neon$(ASM)
+DSP_SRCS-yes  += arm/loopfilter_16_neon$(ASM)
+DSP_SRCS-yes  += arm/loopfilter_8_neon$(ASM)
+DSP_SRCS-yes  += arm/loopfilter_4_neon$(ASM)
+else
+ifeq ($(HAVE_NEON),yes)
+DSP_SRCS-yes   += arm/loopfilter_16_neon.c
+DSP_SRCS-yes   += arm/loopfilter_8_neon.c
+DSP_SRCS-yes   += arm/loopfilter_4_neon.c
+endif  # HAVE_NEON
+endif  # HAVE_NEON_ASM
+
+DSP_SRCS-$(HAVE_MSA)    += mips/loopfilter_msa.h
+DSP_SRCS-$(HAVE_MSA)    += mips/loopfilter_16_msa.c
+DSP_SRCS-$(HAVE_MSA)    += mips/loopfilter_8_msa.c
+DSP_SRCS-$(HAVE_MSA)    += mips/loopfilter_4_msa.c
+DSP_SRCS-$(HAVE_DSPR2)  += mips/loopfilter_filters_dspr2.h
+DSP_SRCS-$(HAVE_DSPR2)  += mips/loopfilter_filters_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2)  += mips/loopfilter_macros_dspr2.h
+DSP_SRCS-$(HAVE_DSPR2)  += mips/loopfilter_masks_dspr2.h
+DSP_SRCS-$(HAVE_DSPR2)  += mips/loopfilter_mb_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2)  += mips/loopfilter_mb_horiz_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2)  += mips/loopfilter_mb_vert_dspr2.c
+
+ifeq ($(CONFIG_AOM_HIGHBITDEPTH),yes)
+DSP_SRCS-$(HAVE_SSE2)   += x86/highbd_loopfilter_sse2.c
+endif  # CONFIG_AOM_HIGHBITDEPTH
+
+DSP_SRCS-yes            += txfm_common.h
+DSP_SRCS-yes            += x86/txfm_common_intrin.h
+DSP_SRCS-$(HAVE_SSE2)   += x86/txfm_common_sse2.h
+DSP_SRCS-$(HAVE_MSA)    += mips/txfm_macros_msa.h
+# forward transform (built whenever CONFIG_AV1 is enabled)
+ifeq ($(CONFIG_AV1),yes)
+DSP_SRCS-yes            += fwd_txfm.c
+DSP_SRCS-yes            += fwd_txfm.h
+DSP_SRCS-$(HAVE_SSE2)   += x86/fwd_txfm_sse2.h
+DSP_SRCS-$(HAVE_SSE2)   += x86/fwd_txfm_sse2.c
+DSP_SRCS-$(HAVE_SSE2)   += x86/fwd_dct32_8cols_sse2.c
+DSP_SRCS-$(HAVE_SSE2)   += x86/fwd_txfm_impl_sse2.h
+DSP_SRCS-$(HAVE_SSE2)   += x86/fwd_dct32x32_impl_sse2.h
+ifeq ($(ARCH_X86_64),yes)
+DSP_SRCS-$(HAVE_SSSE3)  += x86/fwd_txfm_ssse3_x86_64.asm
+endif  # ARCH_X86_64
+DSP_SRCS-$(HAVE_AVX2)   += x86/fwd_txfm_avx2.h
+DSP_SRCS-$(HAVE_AVX2)   += x86/fwd_txfm_avx2.c
+DSP_SRCS-$(HAVE_AVX2)   += x86/txfm_common_avx2.h
+DSP_SRCS-$(HAVE_AVX2)   += x86/fwd_dct32x32_impl_avx2.h
+DSP_SRCS-$(HAVE_NEON)   += arm/fwd_txfm_neon.c
+DSP_SRCS-$(HAVE_MSA)    += mips/fwd_txfm_msa.h
+DSP_SRCS-$(HAVE_MSA)    += mips/fwd_txfm_msa.c
+DSP_SRCS-$(HAVE_MSA)    += mips/fwd_dct32x32_msa.c
+endif  # CONFIG_AV1
+
+ifeq ($(CONFIG_PVQ),yes)
+DSP_SRCS-yes            += fwd_txfm.c
+DSP_SRCS-yes            += fwd_txfm.h
+DSP_SRCS-$(HAVE_SSE2)   += x86/fwd_txfm_sse2.h
+DSP_SRCS-$(HAVE_SSE2)   += x86/fwd_txfm_sse2.c
+DSP_SRCS-$(HAVE_SSE2)   += x86/fwd_txfm_impl_sse2.h
+DSP_SRCS-$(HAVE_SSE2)   += x86/fwd_dct32x32_impl_sse2.h
+ifeq ($(ARCH_X86_64),yes)
+DSP_SRCS-$(HAVE_SSSE3)  += x86/fwd_txfm_ssse3_x86_64.asm
+endif
+DSP_SRCS-$(HAVE_AVX2)   += x86/fwd_txfm_avx2.c
+DSP_SRCS-$(HAVE_AVX2)   += x86/fwd_dct32x32_impl_avx2.h
+DSP_SRCS-$(HAVE_NEON)   += arm/fwd_txfm_neon.c
+DSP_SRCS-$(HAVE_MSA)    += mips/fwd_txfm_msa.h
+DSP_SRCS-$(HAVE_MSA)    += mips/fwd_txfm_msa.c
+DSP_SRCS-$(HAVE_MSA)    += mips/fwd_dct32x32_msa.c
+endif  # CONFIG_PVQ
+
+# inverse transform
+ifeq ($(CONFIG_AV1), yes)
+DSP_SRCS-yes            += inv_txfm.h
+DSP_SRCS-yes            += inv_txfm.c
+DSP_SRCS-$(HAVE_SSE2)   += x86/inv_txfm_sse2.h
+DSP_SRCS-$(HAVE_SSE2)   += x86/inv_txfm_sse2.c
+DSP_SRCS-$(HAVE_SSE2)   += x86/inv_wht_sse2.asm
+ifeq ($(ARCH_X86_64),yes)
+DSP_SRCS-$(HAVE_SSSE3)  += x86/inv_txfm_ssse3_x86_64.asm
+endif  # ARCH_X86_64
+
+ifeq ($(HAVE_NEON_ASM),yes)
+DSP_SRCS-yes  += arm/save_reg_neon$(ASM)
+DSP_SRCS-yes  += arm/idct4x4_1_add_neon$(ASM)
+DSP_SRCS-yes  += arm/idct4x4_add_neon$(ASM)
+DSP_SRCS-yes  += arm/idct8x8_1_add_neon$(ASM)
+DSP_SRCS-yes  += arm/idct8x8_add_neon$(ASM)
+DSP_SRCS-yes  += arm/idct16x16_1_add_neon$(ASM)
+DSP_SRCS-yes  += arm/idct16x16_add_neon$(ASM)
+DSP_SRCS-yes  += arm/idct32x32_1_add_neon$(ASM)
+DSP_SRCS-yes  += arm/idct32x32_add_neon$(ASM)
+else
+ifeq ($(HAVE_NEON),yes)
+DSP_SRCS-yes  += arm/idct4x4_1_add_neon.c
+DSP_SRCS-yes  += arm/idct4x4_add_neon.c
+DSP_SRCS-yes  += arm/idct8x8_1_add_neon.c
+DSP_SRCS-yes  += arm/idct8x8_add_neon.c
+DSP_SRCS-yes  += arm/idct16x16_1_add_neon.c
+DSP_SRCS-yes  += arm/idct16x16_add_neon.c
+DSP_SRCS-yes  += arm/idct32x32_1_add_neon.c
+DSP_SRCS-yes  += arm/idct32x32_add_neon.c
+endif  # HAVE_NEON
+endif  # HAVE_NEON_ASM
+DSP_SRCS-$(HAVE_NEON)  += arm/idct16x16_neon.c
+
+DSP_SRCS-$(HAVE_MSA)   += mips/inv_txfm_msa.h
+DSP_SRCS-$(HAVE_MSA)   += mips/idct4x4_msa.c
+DSP_SRCS-$(HAVE_MSA)   += mips/idct8x8_msa.c
+DSP_SRCS-$(HAVE_MSA)   += mips/idct16x16_msa.c
+DSP_SRCS-$(HAVE_MSA)   += mips/idct32x32_msa.c
+
+ifneq ($(CONFIG_AOM_HIGHBITDEPTH),yes)
+DSP_SRCS-$(HAVE_DSPR2) += mips/inv_txfm_dspr2.h
+DSP_SRCS-$(HAVE_DSPR2) += mips/itrans4_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2) += mips/itrans8_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2) += mips/itrans16_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2) += mips/itrans32_dspr2.c
+DSP_SRCS-$(HAVE_DSPR2) += mips/itrans32_cols_dspr2.c
+endif  # CONFIG_AOM_HIGHBITDEPTH
+endif  # CONFIG_AV1
+
+# quantization
+ifneq ($(filter yes,$(CONFIG_AV1_ENCODER)),)
+DSP_SRCS-yes            += quantize.c
+DSP_SRCS-yes            += quantize.h
+
+DSP_SRCS-$(HAVE_SSE2)   += x86/quantize_sse2.c
+ifeq ($(CONFIG_AOM_HIGHBITDEPTH),yes)
+DSP_SRCS-$(HAVE_SSE2)   += x86/highbd_quantize_intrin_sse2.c
+endif
+ifeq ($(ARCH_X86_64),yes)
+DSP_SRCS-$(HAVE_SSSE3)  += x86/quantize_ssse3_x86_64.asm
+DSP_SRCS-$(HAVE_AVX)    += x86/quantize_avx_x86_64.asm
+endif
+
+# avg
+DSP_SRCS-yes           += avg.c
+DSP_SRCS-$(HAVE_SSE2)  += x86/avg_intrin_sse2.c
+DSP_SRCS-$(HAVE_NEON)  += arm/avg_neon.c
+DSP_SRCS-$(HAVE_MSA)   += mips/avg_msa.c
+DSP_SRCS-$(HAVE_NEON)  += arm/hadamard_neon.c
+ifeq ($(ARCH_X86_64),yes)
+DSP_SRCS-$(HAVE_SSSE3) += x86/avg_ssse3_x86_64.asm
+endif
+
+# high bit depth subtract
+ifeq ($(CONFIG_AOM_HIGHBITDEPTH),yes)
+DSP_SRCS-$(HAVE_SSE2)  += x86/highbd_subtract_sse2.c
+endif
+
+endif  # CONFIG_AV1_ENCODER
+
+ifeq ($(CONFIG_AV1_ENCODER),yes)
+DSP_SRCS-yes            += sum_squares.c
+
+DSP_SRCS-$(HAVE_SSE2)   += x86/sum_squares_sse2.c
+endif # CONFIG_AV1_ENCODER
+
+ifeq ($(CONFIG_ENCODERS),yes)
+DSP_SRCS-yes            += sad.c
+DSP_SRCS-yes            += subtract.c
+
+DSP_SRCS-$(HAVE_MEDIA)  += arm/sad_media$(ASM)
+DSP_SRCS-$(HAVE_NEON)   += arm/sad4d_neon.c
+DSP_SRCS-$(HAVE_NEON)   += arm/sad_neon.c
+DSP_SRCS-$(HAVE_NEON)   += arm/subtract_neon.c
+
+DSP_SRCS-$(HAVE_MSA)    += mips/sad_msa.c
+DSP_SRCS-$(HAVE_MSA)    += mips/subtract_msa.c
+
+DSP_SRCS-$(HAVE_SSE3)   += x86/sad_sse3.asm
+DSP_SRCS-$(HAVE_SSSE3)  += x86/sad_ssse3.asm
+DSP_SRCS-$(HAVE_SSE4_1) += x86/sad_sse4.asm
+DSP_SRCS-$(HAVE_AVX2)   += x86/sad4d_avx2.c
+DSP_SRCS-$(HAVE_AVX2)   += x86/sad_avx2.c
+
+ifeq ($(CONFIG_AV1_ENCODER),yes)
+ifeq ($(CONFIG_EXT_INTER),yes)
+DSP_SRCS-$(HAVE_SSSE3)  += x86/masked_sad_intrin_ssse3.c
+DSP_SRCS-$(HAVE_SSSE3)  += x86/masked_variance_intrin_ssse3.c
+endif  #CONFIG_EXT_INTER
+ifeq ($(CONFIG_MOTION_VAR),yes)
+DSP_SRCS-$(HAVE_SSE4_1) += x86/obmc_sad_sse4.c
+DSP_SRCS-$(HAVE_SSE4_1) += x86/obmc_variance_sse4.c
+endif  #CONFIG_MOTION_VAR
+endif  #CONFIG_AV1_ENCODER
+
+DSP_SRCS-$(HAVE_SSE)    += x86/sad4d_sse2.asm
+DSP_SRCS-$(HAVE_SSE)    += x86/sad_sse2.asm
+DSP_SRCS-$(HAVE_SSE2)   += x86/sad4d_sse2.asm
+DSP_SRCS-$(HAVE_SSE2)   += x86/sad_sse2.asm
+DSP_SRCS-$(HAVE_SSE2)   += x86/subtract_sse2.asm
+
+ifeq ($(CONFIG_AOM_HIGHBITDEPTH),yes)
+DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad4d_sse2.asm
+DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad_sse2.asm
+endif  # CONFIG_AOM_HIGHBITDEPTH
+
+endif  # CONFIG_ENCODERS
+
+ifneq ($(filter yes,$(CONFIG_ENCODERS)),)
+DSP_SRCS-yes            += variance.c
+DSP_SRCS-yes            += variance.h
+
+DSP_SRCS-$(HAVE_MEDIA)  += arm/bilinear_filter_media$(ASM)
+DSP_SRCS-$(HAVE_MEDIA)  += arm/subpel_variance_media.c
+DSP_SRCS-$(HAVE_MEDIA)  += arm/variance_halfpixvar16x16_h_media$(ASM)
+DSP_SRCS-$(HAVE_MEDIA)  += arm/variance_halfpixvar16x16_hv_media$(ASM)
+DSP_SRCS-$(HAVE_MEDIA)  += arm/variance_halfpixvar16x16_v_media$(ASM)
+DSP_SRCS-$(HAVE_MEDIA)  += arm/variance_media$(ASM)
+DSP_SRCS-$(HAVE_NEON)   += arm/subpel_variance_neon.c
+DSP_SRCS-$(HAVE_NEON)   += arm/variance_neon.c
+
+DSP_SRCS-$(HAVE_MSA)    += mips/variance_msa.c
+DSP_SRCS-$(HAVE_MSA)    += mips/sub_pixel_variance_msa.c
+
+DSP_SRCS-$(HAVE_SSE)    += x86/variance_sse2.c
+DSP_SRCS-$(HAVE_SSE2)   += x86/variance_sse2.c  # Contains SSE2 and SSSE3
+DSP_SRCS-$(HAVE_SSE2)   += x86/halfpix_variance_sse2.c
+DSP_SRCS-$(HAVE_SSE2)   += x86/halfpix_variance_impl_sse2.asm
+DSP_SRCS-$(HAVE_AVX2)   += x86/variance_avx2.c
+DSP_SRCS-$(HAVE_AVX2)   += x86/variance_impl_avx2.c
+
+ifeq ($(ARCH_X86_64),yes)
+DSP_SRCS-$(HAVE_SSE2)   += x86/ssim_opt_x86_64.asm
+endif  # ARCH_X86_64
+
+DSP_SRCS-$(HAVE_SSE)    += x86/subpel_variance_sse2.asm
+DSP_SRCS-$(HAVE_SSE2)   += x86/subpel_variance_sse2.asm  # Contains SSE2 and SSSE3
+
+ifeq ($(CONFIG_AOM_HIGHBITDEPTH),yes)
+DSP_SRCS-$(HAVE_SSE2)   += x86/highbd_variance_sse2.c
+DSP_SRCS-$(HAVE_SSE4_1) += x86/highbd_variance_sse4.c
+DSP_SRCS-$(HAVE_SSE2)   += x86/highbd_variance_impl_sse2.asm
+DSP_SRCS-$(HAVE_SSE2)   += x86/highbd_subpel_variance_impl_sse2.asm
+endif  # CONFIG_AOM_HIGHBITDEPTH
+endif  # CONFIG_ENCODERS
+
+DSP_SRCS-no += $(DSP_SRCS_REMOVE-yes)
+
+DSP_SRCS-yes += aom_dsp_rtcd.c
+DSP_SRCS-yes += aom_dsp_rtcd_defs.pl
+
+DSP_SRCS-yes += aom_simd.c
+DSP_SRCS-yes += aom_simd.h
+DSP_SRCS-yes += aom_simd_inline.h
+DSP_SRCS-yes += simd/v64_intrinsics.h
+DSP_SRCS-yes += simd/v64_intrinsics_c.h
+DSP_SRCS-yes += simd/v128_intrinsics.h
+DSP_SRCS-yes += simd/v128_intrinsics_c.h
+DSP_SRCS-yes += simd/v256_intrinsics.h
+DSP_SRCS-yes += simd/v256_intrinsics_c.h
+DSP_SRCS-$(HAVE_SSE2) += simd/v64_intrinsics_x86.h
+DSP_SRCS-$(HAVE_SSE2) += simd/v128_intrinsics_x86.h
+DSP_SRCS-$(HAVE_SSE2) += simd/v256_intrinsics_x86.h
+DSP_SRCS-$(HAVE_NEON) += simd/v64_intrinsics_arm.h
+DSP_SRCS-$(HAVE_NEON) += simd/v128_intrinsics_arm.h
+DSP_SRCS-$(HAVE_NEON) += simd/v256_intrinsics_arm.h
+
+$(eval $(call rtcd_h_template,aom_dsp_rtcd,aom_dsp/aom_dsp_rtcd_defs.pl))
--- a/aom_dsp/aom_dsp_common.h
+++ b/aom_dsp/aom_dsp_common.h
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_AOM_DSP_COMMON_H_
+#define AOM_DSP_AOM_DSP_COMMON_H_
+
+#include "./aom_config.h"
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef MAX_SB_SIZE
+#if CONFIG_AV1 && CONFIG_EXT_PARTITION
+#define MAX_SB_SIZE 128
+#else
+#define MAX_SB_SIZE 64
+#endif  // CONFIG_AV1 && CONFIG_EXT_PARTITION
+#endif  // ndef MAX_SB_SIZE
+
+// Minimum / maximum of two values. These are function-like macros: whichever
+// argument is selected is evaluated twice, so do not pass expressions with
+// side effects (e.g. i++).
+#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
+#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
+
+#define IMPLIES(a, b) (!(a) || (b))  //  Logical 'a implies b' (or 'a -> b')
+
+// True when a non-negative x has at most one bit set. Note that this also
+// yields true for x == 0, which is not a power of two; x is evaluated twice.
+#define IS_POWER_OF_TWO(x) (((x) & ((x)-1)) == 0)
+
+// These can be used to give a hint about branch outcomes.
+// This can have an effect, even if your target processor has a
+// good branch predictor, as these hints can affect basic block
+// ordering by the compiler.
+//
+// The condition is normalized with !! before it is handed to
+// __builtin_expect(): the builtin compares its first argument against the
+// expected value, so a truthy value other than 1 (e.g. `flags & 4`) would
+// otherwise never match the "likely" hint, and pointer arguments would not
+// convert cleanly to the builtin's integer parameter. Both variants evaluate
+// to 0 or 1 on every compiler.
+#ifdef __GNUC__
+#define LIKELY(v) __builtin_expect(!!(v), 1)
+#define UNLIKELY(v) __builtin_expect(!!(v), 0)
+#else
+#define LIKELY(v) (!!(v))
+#define UNLIKELY(v) (!!(v))
+#endif
+
+// Exchanges the values of the two lvalues a and b, both of the given type.
+// The arguments are parenthesized, and the temporary uses a name that is
+// unlikely to collide with a caller's variable: a temporary simply called
+// 'c' would shadow an argument named 'c' and silently break the swap.
+// a and b are each evaluated twice, so they must be free of side effects.
+#define AOM_SWAP(type, a, b)  \
+  do {                        \
+    type aom_swap_tmp_ = (b); \
+    (b) = (a);                \
+    (a) = aom_swap_tmp_;      \
+  } while (0)
+
+#if CONFIG_AOM_QM
+typedef uint16_t qm_val_t;
+#define AOM_QM_BITS 6
+#endif
+#if CONFIG_AOM_HIGHBITDEPTH
+// Note:
+// tran_low_t  is the datatype used for final transform coefficients.
+// tran_high_t is the datatype used for intermediate transform stages.
+typedef int64_t tran_high_t;
+typedef int32_t tran_low_t;
+#else
+// Note:
+// tran_low_t  is the datatype used for final transform coefficients.
+// tran_high_t is the datatype used for intermediate transform stages.
+typedef int32_t tran_high_t;
+typedef int16_t tran_low_t;
+#endif  // CONFIG_AOM_HIGHBITDEPTH
+
+// Saturates val to the 8-bit pixel range [0, 255].
+static INLINE uint8_t clip_pixel(int val) {
+  if (val < 0) return 0;
+  if (val > 255) return 255;
+  return (uint8_t)val;
+}
+
+// Limits value to the interval [low, high]. The lower bound is tested first,
+// so it wins if the caller passes low > high.
+static INLINE int clamp(int value, int low, int high) {
+  if (value < low) return low;
+  if (value > high) return high;
+  return value;
+}
+
+// Floating-point counterpart of clamp(): limits value to [low, high], testing
+// the lower bound first. A NaN value fails both comparisons and is returned
+// unchanged.
+static INLINE double fclamp(double value, double low, double high) {
+  if (value < low) return low;
+  if (value > high) return high;
+  return value;
+}
+
+#if CONFIG_AOM_HIGHBITDEPTH
+// Saturates val to the pixel range for bit depth bd: [0, 1023] for 10 bits,
+// [0, 4095] for 12 bits, and [0, 255] for 8 bits or any other value of bd.
+static INLINE uint16_t clip_pixel_highbd(int val, int bd) {
+  int max_val = 255;  // 8-bit range; also the fallback for unrecognized bd.
+  if (bd == 10) {
+    max_val = 1023;
+  } else if (bd == 12) {
+    max_val = 4095;
+  }
+  return (uint16_t)clamp(val, 0, max_val);
+}
+#endif  // CONFIG_AOM_HIGHBITDEPTH
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_DSP_AOM_DSP_COMMON_H_
--- a/aom_dsp/aom_dsp_rtcd.c
+++ b/aom_dsp/aom_dsp_rtcd.c
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include "./aom_config.h"
+#define RTCD_C
+#include "./aom_dsp_rtcd.h"
+#include "aom_ports/aom_once.h"
+
+// Runs setup_rtcd_internal() through once(). Presumably once() guarantees the
+// setup executes a single time even if this is called repeatedly; see
+// aom_ports/aom_once.h to confirm. Declared with an explicit (void) parameter
+// list so the definition is a proper prototype.
+void aom_dsp_rtcd(void) { once(setup_rtcd_internal); }
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
--- a/aom_dsp/aom_filter.h
+++ b/aom_dsp/aom_filter.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_AOM_FILTER_H_
+#define AOM_DSP_AOM_FILTER_H_
+
+#include "aom/aom_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define FILTER_BITS 7
+
+#define SUBPEL_BITS 4
+#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
+#define SUBPEL_SHIFTS (1 << SUBPEL_BITS)
+#define SUBPEL_TAPS 8
+
+typedef int16_t InterpKernel[SUBPEL_TAPS];
+
+#define BIL_SUBPEL_BITS 3
+#define BIL_SUBPEL_SHIFTS (1 << BIL_SUBPEL_BITS)
+
+// 2 tap bilinear filters
+// Row k holds the tap pair { 128 - 16 * k, 16 * k } for k in
+// [0, BIL_SUBPEL_SHIFTS); every pair sums to 128, i.e. 1 << FILTER_BITS.
+static const uint8_t bilinear_filters_2t[BIL_SUBPEL_SHIFTS][2] = {
+  { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
+  { 64, 64 }, { 48, 80 },  { 32, 96 }, { 16, 112 },
+};
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_DSP_AOM_FILTER_H_
--- a/aom_dsp/aom_simd.c
+++ b/aom_dsp/aom_simd.c
@@ -0,0 +1,13 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+// Set to 1 to add some sanity checks in the fallback C code
+const int simd_check = 1;
--- a/aom_dsp/aom_simd.h
+++ b/aom_dsp/aom_simd.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_AOM_AOM_SIMD_H_
+#define AOM_DSP_AOM_AOM_SIMD_H_
+
+#include <stdint.h>
+
+#if defined(_WIN32)
+#include <intrin.h>
+#endif
+
+#include "./aom_config.h"
+#include "./aom_simd_inline.h"
+
+#if HAVE_NEON
+#include "simd/v256_intrinsics_arm.h"
+#elif HAVE_SSE2
+#include "simd/v256_intrinsics_x86.h"
+#else
+#include "simd/v256_intrinsics.h"
+#endif
+
+#endif  // AOM_DSP_AOM_AOM_SIMD_H_
--- a/aom_dsp/aom_simd_inline.h
+++ b/aom_dsp/aom_simd_inline.h
@@ -0,0 +1,21 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_AOM_SIMD_INLINE_H_
+#define AOM_DSP_AOM_SIMD_INLINE_H_
+
+#include "aom/aom_integer.h"
+
+#ifndef SIMD_INLINE
+#define SIMD_INLINE static AOM_FORCE_INLINE
+#endif
+
+#endif  // AOM_DSP_AOM_SIMD_INLINE_H_
--- a/aom_dsp/arm/aom_convolve8_avg_neon.c
+++ b/aom_dsp/arm/aom_convolve8_avg_neon.c
@@ -0,0 +1,364 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./aom_config.h"
+#include "./aom_dsp_rtcd.h"
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+
+// 8-tap multiply-accumulate: for each of the four lanes, returns
+//   dsrc0 * q0s16[0] + dsrc1 * q0s16[1] + ... + dsrc7 * q0s16[7]
+// widened to 32 bits. q0s16 carries the eight filter taps.
+static INLINE int32x4_t MULTIPLY_BY_Q0(int16x4_t dsrc0, int16x4_t dsrc1,
+                                       int16x4_t dsrc2, int16x4_t dsrc3,
+                                       int16x4_t dsrc4, int16x4_t dsrc5,
+                                       int16x4_t dsrc6, int16x4_t dsrc7,
+                                       int16x8_t q0s16) {
+  // Taps 0..3 live in the low half of q0s16, taps 4..7 in the high half.
+  const int16x4_t taps_lo = vget_low_s16(q0s16);
+  const int16x4_t taps_hi = vget_high_s16(q0s16);
+
+  int32x4_t acc = vmull_lane_s16(dsrc0, taps_lo, 0);
+  acc = vmlal_lane_s16(acc, dsrc1, taps_lo, 1);
+  acc = vmlal_lane_s16(acc, dsrc2, taps_lo, 2);
+  acc = vmlal_lane_s16(acc, dsrc3, taps_lo, 3);
+  acc = vmlal_lane_s16(acc, dsrc4, taps_hi, 0);
+  acc = vmlal_lane_s16(acc, dsrc5, taps_hi, 1);
+  acc = vmlal_lane_s16(acc, dsrc6, taps_hi, 2);
+  acc = vmlal_lane_s16(acc, dsrc7, taps_hi, 3);
+  return acc;
+}
+
+// Horizontal 8-tap convolution whose result is averaged into dst.
+//
+// For every output pixel the eight taps in filter_x are applied to eight
+// horizontally adjacent source pixels (the window starts 3 pixels to the left
+// of the output position). The sum is shifted right by 7 with rounding and
+// unsigned saturation, narrowed to 8 bits, and then combined with the value
+// already stored in dst using a rounding average (vrhaddq_u8).
+//
+// The image is processed in 4x4 tiles, so w and h are expected to be
+// multiples of 4. Only x_step_q4 == 16 is supported (asserted); filter_y and
+// y_step_q4 are ignored.
+void aom_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
+                                  uint8_t *dst, ptrdiff_t dst_stride,
+                                  const int16_t *filter_x, int x_step_q4,
+                                  const int16_t *filter_y,  // unused
+                                  int y_step_q4,            // unused
+                                  int w, int h) {
+  int width;
+  const uint8_t *s;
+  uint8_t *d;
+  uint8x8_t d2u8, d3u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8;
+  uint32x2_t d2u32, d3u32, d6u32, d7u32, d28u32, d29u32, d30u32, d31u32;
+  uint8x16_t q1u8, q3u8, q12u8, q13u8, q14u8, q15u8;
+  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d22s16, d23s16;
+  int16x4_t d24s16, d25s16, d26s16, d27s16;
+  uint16x4_t d2u16, d3u16, d4u16, d5u16, d16u16, d17u16, d18u16, d19u16;
+  int16x8_t q0s16;
+  uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
+  int32x4_t q1s32, q2s32, q14s32, q15s32;
+  uint16x8x2_t q0x2u16;
+  uint8x8x2_t d0x2u8, d1x2u8;
+  uint32x2x2_t d0x2u32;
+  uint16x4x2_t d0x2u16, d1x2u16;
+  uint32x4x2_t q0x2u32;
+
+  assert(x_step_q4 == 16);
+
+  (void)x_step_q4;
+  (void)y_step_q4;
+  (void)filter_y;
+
+  // vld1_lane_u32() inserts one lane into an existing vector, so the vector
+  // is read before it is written. Give the destinations a defined value:
+  // reading an uninitialized automatic object is undefined behavior. Both
+  // lanes are overwritten by loads before they are used, so the output is
+  // unaffected.
+  d6u32 = vdup_n_u32(0);
+  d7u32 = vdup_n_u32(0);
+
+  q0s16 = vld1q_s16(filter_x);
+
+  src -= 3;                // adjust for taps
+  for (; h > 0; h -= 4) {  // loop_horiz_v
+    // Load the first 8 pixels of four consecutive rows and transpose them so
+    // that the filter can be applied to four rows at once.
+    s = src;
+    d24u8 = vld1_u8(s);
+    s += src_stride;
+    d25u8 = vld1_u8(s);
+    s += src_stride;
+    d26u8 = vld1_u8(s);
+    s += src_stride;
+    d27u8 = vld1_u8(s);
+
+    q12u8 = vcombine_u8(d24u8, d25u8);
+    q13u8 = vcombine_u8(d26u8, d27u8);
+
+    q0x2u16 =
+        vtrnq_u16(vreinterpretq_u16_u8(q12u8), vreinterpretq_u16_u8(q13u8));
+    d24u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[0]));
+    d25u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[0]));
+    d26u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[1]));
+    d27u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[1]));
+    d0x2u8 = vtrn_u8(d24u8, d25u8);
+    d1x2u8 = vtrn_u8(d26u8, d27u8);
+
+    __builtin_prefetch(src + src_stride * 4);
+    __builtin_prefetch(src + src_stride * 5);
+
+    q8u16 = vmovl_u8(d0x2u8.val[0]);
+    q9u16 = vmovl_u8(d0x2u8.val[1]);
+    q10u16 = vmovl_u8(d1x2u8.val[0]);
+    q11u16 = vmovl_u8(d1x2u8.val[1]);
+
+    src += 7;
+    d16u16 = vget_low_u16(q8u16);
+    d17u16 = vget_high_u16(q8u16);
+    d18u16 = vget_low_u16(q9u16);
+    d19u16 = vget_high_u16(q9u16);
+    q8u16 = vcombine_u16(d16u16, d18u16);  // vswp 17 18
+    q9u16 = vcombine_u16(d17u16, d19u16);
+
+    d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
+    d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));         // vmov 23 21
+    for (width = w; width > 0; width -= 4, src += 4, dst += 4) {  // loop_horiz
+      // Fetch the next 4 pixels of each of the four rows.
+      s = src;
+      d28u32 = vld1_dup_u32((const uint32_t *)s);
+      s += src_stride;
+      d29u32 = vld1_dup_u32((const uint32_t *)s);
+      s += src_stride;
+      d31u32 = vld1_dup_u32((const uint32_t *)s);
+      s += src_stride;
+      d30u32 = vld1_dup_u32((const uint32_t *)s);
+
+      __builtin_prefetch(src + 64);
+
+      d0x2u16 =
+          vtrn_u16(vreinterpret_u16_u32(d28u32), vreinterpret_u16_u32(d31u32));
+      d1x2u16 =
+          vtrn_u16(vreinterpret_u16_u32(d29u32), vreinterpret_u16_u32(d30u32));
+      d0x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[0]),   // d28
+                       vreinterpret_u8_u16(d1x2u16.val[0]));  // d29
+      d1x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[1]),   // d31
+                       vreinterpret_u8_u16(d1x2u16.val[1]));  // d30
+
+      __builtin_prefetch(src + 64 + src_stride);
+
+      q14u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]);
+      q15u8 = vcombine_u8(d1x2u8.val[1], d1x2u8.val[0]);
+      q0x2u32 =
+          vtrnq_u32(vreinterpretq_u32_u8(q14u8), vreinterpretq_u32_u8(q15u8));
+
+      d28u8 = vreinterpret_u8_u32(vget_low_u32(q0x2u32.val[0]));
+      d29u8 = vreinterpret_u8_u32(vget_high_u32(q0x2u32.val[0]));
+      q12u16 = vmovl_u8(d28u8);
+      q13u16 = vmovl_u8(d29u8);
+
+      __builtin_prefetch(src + 64 + src_stride * 2);
+
+      // Read the current 4x4 destination tile; it is averaged with the
+      // filtered result below.
+      d = dst;
+      d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 0);
+      d += dst_stride;
+      d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 0);
+      d += dst_stride;
+      d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 1);
+      d += dst_stride;
+      d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 1);
+
+      d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
+      d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
+      d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
+      d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
+      d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+      d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+      d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+      d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
+      d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
+
+      // src[] * filter_x: one call per output column, each sliding the
+      // 8-sample window by one position.
+      q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16, d18s16, d19s16,
+                             d23s16, d24s16, q0s16);
+      q2s32 = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16, d19s16, d23s16,
+                             d24s16, d26s16, q0s16);
+      q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16, d23s16, d24s16,
+                              d26s16, d27s16, q0s16);
+      q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16, d24s16, d26s16,
+                              d27s16, d25s16, q0s16);
+
+      __builtin_prefetch(src + 64 + src_stride * 3);
+
+      // Rounding shift right by 7 with unsigned saturation.
+      d2u16 = vqrshrun_n_s32(q1s32, 7);
+      d3u16 = vqrshrun_n_s32(q2s32, 7);
+      d4u16 = vqrshrun_n_s32(q14s32, 7);
+      d5u16 = vqrshrun_n_s32(q15s32, 7);
+
+      q1u16 = vcombine_u16(d2u16, d3u16);
+      q2u16 = vcombine_u16(d4u16, d5u16);
+
+      d2u8 = vqmovn_u16(q1u16);
+      d3u8 = vqmovn_u16(q2u16);
+
+      // Rearrange the results back into row order for the stores.
+      d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8), vreinterpret_u16_u8(d3u8));
+      d0x2u32 = vtrn_u32(vreinterpret_u32_u16(d0x2u16.val[0]),
+                         vreinterpret_u32_u16(d0x2u16.val[1]));
+      d0x2u8 = vtrn_u8(vreinterpret_u8_u32(d0x2u32.val[0]),
+                       vreinterpret_u8_u32(d0x2u32.val[1]));
+
+      q1u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]);
+      q3u8 = vreinterpretq_u8_u32(vcombine_u32(d6u32, d7u32));
+
+      // Rounding average of the filtered pixels and the previous dst pixels.
+      q1u8 = vrhaddq_u8(q1u8, q3u8);
+
+      d2u32 = vreinterpret_u32_u8(vget_low_u8(q1u8));
+      d3u32 = vreinterpret_u32_u8(vget_high_u8(q1u8));
+
+      d = dst;
+      vst1_lane_u32((uint32_t *)d, d2u32, 0);
+      d += dst_stride;
+      vst1_lane_u32((uint32_t *)d, d3u32, 0);
+      d += dst_stride;
+      vst1_lane_u32((uint32_t *)d, d2u32, 1);
+      d += dst_stride;
+      vst1_lane_u32((uint32_t *)d, d3u32, 1);
+
+      // Carry the most recent samples over as history for the next 4 columns.
+      q8u16 = q9u16;
+      d20s16 = d23s16;
+      q11u16 = q12u16;
+      q9u16 = q13u16;
+      d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
+    }
+    // Undo the "+= 7" and the per-column advances, then move down 4 rows.
+    src += src_stride * 4 - w - 7;
+    dst += dst_stride * 4 - w;
+  }
+  return;
+}
+
+// Vertical 8-tap convolution whose result is averaged into dst.
+//
+// For every output pixel the eight taps in filter_y are applied to eight
+// vertically adjacent source pixels (the window starts 3 rows above the
+// output row). The sum is shifted right by 7 with rounding and unsigned
+// saturation, narrowed to 8 bits, and then combined with the value already
+// stored in dst using a rounding average (vrhaddq_u8).
+//
+// The image is processed as 4-pixel-wide column strips, 4 rows per step, so
+// w and h are expected to be multiples of 4. Only y_step_q4 == 16 is
+// supported (asserted); filter_x and x_step_q4 are ignored.
+void aom_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
+                                 uint8_t *dst, ptrdiff_t dst_stride,
+                                 const int16_t *filter_x,  // unused
+                                 int x_step_q4,            // unused
+                                 const int16_t *filter_y, int y_step_q4, int w,
+                                 int h) {
+  int height;
+  const uint8_t *s;
+  uint8_t *d;
+  uint8x8_t d2u8, d3u8;
+  uint32x2_t d2u32, d3u32, d6u32, d7u32;
+  uint32x2_t d16u32, d18u32, d20u32, d22u32, d24u32, d26u32;
+  uint8x16_t q1u8, q3u8;
+  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16;
+  int16x4_t d24s16, d25s16, d26s16, d27s16;
+  uint16x4_t d2u16, d3u16, d4u16, d5u16;
+  int16x8_t q0s16;
+  uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
+  int32x4_t q1s32, q2s32, q14s32, q15s32;
+
+  assert(y_step_q4 == 16);
+
+  (void)x_step_q4;
+  (void)y_step_q4;
+  (void)filter_x;
+
+  // vld1_lane_u32() inserts one lane into an existing vector, so the vector
+  // is read before it is written. Give every such destination a defined
+  // value: reading an uninitialized automatic object is undefined behavior.
+  // Every lane that contributes to the output is overwritten by a load
+  // first (lane 1 of d22u32 is never consumed), so the result is unaffected.
+  d6u32 = vdup_n_u32(0);
+  d7u32 = vdup_n_u32(0);
+  d16u32 = vdup_n_u32(0);
+  d18u32 = vdup_n_u32(0);
+  d20u32 = vdup_n_u32(0);
+  d22u32 = vdup_n_u32(0);
+  d24u32 = vdup_n_u32(0);
+  d26u32 = vdup_n_u32(0);
+
+  src -= src_stride * 3;
+  q0s16 = vld1q_s16(filter_y);
+  for (; w > 0; w -= 4, src += 4, dst += 4) {  // loop_vert_h
+    // Preload the first 7 rows (4 pixels each) of this column strip.
+    s = src;
+    d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 0);
+    s += src_stride;
+    d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 1);
+    s += src_stride;
+    d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 0);
+    s += src_stride;
+    d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 1);
+    s += src_stride;
+    d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 0);
+    s += src_stride;
+    d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 1);
+    s += src_stride;
+    d22u32 = vld1_lane_u32((const uint32_t *)s, d22u32, 0);
+    s += src_stride;
+
+    q8u16 = vmovl_u8(vreinterpret_u8_u32(d16u32));
+    q9u16 = vmovl_u8(vreinterpret_u8_u32(d18u32));
+    q10u16 = vmovl_u8(vreinterpret_u8_u32(d20u32));
+    q11u16 = vmovl_u8(vreinterpret_u8_u32(d22u32));
+
+    d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
+    d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
+    d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+    d = dst;
+    for (height = h; height > 0; height -= 4) {  // loop_vert
+      // Load the next 4 source rows.
+      d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 0);
+      s += src_stride;
+      d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 0);
+      s += src_stride;
+      d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 1);
+      s += src_stride;
+      d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 1);
+      s += src_stride;
+
+      q12u16 = vmovl_u8(vreinterpret_u8_u32(d24u32));
+      q13u16 = vmovl_u8(vreinterpret_u8_u32(d26u32));
+
+      // Read the current 4x4 destination tile, then rewind d to its first
+      // row for the stores below.
+      d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 0);
+      d += dst_stride;
+      d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 1);
+      d += dst_stride;
+      d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 0);
+      d += dst_stride;
+      d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 1);
+      d -= dst_stride * 3;
+
+      d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
+      d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
+      d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
+      d21s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));
+      d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+      d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+      d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
+      d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
+
+      // src[] * filter_y: one call per output row, each sliding the 8-row
+      // window down by one row.
+      __builtin_prefetch(s);
+      __builtin_prefetch(s + src_stride);
+      q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16, d20s16, d21s16,
+                             d22s16, d24s16, q0s16);
+      __builtin_prefetch(s + src_stride * 2);
+      __builtin_prefetch(s + src_stride * 3);
+      q2s32 = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16, d21s16, d22s16,
+                             d24s16, d26s16, q0s16);
+      __builtin_prefetch(d);
+      __builtin_prefetch(d + dst_stride);
+      q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16, d22s16, d24s16,
+                              d26s16, d27s16, q0s16);
+      __builtin_prefetch(d + dst_stride * 2);
+      __builtin_prefetch(d + dst_stride * 3);
+      q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16, d24s16, d26s16,
+                              d27s16, d25s16, q0s16);
+
+      // Rounding shift right by 7 with unsigned saturation.
+      d2u16 = vqrshrun_n_s32(q1s32, 7);
+      d3u16 = vqrshrun_n_s32(q2s32, 7);
+      d4u16 = vqrshrun_n_s32(q14s32, 7);
+      d5u16 = vqrshrun_n_s32(q15s32, 7);
+
+      q1u16 = vcombine_u16(d2u16, d3u16);
+      q2u16 = vcombine_u16(d4u16, d5u16);
+
+      d2u8 = vqmovn_u16(q1u16);
+      d3u8 = vqmovn_u16(q2u16);
+
+      q1u8 = vcombine_u8(d2u8, d3u8);
+      q3u8 = vreinterpretq_u8_u32(vcombine_u32(d6u32, d7u32));
+
+      // Rounding average of the filtered pixels and the previous dst pixels.
+      q1u8 = vrhaddq_u8(q1u8, q3u8);
+
+      d2u32 = vreinterpret_u32_u8(vget_low_u8(q1u8));
+      d3u32 = vreinterpret_u32_u8(vget_high_u8(q1u8));
+
+      vst1_lane_u32((uint32_t *)d, d2u32, 0);
+      d += dst_stride;
+      vst1_lane_u32((uint32_t *)d, d2u32, 1);
+      d += dst_stride;
+      vst1_lane_u32((uint32_t *)d, d3u32, 0);
+      d += dst_stride;
+      vst1_lane_u32((uint32_t *)d, d3u32, 1);
+      d += dst_stride;
+
+      // Carry the most recent rows over as history for the next 4 rows.
+      q8u16 = q10u16;
+      d18s16 = d22s16;
+      d19s16 = d24s16;
+      q10u16 = q13u16;
+      d22s16 = d25s16;
+    }
+  }
+  return;
+}
--- a/aom_dsp/arm/aom_convolve8_avg_neon_asm.asm
+++ b/aom_dsp/arm/aom_convolve8_avg_neon_asm.asm
@@ -1,11 +1,14 @@
 ;
-;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
 ;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
 ;


@@ -14,11 +17,11 @@
    ; w%4 == 0
    ; h%4 == 0
    ; taps == 8
-    ; VP9_FILTER_WEIGHT == 128
-    ; VP9_FILTER_SHIFT == 7
+    ; AV1_FILTER_WEIGHT == 128
+    ; AV1_FILTER_SHIFT == 7

-    EXPORT  |vpx_convolve8_avg_horiz_neon|
-    EXPORT  |vpx_convolve8_avg_vert_neon|
+    EXPORT  |aom_convolve8_avg_horiz_neon|
+    EXPORT  |aom_convolve8_avg_vert_neon|
    ARM
    REQUIRE8
    PRESERVE8
@@ -42,24 +45,23 @@
 ; r1    int src_stride
 ; r2    uint8_t *dst
 ; r3    int dst_stride
-; sp[]const int16_t *filter
-; sp[]int x0_q4
-; sp[]int x_step_q4 ; unused
-; sp[]int y0_q4
+; sp[]const int16_t *filter_x
+; sp[]int x_step_q4
+; sp[]const int16_t *filter_y ; unused
 ; sp[]int y_step_q4           ; unused
 ; sp[]int w
 ; sp[]int h

-|vpx_convolve8_avg_horiz_neon| PROC
+|aom_convolve8_avg_horiz_neon| PROC
    push            {r4-r10, lr}

    sub             r0, r0, #3              ; adjust for taps

-    ldrd            r4, r5, [sp, #32]       ; filter, x0_q4
-    add             r4, r5, lsl #4
-    ldrd            r6, r7, [sp, #52]       ; w, h
+    ldr             r5, [sp, #32]           ; filter_x
+    ldr             r6, [sp, #48]           ; w
+    ldr             r7, [sp, #52]           ; h

-    vld1.s16        {q0}, [r4]              ; filter
+    vld1.s16        {q0}, [r5]              ; filter_x

    sub             r8, r1, r1, lsl #2      ; -src_stride * 3
    add             r8, r8, #4              ; -src_stride * 3 + 4
@@ -73,7 +75,7 @@

    mov             r10, r6                 ; w loop counter

-vpx_convolve8_avg_loop_horiz_v
+aom_convolve8_avg_loop_horiz_v
    vld1.8          {d24}, [r0], r1
    vld1.8          {d25}, [r0], r1
    vld1.8          {d26}, [r0], r1
@@ -96,7 +98,7 @@ vpx_convolve8_avg_loop_horiz_v

    add             r0, r0, #3

-vpx_convolve8_avg_loop_horiz
+aom_convolve8_avg_loop_horiz
    add             r5, r0, #64

    vld1.32         {d28[]}, [r0], r1
@@ -128,7 +130,7 @@ vpx_convolve8_avg_loop_horiz

    sub             r2, r2, r3, lsl #2      ; reset for store

-    ; src[] * filter
+    ; src[] * filter_x
    MULTIPLY_BY_Q0  q1,  d16, d17, d20, d22, d18, d19, d23, d24
    MULTIPLY_BY_Q0  q2,  d17, d20, d22, d18, d19, d23, d24, d26
    MULTIPLY_BY_Q0  q14, d20, d22, d18, d19, d23, d24, d26, d27
@@ -165,38 +167,36 @@ vpx_convolve8_avg_loop_horiz
    vmov            q9,  q13

    subs            r6, r6, #4              ; w -= 4
-    bgt             vpx_convolve8_avg_loop_horiz
+    bgt             aom_convolve8_avg_loop_horiz

    ; outer loop
    mov             r6, r10                 ; restore w counter
    add             r0, r0, r9              ; src += src_stride * 4 - w
    add             r2, r2, r12             ; dst += dst_stride * 4 - w
    subs            r7, r7, #4              ; h -= 4
-    bgt vpx_convolve8_avg_loop_horiz_v
+    bgt aom_convolve8_avg_loop_horiz_v

    pop             {r4-r10, pc}

    ENDP

-|vpx_convolve8_avg_vert_neon| PROC
+|aom_convolve8_avg_vert_neon| PROC
    push            {r4-r8, lr}

    ; adjust for taps
    sub             r0, r0, r1
    sub             r0, r0, r1, lsl #1

-    ldr             r4, [sp, #24]           ; filter
-    ldr             r5, [sp, #36]           ; y0_q4
-    add             r4, r5, lsl #4
-    ldr             r6, [sp, #44]           ; w
-    ldr             lr, [sp, #48]           ; h
+    ldr             r4, [sp, #32]           ; filter_y
+    ldr             r6, [sp, #40]           ; w
+    ldr             lr, [sp, #44]           ; h

-    vld1.s16        {q0}, [r4]              ; filter
+    vld1.s16        {q0}, [r4]              ; filter_y

    lsl             r1, r1, #1
    lsl             r3, r3, #1

-vpx_convolve8_avg_loop_vert_h
+aom_convolve8_avg_loop_vert_h
    mov             r4, r0
    add             r7, r0, r1, asr #1
    mov             r5, r2
@@ -216,7 +216,7 @@ vpx_convolve8_avg_loop_vert_h
    vmovl.u8        q10, d20
    vmovl.u8        q11, d22

-vpx_convolve8_avg_loop_vert
+aom_convolve8_avg_loop_vert
    ; always process a 4x4 block at a time
    vld1.u32        {d24[0]}, [r7], r1
    vld1.u32        {d26[0]}, [r4], r1
@@ -235,7 +235,7 @@ vpx_convolve8_avg_loop_vert
    pld             [r7]
    pld             [r4]

-    ; src[] * filter
+    ; src[] * filter_y
    MULTIPLY_BY_Q0  q1,  d16, d17, d18, d19, d20, d21, d22, d24

    pld             [r7, r1]
@@ -281,13 +281,13 @@ vpx_convolve8_avg_loop_vert
    vmov            d22, d25

    subs            r12, r12, #4            ; h -= 4
-    bgt             vpx_convolve8_avg_loop_vert
+    bgt             aom_convolve8_avg_loop_vert

    ; outer loop
    add             r0, r0, #4
    add             r2, r2, #4
    subs            r6, r6, #4              ; w -= 4
-    bgt             vpx_convolve8_avg_loop_vert_h
+    bgt             aom_convolve8_avg_loop_vert_h

    pop             {r4-r8, pc}

--- a/aom_dsp/arm/aom_convolve8_neon.c
+++ b/aom_dsp/arm/aom_convolve8_neon.c
@@ -0,0 +1,331 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./aom_config.h"
+#include "./aom_dsp_rtcd.h"
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+
+static INLINE int32x4_t MULTIPLY_BY_Q0(int16x4_t dsrc0, int16x4_t dsrc1,
+                                       int16x4_t dsrc2, int16x4_t dsrc3,
+                                       int16x4_t dsrc4, int16x4_t dsrc5,
+                                       int16x4_t dsrc6, int16x4_t dsrc7,
+                                       int16x8_t q0s16) {
+  // 8-tap multiply-accumulate: per lane, returns the 32-bit sum over k of
+  // dsrc<k> * q0s16[k]. Taps 0-3 are taken from the low half of q0s16 and
+  // taps 4-7 from the high half.
+  const int16x4_t taps_lo = vget_low_s16(q0s16);
+  const int16x4_t taps_hi = vget_high_s16(q0s16);
+
+  int32x4_t acc = vmull_lane_s16(dsrc0, taps_lo, 0);
+  acc = vmlal_lane_s16(acc, dsrc1, taps_lo, 1);
+  acc = vmlal_lane_s16(acc, dsrc2, taps_lo, 2);
+  acc = vmlal_lane_s16(acc, dsrc3, taps_lo, 3);
+  acc = vmlal_lane_s16(acc, dsrc4, taps_hi, 0);
+  acc = vmlal_lane_s16(acc, dsrc5, taps_hi, 1);
+  acc = vmlal_lane_s16(acc, dsrc6, taps_hi, 2);
+  acc = vmlal_lane_s16(acc, dsrc7, taps_hi, 3);
+  return acc;
+}
+
+void aom_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
+                              uint8_t *dst, ptrdiff_t dst_stride,
+                              const int16_t *filter_x, int x_step_q4,
+                              const int16_t *filter_y,  // unused
+                              int y_step_q4,            // unused
+                              int w, int h) {
+  int width;
+  const uint8_t *s, *psrc;
+  uint8_t *d, *pdst;
+  uint8x8_t d2u8, d3u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8;
+  uint32x2_t d2u32, d3u32, d28u32, d29u32, d30u32, d31u32;
+  uint8x16_t q12u8, q13u8, q14u8, q15u8;
+  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d22s16, d23s16;
+  int16x4_t d24s16, d25s16, d26s16, d27s16;
+  uint16x4_t d2u16, d3u16, d4u16, d5u16, d16u16, d17u16, d18u16, d19u16;
+  int16x8_t q0s16;
+  uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
+  int32x4_t q1s32, q2s32, q14s32, q15s32;
+  uint16x8x2_t q0x2u16;
+  uint8x8x2_t d0x2u8, d1x2u8;
+  uint32x2x2_t d0x2u32;
+  uint16x4x2_t d0x2u16, d1x2u16;
+  uint32x4x2_t q0x2u32;
+  // 8-tap horizontal convolution with filter_x; requires x_step_q4 == 16.
+  assert(x_step_q4 == 16);
+
+  (void)x_step_q4;
+  (void)y_step_q4;
+  (void)filter_y;
+
+  q0s16 = vld1q_s16(filter_x);
+  // Output is produced in 4x4 tiles, so w and h are consumed in steps of 4.
+  src -= 3;  // adjust for taps: the window spans src[-3] .. src[4]
+  for (; h > 0; h -= 4, src += src_stride * 4,
+                dst += dst_stride * 4) {  // loop_horiz_v: 4 rows per pass
+    s = src;
+    d24u8 = vld1_u8(s);
+    s += src_stride;
+    d25u8 = vld1_u8(s);
+    s += src_stride;
+    d26u8 = vld1_u8(s);
+    s += src_stride;
+    d27u8 = vld1_u8(s);
+    // Transpose the 4x8 pixels so that each d register holds one column.
+    q12u8 = vcombine_u8(d24u8, d25u8);
+    q13u8 = vcombine_u8(d26u8, d27u8);
+
+    q0x2u16 =
+        vtrnq_u16(vreinterpretq_u16_u8(q12u8), vreinterpretq_u16_u8(q13u8));
+    d24u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[0]));
+    d25u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[0]));
+    d26u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[1]));
+    d27u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[1]));
+    d0x2u8 = vtrn_u8(d24u8, d25u8);
+    d1x2u8 = vtrn_u8(d26u8, d27u8);
+
+    __builtin_prefetch(src + src_stride * 4);
+    __builtin_prefetch(src + src_stride * 5);
+    __builtin_prefetch(src + src_stride * 6);
+
+    q8u16 = vmovl_u8(d0x2u8.val[0]);
+    q9u16 = vmovl_u8(d0x2u8.val[1]);
+    q10u16 = vmovl_u8(d1x2u8.val[0]);
+    q11u16 = vmovl_u8(d1x2u8.val[1]);
+
+    d16u16 = vget_low_u16(q8u16);
+    d17u16 = vget_high_u16(q8u16);
+    d18u16 = vget_low_u16(q9u16);
+    d19u16 = vget_high_u16(q9u16);
+    q8u16 = vcombine_u16(d16u16, d18u16);  // vswp 17 18
+    q9u16 = vcombine_u16(d17u16, d19u16);
+
+    d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
+    d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));  // vmov 23 21
+    for (width = w, psrc = src + 7, pdst = dst; width > 0;
+         width -= 4, psrc += 4, pdst += 4) {  // loop_horiz: 4 columns
+      s = psrc;
+      d28u32 = vld1_dup_u32((const uint32_t *)s);
+      s += src_stride;
+      d29u32 = vld1_dup_u32((const uint32_t *)s);
+      s += src_stride;
+      d31u32 = vld1_dup_u32((const uint32_t *)s);
+      s += src_stride;
+      d30u32 = vld1_dup_u32((const uint32_t *)s);
+
+      __builtin_prefetch(psrc + 64);
+
+      d0x2u16 =
+          vtrn_u16(vreinterpret_u16_u32(d28u32), vreinterpret_u16_u32(d31u32));
+      d1x2u16 =
+          vtrn_u16(vreinterpret_u16_u32(d29u32), vreinterpret_u16_u32(d30u32));
+      d0x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[0]),   // d28
+                       vreinterpret_u8_u16(d1x2u16.val[0]));  // d29
+      d1x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[1]),   // d31
+                       vreinterpret_u8_u16(d1x2u16.val[1]));  // d30
+
+      __builtin_prefetch(psrc + 64 + src_stride);
+
+      q14u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]);
+      q15u8 = vcombine_u8(d1x2u8.val[1], d1x2u8.val[0]);
+      q0x2u32 =
+          vtrnq_u32(vreinterpretq_u32_u8(q14u8), vreinterpretq_u32_u8(q15u8));
+
+      d28u8 = vreinterpret_u8_u32(vget_low_u32(q0x2u32.val[0]));
+      d29u8 = vreinterpret_u8_u32(vget_high_u32(q0x2u32.val[0]));
+      q12u16 = vmovl_u8(d28u8);
+      q13u16 = vmovl_u8(d29u8);
+
+      __builtin_prefetch(psrc + 64 + src_stride * 2);
+
+      d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
+      d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
+      d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
+      d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
+      d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+      d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+      d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+      d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
+      d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
+
+      q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16, d18s16, d19s16,
+                             d23s16, d24s16, q0s16);
+      q2s32 = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16, d19s16, d23s16,
+                             d24s16, d26s16, q0s16);
+      q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16, d23s16, d24s16,
+                              d26s16, d27s16, q0s16);
+      q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16, d24s16, d26s16,
+                              d27s16, d25s16, q0s16);
+
+      __builtin_prefetch(psrc + 60 + src_stride * 3);
+
+      d2u16 = vqrshrun_n_s32(q1s32, 7);
+      d3u16 = vqrshrun_n_s32(q2s32, 7);
+      d4u16 = vqrshrun_n_s32(q14s32, 7);
+      d5u16 = vqrshrun_n_s32(q15s32, 7);
+
+      q1u16 = vcombine_u16(d2u16, d3u16);
+      q2u16 = vcombine_u16(d4u16, d5u16);
+
+      d2u8 = vqmovn_u16(q1u16);
+      d3u8 = vqmovn_u16(q2u16);
+
+      d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8), vreinterpret_u16_u8(d3u8));
+      d0x2u32 = vtrn_u32(vreinterpret_u32_u16(d0x2u16.val[0]),
+                         vreinterpret_u32_u16(d0x2u16.val[1]));
+      d0x2u8 = vtrn_u8(vreinterpret_u8_u32(d0x2u32.val[0]),
+                       vreinterpret_u8_u32(d0x2u32.val[1]));
+
+      d2u32 = vreinterpret_u32_u8(d0x2u8.val[0]);
+      d3u32 = vreinterpret_u32_u8(d0x2u8.val[1]);
+
+      d = pdst;
+      vst1_lane_u32((uint32_t *)d, d2u32, 0);
+      d += dst_stride;
+      vst1_lane_u32((uint32_t *)d, d3u32, 0);
+      d += dst_stride;
+      vst1_lane_u32((uint32_t *)d, d2u32, 1);
+      d += dst_stride;
+      vst1_lane_u32((uint32_t *)d, d3u32, 1);
+      // Slide the 7-column window right by 4 for the next group of outputs.
+      q8u16 = q9u16;
+      d20s16 = d23s16;
+      q11u16 = q12u16;
+      q9u16 = q13u16;
+      d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
+    }
+  }
+  return;
+}
+
+// 8-tap vertical filter of a w x h block, computed as 4x4 tiles (w and h are
+// consumed in steps of 4). Only filter_y is used; y_step_q4 must be 16.
+void aom_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
+                             uint8_t *dst, ptrdiff_t dst_stride,
+                             const int16_t *filter_x,  // unused
+                             int x_step_q4,            // unused
+                             const int16_t *filter_y, int y_step_q4, int w,
+                             int h) {
+  int height;
+  const uint8_t *s;
+  uint8_t *d;
+  uint32x2_t d2u32, d3u32;
+  // Zero-init: vld1_lane_u32() reads its vector operand before inserting.
+  uint32x2_t d16u32 = vdup_n_u32(0), d18u32 = vdup_n_u32(0);
+  uint32x2_t d20u32 = vdup_n_u32(0), d22u32 = vdup_n_u32(0);
+  uint32x2_t d24u32 = vdup_n_u32(0), d26u32 = vdup_n_u32(0);
+  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16;
+  int16x4_t d24s16, d25s16, d26s16, d27s16;
+  uint16x4_t d2u16, d3u16, d4u16, d5u16;
+  int16x8_t q0s16;
+  uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
+  int32x4_t q1s32, q2s32, q14s32, q15s32;
+
+  assert(y_step_q4 == 16);
+  (void)x_step_q4;
+  (void)y_step_q4;
+  (void)filter_x;
+
+  src -= src_stride * 3;  // adjust for taps: start 3 rows above the block
+  q0s16 = vld1q_s16(filter_y);
+  for (; w > 0; w -= 4, src += 4, dst += 4) {  // loop_vert_h: 4 columns
+    s = src;
+    d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 0);
+    s += src_stride;
+    d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 1);
+    s += src_stride;
+    d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 0);
+    s += src_stride;
+    d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 1);
+    s += src_stride;
+    d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 0);
+    s += src_stride;
+    d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 1);
+    s += src_stride;
+    d22u32 = vld1_lane_u32((const uint32_t *)s, d22u32, 0);
+    s += src_stride;
+    // d16/d18/d20 now hold two rows each and d22 one: 7 rows of context.
+    q8u16 = vmovl_u8(vreinterpret_u8_u32(d16u32));
+    q9u16 = vmovl_u8(vreinterpret_u8_u32(d18u32));
+    q10u16 = vmovl_u8(vreinterpret_u8_u32(d20u32));
+    q11u16 = vmovl_u8(vreinterpret_u8_u32(d22u32));
+
+    d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
+    d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
+    d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+    d = dst;
+    for (height = h; height > 0; height -= 4) {  // loop_vert: 4 rows
+      d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 0);
+      s += src_stride;
+      d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 0);
+      s += src_stride;
+      d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 1);
+      s += src_stride;
+      d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 1);
+      s += src_stride;
+
+      q12u16 = vmovl_u8(vreinterpret_u8_u32(d24u32));
+      q13u16 = vmovl_u8(vreinterpret_u8_u32(d26u32));
+      d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
+      d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
+      d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
+      d21s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));
+      d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+      d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+      d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
+      d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
+
+      __builtin_prefetch(d);
+      __builtin_prefetch(d + dst_stride);
+      q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16, d20s16, d21s16,
+                             d22s16, d24s16, q0s16);
+      __builtin_prefetch(d + dst_stride * 2);
+      __builtin_prefetch(d + dst_stride * 3);
+      q2s32 = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16, d21s16, d22s16,
+                             d24s16, d26s16, q0s16);
+      __builtin_prefetch(s);
+      __builtin_prefetch(s + src_stride);
+      q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16, d22s16, d24s16,
+                              d26s16, d27s16, q0s16);
+      __builtin_prefetch(s + src_stride * 2);
+      __builtin_prefetch(s + src_stride * 3);
+      q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16, d24s16, d26s16,
+                              d27s16, d25s16, q0s16);
+
+      d2u16 = vqrshrun_n_s32(q1s32, 7);
+      d3u16 = vqrshrun_n_s32(q2s32, 7);
+      d4u16 = vqrshrun_n_s32(q14s32, 7);
+      d5u16 = vqrshrun_n_s32(q15s32, 7);
+      q1u16 = vcombine_u16(d2u16, d3u16);
+      q2u16 = vcombine_u16(d4u16, d5u16);
+      d2u32 = vreinterpret_u32_u8(vqmovn_u16(q1u16));
+      d3u32 = vreinterpret_u32_u8(vqmovn_u16(q2u16));
+
+      vst1_lane_u32((uint32_t *)d, d2u32, 0);
+      d += dst_stride;
+      vst1_lane_u32((uint32_t *)d, d2u32, 1);
+      d += dst_stride;
+      vst1_lane_u32((uint32_t *)d, d3u32, 0);
+      d += dst_stride;
+      vst1_lane_u32((uint32_t *)d, d3u32, 1);
+      d += dst_stride;
+      // Slide the 7-row window down by 4 rows for the next pass.
+      q8u16 = q10u16;
+      d18s16 = d22s16;
+      d19s16 = d24s16;
+      q10u16 = q13u16;
+      d22s16 = d25s16;
+    }
+  }
+}
--- a/aom_dsp/arm/aom_convolve8_neon_asm.asm
+++ b/aom_dsp/arm/aom_convolve8_neon_asm.asm
@@ -1,11 +1,14 @@
 ;
-;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
 ;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
 ;


@@ -14,11 +17,11 @@
    ; w%4 == 0
    ; h%4 == 0
    ; taps == 8
-    ; VP9_FILTER_WEIGHT == 128
-    ; VP9_FILTER_SHIFT == 7
+    ; AV1_FILTER_WEIGHT == 128
+    ; AV1_FILTER_SHIFT == 7

-    EXPORT  |vpx_convolve8_horiz_neon|
-    EXPORT  |vpx_convolve8_vert_neon|
+    EXPORT  |aom_convolve8_horiz_neon|
+    EXPORT  |aom_convolve8_vert_neon|
    ARM
    REQUIRE8
    PRESERVE8
@@ -42,24 +45,23 @@
 ; r1    int src_stride
 ; r2    uint8_t *dst
 ; r3    int dst_stride
-; sp[]const int16_t *filter
-; sp[]int x0_q4
-; sp[]int x_step_q4 ; unused
-; sp[]int y0_q4
+; sp[]const int16_t *filter_x
+; sp[]int x_step_q4
+; sp[]const int16_t *filter_y ; unused
 ; sp[]int y_step_q4           ; unused
 ; sp[]int w
 ; sp[]int h

-|vpx_convolve8_horiz_neon| PROC
+|aom_convolve8_horiz_neon| PROC
    push            {r4-r10, lr}

    sub             r0, r0, #3              ; adjust for taps

-    ldrd            r4, r5, [sp, #32]       ; filter, x0_q4
-    add             r4, r5, lsl #4
-    ldrd            r6, r7, [sp, #52]       ; w, h
+    ldr             r5, [sp, #32]           ; filter_x
+    ldr             r6, [sp, #48]           ; w
+    ldr             r7, [sp, #52]           ; h

-    vld1.s16        {q0}, [r4]              ; filter
+    vld1.s16        {q0}, [r5]              ; filter_x

    sub             r8, r1, r1, lsl #2      ; -src_stride * 3
    add             r8, r8, #4              ; -src_stride * 3 + 4
@@ -73,7 +75,7 @@

    mov             r10, r6                 ; w loop counter

-vpx_convolve8_loop_horiz_v
+aom_convolve8_loop_horiz_v
    vld1.8          {d24}, [r0], r1
    vld1.8          {d25}, [r0], r1
    vld1.8          {d26}, [r0], r1
@@ -96,7 +98,7 @@ vpx_convolve8_loop_horiz_v

    add             r0, r0, #3

-vpx_convolve8_loop_horiz
+aom_convolve8_loop_horiz
    add             r5, r0, #64

    vld1.32         {d28[]}, [r0], r1
@@ -120,7 +122,7 @@ vpx_convolve8_loop_horiz

    pld             [r5, r1, lsl #1]

-    ; src[] * filter
+    ; src[] * filter_x
    MULTIPLY_BY_Q0  q1,  d16, d17, d20, d22, d18, d19, d23, d24
    MULTIPLY_BY_Q0  q2,  d17, d20, d22, d18, d19, d23, d24, d26
    MULTIPLY_BY_Q0  q14, d20, d22, d18, d19, d23, d24, d26, d27
@@ -154,38 +156,36 @@ vpx_convolve8_loop_horiz
    vmov            q9,  q13

    subs            r6, r6, #4              ; w -= 4
-    bgt             vpx_convolve8_loop_horiz
+    bgt             aom_convolve8_loop_horiz

    ; outer loop
    mov             r6, r10                 ; restore w counter
    add             r0, r0, r9              ; src += src_stride * 4 - w
    add             r2, r2, r12             ; dst += dst_stride * 4 - w
    subs            r7, r7, #4              ; h -= 4
-    bgt vpx_convolve8_loop_horiz_v
+    bgt aom_convolve8_loop_horiz_v

    pop             {r4-r10, pc}

    ENDP

-|vpx_convolve8_vert_neon| PROC
+|aom_convolve8_vert_neon| PROC
    push            {r4-r8, lr}

    ; adjust for taps
    sub             r0, r0, r1
    sub             r0, r0, r1, lsl #1

-    ldr             r4, [sp, #24]           ; filter
-    ldr             r5, [sp, #36]           ; y0_q4
-    add             r4, r5, lsl #4
-    ldr             r6, [sp, #44]           ; w
-    ldr             lr, [sp, #48]           ; h
+    ldr             r4, [sp, #32]           ; filter_y
+    ldr             r6, [sp, #40]           ; w
+    ldr             lr, [sp, #44]           ; h

-    vld1.s16        {q0}, [r4]              ; filter
+    vld1.s16        {q0}, [r4]              ; filter_y

    lsl             r1, r1, #1
    lsl             r3, r3, #1

-vpx_convolve8_loop_vert_h
+aom_convolve8_loop_vert_h
    mov             r4, r0
    add             r7, r0, r1, asr #1
    mov             r5, r2
@@ -205,7 +205,7 @@ vpx_convolve8_loop_vert_h
    vmovl.u8        q10, d20
    vmovl.u8        q11, d22

-vpx_convolve8_loop_vert
+aom_convolve8_loop_vert
    ; always process a 4x4 block at a time
    vld1.u32        {d24[0]}, [r7], r1
    vld1.u32        {d26[0]}, [r4], r1
@@ -219,7 +219,7 @@ vpx_convolve8_loop_vert
    pld             [r5]
    pld             [r8]

-    ; src[] * filter
+    ; src[] * filter_y
    MULTIPLY_BY_Q0  q1,  d16, d17, d18, d19, d20, d21, d22, d24

    pld             [r5, r3]
@@ -259,13 +259,13 @@ vpx_convolve8_loop_vert
    vmov            d22, d25

    subs            r12, r12, #4            ; h -= 4
-    bgt             vpx_convolve8_loop_vert
+    bgt             aom_convolve8_loop_vert

    ; outer loop
    add             r0, r0, #4
    add             r2, r2, #4
    subs            r6, r6, #4              ; w -= 4
-    bgt             vpx_convolve8_loop_vert_h
+    bgt             aom_convolve8_loop_vert_h

    pop             {r4-r8, pc}

--- a/aom_dsp/arm/aom_convolve_avg_neon.c
+++ b/aom_dsp/arm/aom_convolve_avg_neon.c
@@ -0,0 +1,145 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "./aom_dsp_rtcd.h"
+#include "aom/aom_integer.h"
+
+// Rounding-averages a w x h block of src into dst: dst = (dst + src + 1) >> 1.
+// The filter arguments are ignored. w selects a 64/32/16/8/4-byte-wide path.
+void aom_convolve_avg_neon(const uint8_t *src,    // r0
+                           ptrdiff_t src_stride,  // r1
+                           uint8_t *dst,          // r2
+                           ptrdiff_t dst_stride,  // r3
+                           const int16_t *filter_x, int filter_x_stride,
+                           const int16_t *filter_y, int filter_y_stride, int w,
+                           int h) {
+  uint8_t *d = dst;  // read cursor over dst; dst itself is the write cursor
+  uint8x8_t d0u8, d1u8, d2u8, d3u8;
+  // Zero-init: vld1_lane_u32() reads its vector operand before inserting.
+  uint32x2_t d0u32 = vdup_n_u32(0), d2u32 = vdup_n_u32(0);
+  uint8x16_t q0u8, q1u8, q2u8, q3u8, q8u8, q9u8, q10u8, q11u8;
+  (void)filter_x;
+  (void)filter_x_stride;
+  (void)filter_y;
+  (void)filter_y_stride;
+
+  if (w > 32) {  // avg64
+    for (; h > 0; h -= 1) {
+      q0u8 = vld1q_u8(src);
+      q1u8 = vld1q_u8(src + 16);
+      q2u8 = vld1q_u8(src + 32);
+      q3u8 = vld1q_u8(src + 48);
+      src += src_stride;
+      q8u8 = vld1q_u8(d);
+      q9u8 = vld1q_u8(d + 16);
+      q10u8 = vld1q_u8(d + 32);
+      q11u8 = vld1q_u8(d + 48);
+      d += dst_stride;
+
+      q0u8 = vrhaddq_u8(q0u8, q8u8);
+      q1u8 = vrhaddq_u8(q1u8, q9u8);
+      q2u8 = vrhaddq_u8(q2u8, q10u8);
+      q3u8 = vrhaddq_u8(q3u8, q11u8);
+
+      vst1q_u8(dst, q0u8);
+      vst1q_u8(dst + 16, q1u8);
+      vst1q_u8(dst + 32, q2u8);
+      vst1q_u8(dst + 48, q3u8);
+      dst += dst_stride;
+    }
+  } else if (w == 32) {  // avg32
+    for (; h > 0; h -= 2) {
+      q0u8 = vld1q_u8(src);
+      q1u8 = vld1q_u8(src + 16);
+      src += src_stride;
+      q2u8 = vld1q_u8(src);
+      q3u8 = vld1q_u8(src + 16);
+      src += src_stride;
+      q8u8 = vld1q_u8(d);
+      q9u8 = vld1q_u8(d + 16);
+      d += dst_stride;
+      q10u8 = vld1q_u8(d);
+      q11u8 = vld1q_u8(d + 16);
+      d += dst_stride;
+
+      q0u8 = vrhaddq_u8(q0u8, q8u8);
+      q1u8 = vrhaddq_u8(q1u8, q9u8);
+      q2u8 = vrhaddq_u8(q2u8, q10u8);
+      q3u8 = vrhaddq_u8(q3u8, q11u8);
+
+      vst1q_u8(dst, q0u8);
+      vst1q_u8(dst + 16, q1u8);
+      dst += dst_stride;
+      vst1q_u8(dst, q2u8);
+      vst1q_u8(dst + 16, q3u8);
+      dst += dst_stride;
+    }
+  } else if (w > 8) {  // avg16
+    for (; h > 0; h -= 2) {
+      q0u8 = vld1q_u8(src);
+      src += src_stride;
+      q1u8 = vld1q_u8(src);
+      src += src_stride;
+      q2u8 = vld1q_u8(d);
+      d += dst_stride;
+      q3u8 = vld1q_u8(d);
+      d += dst_stride;
+
+      q0u8 = vrhaddq_u8(q0u8, q2u8);
+      q1u8 = vrhaddq_u8(q1u8, q3u8);
+
+      vst1q_u8(dst, q0u8);
+      dst += dst_stride;
+      vst1q_u8(dst, q1u8);
+      dst += dst_stride;
+    }
+  } else if (w == 8) {  // avg8
+    for (; h > 0; h -= 2) {
+      d0u8 = vld1_u8(src);
+      src += src_stride;
+      d1u8 = vld1_u8(src);
+      src += src_stride;
+      d2u8 = vld1_u8(d);
+      d += dst_stride;
+      d3u8 = vld1_u8(d);
+      d += dst_stride;
+
+      q0u8 = vcombine_u8(d0u8, d1u8);
+      q1u8 = vcombine_u8(d2u8, d3u8);
+      q0u8 = vrhaddq_u8(q0u8, q1u8);
+
+      vst1_u8(dst, vget_low_u8(q0u8));
+      dst += dst_stride;
+      vst1_u8(dst, vget_high_u8(q0u8));
+      dst += dst_stride;
+    }
+  } else {  // avg4
+    for (; h > 0; h -= 2) {
+      d0u32 = vld1_lane_u32((const uint32_t *)src, d0u32, 0);
+      src += src_stride;
+      d0u32 = vld1_lane_u32((const uint32_t *)src, d0u32, 1);
+      src += src_stride;
+      d2u32 = vld1_lane_u32((const uint32_t *)d, d2u32, 0);
+      d += dst_stride;
+      d2u32 = vld1_lane_u32((const uint32_t *)d, d2u32, 1);
+      d += dst_stride;
+
+      d0u8 = vrhadd_u8(vreinterpret_u8_u32(d0u32), vreinterpret_u8_u32(d2u32));
+      d0u32 = vreinterpret_u32_u8(d0u8);
+      vst1_lane_u32((uint32_t *)dst, d0u32, 0);
+      dst += dst_stride;
+      vst1_lane_u32((uint32_t *)dst, d0u32, 1);
+      dst += dst_stride;
+    }
+  }
+}
--- a/aom_dsp/arm/aom_convolve_avg_neon_asm.asm
+++ b/aom_dsp/arm/aom_convolve_avg_neon_asm.asm
@@ -1,23 +1,26 @@
 ;
-;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
 ;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 ;

-    EXPORT  |vpx_convolve_avg_neon|
+;
+
+    EXPORT  |aom_convolve_avg_neon|
    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2

-|vpx_convolve_avg_neon| PROC
+|aom_convolve_avg_neon| PROC
    push                {r4-r6, lr}
-    ldrd                r4, r5, [sp, #36]
+    ldrd                r4, r5, [sp, #32]
    mov                 r6, r2

    cmp                 r4, #32
--- a/aom_dsp/arm/aom_convolve_copy_neon.c
+++ b/aom_dsp/arm/aom_convolve_copy_neon.c
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "./aom_dsp_rtcd.h"
+#include "aom/aom_integer.h"
+
+void aom_convolve_copy_neon(const uint8_t *src,    // r0
+                            ptrdiff_t src_stride,  // r1
+                            uint8_t *dst,          // r2
+                            ptrdiff_t dst_stride,  // r3
+                            const int16_t *filter_x, int filter_x_stride,
+                            const int16_t *filter_y, int filter_y_stride, int w,
+                            int h) {
+  // Straight block copy (filter arguments are ignored). w selects the row
+  // width; the 32-, 16- and 8-wide paths move two rows per pass.
+  (void)filter_x;
+  (void)filter_x_stride;
+  (void)filter_y;
+  (void)filter_y_stride;
+
+  if (w > 32) {  // copy64
+    while (h > 0) {
+      const uint8x16_t seg0 = vld1q_u8(src);
+      const uint8x16_t seg1 = vld1q_u8(src + 16);
+      const uint8x16_t seg2 = vld1q_u8(src + 32);
+      const uint8x16_t seg3 = vld1q_u8(src + 48);
+
+      vst1q_u8(dst, seg0);
+      vst1q_u8(dst + 16, seg1);
+      vst1q_u8(dst + 32, seg2);
+      vst1q_u8(dst + 48, seg3);
+      src += src_stride;
+      dst += dst_stride;
+      --h;
+    }
+  } else if (w == 32) {  // copy32
+    while (h > 0) {
+      const uint8_t *const src1 = src + src_stride;
+      uint8_t *const dst1 = dst + dst_stride;
+      const uint8x16_t top_lo = vld1q_u8(src);
+      const uint8x16_t top_hi = vld1q_u8(src + 16);
+      const uint8x16_t bot_lo = vld1q_u8(src1);
+      const uint8x16_t bot_hi = vld1q_u8(src1 + 16);
+
+      vst1q_u8(dst, top_lo);
+      vst1q_u8(dst + 16, top_hi);
+      vst1q_u8(dst1, bot_lo);
+      vst1q_u8(dst1 + 16, bot_hi);
+      src = src1 + src_stride;
+      dst = dst1 + dst_stride;
+      h -= 2;
+    }
+  } else if (w > 8) {  // copy16
+    while (h > 0) {
+      const uint8x16_t top = vld1q_u8(src);
+      const uint8x16_t bot = vld1q_u8(src + src_stride);
+
+      vst1q_u8(dst, top);
+      vst1q_u8(dst + dst_stride, bot);
+      src += src_stride * 2;
+      dst += dst_stride * 2;
+      h -= 2;
+    }
+  } else if (w == 8) {  // copy8
+    while (h > 0) {
+      const uint8x8_t top = vld1_u8(src);
+      const uint8x8_t bot = vld1_u8(src + src_stride);
+
+      vst1_u8(dst, top);
+      vst1_u8(dst + dst_stride, bot);
+      src += src_stride * 2;
+      dst += dst_stride * 2;
+      h -= 2;
+    }
+  } else {  // copy4
+    while (h > 0) {
+      *(uint32_t *)dst = *(const uint32_t *)src;
+      src += src_stride;
+      dst += dst_stride;
+      --h;
+    }
+  }
+}
--- a/aom_dsp/arm/aom_convolve_copy_neon_asm.asm
+++ b/aom_dsp/arm/aom_convolve_copy_neon_asm.asm
@@ -1,23 +1,26 @@
 ;
-;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
 ;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 ;

-    EXPORT  |vpx_convolve_copy_neon|
+;
+
+    EXPORT  |aom_convolve_copy_neon|
    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2

-|vpx_convolve_copy_neon| PROC
+|aom_convolve_copy_neon| PROC
    push                {r4-r5, lr}
-    ldrd                r4, r5, [sp, #32]
+    ldrd                r4, r5, [sp, #28]

    cmp                 r4, #32
    bgt                 copy64
--- a/aom_dsp/arm/aom_convolve_neon.c
+++ b/aom_dsp/arm/aom_convolve_neon.c
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "./aom_dsp_rtcd.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_ports/mem.h"
+
+void aom_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+                        ptrdiff_t dst_stride, const int16_t *filter_x,
+                        int x_step_q4, const int16_t *filter_y, int y_step_q4,
+                        int w, int h) {
+  /* Two-pass 8-tap filter: rows are filtered horizontally into temp (64 bytes
+   * per row), then vertically into dst. With w <= 64, h <= 64 this takes
+   * h + 7 <= 71 rows; 72 are reserved as rows are written in groups of 4.
+   */
+  DECLARE_ALIGNED(8, uint8_t, temp[64 * 72]);
+
+  // Account for the vertical phase needing 3 lines prior and 4 lines post
+  int intermediate_height = h + 7;
+
+  assert(y_step_q4 == 16);
+  assert(x_step_q4 == 16);
+  assert(w <= 64 && h <= 64);  // anything larger would overrun temp
+
+  /* Filter starting 3 lines back. The neon code rounds the given height up to
+   * a multiple of 4 lines; the extra rows land in temp's spare room.
+   */
+  aom_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, 64, filter_x,
+                           x_step_q4, filter_y, y_step_q4, w,
+                           intermediate_height);
+
+  /* Step into the temp buffer 3 lines to get the actual frame data */
+  aom_convolve8_vert_neon(temp + 64 * 3, 64, dst, dst_stride, filter_x,
+                          x_step_q4, filter_y, y_step_q4, w, h);
+}
+
+void aom_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
+                            uint8_t *dst, ptrdiff_t dst_stride,
+                            const int16_t *filter_x, int x_step_q4,
+                            const int16_t *filter_y, int y_step_q4, int w,
+                            int h) {
+  DECLARE_ALIGNED(8, uint8_t, temp[64 * 72]);
+  int intermediate_height = h + 7;
+
+  assert(y_step_q4 == 16);
+  assert(x_step_q4 == 16);
+  assert(w <= 64 && h <= 64);  // temp is 64 bytes wide and 72 rows tall
+
+  /* Same two-pass scheme and temp sizing as aom_convolve8_neon(); averaging
+   * with dst is left to the vertical pass so it happens once, at the end. */
+  aom_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, 64, filter_x,
+                           x_step_q4, filter_y, y_step_q4, w,
+                           intermediate_height);
+  aom_convolve8_avg_vert_neon(temp + 64 * 3, 64, dst, dst_stride, filter_x,
+                              x_step_q4, filter_y, y_step_q4, w, h);
+}
--- a/aom_dsp/arm/avg_neon.c
+++ b/aom_dsp/arm/avg_neon.c
@@ -1,62 +1,77 @@
 /*
- *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

 #include <arm_neon.h>
 #include <assert.h>

-#include "./vpx_dsp_rtcd.h"
-#include "./vpx_config.h"
+#include "./aom_dsp_rtcd.h"
+#include "./aom_config.h"

-#include "vpx/vpx_integer.h"
-#include "vpx_dsp/arm/idct_neon.h"
-#include "vpx_dsp/arm/mem_neon.h"
-#include "vpx_dsp/arm/sum_neon.h"
+#include "aom/aom_integer.h"

-uint32_t vpx_avg_4x4_neon(const uint8_t *a, int a_stride) {
-  const uint8x16_t b = load_unaligned_u8q(a, a_stride);
-  const uint16x8_t c = vaddl_u8(vget_low_u8(b), vget_high_u8(b));
-  const uint32x2_t d = horizontal_add_uint16x8(c);
-  return vget_lane_u32(vrshr_n_u32(d, 4), 0);
+static INLINE unsigned int horizontal_add_u16x8(const uint16x8_t v_16x8) {
+  const uint32x4_t a = vpaddlq_u16(v_16x8);
+  const uint64x2_t b = vpaddlq_u32(a);
+  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
+                                vreinterpret_u32_u64(vget_high_u64(b)));
+  return vget_lane_u32(c, 0);
 }

-uint32_t vpx_avg_8x8_neon(const uint8_t *a, int a_stride) {
-  int i;
-  uint8x8_t b, c;
-  uint16x8_t sum;
-  uint32x2_t d;
-  b = vld1_u8(a);
-  a += a_stride;
-  c = vld1_u8(a);
-  a += a_stride;
-  sum = vaddl_u8(b, c);
-
-  for (i = 0; i < 6; ++i) {
-    const uint8x8_t d = vld1_u8(a);
-    a += a_stride;
-    sum = vaddw_u8(sum, d);
+unsigned int aom_avg_4x4_neon(const uint8_t *s, int p) {
+  uint16x8_t v_sum;
+  uint32x2_t v_s0 = vdup_n_u32(0);
+  uint32x2_t v_s1 = vdup_n_u32(0);
+  v_s0 = vld1_lane_u32((const uint32_t *)s, v_s0, 0);
+  v_s0 = vld1_lane_u32((const uint32_t *)(s + p), v_s0, 1);
+  v_s1 = vld1_lane_u32((const uint32_t *)(s + 2 * p), v_s1, 0);
+  v_s1 = vld1_lane_u32((const uint32_t *)(s + 3 * p), v_s1, 1);
+  v_sum = vaddl_u8(vreinterpret_u8_u32(v_s0), vreinterpret_u8_u32(v_s1));
+  return (horizontal_add_u16x8(v_sum) + 8) >> 4;
 }

-  d = horizontal_add_uint16x8(sum);
+unsigned int aom_avg_8x8_neon(const uint8_t *s, int p) {
+  uint8x8_t v_s0 = vld1_u8(s);
+  const uint8x8_t v_s1 = vld1_u8(s + p);
+  uint16x8_t v_sum = vaddl_u8(v_s0, v_s1);

-  return vget_lane_u32(vrshr_n_u32(d, 6), 0);
+  v_s0 = vld1_u8(s + 2 * p);
+  v_sum = vaddw_u8(v_sum, v_s0);
+
+  v_s0 = vld1_u8(s + 3 * p);
+  v_sum = vaddw_u8(v_sum, v_s0);
+
+  v_s0 = vld1_u8(s + 4 * p);
+  v_sum = vaddw_u8(v_sum, v_s0);
+
+  v_s0 = vld1_u8(s + 5 * p);
+  v_sum = vaddw_u8(v_sum, v_s0);
+
+  v_s0 = vld1_u8(s + 6 * p);
+  v_sum = vaddw_u8(v_sum, v_s0);
+
+  v_s0 = vld1_u8(s + 7 * p);
+  v_sum = vaddw_u8(v_sum, v_s0);
+
+  return (horizontal_add_u16x8(v_sum) + 32) >> 6;
 }

 // coeff: 16 bits, dynamic range [-32640, 32640].
 // length: value range {16, 64, 256, 1024}.
-int vpx_satd_neon(const tran_low_t *coeff, int length) {
+int aom_satd_neon(const int16_t *coeff, int length) {
  const int16x4_t zero = vdup_n_s16(0);
  int32x4_t accum = vdupq_n_s32(0);

  do {
-    const int16x8_t src0 = load_tran_low_to_s16q(coeff);
-    const int16x8_t src8 = load_tran_low_to_s16q(coeff + 8);
+    const int16x8_t src0 = vld1q_s16(coeff);
+    const int16x8_t src8 = vld1q_s16(coeff + 8);
    accum = vabal_s16(accum, vget_low_s16(src0), zero);
    accum = vabal_s16(accum, vget_high_s16(src0), zero);
    accum = vabal_s16(accum, vget_low_s16(src8), zero);
@@ -75,7 +90,7 @@ int vpx_satd_neon(const tran_low_t *coeff, int length) {
  }
 }

-void vpx_int_pro_row_neon(int16_t hbuf[16], uint8_t const *ref,
+void aom_int_pro_row_neon(int16_t hbuf[16], uint8_t const *ref,
                          const int ref_stride, const int height) {
  int i;
  uint16x8_t vec_sum_lo = vdupq_n_u16(0);
@@ -128,7 +143,7 @@ void vpx_int_pro_row_neon(int16_t hbuf[16], uint8_t const *ref,
  vst1q_s16(hbuf, vreinterpretq_s16_u16(vec_sum_hi));
 }

-int16_t vpx_int_pro_col_neon(uint8_t const *ref, const int width) {
+int16_t aom_int_pro_col_neon(uint8_t const *ref, const int width) {
  int i;
  uint16x8_t vec_sum = vdupq_n_u16(0);

@@ -139,13 +154,12 @@ int16_t vpx_int_pro_col_neon(uint8_t const *ref, const int width) {
    ref += 16;
  }

-  return vget_lane_s16(vreinterpret_s16_u32(horizontal_add_uint16x8(vec_sum)),
-                       0);
+  return horizontal_add_u16x8(vec_sum);
 }

 // ref, src = [0, 510] - max diff = 16-bits
 // bwl = {2, 3, 4}, width = {16, 32, 64}
-int vpx_vector_var_neon(int16_t const *ref, int16_t const *src, const int bwl) {
+int aom_vector_var_neon(int16_t const *ref, int16_t const *src, const int bwl) {
  int width = 4 << bwl;
  int32x4_t sse = vdupq_n_s32(0);
  int16x8_t total = vdupq_n_s16(0);
@@ -170,7 +184,7 @@ int vpx_vector_var_neon(int16_t const *ref, int16_t const *src, const int bwl) {

  {
    // Note: 'total''s pairwise addition could be implemented similarly to
-    // horizontal_add_uint16x8(), but one less vpaddl with 'total' when paired
+    // horizontal_add_u16x8(), but one less vpaddl with 'total' when paired
    // with the summation of 'sse' performed better on a Cortex-A15.
    const int32x4_t t0 = vpaddlq_s16(total);  // cascading summation of 'total'
    const int32x2_t t1 = vadd_s32(vget_low_s32(t0), vget_high_s32(t0));
@@ -185,7 +199,7 @@ int vpx_vector_var_neon(int16_t const *ref, int16_t const *src, const int bwl) {
  }
 }

-void vpx_minmax_8x8_neon(const uint8_t *a, int a_stride, const uint8_t *b,
+void aom_minmax_8x8_neon(const uint8_t *a, int a_stride, const uint8_t *b,
                         int b_stride, int *min, int *max) {
  // Load and concatenate.
  const uint8x16_t a01 = vcombine_u8(vld1_u8(a), vld1_u8(a + a_stride));
--- a/aom_dsp/arm/bilinear_filter_media.asm
+++ b/aom_dsp/arm/bilinear_filter_media.asm
@@ -0,0 +1,240 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+
+    EXPORT  |aom_filter_block2d_bil_first_pass_media|
+    EXPORT  |aom_filter_block2d_bil_second_pass_media|
+
+    AREA    |.text|, CODE, READONLY  ; name this block of code
+
+;-------------------------------------
+; r0    unsigned char  *src_ptr,
+; r1    unsigned short *dst_ptr,
+; r2    unsigned int    src_pitch,
+; r3    unsigned int    height,
+; stack unsigned int    width,
+; stack const short    *aom_filter
+;-------------------------------------
+; The output is stored transposed in the output array to make second pass filtering easy.
+|aom_filter_block2d_bil_first_pass_media| PROC
+    stmdb   sp!, {r4 - r11, lr}
+
+    ldr     r11, [sp, #40]                  ; aom_filter address
+    ldr     r4, [sp, #36]                   ; width
+
+    mov     r12, r3                         ; outer-loop counter
+
+    add     r7, r2, r4                      ; preload next row
+    pld     [r0, r7]
+
+    sub     r2, r2, r4                      ; src increment for height loop
+
+    ldr     r5, [r11]                       ; load up filter coefficients
+
+    mov     r3, r3, lsl #1                  ; height*2
+    add     r3, r3, #2                      ; plus 2 to make output buffer 4-byte aligned since height is actually (height+1)
+
+    mov     r11, r1                         ; save dst_ptr for each row
+
+    cmp     r5, #128                        ; if filter coef = 128, then skip the filter
+    beq     bil_null_1st_filter
+
+|bil_height_loop_1st_v6|
+    ldrb    r6, [r0]                        ; load source data
+    ldrb    r7, [r0, #1]
+    ldrb    r8, [r0, #2]
+    mov     lr, r4, lsr #2                  ; 4-in-parallel loop counter
+
+|bil_width_loop_1st_v6|
+    ldrb    r9, [r0, #3]
+    ldrb    r10, [r0, #4]
+
+    pkhbt   r6, r6, r7, lsl #16             ; src[1] | src[0]
+    pkhbt   r7, r7, r8, lsl #16             ; src[2] | src[1]
+
+    smuad   r6, r6, r5                      ; apply the filter
+    pkhbt   r8, r8, r9, lsl #16             ; src[3] | src[2]
+    smuad   r7, r7, r5
+    pkhbt   r9, r9, r10, lsl #16            ; src[4] | src[3]
+
+    smuad   r8, r8, r5
+    smuad   r9, r9, r5
+
+    add     r0, r0, #4
+    subs    lr, lr, #1
+
+    add     r6, r6, #0x40                   ; round_shift_and_clamp
+    add     r7, r7, #0x40
+    usat    r6, #16, r6, asr #7
+    usat    r7, #16, r7, asr #7
+
+    strh    r6, [r1], r3                    ; result is transposed and stored
+
+    add     r8, r8, #0x40                   ; round_shift_and_clamp
+    strh    r7, [r1], r3
+    add     r9, r9, #0x40
+    usat    r8, #16, r8, asr #7
+    usat    r9, #16, r9, asr #7
+
+    strh    r8, [r1], r3                    ; result is transposed and stored
+
+    ldrneb  r6, [r0]                        ; load source data
+    strh    r9, [r1], r3
+
+    ldrneb  r7, [r0, #1]
+    ldrneb  r8, [r0, #2]
+
+    bne     bil_width_loop_1st_v6
+
+    add     r0, r0, r2                      ; move to next input row
+    subs    r12, r12, #1
+
+    add     r9, r2, r4, lsl #1              ; adding back block width
+    pld     [r0, r9]                        ; preload next row
+
+    add     r11, r11, #2                    ; move over to next column
+    mov     r1, r11
+
+    bne     bil_height_loop_1st_v6
+
+    ldmia   sp!, {r4 - r11, pc}
+
+|bil_null_1st_filter|
+|bil_height_loop_null_1st|
+    mov     lr, r4, lsr #2                  ; loop counter
+
+|bil_width_loop_null_1st|
+    ldrb    r6, [r0]                        ; load data
+    ldrb    r7, [r0, #1]
+    ldrb    r8, [r0, #2]
+    ldrb    r9, [r0, #3]
+
+    strh    r6, [r1], r3                    ; store it to intermediate buffer
+    add     r0, r0, #4
+    strh    r7, [r1], r3
+    subs    lr, lr, #1
+    strh    r8, [r1], r3
+    strh    r9, [r1], r3
+
+    bne     bil_width_loop_null_1st
+
+    subs    r12, r12, #1
+    add     r0, r0, r2                      ; move to next input line
+    add     r11, r11, #2                    ; move over to next column
+    mov     r1, r11
+
+    bne     bil_height_loop_null_1st
+
+    ldmia   sp!, {r4 - r11, pc}
+
+    ENDP  ; |aom_filter_block2d_bil_first_pass_media|
+
+
+;---------------------------------
+; r0    unsigned short *src_ptr,
+; r1    unsigned char  *dst_ptr,
+; r2    int             dst_pitch,
+; r3    unsigned int    height,
+; stack unsigned int    width,
+; stack const short    *aom_filter
+;---------------------------------
+|aom_filter_block2d_bil_second_pass_media| PROC
+    stmdb   sp!, {r4 - r11, lr}
+
+    ldr     r11, [sp, #40]                  ; aom_filter address
+    ldr     r4, [sp, #36]                   ; width
+
+    ldr     r5, [r11]                       ; load up filter coefficients
+    mov     r12, r4                         ; outer-loop counter = width, since we work on transposed data matrix
+    mov     r11, r1
+
+    cmp     r5, #128                        ; if filter coef = 128, then skip the filter
+    beq     bil_null_2nd_filter
+
+|bil_height_loop_2nd|
+    ldr     r6, [r0]                        ; load the data
+    ldr     r8, [r0, #4]
+    ldrh    r10, [r0, #8]
+    mov     lr, r3, lsr #2                  ; loop counter
+
+|bil_width_loop_2nd|
+    pkhtb   r7, r6, r8                      ; src[1] | src[2]
+    pkhtb   r9, r8, r10                     ; src[3] | src[4]
+
+    smuad   r6, r6, r5                      ; apply filter
+    smuad   r8, r8, r5                      ; apply filter
+
+    subs    lr, lr, #1
+
+    smuadx  r7, r7, r5                      ; apply filter
+    smuadx  r9, r9, r5                      ; apply filter
+
+    add     r0, r0, #8
+
+    add     r6, r6, #0x40                   ; round_shift_and_clamp
+    add     r7, r7, #0x40
+    usat    r6, #8, r6, asr #7
+    usat    r7, #8, r7, asr #7
+    strb    r6, [r1], r2                    ; the result is transposed back and stored
+
+    add     r8, r8, #0x40                   ; round_shift_and_clamp
+    strb    r7, [r1], r2
+    add     r9, r9, #0x40
+    usat    r8, #8, r8, asr #7
+    usat    r9, #8, r9, asr #7
+    strb    r8, [r1], r2                    ; the result is transposed back and stored
+
+    ldrne   r6, [r0]                        ; load data
+    strb    r9, [r1], r2
+    ldrne   r8, [r0, #4]
+    ldrneh  r10, [r0, #8]
+
+    bne     bil_width_loop_2nd
+
+    subs    r12, r12, #1
+    add     r0, r0, #4                      ; update src for next row
+    add     r11, r11, #1
+    mov     r1, r11
+
+    bne     bil_height_loop_2nd
+    ldmia   sp!, {r4 - r11, pc}
+
+|bil_null_2nd_filter|
+|bil_height_loop_null_2nd|
+    mov     lr, r3, lsr #2
+
+|bil_width_loop_null_2nd|
+    ldr     r6, [r0], #4                    ; load data
+    subs    lr, lr, #1
+    ldr     r8, [r0], #4
+
+    strb    r6, [r1], r2                    ; store data
+    mov     r7, r6, lsr #16
+    strb    r7, [r1], r2
+    mov     r9, r8, lsr #16
+    strb    r8, [r1], r2
+    strb    r9, [r1], r2
+
+    bne     bil_width_loop_null_2nd
+
+    subs    r12, r12, #1
+    add     r0, r0, #4
+    add     r11, r11, #1
+    mov     r1, r11
+
+    bne     bil_height_loop_null_2nd
+
+    ldmia   sp!, {r4 - r11, pc}
+    ENDP  ; |aom_filter_block2d_bil_second_pass_media|
+
+    END
--- a/aom_dsp/arm/fwd_txfm_neon.c
+++ b/aom_dsp/arm/fwd_txfm_neon.c
@@ -1,23 +1,20 @@
 /*
- *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

 #include <arm_neon.h>

-#include "./vpx_config.h"
-#include "vpx_dsp/txfm_common.h"
-#include "vpx_dsp/vpx_dsp_common.h"
-#include "vpx_dsp/arm/idct_neon.h"
-#include "vpx_dsp/arm/mem_neon.h"
+#include "./aom_config.h"
+#include "aom_dsp/txfm_common.h"

-void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *final_output,
-                      int stride) {
+void aom_fdct8x8_neon(const int16_t *input, int16_t *final_output, int stride) {
  int i;
  // stage 1
  int16x8_t input_0 = vshlq_n_s16(vld1q_s16(&input[0 * stride]), 2);
@@ -48,18 +45,18 @@ void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *final_output,
    int32x4_t v_t0_hi = vaddl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1));
    int32x4_t v_t1_lo = vsubl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1));
    int32x4_t v_t1_hi = vsubl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1));
-    int32x4_t v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), cospi_24_64);
-    int32x4_t v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), cospi_24_64);
-    int32x4_t v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_24_64);
-    int32x4_t v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_24_64);
-    v_t2_lo = vmlal_n_s16(v_t2_lo, vget_low_s16(v_x3), cospi_8_64);
-    v_t2_hi = vmlal_n_s16(v_t2_hi, vget_high_s16(v_x3), cospi_8_64);
-    v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x2), cospi_8_64);
-    v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x2), cospi_8_64);
-    v_t0_lo = vmulq_n_s32(v_t0_lo, cospi_16_64);
-    v_t0_hi = vmulq_n_s32(v_t0_hi, cospi_16_64);
-    v_t1_lo = vmulq_n_s32(v_t1_lo, cospi_16_64);
-    v_t1_hi = vmulq_n_s32(v_t1_hi, cospi_16_64);
+    int32x4_t v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), (int16_t)cospi_24_64);
+    int32x4_t v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), (int16_t)cospi_24_64);
+    int32x4_t v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_24_64);
+    int32x4_t v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_24_64);
+    v_t2_lo = vmlal_n_s16(v_t2_lo, vget_low_s16(v_x3), (int16_t)cospi_8_64);
+    v_t2_hi = vmlal_n_s16(v_t2_hi, vget_high_s16(v_x3), (int16_t)cospi_8_64);
+    v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x2), (int16_t)cospi_8_64);
+    v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x2), (int16_t)cospi_8_64);
+    v_t0_lo = vmulq_n_s32(v_t0_lo, (int32_t)cospi_16_64);
+    v_t0_hi = vmulq_n_s32(v_t0_hi, (int32_t)cospi_16_64);
+    v_t1_lo = vmulq_n_s32(v_t1_lo, (int32_t)cospi_16_64);
+    v_t1_hi = vmulq_n_s32(v_t1_hi, (int32_t)cospi_16_64);
    {
      const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
      const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
@@ -77,10 +74,10 @@ void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *final_output,
    // Stage 2
    v_x0 = vsubq_s16(v_s6, v_s5);
    v_x1 = vaddq_s16(v_s6, v_s5);
-    v_t0_lo = vmull_n_s16(vget_low_s16(v_x0), cospi_16_64);
-    v_t0_hi = vmull_n_s16(vget_high_s16(v_x0), cospi_16_64);
-    v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), cospi_16_64);
-    v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), cospi_16_64);
+    v_t0_lo = vmull_n_s16(vget_low_s16(v_x0), (int16_t)cospi_16_64);
+    v_t0_hi = vmull_n_s16(vget_high_s16(v_x0), (int16_t)cospi_16_64);
+    v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), (int16_t)cospi_16_64);
+    v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), (int16_t)cospi_16_64);
    {
      const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
      const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
@@ -95,22 +92,22 @@ void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *final_output,
      v_x3 = vaddq_s16(v_s7, cd);
    }
    // Stage 4
-    v_t0_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_4_64);
-    v_t0_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_4_64);
-    v_t0_lo = vmlal_n_s16(v_t0_lo, vget_low_s16(v_x0), cospi_28_64);
-    v_t0_hi = vmlal_n_s16(v_t0_hi, vget_high_s16(v_x0), cospi_28_64);
-    v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), cospi_12_64);
-    v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), cospi_12_64);
-    v_t1_lo = vmlal_n_s16(v_t1_lo, vget_low_s16(v_x2), cospi_20_64);
-    v_t1_hi = vmlal_n_s16(v_t1_hi, vget_high_s16(v_x2), cospi_20_64);
-    v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), cospi_12_64);
-    v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), cospi_12_64);
-    v_t2_lo = vmlsl_n_s16(v_t2_lo, vget_low_s16(v_x1), cospi_20_64);
-    v_t2_hi = vmlsl_n_s16(v_t2_hi, vget_high_s16(v_x1), cospi_20_64);
-    v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_28_64);
-    v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_28_64);
-    v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x0), cospi_4_64);
-    v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x0), cospi_4_64);
+    v_t0_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_4_64);
+    v_t0_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_4_64);
+    v_t0_lo = vmlal_n_s16(v_t0_lo, vget_low_s16(v_x0), (int16_t)cospi_28_64);
+    v_t0_hi = vmlal_n_s16(v_t0_hi, vget_high_s16(v_x0), (int16_t)cospi_28_64);
+    v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), (int16_t)cospi_12_64);
+    v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), (int16_t)cospi_12_64);
+    v_t1_lo = vmlal_n_s16(v_t1_lo, vget_low_s16(v_x2), (int16_t)cospi_20_64);
+    v_t1_hi = vmlal_n_s16(v_t1_hi, vget_high_s16(v_x2), (int16_t)cospi_20_64);
+    v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), (int16_t)cospi_12_64);
+    v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), (int16_t)cospi_12_64);
+    v_t2_lo = vmlsl_n_s16(v_t2_lo, vget_low_s16(v_x1), (int16_t)cospi_20_64);
+    v_t2_hi = vmlsl_n_s16(v_t2_hi, vget_high_s16(v_x1), (int16_t)cospi_20_64);
+    v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_28_64);
+    v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_28_64);
+    v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x0), (int16_t)cospi_4_64);
+    v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x0), (int16_t)cospi_4_64);
    {
      const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
      const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
@@ -126,8 +123,6 @@ void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *final_output,
      out_7 = vcombine_s16(f, h);  // 34 35 36 37 74 75 76 77
    }
    // transpose 8x8
-    // Can't use transpose_s16_8x8() because the values are arranged in two 4x8
-    // columns.
    {
      // 00 01 02 03 40 41 42 43
      // 10 11 12 13 50 51 52 53
@@ -176,7 +171,7 @@ void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *final_output,
    }
  }  // for
  {
-    // from vpx_dct_sse2.c
+    // from aom_dct_sse2.c
    // Post-condition (division by two)
    //    division of two 16 bits signed numbers using shifts
    //    n / 2 = (n - (n >> 15)) >> 1
@@ -197,13 +192,30 @@ void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *final_output,
    input_6 = vhsubq_s16(input_6, sign_in6);
    input_7 = vhsubq_s16(input_7, sign_in7);
    // store results
-    store_s16q_to_tran_low(final_output + 0 * 8, input_0);
-    store_s16q_to_tran_low(final_output + 1 * 8, input_1);
-    store_s16q_to_tran_low(final_output + 2 * 8, input_2);
-    store_s16q_to_tran_low(final_output + 3 * 8, input_3);
-    store_s16q_to_tran_low(final_output + 4 * 8, input_4);
-    store_s16q_to_tran_low(final_output + 5 * 8, input_5);
-    store_s16q_to_tran_low(final_output + 6 * 8, input_6);
-    store_s16q_to_tran_low(final_output + 7 * 8, input_7);
+    vst1q_s16(&final_output[0 * 8], input_0);
+    vst1q_s16(&final_output[1 * 8], input_1);
+    vst1q_s16(&final_output[2 * 8], input_2);
+    vst1q_s16(&final_output[3 * 8], input_3);
+    vst1q_s16(&final_output[4 * 8], input_4);
+    vst1q_s16(&final_output[5 * 8], input_5);
+    vst1q_s16(&final_output[6 * 8], input_6);
+    vst1q_s16(&final_output[7 * 8], input_7);
+  }
+}
+
+void aom_fdct8x8_1_neon(const int16_t *input, int16_t *output, int stride) {
+  int r;
+  int16x8_t sum = vld1q_s16(&input[0]);
+  for (r = 1; r < 8; ++r) {
+    const int16x8_t input_00 = vld1q_s16(&input[r * stride]);
+    sum = vaddq_s16(sum, input_00);
+  }
+  {
+    const int32x4_t a = vpaddlq_s16(sum);
+    const int64x2_t b = vpaddlq_s32(a);
+    const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
+                                 vreinterpret_s32_s64(vget_high_s64(b)));
+    output[0] = vget_lane_s16(vreinterpret_s16_s32(c), 0);
+    output[1] = 0;
  }
 }
--- a/aom_dsp/arm/hadamard_neon.c
+++ b/aom_dsp/arm/hadamard_neon.c
@@ -0,0 +1,199 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./aom_dsp_rtcd.h"
+
+static void hadamard8x8_one_pass(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2,
+                                 int16x8_t *a3, int16x8_t *a4, int16x8_t *a5,
+                                 int16x8_t *a6, int16x8_t *a7) {
+  const int16x8_t b0 = vaddq_s16(*a0, *a1);
+  const int16x8_t b1 = vsubq_s16(*a0, *a1);
+  const int16x8_t b2 = vaddq_s16(*a2, *a3);
+  const int16x8_t b3 = vsubq_s16(*a2, *a3);
+  const int16x8_t b4 = vaddq_s16(*a4, *a5);
+  const int16x8_t b5 = vsubq_s16(*a4, *a5);
+  const int16x8_t b6 = vaddq_s16(*a6, *a7);
+  const int16x8_t b7 = vsubq_s16(*a6, *a7);
+
+  const int16x8_t c0 = vaddq_s16(b0, b2);
+  const int16x8_t c1 = vaddq_s16(b1, b3);
+  const int16x8_t c2 = vsubq_s16(b0, b2);
+  const int16x8_t c3 = vsubq_s16(b1, b3);
+  const int16x8_t c4 = vaddq_s16(b4, b6);
+  const int16x8_t c5 = vaddq_s16(b5, b7);
+  const int16x8_t c6 = vsubq_s16(b4, b6);
+  const int16x8_t c7 = vsubq_s16(b5, b7);
+
+  *a0 = vaddq_s16(c0, c4);
+  *a1 = vsubq_s16(c2, c6);
+  *a2 = vsubq_s16(c0, c4);
+  *a3 = vaddq_s16(c2, c6);
+  *a4 = vaddq_s16(c3, c7);
+  *a5 = vsubq_s16(c3, c7);
+  *a6 = vsubq_s16(c1, c5);
+  *a7 = vaddq_s16(c1, c5);
+}
+
+// TODO(johannkoenig): Make a transpose library and dedup with idct. Consider
+// reversing transpose order which may make it easier for the compiler to
+// reconcile the vtrn.64 moves.
+static void transpose8x8(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2,
+                         int16x8_t *a3, int16x8_t *a4, int16x8_t *a5,
+                         int16x8_t *a6, int16x8_t *a7) {
+  // Swap 64 bit elements. Goes from:
+  // a0: 00 01 02 03 04 05 06 07
+  // a1: 08 09 10 11 12 13 14 15
+  // a2: 16 17 18 19 20 21 22 23
+  // a3: 24 25 26 27 28 29 30 31
+  // a4: 32 33 34 35 36 37 38 39
+  // a5: 40 41 42 43 44 45 46 47
+  // a6: 48 49 50 51 52 53 54 55
+  // a7: 56 57 58 59 60 61 62 63
+  // to:
+  // a04_lo: 00 01 02 03 32 33 34 35
+  // a15_lo: 08 09 10 11 40 41 42 43
+  // a26_lo: 16 17 18 19 48 49 50 51
+  // a37_lo: 24 25 26 27 56 57 58 59
+  // a04_hi: 04 05 06 07 36 37 38 39
+  // a15_hi: 12 13 14 15 44 45 46 47
+  // a26_hi: 20 21 22 23 52 53 54 55
+  // a37_hi: 28 29 30 31 60 61 62 63
+  const int16x8_t a04_lo = vcombine_s16(vget_low_s16(*a0), vget_low_s16(*a4));
+  const int16x8_t a15_lo = vcombine_s16(vget_low_s16(*a1), vget_low_s16(*a5));
+  const int16x8_t a26_lo = vcombine_s16(vget_low_s16(*a2), vget_low_s16(*a6));
+  const int16x8_t a37_lo = vcombine_s16(vget_low_s16(*a3), vget_low_s16(*a7));
+  const int16x8_t a04_hi = vcombine_s16(vget_high_s16(*a0), vget_high_s16(*a4));
+  const int16x8_t a15_hi = vcombine_s16(vget_high_s16(*a1), vget_high_s16(*a5));
+  const int16x8_t a26_hi = vcombine_s16(vget_high_s16(*a2), vget_high_s16(*a6));
+  const int16x8_t a37_hi = vcombine_s16(vget_high_s16(*a3), vget_high_s16(*a7));
+
+  // Swap 32 bit elements resulting in:
+  // a0246_lo:
+  // 00 01 16 17 32 33 48 49
+  // 02 03 18 19 34 35 50 51
+  // a1357_lo:
+  // 08 09 24 25 40 41 56 57
+  // 10 11 26 27 42 43 58 59
+  // a0246_hi:
+  // 04 05 20 21 36 37 52 53
+  // 06 07 22 23 38 39 54 55
+  // a1357_hi:
+  // 12 13 28 29 44 45 60 61
+  // 14 15 30 31 46 47 62 63
+  const int32x4x2_t a0246_lo =
+      vtrnq_s32(vreinterpretq_s32_s16(a04_lo), vreinterpretq_s32_s16(a26_lo));
+  const int32x4x2_t a1357_lo =
+      vtrnq_s32(vreinterpretq_s32_s16(a15_lo), vreinterpretq_s32_s16(a37_lo));
+  const int32x4x2_t a0246_hi =
+      vtrnq_s32(vreinterpretq_s32_s16(a04_hi), vreinterpretq_s32_s16(a26_hi));
+  const int32x4x2_t a1357_hi =
+      vtrnq_s32(vreinterpretq_s32_s16(a15_hi), vreinterpretq_s32_s16(a37_hi));
+
+  // Swap 16 bit elements resulting in:
+  // b0:
+  // 00 08 16 24 32 40 48 56
+  // 01 09 17 25 33 41 49 57
+  // b1:
+  // 02 10 18 26 34 42 50 58
+  // 03 11 19 27 35 43 51 59
+  // b2:
+  // 04 12 20 28 36 44 52 60
+  // 05 13 21 29 37 45 53 61
+  // b3:
+  // 06 14 22 30 38 46 54 62
+  // 07 15 23 31 39 47 55 63
+  const int16x8x2_t b0 = vtrnq_s16(vreinterpretq_s16_s32(a0246_lo.val[0]),
+                                   vreinterpretq_s16_s32(a1357_lo.val[0]));
+  const int16x8x2_t b1 = vtrnq_s16(vreinterpretq_s16_s32(a0246_lo.val[1]),
+                                   vreinterpretq_s16_s32(a1357_lo.val[1]));
+  const int16x8x2_t b2 = vtrnq_s16(vreinterpretq_s16_s32(a0246_hi.val[0]),
+                                   vreinterpretq_s16_s32(a1357_hi.val[0]));
+  const int16x8x2_t b3 = vtrnq_s16(vreinterpretq_s16_s32(a0246_hi.val[1]),
+                                   vreinterpretq_s16_s32(a1357_hi.val[1]));
+
+  *a0 = b0.val[0];
+  *a1 = b0.val[1];
+  *a2 = b1.val[0];
+  *a3 = b1.val[1];
+  *a4 = b2.val[0];
+  *a5 = b2.val[1];
+  *a6 = b3.val[0];
+  *a7 = b3.val[1];
+}
+
+void aom_hadamard_8x8_neon(const int16_t *src_diff, int src_stride,
+                           int16_t *coeff) {
+  int16x8_t a0 = vld1q_s16(src_diff);
+  int16x8_t a1 = vld1q_s16(src_diff + src_stride);
+  int16x8_t a2 = vld1q_s16(src_diff + 2 * src_stride);
+  int16x8_t a3 = vld1q_s16(src_diff + 3 * src_stride);
+  int16x8_t a4 = vld1q_s16(src_diff + 4 * src_stride);
+  int16x8_t a5 = vld1q_s16(src_diff + 5 * src_stride);
+  int16x8_t a6 = vld1q_s16(src_diff + 6 * src_stride);
+  int16x8_t a7 = vld1q_s16(src_diff + 7 * src_stride);
+
+  hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
+
+  transpose8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
+
+  hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
+
+  // Skip the second transpose because it is not required.
+
+  vst1q_s16(coeff + 0, a0);
+  vst1q_s16(coeff + 8, a1);
+  vst1q_s16(coeff + 16, a2);
+  vst1q_s16(coeff + 24, a3);
+  vst1q_s16(coeff + 32, a4);
+  vst1q_s16(coeff + 40, a5);
+  vst1q_s16(coeff + 48, a6);
+  vst1q_s16(coeff + 56, a7);
+}
+
+void aom_hadamard_16x16_neon(const int16_t *src_diff, int src_stride,
+                             int16_t *coeff) {
+  int i;
+
+  /* Rearrange 16x16 to 8x32 and remove stride.
+   * Top left first. */
+  aom_hadamard_8x8_neon(src_diff + 0 + 0 * src_stride, src_stride, coeff + 0);
+  /* Top right. */
+  aom_hadamard_8x8_neon(src_diff + 8 + 0 * src_stride, src_stride, coeff + 64);
+  /* Bottom left. */
+  aom_hadamard_8x8_neon(src_diff + 0 + 8 * src_stride, src_stride, coeff + 128);
+  /* Bottom right. */
+  aom_hadamard_8x8_neon(src_diff + 8 + 8 * src_stride, src_stride, coeff + 192);
+
+  for (i = 0; i < 64; i += 8) {
+    const int16x8_t a0 = vld1q_s16(coeff + 0);
+    const int16x8_t a1 = vld1q_s16(coeff + 64);
+    const int16x8_t a2 = vld1q_s16(coeff + 128);
+    const int16x8_t a3 = vld1q_s16(coeff + 192);
+
+    const int16x8_t b0 = vhaddq_s16(a0, a1);
+    const int16x8_t b1 = vhsubq_s16(a0, a1);
+    const int16x8_t b2 = vhaddq_s16(a2, a3);
+    const int16x8_t b3 = vhsubq_s16(a2, a3);
+
+    const int16x8_t c0 = vaddq_s16(b0, b2);
+    const int16x8_t c1 = vaddq_s16(b1, b3);
+    const int16x8_t c2 = vsubq_s16(b0, b2);
+    const int16x8_t c3 = vsubq_s16(b1, b3);
+
+    vst1q_s16(coeff + 0, c0);
+    vst1q_s16(coeff + 64, c1);
+    vst1q_s16(coeff + 128, c2);
+    vst1q_s16(coeff + 192, c3);
+
+    coeff += 8;
+  }
+}
--- a/aom_dsp/arm/idct16x16_1_add_neon.asm
+++ b/aom_dsp/arm/idct16x16_1_add_neon.asm
@@ -0,0 +1,201 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+
+
+    EXPORT  |aom_idct16x16_1_add_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+;void aom_idct16x16_1_add_neon(int16_t *input, uint8_t *dest,
+;                                    int dest_stride)
+;
+; DC-only 16x16 inverse DCT: derives a single value a1 from input[0] and adds
+; it, with saturation to 8 bits, to every pixel of the 16x16 block at dest.
+;
+; r0  int16_t *input
+; r1  uint8_t *dest
+; r2  int dest_stride
+
+|aom_idct16x16_1_add_neon| PROC
+    ldrsh            r0, [r0]
+
+    ; generate cospi_16_64 = 11585
+    mov              r12, #0x2d00
+    add              r12, #0x41
+
+    ; out = dct_const_round_shift(input[0] * cospi_16_64)
+    mul              r0, r0, r12               ; input[0] * cospi_16_64
+    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
+    asr              r0, r0, #14               ; >> DCT_CONST_BITS
+
+    ; out = dct_const_round_shift(out * cospi_16_64)
+    mul              r0, r0, r12               ; out * cospi_16_64
+    mov              r12, r1                   ; save dest
+    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
+    asr              r0, r0, #14               ; >> DCT_CONST_BITS
+
+    ; a1 = ROUND_POWER_OF_TWO(out, 6)
+    add              r0, r0, #32               ; + (1 <<((6) - 1))
+    asr              r0, r0, #6                ; >> 6
+
+    vdup.s16         q0, r0                    ; duplicate a1
+    ; Each 16-pixel row is handled as two 8-byte halves. r0 (= 8) is the
+    ; post-increment from the left half to the right half of a row, and
+    ; r2 (= dest_stride - 8) is the post-increment from the right half to
+    ; the start of the next row. Loads walk r1; stores walk r12, the copy of
+    ; dest saved above.
+    mov              r0, #8
+    sub              r2, #8
+
+    ; load destination data row0 - row3
+    vld1.64          {d2}, [r1], r0
+    vld1.64          {d3}, [r1], r2
+    vld1.64          {d4}, [r1], r0
+    vld1.64          {d5}, [r1], r2
+    vld1.64          {d6}, [r1], r0
+    vld1.64          {d7}, [r1], r2
+    vld1.64          {d16}, [r1], r0
+    vld1.64          {d17}, [r1], r2
+
+    vaddw.u8         q9, q0, d2                ; dest[x] + a1
+    vaddw.u8         q10, q0, d3               ; dest[x] + a1
+    vaddw.u8         q11, q0, d4               ; dest[x] + a1
+    vaddw.u8         q12, q0, d5               ; dest[x] + a1
+    vqmovun.s16      d2, q9                    ; clip_pixel
+    vqmovun.s16      d3, q10                   ; clip_pixel
+    vqmovun.s16      d30, q11                  ; clip_pixel
+    vqmovun.s16      d31, q12                  ; clip_pixel
+    vst1.64          {d2}, [r12], r0
+    vst1.64          {d3}, [r12], r2
+    vst1.64          {d30}, [r12], r0
+    vst1.64          {d31}, [r12], r2
+
+    vaddw.u8         q9, q0, d6                 ; dest[x] + a1
+    vaddw.u8         q10, q0, d7                ; dest[x] + a1
+    vaddw.u8         q11, q0, d16               ; dest[x] + a1
+    vaddw.u8         q12, q0, d17               ; dest[x] + a1
+    vqmovun.s16      d2, q9                     ; clip_pixel
+    vqmovun.s16      d3, q10                    ; clip_pixel
+    vqmovun.s16      d30, q11                   ; clip_pixel
+    vqmovun.s16      d31, q12                   ; clip_pixel
+    vst1.64          {d2}, [r12], r0
+    vst1.64          {d3}, [r12], r2
+    vst1.64          {d30}, [r12], r0
+    vst1.64          {d31}, [r12], r2
+
+    ; load destination data row4 - row7
+    vld1.64          {d2}, [r1], r0
+    vld1.64          {d3}, [r1], r2
+    vld1.64          {d4}, [r1], r0
+    vld1.64          {d5}, [r1], r2
+    vld1.64          {d6}, [r1], r0
+    vld1.64          {d7}, [r1], r2
+    vld1.64          {d16}, [r1], r0
+    vld1.64          {d17}, [r1], r2
+
+    vaddw.u8         q9, q0, d2                ; dest[x] + a1
+    vaddw.u8         q10, q0, d3               ; dest[x] + a1
+    vaddw.u8         q11, q0, d4               ; dest[x] + a1
+    vaddw.u8         q12, q0, d5               ; dest[x] + a1
+    vqmovun.s16      d2, q9                    ; clip_pixel
+    vqmovun.s16      d3, q10                   ; clip_pixel
+    vqmovun.s16      d30, q11                  ; clip_pixel
+    vqmovun.s16      d31, q12                  ; clip_pixel
+    vst1.64          {d2}, [r12], r0
+    vst1.64          {d3}, [r12], r2
+    vst1.64          {d30}, [r12], r0
+    vst1.64          {d31}, [r12], r2
+
+    vaddw.u8         q9, q0, d6                 ; dest[x] + a1
+    vaddw.u8         q10, q0, d7                ; dest[x] + a1
+    vaddw.u8         q11, q0, d16               ; dest[x] + a1
+    vaddw.u8         q12, q0, d17               ; dest[x] + a1
+    vqmovun.s16      d2, q9                     ; clip_pixel
+    vqmovun.s16      d3, q10                    ; clip_pixel
+    vqmovun.s16      d30, q11                   ; clip_pixel
+    vqmovun.s16      d31, q12                   ; clip_pixel
+    vst1.64          {d2}, [r12], r0
+    vst1.64          {d3}, [r12], r2
+    vst1.64          {d30}, [r12], r0
+    vst1.64          {d31}, [r12], r2
+
+    ; load destination data row8 - row11
+    vld1.64          {d2}, [r1], r0
+    vld1.64          {d3}, [r1], r2
+    vld1.64          {d4}, [r1], r0
+    vld1.64          {d5}, [r1], r2
+    vld1.64          {d6}, [r1], r0
+    vld1.64          {d7}, [r1], r2
+    vld1.64          {d16}, [r1], r0
+    vld1.64          {d17}, [r1], r2
+
+    vaddw.u8         q9, q0, d2                ; dest[x] + a1
+    vaddw.u8         q10, q0, d3               ; dest[x] + a1
+    vaddw.u8         q11, q0, d4               ; dest[x] + a1
+    vaddw.u8         q12, q0, d5               ; dest[x] + a1
+    vqmovun.s16      d2, q9                    ; clip_pixel
+    vqmovun.s16      d3, q10                   ; clip_pixel
+    vqmovun.s16      d30, q11                  ; clip_pixel
+    vqmovun.s16      d31, q12                  ; clip_pixel
+    vst1.64          {d2}, [r12], r0
+    vst1.64          {d3}, [r12], r2
+    vst1.64          {d30}, [r12], r0
+    vst1.64          {d31}, [r12], r2
+
+    vaddw.u8         q9, q0, d6                 ; dest[x] + a1
+    vaddw.u8         q10, q0, d7                ; dest[x] + a1
+    vaddw.u8         q11, q0, d16               ; dest[x] + a1
+    vaddw.u8         q12, q0, d17               ; dest[x] + a1
+    vqmovun.s16      d2, q9                     ; clip_pixel
+    vqmovun.s16      d3, q10                    ; clip_pixel
+    vqmovun.s16      d30, q11                   ; clip_pixel
+    vqmovun.s16      d31, q12                   ; clip_pixel
+    vst1.64          {d2}, [r12], r0
+    vst1.64          {d3}, [r12], r2
+    vst1.64          {d30}, [r12], r0
+    vst1.64          {d31}, [r12], r2
+
+    ; load destination data row12 - row15
+    vld1.64          {d2}, [r1], r0
+    vld1.64          {d3}, [r1], r2
+    vld1.64          {d4}, [r1], r0
+    vld1.64          {d5}, [r1], r2
+    vld1.64          {d6}, [r1], r0
+    vld1.64          {d7}, [r1], r2
+    vld1.64          {d16}, [r1], r0
+    vld1.64          {d17}, [r1], r2
+
+    vaddw.u8         q9, q0, d2                ; dest[x] + a1
+    vaddw.u8         q10, q0, d3               ; dest[x] + a1
+    vaddw.u8         q11, q0, d4               ; dest[x] + a1
+    vaddw.u8         q12, q0, d5               ; dest[x] + a1
+    vqmovun.s16      d2, q9                    ; clip_pixel
+    vqmovun.s16      d3, q10                   ; clip_pixel
+    vqmovun.s16      d30, q11                  ; clip_pixel
+    vqmovun.s16      d31, q12                  ; clip_pixel
+    vst1.64          {d2}, [r12], r0
+    vst1.64          {d3}, [r12], r2
+    vst1.64          {d30}, [r12], r0
+    vst1.64          {d31}, [r12], r2
+
+    vaddw.u8         q9, q0, d6                 ; dest[x] + a1
+    vaddw.u8         q10, q0, d7                ; dest[x] + a1
+    vaddw.u8         q11, q0, d16               ; dest[x] + a1
+    vaddw.u8         q12, q0, d17               ; dest[x] + a1
+    vqmovun.s16      d2, q9                     ; clip_pixel
+    vqmovun.s16      d3, q10                    ; clip_pixel
+    vqmovun.s16      d30, q11                   ; clip_pixel
+    vqmovun.s16      d31, q12                   ; clip_pixel
+    vst1.64          {d2}, [r12], r0
+    vst1.64          {d3}, [r12], r2
+    vst1.64          {d30}, [r12], r0
+    vst1.64          {d31}, [r12], r2
+
+    bx               lr
+    ENDP             ; |aom_idct16x16_1_add_neon|
+
+    END
--- a/aom_dsp/arm/idct16x16_1_add_neon.c
+++ b/aom_dsp/arm/idct16x16_1_add_neon.c
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "aom_dsp/inv_txfm.h"
+#include "aom_ports/mem.h"
+
+// DC-only 16x16 inverse DCT: derives a single value from input[0] and adds it,
+// with saturation to 8 bits, to every pixel of a 16x16 block.
+//   input       - coefficient block; only input[0] is read
+//   dest        - top-left pixel of the block, updated in place
+//   dest_stride - distance in bytes between consecutive rows of dest
+void aom_idct16x16_1_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
+  uint8x8_t row0_lo, row0_hi, row1_lo, row1_hi;
+  uint16x8_t sum0_lo, sum0_hi, sum1_lo, sum1_hi;
+  uint16x8_t dc;
+  uint8_t *src = dest;
+  uint8_t *dst = dest;
+  int pair;
+  int16_t a1;
+  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
+  out = dct_const_round_shift(out * cospi_16_64);
+  a1 = ROUND_POWER_OF_TWO(out, 6);
+
+  // Broadcast a1 and keep its bit pattern in an unsigned vector so it can be
+  // fed to the widening add below.
+  dc = vreinterpretq_u16_s16(vdupq_n_s16(a1));
+
+  // Two rows per iteration, eight iterations for the 16 rows. Pixels are
+  // loaded and stored with the u8 intrinsics directly: dest is only known to
+  // be byte aligned, so it must not be cast to uint64_t *.
+  for (pair = 0; pair < 8; ++pair) {
+    row0_lo = vld1_u8(src);
+    row0_hi = vld1_u8(src + 8);
+    src += dest_stride;
+    row1_lo = vld1_u8(src);
+    row1_hi = vld1_u8(src + 8);
+    src += dest_stride;
+
+    // dest[x] + a1, computed modulo 2^16.
+    sum0_lo = vaddw_u8(dc, row0_lo);
+    sum0_hi = vaddw_u8(dc, row0_hi);
+    sum1_lo = vaddw_u8(dc, row1_lo);
+    sum1_hi = vaddw_u8(dc, row1_hi);
+
+    // Reinterpret as signed and narrow with unsigned saturation (clip_pixel).
+    vst1_u8(dst, vqmovun_s16(vreinterpretq_s16_u16(sum0_lo)));
+    vst1_u8(dst + 8, vqmovun_s16(vreinterpretq_s16_u16(sum0_hi)));
+    dst += dest_stride;
+    vst1_u8(dst, vqmovun_s16(vreinterpretq_s16_u16(sum1_lo)));
+    vst1_u8(dst + 8, vqmovun_s16(vreinterpretq_s16_u16(sum1_hi)));
+    dst += dest_stride;
+  }
+}
--- a/aom_dsp/arm/idct16x16_add_neon.asm
+++ b/aom_dsp/arm/idct16x16_add_neon.asm
--- a/aom_dsp/arm/idct16x16_add_neon.c
+++ b/aom_dsp/arm/idct16x16_add_neon.c
--- a/aom_dsp/arm/idct16x16_neon.c
+++ b/aom_dsp/arm/idct16x16_neon.c
@@ -0,0 +1,152 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/aom_dsp_common.h"
+
+// Pass helpers used by the 16x16 inverse DCT entry points below. They are only
+// declared in this file.
+// NOTE(review): presumably implemented in idct16x16_add_neon.{asm,c} - confirm.
+void aom_idct16x16_256_add_neon_pass1(const int16_t *input, int16_t *output,
+                                      int output_stride);
+void aom_idct16x16_256_add_neon_pass2(const int16_t *src, int16_t *output,
+                                      int16_t *pass1Output, int16_t skip_adding,
+                                      uint8_t *dest, int dest_stride);
+void aom_idct16x16_10_add_neon_pass1(const int16_t *input, int16_t *output,
+                                     int output_stride);
+void aom_idct16x16_10_add_neon_pass2(const int16_t *src, int16_t *output,
+                                     int16_t *pass1Output, int16_t skip_adding,
+                                     uint8_t *dest, int dest_stride);
+
+#if HAVE_NEON_ASM
+/* For ARM NEON, d8-d15 are callee-saved registers, and need to be saved. */
+// The entry points below pass an int64_t[8] buffer to both functions.
+extern void aom_push_neon(int64_t *store);
+extern void aom_pop_neon(int64_t *store);
+#endif  // HAVE_NEON_ASM
+
+// Full 16x16 inverse DCT with the result added to dest. The transform runs as
+// four half-block steps: a row transform on the upper and lower 8 rows, then a
+// column transform on the left and right 8 columns. Each step is a pass1 call
+// (even elements 0, 2, ..., 14, stage 6 result) followed by a pass2 call (odd
+// elements 1, 3, ..., 15, merged with the pass1 result in stage 7).
+//   input       - 16x16 coefficient block
+//   dest        - top-left pixel of the destination block
+//   dest_stride - stride of dest, forwarded to pass2
+void aom_idct16x16_256_add_neon(const int16_t *input, uint8_t *dest,
+                                int dest_stride) {
+#if HAVE_NEON_ASM
+  int64_t saved_d8_d15[8];
+#endif
+  int16_t stage6[16 * 16] = { 0 };
+  int16_t row_result[16 * 16] = { 0 };
+
+#if HAVE_NEON_ASM
+  // d8-d15 are callee-saved; stash them before the helpers run.
+  aom_push_neon(saved_d8_d15);
+#endif
+
+  // Rows, upper 8: pass2 is called with skip_adding == 0 and stores its
+  // stage 7 output in row_result.
+  aom_idct16x16_256_add_neon_pass1(input, stage6, 8);
+  aom_idct16x16_256_add_neon_pass2(input + 1, row_result, stage6, 0, dest,
+                                   dest_stride);
+
+  // Rows, lower 8: same sequence, reading from input + 8 * 16 and writing at
+  // row_result + 8.
+  aom_idct16x16_256_add_neon_pass1(input + 8 * 16, stage6, 8);
+  aom_idct16x16_256_add_neon_pass2(input + 8 * 16 + 1, row_result + 8, stage6,
+                                   0, dest, dest_stride);
+
+  // Columns, left 8: pass2 is called with skip_adding == 1 and adds its
+  // stage 7 output to the destination data.
+  aom_idct16x16_256_add_neon_pass1(row_result, stage6, 8);
+  aom_idct16x16_256_add_neon_pass2(row_result + 1, row_result, stage6, 1, dest,
+                                   dest_stride);
+
+  // Columns, right 8: same sequence, reading from row_result + 8 * 16 and
+  // targeting dest + 8.
+  aom_idct16x16_256_add_neon_pass1(row_result + 8 * 16, stage6, 8);
+  aom_idct16x16_256_add_neon_pass2(row_result + 8 * 16 + 1, row_result + 8,
+                                   stage6, 1, dest + 8, dest_stride);
+
+#if HAVE_NEON_ASM
+  // Put d8-d15 back.
+  aom_pop_neon(saved_d8_d15);
+#endif
+}
+
+// 16x16 inverse DCT variant that runs the row transform on the upper 8 rows
+// only (the lower 8 rows are treated as all zero), then the regular column
+// transform on both 8-column halves, adding the result to dest. Each step is a
+// pass1 call (even elements 0, 2, ..., 14, stage 6 result) followed by a pass2
+// call (odd elements 1, 3, ..., 15, merged with the pass1 result in stage 7).
+//   input       - 16x16 coefficient block
+//   dest        - top-left pixel of the destination block
+//   dest_stride - stride of dest, forwarded to pass2
+void aom_idct16x16_10_add_neon(const int16_t *input, uint8_t *dest,
+                               int dest_stride) {
+#if HAVE_NEON_ASM
+  int64_t saved_d8_d15[8];
+#endif
+  int16_t stage6[16 * 16] = { 0 };
+  // Zero-initialised; the skipped lower-row step never writes its part.
+  int16_t row_result[16 * 16] = { 0 };
+
+#if HAVE_NEON_ASM
+  // d8-d15 are callee-saved; stash them before the helpers run.
+  aom_push_neon(saved_d8_d15);
+#endif
+
+  // Rows, upper 8, using the "10" helpers: pass2 is called with
+  // skip_adding == 0 and stores its stage 7 output in row_result.
+  aom_idct16x16_10_add_neon_pass1(input, stage6, 8);
+  aom_idct16x16_10_add_neon_pass2(input + 1, row_result, stage6, 0, dest,
+                                  dest_stride);
+
+  // Rows, lower 8: skipped because they are all 0s.
+
+  // Columns, left 8, using the "256" helpers: pass2 is called with
+  // skip_adding == 1 and adds its stage 7 output to the destination data.
+  aom_idct16x16_256_add_neon_pass1(row_result, stage6, 8);
+  aom_idct16x16_256_add_neon_pass2(row_result + 1, row_result, stage6, 1, dest,
+                                   dest_stride);
+
+  // Columns, right 8: same sequence, reading from row_result + 8 * 16 and
+  // targeting dest + 8.
+  aom_idct16x16_256_add_neon_pass1(row_result + 8 * 16, stage6, 8);
+  aom_idct16x16_256_add_neon_pass2(row_result + 8 * 16 + 1, row_result + 8,
+                                   stage6, 1, dest + 8, dest_stride);
+
+#if HAVE_NEON_ASM
+  // Put d8-d15 back.
+  aom_pop_neon(saved_d8_d15);
+#endif
+}
--- a/aom_dsp/arm/idct32x32_1_add_neon.asm
+++ b/aom_dsp/arm/idct32x32_1_add_neon.asm
@@ -0,0 +1,147 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+
+    EXPORT  |aom_idct32x32_1_add_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+    ;TODO(hkuang): put the following macros in a separate
+    ;file so other idct functions could also use them.
+    ; LD_16x8 $src, $stride
+    ; Loads eight 16-byte rows into q8-q15, post-incrementing $src by $stride
+    ; after each row.
+    MACRO
+    LD_16x8          $src, $stride
+    vld1.8           {q8}, [$src], $stride
+    vld1.8           {q9}, [$src], $stride
+    vld1.8           {q10}, [$src], $stride
+    vld1.8           {q11}, [$src], $stride
+    vld1.8           {q12}, [$src], $stride
+    vld1.8           {q13}, [$src], $stride
+    vld1.8           {q14}, [$src], $stride
+    vld1.8           {q15}, [$src], $stride
+    MEND
+
+    ; ADD_DIFF_16x8 $diff
+    ; Adds $diff to each of q8-q15 as unsigned bytes with saturation.
+    MACRO
+    ADD_DIFF_16x8    $diff
+    vqadd.u8         q8, q8, $diff
+    vqadd.u8         q9, q9, $diff
+    vqadd.u8         q10, q10, $diff
+    vqadd.u8         q11, q11, $diff
+    vqadd.u8         q12, q12, $diff
+    vqadd.u8         q13, q13, $diff
+    vqadd.u8         q14, q14, $diff
+    vqadd.u8         q15, q15, $diff
+    MEND
+
+    ; SUB_DIFF_16x8 $diff
+    ; Subtracts $diff from each of q8-q15 as unsigned bytes with saturation.
+    MACRO
+    SUB_DIFF_16x8    $diff
+    vqsub.u8         q8, q8, $diff
+    vqsub.u8         q9, q9, $diff
+    vqsub.u8         q10, q10, $diff
+    vqsub.u8         q11, q11, $diff
+    vqsub.u8         q12, q12, $diff
+    vqsub.u8         q13, q13, $diff
+    vqsub.u8         q14, q14, $diff
+    vqsub.u8         q15, q15, $diff
+    MEND
+
+    ; ST_16x8 $dst, $stride
+    ; Stores q8-q15 as eight 16-byte rows, post-incrementing $dst by $stride
+    ; after each row.
+    MACRO
+    ST_16x8          $dst, $stride
+    vst1.8           {q8}, [$dst], $stride
+    vst1.8           {q9}, [$dst], $stride
+    vst1.8           {q10},[$dst], $stride
+    vst1.8           {q11},[$dst], $stride
+    vst1.8           {q12},[$dst], $stride
+    vst1.8           {q13},[$dst], $stride
+    vst1.8           {q14},[$dst], $stride
+    vst1.8           {q15},[$dst], $stride
+    MEND
+
+;void aom_idct32x32_1_add_neon(int16_t *input, uint8_t *dest,
+;                              int dest_stride)
+;
+; DC-only 32x32 inverse DCT: derives a single value a1 from input[0] and
+; adds it (or subtracts its magnitude when a1 is negative), with unsigned
+; 8-bit saturation, to every pixel of the 32x32 block at dest.
+;
+; r0  int16_t *input
+; r1  uint8_t *dest
+; r2  int dest_stride
+
+|aom_idct32x32_1_add_neon| PROC
+    push             {lr}
+    pld              [r1]
+    add              r3, r1, #16               ; r3 dest + 16 for second loop
+    ldrsh            r0, [r0]
+
+    ; generate cospi_16_64 = 11585
+    mov              r12, #0x2d00
+    add              r12, #0x41
+
+    ; out = dct_const_round_shift(input[0] * cospi_16_64)
+    mul              r0, r0, r12               ; input[0] * cospi_16_64
+    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
+    asr              r0, r0, #14               ; >> DCT_CONST_BITS
+
+    ; out = dct_const_round_shift(out * cospi_16_64)
+    mul              r0, r0, r12               ; out * cospi_16_64
+    mov              r12, r1                   ; save dest
+    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
+    asr              r0, r0, #14               ; >> DCT_CONST_BITS
+
+    ; a1 = ROUND_POWER_OF_TWO(out, 6)
+    add              r0, r0, #32               ; + (1 <<((6) - 1))
+    asrs             r0, r0, #6                ; >> 6
+    ; asrs set the flags from a1, so this branch is taken when a1 >= 0.
+    bge              diff_positive_32_32
+
+diff_negative_32_32
+    ; a1 < 0: broadcast |a1| saturated to 8 bits and subtract it.
+    neg              r0, r0
+    usat             r0, #8, r0
+    vdup.u8          q0, r0
+    ; r0 becomes the loop counter: 4 iterations of 16 rows x 16 columns.
+    mov              r0, #4
+
+diff_negative_32_32_loop
+    sub              r0, #1
+    LD_16x8          r1, r2
+    SUB_DIFF_16x8    q0
+    ST_16x8          r12, r2
+
+    LD_16x8          r1, r2
+    SUB_DIFF_16x8    q0
+    ST_16x8          r12, r2
+    ; After the second iteration (r0 == 2) all 32 rows of the left 16 columns
+    ; are done; restart the load and store pointers at dest + 16.
+    cmp              r0, #2
+    moveq            r1, r3
+    moveq            r12, r3
+    cmp              r0, #0
+    bne              diff_negative_32_32_loop
+    pop              {pc}
+
+diff_positive_32_32
+    ; a1 >= 0: broadcast a1 saturated to 8 bits and add it.
+    usat             r0, #8, r0
+    vdup.u8          q0, r0
+    ; r0 becomes the loop counter: 4 iterations of 16 rows x 16 columns.
+    mov              r0, #4
+
+diff_positive_32_32_loop
+    sub              r0, #1
+    LD_16x8          r1, r2
+    ADD_DIFF_16x8    q0
+    ST_16x8          r12, r2
+
+    LD_16x8          r1, r2
+    ADD_DIFF_16x8    q0
+    ST_16x8          r12, r2
+    ; After the second iteration (r0 == 2) all 32 rows of the left 16 columns
+    ; are done; restart the load and store pointers at dest + 16.
+    cmp              r0, #2
+    moveq            r1, r3
+    moveq            r12, r3
+    cmp              r0, #0
+    bne              diff_positive_32_32_loop
+    pop              {pc}
+
+    ENDP             ; |aom_idct32x32_1_add_neon|
+    END
--- a/aom_dsp/arm/idct32x32_1_add_neon.c
+++ b/aom_dsp/arm/idct32x32_1_add_neon.c
@@ -0,0 +1,141 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "./aom_config.h"
+
+#include "aom_dsp/inv_txfm.h"
+#include "aom_ports/mem.h"
+
+// Loads eight 16-byte rows, starting at d and d_stride bytes apart, into
+// *q8u8 (first row) through *q15u8 (last row).
+static INLINE void LD_16x8(uint8_t *d, int d_stride, uint8x16_t *q8u8,
+                           uint8x16_t *q9u8, uint8x16_t *q10u8,
+                           uint8x16_t *q11u8, uint8x16_t *q12u8,
+                           uint8x16_t *q13u8, uint8x16_t *q14u8,
+                           uint8x16_t *q15u8) {
+  *q8u8 = vld1q_u8(d + 0 * d_stride);
+  *q9u8 = vld1q_u8(d + 1 * d_stride);
+  *q10u8 = vld1q_u8(d + 2 * d_stride);
+  *q11u8 = vld1q_u8(d + 3 * d_stride);
+  *q12u8 = vld1q_u8(d + 4 * d_stride);
+  *q13u8 = vld1q_u8(d + 5 * d_stride);
+  *q14u8 = vld1q_u8(d + 6 * d_stride);
+  *q15u8 = vld1q_u8(d + 7 * d_stride);
+}
+
+// Adds qdiffu8 to each of the eight row vectors in place, as unsigned bytes
+// with saturation.
+static INLINE void ADD_DIFF_16x8(uint8x16_t qdiffu8, uint8x16_t *q8u8,
+                                 uint8x16_t *q9u8, uint8x16_t *q10u8,
+                                 uint8x16_t *q11u8, uint8x16_t *q12u8,
+                                 uint8x16_t *q13u8, uint8x16_t *q14u8,
+                                 uint8x16_t *q15u8) {
+  const uint8x16_t delta = qdiffu8;
+
+  q8u8[0] = vqaddq_u8(q8u8[0], delta);
+  q9u8[0] = vqaddq_u8(q9u8[0], delta);
+  q10u8[0] = vqaddq_u8(q10u8[0], delta);
+  q11u8[0] = vqaddq_u8(q11u8[0], delta);
+  q12u8[0] = vqaddq_u8(q12u8[0], delta);
+  q13u8[0] = vqaddq_u8(q13u8[0], delta);
+  q14u8[0] = vqaddq_u8(q14u8[0], delta);
+  q15u8[0] = vqaddq_u8(q15u8[0], delta);
+}
+
+// Subtracts qdiffu8 from each of the eight row vectors in place, as unsigned
+// bytes with saturation.
+static INLINE void SUB_DIFF_16x8(uint8x16_t qdiffu8, uint8x16_t *q8u8,
+                                 uint8x16_t *q9u8, uint8x16_t *q10u8,
+                                 uint8x16_t *q11u8, uint8x16_t *q12u8,
+                                 uint8x16_t *q13u8, uint8x16_t *q14u8,
+                                 uint8x16_t *q15u8) {
+  const uint8x16_t delta = qdiffu8;
+
+  q8u8[0] = vqsubq_u8(q8u8[0], delta);
+  q9u8[0] = vqsubq_u8(q9u8[0], delta);
+  q10u8[0] = vqsubq_u8(q10u8[0], delta);
+  q11u8[0] = vqsubq_u8(q11u8[0], delta);
+  q12u8[0] = vqsubq_u8(q12u8[0], delta);
+  q13u8[0] = vqsubq_u8(q13u8[0], delta);
+  q14u8[0] = vqsubq_u8(q14u8[0], delta);
+  q15u8[0] = vqsubq_u8(q15u8[0], delta);
+}
+
+// Stores *q8u8 (first row) through *q15u8 (last row) as eight 16-byte rows,
+// starting at d and d_stride bytes apart.
+static INLINE void ST_16x8(uint8_t *d, int d_stride, uint8x16_t *q8u8,
+                           uint8x16_t *q9u8, uint8x16_t *q10u8,
+                           uint8x16_t *q11u8, uint8x16_t *q12u8,
+                           uint8x16_t *q13u8, uint8x16_t *q14u8,
+                           uint8x16_t *q15u8) {
+  vst1q_u8(d + 0 * d_stride, *q8u8);
+  vst1q_u8(d + 1 * d_stride, *q9u8);
+  vst1q_u8(d + 2 * d_stride, *q10u8);
+  vst1q_u8(d + 3 * d_stride, *q11u8);
+  vst1q_u8(d + 4 * d_stride, *q12u8);
+  vst1q_u8(d + 5 * d_stride, *q13u8);
+  vst1q_u8(d + 6 * d_stride, *q14u8);
+  vst1q_u8(d + 7 * d_stride, *q15u8);
+}
+
+// DC-only 32x32 inverse DCT: derives a single value a1 from input[0] and adds
+// it (or subtracts its magnitude when a1 is negative), with unsigned 8-bit
+// saturation, to every pixel of a 32x32 block.
+//   input       - coefficient block; only input[0] is read
+//   dest        - top-left pixel of the block, updated in place
+//   dest_stride - distance in bytes between consecutive rows of dest
+void aom_idct32x32_1_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
+  uint8x16_t delta, r0, r1, r2, r3, r4, r5, r6, r7;
+  uint8_t *band_ptr;
+  int half, band, subtract;
+  int16_t a1;
+  const int band_step = dest_stride * 8;
+  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
+
+  out = dct_const_round_shift(out * cospi_16_64);
+  a1 = ROUND_POWER_OF_TWO(out, 6);
+
+  // Work with the magnitude of a1 and remember which way to apply it.
+  subtract = a1 < 0;
+  if (subtract) a1 = -a1;
+
+  // Clamp to the range of a pixel before broadcasting as bytes.
+  if (a1 < 0) {
+    a1 = 0;
+  } else if (a1 > 255) {
+    a1 = 255;
+  }
+  delta = vdupq_n_u8(a1);
+
+  // Left 16 columns first, then the right 16 columns; within each half, four
+  // bands of eight rows.
+  for (half = 0; half < 2; ++half) {
+    band_ptr = dest + half * 16;
+    for (band = 0; band < 4; ++band) {
+      LD_16x8(band_ptr, dest_stride, &r0, &r1, &r2, &r3, &r4, &r5, &r6, &r7);
+      if (subtract) {
+        SUB_DIFF_16x8(delta, &r0, &r1, &r2, &r3, &r4, &r5, &r6, &r7);
+      } else {
+        ADD_DIFF_16x8(delta, &r0, &r1, &r2, &r3, &r4, &r5, &r6, &r7);
+      }
+      ST_16x8(band_ptr, dest_stride, &r0, &r1, &r2, &r3, &r4, &r5, &r6, &r7);
+      band_ptr += band_step;
+    }
+  }
+}
--- a/aom_dsp/arm/idct32x32_add_neon.asm
+++ b/aom_dsp/arm/idct32x32_add_neon.asm
--- a/aom_dsp/arm/idct32x32_add_neon.c
+++ b/aom_dsp/arm/idct32x32_add_neon.c
@@ -0,0 +1,686 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "./aom_config.h"
+#include "aom_dsp/txfm_common.h"
+
+// Loads rows `first` and `second` (8 int16 lanes each, 8 elements per row) of
+// `trans_buf` into q14s16 and q13s16 respectively. `trans_buf`, `q14s16` and
+// `q13s16` are taken from the expanding scope. `prev` is not used by the
+// expansion. The macro expands to two complete statements (it carries its own
+// trailing semicolon), so it must not be used as an unbraced if/else body.
+#define LOAD_FROM_TRANSPOSED(prev, first, second) \
+  q14s16 = vld1q_s16(trans_buf + first * 8);      \
+  q13s16 = vld1q_s16(trans_buf + second * 8);
+
+// Loads rows `first` and `second` of `out` (row pitch 32 int16 elements) into
+// qA and qB. `out` is taken from the expanding scope; `prev` is unused.
+// Expands to two complete statements including the trailing semicolon.
+#define LOAD_FROM_OUTPUT(prev, first, second, qA, qB) \
+  qA = vld1q_s16(out + first * 32);                   \
+  qB = vld1q_s16(out + second * 32);
+
+// Stores qA and qB to rows `first` and `second` of `out` (row pitch 32 int16
+// elements). `out` is taken from the expanding scope; `prev` is unused.
+// Expands to two complete statements including the trailing semicolon.
+#define STORE_IN_OUTPUT(prev, first, second, qA, qB) \
+  vst1q_s16(out + first * 32, qA);                   \
+  vst1q_s16(out + second * 32, qB);
+
+// Convenience wrapper: picks up `stride`, q6s16, q7s16, q8s16 and q9s16 from
+// the expanding scope. Carries its own trailing semicolon.
+#define STORE_COMBINE_CENTER_RESULTS(r10, r9) \
+  __STORE_COMBINE_CENTER_RESULTS(r10, r9, stride, q6s16, q7s16, q8s16, q9s16);
+
+// Adds four rows of residual to four rows of 8 destination pixels.
+//
+// Each residual vector is rounded and shifted right by 6, added to the
+// widened pixels, and narrowed back to bytes with unsigned saturation.
+//   p1           <- q6s16      p1 + stride  <- q7s16
+//   p2 - stride  <- q8s16      p2           <- q9s16
+//
+// Pixels are loaded and stored as bytes (vld1_u8 / vst1_u8) rather than
+// through an int16_t pointer cast, so no alignment beyond that of uint8_t is
+// assumed and the lane order does not depend on endianness.
+static INLINE void __STORE_COMBINE_CENTER_RESULTS(uint8_t *p1, uint8_t *p2,
+                                                  int stride, int16x8_t q6s16,
+                                                  int16x8_t q7s16,
+                                                  int16x8_t q8s16,
+                                                  int16x8_t q9s16) {
+  uint8x8_t d8u8, d9u8, d10u8, d11u8;
+
+  // Fetch the four destination rows.
+  d8u8 = vld1_u8(p1);
+  p1 += stride;
+  d11u8 = vld1_u8(p2);
+  p2 -= stride;
+  d9u8 = vld1_u8(p1);
+  d10u8 = vld1_u8(p2);
+
+  // Final rounding of the inverse transform output.
+  q7s16 = vrshrq_n_s16(q7s16, 6);
+  q8s16 = vrshrq_n_s16(q8s16, 6);
+  q9s16 = vrshrq_n_s16(q9s16, 6);
+  q6s16 = vrshrq_n_s16(q6s16, 6);
+
+  // residual + pixel, computed modulo 2^16 and reinterpreted as signed.
+  q7s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q7s16), d9u8));
+  q8s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q8s16), d10u8));
+  q9s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q9s16), d11u8));
+  q6s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q6s16), d8u8));
+
+  // Saturate to 0..255.
+  d9u8 = vqmovun_s16(q7s16);
+  d10u8 = vqmovun_s16(q8s16);
+  d11u8 = vqmovun_s16(q9s16);
+  d8u8 = vqmovun_s16(q6s16);
+
+  // p1 currently points at the second row, p2 at the row above its start.
+  vst1_u8(p1, d9u8);
+  p1 -= stride;
+  vst1_u8(p2, d10u8);
+  p2 += stride;
+  vst1_u8(p1, d8u8);
+  vst1_u8(p2, d11u8);
+}
+
+// Convenience wrapper: picks up `stride`, q4s16, q5s16, q6s16 and q7s16 from
+// the expanding scope. Carries its own trailing semicolon.
+#define STORE_COMBINE_EXTREME_RESULTS(r7, r6) \
+  __STORE_COMBINE_EXTREME_RESULTS(r7, r6, stride, q4s16, q5s16, q6s16, q7s16);
+
+// Adds four rows of residual to four rows of 8 destination pixels.
+//
+// Each residual vector is rounded and shifted right by 6, added to the
+// widened pixels, and narrowed back to bytes with unsigned saturation.
+//   p1           <- q4s16      p1 + stride  <- q5s16
+//   p2 - stride  <- q6s16      p2           <- q7s16
+//
+// Pixels are loaded and stored as bytes (vld1_u8 / vst1_u8) rather than
+// through an int16_t pointer cast, so no alignment beyond that of uint8_t is
+// assumed and the lane order does not depend on endianness.
+static INLINE void __STORE_COMBINE_EXTREME_RESULTS(uint8_t *p1, uint8_t *p2,
+                                                   int stride, int16x8_t q4s16,
+                                                   int16x8_t q5s16,
+                                                   int16x8_t q6s16,
+                                                   int16x8_t q7s16) {
+  uint8x8_t d4u8, d5u8, d6u8, d7u8;
+
+  // Fetch the four destination rows.
+  d4u8 = vld1_u8(p1);
+  p1 += stride;
+  d7u8 = vld1_u8(p2);
+  p2 -= stride;
+  d5u8 = vld1_u8(p1);
+  d6u8 = vld1_u8(p2);
+
+  // Final rounding of the inverse transform output.
+  q5s16 = vrshrq_n_s16(q5s16, 6);
+  q6s16 = vrshrq_n_s16(q6s16, 6);
+  q7s16 = vrshrq_n_s16(q7s16, 6);
+  q4s16 = vrshrq_n_s16(q4s16, 6);
+
+  // residual + pixel, computed modulo 2^16 and reinterpreted as signed.
+  q5s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q5s16), d5u8));
+  q6s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q6s16), d6u8));
+  q7s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q7s16), d7u8));
+  q4s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q4s16), d4u8));
+
+  // Saturate to 0..255.
+  d5u8 = vqmovun_s16(q5s16);
+  d6u8 = vqmovun_s16(q6s16);
+  d7u8 = vqmovun_s16(q7s16);
+  d4u8 = vqmovun_s16(q4s16);
+
+  // p1 currently points at the second row, p2 at the row above its start.
+  vst1_u8(p1, d5u8);
+  p1 -= stride;
+  vst1_u8(p2, d6u8);
+  p2 += stride;
+  vst1_u8(p2, d7u8);
+  vst1_u8(p1, d4u8);
+}
+
+// Butterfly on the conventional working registers: takes q14s16 and q13s16
+// from the expanding scope as inputs. Carries its own trailing semicolon.
+#define DO_BUTTERFLY_STD(const_1, const_2, qA, qB) \
+  DO_BUTTERFLY(q14s16, q13s16, const_1, const_2, qA, qB);
+
+// Rotation butterfly on eight int16 lanes:
+//   *qAs16 = round_shift14(q14s16 * first_const  - q13s16 * second_const)
+//   *qBs16 = round_shift14(q14s16 * second_const + q13s16 * first_const)
+// Products are accumulated in 32 bits; the rounding shift by 14 narrows back
+// to 16 bits with signed saturation. The inputs are passed by value, so the
+// output pointers may refer to the same variables as the inputs.
+static INLINE void DO_BUTTERFLY(int16x8_t q14s16, int16x8_t q13s16,
+                                int16_t first_const, int16_t second_const,
+                                int16x8_t *qAs16, int16x8_t *qBs16) {
+  const int16x4_t c1 = vdup_n_s16(first_const);
+  const int16x4_t c2 = vdup_n_s16(second_const);
+  const int16x4_t x_lo = vget_low_s16(q14s16);
+  const int16x4_t x_hi = vget_high_s16(q14s16);
+  const int16x4_t y_lo = vget_low_s16(q13s16);
+  const int16x4_t y_hi = vget_high_s16(q13s16);
+
+  // x * c1 - y * c2
+  const int32x4_t diff_lo = vmlsl_s16(vmull_s16(x_lo, c1), y_lo, c2);
+  const int32x4_t diff_hi = vmlsl_s16(vmull_s16(x_hi, c1), y_hi, c2);
+  // x * c2 + y * c1
+  const int32x4_t sum_lo = vmlal_s16(vmull_s16(x_lo, c2), y_lo, c1);
+  const int32x4_t sum_hi = vmlal_s16(vmull_s16(x_hi, c2), y_hi, c1);
+
+  *qAs16 = vcombine_s16(vqrshrn_n_s32(diff_lo, 14), vqrshrn_n_s32(diff_hi, 14));
+  *qBs16 = vcombine_s16(vqrshrn_n_s32(sum_lo, 14), vqrshrn_n_s32(sum_hi, 14));
+}
+
+// Transposes an 8-row by 32-column band of int16 coefficients.
+//
+// `input` points at the first of 8 rows with a row pitch of 32 elements.
+// The band is handled as four 8x8 tiles; each tile is transposed and written
+// to `t_buf` as 8 consecutive rows of 8 elements, so that on return row k of
+// `t_buf` (k = 0..31) holds column k of the input band. `t_buf` must have
+// room for 32 * 8 elements.
+static INLINE void idct32_transpose_pair(int16_t *input, int16_t *t_buf) {
+  const int stride = 32;
+  int tile, r;
+  int16x8_t rows[8], lo[4], hi[4];
+  int32x4x2_t lo02, lo13, hi02, hi13;
+  int16x8x2_t t01, t23, t45, t67;
+
+  for (tile = 0; tile < 4; tile++, input += 8, t_buf += 64) {
+    for (r = 0; r < 8; r++) {
+      rows[r] = vld1q_s16(input + r * stride);
+    }
+
+    // Exchange 64-bit halves between rows r and r + 4: lo[] gathers the
+    // left four columns of all eight rows, hi[] the right four.
+    for (r = 0; r < 4; r++) {
+      lo[r] = vcombine_s16(vget_low_s16(rows[r]), vget_low_s16(rows[r + 4]));
+      hi[r] =
+          vcombine_s16(vget_high_s16(rows[r]), vget_high_s16(rows[r + 4]));
+    }
+
+    // Interleave 32-bit pairs between rows two apart.
+    lo02 = vtrnq_s32(vreinterpretq_s32_s16(lo[0]), vreinterpretq_s32_s16(lo[2]));
+    lo13 = vtrnq_s32(vreinterpretq_s32_s16(lo[1]), vreinterpretq_s32_s16(lo[3]));
+    hi02 = vtrnq_s32(vreinterpretq_s32_s16(hi[0]), vreinterpretq_s32_s16(hi[2]));
+    hi13 = vtrnq_s32(vreinterpretq_s32_s16(hi[1]), vreinterpretq_s32_s16(hi[3]));
+
+    // Interleave 16-bit elements between adjacent rows; each result vector
+    // is now one column of the tile.
+    t01 = vtrnq_s16(vreinterpretq_s16_s32(lo02.val[0]),
+                    vreinterpretq_s16_s32(lo13.val[0]));
+    t23 = vtrnq_s16(vreinterpretq_s16_s32(lo02.val[1]),
+                    vreinterpretq_s16_s32(lo13.val[1]));
+    t45 = vtrnq_s16(vreinterpretq_s16_s32(hi02.val[0]),
+                    vreinterpretq_s16_s32(hi13.val[0]));
+    t67 = vtrnq_s16(vreinterpretq_s16_s32(hi02.val[1]),
+                    vreinterpretq_s16_s32(hi13.val[1]));
+
+    vst1q_s16(t_buf + 0 * 8, t01.val[0]);
+    vst1q_s16(t_buf + 1 * 8, t01.val[1]);
+    vst1q_s16(t_buf + 2 * 8, t23.val[0]);
+    vst1q_s16(t_buf + 3 * 8, t23.val[1]);
+    vst1q_s16(t_buf + 4 * 8, t45.val[0]);
+    vst1q_s16(t_buf + 5 * 8, t45.val[1]);
+    vst1q_s16(t_buf + 6 * 8, t67.val[0]);
+    vst1q_s16(t_buf + 7 * 8, t67.val[1]);
+  }
+}
+
+// Last butterfly stage of the first pass for one 8-column band.
+//
+// `out` addresses the band inside the pass buffer: row r is at out + r * 32.
+// On entry, rows 8..13 and 18..31 of `out` already hold intermediate values
+// written by the caller, while the vector arguments carry the values that
+// have not been stored yet. The function combines them pairwise (sum and
+// difference) and writes all 32 result rows back into `out`.
+//
+// The statement order matters: each row of `out` is read before it is
+// overwritten with its final value.
+static INLINE void idct32_bands_end_1st_pass(int16_t *out, int16x8_t q2s16,
+                                             int16x8_t q3s16, int16x8_t q6s16,
+                                             int16x8_t q7s16, int16x8_t q8s16,
+                                             int16x8_t q9s16, int16x8_t q10s16,
+                                             int16x8_t q11s16, int16x8_t q12s16,
+                                             int16x8_t q13s16, int16x8_t q14s16,
+                                             int16x8_t q15s16) {
+  int16x8_t q0s16, q1s16, q4s16, q5s16;
+
+  // Rows 14..17 arrive fully computed from the caller.
+  STORE_IN_OUTPUT(17, 16, 17, q6s16, q7s16);
+  STORE_IN_OUTPUT(17, 14, 15, q8s16, q9s16);
+
+  // Rows 0/1 and 31/30: q2/q3 +/- stored rows 31/30.
+  LOAD_FROM_OUTPUT(15, 30, 31, q0s16, q1s16);
+  q4s16 = vaddq_s16(q2s16, q1s16);
+  q5s16 = vaddq_s16(q3s16, q0s16);
+  q6s16 = vsubq_s16(q3s16, q0s16);
+  q7s16 = vsubq_s16(q2s16, q1s16);
+  STORE_IN_OUTPUT(31, 30, 31, q6s16, q7s16);
+  STORE_IN_OUTPUT(31, 0, 1, q4s16, q5s16);
+
+  // q10/q11 +/- stored rows 13/12 give the inputs for rows 2, 3, 12, 13.
+  LOAD_FROM_OUTPUT(1, 12, 13, q0s16, q1s16);
+  q2s16 = vaddq_s16(q10s16, q1s16);
+  q3s16 = vaddq_s16(q11s16, q0s16);
+  q4s16 = vsubq_s16(q11s16, q0s16);
+  q5s16 = vsubq_s16(q10s16, q1s16);
+
+  // Rows 12/13 and 19/18.
+  LOAD_FROM_OUTPUT(13, 18, 19, q0s16, q1s16);
+  q8s16 = vaddq_s16(q4s16, q1s16);
+  q9s16 = vaddq_s16(q5s16, q0s16);
+  q6s16 = vsubq_s16(q5s16, q0s16);
+  q7s16 = vsubq_s16(q4s16, q1s16);
+  STORE_IN_OUTPUT(19, 18, 19, q6s16, q7s16);
+  STORE_IN_OUTPUT(19, 12, 13, q8s16, q9s16);
+
+  // Rows 2/3 and 29/28.
+  LOAD_FROM_OUTPUT(13, 28, 29, q0s16, q1s16);
+  q4s16 = vaddq_s16(q2s16, q1s16);
+  q5s16 = vaddq_s16(q3s16, q0s16);
+  q6s16 = vsubq_s16(q3s16, q0s16);
+  q7s16 = vsubq_s16(q2s16, q1s16);
+  STORE_IN_OUTPUT(29, 28, 29, q6s16, q7s16);
+  STORE_IN_OUTPUT(29, 2, 3, q4s16, q5s16);
+
+  // q12/q13 +/- stored rows 11/10 give the inputs for rows 4, 5, 10, 11.
+  LOAD_FROM_OUTPUT(3, 10, 11, q0s16, q1s16);
+  q2s16 = vaddq_s16(q12s16, q1s16);
+  q3s16 = vaddq_s16(q13s16, q0s16);
+  q4s16 = vsubq_s16(q13s16, q0s16);
+  q5s16 = vsubq_s16(q12s16, q1s16);
+
+  // Rows 10/11 and 21/20.
+  LOAD_FROM_OUTPUT(11, 20, 21, q0s16, q1s16);
+  q8s16 = vaddq_s16(q4s16, q1s16);
+  q9s16 = vaddq_s16(q5s16, q0s16);
+  q6s16 = vsubq_s16(q5s16, q0s16);
+  q7s16 = vsubq_s16(q4s16, q1s16);
+  STORE_IN_OUTPUT(21, 20, 21, q6s16, q7s16);
+  STORE_IN_OUTPUT(21, 10, 11, q8s16, q9s16);
+
+  // Rows 4/5 and 27/26.
+  LOAD_FROM_OUTPUT(11, 26, 27, q0s16, q1s16);
+  q4s16 = vaddq_s16(q2s16, q1s16);
+  q5s16 = vaddq_s16(q3s16, q0s16);
+  q6s16 = vsubq_s16(q3s16, q0s16);
+  q7s16 = vsubq_s16(q2s16, q1s16);
+  STORE_IN_OUTPUT(27, 26, 27, q6s16, q7s16);
+  STORE_IN_OUTPUT(27, 4, 5, q4s16, q5s16);
+
+  // q14/q15 +/- stored rows 9/8 give the inputs for rows 6, 7, 8, 9.
+  LOAD_FROM_OUTPUT(5, 8, 9, q0s16, q1s16);
+  q2s16 = vaddq_s16(q14s16, q1s16);
+  q3s16 = vaddq_s16(q15s16, q0s16);
+  q4s16 = vsubq_s16(q15s16, q0s16);
+  q5s16 = vsubq_s16(q14s16, q1s16);
+
+  // Rows 8/9 and 23/22.
+  LOAD_FROM_OUTPUT(9, 22, 23, q0s16, q1s16);
+  q8s16 = vaddq_s16(q4s16, q1s16);
+  q9s16 = vaddq_s16(q5s16, q0s16);
+  q6s16 = vsubq_s16(q5s16, q0s16);
+  q7s16 = vsubq_s16(q4s16, q1s16);
+  STORE_IN_OUTPUT(23, 22, 23, q6s16, q7s16);
+  STORE_IN_OUTPUT(23, 8, 9, q8s16, q9s16);
+
+  // Rows 6/7 and 25/24.
+  LOAD_FROM_OUTPUT(9, 24, 25, q0s16, q1s16);
+  q4s16 = vaddq_s16(q2s16, q1s16);
+  q5s16 = vaddq_s16(q3s16, q0s16);
+  q6s16 = vsubq_s16(q3s16, q0s16);
+  q7s16 = vsubq_s16(q2s16, q1s16);
+  STORE_IN_OUTPUT(25, 24, 25, q6s16, q7s16);
+  STORE_IN_OUTPUT(25, 6, 7, q4s16, q5s16);
+  return;
+}
+
+// Last butterfly stage of the second pass for one 8-column band, fused with
+// reconstruction.
+//
+// Intermediate rows are read from `out` (row r at out + r * 32) and combined
+// with the vector arguments exactly as in the first pass, but instead of
+// being written back to `out` the results are handed to
+// STORE_COMBINE_CENTER_RESULTS / STORE_COMBINE_EXTREME_RESULTS, which round
+// them, add them to the pixels at `dest` and store the saturated bytes.
+//
+// Four row pointers walk the 32 destination rows two at a time:
+//   r7  starts at row 0  and moves down,  r6 starts at row 31 and moves up
+//   r10 starts at row 16 and moves down,  r9 starts at row 15 and moves up
+static INLINE void idct32_bands_end_2nd_pass(
+    int16_t *out, uint8_t *dest, int stride, int16x8_t q2s16, int16x8_t q3s16,
+    int16x8_t q6s16, int16x8_t q7s16, int16x8_t q8s16, int16x8_t q9s16,
+    int16x8_t q10s16, int16x8_t q11s16, int16x8_t q12s16, int16x8_t q13s16,
+    int16x8_t q14s16, int16x8_t q15s16) {
+  uint8_t *r6 = dest + 31 * stride;
+  uint8_t *r7 = dest /* +  0 * stride*/;
+  uint8_t *r9 = dest + 15 * stride;
+  uint8_t *r10 = dest + 16 * stride;
+  int str2 = stride << 1;
+  int16x8_t q0s16, q1s16, q4s16, q5s16;
+
+  // Rows 16/17 (q6/q7) and 14/15 (q8/q9) arrive fully computed.
+  STORE_COMBINE_CENTER_RESULTS(r10, r9);
+  r10 += str2;
+  r9 -= str2;
+
+  // Rows 0/1 (q4/q5) and 30/31 (q6/q7).
+  LOAD_FROM_OUTPUT(17, 30, 31, q0s16, q1s16)
+  q4s16 = vaddq_s16(q2s16, q1s16);
+  q5s16 = vaddq_s16(q3s16, q0s16);
+  q6s16 = vsubq_s16(q3s16, q0s16);
+  q7s16 = vsubq_s16(q2s16, q1s16);
+  STORE_COMBINE_EXTREME_RESULTS(r7, r6);
+  r7 += str2;
+  r6 -= str2;
+
+  // q10/q11 +/- stored rows 13/12.
+  LOAD_FROM_OUTPUT(31, 12, 13, q0s16, q1s16)
+  q2s16 = vaddq_s16(q10s16, q1s16);
+  q3s16 = vaddq_s16(q11s16, q0s16);
+  q4s16 = vsubq_s16(q11s16, q0s16);
+  q5s16 = vsubq_s16(q10s16, q1s16);
+
+  // Rows 18/19 (q6/q7) and 12/13 (q8/q9).
+  LOAD_FROM_OUTPUT(13, 18, 19, q0s16, q1s16)
+  q8s16 = vaddq_s16(q4s16, q1s16);
+  q9s16 = vaddq_s16(q5s16, q0s16);
+  q6s16 = vsubq_s16(q5s16, q0s16);
+  q7s16 = vsubq_s16(q4s16, q1s16);
+  STORE_COMBINE_CENTER_RESULTS(r10, r9);
+  r10 += str2;
+  r9 -= str2;
+
+  // Rows 2/3 (q4/q5) and 28/29 (q6/q7).
+  LOAD_FROM_OUTPUT(19, 28, 29, q0s16, q1s16)
+  q4s16 = vaddq_s16(q2s16, q1s16);
+  q5s16 = vaddq_s16(q3s16, q0s16);
+  q6s16 = vsubq_s16(q3s16, q0s16);
+  q7s16 = vsubq_s16(q2s16, q1s16);
+  STORE_COMBINE_EXTREME_RESULTS(r7, r6);
+  r7 += str2;
+  r6 -= str2;
+
+  // q12/q13 +/- stored rows 11/10.
+  LOAD_FROM_OUTPUT(29, 10, 11, q0s16, q1s16)
+  q2s16 = vaddq_s16(q12s16, q1s16);
+  q3s16 = vaddq_s16(q13s16, q0s16);
+  q4s16 = vsubq_s16(q13s16, q0s16);
+  q5s16 = vsubq_s16(q12s16, q1s16);
+
+  // Rows 20/21 (q6/q7) and 10/11 (q8/q9).
+  LOAD_FROM_OUTPUT(11, 20, 21, q0s16, q1s16)
+  q8s16 = vaddq_s16(q4s16, q1s16);
+  q9s16 = vaddq_s16(q5s16, q0s16);
+  q6s16 = vsubq_s16(q5s16, q0s16);
+  q7s16 = vsubq_s16(q4s16, q1s16);
+  STORE_COMBINE_CENTER_RESULTS(r10, r9);
+  r10 += str2;
+  r9 -= str2;
+
+  // Rows 4/5 (q4/q5) and 26/27 (q6/q7).
+  LOAD_FROM_OUTPUT(21, 26, 27, q0s16, q1s16)
+  q4s16 = vaddq_s16(q2s16, q1s16);
+  q5s16 = vaddq_s16(q3s16, q0s16);
+  q6s16 = vsubq_s16(q3s16, q0s16);
+  q7s16 = vsubq_s16(q2s16, q1s16);
+  STORE_COMBINE_EXTREME_RESULTS(r7, r6);
+  r7 += str2;
+  r6 -= str2;
+
+  // q14/q15 +/- stored rows 9/8.
+  LOAD_FROM_OUTPUT(27, 8, 9, q0s16, q1s16)
+  q2s16 = vaddq_s16(q14s16, q1s16);
+  q3s16 = vaddq_s16(q15s16, q0s16);
+  q4s16 = vsubq_s16(q15s16, q0s16);
+  q5s16 = vsubq_s16(q14s16, q1s16);
+
+  // Rows 22/23 (q6/q7) and 8/9 (q8/q9); last use of r10/r9.
+  LOAD_FROM_OUTPUT(9, 22, 23, q0s16, q1s16)
+  q8s16 = vaddq_s16(q4s16, q1s16);
+  q9s16 = vaddq_s16(q5s16, q0s16);
+  q6s16 = vsubq_s16(q5s16, q0s16);
+  q7s16 = vsubq_s16(q4s16, q1s16);
+  STORE_COMBINE_CENTER_RESULTS(r10, r9);
+
+  // Rows 6/7 (q4/q5) and 24/25 (q6/q7); last use of r7/r6.
+  LOAD_FROM_OUTPUT(23, 24, 25, q0s16, q1s16)
+  q4s16 = vaddq_s16(q2s16, q1s16);
+  q5s16 = vaddq_s16(q3s16, q0s16);
+  q6s16 = vsubq_s16(q3s16, q0s16);
+  q7s16 = vsubq_s16(q2s16, q1s16);
+  STORE_COMBINE_EXTREME_RESULTS(r7, r6);
+  return;
+}
+
+// Two-pass 32x32 inverse transform with reconstruction.
+//
+// Pass 0 reads the coefficients at `input` and writes its result to `pass1`.
+// Pass 1 reads `pass1`, uses `pass2` as scratch for intermediate rows, and
+// adds the final values to the pixels at `dest` (row pitch `stride` bytes)
+// through idct32_bands_end_2nd_pass.
+//
+// Within a pass the data is processed in four bands. For each band,
+// idct32_transpose_pair() transposes 8 input rows into `trans_buf`, the
+// butterfly network below runs on 8 lanes at a time, and the results are
+// written as an 8-column-wide strip of the 32-element-pitch output.
+//
+// The q*s16 variables mirror a fixed register allocation: the
+// LOAD_FROM_TRANSPOSED and DO_BUTTERFLY_STD macros implicitly read and write
+// q14s16 / q13s16, and intermediate rows are parked in `out` between blocks,
+// so the statement order must not be changed.
+void aom_idct32x32_1024_add_neon(int16_t *input, uint8_t *dest, int stride) {
+  int i, idct32_pass_loop;
+  int16_t trans_buf[32 * 8];
+  int16_t pass1[32 * 32];
+  int16_t pass2[32 * 32];
+  int16_t *out;
+  int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
+  int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+
+  for (idct32_pass_loop = 0, out = pass1; idct32_pass_loop < 2;
+       idct32_pass_loop++,
+      input = pass1,  // the input of pass2 is the result of pass1
+       out = pass2) {
+    // Each iteration consumes 8 input rows and produces 8 output columns.
+    for (i = 0; i < 4; i++, input += 32 * 8, out += 8) {  // idct32_bands_loop
+      idct32_transpose_pair(input, trans_buf);
+
+      // -----------------------------------------
+      // BLOCK A: 16-19,28-31
+      // -----------------------------------------
+      // generate 16,17,30,31
+      // part of stage 1
+      LOAD_FROM_TRANSPOSED(0, 1, 31)
+      DO_BUTTERFLY_STD(cospi_31_64, cospi_1_64, &q0s16, &q2s16)
+      LOAD_FROM_TRANSPOSED(31, 17, 15)
+      DO_BUTTERFLY_STD(cospi_15_64, cospi_17_64, &q1s16, &q3s16)
+      // part of stage 2
+      q4s16 = vaddq_s16(q0s16, q1s16);
+      q13s16 = vsubq_s16(q0s16, q1s16);
+      q6s16 = vaddq_s16(q2s16, q3s16);
+      q14s16 = vsubq_s16(q2s16, q3s16);
+      // part of stage 3
+      DO_BUTTERFLY_STD(cospi_28_64, cospi_4_64, &q5s16, &q7s16)
+
+      // generate 18,19,28,29
+      // part of stage 1
+      LOAD_FROM_TRANSPOSED(15, 9, 23)
+      DO_BUTTERFLY_STD(cospi_23_64, cospi_9_64, &q0s16, &q2s16)
+      LOAD_FROM_TRANSPOSED(23, 25, 7)
+      DO_BUTTERFLY_STD(cospi_7_64, cospi_25_64, &q1s16, &q3s16)
+      // part of stage 2
+      q13s16 = vsubq_s16(q3s16, q2s16);
+      q3s16 = vaddq_s16(q3s16, q2s16);
+      q14s16 = vsubq_s16(q1s16, q0s16);
+      q2s16 = vaddq_s16(q1s16, q0s16);
+      // part of stage 3
+      DO_BUTTERFLY_STD(-cospi_4_64, -cospi_28_64, &q1s16, &q0s16)
+      // part of stage 4
+      q8s16 = vaddq_s16(q4s16, q2s16);
+      q9s16 = vaddq_s16(q5s16, q0s16);
+      q10s16 = vaddq_s16(q7s16, q1s16);
+      q15s16 = vaddq_s16(q6s16, q3s16);
+      q13s16 = vsubq_s16(q5s16, q0s16);
+      q14s16 = vsubq_s16(q7s16, q1s16);
+      STORE_IN_OUTPUT(0, 16, 31, q8s16, q15s16)
+      STORE_IN_OUTPUT(31, 17, 30, q9s16, q10s16)
+      // part of stage 5
+      DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q0s16, &q1s16)
+      STORE_IN_OUTPUT(30, 29, 18, q1s16, q0s16)
+      // part of stage 4
+      q13s16 = vsubq_s16(q4s16, q2s16);
+      q14s16 = vsubq_s16(q6s16, q3s16);
+      // part of stage 5
+      DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q4s16, &q6s16)
+      STORE_IN_OUTPUT(18, 19, 28, q4s16, q6s16)
+
+      // -----------------------------------------
+      // BLOCK B: 20-23,24-27
+      // -----------------------------------------
+      // generate 20,21,26,27
+      // part of stage 1
+      LOAD_FROM_TRANSPOSED(7, 5, 27)
+      DO_BUTTERFLY_STD(cospi_27_64, cospi_5_64, &q0s16, &q2s16)
+      LOAD_FROM_TRANSPOSED(27, 21, 11)
+      DO_BUTTERFLY_STD(cospi_11_64, cospi_21_64, &q1s16, &q3s16)
+      // part of stage 2
+      q13s16 = vsubq_s16(q0s16, q1s16);
+      q0s16 = vaddq_s16(q0s16, q1s16);
+      q14s16 = vsubq_s16(q2s16, q3s16);
+      q2s16 = vaddq_s16(q2s16, q3s16);
+      // part of stage 3
+      DO_BUTTERFLY_STD(cospi_12_64, cospi_20_64, &q1s16, &q3s16)
+
+      // generate 22,23,24,25
+      // part of stage 1
+      LOAD_FROM_TRANSPOSED(11, 13, 19)
+      DO_BUTTERFLY_STD(cospi_19_64, cospi_13_64, &q5s16, &q7s16)
+      LOAD_FROM_TRANSPOSED(19, 29, 3)
+      DO_BUTTERFLY_STD(cospi_3_64, cospi_29_64, &q4s16, &q6s16)
+      // part of stage 2
+      q14s16 = vsubq_s16(q4s16, q5s16);
+      q5s16 = vaddq_s16(q4s16, q5s16);
+      q13s16 = vsubq_s16(q6s16, q7s16);
+      q6s16 = vaddq_s16(q6s16, q7s16);
+      // part of stage 3
+      DO_BUTTERFLY_STD(-cospi_20_64, -cospi_12_64, &q4s16, &q7s16)
+      // part of stage 4
+      q10s16 = vaddq_s16(q7s16, q1s16);
+      q11s16 = vaddq_s16(q5s16, q0s16);
+      q12s16 = vaddq_s16(q6s16, q2s16);
+      q15s16 = vaddq_s16(q4s16, q3s16);
+      // part of stage 6
+      LOAD_FROM_OUTPUT(28, 16, 17, q14s16, q13s16)
+      q8s16 = vaddq_s16(q14s16, q11s16);
+      q9s16 = vaddq_s16(q13s16, q10s16);
+      q13s16 = vsubq_s16(q13s16, q10s16);
+      q11s16 = vsubq_s16(q14s16, q11s16);
+      STORE_IN_OUTPUT(17, 17, 16, q9s16, q8s16)
+      LOAD_FROM_OUTPUT(16, 30, 31, q14s16, q9s16)
+      q8s16 = vsubq_s16(q9s16, q12s16);
+      q10s16 = vaddq_s16(q14s16, q15s16);
+      q14s16 = vsubq_s16(q14s16, q15s16);
+      q12s16 = vaddq_s16(q9s16, q12s16);
+      STORE_IN_OUTPUT(31, 30, 31, q10s16, q12s16)
+      // part of stage 7
+      DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16)
+      STORE_IN_OUTPUT(31, 25, 22, q14s16, q13s16)
+      q13s16 = q11s16;
+      q14s16 = q8s16;
+      DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16)
+      STORE_IN_OUTPUT(22, 24, 23, q14s16, q13s16)
+      // part of stage 4
+      q14s16 = vsubq_s16(q5s16, q0s16);
+      q13s16 = vsubq_s16(q6s16, q2s16);
+      DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q5s16, &q6s16);
+      q14s16 = vsubq_s16(q7s16, q1s16);
+      q13s16 = vsubq_s16(q4s16, q3s16);
+      DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q0s16, &q1s16);
+      // part of stage 6
+      LOAD_FROM_OUTPUT(23, 18, 19, q14s16, q13s16)
+      q8s16 = vaddq_s16(q14s16, q1s16);
+      q9s16 = vaddq_s16(q13s16, q6s16);
+      q13s16 = vsubq_s16(q13s16, q6s16);
+      q1s16 = vsubq_s16(q14s16, q1s16);
+      STORE_IN_OUTPUT(19, 18, 19, q8s16, q9s16)
+      LOAD_FROM_OUTPUT(19, 28, 29, q8s16, q9s16)
+      q14s16 = vsubq_s16(q8s16, q5s16);
+      q10s16 = vaddq_s16(q8s16, q5s16);
+      q11s16 = vaddq_s16(q9s16, q0s16);
+      q0s16 = vsubq_s16(q9s16, q0s16);
+      STORE_IN_OUTPUT(29, 28, 29, q10s16, q11s16)
+      // part of stage 7
+      DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16)
+      STORE_IN_OUTPUT(29, 20, 27, q13s16, q14s16)
+      // Inputs are passed by value, so reusing q0s16/q1s16 as outputs is safe.
+      DO_BUTTERFLY(q0s16, q1s16, cospi_16_64, cospi_16_64, &q1s16, &q0s16);
+      STORE_IN_OUTPUT(27, 21, 26, q1s16, q0s16)
+
+      // -----------------------------------------
+      // BLOCK C: 8-10,11-15
+      // -----------------------------------------
+      // generate 8,9,14,15
+      // part of stage 2
+      LOAD_FROM_TRANSPOSED(3, 2, 30)
+      DO_BUTTERFLY_STD(cospi_30_64, cospi_2_64, &q0s16, &q2s16)
+      LOAD_FROM_TRANSPOSED(30, 18, 14)
+      DO_BUTTERFLY_STD(cospi_14_64, cospi_18_64, &q1s16, &q3s16)
+      // part of stage 3
+      q13s16 = vsubq_s16(q0s16, q1s16);
+      q0s16 = vaddq_s16(q0s16, q1s16);
+      q14s16 = vsubq_s16(q2s16, q3s16);
+      q2s16 = vaddq_s16(q2s16, q3s16);
+      // part of stage 4
+      DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q1s16, &q3s16)
+
+      // generate 10,11,12,13
+      // part of stage 2
+      LOAD_FROM_TRANSPOSED(14, 10, 22)
+      DO_BUTTERFLY_STD(cospi_22_64, cospi_10_64, &q5s16, &q7s16)
+      LOAD_FROM_TRANSPOSED(22, 26, 6)
+      DO_BUTTERFLY_STD(cospi_6_64, cospi_26_64, &q4s16, &q6s16)
+      // part of stage 3
+      q14s16 = vsubq_s16(q4s16, q5s16);
+      q5s16 = vaddq_s16(q4s16, q5s16);
+      q13s16 = vsubq_s16(q6s16, q7s16);
+      q6s16 = vaddq_s16(q6s16, q7s16);
+      // part of stage 4
+      DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q4s16, &q7s16)
+      // part of stage 5
+      q8s16 = vaddq_s16(q0s16, q5s16);
+      q9s16 = vaddq_s16(q1s16, q7s16);
+      q13s16 = vsubq_s16(q1s16, q7s16);
+      q14s16 = vsubq_s16(q3s16, q4s16);
+      q10s16 = vaddq_s16(q3s16, q4s16);
+      q15s16 = vaddq_s16(q2s16, q6s16);
+      STORE_IN_OUTPUT(26, 8, 15, q8s16, q15s16)
+      STORE_IN_OUTPUT(15, 9, 14, q9s16, q10s16)
+      // part of stage 6
+      DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16)
+      STORE_IN_OUTPUT(14, 13, 10, q3s16, q1s16)
+      q13s16 = vsubq_s16(q0s16, q5s16);
+      q14s16 = vsubq_s16(q2s16, q6s16);
+      DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16)
+      STORE_IN_OUTPUT(10, 11, 12, q1s16, q3s16)
+
+      // -----------------------------------------
+      // BLOCK D: 0-3,4-7
+      // -----------------------------------------
+      // generate 4,5,6,7
+      // part of stage 3
+      LOAD_FROM_TRANSPOSED(6, 4, 28)
+      DO_BUTTERFLY_STD(cospi_28_64, cospi_4_64, &q0s16, &q2s16)
+      LOAD_FROM_TRANSPOSED(28, 20, 12)
+      DO_BUTTERFLY_STD(cospi_12_64, cospi_20_64, &q1s16, &q3s16)
+      // part of stage 4
+      q13s16 = vsubq_s16(q0s16, q1s16);
+      q0s16 = vaddq_s16(q0s16, q1s16);
+      q14s16 = vsubq_s16(q2s16, q3s16);
+      q2s16 = vaddq_s16(q2s16, q3s16);
+      // part of stage 5
+      DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16)
+
+      // generate 0,1,2,3
+      // part of stage 4
+      LOAD_FROM_TRANSPOSED(12, 0, 16)
+      DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q5s16, &q7s16)
+      LOAD_FROM_TRANSPOSED(16, 8, 24)
+      DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q14s16, &q6s16)
+      // part of stage 5
+      q4s16 = vaddq_s16(q7s16, q6s16);
+      q7s16 = vsubq_s16(q7s16, q6s16);
+      q6s16 = vsubq_s16(q5s16, q14s16);
+      q5s16 = vaddq_s16(q5s16, q14s16);
+      // part of stage 6
+      q8s16 = vaddq_s16(q4s16, q2s16);
+      q9s16 = vaddq_s16(q5s16, q3s16);
+      q10s16 = vaddq_s16(q6s16, q1s16);
+      q11s16 = vaddq_s16(q7s16, q0s16);
+      q12s16 = vsubq_s16(q7s16, q0s16);
+      q13s16 = vsubq_s16(q6s16, q1s16);
+      q14s16 = vsubq_s16(q5s16, q3s16);
+      q15s16 = vsubq_s16(q4s16, q2s16);
+      // part of stage 7
+      LOAD_FROM_OUTPUT(12, 14, 15, q0s16, q1s16)
+      q2s16 = vaddq_s16(q8s16, q1s16);
+      q3s16 = vaddq_s16(q9s16, q0s16);
+      q4s16 = vsubq_s16(q9s16, q0s16);
+      q5s16 = vsubq_s16(q8s16, q1s16);
+      LOAD_FROM_OUTPUT(15, 16, 17, q0s16, q1s16)
+      q8s16 = vaddq_s16(q4s16, q1s16);
+      q9s16 = vaddq_s16(q5s16, q0s16);
+      q6s16 = vsubq_s16(q5s16, q0s16);
+      q7s16 = vsubq_s16(q4s16, q1s16);
+
+      // Finish the band: pass 0 writes to the pass buffer, pass 1 adds the
+      // result to the destination pixels and advances 8 pixels to the right.
+      if (idct32_pass_loop == 0) {
+        idct32_bands_end_1st_pass(out, q2s16, q3s16, q6s16, q7s16, q8s16, q9s16,
+                                  q10s16, q11s16, q12s16, q13s16, q14s16,
+                                  q15s16);
+      } else {
+        idct32_bands_end_2nd_pass(out, dest, stride, q2s16, q3s16, q6s16, q7s16,
+                                  q8s16, q9s16, q10s16, q11s16, q12s16, q13s16,
+                                  q14s16, q15s16);
+        dest += 8;
+      }
+    }
+  }
+  return;
+}
--- a/aom_dsp/arm/idct4x4_1_add_neon.asm
+++ b/aom_dsp/arm/idct4x4_1_add_neon.asm
@@ -1,31 +1,36 @@
 ;
-;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
 ;
-;  Use of this source code is governed by a BSD-style license and patent
-;  grant that can be found in the LICENSE file in the root of the source
-;  tree. All contributing project authors may be found in the AUTHORS
-;  file in the root of the source tree.
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 ;


-    EXPORT  |vpx_idct4x4_1_add_neon|
+
+    EXPORT  |aom_idct4x4_1_add_neon|
    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2

-;void vpx_idct4x4_1_add_neon(int16_t *input, uint8_t *dest, int stride)
+;void aom_idct4x4_1_add_neon(int16_t *input, uint8_t *dest,
+;                                  int dest_stride)
 ;
 ; r0  int16_t input
 ; r1  uint8_t *dest
-; r2  int stride)
+; r2  int dest_stride)

-|vpx_idct4x4_1_add_neon| PROC
+|aom_idct4x4_1_add_neon| PROC
    ldrsh            r0, [r0]

-    ; cospi_16_64 = 11585
-    movw             r12, #0x2d41
+    ; generate cospi_16_64 = 11585
+    mov              r12, #0x2d00
+    add              r12, #0x41

    ; out = dct_const_round_shift(input[0] * cospi_16_64)
    mul              r0, r0, r12               ; input[0] * cospi_16_64
@@ -61,6 +66,6 @@
    vst1.32          {d7[1]}, [r12]

    bx               lr
-    ENDP             ; |vpx_idct4x4_1_add_neon|
+    ENDP             ; |aom_idct4x4_1_add_neon|

    END
--- a/aom_dsp/arm/idct4x4_1_add_neon.c
+++ b/aom_dsp/arm/idct4x4_1_add_neon.c
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "aom_dsp/inv_txfm.h"
+#include "aom_ports/mem.h"
+
+// DC-only 4x4 inverse transform with reconstruction.
+//
+// Only input[0] is read. It is scaled twice by cospi_16_64 (with
+// dct_const_round_shift after each multiply) and rounded down by 4 bits; the
+// resulting value is added to every pixel of the 4x4 block at `dest`, with
+// the sums saturated to 0..255. `dest_stride` is the distance in bytes
+// between rows. Two rows (2 x 4 pixels) are processed per iteration.
+void aom_idct4x4_1_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
+  uint32x2_t pix = vdup_n_u32(0);
+  uint16x8_t sum;
+  uint8x8_t clipped;
+  int16x8_t dc_vec;
+  uint8_t *src = dest;
+  uint8_t *dst = dest;
+  int16_t pair, dc, a1;
+
+  dc = dct_const_round_shift(input[0] * cospi_16_64);
+  dc = dct_const_round_shift(dc * cospi_16_64);
+  a1 = ROUND_POWER_OF_TWO(dc, 4);
+  dc_vec = vdupq_n_s16(a1);
+
+  // dc_only_idct_add
+  for (pair = 0; pair < 2; ++pair) {
+    // Gather two 4-pixel rows into the two 32-bit lanes of one vector.
+    pix = vld1_lane_u32((const uint32_t *)src, pix, 0);
+    src += dest_stride;
+    pix = vld1_lane_u32((const uint32_t *)src, pix, 1);
+    src += dest_stride;
+
+    // Widen, add the DC value, then narrow with unsigned saturation.
+    sum = vaddw_u8(vreinterpretq_u16_s16(dc_vec), vreinterpret_u8_u32(pix));
+    clipped = vqmovun_s16(vreinterpretq_s16_u16(sum));
+
+    vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(clipped), 0);
+    dst += dest_stride;
+    vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(clipped), 1);
+    dst += dest_stride;
+  }
+}
--- a/aom_dsp/arm/idct4x4_add_neon.asm
+++ b/aom_dsp/arm/idct4x4_add_neon.asm
@@ -1,30 +1,31 @@
 ;
-;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
 ;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 ;

-    EXPORT  |vpx_idct4x4_16_add_neon|
+;
+
+    EXPORT  |aom_idct4x4_16_add_neon|
    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2

-    INCLUDE vpx_dsp/arm/idct_neon.asm.S
-
    AREA     Block, CODE, READONLY ; name this block of code
-;void vpx_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int stride)
+;void aom_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
 ;
 ; r0  int16_t input
 ; r1  uint8_t *dest
-; r2  int stride)
+; r2  int dest_stride)

-|vpx_idct4x4_16_add_neon| PROC
+|aom_idct4x4_16_add_neon| PROC

    ; The 2D transform is done with two passes which are actually pretty
    ; similar. We first transform the rows. This is done by transposing
@@ -35,15 +36,18 @@
    ; So, two passes of a transpose followed by a column transform.

    ; load the inputs into q8-q9, d16-d19
-    LOAD_TRAN_LOW_TO_S16 d16, d17, d18, d19, r0
+    vld1.s16        {q8,q9}, [r0]!

    ; generate scalar constants
-    ; cospi_8_64 = 15137
-    movw            r0, #0x3b21
-    ; cospi_16_64 = 11585
-    movw            r3, #0x2d41
-    ; cospi_24_64 = 6270
-    movw            r12, #0x187e
+    ; cospi_8_64 = 15137 = 0x3b21
+    mov             r0, #0x3b00
+    add             r0, #0x21
+    ; cospi_16_64 = 11585 = 0x2d41
+    mov             r3, #0x2d00
+    add             r3, #0x41
+    ; cospi_24_64 = 6270 = 0x187e
+    mov             r12, #0x1800
+    add             r12, #0x7e

    ; transpose the input data
    ; 00 01 02 03   d16
@@ -72,15 +76,16 @@
    ; do the transform on transposed rows

    ; stage 1
+    vadd.s16  d23, d16, d18         ; (input[0] + input[2])
+    vsub.s16  d24, d16, d18         ; (input[0] - input[2])
+
    vmull.s16 q15, d17, d22         ; input[1] * cospi_24_64
    vmull.s16 q1,  d17, d20         ; input[1] * cospi_8_64

    ; (input[0] + input[2]) * cospi_16_64;
    ; (input[0] - input[2]) * cospi_16_64;
-    vmull.s16 q8,  d16, d21
-    vmull.s16 q14, d18, d21
-    vadd.s32  q13, q8,  q14
-    vsub.s32  q14, q8,  q14
+    vmull.s16 q13, d23, d21
+    vmull.s16 q14, d24, d21

    ; input[1] * cospi_24_64 - input[3] * cospi_8_64;
    ; input[1] * cospi_8_64  + input[3] * cospi_24_64;
@@ -88,10 +93,10 @@
    vmlal.s16 q1,  d19, d22

    ; dct_const_round_shift
-    vrshrn.s32 d26, q13, #14
-    vrshrn.s32 d27, q14, #14
-    vrshrn.s32 d29, q15, #14
-    vrshrn.s32 d28, q1,  #14
+    vqrshrn.s32 d26, q13, #14
+    vqrshrn.s32 d27, q14, #14
+    vqrshrn.s32 d29, q15, #14
+    vqrshrn.s32 d28, q1,  #14

    ; stage 2
    ; output[0] = step[0] + step[3];
@@ -139,10 +144,10 @@
    vmlal.s16 q1,  d19, d22

    ; dct_const_round_shift
-    vrshrn.s32 d26, q13, #14
-    vrshrn.s32 d27, q14, #14
-    vrshrn.s32 d29, q15, #14
-    vrshrn.s32 d28, q1,  #14
+    vqrshrn.s32 d26, q13, #14
+    vqrshrn.s32 d27, q14, #14
+    vqrshrn.s32 d29, q15, #14
+    vqrshrn.s32 d28, q1,  #14

    ; stage 2
    ; output[0] = step[0] + step[3];
@@ -167,7 +172,7 @@
    vld1.32 {d27[1]}, [r1], r2
    vld1.32 {d27[0]}, [r1]  ; no post-increment

-    ; ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * stride + i]
+    ; ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * dest_stride + i]
    vaddw.u8 q8, q8, d26
    vaddw.u8 q9, q9, d27

@@ -183,6 +188,6 @@
    vst1.32 {d26[1]}, [r1], r2
    vst1.32 {d26[0]}, [r1]  ; no post-increment
    bx              lr
-    ENDP  ; |vpx_idct4x4_16_add_neon|
+    ENDP  ; |aom_idct4x4_16_add_neon|

    END
--- a/aom_dsp/arm/idct4x4_add_neon.c
+++ b/aom_dsp/arm/idct4x4_add_neon.c
@@ -0,0 +1,146 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "aom_dsp/txfm_common.h"
+
+void aom_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
+  uint8x8_t d26u8, d27u8;
+  uint32x2_t d26u32, d27u32;
+  uint16x8_t q8u16, q9u16;
+  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16;
+  int16x4_t d22s16, d23s16, d24s16, d26s16, d27s16, d28s16, d29s16;
+  int16x8_t q8s16, q9s16, q13s16, q14s16;
+  int32x4_t q1s32, q13s32, q14s32, q15s32;
+  int16x4x2_t d0x2s16, d1x2s16;
+  int32x4x2_t q0x2s32;
+  uint8_t *d;
+  // 4x4 inverse DCT of the 16 values at input; the result is added into dest.
+  d26u32 = d27u32 = vdup_n_u32(0);  // defined start value for the lane loads
+  // Load all 16 coefficients as two vectors of eight int16_t.
+  q8s16 = vld1q_s16(input);
+  q9s16 = vld1q_s16(input + 8);
+  // d16..d19 = input[0..3], input[4..7], input[8..11], input[12..15]
+  d16s16 = vget_low_s16(q8s16);
+  d17s16 = vget_high_s16(q8s16);
+  d18s16 = vget_low_s16(q9s16);
+  d19s16 = vget_high_s16(q9s16);
+  // 4x4 transpose, part 1: interleave 16-bit lanes of each vector pair.
+  d0x2s16 = vtrn_s16(d16s16, d17s16);
+  d1x2s16 = vtrn_s16(d18s16, d19s16);
+  q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]);
+  q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]);
+
+  d20s16 = vdup_n_s16((int16_t)cospi_8_64);
+  d21s16 = vdup_n_s16((int16_t)cospi_16_64);
+  // 4x4 transpose, part 2: interleave 32-bit lanes; d16..d19 = columns 0..3.
+  q0x2s32 =
+      vtrnq_s32(vreinterpretq_s32_s16(q8s16), vreinterpretq_s32_s16(q9s16));
+  d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
+  d17s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
+  d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
+  d19s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
+
+  d22s16 = vdup_n_s16((int16_t)cospi_24_64);
+
+  // stage 1
+  d23s16 = vadd_s16(d16s16, d18s16);  // input[0] + input[2] (16-bit add)
+  d24s16 = vsub_s16(d16s16, d18s16);  // input[0] - input[2] (16-bit sub)
+  // Widening multiplies: the products below are 32-bit.
+  q15s32 = vmull_s16(d17s16, d22s16);  // input[1] * cospi_24_64
+  q1s32 = vmull_s16(d17s16, d20s16);   // input[1] * cospi_8_64
+  q13s32 = vmull_s16(d23s16, d21s16);  // (input[0] + input[2]) * cospi_16_64
+  q14s32 = vmull_s16(d24s16, d21s16);  // (input[0] - input[2]) * cospi_16_64
+
+  q15s32 = vmlsl_s16(q15s32, d19s16, d20s16);  // ... - input[3] * cospi_8_64
+  q1s32 = vmlal_s16(q1s32, d19s16, d22s16);    // ... + input[3] * cospi_24_64
+  // dct_const_round_shift: rounding >> 14 with saturating narrow to 16 bits.
+  d26s16 = vqrshrn_n_s32(q13s32, 14);
+  d27s16 = vqrshrn_n_s32(q14s32, 14);
+  d29s16 = vqrshrn_n_s32(q15s32, 14);
+  d28s16 = vqrshrn_n_s32(q1s32, 14);
+  q13s16 = vcombine_s16(d26s16, d27s16);
+  q14s16 = vcombine_s16(d28s16, d29s16);
+
+  // stage 2
+  q8s16 = vaddq_s16(q13s16, q14s16);  // low: step[0]+step[3], high: [1]+[2]
+  q9s16 = vsubq_s16(q13s16, q14s16);  // low: step[0]-step[3], high: [1]-[2]
+  // q9 holds (output[3], output[2]), so its halves are swapped when split.
+  d16s16 = vget_low_s16(q8s16);
+  d17s16 = vget_high_s16(q8s16);
+  d18s16 = vget_high_s16(q9s16);  // vswp d18 d19
+  d19s16 = vget_low_s16(q9s16);
+  // Transpose again (same two-part 4x4 transpose as above) for the 2nd pass.
+  d0x2s16 = vtrn_s16(d16s16, d17s16);
+  d1x2s16 = vtrn_s16(d18s16, d19s16);
+  q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]);
+  q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]);
+
+  q0x2s32 =
+      vtrnq_s32(vreinterpretq_s32_s16(q8s16), vreinterpretq_s32_s16(q9s16));
+  d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
+  d17s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
+  d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
+  d19s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
+
+  // do the transform on columns
+  // stage 1
+  d23s16 = vadd_s16(d16s16, d18s16);
+  d24s16 = vsub_s16(d16s16, d18s16);
+
+  q15s32 = vmull_s16(d17s16, d22s16);
+  q1s32 = vmull_s16(d17s16, d20s16);
+  q13s32 = vmull_s16(d23s16, d21s16);
+  q14s32 = vmull_s16(d24s16, d21s16);
+
+  q15s32 = vmlsl_s16(q15s32, d19s16, d20s16);
+  q1s32 = vmlal_s16(q1s32, d19s16, d22s16);
+
+  d26s16 = vqrshrn_n_s32(q13s32, 14);
+  d27s16 = vqrshrn_n_s32(q14s32, 14);
+  d29s16 = vqrshrn_n_s32(q15s32, 14);
+  d28s16 = vqrshrn_n_s32(q1s32, 14);
+  q13s16 = vcombine_s16(d26s16, d27s16);
+  q14s16 = vcombine_s16(d28s16, d29s16);
+
+  // stage 2
+  q8s16 = vaddq_s16(q13s16, q14s16);  // rows 0 (low half) and 1 (high half)
+  q9s16 = vsubq_s16(q13s16, q14s16);  // rows 3 (low half) and 2 (high half)
+  // ROUND_POWER_OF_TWO(x, 4): rounding shift right by 4.
+  q8s16 = vrshrq_n_s16(q8s16, 4);
+  q9s16 = vrshrq_n_s16(q9s16, 4);
+  // NOTE(review): dest is accessed through uint32_t casts - confirm alignment.
+  d = dest;
+  d26u32 = vld1_lane_u32((const uint32_t *)d, d26u32, 0);  // row 0
+  d += dest_stride;
+  d26u32 = vld1_lane_u32((const uint32_t *)d, d26u32, 1);  // row 1
+  d += dest_stride;
+  d27u32 = vld1_lane_u32((const uint32_t *)d, d27u32, 1);  // row 2
+  d += dest_stride;
+  d27u32 = vld1_lane_u32((const uint32_t *)d, d27u32, 0);  // row 3
+  // Add the widened dest bytes to the 16-bit results (row order matches q8/q9).
+  q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u32(d26u32));
+  q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u32(d27u32));
+  // Treat the sums as signed again and saturate them to the 0..255 range.
+  d26u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+  d27u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+  // Store the four rows back using the same lane order as the loads.
+  d = dest;
+  vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d26u8), 0);  // row 0
+  d += dest_stride;
+  vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d26u8), 1);  // row 1
+  d += dest_stride;
+  vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d27u8), 1);  // row 2
+  d += dest_stride;
+  vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d27u8), 0);  // row 3
+  return;
+}
--- a/aom_dsp/arm/idct8x8_1_add_neon.asm
+++ b/aom_dsp/arm/idct8x8_1_add_neon.asm
@@ -0,0 +1,91 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+
+
+    EXPORT  |aom_idct8x8_1_add_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+;void aom_idct8x8_1_add_neon(int16_t *input, uint8_t *dest,
+;                                  int dest_stride)
+; DC-only path: reads just input[0] and adds one value to an 8x8 block of dest.
+; r0  int16_t *input
+; r1  uint8_t *dest
+; r2  int dest_stride
+
+|aom_idct8x8_1_add_neon| PROC
+    ldrsh            r0, [r0]                  ; r0 = input[0], sign-extended
+
+    ; generate cospi_16_64 = 11585
+    mov              r12, #0x2d00
+    add              r12, #0x41
+
+    ; out = dct_const_round_shift(input[0] * cospi_16_64)
+    mul              r0, r0, r12               ; input[0] * cospi_16_64
+    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
+    asr              r0, r0, #14               ; >> DCT_CONST_BITS
+
+    ; out = dct_const_round_shift(out * cospi_16_64)
+    mul              r0, r0, r12               ; out * cospi_16_64
+    mov              r12, r1                   ; save dest
+    add              r0, r0, #0x2000           ; +(1 << ((DCT_CONST_BITS) - 1))
+    asr              r0, r0, #14               ; >> DCT_CONST_BITS
+
+    ; a1 = ROUND_POWER_OF_TWO(out, 5)
+    add              r0, r0, #16               ; + (1 <<((5) - 1))
+    asr              r0, r0, #5                ; >> 5
+
+    vdup.s16         q0, r0                    ; duplicate a1
+
+    ; load destination data (eight rows of 8 bytes, r2 bytes apart)
+    vld1.64          {d2}, [r1], r2
+    vld1.64          {d3}, [r1], r2
+    vld1.64          {d4}, [r1], r2
+    vld1.64          {d5}, [r1], r2
+    vld1.64          {d6}, [r1], r2
+    vld1.64          {d7}, [r1], r2
+    vld1.64          {d16}, [r1], r2
+    vld1.64          {d17}, [r1]
+    ; first four rows: add a1, saturate to 8 bits, store through r12
+    vaddw.u8         q9, q0, d2                ; dest[x] + a1
+    vaddw.u8         q10, q0, d3               ; dest[x] + a1
+    vaddw.u8         q11, q0, d4               ; dest[x] + a1
+    vaddw.u8         q12, q0, d5               ; dest[x] + a1
+    vqmovun.s16      d2, q9                    ; clip_pixel
+    vqmovun.s16      d3, q10                   ; clip_pixel
+    vqmovun.s16      d30, q11                  ; clip_pixel
+    vqmovun.s16      d31, q12                  ; clip_pixel
+    vst1.64          {d2}, [r12], r2
+    vst1.64          {d3}, [r12], r2
+    vst1.64          {d30}, [r12], r2
+    vst1.64          {d31}, [r12], r2
+    ; last four rows: same sequence on d6, d7, d16, d17
+    vaddw.u8         q9, q0, d6                 ; dest[x] + a1
+    vaddw.u8         q10, q0, d7                ; dest[x] + a1
+    vaddw.u8         q11, q0, d16               ; dest[x] + a1
+    vaddw.u8         q12, q0, d17               ; dest[x] + a1
+    vqmovun.s16      d2, q9                     ; clip_pixel
+    vqmovun.s16      d3, q10                    ; clip_pixel
+    vqmovun.s16      d30, q11                   ; clip_pixel
+    vqmovun.s16      d31, q12                   ; clip_pixel
+    vst1.64          {d2}, [r12], r2
+    vst1.64          {d3}, [r12], r2
+    vst1.64          {d30}, [r12], r2
+    vst1.64          {d31}, [r12], r2
+
+    bx               lr
+    ENDP             ; |aom_idct8x8_1_add_neon|
+
+    END
--- a/aom_dsp/arm/idct8x8_1_add_neon.c
+++ b/aom_dsp/arm/idct8x8_1_add_neon.c
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "aom_dsp/inv_txfm.h"
+#include "aom_ports/mem.h"
+
+void aom_idct8x8_1_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
+  uint8x8_t d2u8, d3u8, d30u8, d31u8;
+  uint64x1_t d2u64, d3u64, d4u64, d5u64;
+  uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16;
+  int16x8_t q0s16;
+  uint8_t *d1, *d2;
+  int16_t i, a1;
+  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);  // 1st pass
+  out = dct_const_round_shift(out * cospi_16_64);               // 2nd pass
+  a1 = ROUND_POWER_OF_TWO(out, 5);
+  // Only input[0] is read; a1 is the one value added to all 8x8 dest pixels.
+  q0s16 = vdupq_n_s16(a1);
+  q0u16 = vreinterpretq_u16_s16(q0s16);
+  // d1 is the read cursor, d2 the write cursor; each iteration covers 4 rows.
+  d1 = d2 = dest;
+  for (i = 0; i < 2; i++) {
+    d2u64 = vld1_u64((const uint64_t *)d1);
+    d1 += dest_stride;
+    d3u64 = vld1_u64((const uint64_t *)d1);
+    d1 += dest_stride;
+    d4u64 = vld1_u64((const uint64_t *)d1);
+    d1 += dest_stride;
+    d5u64 = vld1_u64((const uint64_t *)d1);
+    d1 += dest_stride;
+    // dest[x] + a1, computed on 16-bit lanes (a1's bit pattern viewed as u16)
+    q9u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d2u64));
+    q10u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d3u64));
+    q11u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d4u64));
+    q12u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d5u64));
+    // clip_pixel: view the sums as signed and saturate them to 0..255
+    d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+    d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
+    d30u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
+    d31u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
+    // write the four rows back
+    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
+    d2 += dest_stride;
+    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
+    d2 += dest_stride;
+    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d30u8));
+    d2 += dest_stride;
+    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d31u8));
+    d2 += dest_stride;
+  }
+  return;
+}
--- a/aom_dsp/arm/idct8x8_add_neon.asm
+++ b/aom_dsp/arm/idct8x8_add_neon.asm
@@ -0,0 +1,522 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+    EXPORT  |aom_idct8x8_64_add_neon|
+    EXPORT  |aom_idct8x8_12_add_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+    ; Parallel 1D IDCT on all the columns of an 8x8 16-bit data matrix loaded in
+    ; q8-q15. The output is stored back into q8-q15. q0-q7 are overwritten as
+    ; scratch. Expects r3-r9 to hold cospi_28_64, cospi_4_64, cospi_12_64,
+    ; cospi_20_64, cospi_16_64, cospi_24_64 and cospi_8_64 respectively.
+    MACRO
+    IDCT8x8_1D
+    ; stage 1
+    vdup.16         d0, r3                    ; duplicate cospi_28_64
+    vdup.16         d1, r4                    ; duplicate cospi_4_64
+    vdup.16         d2, r5                    ; duplicate cospi_12_64
+    vdup.16         d3, r6                    ; duplicate cospi_20_64
+
+    ; input[1] * cospi_28_64
+    vmull.s16       q2, d18, d0
+    vmull.s16       q3, d19, d0
+
+    ; input[5] * cospi_12_64
+    vmull.s16       q5, d26, d2
+    vmull.s16       q6, d27, d2
+
+    ; input[1]*cospi_28_64-input[7]*cospi_4_64
+    vmlsl.s16       q2, d30, d1
+    vmlsl.s16       q3, d31, d1
+
+    ; input[5] * cospi_12_64 - input[3] * cospi_20_64
+    vmlsl.s16       q5, d22, d3
+    vmlsl.s16       q6, d23, d3
+
+    ; q4 = dct_const_round_shift(input[1]*cospi_28_64-input[7]*cospi_4_64)
+    vqrshrn.s32     d8, q2, #14               ; >> 14
+    vqrshrn.s32     d9, q3, #14               ; >> 14
+
+    ; q5 = dct_const_round_shift(input[5]*cospi_12_64-input[3]*cospi_20_64)
+    vqrshrn.s32     d10, q5, #14              ; >> 14
+    vqrshrn.s32     d11, q6, #14              ; >> 14
+
+    ; input[1] * cospi_4_64
+    vmull.s16       q2, d18, d1
+    vmull.s16       q3, d19, d1
+
+    ; input[5] * cospi_20_64
+    vmull.s16       q9, d26, d3
+    vmull.s16       q13, d27, d3
+
+    ; input[1]*cospi_4_64+input[7]*cospi_28_64
+    vmlal.s16       q2, d30, d0
+    vmlal.s16       q3, d31, d0
+
+    ; input[5] * cospi_20_64 + input[3] * cospi_12_64
+    vmlal.s16       q9, d22, d2
+    vmlal.s16       q13, d23, d2
+
+    ; q7 = dct_const_round_shift(input[1]*cospi_4_64+input[7]*cospi_28_64)
+    vqrshrn.s32     d14, q2, #14              ; >> 14
+    vqrshrn.s32     d15, q3, #14              ; >> 14
+
+    ; stage 2 & stage 3 - even half (input[n] below = rows q8, q10, q12, q14)
+    vdup.16         d0, r7                    ; duplicate cospi_16_64
+
+    ; q6 = dct_const_round_shift(input[5]*cospi_20_64+input[3]*cospi_12_64)
+    vqrshrn.s32     d12, q9, #14              ; >> 14
+    vqrshrn.s32     d13, q13, #14              ; >> 14
+
+    ; input[0] * cospi_16_64
+    vmull.s16       q2, d16, d0
+    vmull.s16       q3, d17, d0
+
+    ; input[0] * cospi_16_64
+    vmull.s16       q13, d16, d0
+    vmull.s16       q15, d17, d0
+
+    ; (input[0] + input[2]) * cospi_16_64
+    vmlal.s16       q2,  d24, d0
+    vmlal.s16       q3, d25, d0
+
+    ; (input[0] - input[2]) * cospi_16_64
+    vmlsl.s16       q13, d24, d0
+    vmlsl.s16       q15, d25, d0
+
+    vdup.16         d0, r8                    ; duplicate cospi_24_64
+    vdup.16         d1, r9                    ; duplicate cospi_8_64
+
+    ; q9 = dct_const_round_shift((input[0] + input[2]) * cospi_16_64)
+    vqrshrn.s32     d18, q2, #14              ; >> 14
+    vqrshrn.s32     d19, q3, #14              ; >> 14
+
+    ; q11 = dct_const_round_shift((input[0] - input[2]) * cospi_16_64)
+    vqrshrn.s32     d22, q13, #14              ; >> 14
+    vqrshrn.s32     d23, q15, #14              ; >> 14
+
+    ; input[1] * cospi_24_64 - input[3] * cospi_8_64
+    ; input[1] * cospi_24_64
+    vmull.s16       q2, d20, d0
+    vmull.s16       q3, d21, d0
+
+    ; input[1] * cospi_8_64
+    vmull.s16       q8, d20, d1
+    vmull.s16       q12, d21, d1
+
+    ; input[1] * cospi_24_64 - input[3] * cospi_8_64
+    vmlsl.s16       q2, d28, d1
+    vmlsl.s16       q3, d29, d1
+
+    ; input[1] * cospi_8_64 + input[3] * cospi_24_64
+    vmlal.s16       q8, d28, d0
+    vmlal.s16       q12, d29, d0
+
+    ; q13 = dct_const_round_shift(input[1]*cospi_24_64 - input[3]*cospi_8_64)
+    vqrshrn.s32     d26, q2, #14              ; >> 14
+    vqrshrn.s32     d27, q3, #14              ; >> 14
+
+    ; q15 = dct_const_round_shift(input[1]*cospi_8_64 + input[3]*cospi_24_64)
+    vqrshrn.s32     d30, q8, #14              ; >> 14
+    vqrshrn.s32     d31, q12, #14              ; >> 14
+
+    vadd.s16        q0, q9, q15               ; output[0] = step[0] + step[3]
+    vadd.s16        q1, q11, q13              ; output[1] = step[1] + step[2]
+    vsub.s16        q2, q11, q13              ; output[2] = step[1] - step[2]
+    vsub.s16        q3, q9, q15               ; output[3] = step[0] - step[3]
+
+    ; stage 3 - odd half
+    vdup.16         d16, r7                   ; duplicate cospi_16_64
+
+    ; stage 2 - odd half
+    vsub.s16        q13, q4, q5               ; step2[5] = step1[4] - step1[5]
+    vadd.s16        q4, q4, q5                ; step2[4] = step1[4] + step1[5]
+    vsub.s16        q14, q7, q6               ; step2[6] = -step1[6] + step1[7]
+    vadd.s16        q7, q7, q6                ; step2[7] = step1[6] + step1[7]
+
+    ; step2[6] * cospi_16_64
+    vmull.s16       q9, d28, d16
+    vmull.s16       q10, d29, d16
+
+    ; step2[6] * cospi_16_64
+    vmull.s16       q11, d28, d16
+    vmull.s16       q12, d29, d16
+
+    ; (step2[6] - step2[5]) * cospi_16_64
+    vmlsl.s16       q9, d26, d16
+    vmlsl.s16       q10, d27, d16
+
+    ; (step2[5] + step2[6]) * cospi_16_64
+    vmlal.s16       q11, d26, d16
+    vmlal.s16       q12, d27, d16
+
+    ; q5 = dct_const_round_shift((step2[6] - step2[5]) * cospi_16_64)
+    vqrshrn.s32     d10, q9, #14              ; >> 14
+    vqrshrn.s32     d11, q10, #14             ; >> 14
+
+    ; q6 = dct_const_round_shift((step2[5] + step2[6]) * cospi_16_64)
+    vqrshrn.s32     d12, q11, #14              ; >> 14
+    vqrshrn.s32     d13, q12, #14             ; >> 14
+
+    ; stage 4
+    vadd.s16        q8, q0, q7                ; output[0] = step1[0] + step1[7];
+    vadd.s16        q9, q1, q6                ; output[1] = step1[1] + step1[6];
+    vadd.s16        q10, q2, q5               ; output[2] = step1[2] + step1[5];
+    vadd.s16        q11, q3, q4               ; output[3] = step1[3] + step1[4];
+    vsub.s16        q12, q3, q4               ; output[4] = step1[3] - step1[4];
+    vsub.s16        q13, q2, q5               ; output[5] = step1[2] - step1[5];
+    vsub.s16        q14, q1, q6               ; output[6] = step1[1] - step1[6];
+    vsub.s16        q15, q0, q7               ; output[7] = step1[0] - step1[7];
+    MEND
+
+    ; Transpose an 8x8 16-bit data matrix held in q8-q15, in place.
+    MACRO
+    TRANSPOSE8X8
+    vswp            d17, d24                  ; swap the off-diagonal 4x4
+    vswp            d23, d30                  ; quadrants: high halves of
+    vswp            d21, d28                  ; q8-q11 <-> low halves of
+    vswp            d19, d26                  ; q12-q15
+    vtrn.32         q8, q10                   ; interleave 32-bit lanes of
+    vtrn.32         q9, q11                   ; registers two apart
+    vtrn.32         q12, q14
+    vtrn.32         q13, q15
+    vtrn.16         q8, q9                    ; interleave 16-bit lanes of
+    vtrn.16         q10, q11                  ; adjacent registers
+    vtrn.16         q12, q13
+    vtrn.16         q14, q15
+    MEND
+
+    AREA    Block, CODE, READONLY ; name this block of code
+;void aom_idct8x8_64_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
+; 8x8 inverse DCT (IDCT8x8_1D on rows, then on columns) added into dest.
+; r0  int16_t *input
+; r1  uint8_t *dest
+; r2  int dest_stride
+
+|aom_idct8x8_64_add_neon| PROC
+    push            {r4-r9}                   ; r4-r9 carry cospi constants
+    vpush           {d8-d15}                  ; q4-q7 are used by IDCT8x8_1D
+    vld1.s16        {q8,q9}, [r0]!
+    vld1.s16        {q10,q11}, [r0]!
+    vld1.s16        {q12,q13}, [r0]!
+    vld1.s16        {q14,q15}, [r0]!
+
+    ; transpose the input data
+    TRANSPOSE8X8
+
+    ; generate cospi_28_64 = 3196
+    mov             r3, #0x0c00
+    add             r3, #0x7c
+
+    ; generate cospi_4_64  = 16069
+    mov             r4, #0x3e00
+    add             r4, #0xc5
+
+    ; generate cospi_12_64 = 13623
+    mov             r5, #0x3500
+    add             r5, #0x37
+
+    ; generate cospi_20_64 = 9102
+    mov             r6, #0x2300
+    add             r6, #0x8e
+
+    ; generate cospi_16_64 = 11585
+    mov             r7, #0x2d00
+    add             r7, #0x41
+
+    ; generate cospi_24_64 = 6270
+    mov             r8, #0x1800
+    add             r8, #0x7e
+
+    ; generate cospi_8_64 = 15137
+    mov             r9, #0x3b00
+    add             r9, #0x21
+
+    ; First transform rows
+    IDCT8x8_1D
+
+    ; Transpose the matrix
+    TRANSPOSE8X8
+
+    ; Then transform columns
+    IDCT8x8_1D
+
+    ; ROUND_POWER_OF_TWO(temp_out[j], 5)
+    vrshr.s16       q8, q8, #5
+    vrshr.s16       q9, q9, #5
+    vrshr.s16       q10, q10, #5
+    vrshr.s16       q11, q11, #5
+    vrshr.s16       q12, q12, #5
+    vrshr.s16       q13, q13, #5
+    vrshr.s16       q14, q14, #5
+    vrshr.s16       q15, q15, #5
+
+    ; save dest pointer (r1 advances during the loads; r0 is used for stores)
+    mov             r0, r1
+
+    ; load destination data
+    vld1.64         {d0}, [r1], r2
+    vld1.64         {d1}, [r1], r2
+    vld1.64         {d2}, [r1], r2
+    vld1.64         {d3}, [r1], r2
+    vld1.64         {d4}, [r1], r2
+    vld1.64         {d5}, [r1], r2
+    vld1.64         {d6}, [r1], r2
+    vld1.64         {d7}, [r1]
+
+    ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i]
+    vaddw.u8        q8, q8, d0
+    vaddw.u8        q9, q9, d1
+    vaddw.u8        q10, q10, d2
+    vaddw.u8        q11, q11, d3
+    vaddw.u8        q12, q12, d4
+    vaddw.u8        q13, q13, d5
+    vaddw.u8        q14, q14, d6
+    vaddw.u8        q15, q15, d7
+
+    ; clip_pixel
+    vqmovun.s16     d0, q8
+    vqmovun.s16     d1, q9
+    vqmovun.s16     d2, q10
+    vqmovun.s16     d3, q11
+    vqmovun.s16     d4, q12
+    vqmovun.s16     d5, q13
+    vqmovun.s16     d6, q14
+    vqmovun.s16     d7, q15
+
+    ; store the data
+    vst1.64         {d0}, [r0], r2
+    vst1.64         {d1}, [r0], r2
+    vst1.64         {d2}, [r0], r2
+    vst1.64         {d3}, [r0], r2
+    vst1.64         {d4}, [r0], r2
+    vst1.64         {d5}, [r0], r2
+    vst1.64         {d6}, [r0], r2
+    vst1.64         {d7}, [r0], r2
+
+    vpop            {d8-d15}
+    pop             {r4-r9}
+    bx              lr
+    ENDP  ; |aom_idct8x8_64_add_neon|
+
+;void aom_idct8x8_12_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
+; 8x8 inverse DCT + add whose first pass reads only q8-q11 after the transpose.
+; r0  int16_t *input
+; r1  uint8_t *dest
+; r2  int dest_stride
+
+|aom_idct8x8_12_add_neon| PROC
+    push            {r4-r9}                   ; r4-r9 carry cospi constants
+    vpush           {d8-d15}                  ; q4-q7 are used below
+    vld1.s16        {q8,q9}, [r0]!
+    vld1.s16        {q10,q11}, [r0]!
+    vld1.s16        {q12,q13}, [r0]!
+    vld1.s16        {q14,q15}, [r0]!
+
+    ; transpose the input data
+    TRANSPOSE8X8
+
+    ; generate cospi_28_64 = 3196
+    mov             r3, #0x0c00
+    add             r3, #0x7c
+
+    ; generate cospi_4_64  = 16069
+    mov             r4, #0x3e00
+    add             r4, #0xc5
+
+    ; generate cospi_12_64 = 13623
+    mov             r5, #0x3500
+    add             r5, #0x37
+
+    ; generate cospi_20_64 = 9102
+    mov             r6, #0x2300
+    add             r6, #0x8e
+
+    ; generate cospi_16_64 = 11585
+    mov             r7, #0x2d00
+    add             r7, #0x41
+
+    ; generate cospi_24_64 = 6270
+    mov             r8, #0x1800
+    add             r8, #0x7e
+
+    ; generate cospi_8_64 = 15137
+    mov             r9, #0x3b00
+    add             r9, #0x21
+
+    ; First transform rows (only q8-q11 are read as inputs in this pass)
+    ; stage 1
+    ; The following instructions use vqrdmulh to do the
+    ; dct_const_round_shift(input[1] * cospi_28_64). vqrdmulh will do doubling
+    ; multiply and shift the result by 16 bits instead of 14 bits. So we need
+    ; to double the constants before multiplying to compensate this.
+    mov             r12, r3, lsl #1
+    vdup.16         q0, r12                   ; duplicate cospi_28_64*2
+    mov             r12, r4, lsl #1
+    vdup.16         q1, r12                   ; duplicate cospi_4_64*2
+
+    ; dct_const_round_shift(input[1] * cospi_28_64)
+    vqrdmulh.s16    q4, q9, q0
+
+    mov             r12, r6, lsl #1
+    rsb             r12, #0
+    vdup.16         q0, r12                   ; duplicate -cospi_20_64*2
+
+    ; dct_const_round_shift(input[1] * cospi_4_64)
+    vqrdmulh.s16    q7, q9, q1
+
+    mov             r12, r5, lsl #1
+    vdup.16         q1, r12                   ; duplicate cospi_12_64*2
+
+    ; dct_const_round_shift(- input[3] * cospi_20_64)
+    vqrdmulh.s16    q5, q11, q0
+
+    mov             r12, r7, lsl #1
+    vdup.16         q0, r12                   ; duplicate cospi_16_64*2
+
+    ; dct_const_round_shift(input[3] * cospi_12_64)
+    vqrdmulh.s16    q6, q11, q1
+
+    ; stage 2 & stage 3 - even half
+    mov             r12, r8, lsl #1
+    vdup.16         q1, r12                   ; duplicate cospi_24_64*2
+
+    ; dct_const_round_shift(input_dc * cospi_16_64)
+    vqrdmulh.s16    q9, q8, q0
+
+    mov             r12, r9, lsl #1
+    vdup.16         q0, r12                   ; duplicate cospi_8_64*2
+
+    ; dct_const_round_shift(input[1] * cospi_24_64)
+    vqrdmulh.s16    q13, q10, q1
+
+    ; dct_const_round_shift(input[1] * cospi_8_64)
+    vqrdmulh.s16    q15, q10, q0
+
+    ; stage 3 - odd half
+    vdup.16         d16, r7                   ; duplicate cospi_16_64
+    ; q9 serves as both step[0] and step[1]: no second even term is computed
+    vadd.s16        q0, q9, q15               ; output[0] = step[0] + step[3]
+    vadd.s16        q1, q9, q13               ; output[1] = step[1] + step[2]
+    vsub.s16        q2, q9, q13               ; output[2] = step[1] - step[2]
+    vsub.s16        q3, q9, q15               ; output[3] = step[0] - step[3]
+
+    ; stage 2 - odd half
+    vsub.s16        q13, q4, q5               ; step2[5] = step1[4] - step1[5]
+    vadd.s16        q4, q4, q5                ; step2[4] = step1[4] + step1[5]
+    vsub.s16        q14, q7, q6               ; step2[6] = -step1[6] + step1[7]
+    vadd.s16        q7, q7, q6                ; step2[7] = step1[6] + step1[7]
+
+    ; step2[6] * cospi_16_64
+    vmull.s16       q9, d28, d16
+    vmull.s16       q10, d29, d16
+
+    ; step2[6] * cospi_16_64
+    vmull.s16       q11, d28, d16
+    vmull.s16       q12, d29, d16
+
+    ; (step2[6] - step2[5]) * cospi_16_64
+    vmlsl.s16       q9, d26, d16
+    vmlsl.s16       q10, d27, d16
+
+    ; (step2[5] + step2[6]) * cospi_16_64
+    vmlal.s16       q11, d26, d16
+    vmlal.s16       q12, d27, d16
+
+    ; q5 = dct_const_round_shift((step2[6] - step2[5]) * cospi_16_64)
+    vqrshrn.s32     d10, q9, #14              ; >> 14
+    vqrshrn.s32     d11, q10, #14             ; >> 14
+
+    ; q6 = dct_const_round_shift((step2[5] + step2[6]) * cospi_16_64)
+    vqrshrn.s32     d12, q11, #14              ; >> 14
+    vqrshrn.s32     d13, q12, #14             ; >> 14
+
+    ; stage 4
+    vadd.s16        q8, q0, q7                ; output[0] = step1[0] + step1[7];
+    vadd.s16        q9, q1, q6                ; output[1] = step1[1] + step1[6];
+    vadd.s16        q10, q2, q5               ; output[2] = step1[2] + step1[5];
+    vadd.s16        q11, q3, q4               ; output[3] = step1[3] + step1[4];
+    vsub.s16        q12, q3, q4               ; output[4] = step1[3] - step1[4];
+    vsub.s16        q13, q2, q5               ; output[5] = step1[2] - step1[5];
+    vsub.s16        q14, q1, q6               ; output[6] = step1[1] - step1[6];
+    vsub.s16        q15, q0, q7               ; output[7] = step1[0] - step1[7];
+
+    ; Transpose the matrix
+    TRANSPOSE8X8
+
+    ; Then transform columns
+    IDCT8x8_1D
+
+    ; ROUND_POWER_OF_TWO(temp_out[j], 5)
+    vrshr.s16       q8, q8, #5
+    vrshr.s16       q9, q9, #5
+    vrshr.s16       q10, q10, #5
+    vrshr.s16       q11, q11, #5
+    vrshr.s16       q12, q12, #5
+    vrshr.s16       q13, q13, #5
+    vrshr.s16       q14, q14, #5
+    vrshr.s16       q15, q15, #5
+
+    ; save dest pointer (r1 advances during the loads; r0 is used for stores)
+    mov             r0, r1
+
+    ; load destination data
+    vld1.64         {d0}, [r1], r2
+    vld1.64         {d1}, [r1], r2
+    vld1.64         {d2}, [r1], r2
+    vld1.64         {d3}, [r1], r2
+    vld1.64         {d4}, [r1], r2
+    vld1.64         {d5}, [r1], r2
+    vld1.64         {d6}, [r1], r2
+    vld1.64         {d7}, [r1]
+
+    ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i]
+    vaddw.u8        q8, q8, d0
+    vaddw.u8        q9, q9, d1
+    vaddw.u8        q10, q10, d2
+    vaddw.u8        q11, q11, d3
+    vaddw.u8        q12, q12, d4
+    vaddw.u8        q13, q13, d5
+    vaddw.u8        q14, q14, d6
+    vaddw.u8        q15, q15, d7
+
+    ; clip_pixel
+    vqmovun.s16     d0, q8
+    vqmovun.s16     d1, q9
+    vqmovun.s16     d2, q10
+    vqmovun.s16     d3, q11
+    vqmovun.s16     d4, q12
+    vqmovun.s16     d5, q13
+    vqmovun.s16     d6, q14
+    vqmovun.s16     d7, q15
+
+    ; store the data
+    vst1.64         {d0}, [r0], r2
+    vst1.64         {d1}, [r0], r2
+    vst1.64         {d2}, [r0], r2
+    vst1.64         {d3}, [r0], r2
+    vst1.64         {d4}, [r0], r2
+    vst1.64         {d5}, [r0], r2
+    vst1.64         {d6}, [r0], r2
+    vst1.64         {d7}, [r0], r2
+
+    vpop            {d8-d15}
+    pop             {r4-r9}
+    bx              lr
+    ENDP  ; |aom_idct8x8_12_add_neon|
+
+    END
--- a/aom_dsp/arm/idct8x8_add_neon.c
+++ b/aom_dsp/arm/idct8x8_add_neon.c
@@ -0,0 +1,509 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "./aom_config.h"
+#include "aom_dsp/txfm_common.h"
+
+// In-place transpose of an 8x8 block of int16 values. Each argument points at
+// one row (eight lanes); on return *q8s16 holds what was column 0 of the
+// input, *q9s16 column 1, and so on up to *q15s16 (column 7).
+static INLINE void TRANSPOSE8X8(int16x8_t *q8s16, int16x8_t *q9s16,
+                                int16x8_t *q10s16, int16x8_t *q11s16,
+                                int16x8_t *q12s16, int16x8_t *q13s16,
+                                int16x8_t *q14s16, int16x8_t *q15s16) {
+  // Step 1: swap the off-diagonal 4x4 quadrants by regrouping 64-bit halves.
+  // Rows 0-3 keep their low half and take the low half of the row four below;
+  // rows 4-7 take the high half of the row four above and keep their own.
+  const int16x8_t top0 =
+      vcombine_s16(vget_low_s16(*q8s16), vget_low_s16(*q12s16));
+  const int16x8_t top1 =
+      vcombine_s16(vget_low_s16(*q9s16), vget_low_s16(*q13s16));
+  const int16x8_t top2 =
+      vcombine_s16(vget_low_s16(*q10s16), vget_low_s16(*q14s16));
+  const int16x8_t top3 =
+      vcombine_s16(vget_low_s16(*q11s16), vget_low_s16(*q15s16));
+  const int16x8_t bot0 =
+      vcombine_s16(vget_high_s16(*q8s16), vget_high_s16(*q12s16));
+  const int16x8_t bot1 =
+      vcombine_s16(vget_high_s16(*q9s16), vget_high_s16(*q13s16));
+  const int16x8_t bot2 =
+      vcombine_s16(vget_high_s16(*q10s16), vget_high_s16(*q14s16));
+  const int16x8_t bot3 =
+      vcombine_s16(vget_high_s16(*q11s16), vget_high_s16(*q15s16));
+
+  // Step 2: transpose 32-bit elements between rows that are two apart.
+  const int32x4x2_t top02 =
+      vtrnq_s32(vreinterpretq_s32_s16(top0), vreinterpretq_s32_s16(top2));
+  const int32x4x2_t top13 =
+      vtrnq_s32(vreinterpretq_s32_s16(top1), vreinterpretq_s32_s16(top3));
+  const int32x4x2_t bot02 =
+      vtrnq_s32(vreinterpretq_s32_s16(bot0), vreinterpretq_s32_s16(bot2));
+  const int32x4x2_t bot13 =
+      vtrnq_s32(vreinterpretq_s32_s16(bot1), vreinterpretq_s32_s16(bot3));
+
+  // Step 3: transpose 16-bit lanes between neighbouring rows.
+  const int16x8x2_t cols01 = vtrnq_s16(vreinterpretq_s16_s32(top02.val[0]),
+                                       vreinterpretq_s16_s32(top13.val[0]));
+  const int16x8x2_t cols23 = vtrnq_s16(vreinterpretq_s16_s32(top02.val[1]),
+                                       vreinterpretq_s16_s32(top13.val[1]));
+  const int16x8x2_t cols45 = vtrnq_s16(vreinterpretq_s16_s32(bot02.val[0]),
+                                       vreinterpretq_s16_s32(bot13.val[0]));
+  const int16x8x2_t cols67 = vtrnq_s16(vreinterpretq_s16_s32(bot02.val[1]),
+                                       vreinterpretq_s16_s32(bot13.val[1]));
+
+  *q8s16 = cols01.val[0];
+  *q9s16 = cols01.val[1];
+  *q10s16 = cols23.val[0];
+  *q11s16 = cols23.val[1];
+  *q12s16 = cols45.val[0];
+  *q13s16 = cols45.val[1];
+  *q14s16 = cols67.val[0];
+  *q15s16 = cols67.val[1];
+}
+
+// One pass of the 8-point inverse DCT butterfly, applied independently to
+// each of the eight 16-bit lanes. On entry *q8s16 .. *q15s16 hold inputs
+// 0 .. 7 of the transform (one input index per vector, written in0 .. in7 in
+// the comments below); on return they hold outputs 0 .. 7 in the same layout.
+// Products are accumulated in 32 bits and narrowed with
+// vqrshrn_n_s32(x, 14), i.e. a rounding, saturating shift right by 14. The
+// cospi_* constants come from aom_dsp/txfm_common.h (presumably 14-bit
+// fixed-point cosines -- confirm there). All plain adds/subs use wrapping
+// 16-bit arithmetic.
+static INLINE void IDCT8x8_1D(int16x8_t *q8s16, int16x8_t *q9s16,
+                              int16x8_t *q10s16, int16x8_t *q11s16,
+                              int16x8_t *q12s16, int16x8_t *q13s16,
+                              int16x8_t *q14s16, int16x8_t *q15s16) {
+  int16x4_t d0s16, d1s16, d2s16, d3s16;
+  int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
+  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+  int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+  int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
+  int32x4_t q2s32, q3s32, q5s32, q6s32, q8s32, q9s32;
+  int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32;
+
+  // Constants for the odd-indexed inputs (in1, in3, in5, in7).
+  d0s16 = vdup_n_s16((int16_t)cospi_28_64);
+  d1s16 = vdup_n_s16((int16_t)cospi_4_64);
+  d2s16 = vdup_n_s16((int16_t)cospi_12_64);
+  d3s16 = vdup_n_s16((int16_t)cospi_20_64);
+
+  // Split every input vector into its low and high 4-lane halves; the widening
+  // multiplies below work on four lanes at a time.
+  d16s16 = vget_low_s16(*q8s16);
+  d17s16 = vget_high_s16(*q8s16);
+  d18s16 = vget_low_s16(*q9s16);
+  d19s16 = vget_high_s16(*q9s16);
+  d20s16 = vget_low_s16(*q10s16);
+  d21s16 = vget_high_s16(*q10s16);
+  d22s16 = vget_low_s16(*q11s16);
+  d23s16 = vget_high_s16(*q11s16);
+  d24s16 = vget_low_s16(*q12s16);
+  d25s16 = vget_high_s16(*q12s16);
+  d26s16 = vget_low_s16(*q13s16);
+  d27s16 = vget_high_s16(*q13s16);
+  d28s16 = vget_low_s16(*q14s16);
+  d29s16 = vget_high_s16(*q14s16);
+  d30s16 = vget_low_s16(*q15s16);
+  d31s16 = vget_high_s16(*q15s16);
+
+  // q4 = in1 * cospi_28_64 - in7 * cospi_4_64
+  // q5 = in5 * cospi_12_64 - in3 * cospi_20_64
+  q2s32 = vmull_s16(d18s16, d0s16);
+  q3s32 = vmull_s16(d19s16, d0s16);
+  q5s32 = vmull_s16(d26s16, d2s16);
+  q6s32 = vmull_s16(d27s16, d2s16);
+
+  q2s32 = vmlsl_s16(q2s32, d30s16, d1s16);
+  q3s32 = vmlsl_s16(q3s32, d31s16, d1s16);
+  q5s32 = vmlsl_s16(q5s32, d22s16, d3s16);
+  q6s32 = vmlsl_s16(q6s32, d23s16, d3s16);
+
+  d8s16 = vqrshrn_n_s32(q2s32, 14);
+  d9s16 = vqrshrn_n_s32(q3s32, 14);
+  d10s16 = vqrshrn_n_s32(q5s32, 14);
+  d11s16 = vqrshrn_n_s32(q6s32, 14);
+  q4s16 = vcombine_s16(d8s16, d9s16);
+  q5s16 = vcombine_s16(d10s16, d11s16);
+
+  // q7 = in1 * cospi_4_64 + in7 * cospi_28_64
+  // q6 = in5 * cospi_20_64 + in3 * cospi_12_64
+  q2s32 = vmull_s16(d18s16, d1s16);
+  q3s32 = vmull_s16(d19s16, d1s16);
+  q9s32 = vmull_s16(d26s16, d3s16);
+  q13s32 = vmull_s16(d27s16, d3s16);
+
+  q2s32 = vmlal_s16(q2s32, d30s16, d0s16);
+  q3s32 = vmlal_s16(q3s32, d31s16, d0s16);
+  q9s32 = vmlal_s16(q9s32, d22s16, d2s16);
+  q13s32 = vmlal_s16(q13s32, d23s16, d2s16);
+
+  d14s16 = vqrshrn_n_s32(q2s32, 14);
+  d15s16 = vqrshrn_n_s32(q3s32, 14);
+  d12s16 = vqrshrn_n_s32(q9s32, 14);
+  d13s16 = vqrshrn_n_s32(q13s32, 14);
+  q6s16 = vcombine_s16(d12s16, d13s16);
+  q7s16 = vcombine_s16(d14s16, d15s16);
+
+  // Even-indexed inputs. d0s16 is reused for cospi_16_64 from here on.
+  d0s16 = vdup_n_s16((int16_t)cospi_16_64);
+
+  // *q9  = (in0 + in4) * cospi_16_64
+  // *q11 = (in0 - in4) * cospi_16_64
+  q2s32 = vmull_s16(d16s16, d0s16);
+  q3s32 = vmull_s16(d17s16, d0s16);
+  q13s32 = vmull_s16(d16s16, d0s16);
+  q15s32 = vmull_s16(d17s16, d0s16);
+
+  q2s32 = vmlal_s16(q2s32, d24s16, d0s16);
+  q3s32 = vmlal_s16(q3s32, d25s16, d0s16);
+  q13s32 = vmlsl_s16(q13s32, d24s16, d0s16);
+  q15s32 = vmlsl_s16(q15s32, d25s16, d0s16);
+
+  // d0s16/d1s16 are reloaded for the in2/in6 rotation; the pending 32-bit
+  // accumulators above are narrowed right after.
+  d0s16 = vdup_n_s16((int16_t)cospi_24_64);
+  d1s16 = vdup_n_s16((int16_t)cospi_8_64);
+
+  d18s16 = vqrshrn_n_s32(q2s32, 14);
+  d19s16 = vqrshrn_n_s32(q3s32, 14);
+  d22s16 = vqrshrn_n_s32(q13s32, 14);
+  d23s16 = vqrshrn_n_s32(q15s32, 14);
+  *q9s16 = vcombine_s16(d18s16, d19s16);
+  *q11s16 = vcombine_s16(d22s16, d23s16);
+
+  // *q13 = in2 * cospi_24_64 - in6 * cospi_8_64
+  // *q15 = in2 * cospi_8_64 + in6 * cospi_24_64
+  q2s32 = vmull_s16(d20s16, d0s16);
+  q3s32 = vmull_s16(d21s16, d0s16);
+  q8s32 = vmull_s16(d20s16, d1s16);
+  q12s32 = vmull_s16(d21s16, d1s16);
+
+  q2s32 = vmlsl_s16(q2s32, d28s16, d1s16);
+  q3s32 = vmlsl_s16(q3s32, d29s16, d1s16);
+  q8s32 = vmlal_s16(q8s32, d28s16, d0s16);
+  q12s32 = vmlal_s16(q12s32, d29s16, d0s16);
+
+  d26s16 = vqrshrn_n_s32(q2s32, 14);
+  d27s16 = vqrshrn_n_s32(q3s32, 14);
+  d30s16 = vqrshrn_n_s32(q8s32, 14);
+  d31s16 = vqrshrn_n_s32(q12s32, 14);
+  *q13s16 = vcombine_s16(d26s16, d27s16);
+  *q15s16 = vcombine_s16(d30s16, d31s16);
+
+  // Butterfly of the four even-half terms; q0..q3 feed the final stage.
+  q0s16 = vaddq_s16(*q9s16, *q15s16);
+  q1s16 = vaddq_s16(*q11s16, *q13s16);
+  q2s16 = vsubq_s16(*q11s16, *q13s16);
+  q3s16 = vsubq_s16(*q9s16, *q15s16);
+
+  // Butterfly of the four odd-half terms. *q13s16/*q14s16 are used as scratch
+  // for the two differences, which still need the cospi_16_64 rotation below;
+  // q4 and q7 (the sums) are final.
+  *q13s16 = vsubq_s16(q4s16, q5s16);
+  q4s16 = vaddq_s16(q4s16, q5s16);
+  *q14s16 = vsubq_s16(q7s16, q6s16);
+  q7s16 = vaddq_s16(q7s16, q6s16);
+  d26s16 = vget_low_s16(*q13s16);
+  d27s16 = vget_high_s16(*q13s16);
+  d28s16 = vget_low_s16(*q14s16);
+  d29s16 = vget_high_s16(*q14s16);
+
+  d16s16 = vdup_n_s16((int16_t)cospi_16_64);
+
+  // q5 = (*q14 - *q13) * cospi_16_64
+  // q6 = (*q14 + *q13) * cospi_16_64
+  q9s32 = vmull_s16(d28s16, d16s16);
+  q10s32 = vmull_s16(d29s16, d16s16);
+  q11s32 = vmull_s16(d28s16, d16s16);
+  q12s32 = vmull_s16(d29s16, d16s16);
+
+  q9s32 = vmlsl_s16(q9s32, d26s16, d16s16);
+  q10s32 = vmlsl_s16(q10s32, d27s16, d16s16);
+  q11s32 = vmlal_s16(q11s32, d26s16, d16s16);
+  q12s32 = vmlal_s16(q12s32, d27s16, d16s16);
+
+  d10s16 = vqrshrn_n_s32(q9s32, 14);
+  d11s16 = vqrshrn_n_s32(q10s32, 14);
+  d12s16 = vqrshrn_n_s32(q11s32, 14);
+  d13s16 = vqrshrn_n_s32(q12s32, 14);
+  q5s16 = vcombine_s16(d10s16, d11s16);
+  q6s16 = vcombine_s16(d12s16, d13s16);
+
+  // Final stage: output[k] = even[k] + odd[7 - k] for k = 0..3 and
+  // output[7 - k] = even[k] - odd[7 - k], with even = q0..q3, odd = q4..q7.
+  *q8s16 = vaddq_s16(q0s16, q7s16);
+  *q9s16 = vaddq_s16(q1s16, q6s16);
+  *q10s16 = vaddq_s16(q2s16, q5s16);
+  *q11s16 = vaddq_s16(q3s16, q4s16);
+  *q12s16 = vsubq_s16(q3s16, q4s16);
+  *q13s16 = vsubq_s16(q2s16, q5s16);
+  *q14s16 = vsubq_s16(q1s16, q6s16);
+  *q15s16 = vsubq_s16(q0s16, q7s16);
+  return;
+}
+
+// Full 8x8 inverse transform plus reconstruction.
+//
+// input:       64 coefficients, read as eight consecutive groups of eight.
+// dest:        top-left byte of the 8x8 block to update; it is read, the
+//              rounded residual is added, and the sum is written back
+//              saturated to [0, 255].
+// dest_stride: distance in bytes between consecutive rows of dest.
+//
+// Two passes of (transpose, 1-D transform) are applied, then every value is
+// rounded with a shift right by 5 before being added to dest.
+void aom_idct8x8_64_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
+  int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+  int16x8_t residual[8];
+  int i;
+
+  q8s16 = vld1q_s16(input);
+  q9s16 = vld1q_s16(input + 8);
+  q10s16 = vld1q_s16(input + 16);
+  q11s16 = vld1q_s16(input + 24);
+  q12s16 = vld1q_s16(input + 32);
+  q13s16 = vld1q_s16(input + 40);
+  q14s16 = vld1q_s16(input + 48);
+  q15s16 = vld1q_s16(input + 56);
+
+  // First pass.
+  TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
+               &q15s16);
+
+  IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
+             &q15s16);
+
+  // Second pass.
+  TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
+               &q15s16);
+
+  IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
+             &q15s16);
+
+  // ROUND_POWER_OF_TWO(x, 5)
+  residual[0] = vrshrq_n_s16(q8s16, 5);
+  residual[1] = vrshrq_n_s16(q9s16, 5);
+  residual[2] = vrshrq_n_s16(q10s16, 5);
+  residual[3] = vrshrq_n_s16(q11s16, 5);
+  residual[4] = vrshrq_n_s16(q12s16, 5);
+  residual[5] = vrshrq_n_s16(q13s16, 5);
+  residual[6] = vrshrq_n_s16(q14s16, 5);
+  residual[7] = vrshrq_n_s16(q15s16, 5);
+
+  // dest is a byte buffer, so it is accessed with the u8 load/store
+  // intrinsics. Going through a (uint64_t *) cast would type-pun the buffer
+  // and let the compiler assume 8-byte alignment that the uint8_t * interface
+  // does not promise.
+  for (i = 0; i < 8; ++i) {
+    const uint8x8_t pred = vld1_u8(dest);
+    // Widening add in 16 bits, then reinterpret as signed so that the
+    // narrowing step clamps negative sums to 0 and large sums to 255.
+    const uint16x8_t sum = vaddw_u8(vreinterpretq_u16_s16(residual[i]), pred);
+    vst1_u8(dest, vqmovun_s16(vreinterpretq_s16_u16(sum)));
+    dest += dest_stride;
+  }
+}
+
+// Reduced 8x8 inverse transform plus reconstruction.
+//
+// Same contract as the full 8x8 version (input: 64 coefficients in eight
+// groups of eight; dest/dest_stride: 8x8 byte block that is read, updated and
+// written back saturated to [0, 255]), but the first pass only reads the first
+// four coefficients of each group (q8..q11 after the transpose); the other
+// four are treated as zero. Presumably callers use this when only the
+// low-frequency corner is populated -- confirm against the dispatch code.
+void aom_idct8x8_12_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
+  int16x4_t d10s16, d11s16, d12s16, d13s16, d16s16;
+  int16x4_t d26s16, d27s16, d28s16, d29s16;
+  int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
+  int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+  int32x4_t q9s32, q10s32, q11s32, q12s32;
+  int16x8_t residual[8];
+  int i;
+
+  q8s16 = vld1q_s16(input);
+  q9s16 = vld1q_s16(input + 8);
+  q10s16 = vld1q_s16(input + 16);
+  q11s16 = vld1q_s16(input + 24);
+  q12s16 = vld1q_s16(input + 32);
+  q13s16 = vld1q_s16(input + 40);
+  q14s16 = vld1q_s16(input + 48);
+  q15s16 = vld1q_s16(input + 56);
+
+  TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
+               &q15s16);
+
+  // First pass. Because each product has a single term, it is computed with
+  // vqrdmulhq_s16 (saturating rounding doubling multiply, high half) against
+  // a doubled constant: (2 * x * 2c + 2^15) >> 16 == (x * c + 2^13) >> 14,
+  // i.e. the same rounding shift by 14 used by the widening path.
+  // stage 1: odd inputs in1 (q9) and in3 (q11)
+  q0s16 = vdupq_n_s16((int16_t)cospi_28_64 * 2);
+  q1s16 = vdupq_n_s16((int16_t)cospi_4_64 * 2);
+
+  q4s16 = vqrdmulhq_s16(q9s16, q0s16);
+
+  q0s16 = vdupq_n_s16(-(int16_t)cospi_20_64 * 2);
+
+  q7s16 = vqrdmulhq_s16(q9s16, q1s16);
+
+  q1s16 = vdupq_n_s16((int16_t)cospi_12_64 * 2);
+
+  q5s16 = vqrdmulhq_s16(q11s16, q0s16);
+
+  q0s16 = vdupq_n_s16((int16_t)cospi_16_64 * 2);
+
+  q6s16 = vqrdmulhq_s16(q11s16, q1s16);
+
+  // stage 2 - even half: in0 (q8) and in2 (q10)
+  q1s16 = vdupq_n_s16((int16_t)cospi_24_64 * 2);
+
+  q9s16 = vqrdmulhq_s16(q8s16, q0s16);
+
+  q0s16 = vdupq_n_s16((int16_t)cospi_8_64 * 2);
+
+  q13s16 = vqrdmulhq_s16(q10s16, q1s16);
+
+  q15s16 = vqrdmulhq_s16(q10s16, q0s16);
+
+  // stage 3 - even half (butterfly of the even terms)
+  q0s16 = vaddq_s16(q9s16, q15s16);
+  q1s16 = vaddq_s16(q9s16, q13s16);
+  q2s16 = vsubq_s16(q9s16, q13s16);
+  q3s16 = vsubq_s16(q9s16, q15s16);
+
+  // stage 2 - odd half (butterfly of the odd terms)
+  q13s16 = vsubq_s16(q4s16, q5s16);
+  q4s16 = vaddq_s16(q4s16, q5s16);
+  q14s16 = vsubq_s16(q7s16, q6s16);
+  q7s16 = vaddq_s16(q7s16, q6s16);
+  d26s16 = vget_low_s16(q13s16);
+  d27s16 = vget_high_s16(q13s16);
+  d28s16 = vget_low_s16(q14s16);
+  d29s16 = vget_high_s16(q14s16);
+
+  // stage 3 - odd half: q5 = (q14 - q13) * cospi_16_64,
+  //                     q6 = (q14 + q13) * cospi_16_64
+  d16s16 = vdup_n_s16((int16_t)cospi_16_64);
+  q9s32 = vmull_s16(d28s16, d16s16);
+  q10s32 = vmull_s16(d29s16, d16s16);
+  q11s32 = vmull_s16(d28s16, d16s16);
+  q12s32 = vmull_s16(d29s16, d16s16);
+
+  q9s32 = vmlsl_s16(q9s32, d26s16, d16s16);
+  q10s32 = vmlsl_s16(q10s32, d27s16, d16s16);
+  q11s32 = vmlal_s16(q11s32, d26s16, d16s16);
+  q12s32 = vmlal_s16(q12s32, d27s16, d16s16);
+
+  d10s16 = vqrshrn_n_s32(q9s32, 14);
+  d11s16 = vqrshrn_n_s32(q10s32, 14);
+  d12s16 = vqrshrn_n_s32(q11s32, 14);
+  d13s16 = vqrshrn_n_s32(q12s32, 14);
+  q5s16 = vcombine_s16(d10s16, d11s16);
+  q6s16 = vcombine_s16(d12s16, d13s16);
+
+  // stage 4
+  q8s16 = vaddq_s16(q0s16, q7s16);
+  q9s16 = vaddq_s16(q1s16, q6s16);
+  q10s16 = vaddq_s16(q2s16, q5s16);
+  q11s16 = vaddq_s16(q3s16, q4s16);
+  q12s16 = vsubq_s16(q3s16, q4s16);
+  q13s16 = vsubq_s16(q2s16, q5s16);
+  q14s16 = vsubq_s16(q1s16, q6s16);
+  q15s16 = vsubq_s16(q0s16, q7s16);
+
+  // Second pass: full 1-D transform on the transposed intermediate.
+  TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
+               &q15s16);
+
+  IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
+             &q15s16);
+
+  // ROUND_POWER_OF_TWO(x, 5)
+  residual[0] = vrshrq_n_s16(q8s16, 5);
+  residual[1] = vrshrq_n_s16(q9s16, 5);
+  residual[2] = vrshrq_n_s16(q10s16, 5);
+  residual[3] = vrshrq_n_s16(q11s16, 5);
+  residual[4] = vrshrq_n_s16(q12s16, 5);
+  residual[5] = vrshrq_n_s16(q13s16, 5);
+  residual[6] = vrshrq_n_s16(q14s16, 5);
+  residual[7] = vrshrq_n_s16(q15s16, 5);
+
+  // dest is a byte buffer, so it is accessed with the u8 load/store
+  // intrinsics. Going through a (uint64_t *) cast would type-pun the buffer
+  // and let the compiler assume 8-byte alignment that the uint8_t * interface
+  // does not promise.
+  for (i = 0; i < 8; ++i) {
+    const uint8x8_t pred = vld1_u8(dest);
+    // Widening add in 16 bits, then reinterpret as signed so that the
+    // narrowing step clamps negative sums to 0 and large sums to 255.
+    const uint16x8_t sum = vaddw_u8(vreinterpretq_u16_s16(residual[i]), pred);
+    vst1_u8(dest, vqmovun_s16(vreinterpretq_s16_u16(sum)));
+    dest += dest_stride;
+  }
+}
--- a/aom_dsp/arm/intrapred_neon.c
+++ b/aom_dsp/arm/intrapred_neon.c
@@ -0,0 +1,819 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "./aom_config.h"
+#include "./aom_dsp_rtcd.h"
+#include "aom/aom_integer.h"
+
+//------------------------------------------------------------------------------
+// DC 4x4
+
+// Shared 4x4 DC fill. 'do_above' and 'do_left' select which borders feed the
+// average; callers pass literal 0/1 so the branches can be removed once this
+// is inlined. With both borders the fill value is (sum of 8 pixels + 4) >> 3,
+// with one border (sum of 4 pixels + 2) >> 2, with none 0x80.
+// Note that a selected border is loaded with an 8-byte read even though only
+// its first four bytes contribute to the value that is used.
+static INLINE void dc_4x4(uint8_t *dst, ptrdiff_t stride, const uint8_t *above,
+                          const uint8_t *left, int do_above, int do_left) {
+  uint16x8_t above_total;
+  uint16x8_t left_total;
+  uint8x8_t dc_lanes;
+  uint8x8_t fill;
+  int row;
+
+  if (do_left) {
+    // Pairwise-widen, then fold once more: lane 0 = left[0] + ... + left[3].
+    const uint16x4_t pairs = vpaddl_u8(vld1_u8(left));
+    const uint16x4_t quads = vpadd_u16(pairs, pairs);
+    left_total = vcombine_u16(quads, quads);
+  }
+
+  if (do_above) {
+    // Same reduction for the top row: lane 0 = above[0] + ... + above[3].
+    const uint16x4_t pairs = vpaddl_u8(vld1_u8(above));
+    const uint16x4_t quads = vpadd_u16(pairs, pairs);
+    above_total = vcombine_u16(quads, quads);
+  }
+
+  if (!do_above && !do_left) {
+    dc_lanes = vdup_n_u8(0x80);
+  } else if (!do_left) {
+    dc_lanes = vrshrn_n_u16(above_total, 2);
+  } else if (!do_above) {
+    dc_lanes = vrshrn_n_u16(left_total, 2);
+  } else {
+    dc_lanes = vrshrn_n_u16(vaddq_u16(left_total, above_total), 3);
+  }
+
+  // Only lane 0 holds the wanted average; broadcast it and write four bytes
+  // to each of the four rows.
+  fill = vdup_lane_u8(dc_lanes, 0);
+  for (row = 0; row < 4; ++row, dst += stride) {
+    vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(fill), 0);
+  }
+}
+
+// 4x4 DC prediction from both borders: every output pixel is the rounded
+// average of the four 'above' and four 'left' pixels.
+void aom_dc_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  const int use_above = 1;
+  const int use_left = 1;
+  dc_4x4(dst, stride, above, left, use_above, use_left);
+}
+
+// 4x4 DC prediction from the left border only: every output pixel is the
+// rounded average of the four 'left' pixels. 'above' is never read.
+void aom_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  const int use_above = 0;
+  const int use_left = 1;
+  (void)above;
+  dc_4x4(dst, stride, NULL, left, use_above, use_left);
+}
+
+// 4x4 DC prediction from the top border only: every output pixel is the
+// rounded average of the four 'above' pixels. 'left' is never read.
+void aom_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+                                   const uint8_t *above, const uint8_t *left) {
+  const int use_above = 1;
+  const int use_left = 0;
+  (void)left;
+  dc_4x4(dst, stride, above, NULL, use_above, use_left);
+}
+
+// 4x4 DC prediction with no usable border: fills the block with 0x80 (128).
+// Neither 'above' nor 'left' is read.
+void aom_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+                                   const uint8_t *above, const uint8_t *left) {
+  const int use_above = 0;
+  const int use_left = 0;
+  (void)left;
+  (void)above;
+  dc_4x4(dst, stride, NULL, NULL, use_above, use_left);
+}
+
+//------------------------------------------------------------------------------
+// DC 8x8
+
+// Shared 8x8 DC fill. 'do_above' and 'do_left' select which borders feed the
+// average; callers pass literal 0/1 so the branches can be removed once this
+// is inlined. With both borders the fill value is (sum of 16 pixels + 8) >> 4,
+// with one border (sum of 8 pixels + 4) >> 3, with none 0x80.
+static INLINE void dc_8x8(uint8_t *dst, ptrdiff_t stride, const uint8_t *above,
+                          const uint8_t *left, int do_above, int do_left) {
+  uint16x8_t sum_top;
+  uint16x8_t sum_left;
+  uint8x8_t dc0;
+
+  if (do_above) {
+    const uint8x8_t A = vld1_u8(above);  // top row
+    const uint16x4_t p0 = vpaddl_u8(A);  // cascading summation of the top
+    const uint16x4_t p1 = vpadd_u16(p0, p0);
+    const uint16x4_t p2 = vpadd_u16(p1, p1);
+    sum_top = vcombine_u16(p2, p2);  // every lane = above[0] + ... + above[7]
+  }
+
+  if (do_left) {
+    const uint8x8_t L = vld1_u8(left);   // left border
+    const uint16x4_t p0 = vpaddl_u8(L);  // cascading summation of the left
+    const uint16x4_t p1 = vpadd_u16(p0, p0);
+    const uint16x4_t p2 = vpadd_u16(p1, p1);
+    sum_left = vcombine_u16(p2, p2);  // every lane = left[0] + ... + left[7]
+  }
+
+  if (do_above && do_left) {
+    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
+    dc0 = vrshrn_n_u16(sum, 4);
+  } else if (do_above) {
+    dc0 = vrshrn_n_u16(sum_top, 3);
+  } else if (do_left) {
+    dc0 = vrshrn_n_u16(sum_left, 3);
+  } else {
+    dc0 = vdup_n_u8(0x80);
+  }
+
+  {
+    const uint8x8_t dc = vdup_lane_u8(dc0, 0);
+    int i;
+    for (i = 0; i < 8; ++i) {
+      // Write the eight bytes with the u8 store. Storing through a
+      // (uint32_t *) cast would type-pun dst and imply a 4-byte alignment
+      // that the uint8_t * interface does not promise.
+      vst1_u8(dst + i * stride, dc);
+    }
+  }
+}
+
+// 8x8 DC prediction from both borders: every output pixel is the rounded
+// average of the eight 'above' and eight 'left' pixels.
+void aom_dc_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  const int use_above = 1;
+  const int use_left = 1;
+  dc_8x8(dst, stride, above, left, use_above, use_left);
+}
+
+// 8x8 DC prediction from the left border only: every output pixel is the
+// rounded average of the eight 'left' pixels. 'above' is never read.
+void aom_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  const int use_above = 0;
+  const int use_left = 1;
+  (void)above;
+  dc_8x8(dst, stride, NULL, left, use_above, use_left);
+}
+
+// 8x8 DC prediction from the top border only: every output pixel is the
+// rounded average of the eight 'above' pixels. 'left' is never read.
+void aom_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+                                   const uint8_t *above, const uint8_t *left) {
+  const int use_above = 1;
+  const int use_left = 0;
+  (void)left;
+  dc_8x8(dst, stride, above, NULL, use_above, use_left);
+}
+
+// 8x8 DC prediction with no usable border: fills the block with 0x80 (128).
+// Neither 'above' nor 'left' is read.
+void aom_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+                                   const uint8_t *above, const uint8_t *left) {
+  const int use_above = 0;
+  const int use_left = 0;
+  (void)left;
+  (void)above;
+  dc_8x8(dst, stride, NULL, NULL, use_above, use_left);
+}
+
+//------------------------------------------------------------------------------
+// DC 16x16
+
+// Shared 16x16 DC fill. 'do_above' and 'do_left' select which borders feed
+// the average; callers pass literal 0/1 so the branches can be removed once
+// this is inlined. With both borders the fill value is
+// (sum of 32 pixels + 16) >> 5, with one border (sum of 16 pixels + 8) >> 4,
+// with none 0x80.
+static INLINE void dc_16x16(uint8_t *dst, ptrdiff_t stride,
+                            const uint8_t *above, const uint8_t *left,
+                            int do_above, int do_left) {
+  uint16x8_t above_total;
+  uint16x8_t left_total;
+  uint8x8_t dc_lanes;
+  uint8x16_t fill;
+  int row;
+
+  if (do_left) {
+    // 16 bytes -> 8 pair sums -> 4 -> 2 -> 1; the total ends up in all lanes.
+    const uint16x8_t pairs = vpaddlq_u8(vld1q_u8(left));
+    const uint16x4_t fold4 = vadd_u16(vget_low_u16(pairs), vget_high_u16(pairs));
+    const uint16x4_t fold2 = vpadd_u16(fold4, fold4);
+    const uint16x4_t total = vpadd_u16(fold2, fold2);
+    left_total = vcombine_u16(total, total);
+  }
+
+  if (do_above) {
+    // Same reduction for the top row.
+    const uint16x8_t pairs = vpaddlq_u8(vld1q_u8(above));
+    const uint16x4_t fold4 = vadd_u16(vget_low_u16(pairs), vget_high_u16(pairs));
+    const uint16x4_t fold2 = vpadd_u16(fold4, fold4);
+    const uint16x4_t total = vpadd_u16(fold2, fold2);
+    above_total = vcombine_u16(total, total);
+  }
+
+  if (!do_above && !do_left) {
+    dc_lanes = vdup_n_u8(0x80);
+  } else if (!do_left) {
+    dc_lanes = vrshrn_n_u16(above_total, 4);
+  } else if (!do_above) {
+    dc_lanes = vrshrn_n_u16(left_total, 4);
+  } else {
+    dc_lanes = vrshrn_n_u16(vaddq_u16(left_total, above_total), 5);
+  }
+
+  // Broadcast the average to 16 bytes and write it to each of the 16 rows.
+  fill = vdupq_lane_u8(dc_lanes, 0);
+  for (row = 0; row < 16; ++row, dst += stride) {
+    vst1q_u8(dst, fill);
+  }
+}
+
+// 16x16 DC prediction from both borders: every output pixel is the rounded
+// average of the 16 'above' and 16 'left' pixels.
+void aom_dc_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  const int use_above = 1;
+  const int use_left = 1;
+  dc_16x16(dst, stride, above, left, use_above, use_left);
+}
+
+// 16x16 DC prediction from the left border only: every output pixel is the
+// rounded average of the 16 'left' pixels. 'above' is never read.
+void aom_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  const int use_above = 0;
+  const int use_left = 1;
+  (void)above;
+  dc_16x16(dst, stride, NULL, left, use_above, use_left);
+}
+
+// 16x16 DC prediction from the top border only: every output pixel is the
+// rounded average of the 16 'above' pixels. 'left' is never read.
+void aom_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  const int use_above = 1;
+  const int use_left = 0;
+  (void)left;
+  dc_16x16(dst, stride, above, NULL, use_above, use_left);
+}
+
+// 16x16 DC prediction with no usable border: fills the block with 0x80 (128).
+// Neither 'above' nor 'left' is read.
+void aom_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  const int use_above = 0;
+  const int use_left = 0;
+  (void)left;
+  (void)above;
+  dc_16x16(dst, stride, NULL, NULL, use_above, use_left);
+}
+
+//------------------------------------------------------------------------------
+// DC 32x32
+
+// Shared 32x32 DC fill. 'do_above' and 'do_left' select which borders feed
+// the average; callers pass literal 0/1 so the branches can be removed once
+// this is inlined. With both borders the fill value is
+// (sum of 64 pixels + 32) >> 6, with one border (sum of 32 pixels + 16) >> 5,
+// with none 0x80.
+static INLINE void dc_32x32(uint8_t *dst, ptrdiff_t stride,
+                            const uint8_t *above, const uint8_t *left,
+                            int do_above, int do_left) {
+  uint16x8_t above_total;
+  uint16x8_t left_total;
+  uint8x8_t dc_lanes;
+  uint8x16_t fill;
+  int row;
+
+  if (do_left) {
+    // Two 16-byte halves -> pair sums -> added together -> folded 8 -> 4 ->
+    // 2 -> 1; the total ends up in all lanes.
+    const uint16x8_t pairs_lo = vpaddlq_u8(vld1q_u8(left));
+    const uint16x8_t pairs_hi = vpaddlq_u8(vld1q_u8(left + 16));
+    const uint16x8_t fold8 = vaddq_u16(pairs_lo, pairs_hi);
+    const uint16x4_t fold4 = vadd_u16(vget_low_u16(fold8), vget_high_u16(fold8));
+    const uint16x4_t fold2 = vpadd_u16(fold4, fold4);
+    const uint16x4_t total = vpadd_u16(fold2, fold2);
+    left_total = vcombine_u16(total, total);
+  }
+
+  if (do_above) {
+    // Same reduction for the top row.
+    const uint16x8_t pairs_lo = vpaddlq_u8(vld1q_u8(above));
+    const uint16x8_t pairs_hi = vpaddlq_u8(vld1q_u8(above + 16));
+    const uint16x8_t fold8 = vaddq_u16(pairs_lo, pairs_hi);
+    const uint16x4_t fold4 = vadd_u16(vget_low_u16(fold8), vget_high_u16(fold8));
+    const uint16x4_t fold2 = vpadd_u16(fold4, fold4);
+    const uint16x4_t total = vpadd_u16(fold2, fold2);
+    above_total = vcombine_u16(total, total);
+  }
+
+  if (!do_above && !do_left) {
+    dc_lanes = vdup_n_u8(0x80);
+  } else if (!do_left) {
+    dc_lanes = vrshrn_n_u16(above_total, 5);
+  } else if (!do_above) {
+    dc_lanes = vrshrn_n_u16(left_total, 5);
+  } else {
+    dc_lanes = vrshrn_n_u16(vaddq_u16(left_total, above_total), 6);
+  }
+
+  // Broadcast the average and write each 32-byte row as two 16-byte stores.
+  fill = vdupq_lane_u8(dc_lanes, 0);
+  for (row = 0; row < 32; ++row, dst += stride) {
+    vst1q_u8(dst, fill);
+    vst1q_u8(dst + 16, fill);
+  }
+}
+
+// 32x32 DC prediction from both borders: every output pixel is the rounded
+// average of the 32 'above' and 32 'left' pixels.
+void aom_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  const int use_above = 1;
+  const int use_left = 1;
+  dc_32x32(dst, stride, above, left, use_above, use_left);
+}
+
+// 32x32 DC prediction from the left border only: every output pixel is the
+// rounded average of the 32 'left' pixels. 'above' is never read.
+void aom_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  const int use_above = 0;
+  const int use_left = 1;
+  (void)above;
+  dc_32x32(dst, stride, NULL, left, use_above, use_left);
+}
+
+// 32x32 DC prediction from the top border only: every output pixel is the
+// rounded average of the 32 'above' pixels. 'left' is never read.
+void aom_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  const int use_above = 1;
+  const int use_left = 0;
+  (void)left;
+  dc_32x32(dst, stride, above, NULL, use_above, use_left);
+}
+
+// 32x32 DC prediction with no usable border: fills the block with 0x80 (128).
+// Neither 'above' nor 'left' is read.
+void aom_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  const int use_above = 0;
+  const int use_left = 0;
+  (void)left;
+  (void)above;
+  dc_32x32(dst, stride, NULL, NULL, use_above, use_left);
+}
+
+// -----------------------------------------------------------------------------
+
+// 4x4 D45 predictor. Reads above[0..7], smooths it with the 3-tap filter
+// (above[i] + 2 * above[i + 1] + above[i + 2]) computed as a truncating
+// half-add followed by a rounding half-add, and writes row r as the filtered
+// values r .. r + 3. The bottom-right pixel is then overwritten with
+// above[7]. 'left' is never read.
+void aom_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  // The top row as one 64-bit value so that it can be shifted by whole bytes
+  // (zeros enter at the far end).
+  const uint8x8_t top = vld1_u8(above);
+  const uint64x1_t top_bits = vreinterpret_u64_u8(top);
+  const uint8x8_t top_plus1 = vreinterpret_u8_u64(vshr_n_u64(top_bits, 8));
+  const uint8x8_t top_plus2 = vreinterpret_u8_u64(vshr_n_u64(top_bits, 16));
+
+  // filtered[i] = (((top[i] + top[i + 2]) >> 1) + top[i + 1] + 1) >> 1
+  const uint8x8_t outer_avg = vhadd_u8(top, top_plus2);
+  const uint8x8_t filtered = vrhadd_u8(outer_avg, top_plus1);
+  const uint64x1_t filtered_bits = vreinterpret_u64_u8(filtered);
+
+  (void)left;
+
+  // Each row is the filtered vector advanced by one more byte; only the first
+  // four bytes of each are stored.
+  vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(filtered), 0);
+  vst1_lane_u32((uint32_t *)(dst + stride),
+                vreinterpret_u32_u64(vshr_n_u64(filtered_bits, 8)), 0);
+  vst1_lane_u32((uint32_t *)(dst + 2 * stride),
+                vreinterpret_u32_u64(vshr_n_u64(filtered_bits, 16)), 0);
+  vst1_lane_u32((uint32_t *)(dst + 3 * stride),
+                vreinterpret_u32_u64(vshr_n_u64(filtered_bits, 24)), 0);
+
+  // The last pixel takes the raw above[7] instead of the filtered value.
+  dst[3 * stride + 3] = above[7];
+}
+
+// 8x8 D45 predictor. Reads above[0..7], replicating above[7] past the end,
+// smooths it with a truncating half-add of the outer taps followed by a
+// rounding half-add with the centre tap, and writes eight rows where each row
+// is the previous one advanced by one pixel (last pixel repeated). 'left' is
+// never read.
+void aom_d45_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  // Table-lookup indices that advance a vector by one / two bytes while
+  // repeating byte 7 at the end.
+  static const uint8_t advance1[8] = { 1, 2, 3, 4, 5, 6, 7, 7 };
+  static const uint8_t advance2[8] = { 2, 3, 4, 5, 6, 7, 7, 7 };
+  const uint8x8_t idx_plus1 = vld1_u8(advance1);
+  const uint8x8_t idx_plus2 = vld1_u8(advance2);
+  const uint8x8_t top = vld1_u8(above);
+  const uint8x8_t top_plus1 = vtbl1_u8(top, idx_plus1);
+  const uint8x8_t top_plus2 = vtbl1_u8(top, idx_plus2);
+  uint8x8_t line = vrhadd_u8(vhadd_u8(top, top_plus2), top_plus1);
+  int rows_left = 8;
+  (void)left;
+
+  for (;;) {
+    vst1_u8(dst, line);
+    if (--rows_left == 0) break;
+    dst += stride;
+    line = vtbl1_u8(line, idx_plus1);
+  }
+}
+
+// 16x16 D45 predictor. Reads above[0..15], extends it with copies of
+// above[15], smooths it with a truncating half-add of the outer taps followed
+// by a rounding half-add with the centre tap, and writes sixteen rows where
+// each row is the previous one advanced by one pixel with above[15] shifted
+// in at the end. 'left' is never read.
+void aom_d45_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                  const uint8_t *above, const uint8_t *left) {
+  const uint8x16_t top = vld1q_u8(above);
+  const uint8x16_t pad = vld1q_dup_u8(above + 15);
+  const uint8x16_t top_plus1 = vextq_u8(top, pad, 1);
+  const uint8x16_t top_plus2 = vextq_u8(top, pad, 2);
+  uint8x16_t line = vrhaddq_u8(vhaddq_u8(top, top_plus2), top_plus1);
+  int rows_left = 16;
+  (void)left;
+
+  for (;;) {
+    vst1q_u8(dst, line);
+    if (--rows_left == 0) break;
+    dst += stride;
+    line = vextq_u8(line, pad, 1);
+  }
+}
+
+// -----------------------------------------------------------------------------
+
+// 4x4 D135 predictor. Builds the edge L K J I X A B C D from left[3..0]
+// (I = left[0]), the corner X = above[-1] and above[0..3], smooths it with a
+// truncating half-add of the outer taps followed by a rounding half-add with
+// the centre tap, and writes the rows bottom-up: row 3 starts at the first
+// filtered value and each row above it starts one value later.
+// Reads eight bytes starting at above - 1 and four bytes at left.
+void aom_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  // X A B C D . . . as loaded, and X A B C moved into the upper four bytes.
+  const uint8x8_t top = vld1_u8(above - 1);
+  const uint64x1_t top_upper = vshl_n_u64(vreinterpret_u64_u8(top), 32);
+
+  // I J K L in the lower four bytes, then byte-reversed to L K J I.
+  const uint32x2_t left4 =
+      vld1_lane_u32((const uint32_t *)left, vdup_n_u32(0), 0);
+  const uint64x1_t left_reversed =
+      vreinterpret_u64_u8(vrev32_u8(vreinterpret_u8_u32(left4)));
+
+  // edge0 = L K J I X A B C
+  // edge1 = K J I X A B C 0
+  // edge2 = J I X A B C D 0  (D patched in from the original load)
+  const uint64x1_t edge_bits = vorr_u64(left_reversed, top_upper);
+  const uint8x8_t edge0 = vreinterpret_u8_u64(edge_bits);
+  const uint8x8_t edge1 = vreinterpret_u8_u64(vshr_n_u64(edge_bits, 8));
+  const uint8x8_t edge2 =
+      vset_lane_u8(vget_lane_u8(top, 4),
+                   vreinterpret_u8_u64(vshr_n_u64(edge_bits, 16)), 6);
+
+  // filtered[i] = (((edge2[i] + edge0[i]) >> 1) + edge1[i] + 1) >> 1
+  const uint8x8_t filtered = vrhadd_u8(vhadd_u8(edge2, edge0), edge1);
+  const uint64x1_t filtered_bits = vreinterpret_u64_u8(filtered);
+
+  vst1_lane_u32((uint32_t *)dst,
+                vreinterpret_u32_u64(vshr_n_u64(filtered_bits, 24)), 0);
+  vst1_lane_u32((uint32_t *)(dst + stride),
+                vreinterpret_u32_u64(vshr_n_u64(filtered_bits, 16)), 0);
+  vst1_lane_u32((uint32_t *)(dst + 2 * stride),
+                vreinterpret_u32_u64(vshr_n_u64(filtered_bits, 8)), 0);
+  vst1_lane_u32((uint32_t *)(dst + 3 * stride), vreinterpret_u32_u8(filtered),
+                0);
+}
+
+#if !HAVE_NEON_ASM
+
+void aom_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+                              const uint8_t *above, const uint8_t *left) {
+  int i;
+  uint32x2_t d0u32 = vdup_n_u32(0);
+  (void)left;
+
+  d0u32 = vld1_lane_u32((const uint32_t *)above, d0u32, 0);
+  for (i = 0; i < 4; i++, dst += stride)
+    vst1_lane_u32((uint32_t *)dst, d0u32, 0);
+}
+
+void aom_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+                              const uint8_t *above, const uint8_t *left) {
+  int i;
+  uint8x8_t d0u8 = vdup_n_u8(0);
+  (void)left;
+
+  d0u8 = vld1_u8(above);
+  for (i = 0; i < 8; i++, dst += stride) vst1_u8(dst, d0u8);
+}
+
+void aom_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  int i;
+  uint8x16_t q0u8 = vdupq_n_u8(0);
+  (void)left;
+
+  q0u8 = vld1q_u8(above);
+  for (i = 0; i < 16; i++, dst += stride) vst1q_u8(dst, q0u8);
+}
+
+void aom_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  int i;
+  uint8x16_t q0u8 = vdupq_n_u8(0);
+  uint8x16_t q1u8 = vdupq_n_u8(0);
+  (void)left;
+
+  q0u8 = vld1q_u8(above);
+  q1u8 = vld1q_u8(above + 16);
+  for (i = 0; i < 32; i++, dst += stride) {
+    vst1q_u8(dst, q0u8);
+    vst1q_u8(dst + 16, q1u8);
+  }
+}
+
+void aom_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+                              const uint8_t *above, const uint8_t *left) {
+  uint8x8_t d0u8 = vdup_n_u8(0);
+  uint32x2_t d1u32 = vdup_n_u32(0);
+  (void)above;
+
+  d1u32 = vld1_lane_u32((const uint32_t *)left, d1u32, 0);
+
+  d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 0);
+  vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
+  dst += stride;
+  d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 1);
+  vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
+  dst += stride;
+  d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 2);
+  vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
+  dst += stride;
+  d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 3);
+  vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
+}
+
+void aom_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+                              const uint8_t *above, const uint8_t *left) {
+  uint8x8_t d0u8 = vdup_n_u8(0);
+  uint64x1_t d1u64 = vdup_n_u64(0);
+  (void)above;
+
+  d1u64 = vld1_u64((const uint64_t *)left);
+
+  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 0);
+  vst1_u8(dst, d0u8);
+  dst += stride;
+  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 1);
+  vst1_u8(dst, d0u8);
+  dst += stride;
+  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 2);
+  vst1_u8(dst, d0u8);
+  dst += stride;
+  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 3);
+  vst1_u8(dst, d0u8);
+  dst += stride;
+  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 4);
+  vst1_u8(dst, d0u8);
+  dst += stride;
+  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 5);
+  vst1_u8(dst, d0u8);
+  dst += stride;
+  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 6);
+  vst1_u8(dst, d0u8);
+  dst += stride;
+  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 7);
+  vst1_u8(dst, d0u8);
+}
+
+void aom_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  int j;
+  uint8x8_t d2u8 = vdup_n_u8(0);
+  uint8x16_t q0u8 = vdupq_n_u8(0);
+  uint8x16_t q1u8 = vdupq_n_u8(0);
+  (void)above;
+
+  q1u8 = vld1q_u8(left);
+  d2u8 = vget_low_u8(q1u8);
+  for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) {
+    q0u8 = vdupq_lane_u8(d2u8, 0);
+    vst1q_u8(dst, q0u8);
+    dst += stride;
+    q0u8 = vdupq_lane_u8(d2u8, 1);
+    vst1q_u8(dst, q0u8);
+    dst += stride;
+    q0u8 = vdupq_lane_u8(d2u8, 2);
+    vst1q_u8(dst, q0u8);
+    dst += stride;
+    q0u8 = vdupq_lane_u8(d2u8, 3);
+    vst1q_u8(dst, q0u8);
+    dst += stride;
+    q0u8 = vdupq_lane_u8(d2u8, 4);
+    vst1q_u8(dst, q0u8);
+    dst += stride;
+    q0u8 = vdupq_lane_u8(d2u8, 5);
+    vst1q_u8(dst, q0u8);
+    dst += stride;
+    q0u8 = vdupq_lane_u8(d2u8, 6);
+    vst1q_u8(dst, q0u8);
+    dst += stride;
+    q0u8 = vdupq_lane_u8(d2u8, 7);
+    vst1q_u8(dst, q0u8);
+    dst += stride;
+  }
+}
+
+void aom_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  int j, k;
+  uint8x8_t d2u8 = vdup_n_u8(0);
+  uint8x16_t q0u8 = vdupq_n_u8(0);
+  uint8x16_t q1u8 = vdupq_n_u8(0);
+  (void)above;
+
+  for (k = 0; k < 2; k++, left += 16) {
+    q1u8 = vld1q_u8(left);
+    d2u8 = vget_low_u8(q1u8);
+    for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) {
+      q0u8 = vdupq_lane_u8(d2u8, 0);
+      vst1q_u8(dst, q0u8);
+      vst1q_u8(dst + 16, q0u8);
+      dst += stride;
+      q0u8 = vdupq_lane_u8(d2u8, 1);
+      vst1q_u8(dst, q0u8);
+      vst1q_u8(dst + 16, q0u8);
+      dst += stride;
+      q0u8 = vdupq_lane_u8(d2u8, 2);
+      vst1q_u8(dst, q0u8);
+      vst1q_u8(dst + 16, q0u8);
+      dst += stride;
+      q0u8 = vdupq_lane_u8(d2u8, 3);
+      vst1q_u8(dst, q0u8);
+      vst1q_u8(dst + 16, q0u8);
+      dst += stride;
+      q0u8 = vdupq_lane_u8(d2u8, 4);
+      vst1q_u8(dst, q0u8);
+      vst1q_u8(dst + 16, q0u8);
+      dst += stride;
+      q0u8 = vdupq_lane_u8(d2u8, 5);
+      vst1q_u8(dst, q0u8);
+      vst1q_u8(dst + 16, q0u8);
+      dst += stride;
+      q0u8 = vdupq_lane_u8(d2u8, 6);
+      vst1q_u8(dst, q0u8);
+      vst1q_u8(dst + 16, q0u8);
+      dst += stride;
+      q0u8 = vdupq_lane_u8(d2u8, 7);
+      vst1q_u8(dst, q0u8);
+      vst1q_u8(dst + 16, q0u8);
+      dst += stride;
+    }
+  }
+}
+
+void aom_tm_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  int i;
+  uint16x8_t q1u16, q3u16;
+  int16x8_t q1s16;
+  uint8x8_t d0u8 = vdup_n_u8(0);
+  uint32x2_t d2u32 = vdup_n_u32(0);
+
+  d0u8 = vld1_dup_u8(above - 1);
+  d2u32 = vld1_lane_u32((const uint32_t *)above, d2u32, 0);
+  q3u16 = vsubl_u8(vreinterpret_u8_u32(d2u32), d0u8);
+  for (i = 0; i < 4; i++, dst += stride) {
+    q1u16 = vdupq_n_u16((uint16_t)left[i]);
+    q1s16 =
+        vaddq_s16(vreinterpretq_s16_u16(q1u16), vreinterpretq_s16_u16(q3u16));
+    d0u8 = vqmovun_s16(q1s16);
+    vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
+  }
+}
+
+void aom_tm_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  int j;
+  uint16x8_t q0u16, q3u16, q10u16;
+  int16x8_t q0s16;
+  uint16x4_t d20u16;
+  uint8x8_t d0u8, d2u8, d30u8;
+
+  d0u8 = vld1_dup_u8(above - 1);
+  d30u8 = vld1_u8(left);
+  d2u8 = vld1_u8(above);
+  q10u16 = vmovl_u8(d30u8);
+  q3u16 = vsubl_u8(d2u8, d0u8);
+  d20u16 = vget_low_u16(q10u16);
+  for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) {
+    q0u16 = vdupq_lane_u16(d20u16, 0);
+    q0s16 =
+        vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16));
+    d0u8 = vqmovun_s16(q0s16);
+    vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
+    dst += stride;
+    q0u16 = vdupq_lane_u16(d20u16, 1);
+    q0s16 =
+        vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16));
+    d0u8 = vqmovun_s16(q0s16);
+    vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
+    dst += stride;
+    q0u16 = vdupq_lane_u16(d20u16, 2);
+    q0s16 =
+        vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16));
+    d0u8 = vqmovun_s16(q0s16);
+    vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
+    dst += stride;
+    q0u16 = vdupq_lane_u16(d20u16, 3);
+    q0s16 =
+        vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16));
+    d0u8 = vqmovun_s16(q0s16);
+    vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
+    dst += stride;
+  }
+}
+
+void aom_tm_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  int j, k;
+  uint16x8_t q0u16, q2u16, q3u16, q8u16, q10u16;
+  uint8x16_t q0u8, q1u8;
+  int16x8_t q0s16, q1s16, q8s16, q11s16;
+  uint16x4_t d20u16;
+  uint8x8_t d2u8, d3u8, d18u8, d22u8, d23u8;
+
+  q0u8 = vld1q_dup_u8(above - 1);
+  q1u8 = vld1q_u8(above);
+  q2u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8));
+  q3u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8));
+  for (k = 0; k < 2; k++, left += 8) {
+    d18u8 = vld1_u8(left);
+    q10u16 = vmovl_u8(d18u8);
+    d20u16 = vget_low_u16(q10u16);
+    for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) {
+      q0u16 = vdupq_lane_u16(d20u16, 0);
+      q8u16 = vdupq_lane_u16(d20u16, 1);
+      q1s16 =
+          vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q2u16));
+      q0s16 =
+          vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q3u16));
+      q11s16 =
+          vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q2u16));
+      q8s16 =
+          vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q3u16));
+      d2u8 = vqmovun_s16(q1s16);
+      d3u8 = vqmovun_s16(q0s16);
+      d22u8 = vqmovun_s16(q11s16);
+      d23u8 = vqmovun_s16(q8s16);
+      vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8));
+      vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8));
+      dst += stride;
+      vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8));
+      vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8));
+      dst += stride;
+
+      q0u16 = vdupq_lane_u16(d20u16, 2);
+      q8u16 = vdupq_lane_u16(d20u16, 3);
+      q1s16 =
+          vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q2u16));
+      q0s16 =
+          vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q3u16));
+      q11s16 =
+          vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q2u16));
+      q8s16 =
+          vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q3u16));
+      d2u8 = vqmovun_s16(q1s16);
+      d3u8 = vqmovun_s16(q0s16);
+      d22u8 = vqmovun_s16(q11s16);
+      d23u8 = vqmovun_s16(q8s16);
+      vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8));
+      vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8));
+      dst += stride;
+      vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8));
+      vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8));
+      dst += stride;
+    }
+  }
+}
+
+void aom_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  int j, k;
+  uint16x8_t q0u16, q3u16, q8u16, q9u16, q10u16, q11u16;
+  uint8x16_t q0u8, q1u8, q2u8;
+  int16x8_t q12s16, q13s16, q14s16, q15s16;
+  uint16x4_t d6u16;
+  uint8x8_t d0u8, d1u8, d2u8, d3u8, d26u8;
+
+  q0u8 = vld1q_dup_u8(above - 1);
+  q1u8 = vld1q_u8(above);
+  q2u8 = vld1q_u8(above + 16);
+  q8u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8));
+  q9u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8));
+  q10u16 = vsubl_u8(vget_low_u8(q2u8), vget_low_u8(q0u8));
+  q11u16 = vsubl_u8(vget_high_u8(q2u8), vget_high_u8(q0u8));
+  for (k = 0; k < 4; k++, left += 8) {
+    d26u8 = vld1_u8(left);
+    q3u16 = vmovl_u8(d26u8);
+    d6u16 = vget_low_u16(q3u16);
+    for (j = 0; j < 2; j++, d6u16 = vget_high_u16(q3u16)) {
+      q0u16 = vdupq_lane_u16(d6u16, 0);
+      q12s16 =
+          vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16));
+      q13s16 =
+          vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16));
+      q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                         vreinterpretq_s16_u16(q10u16));
+      q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                         vreinterpretq_s16_u16(q11u16));
+      d0u8 = vqmovun_s16(q12s16);
+      d1u8 = vqmovun_s16(q13s16);
+      d2u8 = vqmovun_s16(q14s16);
+      d3u8 = vqmovun_s16(q15s16);
+      q0u8 = vcombine_u8(d0u8, d1u8);
+      q1u8 = vcombine_u8(d2u8, d3u8);
+      vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
+      vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
+      dst += stride;
+
+      q0u16 = vdupq_lane_u16(d6u16, 1);
+      q12s16 =
+          vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16));
+      q13s16 =
+          vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16));
+      q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                         vreinterpretq_s16_u16(q10u16));
+      q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                         vreinterpretq_s16_u16(q11u16));
+      d0u8 = vqmovun_s16(q12s16);
+      d1u8 = vqmovun_s16(q13s16);
+      d2u8 = vqmovun_s16(q14s16);
+      d3u8 = vqmovun_s16(q15s16);
+      q0u8 = vcombine_u8(d0u8, d1u8);
+      q1u8 = vcombine_u8(d2u8, d3u8);
+      vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
+      vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
+      dst += stride;
+
+      q0u16 = vdupq_lane_u16(d6u16, 2);
+      q12s16 =
+          vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16));
+      q13s16 =
+          vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16));
+      q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                         vreinterpretq_s16_u16(q10u16));
+      q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                         vreinterpretq_s16_u16(q11u16));
+      d0u8 = vqmovun_s16(q12s16);
+      d1u8 = vqmovun_s16(q13s16);
+      d2u8 = vqmovun_s16(q14s16);
+      d3u8 = vqmovun_s16(q15s16);
+      q0u8 = vcombine_u8(d0u8, d1u8);
+      q1u8 = vcombine_u8(d2u8, d3u8);
+      vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
+      vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
+      dst += stride;
+
+      q0u16 = vdupq_lane_u16(d6u16, 3);
+      q12s16 =
+          vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16));
+      q13s16 =
+          vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16));
+      q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                         vreinterpretq_s16_u16(q10u16));
+      q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                         vreinterpretq_s16_u16(q11u16));
+      d0u8 = vqmovun_s16(q12s16);
+      d1u8 = vqmovun_s16(q13s16);
+      d2u8 = vqmovun_s16(q14s16);
+      d3u8 = vqmovun_s16(q15s16);
+      q0u8 = vcombine_u8(d0u8, d1u8);
+      q1u8 = vcombine_u8(d2u8, d3u8);
+      vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
+      vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
+      dst += stride;
+    }
+  }
+}
+#endif  // !HAVE_NEON_ASM
--- a/aom_dsp/arm/intrapred_neon_asm.asm
+++ b/aom_dsp/arm/intrapred_neon_asm.asm
@@ -1,32 +1,35 @@
 ;
-;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
 ;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 ;

-    EXPORT  |vpx_v_predictor_4x4_neon|
-    EXPORT  |vpx_v_predictor_8x8_neon|
-    EXPORT  |vpx_v_predictor_16x16_neon|
-    EXPORT  |vpx_v_predictor_32x32_neon|
-    EXPORT  |vpx_h_predictor_4x4_neon|
-    EXPORT  |vpx_h_predictor_8x8_neon|
-    EXPORT  |vpx_h_predictor_16x16_neon|
-    EXPORT  |vpx_h_predictor_32x32_neon|
-    EXPORT  |vpx_tm_predictor_4x4_neon|
-    EXPORT  |vpx_tm_predictor_8x8_neon|
-    EXPORT  |vpx_tm_predictor_16x16_neon|
-    EXPORT  |vpx_tm_predictor_32x32_neon|
+;
+
+    EXPORT  |aom_v_predictor_4x4_neon|
+    EXPORT  |aom_v_predictor_8x8_neon|
+    EXPORT  |aom_v_predictor_16x16_neon|
+    EXPORT  |aom_v_predictor_32x32_neon|
+    EXPORT  |aom_h_predictor_4x4_neon|
+    EXPORT  |aom_h_predictor_8x8_neon|
+    EXPORT  |aom_h_predictor_16x16_neon|
+    EXPORT  |aom_h_predictor_32x32_neon|
+    EXPORT  |aom_tm_predictor_4x4_neon|
+    EXPORT  |aom_tm_predictor_8x8_neon|
+    EXPORT  |aom_tm_predictor_16x16_neon|
+    EXPORT  |aom_tm_predictor_32x32_neon|
    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2

-;void vpx_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride,
+;void aom_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride,
 ;                              const uint8_t *above,
 ;                              const uint8_t *left)
 ; r0  uint8_t *dst
@@ -34,16 +37,16 @@
 ; r2  const uint8_t *above
 ; r3  const uint8_t *left

-|vpx_v_predictor_4x4_neon| PROC
+|aom_v_predictor_4x4_neon| PROC
    vld1.32             {d0[0]}, [r2]
    vst1.32             {d0[0]}, [r0], r1
    vst1.32             {d0[0]}, [r0], r1
    vst1.32             {d0[0]}, [r0], r1
    vst1.32             {d0[0]}, [r0], r1
    bx                  lr
-    ENDP                ; |vpx_v_predictor_4x4_neon|
+    ENDP                ; |aom_v_predictor_4x4_neon|

-;void vpx_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride,
+;void aom_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride,
 ;                              const uint8_t *above,
 ;                              const uint8_t *left)
 ; r0  uint8_t *dst
@@ -51,7 +54,7 @@
 ; r2  const uint8_t *above
 ; r3  const uint8_t *left

-|vpx_v_predictor_8x8_neon| PROC
+|aom_v_predictor_8x8_neon| PROC
    vld1.8              {d0}, [r2]
    vst1.8              {d0}, [r0], r1
    vst1.8              {d0}, [r0], r1
@@ -62,9 +65,9 @@
    vst1.8              {d0}, [r0], r1
    vst1.8              {d0}, [r0], r1
    bx                  lr
-    ENDP                ; |vpx_v_predictor_8x8_neon|
+    ENDP                ; |aom_v_predictor_8x8_neon|

-;void vpx_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride,
+;void aom_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride,
 ;                                const uint8_t *above,
 ;                                const uint8_t *left)
 ; r0  uint8_t *dst
@@ -72,7 +75,7 @@
 ; r2  const uint8_t *above
 ; r3  const uint8_t *left

-|vpx_v_predictor_16x16_neon| PROC
+|aom_v_predictor_16x16_neon| PROC
    vld1.8              {q0}, [r2]
    vst1.8              {q0}, [r0], r1
    vst1.8              {q0}, [r0], r1
@@ -91,9 +94,9 @@
    vst1.8              {q0}, [r0], r1
    vst1.8              {q0}, [r0], r1
    bx                  lr
-    ENDP                ; |vpx_v_predictor_16x16_neon|
+    ENDP                ; |aom_v_predictor_16x16_neon|

-;void vpx_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride,
+;void aom_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride,
 ;                                const uint8_t *above,
 ;                                const uint8_t *left)
 ; r0  uint8_t *dst
@@ -101,7 +104,7 @@
 ; r2  const uint8_t *above
 ; r3  const uint8_t *left

-|vpx_v_predictor_32x32_neon| PROC
+|aom_v_predictor_32x32_neon| PROC
    vld1.8              {q0, q1}, [r2]
    mov                 r2, #2
 loop_v
@@ -124,9 +127,9 @@ loop_v
    subs                r2, r2, #1
    bgt                 loop_v
    bx                  lr
-    ENDP                ; |vpx_v_predictor_32x32_neon|
+    ENDP                ; |aom_v_predictor_32x32_neon|

-;void vpx_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride,
+;void aom_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride,
 ;                              const uint8_t *above,
 ;                              const uint8_t *left)
 ; r0  uint8_t *dst
@@ -134,7 +137,7 @@ loop_v
 ; r2  const uint8_t *above
 ; r3  const uint8_t *left

-|vpx_h_predictor_4x4_neon| PROC
+|aom_h_predictor_4x4_neon| PROC
    vld1.32             {d1[0]}, [r3]
    vdup.8              d0, d1[0]
    vst1.32             {d0[0]}, [r0], r1
@@ -145,9 +148,9 @@ loop_v
    vdup.8              d0, d1[3]
    vst1.32             {d0[0]}, [r0], r1
    bx                  lr
-    ENDP                ; |vpx_h_predictor_4x4_neon|
+    ENDP                ; |aom_h_predictor_4x4_neon|

-;void vpx_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride,
+;void aom_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride,
 ;                              const uint8_t *above,
 ;                              const uint8_t *left)
 ; r0  uint8_t *dst
@@ -155,7 +158,7 @@ loop_v
 ; r2  const uint8_t *above
 ; r3  const uint8_t *left

-|vpx_h_predictor_8x8_neon| PROC
+|aom_h_predictor_8x8_neon| PROC
    vld1.64             {d1}, [r3]
    vdup.8              d0, d1[0]
    vst1.64             {d0}, [r0], r1
@@ -174,9 +177,9 @@ loop_v
    vdup.8              d0, d1[7]
    vst1.64             {d0}, [r0], r1
    bx                  lr
-    ENDP                ; |vpx_h_predictor_8x8_neon|
+    ENDP                ; |aom_h_predictor_8x8_neon|

-;void vpx_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride,
+;void aom_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride,
 ;                                const uint8_t *above,
 ;                                const uint8_t *left)
 ; r0  uint8_t *dst
@@ -184,7 +187,7 @@ loop_v
 ; r2  const uint8_t *above
 ; r3  const uint8_t *left

-|vpx_h_predictor_16x16_neon| PROC
+|aom_h_predictor_16x16_neon| PROC
    vld1.8              {q1}, [r3]
    vdup.8              q0, d2[0]
    vst1.8              {q0}, [r0], r1
@@ -219,9 +222,9 @@ loop_v
    vdup.8              q0, d3[7]
    vst1.8              {q0}, [r0], r1
    bx                  lr
-    ENDP                ; |vpx_h_predictor_16x16_neon|
+    ENDP                ; |aom_h_predictor_16x16_neon|

-;void vpx_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride,
+;void aom_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride,
 ;                                const uint8_t *above,
 ;                                const uint8_t *left)
 ; r0  uint8_t *dst
@@ -229,7 +232,7 @@ loop_v
 ; r2  const uint8_t *above
 ; r3  const uint8_t *left

-|vpx_h_predictor_32x32_neon| PROC
+|aom_h_predictor_32x32_neon| PROC
    sub                 r1, r1, #16
    mov                 r2, #2
 loop_h
@@ -285,9 +288,9 @@ loop_h
    subs                r2, r2, #1
    bgt                 loop_h
    bx                  lr
-    ENDP                ; |vpx_h_predictor_32x32_neon|
+    ENDP                ; |aom_h_predictor_32x32_neon|

-;void vpx_tm_predictor_4x4_neon (uint8_t *dst, ptrdiff_t y_stride,
+;void aom_tm_predictor_4x4_neon (uint8_t *dst, ptrdiff_t y_stride,
 ;                                const uint8_t *above,
 ;                                const uint8_t *left)
 ; r0  uint8_t *dst
@@ -295,7 +298,7 @@ loop_h
 ; r2  const uint8_t *above
 ; r3  const uint8_t *left

-|vpx_tm_predictor_4x4_neon| PROC
+|aom_tm_predictor_4x4_neon| PROC
    ; Load ytop_left = above[-1];
    sub                 r12, r2, #1
    vld1.u8             {d0[]}, [r12]
@@ -331,9 +334,9 @@ loop_h
    vst1.32             {d0[0]}, [r0], r1
    vst1.32             {d1[0]}, [r0], r1
    bx                  lr
-    ENDP                ; |vpx_tm_predictor_4x4_neon|
+    ENDP                ; |aom_tm_predictor_4x4_neon|

-;void vpx_tm_predictor_8x8_neon (uint8_t *dst, ptrdiff_t y_stride,
+;void aom_tm_predictor_8x8_neon (uint8_t *dst, ptrdiff_t y_stride,
 ;                                const uint8_t *above,
 ;                                const uint8_t *left)
 ; r0  uint8_t *dst
@@ -341,7 +344,7 @@ loop_h
 ; r2  const uint8_t *above
 ; r3  const uint8_t *left

-|vpx_tm_predictor_8x8_neon| PROC
+|aom_tm_predictor_8x8_neon| PROC
    ; Load ytop_left = above[-1];
    sub                 r12, r2, #1
    vld1.8              {d0[]}, [r12]
@@ -403,9 +406,9 @@ loop_h
    vst1.64             {d3}, [r0], r1

    bx                  lr
-    ENDP                ; |vpx_tm_predictor_8x8_neon|
+    ENDP                ; |aom_tm_predictor_8x8_neon|

-;void vpx_tm_predictor_16x16_neon (uint8_t *dst, ptrdiff_t y_stride,
+;void aom_tm_predictor_16x16_neon (uint8_t *dst, ptrdiff_t y_stride,
 ;                                const uint8_t *above,
 ;                                const uint8_t *left)
 ; r0  uint8_t *dst
@@ -413,7 +416,7 @@ loop_h
 ; r2  const uint8_t *above
 ; r3  const uint8_t *left

-|vpx_tm_predictor_16x16_neon| PROC
+|aom_tm_predictor_16x16_neon| PROC
    ; Load ytop_left = above[-1];
    sub                 r12, r2, #1
    vld1.8              {d0[]}, [r12]
@@ -496,9 +499,9 @@ loop_16x16_neon
    bgt                 loop_16x16_neon

    bx                  lr
-    ENDP                ; |vpx_tm_predictor_16x16_neon|
+    ENDP                ; |aom_tm_predictor_16x16_neon|

-;void vpx_tm_predictor_32x32_neon (uint8_t *dst, ptrdiff_t y_stride,
+;void aom_tm_predictor_32x32_neon (uint8_t *dst, ptrdiff_t y_stride,
 ;                                  const uint8_t *above,
 ;                                  const uint8_t *left)
 ; r0  uint8_t *dst
@@ -506,7 +509,7 @@ loop_16x16_neon
 ; r2  const uint8_t *above
 ; r3  const uint8_t *left

-|vpx_tm_predictor_32x32_neon| PROC
+|aom_tm_predictor_32x32_neon| PROC
    ; Load ytop_left = above[-1];
    sub                 r12, r2, #1
    vld1.8              {d0[]}, [r12]
@@ -625,6 +628,6 @@ loop_32x32_neon
    bgt                 loop_32x32_neon

    bx                  lr
-    ENDP                ; |vpx_tm_predictor_32x32_neon|
+    ENDP                ; |aom_tm_predictor_32x32_neon|

    END
--- a/aom_dsp/arm/loopfilter_16_neon.asm
+++ b/aom_dsp/arm/loopfilter_16_neon.asm
@@ -0,0 +1,202 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+    EXPORT  |aom_lpf_horizontal_4_dual_neon|
+    ARM
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+;void aom_lpf_horizontal_4_dual_neon(uint8_t *s, int p,
+;                                    const uint8_t *blimit0,
+;                                    const uint8_t *limit0,
+;                                    const uint8_t *thresh0,
+;                                    const uint8_t *blimit1,
+;                                    const uint8_t *limit1,
+;                                    const uint8_t *thresh1)
+; r0    uint8_t *s,
+; r1    int p,
+; r2    const uint8_t *blimit0,
+; r3    const uint8_t *limit0,
+; sp    const uint8_t *thresh0,
+; sp+4  const uint8_t *blimit1,
+; sp+8  const uint8_t *limit1,
+; sp+12 const uint8_t *thresh1,
+
+|aom_lpf_horizontal_4_dual_neon| PROC
+    push        {lr}
+
+    ldr         r12, [sp, #4]              ; load thresh0
+    vld1.8      {d0}, [r2]                 ; load blimit0 to first half q
+    vld1.8      {d2}, [r3]                 ; load limit0 to first half q
+
+    add         r1, r1, r1                 ; double pitch
+    ldr         r2, [sp, #8]               ; load blimit1
+
+    vld1.8      {d4}, [r12]                ; load thresh0 to first half q
+
+    ldr         r3, [sp, #12]              ; load limit1
+    ldr         r12, [sp, #16]             ; load thresh1
+    vld1.8      {d1}, [r2]                 ; load blimit1 to 2nd half q
+
+    sub         r2, r0, r1, lsl #1         ; s[-4 * p]
+
+    vld1.8      {d3}, [r3]                 ; load limit1 to 2nd half q
+    vld1.8      {d5}, [r12]                ; load thresh1 to 2nd half q
+
+    vpush       {d8-d15}                   ; save neon registers
+
+    add         r3, r2, r1, lsr #1         ; s[-3 * p]
+
+    vld1.u8     {q3}, [r2@64], r1          ; p3
+    vld1.u8     {q4}, [r3@64], r1          ; p2
+    vld1.u8     {q5}, [r2@64], r1          ; p1
+    vld1.u8     {q6}, [r3@64], r1          ; p0
+    vld1.u8     {q7}, [r2@64], r1          ; q0
+    vld1.u8     {q8}, [r3@64], r1          ; q1
+    vld1.u8     {q9}, [r2@64]              ; q2
+    vld1.u8     {q10}, [r3@64]             ; q3
+
+    sub         r2, r2, r1, lsl #1
+    sub         r3, r3, r1, lsl #1
+
+    bl          aom_loop_filter_neon_16
+
+    vst1.u8     {q5}, [r2@64], r1          ; store op1
+    vst1.u8     {q6}, [r3@64], r1          ; store op0
+    vst1.u8     {q7}, [r2@64], r1          ; store oq0
+    vst1.u8     {q8}, [r3@64], r1          ; store oq1
+
+    vpop        {d8-d15}                   ; restore neon registers
+
+    pop         {pc}
+    ENDP        ; |aom_lpf_horizontal_4_dual_neon|
+
+; void aom_loop_filter_neon_16();
+; This is a helper function for the loopfilters. The individual functions do the
+; necessary load, transpose (if necessary) and store. This function uses
+; registers d8-d15, so the calling function must save those registers.
+;
+; r0-r3, r12 are preserved
+; q0    blimit
+; q1    limit
+; q2    thresh
+; q3    p3
+; q4    p2
+; q5    p1
+; q6    p0
+; q7    q0
+; q8    q1
+; q9    q2
+; q10   q3
+;
+; Outputs:
+; q5    op1
+; q6    op0
+; q7    oq0
+; q8    oq1
+|aom_loop_filter_neon_16| PROC
+
+    ; filter_mask
+    vabd.u8     q11, q3, q4                 ; m1 = abs(p3 - p2)
+    vabd.u8     q12, q4, q5                 ; m2 = abs(p2 - p1)
+    vabd.u8     q13, q5, q6                 ; m3 = abs(p1 - p0)
+    vabd.u8     q14, q8, q7                 ; m4 = abs(q1 - q0)
+    vabd.u8     q3, q9, q8                  ; m5 = abs(q2 - q1)
+    vabd.u8     q4, q10, q9                 ; m6 = abs(q3 - q2)
+
+    ; only compare the largest value to limit
+    vmax.u8     q11, q11, q12               ; m7 = max(m1, m2)
+    vmax.u8     q12, q13, q14               ; m8 = max(m3, m4)
+
+    vabd.u8     q9, q6, q7                  ; abs(p0 - q0)
+
+    vmax.u8     q3, q3, q4                  ; m9 = max(m5, m6)
+
+    vmov.u8     q10, #0x80
+
+    vmax.u8     q15, q11, q12               ; m10 = max(m7, m8)
+
+    vcgt.u8     q13, q13, q2                ; (abs(p1 - p0) > thresh)*-1
+    vcgt.u8     q14, q14, q2                ; (abs(q1 - q0) > thresh)*-1
+    vmax.u8     q15, q15, q3                ; m11 = max(m10, m9)
+
+    vabd.u8     q2, q5, q8                  ; a = abs(p1 - q1)
+    vqadd.u8    q9, q9, q9                  ; b = abs(p0 - q0) * 2
+
+    veor        q7, q7, q10                 ; qs0
+
+    vcge.u8     q15, q1, q15                ; (m11 <= limit)*-1
+
+    vshr.u8     q2, q2, #1                  ; a = a / 2
+    veor        q6, q6, q10                 ; ps0
+
+    veor        q5, q5, q10                 ; ps1
+    vqadd.u8    q9, q9, q2                  ; a = b + a
+
+    veor        q8, q8, q10                 ; qs1
+
+    vmov.u16    q4, #3
+
+    vsubl.s8    q2, d14, d12                ; ( qs0 - ps0)
+    vsubl.s8    q11, d15, d13
+
+    vcge.u8     q9, q0, q9                  ; (a <= blimit)*-1
+
+    vqsub.s8    q1, q5, q8                  ; filter = clamp(ps1-qs1)
+    vorr        q14, q13, q14               ; hev
+
+    vmul.i16    q2, q2, q4                  ; 3 * ( qs0 - ps0)
+    vmul.i16    q11, q11, q4
+
+    vand        q1, q1, q14                 ; filter &= hev
+    vand        q15, q15, q9                ; mask
+
+    vmov.u8     q4, #3
+
+    vaddw.s8    q2, q2, d2                  ; filter + 3 * (qs0 - ps0)
+    vaddw.s8    q11, q11, d3
+
+    vmov.u8     q9, #4
+
+    ; filter = clamp(filter + 3 * ( qs0 - ps0))
+    vqmovn.s16  d2, q2
+    vqmovn.s16  d3, q11
+    vand        q1, q1, q15                 ; filter &= mask
+
+    vqadd.s8    q2, q1, q4                  ; filter2 = clamp(filter+3)
+    vqadd.s8    q1, q1, q9                  ; filter1 = clamp(filter+4)
+    vshr.s8     q2, q2, #3                  ; filter2 >>= 3
+    vshr.s8     q1, q1, #3                  ; filter1 >>= 3
+
+
+    vqadd.s8    q11, q6, q2                 ; u = clamp(ps0 + filter2)
+    vqsub.s8    q0, q7, q1                  ; u = clamp(qs0 - filter1)
+
+    ; outer tap adjustments
+    vrshr.s8    q1, q1, #1                  ; filter = ++filter1 >> 1
+
+    veor        q7, q0,  q10                ; *oq0 = u^0x80
+
+    vbic        q1, q1, q14                 ; filter &= ~hev
+
+    vqadd.s8    q13, q5, q1                 ; u = clamp(ps1 + filter)
+    vqsub.s8    q12, q8, q1                 ; u = clamp(qs1 - filter)
+
+    veor        q6, q11, q10                ; *op0 = u^0x80
+    veor        q5, q13, q10                ; *op1 = u^0x80
+    veor        q8, q12, q10                ; *oq1 = u^0x80
+
+    bx          lr
+    ENDP        ; |aom_loop_filter_neon_16|
+
+    END
--- a/aom_dsp/arm/loopfilter_16_neon.c
+++ b/aom_dsp/arm/loopfilter_16_neon.c
@@ -0,0 +1,174 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "./aom_dsp_rtcd.h"
+#include "./aom_config.h"
+#include "aom/aom_integer.h"
+
+static INLINE void loop_filter_neon_16(uint8x16_t qblimit,  // blimit
+                                       uint8x16_t qlimit,   // limit
+                                       uint8x16_t qthresh,  // thresh
+                                       uint8x16_t q3,       // p3
+                                       uint8x16_t q4,       // p2
+                                       uint8x16_t q5,       // p1
+                                       uint8x16_t q6,       // p0
+                                       uint8x16_t q7,       // q0
+                                       uint8x16_t q8,       // q1
+                                       uint8x16_t q9,       // q2
+                                       uint8x16_t q10,      // q3
+                                       uint8x16_t *q5r,     // out: p1
+                                       uint8x16_t *q6r,     // out: p0
+                                       uint8x16_t *q7r,     // out: q0
+                                       uint8x16_t *q8r) {   // out: q1
+  uint8x16_t q1u8, q2u8, q11u8, q12u8, q13u8, q14u8, q15u8;
+  int16x8_t q2s16, q11s16;
+  uint16x8_t q4u16;
+  int8x16_t q0s8, q1s8, q2s8, q11s8, q12s8, q13s8;
+  int8x8_t d2s8, d3s8;
+  // 16-lane loop filter core: writes filtered p1/p0/q0/q1 to *q5r..*q8r.
+  q11u8 = vabdq_u8(q3, q4);  // |p3 - p2|
+  q12u8 = vabdq_u8(q4, q5);  // |p2 - p1|
+  q13u8 = vabdq_u8(q5, q6);  // |p1 - p0|
+  q14u8 = vabdq_u8(q8, q7);  // |q1 - q0|
+  q3 = vabdq_u8(q9, q8);     // |q2 - q1|
+  q4 = vabdq_u8(q10, q9);    // |q3 - q2|
+
+  q11u8 = vmaxq_u8(q11u8, q12u8);
+  q12u8 = vmaxq_u8(q13u8, q14u8);
+  q3 = vmaxq_u8(q3, q4);
+  q15u8 = vmaxq_u8(q11u8, q12u8);
+
+  q9 = vabdq_u8(q6, q7);  // |p0 - q0|
+
+  // hev: lanes where |p1 - p0| > thresh or |q1 - q0| > thresh
+  q13u8 = vcgtq_u8(q13u8, qthresh);
+  q14u8 = vcgtq_u8(q14u8, qthresh);
+  q15u8 = vmaxq_u8(q15u8, q3);  // max of the six neighbour differences
+
+  q2u8 = vabdq_u8(q5, q8);  // |p1 - q1|
+  q9 = vqaddq_u8(q9, q9);   // 2 * |p0 - q0| (saturating)
+
+  q15u8 = vcgeq_u8(qlimit, q15u8);  // all-ones where max difference <= limit
+
+  // aom_filter() function
+  // convert to signed
+  q10 = vdupq_n_u8(0x80);
+  q8 = veorq_u8(q8, q10);  // qs1
+  q7 = veorq_u8(q7, q10);  // qs0
+  q6 = veorq_u8(q6, q10);  // ps0
+  q5 = veorq_u8(q5, q10);  // ps1
+
+  q2u8 = vshrq_n_u8(q2u8, 1);  // |p1 - q1| / 2
+  q9 = vqaddq_u8(q9, q2u8);    // 2 * |p0 - q0| + |p1 - q1| / 2
+
+  q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)),
+                   vget_low_s8(vreinterpretq_s8_u8(q6)));  // qs0 - ps0, low
+  q11s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)),
+                    vget_high_s8(vreinterpretq_s8_u8(q6)));  // qs0 - ps0, high
+
+  q9 = vcgeq_u8(qblimit, q9);  // all-ones where that sum <= blimit
+
+  q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5), vreinterpretq_s8_u8(q8));
+
+  q14u8 = vorrq_u8(q13u8, q14u8);  // hev
+
+  q4u16 = vdupq_n_u16(3);
+  q2s16 = vmulq_s16(q2s16, vreinterpretq_s16_u16(q4u16));  // 3 * (qs0 - ps0)
+  q11s16 = vmulq_s16(q11s16, vreinterpretq_s16_u16(q4u16));
+
+  q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q14u8);  // filter &= hev
+  q15u8 = vandq_u8(q15u8, q9);                        // mask
+
+  q1s8 = vreinterpretq_s8_u8(q1u8);
+  q2s16 = vaddw_s8(q2s16, vget_low_s8(q1s8));  // filter + 3 * (qs0 - ps0)
+  q11s16 = vaddw_s8(q11s16, vget_high_s8(q1s8));
+
+  q4 = vdupq_n_u8(3);
+  q9 = vdupq_n_u8(4);
+  // aom_filter = clamp(aom_filter + 3 * ( qs0 - ps0))
+  d2s8 = vqmovn_s16(q2s16);
+  d3s8 = vqmovn_s16(q11s16);
+  q1s8 = vcombine_s8(d2s8, d3s8);
+  q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q15u8);  // filter &= mask
+  q1s8 = vreinterpretq_s8_u8(q1u8);
+
+  q2s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q4));  // clamp(filter + 3)
+  q1s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q9));  // clamp(filter + 4)
+  q2s8 = vshrq_n_s8(q2s8, 3);                       // filter2 >>= 3
+  q1s8 = vshrq_n_s8(q1s8, 3);                       // filter1 >>= 3
+
+  q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q2s8);  // clamp(ps0 + filter2)
+  q0s8 = vqsubq_s8(vreinterpretq_s8_u8(q7), q1s8);   // clamp(qs0 - filter1)
+
+  q1s8 = vrshrq_n_s8(q1s8, 1);                        // (filter1 + 1) >> 1
+  q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8));  // filter &= ~hev
+
+  q13s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q1s8);  // clamp(ps1 + filter)
+  q12s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q1s8);  // clamp(qs1 - filter)
+
+  *q8r = veorq_u8(vreinterpretq_u8_s8(q12s8), q10);  // oq1, back to unsigned
+  *q7r = veorq_u8(vreinterpretq_u8_s8(q0s8), q10);   // oq0
+  *q6r = veorq_u8(vreinterpretq_u8_s8(q11s8), q10);  // op0
+  *q5r = veorq_u8(vreinterpretq_u8_s8(q13s8), q10);  // op1
+  return;
+}
+
+void aom_lpf_horizontal_4_dual_neon(
+    uint8_t *s, int p /* pitch */, const uint8_t *blimit0,
+    const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1,
+    const uint8_t *limit1, const uint8_t *thresh1) {
+  uint8x8_t dblimit0, dlimit0, dthresh0, dblimit1, dlimit1, dthresh1;
+  uint8x16_t qblimit, qlimit, qthresh;
+  uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8, q10u8;
+  // Filters 16 bytes per row: set 0 controls lanes 0-7, set 1 lanes 8-15.
+  dblimit0 = vld1_u8(blimit0);  // each vld1_u8 reads 8 bytes
+  dlimit0 = vld1_u8(limit0);
+  dthresh0 = vld1_u8(thresh0);
+  dblimit1 = vld1_u8(blimit1);
+  dlimit1 = vld1_u8(limit1);
+  dthresh1 = vld1_u8(thresh1);
+  qblimit = vcombine_u8(dblimit0, dblimit1);  // low half: set 0, high: set 1
+  qlimit = vcombine_u8(dlimit0, dlimit1);
+  qthresh = vcombine_u8(dthresh0, dthresh1);
+
+  s -= (p << 2);  // s - 4 * p: the p3 row
+
+  q3u8 = vld1q_u8(s);  // p3
+  s += p;
+  q4u8 = vld1q_u8(s);  // p2
+  s += p;
+  q5u8 = vld1q_u8(s);  // p1
+  s += p;
+  q6u8 = vld1q_u8(s);  // p0
+  s += p;
+  q7u8 = vld1q_u8(s);  // q0
+  s += p;
+  q8u8 = vld1q_u8(s);  // q1
+  s += p;
+  q9u8 = vld1q_u8(s);  // q2
+  s += p;
+  q10u8 = vld1q_u8(s);  // q3
+
+  loop_filter_neon_16(qblimit, qlimit, qthresh, q3u8, q4u8, q5u8, q6u8, q7u8,
+                      q8u8, q9u8, q10u8, &q5u8, &q6u8, &q7u8, &q8u8);
+
+  s -= (p * 5);  // s is at the q3 row; step back to the p1 row
+  vst1q_u8(s, q5u8);  // p1
+  s += p;
+  vst1q_u8(s, q6u8);  // p0
+  s += p;
+  vst1q_u8(s, q7u8);  // q0
+  s += p;
+  vst1q_u8(s, q8u8);  // q1
+  return;
+}
--- a/aom_dsp/arm/loopfilter_4_neon.asm
+++ b/aom_dsp/arm/loopfilter_4_neon.asm
@@ -0,0 +1,252 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+    EXPORT  |aom_lpf_horizontal_4_neon|
+    EXPORT  |aom_lpf_vertical_4_neon|
+    ARM
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; Currently aom only works on iterations 8 at a time. The aom loop filter
+; works on 16 iterations at a time.
+;
+; void aom_lpf_horizontal_4_neon(uint8_t *s,
+;                                int p /* pitch */,
+;                                const uint8_t *blimit,
+;                                const uint8_t *limit,
+;                                const uint8_t *thresh)
+;
+; r0    uint8_t *s,
+; r1    int p, /* pitch */
+; r2    const uint8_t *blimit,
+; r3    const uint8_t *limit,
+; sp    const uint8_t *thresh,
+|aom_lpf_horizontal_4_neon| PROC
+    push        {lr}
+
+    vld1.8      {d0[]}, [r2]               ; duplicate *blimit
+    ldr         r2, [sp, #4]               ; load thresh (stack arg above saved lr)
+    add         r1, r1, r1                 ; r1 = 2 * pitch (r2/r3 take alternate rows)
+
+    vld1.8      {d1[]}, [r3]               ; duplicate *limit
+    vld1.8      {d2[]}, [r2]               ; duplicate *thresh
+
+    sub         r2, r0, r1, lsl #1         ; r2 = s - 4 * pitch (p3 row)
+    add         r3, r2, r1, lsr #1         ; r3 = s - 3 * pitch (p2 row)
+
+    vld1.u8     {d3}, [r2@64], r1          ; p3
+    vld1.u8     {d4}, [r3@64], r1          ; p2
+    vld1.u8     {d5}, [r2@64], r1          ; p1
+    vld1.u8     {d6}, [r3@64], r1          ; p0
+    vld1.u8     {d7}, [r2@64], r1          ; q0
+    vld1.u8     {d16}, [r3@64], r1         ; q1
+    vld1.u8     {d17}, [r2@64]             ; q2
+    vld1.u8     {d18}, [r3@64]             ; q3
+
+    sub         r2, r2, r1, lsl #1         ; r2 = s - 2 * pitch (op1 row)
+    sub         r3, r3, r1, lsl #1         ; r3 = s - pitch (op0 row)
+
+    bl          aom_loop_filter_neon
+
+    vst1.u8     {d4}, [r2@64], r1          ; store op1
+    vst1.u8     {d5}, [r3@64], r1          ; store op0
+    vst1.u8     {d6}, [r2@64], r1          ; store oq0
+    vst1.u8     {d7}, [r3@64], r1          ; store oq1
+
+    pop         {pc}
+    ENDP        ; |aom_lpf_horizontal_4_neon|
+
+; Currently aom only works on iterations 8 at a time. The aom loop filter
+; works on 16 iterations at a time.
+;
+; void aom_lpf_vertical_4_neon(uint8_t *s,
+;                              int p /* pitch */,
+;                              const uint8_t *blimit,
+;                              const uint8_t *limit,
+;                              const uint8_t *thresh)
+;
+; r0    uint8_t *s,
+; r1    int p, /* pitch */
+; r2    const uint8_t *blimit,
+; r3    const uint8_t *limit,
+; sp    const uint8_t *thresh,
+|aom_lpf_vertical_4_neon| PROC
+    push        {lr}
+
+    vld1.8      {d0[]}, [r2]              ; duplicate *blimit
+    vld1.8      {d1[]}, [r3]              ; duplicate *limit
+
+    ldr         r3, [sp, #4]              ; load thresh (stack arg above saved lr)
+    sub         r2, r0, #4                ; r2 = s - 4: each row starts 4 bytes before s
+
+    vld1.8      {d2[]}, [r3]              ; duplicate *thresh
+
+    vld1.u8     {d3}, [r2], r1             ; load 8 rows of 8 bytes, one per pitch
+    vld1.u8     {d4}, [r2], r1
+    vld1.u8     {d5}, [r2], r1
+    vld1.u8     {d6}, [r2], r1
+    vld1.u8     {d7}, [r2], r1
+    vld1.u8     {d16}, [r2], r1
+    vld1.u8     {d17}, [r2], r1
+    vld1.u8     {d18}, [r2]
+
+    ;transpose the 8x8 byte block: d3-d7, d16-d18 become p3..q3
+    vtrn.32     d3, d7
+    vtrn.32     d4, d16
+    vtrn.32     d5, d17
+    vtrn.32     d6, d18
+
+    vtrn.16     d3, d5
+    vtrn.16     d4, d6
+    vtrn.16     d7, d17
+    vtrn.16     d16, d18
+
+    vtrn.8      d3, d4
+    vtrn.8      d5, d6
+    vtrn.8      d7, d16
+    vtrn.8      d17, d18
+
+    bl          aom_loop_filter_neon
+
+    sub         r0, r0, #2                 ; r0 = s - 2: where op1 is written
+
+    ;store op1, op0, oq0, oq1 as 4 consecutive bytes per row
+    vst4.8      {d4[0], d5[0], d6[0], d7[0]}, [r0], r1
+    vst4.8      {d4[1], d5[1], d6[1], d7[1]}, [r0], r1
+    vst4.8      {d4[2], d5[2], d6[2], d7[2]}, [r0], r1
+    vst4.8      {d4[3], d5[3], d6[3], d7[3]}, [r0], r1
+    vst4.8      {d4[4], d5[4], d6[4], d7[4]}, [r0], r1
+    vst4.8      {d4[5], d5[5], d6[5], d7[5]}, [r0], r1
+    vst4.8      {d4[6], d5[6], d6[6], d7[6]}, [r0], r1
+    vst4.8      {d4[7], d5[7], d6[7], d7[7]}, [r0]
+
+    pop         {pc}
+    ENDP        ; |aom_lpf_vertical_4_neon|
+
+; void aom_loop_filter_neon();
+; This is a helper function for the loopfilters. The individual functions do the
+; necessary load, transpose (if necessary) and store. The function does not use
+; registers d8-d15.
+;
+; Inputs:
+; r0-r3, r12 PRESERVE
+; d0    blimit
+; d1    limit
+; d2    thresh
+; d3    p3
+; d4    p2
+; d5    p1
+; d6    p0
+; d7    q0
+; d16   q1
+; d17   q2
+; d18   q3
+;
+; Outputs:
+; d4    op1
+; d5    op0
+; d6    oq0
+; d7    oq1
+|aom_loop_filter_neon| PROC
+    ; filter_mask
+    vabd.u8     d19, d3, d4                 ; m1 = abs(p3 - p2)
+    vabd.u8     d20, d4, d5                 ; m2 = abs(p2 - p1)
+    vabd.u8     d21, d5, d6                 ; m3 = abs(p1 - p0)
+    vabd.u8     d22, d16, d7                ; m4 = abs(q1 - q0)
+    vabd.u8     d3, d17, d16                ; m5 = abs(q2 - q1)
+    vabd.u8     d4, d18, d17                ; m6 = abs(q3 - q2)
+
+    ; only compare the largest value to limit
+    vmax.u8     d19, d19, d20               ; m1 = max(m1, m2)
+    vmax.u8     d20, d21, d22               ; m2 = max(m3, m4)
+
+    vabd.u8     d17, d6, d7                 ; abs(p0 - q0)
+
+    vmax.u8     d3, d3, d4                  ; m3 = max(m5, m6)
+
+    vmov.u8     d18, #0x80                  ; XOR constant for unsigned <-> signed
+
+    vmax.u8     d23, d19, d20               ; m1 = max(m1, m2)
+
+    ; hevmask
+    vcgt.u8     d21, d21, d2                ; (abs(p1 - p0) > thresh)*-1
+    vcgt.u8     d22, d22, d2                ; (abs(q1 - q0) > thresh)*-1
+    vmax.u8     d23, d23, d3                ; m1 = max(m1, m3)
+
+    vabd.u8     d28, d5, d16                ; a = abs(p1 - q1)
+    vqadd.u8    d17, d17, d17               ; b = abs(p0 - q0) * 2
+
+    veor        d7, d7, d18                 ; qs0
+
+    vcge.u8     d23, d1, d23                ; (limit >= m1)*-1
+
+    ; filter() function
+    ; convert to signed
+
+    vshr.u8     d28, d28, #1                ; a = a / 2
+    veor        d6, d6, d18                 ; ps0
+
+    veor        d5, d5, d18                 ; ps1
+    vqadd.u8    d17, d17, d28               ; a = b + a
+
+    veor        d16, d16, d18               ; qs1
+
+    vmov.u8     d19, #3
+
+    vsub.s8     d28, d7, d6                 ; ( qs0 - ps0)
+
+    vcge.u8     d17, d0, d17                ; (blimit >= a)*-1
+
+    vqsub.s8    d27, d5, d16                ; filter = clamp(ps1-qs1)
+    vorr        d22, d21, d22               ; hevmask
+
+    vmull.s8    q12, d28, d19               ; 3 * ( qs0 - ps0)
+
+    vand        d27, d27, d22               ; filter &= hev
+    vand        d23, d23, d17               ; filter_mask
+
+    vaddw.s8    q12, q12, d27               ; filter + 3 * (qs0 - ps0)
+
+    vmov.u8     d17, #4
+
+    ; filter = clamp(filter + 3 * ( qs0 - ps0))
+    vqmovn.s16  d27, q12
+
+    vand        d27, d27, d23               ; filter &= mask
+
+    vqadd.s8    d28, d27, d19               ; filter2 = clamp(filter+3)
+    vqadd.s8    d27, d27, d17               ; filter1 = clamp(filter+4)
+    vshr.s8     d28, d28, #3                ; filter2 >>= 3
+    vshr.s8     d27, d27, #3                ; filter1 >>= 3
+
+    vqadd.s8    d19, d6, d28                ; u = clamp(ps0 + filter2)
+    vqsub.s8    d26, d7, d27                ; u = clamp(qs0 - filter1)
+
+    ; outer tap adjustments
+    vrshr.s8    d27, d27, #1                ; filter = ++filter1 >> 1
+
+    veor        d6, d26, d18                ; *oq0 = u^0x80
+
+    vbic        d27, d27, d22               ; filter &= ~hev
+
+    vqadd.s8    d21, d5, d27                ; u = clamp(ps1 + filter)
+    vqsub.s8    d20, d16, d27               ; u = clamp(qs1 - filter)
+
+    veor        d5, d19, d18                ; *op0 = u^0x80
+    veor        d4, d21, d18                ; *op1 = u^0x80
+    veor        d7, d20, d18                ; *oq1 = u^0x80
+
+    bx          lr
+    ENDP        ; |aom_loop_filter_neon|
+
+    END
--- a/aom_dsp/arm/loopfilter_4_neon.c
+++ b/aom_dsp/arm/loopfilter_4_neon.c
@@ -0,0 +1,250 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "./aom_dsp_rtcd.h"
+
+static INLINE void loop_filter_neon(uint8x8_t dblimit,   // blimit
+                                    uint8x8_t dlimit,    // limit
+                                    uint8x8_t dthresh,   // thresh
+                                    uint8x8_t d3u8,      // p3
+                                    uint8x8_t d4u8,      // p2
+                                    uint8x8_t d5u8,      // p1
+                                    uint8x8_t d6u8,      // p0
+                                    uint8x8_t d7u8,      // q0
+                                    uint8x8_t d16u8,     // q1
+                                    uint8x8_t d17u8,     // q2
+                                    uint8x8_t d18u8,     // q3
+                                    uint8x8_t *d4ru8,    // out: p1
+                                    uint8x8_t *d5ru8,    // out: p0
+                                    uint8x8_t *d6ru8,    // out: q0
+                                    uint8x8_t *d7ru8) {  // out: q1
+  uint8x8_t d19u8, d20u8, d21u8, d22u8, d23u8, d27u8, d28u8;
+  int16x8_t q12s16;
+  int8x8_t d19s8, d20s8, d21s8, d26s8, d27s8, d28s8;
+  // 8-lane loop filter core: writes filtered p1/p0/q0/q1 to *d4ru8..*d7ru8.
+  d19u8 = vabd_u8(d3u8, d4u8);    // |p3 - p2|
+  d20u8 = vabd_u8(d4u8, d5u8);    // |p2 - p1|
+  d21u8 = vabd_u8(d5u8, d6u8);    // |p1 - p0|
+  d22u8 = vabd_u8(d16u8, d7u8);   // |q1 - q0|
+  d3u8 = vabd_u8(d17u8, d16u8);   // |q2 - q1|
+  d4u8 = vabd_u8(d18u8, d17u8);   // |q3 - q2|
+
+  d19u8 = vmax_u8(d19u8, d20u8);
+  d20u8 = vmax_u8(d21u8, d22u8);
+  d3u8 = vmax_u8(d3u8, d4u8);
+  d23u8 = vmax_u8(d19u8, d20u8);
+
+  d17u8 = vabd_u8(d6u8, d7u8);  // |p0 - q0|
+
+  d21u8 = vcgt_u8(d21u8, dthresh);  // |p1 - p0| > thresh
+  d22u8 = vcgt_u8(d22u8, dthresh);  // |q1 - q0| > thresh
+  d23u8 = vmax_u8(d23u8, d3u8);     // max of the six neighbour differences
+
+  d28u8 = vabd_u8(d5u8, d16u8);    // |p1 - q1|
+  d17u8 = vqadd_u8(d17u8, d17u8);  // 2 * |p0 - q0| (saturating)
+
+  d23u8 = vcge_u8(dlimit, d23u8);  // all-ones where max difference <= limit
+
+  d18u8 = vdup_n_u8(0x80);        // XOR with 0x80 converts to signed
+  d5u8 = veor_u8(d5u8, d18u8);    // ps1
+  d6u8 = veor_u8(d6u8, d18u8);    // ps0
+  d7u8 = veor_u8(d7u8, d18u8);    // qs0
+  d16u8 = veor_u8(d16u8, d18u8);  // qs1
+
+  d28u8 = vshr_n_u8(d28u8, 1);     // |p1 - q1| / 2
+  d17u8 = vqadd_u8(d17u8, d28u8);  // 2 * |p0 - q0| + |p1 - q1| / 2
+
+  d19u8 = vdup_n_u8(3);
+  // qs0 - ps0, wrapping in 8 bits (loop_filter_neon_16 widens via vsubl_s8).
+  d28s8 = vsub_s8(vreinterpret_s8_u8(d7u8), vreinterpret_s8_u8(d6u8));
+
+  d17u8 = vcge_u8(dblimit, d17u8);  // all-ones where that sum <= blimit
+
+  d27s8 = vqsub_s8(vreinterpret_s8_u8(d5u8), vreinterpret_s8_u8(d16u8));
+
+  d22u8 = vorr_u8(d21u8, d22u8);  // hev
+
+  q12s16 = vmull_s8(d28s8, vreinterpret_s8_u8(d19u8));  // 3 * (qs0 - ps0)
+
+  d27u8 = vand_u8(vreinterpret_u8_s8(d27s8), d22u8);  // filter &= hev
+  d23u8 = vand_u8(d23u8, d17u8);                      // mask
+
+  q12s16 = vaddw_s8(q12s16, vreinterpret_s8_u8(d27u8));
+
+  d17u8 = vdup_n_u8(4);
+
+  d27s8 = vqmovn_s16(q12s16);  // clamp(filter + 3 * (qs0 - ps0))
+  d27u8 = vand_u8(vreinterpret_u8_s8(d27s8), d23u8);  // filter &= mask
+  d27s8 = vreinterpret_s8_u8(d27u8);
+
+  d28s8 = vqadd_s8(d27s8, vreinterpret_s8_u8(d19u8));  // clamp(filter + 3)
+  d27s8 = vqadd_s8(d27s8, vreinterpret_s8_u8(d17u8));  // clamp(filter + 4)
+  d28s8 = vshr_n_s8(d28s8, 3);                         // filter2 >>= 3
+  d27s8 = vshr_n_s8(d27s8, 3);                         // filter1 >>= 3
+
+  d19s8 = vqadd_s8(vreinterpret_s8_u8(d6u8), d28s8);  // clamp(ps0 + filter2)
+  d26s8 = vqsub_s8(vreinterpret_s8_u8(d7u8), d27s8);  // clamp(qs0 - filter1)
+
+  d27s8 = vrshr_n_s8(d27s8, 1);                       // (filter1 + 1) >> 1
+  d27s8 = vbic_s8(d27s8, vreinterpret_s8_u8(d22u8));  // filter &= ~hev
+
+  d21s8 = vqadd_s8(vreinterpret_s8_u8(d5u8), d27s8);   // clamp(ps1 + filter)
+  d20s8 = vqsub_s8(vreinterpret_s8_u8(d16u8), d27s8);  // clamp(qs1 - filter)
+
+  *d4ru8 = veor_u8(vreinterpret_u8_s8(d21s8), d18u8);  // op1, back to unsigned
+  *d5ru8 = veor_u8(vreinterpret_u8_s8(d19s8), d18u8);  // op0
+  *d6ru8 = veor_u8(vreinterpret_u8_s8(d26s8), d18u8);  // oq0
+  *d7ru8 = veor_u8(vreinterpret_u8_s8(d20s8), d18u8);  // oq1
+  return;
+}
+
+void aom_lpf_horizontal_4_neon(uint8_t *src, int pitch, const uint8_t *blimit,
+                               const uint8_t *limit, const uint8_t *thresh) {
+  int i;
+  uint8_t *s, *psrc;
+  uint8x8_t dblimit, dlimit, dthresh;
+  uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8;
+  // NOTE(review): these read 8 bytes each; the .asm version broadcasts byte 0.
+  dblimit = vld1_u8(blimit);
+  dlimit = vld1_u8(limit);
+  dthresh = vld1_u8(thresh);
+
+  psrc = src - (pitch << 2);  // 4 rows above src: the p3 row
+  for (i = 0; i < 1; i++) {   // runs exactly once (one 8-byte-wide segment)
+    s = psrc + i * 8;
+
+    d3u8 = vld1_u8(s);  // p3
+    s += pitch;
+    d4u8 = vld1_u8(s);  // p2
+    s += pitch;
+    d5u8 = vld1_u8(s);  // p1
+    s += pitch;
+    d6u8 = vld1_u8(s);  // p0
+    s += pitch;
+    d7u8 = vld1_u8(s);  // q0
+    s += pitch;
+    d16u8 = vld1_u8(s);  // q1
+    s += pitch;
+    d17u8 = vld1_u8(s);  // q2
+    s += pitch;
+    d18u8 = vld1_u8(s);  // q3
+
+    loop_filter_neon(dblimit, dlimit, dthresh, d3u8, d4u8, d5u8, d6u8, d7u8,
+                     d16u8, d17u8, d18u8, &d4u8, &d5u8, &d6u8, &d7u8);
+
+    s -= (pitch * 5);  // s is at the q3 row; step back to the p1 row
+    vst1_u8(s, d4u8);  // p1
+    s += pitch;
+    vst1_u8(s, d5u8);  // p0
+    s += pitch;
+    vst1_u8(s, d6u8);  // q0
+    s += pitch;
+    vst1_u8(s, d7u8);  // q1
+  }
+  return;
+}
+
+void aom_lpf_vertical_4_neon(uint8_t *src, int pitch, const uint8_t *blimit,
+                             const uint8_t *limit, const uint8_t *thresh) {
+  int i, pitch8;
+  uint8_t *s;
+  uint8x8_t dblimit, dlimit, dthresh;
+  uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8;
+  uint32x2x2_t d2tmp0, d2tmp1, d2tmp2, d2tmp3;
+  uint16x4x2_t d2tmp4, d2tmp5, d2tmp6, d2tmp7;
+  uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11;
+  uint8x8x4_t d4Result;
+
+  dblimit = vld1_u8(blimit);  // reads 8 bytes from each threshold pointer
+  dlimit = vld1_u8(limit);
+  dthresh = vld1_u8(thresh);
+
+  pitch8 = pitch * 8;
+  for (i = 0; i < 1; i++, src += pitch8) {  // runs exactly once
+    s = src - (i + 1) * 4;  // i == 0 here, so s = src - 4
+
+    d3u8 = vld1_u8(s);  // 8 rows of 8 bytes, one row per pitch
+    s += pitch;
+    d4u8 = vld1_u8(s);
+    s += pitch;
+    d5u8 = vld1_u8(s);
+    s += pitch;
+    d6u8 = vld1_u8(s);
+    s += pitch;
+    d7u8 = vld1_u8(s);
+    s += pitch;
+    d16u8 = vld1_u8(s);
+    s += pitch;
+    d17u8 = vld1_u8(s);
+    s += pitch;
+    d18u8 = vld1_u8(s);
+    // Transpose the 8x8 byte block so each vector holds one tap (p3..q3).
+    d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8), vreinterpret_u32_u8(d7u8));
+    d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8), vreinterpret_u32_u8(d16u8));
+    d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8), vreinterpret_u32_u8(d17u8));
+    d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8), vreinterpret_u32_u8(d18u8));
+
+    d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]),
+                      vreinterpret_u16_u32(d2tmp2.val[0]));
+    d2tmp5 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[0]),
+                      vreinterpret_u16_u32(d2tmp3.val[0]));
+    d2tmp6 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[1]),
+                      vreinterpret_u16_u32(d2tmp2.val[1]));
+    d2tmp7 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[1]),
+                      vreinterpret_u16_u32(d2tmp3.val[1]));
+
+    d2tmp8 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[0]),
+                     vreinterpret_u8_u16(d2tmp5.val[0]));
+    d2tmp9 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[1]),
+                     vreinterpret_u8_u16(d2tmp5.val[1]));
+    d2tmp10 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[0]),
+                      vreinterpret_u8_u16(d2tmp7.val[0]));
+    d2tmp11 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[1]),
+                      vreinterpret_u8_u16(d2tmp7.val[1]));
+
+    d3u8 = d2tmp8.val[0];    // p3
+    d4u8 = d2tmp8.val[1];    // p2
+    d5u8 = d2tmp9.val[0];    // p1
+    d6u8 = d2tmp9.val[1];    // p0
+    d7u8 = d2tmp10.val[0];   // q0
+    d16u8 = d2tmp10.val[1];  // q1
+    d17u8 = d2tmp11.val[0];  // q2
+    d18u8 = d2tmp11.val[1];  // q3
+
+    loop_filter_neon(dblimit, dlimit, dthresh, d3u8, d4u8, d5u8, d6u8, d7u8,
+                     d16u8, d17u8, d18u8, &d4u8, &d5u8, &d6u8, &d7u8);
+
+    d4Result.val[0] = d4u8;  // p1
+    d4Result.val[1] = d5u8;  // p0
+    d4Result.val[2] = d6u8;  // q0
+    d4Result.val[3] = d7u8;  // q1
+
+    src -= 2;  // NOTE(review): mutates src; OK only for a single pass
+    vst4_lane_u8(src, d4Result, 0);  // writes 4 bytes: lane 0 of val[0..3]
+    src += pitch;
+    vst4_lane_u8(src, d4Result, 1);
+    src += pitch;
+    vst4_lane_u8(src, d4Result, 2);
+    src += pitch;
+    vst4_lane_u8(src, d4Result, 3);
+    src += pitch;
+    vst4_lane_u8(src, d4Result, 4);
+    src += pitch;
+    vst4_lane_u8(src, d4Result, 5);
+    src += pitch;
+    vst4_lane_u8(src, d4Result, 6);
+    src += pitch;
+    vst4_lane_u8(src, d4Result, 7);
+  }
+  return;
+}
--- a/aom_dsp/arm/loopfilter_8_neon.asm
+++ b/aom_dsp/arm/loopfilter_8_neon.asm
@@ -1,25 +1,26 @@
 ;
-;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
 ;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 ;

-    EXPORT  |vpx_lpf_horizontal_8_neon|
-    EXPORT  |vpx_lpf_horizontal_8_dual_neon|
-    EXPORT  |vpx_lpf_vertical_8_neon|
-    EXPORT  |vpx_lpf_vertical_8_dual_neon|
+;
+
+    EXPORT  |aom_lpf_horizontal_8_neon|
+    EXPORT  |aom_lpf_vertical_8_neon|
    ARM

    AREA ||.text||, CODE, READONLY, ALIGN=2

-; Currently vpx only works on iterations 8 at a time. The vp8 loop filter
+; Currently aom only works on iterations 8 at a time. The aom loop filter
 ; works on 16 iterations at a time.
 ;
-; void vpx_lpf_horizontal_8_neon(uint8_t *s, int p,
+; void aom_lpf_horizontal_8_neon(uint8_t *s, int p,
 ;                                const uint8_t *blimit,
 ;                                const uint8_t *limit,
 ;                                const uint8_t *thresh)
@@ -28,7 +29,7 @@
 ; r2    const uint8_t *blimit,
 ; r3    const uint8_t *limit,
 ; sp    const uint8_t *thresh,
-|vpx_lpf_horizontal_8_neon| PROC
+|aom_lpf_horizontal_8_neon| PROC
    push        {r4-r5, lr}

    vld1.8      {d0[]}, [r2]               ; duplicate *blimit
@@ -53,7 +54,7 @@
    sub         r3, r3, r1, lsl #1
    sub         r2, r2, r1, lsl #2

-    bl          vpx_mbloop_filter_neon
+    bl          aom_mbloop_filter_neon

    vst1.u8     {d0}, [r2@64], r1          ; store op2
    vst1.u8     {d1}, [r3@64], r1          ; store op1
@@ -64,41 +65,9 @@

    pop         {r4-r5, pc}

-    ENDP        ; |vpx_lpf_horizontal_8_neon|
+    ENDP        ; |aom_lpf_horizontal_8_neon|

-;void vpx_lpf_horizontal_8_dual_neon(uint8_t *s,
-;                                    int p,
-;                                    const uint8_t *blimit0,
-;                                    const uint8_t *limit0,
-;                                    const uint8_t *thresh0,
-;                                    const uint8_t *blimit1,
-;                                    const uint8_t *limit1,
-;                                    const uint8_t *thresh1)
-; r0      uint8_t *s,
-; r1      int p, /* pitch */
-; r2      const uint8_t *blimit0,
-; r3      const uint8_t *limit0,
-; sp      const uint8_t *thresh0,
-; sp + 4  const uint8_t *blimit1,
-; sp + 8  const uint8_t *limit1,
-; sp + 12 const uint8_t *thresh1,
-|vpx_lpf_horizontal_8_dual_neon| PROC
-    push        {r0-r1, lr}
-    ldr         lr, [sp, #12]
-    push        {lr}                       ; thresh0
-    bl          vpx_lpf_horizontal_8_neon
-
-    ldr         r2, [sp, #20]              ; blimit1
-    ldr         r3, [sp, #24]              ; limit1
-    ldr         lr, [sp, #28]
-    str         lr, [sp, #16]              ; thresh1
-    add         sp, #4
-    pop         {r0-r1, lr}
-    add         r0, #8                     ; s + 8
-    b           vpx_lpf_horizontal_8_neon
-    ENDP        ; |vpx_lpf_horizontal_8_dual_neon|
-
-; void vpx_lpf_vertical_8_neon(uint8_t *s,
+; void aom_lpf_vertical_8_neon(uint8_t *s,
 ;                              int pitch,
 ;                              const uint8_t *blimit,
 ;                              const uint8_t *limit,
@@ -109,7 +78,7 @@
 ; r2    const uint8_t *blimit,
 ; r3    const uint8_t *limit,
 ; sp    const uint8_t *thresh,
-|vpx_lpf_vertical_8_neon| PROC
+|aom_lpf_vertical_8_neon| PROC
    push        {r4-r5, lr}

    vld1.8      {d0[]}, [r2]              ; duplicate *blimit
@@ -148,7 +117,7 @@
    sub         r2, r0, #3
    add         r3, r0, #1

-    bl          vpx_mbloop_filter_neon
+    bl          aom_mbloop_filter_neon

    ;store op2, op1, op0, oq0
    vst4.8      {d0[0], d1[0], d2[0], d3[0]}, [r2], r1
@@ -171,41 +140,9 @@
    vst2.8      {d4[7], d5[7]}, [r3]

    pop         {r4-r5, pc}
-    ENDP        ; |vpx_lpf_vertical_8_neon|
+    ENDP        ; |aom_lpf_vertical_8_neon|

-;void vpx_lpf_vertical_8_dual_neon(uint8_t *s,
-;                                  int pitch,
-;                                  const uint8_t *blimit0,
-;                                  const uint8_t *limit0,
-;                                  const uint8_t *thresh0,
-;                                  const uint8_t *blimit1,
-;                                  const uint8_t *limit1,
-;                                  const uint8_t *thresh1)
-; r0      uint8_t *s,
-; r1      int pitch
-; r2      const uint8_t *blimit0,
-; r3      const uint8_t *limit0,
-; sp      const uint8_t *thresh0,
-; sp + 4  const uint8_t *blimit1,
-; sp + 8  const uint8_t *limit1,
-; sp + 12 const uint8_t *thresh1,
-|vpx_lpf_vertical_8_dual_neon| PROC
-    push        {r0-r1, lr}
-    ldr         lr, [sp, #12]
-    push        {lr}                       ; thresh0
-    bl          vpx_lpf_vertical_8_neon
-
-    ldr         r2, [sp, #20]              ; blimit1
-    ldr         r3, [sp, #24]              ; limit1
-    ldr         lr, [sp, #28]
-    str         lr, [sp, #16]              ; thresh1
-    add         sp, #4
-    pop         {r0-r1, lr}
-    add         r0, r1, lsl #3             ; s + 8 * pitch
-    b           vpx_lpf_vertical_8_neon
-    ENDP        ; |vpx_lpf_vertical_8_dual_neon|
-
-; void vpx_mbloop_filter_neon();
+; void aom_mbloop_filter_neon();
 ; This is a helper function for the loopfilters. The invidual functions do the
 ; necessary load, transpose (if necessary) and store. The function does not use
 ; registers d8-d15.
@@ -231,7 +168,7 @@
 ; d3    oq0
 ; d4    oq1
 ; d5    oq2
-|vpx_mbloop_filter_neon| PROC
+|aom_mbloop_filter_neon| PROC
    ; filter_mask
    vabd.u8     d19, d3, d4                ; m1 = abs(p3 - p2)
    vabd.u8     d20, d4, d5                ; m2 = abs(p2 - p1)
@@ -486,6 +423,6 @@ filter_branch_only

    bx          lr

-    ENDP        ; |vpx_mbloop_filter_neon|
+    ENDP        ; |aom_mbloop_filter_neon|

    END
--- a/aom_dsp/arm/loopfilter_8_neon.c
+++ b/aom_dsp/arm/loopfilter_8_neon.c
@@ -0,0 +1,430 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "./aom_dsp_rtcd.h"
+
+static INLINE void mbloop_filter_neon(uint8x8_t dblimit,   // mblimit
+                                      uint8x8_t dlimit,    // limit
+                                      uint8x8_t dthresh,   // thresh
+                                      uint8x8_t d3u8,      // p3
+                                      uint8x8_t d4u8,      // p2
+                                      uint8x8_t d5u8,      // p1
+                                      uint8x8_t d6u8,      // p0
+                                      uint8x8_t d7u8,      // q0
+                                      uint8x8_t d16u8,     // q1
+                                      uint8x8_t d17u8,     // q2
+                                      uint8x8_t d18u8,     // q3
+                                      uint8x8_t *d0ru8,    // out: op2
+                                      uint8x8_t *d1ru8,    // out: op1
+                                      uint8x8_t *d2ru8,    // out: op0
+                                      uint8x8_t *d3ru8,    // out: oq0
+                                      uint8x8_t *d4ru8,    // out: oq1
+                                      uint8x8_t *d5ru8) {  // out: oq2
+  uint32_t flat;  // packed copy of the (flat & mask) lanes, 4 bits per lane
+  uint8x8_t d0u8, d1u8, d2u8, d19u8, d20u8, d21u8, d22u8, d23u8, d24u8;
+  uint8x8_t d25u8, d26u8, d27u8, d28u8, d29u8, d30u8, d31u8;
+  int16x8_t q15s16;
+  uint16x8_t q10u16, q14u16;
+  int8x8_t d21s8, d24s8, d25s8, d26s8, d28s8, d29s8, d30s8;
+
+  d19u8 = vabd_u8(d3u8, d4u8);    // abs(p3 - p2)
+  d20u8 = vabd_u8(d4u8, d5u8);    // abs(p2 - p1)
+  d21u8 = vabd_u8(d5u8, d6u8);    // abs(p1 - p0)
+  d22u8 = vabd_u8(d16u8, d7u8);   // abs(q1 - q0)
+  d23u8 = vabd_u8(d17u8, d16u8);  // abs(q2 - q1)
+  d24u8 = vabd_u8(d18u8, d17u8);  // abs(q3 - q2)
+
+  d19u8 = vmax_u8(d19u8, d20u8);
+  d20u8 = vmax_u8(d21u8, d22u8);  // max(abs(p1 - p0), abs(q1 - q0))
+
+  d25u8 = vabd_u8(d6u8, d4u8);  // abs(p0 - p2)
+
+  d23u8 = vmax_u8(d23u8, d24u8);
+
+  d26u8 = vabd_u8(d7u8, d17u8);  // abs(q0 - q2)
+
+  d19u8 = vmax_u8(d19u8, d20u8);
+
+  d24u8 = vabd_u8(d6u8, d7u8);   // abs(p0 - q0)
+  d27u8 = vabd_u8(d3u8, d6u8);   // abs(p3 - p0)
+  d28u8 = vabd_u8(d18u8, d7u8);  // abs(q3 - q0)
+
+  d19u8 = vmax_u8(d19u8, d23u8);  // max of the six neighbouring differences
+
+  d23u8 = vabd_u8(d5u8, d16u8);    // abs(p1 - q1)
+  d24u8 = vqadd_u8(d24u8, d24u8);  // abs(p0 - q0) * 2, saturating
+
+  d19u8 = vcge_u8(dlimit, d19u8);  // all neighbouring differences <= limit
+
+  d25u8 = vmax_u8(d25u8, d26u8);
+  d26u8 = vmax_u8(d27u8, d28u8);
+
+  d23u8 = vshr_n_u8(d23u8, 1);  // abs(p1 - q1) / 2
+
+  d25u8 = vmax_u8(d25u8, d26u8);
+
+  d24u8 = vqadd_u8(d24u8, d23u8);  // abs(p0 - q0) * 2 + abs(p1 - q1) / 2
+
+  d20u8 = vmax_u8(d20u8, d25u8);  // largest distance of p1..p3/q1..q3 terms
+
+  d23u8 = vdup_n_u8(1);
+  d24u8 = vcge_u8(dblimit, d24u8);  // edge difference <= blimit
+
+  d21u8 = vcgt_u8(d21u8, dthresh);  // abs(p1 - p0) > thresh
+
+  d20u8 = vcge_u8(d23u8, d20u8);  // flat: every compared distance <= 1
+
+  d19u8 = vand_u8(d19u8, d24u8);  // mask: limit test && blimit test
+
+  d23u8 = vcgt_u8(d22u8, dthresh);  // abs(q1 - q0) > thresh
+
+  d20u8 = vand_u8(d20u8, d19u8);  // flat && mask
+
+  d22u8 = vdup_n_u8(0x80);  // sign-bit flip used for u8 <-> s8 conversion
+
+  d23u8 = vorr_u8(d21u8, d23u8);  // either inner difference exceeds thresh
+
+  q10u16 = vcombine_u16(vreinterpret_u16_u8(d20u8), vreinterpret_u16_u8(d21u8));
+
+  d30u8 = vshrn_n_u16(q10u16, 4);  // squeeze each mask byte to 4 bits
+  flat = vget_lane_u32(vreinterpret_u32_u8(d30u8), 0);  // low half = d20u8
+
+  if (flat == 0xffffffff) {  // Check for all 1's, power_branch_only
+    d27u8 = vdup_n_u8(3);
+    d21u8 = vdup_n_u8(2);
+    q14u16 = vaddl_u8(d6u8, d7u8);            // p0 + q0
+    q14u16 = vmlal_u8(q14u16, d3u8, d27u8);   // + 3 * p3
+    q14u16 = vmlal_u8(q14u16, d4u8, d21u8);   // + 2 * p2
+    q14u16 = vaddw_u8(q14u16, d5u8);          // + p1
+    *d0ru8 = vqrshrn_n_u16(q14u16, 3);        // op2 = (sum + 4) >> 3
+
+    q14u16 = vsubw_u8(q14u16, d3u8);  // slide the window: - p3 - p2 + p1 + q1
+    q14u16 = vsubw_u8(q14u16, d4u8);
+    q14u16 = vaddw_u8(q14u16, d5u8);
+    q14u16 = vaddw_u8(q14u16, d16u8);
+    *d1ru8 = vqrshrn_n_u16(q14u16, 3);  // op1
+
+    q14u16 = vsubw_u8(q14u16, d3u8);  // - p3 - p1 + p0 + q2
+    q14u16 = vsubw_u8(q14u16, d5u8);
+    q14u16 = vaddw_u8(q14u16, d6u8);
+    q14u16 = vaddw_u8(q14u16, d17u8);
+    *d2ru8 = vqrshrn_n_u16(q14u16, 3);  // op0
+
+    q14u16 = vsubw_u8(q14u16, d3u8);  // - p3 - p0 + q0 + q3
+    q14u16 = vsubw_u8(q14u16, d6u8);
+    q14u16 = vaddw_u8(q14u16, d7u8);
+    q14u16 = vaddw_u8(q14u16, d18u8);
+    *d3ru8 = vqrshrn_n_u16(q14u16, 3);  // oq0
+
+    q14u16 = vsubw_u8(q14u16, d4u8);  // - p2 - q0 + q1 + q3
+    q14u16 = vsubw_u8(q14u16, d7u8);
+    q14u16 = vaddw_u8(q14u16, d16u8);
+    q14u16 = vaddw_u8(q14u16, d18u8);
+    *d4ru8 = vqrshrn_n_u16(q14u16, 3);  // oq1
+
+    q14u16 = vsubw_u8(q14u16, d5u8);  // - p1 - q1 + q2 + q3
+    q14u16 = vsubw_u8(q14u16, d16u8);
+    q14u16 = vaddw_u8(q14u16, d17u8);
+    q14u16 = vaddw_u8(q14u16, d18u8);
+    *d5ru8 = vqrshrn_n_u16(q14u16, 3);  // oq2
+  } else {
+    d21u8 = veor_u8(d7u8, d22u8);   // q0 with sign bit flipped (signed view)
+    d24u8 = veor_u8(d6u8, d22u8);   // p0, signed view
+    d25u8 = veor_u8(d5u8, d22u8);   // p1, signed view
+    d26u8 = veor_u8(d16u8, d22u8);  // q1, signed view
+
+    d27u8 = vdup_n_u8(3);
+
+    d28s8 = vsub_s8(vreinterpret_s8_u8(d21u8), vreinterpret_s8_u8(d24u8));  // q0 - p0
+    d29s8 = vqsub_s8(vreinterpret_s8_u8(d25u8), vreinterpret_s8_u8(d26u8));  // sat(p1 - q1)
+
+    q15s16 = vmull_s8(d28s8, vreinterpret_s8_u8(d27u8));  // 3 * (q0 - p0)
+
+    d29s8 = vand_s8(d29s8, vreinterpret_s8_u8(d23u8));  // keep only where > thresh
+
+    q15s16 = vaddw_s8(q15s16, d29s8);
+
+    d29u8 = vdup_n_u8(4);
+
+    d28s8 = vqmovn_s16(q15s16);  // saturate the filter value back to 8 bits
+
+    d28s8 = vand_s8(d28s8, vreinterpret_s8_u8(d19u8));  // zero where mask is clear
+
+    d30s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d27u8));  // filter + 3
+    d29s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d29u8));  // filter + 4
+    d30s8 = vshr_n_s8(d30s8, 3);
+    d29s8 = vshr_n_s8(d29s8, 3);
+
+    d24s8 = vqadd_s8(vreinterpret_s8_u8(d24u8), d30s8);  // p0 + ((filter + 3) >> 3)
+    d21s8 = vqsub_s8(vreinterpret_s8_u8(d21u8), d29s8);  // q0 - ((filter + 4) >> 3)
+
+    d29s8 = vrshr_n_s8(d29s8, 1);
+    d29s8 = vbic_s8(d29s8, vreinterpret_s8_u8(d23u8));  // no outer tap where > thresh
+
+    d25s8 = vqadd_s8(vreinterpret_s8_u8(d25u8), d29s8);  // adjusted p1
+    d26s8 = vqsub_s8(vreinterpret_s8_u8(d26u8), d29s8);  // adjusted q1
+
+    if (flat == 0) {  // filter_branch_only
+      *d0ru8 = d4u8;  // p2 passes through unchanged
+      *d1ru8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8);
+      *d2ru8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8);
+      *d3ru8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8);
+      *d4ru8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8);
+      *d5ru8 = d17u8;  // q2 passes through unchanged
+      return;
+    }
+
+    d21u8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8);  // narrow-filter q0, unsigned
+    d24u8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8);  // narrow-filter p0, unsigned
+    d25u8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8);  // narrow-filter p1, unsigned
+    d26u8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8);  // narrow-filter q1, unsigned
+
+    d23u8 = vdup_n_u8(2);
+    q14u16 = vaddl_u8(d6u8, d7u8);           // p0 + q0
+    q14u16 = vmlal_u8(q14u16, d3u8, d27u8);  // + 3 * p3
+    q14u16 = vmlal_u8(q14u16, d4u8, d23u8);  // + 2 * p2
+
+    d0u8 = vbsl_u8(d20u8, dblimit, d4u8);  // p2 in non-flat lanes; flat lanes replaced below
+
+    q14u16 = vaddw_u8(q14u16, d5u8);  // + p1
+
+    d1u8 = vbsl_u8(d20u8, dlimit, d25u8);  // narrow p1 in non-flat lanes; flat replaced below
+
+    d30u8 = vqrshrn_n_u16(q14u16, 3);  // wide-filter op2
+
+    q14u16 = vsubw_u8(q14u16, d3u8);  // - p3 - p2 + p1 + q1
+    q14u16 = vsubw_u8(q14u16, d4u8);
+    q14u16 = vaddw_u8(q14u16, d5u8);
+    q14u16 = vaddw_u8(q14u16, d16u8);
+
+    d2u8 = vbsl_u8(d20u8, dthresh, d24u8);  // narrow p0 in non-flat lanes; flat replaced below
+
+    d31u8 = vqrshrn_n_u16(q14u16, 3);  // wide-filter op1
+
+    q14u16 = vsubw_u8(q14u16, d3u8);  // - p3 - p1 + p0 + q2
+    q14u16 = vsubw_u8(q14u16, d5u8);
+    q14u16 = vaddw_u8(q14u16, d6u8);
+    q14u16 = vaddw_u8(q14u16, d17u8);
+
+    *d0ru8 = vbsl_u8(d20u8, d30u8, d0u8);  // op2: wide result where flat, else p2
+
+    d23u8 = vqrshrn_n_u16(q14u16, 3);  // wide-filter op0
+
+    q14u16 = vsubw_u8(q14u16, d3u8);  // - p3 - p0 + q0 (+ q3 added below)
+    q14u16 = vsubw_u8(q14u16, d6u8);
+    q14u16 = vaddw_u8(q14u16, d7u8);
+
+    *d1ru8 = vbsl_u8(d20u8, d31u8, d1u8);  // op1: wide where flat, else narrow
+
+    q14u16 = vaddw_u8(q14u16, d18u8);
+
+    *d2ru8 = vbsl_u8(d20u8, d23u8, d2u8);  // op0: wide where flat, else narrow
+
+    d22u8 = vqrshrn_n_u16(q14u16, 3);  // wide-filter oq0 (0x80 constant no longer needed)
+
+    q14u16 = vsubw_u8(q14u16, d4u8);  // - p2 - q0 + q1 (+ q3 added below)
+    q14u16 = vsubw_u8(q14u16, d7u8);
+    q14u16 = vaddw_u8(q14u16, d16u8);
+
+    d3u8 = vbsl_u8(d20u8, d3u8, d21u8);  // reuse d3u8: narrow q0 in non-flat lanes
+
+    q14u16 = vaddw_u8(q14u16, d18u8);
+
+    d4u8 = vbsl_u8(d20u8, d4u8, d26u8);  // reuse d4u8: narrow q1 in non-flat lanes
+
+    d6u8 = vqrshrn_n_u16(q14u16, 3);  // reuse d6u8: wide-filter oq1
+
+    q14u16 = vsubw_u8(q14u16, d5u8);  // - p1 - q1 + q2 + q3
+    q14u16 = vsubw_u8(q14u16, d16u8);
+    q14u16 = vaddw_u8(q14u16, d17u8);
+    q14u16 = vaddw_u8(q14u16, d18u8);
+
+    d5u8 = vbsl_u8(d20u8, d5u8, d17u8);  // reuse d5u8: q2 in non-flat lanes
+
+    d7u8 = vqrshrn_n_u16(q14u16, 3);  // reuse d7u8: wide-filter oq2
+
+    *d3ru8 = vbsl_u8(d20u8, d22u8, d3u8);  // oq0: wide where flat, else narrow
+    *d4ru8 = vbsl_u8(d20u8, d6u8, d4u8);   // oq1: wide where flat, else narrow
+    *d5ru8 = vbsl_u8(d20u8, d7u8, d5u8);   // oq2: wide where flat, else q2
+  }
+  return;
+}
+
+void aom_lpf_horizontal_8_neon(uint8_t *src, int pitch, const uint8_t *blimit,
+                               const uint8_t *limit, const uint8_t *thresh) {  // filters 8 columns
+  int i;
+  uint8_t *s, *psrc;
+  uint8x8_t dblimit, dlimit, dthresh;
+  uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
+  uint8x8_t d16u8, d17u8, d18u8;
+
+  dblimit = vld1_u8(blimit);  // reads 8 bytes from each threshold pointer
+  dlimit = vld1_u8(limit);
+  dthresh = vld1_u8(thresh);
+
+  psrc = src - (pitch << 2);  // four rows above src
+  for (i = 0; i < 1; i++) {   // single pass; the loop form is kept as-is
+    s = psrc + i * 8;
+
+    d3u8 = vld1_u8(s);  // p3 (row src - 4 * pitch)
+    s += pitch;
+    d4u8 = vld1_u8(s);  // p2
+    s += pitch;
+    d5u8 = vld1_u8(s);  // p1
+    s += pitch;
+    d6u8 = vld1_u8(s);  // p0
+    s += pitch;
+    d7u8 = vld1_u8(s);  // q0 (row src)
+    s += pitch;
+    d16u8 = vld1_u8(s);  // q1
+    s += pitch;
+    d17u8 = vld1_u8(s);  // q2
+    s += pitch;
+    d18u8 = vld1_u8(s);  // q3
+
+    mbloop_filter_neon(dblimit, dlimit, dthresh, d3u8, d4u8, d5u8, d6u8, d7u8,
+                       d16u8, d17u8, d18u8, &d0u8, &d1u8, &d2u8, &d3u8, &d4u8,
+                       &d5u8);
+
+    s -= (pitch * 6);  // back from the q3 row to the p2 row; p3 and q3 are not written
+    vst1_u8(s, d0u8);  // op2
+    s += pitch;
+    vst1_u8(s, d1u8);  // op1
+    s += pitch;
+    vst1_u8(s, d2u8);  // op0
+    s += pitch;
+    vst1_u8(s, d3u8);  // oq0
+    s += pitch;
+    vst1_u8(s, d4u8);  // oq1
+    s += pitch;
+    vst1_u8(s, d5u8);  // oq2
+  }
+  return;
+}
+
+void aom_lpf_vertical_8_neon(uint8_t *src, int pitch, const uint8_t *blimit,
+                             const uint8_t *limit, const uint8_t *thresh) {  // filters 8 rows
+  int i;
+  uint8_t *s;
+  uint8x8_t dblimit, dlimit, dthresh;
+  uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
+  uint8x8_t d16u8, d17u8, d18u8;
+  uint32x2x2_t d2tmp0, d2tmp1, d2tmp2, d2tmp3;
+  uint16x4x2_t d2tmp4, d2tmp5, d2tmp6, d2tmp7;
+  uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11;
+  uint8x8x4_t d4Result;
+  uint8x8x2_t d2Result;
+
+  dblimit = vld1_u8(blimit);  // reads 8 bytes from each threshold pointer
+  dlimit = vld1_u8(limit);
+  dthresh = vld1_u8(thresh);
+
+  for (i = 0; i < 1; i++) {  // single pass; the loop form is kept as-is
+    s = src + (i * (pitch << 3)) - 4;  // start 4 bytes left of src
+
+    d3u8 = vld1_u8(s);  // row 0, bytes src[-4..3]
+    s += pitch;
+    d4u8 = vld1_u8(s);  // row 1
+    s += pitch;
+    d5u8 = vld1_u8(s);  // row 2
+    s += pitch;
+    d6u8 = vld1_u8(s);  // row 3
+    s += pitch;
+    d7u8 = vld1_u8(s);  // row 4
+    s += pitch;
+    d16u8 = vld1_u8(s);  // row 5
+    s += pitch;
+    d17u8 = vld1_u8(s);  // row 6
+    s += pitch;
+    d18u8 = vld1_u8(s);  // row 7
+
+    d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8), vreinterpret_u32_u8(d7u8));  // 8x8 transpose, 32-bit step
+    d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8), vreinterpret_u32_u8(d16u8));
+    d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8), vreinterpret_u32_u8(d17u8));
+    d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8), vreinterpret_u32_u8(d18u8));
+
+    d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]),  // 16-bit step
+                      vreinterpret_u16_u32(d2tmp2.val[0]));
+    d2tmp5 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[0]),
+                      vreinterpret_u16_u32(d2tmp3.val[0]));
+    d2tmp6 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[1]),
+                      vreinterpret_u16_u32(d2tmp2.val[1]));
+    d2tmp7 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[1]),
+                      vreinterpret_u16_u32(d2tmp3.val[1]));
+
+    d2tmp8 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[0]),  // 8-bit step
+                     vreinterpret_u8_u16(d2tmp5.val[0]));
+    d2tmp9 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[1]),
+                     vreinterpret_u8_u16(d2tmp5.val[1]));
+    d2tmp10 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[0]),
+                      vreinterpret_u8_u16(d2tmp7.val[0]));
+    d2tmp11 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[1]),
+                      vreinterpret_u8_u16(d2tmp7.val[1]));
+
+    d3u8 = d2tmp8.val[0];    // column src[-4] of all 8 rows: p3
+    d4u8 = d2tmp8.val[1];    // column src[-3]: p2
+    d5u8 = d2tmp9.val[0];    // column src[-2]: p1
+    d6u8 = d2tmp9.val[1];    // column src[-1]: p0
+    d7u8 = d2tmp10.val[0];   // column src[0]: q0
+    d16u8 = d2tmp10.val[1];  // column src[1]: q1
+    d17u8 = d2tmp11.val[0];  // column src[2]: q2
+    d18u8 = d2tmp11.val[1];  // column src[3]: q3
+
+    mbloop_filter_neon(dblimit, dlimit, dthresh, d3u8, d4u8, d5u8, d6u8, d7u8,
+                       d16u8, d17u8, d18u8, &d0u8, &d1u8, &d2u8, &d3u8, &d4u8,
+                       &d5u8);
+
+    d4Result.val[0] = d0u8;  // op2
+    d4Result.val[1] = d1u8;  // op1
+    d4Result.val[2] = d2u8;  // op0
+    d4Result.val[3] = d3u8;  // oq0
+
+    d2Result.val[0] = d4u8;  // oq1
+    d2Result.val[1] = d5u8;  // oq2
+
+    s = src - 3;  // not offset by i; equivalent only because the loop runs once
+    vst4_lane_u8(s, d4Result, 0);  // row 0: writes src[-3..0]
+    s += pitch;
+    vst4_lane_u8(s, d4Result, 1);
+    s += pitch;
+    vst4_lane_u8(s, d4Result, 2);
+    s += pitch;
+    vst4_lane_u8(s, d4Result, 3);
+    s += pitch;
+    vst4_lane_u8(s, d4Result, 4);
+    s += pitch;
+    vst4_lane_u8(s, d4Result, 5);
+    s += pitch;
+    vst4_lane_u8(s, d4Result, 6);
+    s += pitch;
+    vst4_lane_u8(s, d4Result, 7);
+
+    s = src + 1;
+    vst2_lane_u8(s, d2Result, 0);  // row 0: writes src[1..2]
+    s += pitch;
+    vst2_lane_u8(s, d2Result, 1);
+    s += pitch;
+    vst2_lane_u8(s, d2Result, 2);
+    s += pitch;
+    vst2_lane_u8(s, d2Result, 3);
+    s += pitch;
+    vst2_lane_u8(s, d2Result, 4);
+    s += pitch;
+    vst2_lane_u8(s, d2Result, 5);
+    s += pitch;
+    vst2_lane_u8(s, d2Result, 6);
+    s += pitch;
+    vst2_lane_u8(s, d2Result, 7);
+  }
+  return;
+}
--- a/aom_dsp/arm/loopfilter_mb_neon.asm
+++ b/aom_dsp/arm/loopfilter_mb_neon.asm
@@ -1,17 +1,19 @@
 ;
-;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
 ;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 ;

-    EXPORT  |vpx_lpf_horizontal_16_neon|
-    EXPORT  |vpx_lpf_horizontal_16_dual_neon|
-    EXPORT  |vpx_lpf_vertical_16_neon|
-    EXPORT  |vpx_lpf_vertical_16_dual_neon|
+;
+
+    EXPORT  |aom_lpf_horizontal_edge_8_neon|
+    EXPORT  |aom_lpf_horizontal_edge_16_neon|
+    EXPORT  |aom_lpf_vertical_16_neon|
    ARM

    AREA ||.text||, CODE, READONLY, ALIGN=2
@@ -56,7 +58,7 @@ h_count
    vld1.u8     {d14}, [r8@64], r1         ; q6
    vld1.u8     {d15}, [r8@64], r1         ; q7

-    bl          vpx_wide_mbfilter_neon
+    bl          aom_wide_mbfilter_neon

    tst         r7, #1
    beq         h_mbfilter
@@ -119,7 +121,7 @@ h_next

    ENDP        ; |mb_lpf_horizontal_edge|

-; void vpx_lpf_horizontal_16_neon(uint8_t *s, int pitch,
+; void aom_lpf_horizontal_edge_8_neon(uint8_t *s, int pitch,
 ;                                     const uint8_t *blimit,
 ;                                     const uint8_t *limit,
 ;                                     const uint8_t *thresh)
@@ -128,12 +130,12 @@ h_next
 ; r2    const uint8_t *blimit,
 ; r3    const uint8_t *limit,
 ; sp    const uint8_t *thresh
-|vpx_lpf_horizontal_16_neon| PROC
+|aom_lpf_horizontal_edge_8_neon| PROC
    mov r12, #1
    b mb_lpf_horizontal_edge
-    ENDP        ; |vpx_lpf_horizontal_16_neon|
+    ENDP        ; |aom_lpf_horizontal_edge_8_neon|

-; void vpx_lpf_horizontal_16_dual_neon(uint8_t *s, int pitch,
+; void aom_lpf_horizontal_edge_16_neon(uint8_t *s, int pitch,
 ;                                      const uint8_t *blimit,
 ;                                      const uint8_t *limit,
 ;                                      const uint8_t *thresh)
@@ -142,26 +144,25 @@ h_next
 ; r2    const uint8_t *blimit,
 ; r3    const uint8_t *limit,
 ; sp    const uint8_t *thresh
-|vpx_lpf_horizontal_16_dual_neon| PROC
+|aom_lpf_horizontal_edge_16_neon| PROC
    mov r12, #2
    b mb_lpf_horizontal_edge
-    ENDP        ; |vpx_lpf_horizontal_16_dual_neon|
+    ENDP        ; |aom_lpf_horizontal_edge_16_neon|

-; void mb_lpf_vertical_edge_w(uint8_t *s, int p, const uint8_t *blimit,
-;                             const uint8_t *limit, const uint8_t *thresh,
-;                             int count) {
+; void aom_lpf_vertical_16_neon(uint8_t *s, int p,
+;                               const uint8_t *blimit,
+;                               const uint8_t *limit,
+;                               const uint8_t *thresh)
 ; r0    uint8_t *s,
 ; r1    int p, /* pitch */
 ; r2    const uint8_t *blimit,
 ; r3    const uint8_t *limit,
 ; sp    const uint8_t *thresh,
-; r12   int count
-|mb_lpf_vertical_edge_w| PROC
+|aom_lpf_vertical_16_neon| PROC
    push        {r4-r8, lr}
    vpush       {d8-d15}
    ldr         r4, [sp, #88]              ; load thresh

-v_count
    vld1.8      {d16[]}, [r2]              ; load *blimit
    vld1.8      {d17[]}, [r3]              ; load *limit
    vld1.8      {d18[]}, [r4]              ; load *thresh
@@ -207,28 +208,27 @@ v_count
    vtrn.8      d12, d13
    vtrn.8      d14, d15

-    bl          vpx_wide_mbfilter_neon
+    bl          aom_wide_mbfilter_neon

    tst         r7, #1
    beq         v_mbfilter

    ; flat && mask were not set for any of the channels. Just store the values
    ; from filter.
-    sub         r0, #2
+    sub         r8, r0, #2

    vswp        d23, d25

-    vst4.8      {d23[0], d24[0], d25[0], d26[0]}, [r0], r1
-    vst4.8      {d23[1], d24[1], d25[1], d26[1]}, [r0], r1
-    vst4.8      {d23[2], d24[2], d25[2], d26[2]}, [r0], r1
-    vst4.8      {d23[3], d24[3], d25[3], d26[3]}, [r0], r1
-    vst4.8      {d23[4], d24[4], d25[4], d26[4]}, [r0], r1
-    vst4.8      {d23[5], d24[5], d25[5], d26[5]}, [r0], r1
-    vst4.8      {d23[6], d24[6], d25[6], d26[6]}, [r0], r1
-    vst4.8      {d23[7], d24[7], d25[7], d26[7]}, [r0], r1
-    add         r0, #2
+    vst4.8      {d23[0], d24[0], d25[0], d26[0]}, [r8], r1
+    vst4.8      {d23[1], d24[1], d25[1], d26[1]}, [r8], r1
+    vst4.8      {d23[2], d24[2], d25[2], d26[2]}, [r8], r1
+    vst4.8      {d23[3], d24[3], d25[3], d26[3]}, [r8], r1
+    vst4.8      {d23[4], d24[4], d25[4], d26[4]}, [r8], r1
+    vst4.8      {d23[5], d24[5], d25[5], d26[5]}, [r8], r1
+    vst4.8      {d23[6], d24[6], d25[6], d26[6]}, [r8], r1
+    vst4.8      {d23[7], d24[7], d25[7], d26[7]}, [r8], r1

-    b           v_next
+    b           v_end

 v_mbfilter
    tst         r7, #2
@@ -255,7 +255,7 @@ v_mbfilter
    vst3.8      {d18[7], d19[7], d20[7]}, [r8], r1
    vst3.8      {d21[7], d22[7], d23[7]}, [r0], r1

-    b           v_next
+    b           v_end

 v_wide_mbfilter
    sub         r8, r0, #8
@@ -307,41 +307,13 @@ v_wide_mbfilter
    vst1.8      {d19}, [r8@64], r1
    vst1.8      {d15}, [r0@64], r1

-v_next
-    subs        r12, #1
-    bne         v_count
-
+v_end
    vpop        {d8-d15}
    pop         {r4-r8, pc}

-    ENDP        ; |mb_lpf_vertical_edge_w|
+    ENDP        ; |aom_lpf_vertical_16_neon|

-; void vpx_lpf_vertical_16_neon(uint8_t *s, int p, const uint8_t *blimit,
-;                               const uint8_t *limit, const uint8_t *thresh)
-; r0    uint8_t *s,
-; r1    int p, /* pitch */
-; r2    const uint8_t *blimit,
-; r3    const uint8_t *limit,
-; sp    const uint8_t *thresh
-|vpx_lpf_vertical_16_neon| PROC
-    mov r12, #1
-    b mb_lpf_vertical_edge_w
-    ENDP        ; |vpx_lpf_vertical_16_neon|
-
-; void vpx_lpf_vertical_16_dual_neon(uint8_t *s, int p, const uint8_t *blimit,
-;                                    const uint8_t *limit,
-;                                    const uint8_t *thresh)
-; r0    uint8_t *s,
-; r1    int p, /* pitch */
-; r2    const uint8_t *blimit,
-; r3    const uint8_t *limit,
-; sp    const uint8_t *thresh
-|vpx_lpf_vertical_16_dual_neon| PROC
-    mov r12, #2
-    b mb_lpf_vertical_edge_w
-    ENDP        ; |vpx_lpf_vertical_16_dual_neon|
-
-; void vpx_wide_mbfilter_neon();
+; void aom_wide_mbfilter_neon();
 ; This is a helper function for the loopfilters. The invidual functions do the
 ; necessary load, transpose (if necessary) and store.
 ;
@@ -365,7 +337,7 @@ v_next
 ; d13   q5
 ; d14   q6
 ; d15   q7
-|vpx_wide_mbfilter_neon| PROC
+|aom_wide_mbfilter_neon| PROC
    mov         r7, #0

    ; filter_mask
@@ -661,6 +633,6 @@ v_next
    vbif        d3, d14, d17               ; oq6 |= q6 & ~(f2 & f & m)

    bx          lr
-    ENDP        ; |vpx_wide_mbfilter_neon|
+    ENDP        ; |aom_wide_mbfilter_neon|

    END
--- a/aom_dsp/arm/loopfilter_neon.c
+++ b/aom_dsp/arm/loopfilter_neon.c
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "./aom_dsp_rtcd.h"
+#include "./aom_config.h"
+#include "aom/aom_integer.h"
+
+void aom_lpf_vertical_4_dual_neon(uint8_t *s, int p, const uint8_t *blimit0,
+                                  const uint8_t *limit0, const uint8_t *thresh0,
+                                  const uint8_t *blimit1, const uint8_t *limit1,
+                                  const uint8_t *thresh1) {  // two single-edge calls
+  aom_lpf_vertical_4_neon(s, p, blimit0, limit0, thresh0);  // at s, first threshold set
+  aom_lpf_vertical_4_neon(s + 8 * p, p, blimit1, limit1, thresh1);  // 8 * p bytes further on
+}
+
+#if HAVE_NEON_ASM
+void aom_lpf_horizontal_8_dual_neon(
+    uint8_t *s, int p /* pitch */, const uint8_t *blimit0,
+    const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1,
+    const uint8_t *limit1, const uint8_t *thresh1) {  // two single-edge calls
+  aom_lpf_horizontal_8_neon(s, p, blimit0, limit0, thresh0);  // at s, first threshold set
+  aom_lpf_horizontal_8_neon(s + 8, p, blimit1, limit1, thresh1);  // 8 bytes to the right
+}
+
+void aom_lpf_vertical_8_dual_neon(uint8_t *s, int p, const uint8_t *blimit0,
+                                  const uint8_t *limit0, const uint8_t *thresh0,
+                                  const uint8_t *blimit1, const uint8_t *limit1,
+                                  const uint8_t *thresh1) {  // two single-edge calls
+  aom_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0);  // at s, first threshold set
+  aom_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1);  // 8 * p bytes further on
+}
+
+void aom_lpf_vertical_16_dual_neon(uint8_t *s, int p, const uint8_t *blimit,
+                                   const uint8_t *limit,
+                                   const uint8_t *thresh) {  // same thresholds for both calls
+  aom_lpf_vertical_16_neon(s, p, blimit, limit, thresh);  // at s
+  aom_lpf_vertical_16_neon(s + 8 * p, p, blimit, limit, thresh);  // 8 * p bytes further on
+}
+#endif  // HAVE_NEON_ASM
--- a/aom_dsp/arm/sad4d_neon.c
+++ b/aom_dsp/arm/sad4d_neon.c
@@ -0,0 +1,225 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "./aom_config.h"
+#include "./aom_dsp_rtcd.h"
+#include "aom/aom_integer.h"
+
+static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo,
+                                                    const uint16x8_t vec_hi) {  // sums all 16 lanes
+  const uint32x4_t vec_l_lo =
+      vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo));  // widen to 32 bits while adding
+  const uint32x4_t vec_l_hi =
+      vaddl_u16(vget_low_u16(vec_hi), vget_high_u16(vec_hi));
+  const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi);
+  const uint64x2_t b = vpaddlq_u32(a);  // pairwise add into two 64-bit sums
+  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
+                                vreinterpret_u32_u64(vget_high_u64(b)));
+  return vget_lane_u32(c, 0);  // NOTE(review): presumably the low 32 bits of the total - confirm lane order
+}
+
+// Accumulate |src - ref| for one 64-byte row: the four 16-byte source vectors
+// are compared with 64 bytes loaded from ref; low-half lanes add into
+// *vec_sum_ref_lo and high-half lanes into *vec_sum_ref_hi.
+static void sad_neon_64(const uint8x16_t vec_src_00,
+                        const uint8x16_t vec_src_16,
+                        const uint8x16_t vec_src_32,
+                        const uint8x16_t vec_src_48, const uint8_t *ref,
+                        uint16x8_t *vec_sum_ref_lo,
+                        uint16x8_t *vec_sum_ref_hi) {
+  const uint8x16_t vec_ref_00 = vld1q_u8(ref);  // ref[0..15]
+  const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16);
+  const uint8x16_t vec_ref_32 = vld1q_u8(ref + 32);
+  const uint8x16_t vec_ref_48 = vld1q_u8(ref + 48);  // ref[48..63]
+
+  *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_00),  // bytes 0..7
+                             vget_low_u8(vec_ref_00));
+  *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_00),  // bytes 8..15
+                             vget_high_u8(vec_ref_00));
+  *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_16),
+                             vget_low_u8(vec_ref_16));
+  *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_16),
+                             vget_high_u8(vec_ref_16));
+  *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_32),
+                             vget_low_u8(vec_ref_32));
+  *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_32),
+                             vget_high_u8(vec_ref_32));
+  *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_48),
+                             vget_low_u8(vec_ref_48));
+  *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_48),
+                             vget_high_u8(vec_ref_48));
+}
+
+// Accumulate |src - ref| for one 32-byte row (vec_src_00, vec_src_16 against
+// ref[0..31]) into *vec_sum_ref_lo (low halves) and *vec_sum_ref_hi (high).
+static void sad_neon_32(const uint8x16_t vec_src_00,
+                        const uint8x16_t vec_src_16, const uint8_t *ref,
+                        uint16x8_t *vec_sum_ref_lo,
+                        uint16x8_t *vec_sum_ref_hi) {
+  const uint8x16_t vec_ref_00 = vld1q_u8(ref);  // ref[0..15]
+  const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16);  // ref[16..31]
+
+  *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_00),  // bytes 0..7
+                             vget_low_u8(vec_ref_00));
+  *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_00),  // bytes 8..15
+                             vget_high_u8(vec_ref_00));
+  *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_16),
+                             vget_low_u8(vec_ref_16));
+  *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_16),
+                             vget_high_u8(vec_ref_16));
+}
+
+void aom_sad64x64x4d_neon(const uint8_t *src, int src_stride,
+                          const uint8_t *const ref[4], int ref_stride,
+                          uint32_t *res) {  // res[k] = 64x64 SAD of src vs ref[k]
+  int i;
+  uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0);  // per lane: 4 * 64 adds of <= 255 fit in 16 bits
+  uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0);
+  const uint8_t *ref0, *ref1, *ref2, *ref3;
+  ref0 = ref[0];
+  ref1 = ref[1];
+  ref2 = ref[2];
+  ref3 = ref[3];
+
+  for (i = 0; i < 64; ++i) {  // one 64-byte row per iteration
+    const uint8x16_t vec_src_00 = vld1q_u8(src);  // source row loaded once, reused for all refs
+    const uint8x16_t vec_src_16 = vld1q_u8(src + 16);
+    const uint8x16_t vec_src_32 = vld1q_u8(src + 32);
+    const uint8x16_t vec_src_48 = vld1q_u8(src + 48);
+
+    sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref0,
+                &vec_sum_ref0_lo, &vec_sum_ref0_hi);
+    sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref1,
+                &vec_sum_ref1_lo, &vec_sum_ref1_hi);
+    sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref2,
+                &vec_sum_ref2_lo, &vec_sum_ref2_hi);
+    sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref3,
+                &vec_sum_ref3_lo, &vec_sum_ref3_hi);
+
+    src += src_stride;  // advance every pointer to its next row
+    ref0 += ref_stride;
+    ref1 += ref_stride;
+    ref2 += ref_stride;
+    ref3 += ref_stride;
+  }
+
+  res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi);  // reduce lanes to a scalar
+  res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi);
+  res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi);
+  res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi);
+}
+
+void aom_sad32x32x4d_neon(const uint8_t *src, int src_stride,
+                          const uint8_t *const ref[4], int ref_stride,
+                          uint32_t *res) {  // res[k] = 32x32 SAD of src vs ref[k]
+  int i;
+  uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0);  // per lane: 2 * 32 adds of <= 255 fit in 16 bits
+  uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0);
+  const uint8_t *ref0, *ref1, *ref2, *ref3;
+  ref0 = ref[0];
+  ref1 = ref[1];
+  ref2 = ref[2];
+  ref3 = ref[3];
+
+  for (i = 0; i < 32; ++i) {  // one 32-byte row per iteration
+    const uint8x16_t vec_src_00 = vld1q_u8(src);  // source row loaded once, reused for all refs
+    const uint8x16_t vec_src_16 = vld1q_u8(src + 16);
+
+    sad_neon_32(vec_src_00, vec_src_16, ref0, &vec_sum_ref0_lo,
+                &vec_sum_ref0_hi);
+    sad_neon_32(vec_src_00, vec_src_16, ref1, &vec_sum_ref1_lo,
+                &vec_sum_ref1_hi);
+    sad_neon_32(vec_src_00, vec_src_16, ref2, &vec_sum_ref2_lo,
+                &vec_sum_ref2_hi);
+    sad_neon_32(vec_src_00, vec_src_16, ref3, &vec_sum_ref3_lo,
+                &vec_sum_ref3_hi);
+
+    src += src_stride;  // advance every pointer to its next row
+    ref0 += ref_stride;
+    ref1 += ref_stride;
+    ref2 += ref_stride;
+    ref3 += ref_stride;
+  }
+
+  res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi);  // reduce lanes to a scalar
+  res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi);
+  res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi);
+  res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi);
+}
+
+void aom_sad16x16x4d_neon(const uint8_t *src, int src_stride,
+                          const uint8_t *const ref[4], int ref_stride,
+                          uint32_t *res) {  // res[k] = 16x16 SAD of src vs ref[k]
+  int i;
+  uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0);  // per lane: 16 adds of <= 255 fit in 16 bits
+  uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0);
+  const uint8_t *ref0, *ref1, *ref2, *ref3;
+  ref0 = ref[0];
+  ref1 = ref[1];
+  ref2 = ref[2];
+  ref3 = ref[3];
+
+  for (i = 0; i < 16; ++i) {  // one 16-byte row per iteration
+    const uint8x16_t vec_src = vld1q_u8(src);  // source row loaded once, reused for all refs
+    const uint8x16_t vec_ref0 = vld1q_u8(ref0);
+    const uint8x16_t vec_ref1 = vld1q_u8(ref1);
+    const uint8x16_t vec_ref2 = vld1q_u8(ref2);
+    const uint8x16_t vec_ref3 = vld1q_u8(ref3);
+
+    vec_sum_ref0_lo =
+        vabal_u8(vec_sum_ref0_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref0));  // bytes 0..7
+    vec_sum_ref0_hi = vabal_u8(vec_sum_ref0_hi, vget_high_u8(vec_src),  // bytes 8..15
+                               vget_high_u8(vec_ref0));
+    vec_sum_ref1_lo =
+        vabal_u8(vec_sum_ref1_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref1));
+    vec_sum_ref1_hi = vabal_u8(vec_sum_ref1_hi, vget_high_u8(vec_src),
+                               vget_high_u8(vec_ref1));
+    vec_sum_ref2_lo =
+        vabal_u8(vec_sum_ref2_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref2));
+    vec_sum_ref2_hi = vabal_u8(vec_sum_ref2_hi, vget_high_u8(vec_src),
+                               vget_high_u8(vec_ref2));
+    vec_sum_ref3_lo =
+        vabal_u8(vec_sum_ref3_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref3));
+    vec_sum_ref3_hi = vabal_u8(vec_sum_ref3_hi, vget_high_u8(vec_src),
+                               vget_high_u8(vec_ref3));
+
+    src += src_stride;  // advance every pointer to its next row
+    ref0 += ref_stride;
+    ref1 += ref_stride;
+    ref2 += ref_stride;
+    ref3 += ref_stride;
+  }
+
+  res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi);  // reduce lanes to a scalar
+  res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi);
+  res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi);
+  res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi);
+}
--- a/aom_dsp/arm/sad_media.asm
+++ b/aom_dsp/arm/sad_media.asm
@@ -0,0 +1,98 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+
+    EXPORT  |aom_sad16x16_media|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    const unsigned char *src_ptr
+; r1    int  src_stride
+; r2    const unsigned char *ref_ptr
+; r3    int  ref_stride
+; Returns in r0 the sum of absolute differences (SAD) between the 16x16
+; source block at r0 and the 16x16 reference block at r2.
+; Each pass of the loop below handles two rows, so the counter starts at 8.
+; Within a row, r4 carries the running total and r8 a per-row partial sum
+; that is folded into r4 once the row is finished.
+|aom_sad16x16_media| PROC
+    stmfd   sp!, {r4-r12, lr}
+
+    pld     [r0, r1, lsl #0]
+    pld     [r2, r3, lsl #0]
+    pld     [r0, r1, lsl #1]
+    pld     [r2, r3, lsl #1]
+
+    mov     r4, #0              ; sad = 0;
+    mov     r5, #8              ; loop count (two rows per iteration)
+
+loop
+    ; 1st row
+    ldr     r6, [r0, #0x0]      ; load 4 src pixels (1A)
+    ldr     r8, [r2, #0x0]      ; load 4 ref pixels (1A)
+    ldr     r7, [r0, #0x4]      ; load 4 src pixels (1A)
+    ldr     r9, [r2, #0x4]      ; load 4 ref pixels (1A)
+    ldr     r10, [r0, #0x8]     ; load 4 src pixels (1B)
+    ldr     r11, [r0, #0xC]     ; load 4 src pixels (1B)
+
+    usada8  r4, r8, r6, r4      ; calculate sad for 4 pixels
+    usad8   r8, r7, r9          ; calculate sad for 4 pixels
+
+    ldr     r12, [r2, #0x8]     ; load 4 ref pixels (1B)
+    ldr     lr, [r2, #0xC]      ; load 4 ref pixels (1B)
+
+    add     r0, r0, r1          ; set src pointer to next row
+    add     r2, r2, r3          ; set ref pointer to next row
+
+    pld     [r0, r1, lsl #1]
+    pld     [r2, r3, lsl #1]
+
+    usada8  r4, r10, r12, r4    ; calculate sad for 4 pixels
+    usada8  r8, r11, lr, r8     ; calculate sad for 4 pixels
+
+    ldr     r6, [r0, #0x0]      ; load 4 src pixels (2A)
+    ldr     r7, [r0, #0x4]      ; load 4 src pixels (2A)
+    add     r4, r4, r8          ; add partial sad values
+
+    ; 2nd row
+    ldr     r8, [r2, #0x0]      ; load 4 ref pixels (2A)
+    ldr     r9, [r2, #0x4]      ; load 4 ref pixels (2A)
+    ldr     r10, [r0, #0x8]     ; load 4 src pixels (2B)
+    ldr     r11, [r0, #0xC]     ; load 4 src pixels (2B)
+
+    usada8  r4, r6, r8, r4      ; calculate sad for 4 pixels
+    usad8   r8, r7, r9          ; calculate sad for 4 pixels
+
+    ldr     r12, [r2, #0x8]     ; load 4 ref pixels (2B)
+    ldr     lr, [r2, #0xC]      ; load 4 ref pixels (2B)
+
+    add     r0, r0, r1          ; set src pointer to next row
+    add     r2, r2, r3          ; set ref pointer to next row
+
+    usada8  r4, r10, r12, r4    ; calculate sad for 4 pixels
+    usada8  r8, r11, lr, r8     ; calculate sad for 4 pixels
+
+    pld     [r0, r1, lsl #1]
+    pld     [r2, r3, lsl #1]
+
+    subs    r5, r5, #1          ; decrement loop counter
+    add     r4, r4, r8          ; add partial sad values
+
+    bne     loop                ; flags come from the subs above (add leaves them)
+
+    mov     r0, r4              ; return sad
+    ldmfd   sp!, {r4-r12, pc}
+
+    ENDP
+
+    END
+
--- a/aom_dsp/arm/sad_neon.c
+++ b/aom_dsp/arm/sad_neon.c
@@ -0,0 +1,224 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "./aom_config.h"
+
+#include "aom/aom_integer.h"
+
+// Returns the sum of absolute differences between an 8-wide, 16-tall source
+// block and the reference block.
+unsigned int aom_sad8x16_neon(unsigned char *src_ptr, int src_stride,
+                              unsigned char *ref_ptr, int ref_stride) {
+  // One 16-bit accumulator per column; 16 rows * 255 fits comfortably.
+  uint16x8_t column_sums = vdupq_n_u16(0);
+  uint64x2_t pair_sums;
+  uint32x2_t total;
+  int row;
+
+  for (row = 0; row < 16; ++row) {
+    const uint8x8_t src_row = vld1_u8(src_ptr);
+    const uint8x8_t ref_row = vld1_u8(ref_ptr);
+    column_sums = vabal_u8(column_sums, src_row, ref_row);
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+  }
+
+  // Widen pairwise (8x16 -> 4x32 -> 2x64), then add the two halves.
+  pair_sums = vpaddlq_u32(vpaddlq_u16(column_sums));
+  total = vadd_u32(vreinterpret_u32_u64(vget_low_u64(pair_sums)),
+                   vreinterpret_u32_u64(vget_high_u64(pair_sums)));
+
+  return vget_lane_u32(total, 0);
+}
+
+// Loads exactly the four pixels of one row of a 4-wide block into lanes 0-3
+// of a 64-bit vector; lanes 4-7 are zero. A plain vld1_u8() would fetch eight
+// bytes and so read four bytes beyond the row.
+static INLINE uint8x8_t sad4x4_load_row_neon(const unsigned char *row) {
+  uint8x8_t pixels = vdup_n_u8(0);
+  pixels = vld1_lane_u8(row + 0, pixels, 0);
+  pixels = vld1_lane_u8(row + 1, pixels, 1);
+  pixels = vld1_lane_u8(row + 2, pixels, 2);
+  pixels = vld1_lane_u8(row + 3, pixels, 3);
+  return pixels;
+}
+
+// Returns the sum of absolute differences between a 4x4 source block and a
+// 4x4 reference block.
+//
+// src_ptr/ref_ptr point at the top-left pixel of each block and
+// src_stride/ref_stride are the distances in bytes between successive rows.
+// Only the 4 bytes belonging to each row are read.
+unsigned int aom_sad4x4_neon(unsigned char *src_ptr, int src_stride,
+                             unsigned char *ref_ptr, int ref_stride) {
+  uint8x8_t d0, d8;
+  uint16x8_t q12 = vdupq_n_u16(0);
+  uint32x2_t d1;
+  uint64x1_t d3;
+  int i;
+
+  for (i = 0; i < 4; i++) {
+    d0 = sad4x4_load_row_neon(src_ptr);
+    src_ptr += src_stride;
+    d8 = sad4x4_load_row_neon(ref_ptr);
+    ref_ptr += ref_stride;
+    q12 = vabal_u8(q12, d0, d8);
+  }
+
+  // Only lanes 0-3 hold pixel differences, so reduce just the low half.
+  d1 = vpaddl_u16(vget_low_u16(q12));
+  d3 = vpaddl_u32(d1);
+
+  return vget_lane_u32(vreinterpret_u32_u64(d3), 0);
+}
+
+// Returns the sum of absolute differences between a 16-wide, 8-tall source
+// block and the reference block.
+unsigned int aom_sad16x8_neon(unsigned char *src_ptr, int src_stride,
+                              unsigned char *ref_ptr, int ref_stride) {
+  // Columns 0-7 and 8-15 are accumulated separately, then merged.
+  uint16x8_t sum_left = vdupq_n_u16(0);
+  uint16x8_t sum_right = vdupq_n_u16(0);
+  uint64x2_t pair_sums;
+  uint32x2_t total;
+  int row;
+
+  for (row = 0; row < 8; ++row) {
+    const uint8x16_t src_row = vld1q_u8(src_ptr);
+    const uint8x16_t ref_row = vld1q_u8(ref_ptr);
+    sum_left =
+        vabal_u8(sum_left, vget_low_u8(src_row), vget_low_u8(ref_row));
+    sum_right =
+        vabal_u8(sum_right, vget_high_u8(src_row), vget_high_u8(ref_row));
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+  }
+
+  // Each lane holds at most 8 * 255, so the 16-bit merge cannot overflow.
+  pair_sums = vpaddlq_u32(vpaddlq_u16(vaddq_u16(sum_left, sum_right)));
+  total = vadd_u32(vreinterpret_u32_u64(vget_low_u64(pair_sums)),
+                   vreinterpret_u32_u64(vget_high_u64(pair_sums)));
+
+  return vget_lane_u32(total, 0);
+}
+
+// Adds together all sixteen 16-bit lanes of vec_lo and vec_hi. The lanes are
+// widened to 32 bits before any addition, so the 16-bit inputs may each be as
+// large as 0xffff without the total wrapping.
+static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo,
+                                                    const uint16x8_t vec_hi) {
+  const uint32x4_t wide_lo =
+      vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo));
+  const uint32x4_t wide_hi =
+      vaddl_u16(vget_low_u16(vec_hi), vget_high_u16(vec_hi));
+  const uint64x2_t pair_sums = vpaddlq_u32(vaddq_u32(wide_lo, wide_hi));
+  const uint32x2_t low_half = vreinterpret_u32_u64(vget_low_u64(pair_sums));
+  const uint32x2_t high_half = vreinterpret_u32_u64(vget_high_u64(pair_sums));
+  return vget_lane_u32(vadd_u32(low_half, high_half), 0);
+}
+// Adds together the eight 16-bit lanes of vec_16x8, widening as it goes
+// (8x16 -> 4x32 -> 2x64) and returning the total as an unsigned int.
+static INLINE unsigned int horizontal_add_16x8(const uint16x8_t vec_16x8) {
+  const uint64x2_t pair_sums = vpaddlq_u32(vpaddlq_u16(vec_16x8));
+  const uint32x2_t low_half = vreinterpret_u32_u64(vget_low_u64(pair_sums));
+  const uint32x2_t high_half = vreinterpret_u32_u64(vget_high_u64(pair_sums));
+  return vget_lane_u32(vadd_u32(low_half, high_half), 0);
+}
+
+// Returns the sum of absolute differences between a 64x64 source block and a
+// 64x64 reference block.
+unsigned int aom_sad64x64_neon(const uint8_t *src, int src_stride,
+                               const uint8_t *ref, int ref_stride) {
+  // Each 16-bit lane collects 4 differences per row over 64 rows:
+  // 256 * 255 = 65280, which still fits. The final reduction must widen
+  // before adding lanes together, hence horizontal_long_add_16x8().
+  uint16x8_t sum_lo = vdupq_n_u16(0);
+  uint16x8_t sum_hi = vdupq_n_u16(0);
+  int row;
+
+  for (row = 0; row < 64; ++row) {
+    int col;
+    // Walk the row in 16-pixel chunks.
+    for (col = 0; col < 64; col += 16) {
+      const uint8x16_t src_px = vld1q_u8(src + col);
+      const uint8x16_t ref_px = vld1q_u8(ref + col);
+      sum_lo = vabal_u8(sum_lo, vget_low_u8(src_px), vget_low_u8(ref_px));
+      sum_hi = vabal_u8(sum_hi, vget_high_u8(src_px), vget_high_u8(ref_px));
+    }
+    src += src_stride;
+    ref += ref_stride;
+  }
+  return horizontal_long_add_16x8(sum_lo, sum_hi);
+}
+
+// Returns the sum of absolute differences between a 32x32 source block and a
+// 32x32 reference block.
+unsigned int aom_sad32x32_neon(const uint8_t *src, int src_stride,
+                               const uint8_t *ref, int ref_stride) {
+  // Each lane collects 2 differences per row over 32 rows (at most
+  // 64 * 255 = 16320), so the two accumulators can be merged in 16 bits.
+  uint16x8_t sum_lo = vdupq_n_u16(0);
+  uint16x8_t sum_hi = vdupq_n_u16(0);
+  int row;
+
+  for (row = 0; row < 32; ++row) {
+    int col;
+    // Walk the row in 16-pixel chunks.
+    for (col = 0; col < 32; col += 16) {
+      const uint8x16_t src_px = vld1q_u8(src + col);
+      const uint8x16_t ref_px = vld1q_u8(ref + col);
+      sum_lo = vabal_u8(sum_lo, vget_low_u8(src_px), vget_low_u8(ref_px));
+      sum_hi = vabal_u8(sum_hi, vget_high_u8(src_px), vget_high_u8(ref_px));
+    }
+    src += src_stride;
+    ref += ref_stride;
+  }
+  return horizontal_add_16x8(vaddq_u16(sum_lo, sum_hi));
+}
+
+// Returns the sum of absolute differences between a 16x16 source block and a
+// 16x16 reference block.
+unsigned int aom_sad16x16_neon(const uint8_t *src, int src_stride,
+                               const uint8_t *ref, int ref_stride) {
+  // Columns 0-7 and 8-15 are accumulated separately; each lane reaches at
+  // most 16 * 255, so merging them in 16 bits is safe.
+  uint16x8_t sum_left = vdupq_n_u16(0);
+  uint16x8_t sum_right = vdupq_n_u16(0);
+  int row;
+
+  for (row = 0; row < 16; ++row) {
+    const uint8x16_t src_px = vld1q_u8(src);
+    const uint8x16_t ref_px = vld1q_u8(ref);
+    sum_left = vabal_u8(sum_left, vget_low_u8(src_px), vget_low_u8(ref_px));
+    sum_right =
+        vabal_u8(sum_right, vget_high_u8(src_px), vget_high_u8(ref_px));
+    src += src_stride;
+    ref += ref_stride;
+  }
+  return horizontal_add_16x8(vaddq_u16(sum_left, sum_right));
+}
+
+// Returns the sum of absolute differences between an 8x8 source block and an
+// 8x8 reference block.
+unsigned int aom_sad8x8_neon(const uint8_t *src, int src_stride,
+                             const uint8_t *ref, int ref_stride) {
+  // One 16-bit accumulator per column.
+  uint16x8_t column_sums = vdupq_n_u16(0);
+  int row;
+
+  for (row = 0; row < 8; ++row) {
+    column_sums = vabal_u8(column_sums, vld1_u8(src), vld1_u8(ref));
+    src += src_stride;
+    ref += ref_stride;
+  }
+  return horizontal_add_16x8(column_sums);
+}
--- a/aom_dsp/arm/save_reg_neon.asm
+++ b/aom_dsp/arm/save_reg_neon.asm
@@ -0,0 +1,39 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+
+    EXPORT  |aom_push_neon|
+    EXPORT  |aom_pop_neon|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; Saves NEON registers d8-d15 to the buffer whose address is in r0.
+; Two 32-byte stores with post-increment are used, so 64 bytes are written
+; and r0 has been advanced past them when the routine returns.
+|aom_push_neon| PROC
+    vst1.i64            {d8, d9, d10, d11}, [r0]!
+    vst1.i64            {d12, d13, d14, d15}, [r0]!
+    bx              lr
+
+    ENDP
+
+; Restores NEON registers d8-d15 from the buffer whose address is in r0
+; (the layout written by aom_push_neon). 64 bytes are read and r0 has been
+; advanced past them when the routine returns.
+|aom_pop_neon| PROC
+    vld1.i64            {d8, d9, d10, d11}, [r0]!
+    vld1.i64            {d12, d13, d14, d15}, [r0]!
+    bx              lr
+
+    ENDP
+
+    END
+
--- a/aom_dsp/arm/subpel_variance_media.c
+++ b/aom_dsp/arm/subpel_variance_media.c
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "./aom_config.h"
+#include "./aom_dsp_rtcd.h"
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+
+#if HAVE_MEDIA
+// Two-tap bilinear filter coefficients, one pair per sub-pixel offset (0..7).
+// Each pair sums to 128. The functions below index this table directly with
+// xoffset / yoffset.
+static const int16_t bilinear_filters_media[8][2] = { { 128, 0 }, { 112, 16 },
+                                                      { 96, 32 }, { 80, 48 },
+                                                      { 64, 64 }, { 48, 80 },
+                                                      { 32, 96 }, { 16, 112 } };
+
+// First filter pass, implemented outside this file: reads 8-bit pixels from
+// src_ptr and writes 16-bit intermediate values to dst_ptr.
+extern void aom_filter_block2d_bil_first_pass_media(
+    const uint8_t *src_ptr, uint16_t *dst_ptr, uint32_t src_pitch,
+    uint32_t height, uint32_t width, const int16_t *filter);
+
+// Second filter pass, implemented outside this file: reads the 16-bit
+// intermediate values and writes 8-bit pixels to dst_ptr.
+extern void aom_filter_block2d_bil_second_pass_media(
+    const uint16_t *src_ptr, uint8_t *dst_ptr, int32_t src_pitch,
+    uint32_t height, uint32_t width, const int16_t *filter);
+
+// 8x8 sub-pixel variance: filters the source block with the bilinear taps
+// selected by xoffset and yoffset, then returns the result of
+// aom_variance8x8_media() on the filtered block and dst_ptr. sse is forwarded
+// to that call.
+unsigned int aom_sub_pixel_variance8x8_media(
+    const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset,
+    const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse) {
+  uint16_t first_pass[10 * 8];
+  uint8_t second_pass[8 * 8];
+  const int16_t *const x_taps = bilinear_filters_media[xoffset];
+  const int16_t *const y_taps = bilinear_filters_media[yoffset];
+
+  // The first pass is asked for 9 rows, one more than the block height.
+  aom_filter_block2d_bil_first_pass_media(src_ptr, first_pass,
+                                          src_pixels_per_line, 9, 8, x_taps);
+  aom_filter_block2d_bil_second_pass_media(first_pass, second_pass, 8, 8, 8,
+                                           y_taps);
+
+  return aom_variance8x8_media(second_pass, 8, dst_ptr, dst_pixels_per_line,
+                               sse);
+}
+
+// 16x16 sub-pixel variance. The three pure half-pixel offsets (4,0), (0,4)
+// and (4,4) are dispatched to dedicated routines; every other offset goes
+// through the two-pass bilinear filter followed by aom_variance16x16_media().
+// sse is forwarded to whichever routine is called.
+unsigned int aom_sub_pixel_variance16x16_media(
+    const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset,
+    const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse) {
+  uint16_t first_pass[36 * 16];
+  uint8_t second_pass[20 * 16];
+
+  if (xoffset == 4 && yoffset == 0) {
+    return aom_variance_halfpixvar16x16_h_media(
+        src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse);
+  }
+  if (xoffset == 0 && yoffset == 4) {
+    return aom_variance_halfpixvar16x16_v_media(
+        src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse);
+  }
+  if (xoffset == 4 && yoffset == 4) {
+    return aom_variance_halfpixvar16x16_hv_media(
+        src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse);
+  }
+
+  // General case. The first pass is asked for 17 rows, one more than the
+  // block height.
+  aom_filter_block2d_bil_first_pass_media(src_ptr, first_pass,
+                                          src_pixels_per_line, 17, 16,
+                                          bilinear_filters_media[xoffset]);
+  aom_filter_block2d_bil_second_pass_media(first_pass, second_pass, 16, 16, 16,
+                                           bilinear_filters_media[yoffset]);
+
+  return aom_variance16x16_media(second_pass, 16, dst_ptr, dst_pixels_per_line,
+                                 sse);
+}
+#endif  // HAVE_MEDIA
--- a/aom_dsp/arm/subpel_variance_neon.c
+++ b/aom_dsp/arm/subpel_variance_neon.c
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include "./aom_dsp_rtcd.h"
+#include "./aom_config.h"
+
+#include "aom_ports/mem.h"
+#include "aom/aom_integer.h"
+
+#include "aom_dsp/variance.h"
+
+// Two-tap bilinear filter coefficients, one pair per sub-pixel offset (0..7).
+// Each pair sums to 128. The functions below index this table directly with
+// xoffset / yoffset.
+static const uint8_t bilinear_filters[8][2] = {
+  { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
+  { 64, 64 }, { 48, 80 },  { 32, 96 }, { 16, 112 },
+};
+
+// Two-tap filter over a block that is 8 pixels wide. For each of the
+// output_height rows, every output pixel is
+//   (src[x] * filter[0] + src[x + pixel_step] * filter[1])
+// rounded and shifted right by FILTER_BITS. pixel_step is the distance to the
+// second tap. Rows of the source are src_pixels_per_line apart; rows of the
+// output are output_width apart.
+static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,
+                                      uint8_t *output_ptr,
+                                      unsigned int src_pixels_per_line,
+                                      int pixel_step,
+                                      unsigned int output_height,
+                                      unsigned int output_width,
+                                      const uint8_t *filter) {
+  const uint8x8_t tap0 = vdup_n_u8(filter[0]);
+  const uint8x8_t tap1 = vdup_n_u8(filter[1]);
+  unsigned int row;
+
+  for (row = 0; row < output_height; ++row) {
+    const uint8x8_t first_px = vld1_u8(src_ptr);
+    const uint8x8_t second_px = vld1_u8(src_ptr + pixel_step);
+    // Widening multiply-accumulate of both taps into 16-bit lanes.
+    const uint16x8_t weighted =
+        vmlal_u8(vmull_u8(first_px, tap0), second_px, tap1);
+    vst1_u8(output_ptr, vrshrn_n_u16(weighted, FILTER_BITS));
+
+    src_ptr += src_pixels_per_line;
+    output_ptr += output_width;
+  }
+}
+
+// Two-tap filter over a block whose width is processed in 16-pixel chunks.
+// For each of the output_height rows, every output pixel is
+//   (src[x] * filter[0] + src[x + pixel_step] * filter[1])
+// rounded and shifted right by FILTER_BITS. pixel_step is the distance to the
+// second tap. Rows of the source are src_pixels_per_line apart; rows of the
+// output are output_width apart.
+static void var_filter_block2d_bil_w16(const uint8_t *src_ptr,
+                                       uint8_t *output_ptr,
+                                       unsigned int src_pixels_per_line,
+                                       int pixel_step,
+                                       unsigned int output_height,
+                                       unsigned int output_width,
+                                       const uint8_t *filter) {
+  const uint8x8_t tap0 = vdup_n_u8(filter[0]);
+  const uint8x8_t tap1 = vdup_n_u8(filter[1]);
+  unsigned int row, col;
+
+  for (row = 0; row < output_height; ++row) {
+    for (col = 0; col < output_width; col += 16) {
+      const uint8x16_t first_px = vld1q_u8(src_ptr + col);
+      const uint8x16_t second_px = vld1q_u8(src_ptr + col + pixel_step);
+      // The widening multiplies work on 8 lanes, so each half is filtered
+      // separately and the two results are recombined for the store.
+      const uint16x8_t weighted_lo = vmlal_u8(
+          vmull_u8(vget_low_u8(first_px), tap0), vget_low_u8(second_px), tap1);
+      const uint16x8_t weighted_hi =
+          vmlal_u8(vmull_u8(vget_high_u8(first_px), tap0),
+                   vget_high_u8(second_px), tap1);
+      vst1q_u8(output_ptr + col,
+               vcombine_u8(vrshrn_n_u16(weighted_lo, FILTER_BITS),
+                           vrshrn_n_u16(weighted_hi, FILTER_BITS)));
+    }
+
+    src_ptr += src_pixels_per_line;
+    output_ptr += output_width;
+  }
+}
+
+// 8x8 sub-pixel variance: filters src with the bilinear taps selected by
+// xoffset and yoffset, then returns aom_variance8x8_neon() of the filtered
+// block against dst. sse is forwarded to that call.
+unsigned int aom_sub_pixel_variance8x8_neon(const uint8_t *src, int src_stride,
+                                            int xoffset, int yoffset,
+                                            const uint8_t *dst, int dst_stride,
+                                            unsigned int *sse) {
+  DECLARE_ALIGNED(16, uint8_t, horiz_out[9 * 8]);
+  DECLARE_ALIGNED(16, uint8_t, filtered[8 * 8]);
+  const uint8_t *const x_taps = bilinear_filters[xoffset];
+  const uint8_t *const y_taps = bilinear_filters[yoffset];
+
+  // Horizontal pass: second tap is the next pixel; 9 rows are produced so the
+  // vertical pass has a row below every output row.
+  var_filter_block2d_bil_w8(src, horiz_out, src_stride, 1, 9, 8, x_taps);
+  // Vertical pass: second tap is one row (8 bytes) further on.
+  var_filter_block2d_bil_w8(horiz_out, filtered, 8, 8, 8, 8, y_taps);
+  return aom_variance8x8_neon(filtered, 8, dst, dst_stride, sse);
+}
+
+// 16x16 sub-pixel variance: filters src with the bilinear taps selected by
+// xoffset and yoffset, then returns aom_variance16x16_neon() of the filtered
+// block against dst. sse is forwarded to that call.
+unsigned int aom_sub_pixel_variance16x16_neon(const uint8_t *src,
+                                              int src_stride, int xoffset,
+                                              int yoffset, const uint8_t *dst,
+                                              int dst_stride,
+                                              unsigned int *sse) {
+  DECLARE_ALIGNED(16, uint8_t, horiz_out[17 * 16]);
+  DECLARE_ALIGNED(16, uint8_t, filtered[16 * 16]);
+  const uint8_t *const x_taps = bilinear_filters[xoffset];
+  const uint8_t *const y_taps = bilinear_filters[yoffset];
+
+  // Horizontal pass: second tap is the next pixel; 17 rows are produced so
+  // the vertical pass has a row below every output row.
+  var_filter_block2d_bil_w16(src, horiz_out, src_stride, 1, 17, 16, x_taps);
+  // Vertical pass: second tap is one row (16 bytes) further on.
+  var_filter_block2d_bil_w16(horiz_out, filtered, 16, 16, 16, 16, y_taps);
+  return aom_variance16x16_neon(filtered, 16, dst, dst_stride, sse);
+}
+
+// 32x32 sub-pixel variance: filters src with the bilinear taps selected by
+// xoffset and yoffset, then returns aom_variance32x32_neon() of the filtered
+// block against dst. sse is forwarded to that call.
+unsigned int aom_sub_pixel_variance32x32_neon(const uint8_t *src,
+                                              int src_stride, int xoffset,
+                                              int yoffset, const uint8_t *dst,
+                                              int dst_stride,
+                                              unsigned int *sse) {
+  DECLARE_ALIGNED(16, uint8_t, horiz_out[33 * 32]);
+  DECLARE_ALIGNED(16, uint8_t, filtered[32 * 32]);
+  const uint8_t *const x_taps = bilinear_filters[xoffset];
+  const uint8_t *const y_taps = bilinear_filters[yoffset];
+
+  // Horizontal pass: second tap is the next pixel; 33 rows are produced so
+  // the vertical pass has a row below every output row.
+  var_filter_block2d_bil_w16(src, horiz_out, src_stride, 1, 33, 32, x_taps);
+  // Vertical pass: second tap is one row (32 bytes) further on.
+  var_filter_block2d_bil_w16(horiz_out, filtered, 32, 32, 32, 32, y_taps);
+  return aom_variance32x32_neon(filtered, 32, dst, dst_stride, sse);
+}
+
+// 64x64 sub-pixel variance: filters src with the bilinear taps selected by
+// xoffset and yoffset, then returns aom_variance64x64_neon() of the filtered
+// block against dst. sse is forwarded to that call.
+unsigned int aom_sub_pixel_variance64x64_neon(const uint8_t *src,
+                                              int src_stride, int xoffset,
+                                              int yoffset, const uint8_t *dst,
+                                              int dst_stride,
+                                              unsigned int *sse) {
+  DECLARE_ALIGNED(16, uint8_t, horiz_out[65 * 64]);
+  DECLARE_ALIGNED(16, uint8_t, filtered[64 * 64]);
+  const uint8_t *const x_taps = bilinear_filters[xoffset];
+  const uint8_t *const y_taps = bilinear_filters[yoffset];
+
+  // Horizontal pass: second tap is the next pixel; 65 rows are produced so
+  // the vertical pass has a row below every output row.
+  var_filter_block2d_bil_w16(src, horiz_out, src_stride, 1, 65, 64, x_taps);
+  // Vertical pass: second tap is one row (64 bytes) further on.
+  var_filter_block2d_bil_w16(horiz_out, filtered, 64, 64, 64, 64, y_taps);
+  return aom_variance64x64_neon(filtered, 64, dst, dst_stride, sse);
+}
--- a/aom_dsp/arm/subtract_neon.c
+++ b/aom_dsp/arm/subtract_neon.c
@@ -1,19 +1,20 @@
 /*
- *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

 #include <arm_neon.h>

-#include "./vpx_config.h"
-#include "vpx/vpx_integer.h"
+#include "./aom_config.h"
+#include "aom/aom_integer.h"

-void vpx_subtract_block_neon(int rows, int cols, int16_t *diff,
+void aom_subtract_block_neon(int rows, int cols, int16_t *diff,
                             ptrdiff_t diff_stride, const uint8_t *src,
                             ptrdiff_t src_stride, const uint8_t *pred,
                             ptrdiff_t pred_stride) {
--- a/aom_dsp/arm/variance_halfpixvar16x16_h_media.asm
+++ b/aom_dsp/arm/variance_halfpixvar16x16_h_media.asm
@@ -0,0 +1,185 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+
+    EXPORT  |aom_variance_halfpixvar16x16_h_media|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    unsigned char *src_ptr
+; r1    int source_stride
+; r2    unsigned char *ref_ptr
+; r3    int  recon_stride
+; stack unsigned int *sse
+; Variance of a 16x16 block whose source pixels are first interpolated at the
+; horizontal half-pixel position (average of each pixel and its right-hand
+; neighbour) and then compared with the reference block.
+;
+; Each row is processed as four groups of 4 pixels. For every group the code
+; keeps a running signed sum of differences in r8 and a running sum of
+; squared differences in r11.
+;
+; On return r0 = sse - ((sum * sum) >> 8) and sse has been stored through the
+; pointer passed on the stack.
+|aom_variance_halfpixvar16x16_h_media| PROC
+
+    stmfd   sp!, {r4-r12, lr}
+
+    pld     [r0, r1, lsl #0]
+    pld     [r2, r3, lsl #0]
+
+    mov     r8, #0              ; initialize sum = 0
+    ldr     r10, c80808080
+    mov     r11, #0             ; initialize sse = 0
+    mov     r12, #16            ; set loop counter to 16 (=block height)
+    mov     lr, #0              ; constant zero
+loop
+    ; 1st 4 pixels
+    ldr     r4, [r0, #0]        ; load 4 src pixels
+    ldr     r6, [r0, #1]        ; load 4 src pixels with 1 byte offset
+    ldr     r5, [r2, #0]        ; load 4 ref pixels
+
+    ; bilinear interpolation
+    ; Per byte: uhsub8(a, ~b) gives ((a + b + 1) >> 1) - 128, and the eor with
+    ; 0x80 in every byte (r10) adds the 128 back, leaving (a + b + 1) >> 1.
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+
+    usub8   r6, r4, r5          ; calculate difference
+    pld     [r0, r1, lsl #1]
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    pld     [r2, r3, lsl #1]
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+    ; calculate total sum
+    adds    r8, r8, r4          ; add positive differences to sum
+    subs    r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 2nd 4 pixels
+    ldr     r4, [r0, #4]        ; load 4 src pixels
+    ldr     r6, [r0, #5]        ; load 4 src pixels with 1 byte offset
+    ldr     r5, [r2, #4]        ; load 4 ref pixels
+
+    ; bilinear interpolation
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 3rd 4 pixels
+    ldr     r4, [r0, #8]        ; load 4 src pixels
+    ldr     r6, [r0, #9]        ; load 4 src pixels with 1 byte offset
+    ldr     r5, [r2, #8]        ; load 4 ref pixels
+
+    ; bilinear interpolation
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+
+    smlad   r11, r7, r7, r11  ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 4th 4 pixels
+    ldr     r4, [r0, #12]       ; load 4 src pixels
+    ldr     r6, [r0, #13]       ; load 4 src pixels with 1 byte offset
+    ldr     r5, [r2, #12]       ; load 4 ref pixels
+
+    ; bilinear interpolation
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    add     r0, r0, r1          ; set src_ptr to next row
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    add     r2, r2, r3          ; set dst_ptr to next row
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    subs    r12, r12, #1
+
+    bne     loop
+
+    ; return stuff
+    ; The prologue pushed 10 registers (40 bytes), so the caller's first
+    ; stack argument now sits at [sp, #40].
+    ldr     r6, [sp, #40]       ; get address of sse
+    mul     r0, r8, r8          ; sum * sum
+    str     r11, [r6]           ; store sse
+    sub     r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))
+
+    ldmfd   sp!, {r4-r12, pc}
+
+    ENDP
+
+; Literal loaded into r10 above: 0x80 in every byte, used by the interpolation
+; step to undo the -128 bias left by uhsub8.
+c80808080
+    DCD     0x80808080
+
+    END
+
--- a/aom_dsp/arm/variance_halfpixvar16x16_hv_media.asm
+++ b/aom_dsp/arm/variance_halfpixvar16x16_hv_media.asm
@@ -0,0 +1,225 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+
+    EXPORT  |aom_variance_halfpixvar16x16_hv_media|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    unsigned char *src_ptr
+; r1    int source_stride
+; r2    unsigned char *ref_ptr
+; r3    int  recon_stride
+; stack unsigned int *sse
+|aom_variance_halfpixvar16x16_hv_media| PROC
+    ; 16x16 variance of ref vs. src averaged over each 2x2 neighbourhood a,b,c,d.
+    stmfd   sp!, {r4-r12, lr}   ; save 10 registers (40 bytes) on the stack
+
+    pld     [r0, r1, lsl #0]    ; preload src at r0 + r1
+    pld     [r2, r3, lsl #0]    ; preload ref at r2 + r3
+
+    mov     r8, #0              ; initialize sum = 0
+    ldr     r10, c80808080      ; 0x80 in every byte, bias for the averages
+    mov     r11, #0             ; initialize sse = 0
+    mov     r12, #16            ; set loop counter to 16 (=block height)
+    mov     lr, #0              ; constant zero
+loop
+    add     r9, r0, r1          ; pointer to pixels on the next row
+    ; 1st 4 pixels
+    ldr     r4, [r0, #0]        ; load source pixels a, row N
+    ldr     r6, [r0, #1]        ; load source pixels b, row N
+    ldr     r5, [r9, #0]        ; load source pixels c, row N+1
+    ldr     r7, [r9, #1]        ; load source pixels d, row N+1
+
+    ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
+    mvn     r6, r6              ; ~b = 255 - b in every byte
+    uhsub8  r4, r4, r6          ; (a - (255 - b)) >> 1 = x - 128 (mod 256)
+    eor     r4, r4, r10         ; flip bit 7 of every byte: adds 128 -> x
+    ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
+    mvn     r7, r7              ; same three-step average on c and d
+    uhsub8  r5, r5, r7
+    eor     r5, r5, r10
+    ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
+    mvn     r5, r5              ; same three-step average on x and y
+    uhsub8  r4, r4, r5
+    ldr     r5, [r2, #0]        ; load 4 ref pixels
+    eor     r4, r4, r10
+
+    usub8   r6, r4, r5          ; z - ref per byte, sets the GE flags
+    pld     [r0, r1, lsl #1]    ; preload src at r0 + 2 * r1
+    sel     r7, r6, lr          ; keep bytes where z >= ref, else 0
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    pld     [r2, r3, lsl #1]    ; preload ref at r2 + 2 * r3
+    sel     r6, r6, lr          ; keep bytes where ref >= z, else 0
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; absolute differences of all 4 pixels
+    ; calculate total sum
+    adds    r8, r8, r4          ; add positive differences to sum
+    subs    r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; bytes 0 and 2 (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; bytes 1 and 3 (other two pixels) to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 2nd 4 pixels
+    ldr     r4, [r0, #4]        ; load source pixels a, row N
+    ldr     r6, [r0, #5]        ; load source pixels b, row N
+    ldr     r5, [r9, #4]        ; load source pixels c, row N+1
+
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    ldr     r7, [r9, #5]        ; load source pixels d, row N+1
+
+    ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+    ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
+    mvn     r7, r7
+    uhsub8  r5, r5, r7
+    eor     r5, r5, r10
+    ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
+    mvn     r5, r5
+    uhsub8  r4, r4, r5
+    ldr     r5, [r2, #4]        ; load 4 ref pixels
+    eor     r4, r4, r10
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; absolute differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; bytes 0 and 2 (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; bytes 1 and 3 (other two pixels) to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 3rd 4 pixels
+    ldr     r4, [r0, #8]        ; load source pixels a, row N
+    ldr     r6, [r0, #9]        ; load source pixels b, row N
+    ldr     r5, [r9, #8]        ; load source pixels c, row N+1
+
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    ldr     r7, [r9, #9]        ; load source pixels d, row N+1
+
+    ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+    ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
+    mvn     r7, r7
+    uhsub8  r5, r5, r7
+    eor     r5, r5, r10
+    ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
+    mvn     r5, r5
+    uhsub8  r4, r4, r5
+    ldr     r5, [r2, #8]        ; load 4 ref pixels
+    eor     r4, r4, r10
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; absolute differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; bytes 0 and 2 (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; bytes 1 and 3 (other two pixels) to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 4th 4 pixels
+    ldr     r4, [r0, #12]       ; load source pixels a, row N
+    ldr     r6, [r0, #13]       ; load source pixels b, row N
+    ldr     r5, [r9, #12]       ; load source pixels c, row N+1
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+    ldr     r7, [r9, #13]       ; load source pixels d, row N+1
+
+    ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+    ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
+    mvn     r7, r7
+    uhsub8  r5, r5, r7
+    eor     r5, r5, r10
+    ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
+    mvn     r5, r5
+    uhsub8  r4, r4, r5
+    ldr     r5, [r2, #12]       ; load 4 ref pixels
+    eor     r4, r4, r10
+
+    usub8   r6, r4, r5          ; calculate difference
+    add     r0, r0, r1          ; set src_ptr to next row
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    add     r2, r2, r3          ; set ref_ptr to next row
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; absolute differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; bytes 0 and 2 (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; bytes 1 and 3 (other two pixels) to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+    subs    r12, r12, #1        ; one row done; Z flag feeds the bne below
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    bne     loop
+
+    ; store sse through the stack argument and return the variance in r0
+    ldr     r6, [sp, #40]       ; get address of sse (above the 40 saved bytes)
+    mul     r0, r8, r8          ; sum * sum
+    str     r11, [r6]           ; store sse
+    sub     r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))
+
+    ldmfd   sp!, {r4-r12, pc}   ; restore registers; loading pc returns
+
+    ENDP
+
+c80808080
+    DCD     0x80808080
+
+    END
--- a/aom_dsp/arm/variance_halfpixvar16x16_v_media.asm
+++ b/aom_dsp/arm/variance_halfpixvar16x16_v_media.asm
@@ -0,0 +1,187 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+
+    EXPORT  |aom_variance_halfpixvar16x16_v_media|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    unsigned char *src_ptr
+; r1    int source_stride
+; r2    unsigned char *ref_ptr
+; r3    int  recon_stride
+; stack unsigned int *sse
+|aom_variance_halfpixvar16x16_v_media| PROC
+    ; 16x16 variance of ref vs. src averaged with the pixel one row below.
+    stmfd   sp!, {r4-r12, lr}   ; save 10 registers (40 bytes) on the stack
+
+    pld     [r0, r1, lsl #0]    ; preload src at r0 + r1
+    pld     [r2, r3, lsl #0]    ; preload ref at r2 + r3
+
+    mov     r8, #0              ; initialize sum = 0
+    ldr     r10, c80808080      ; 0x80 in every byte, bias for the averages
+    mov     r11, #0             ; initialize sse = 0
+    mov     r12, #16            ; set loop counter to 16 (=block height)
+    mov     lr, #0              ; constant zero
+loop
+    add     r9, r0, r1          ; r9 = pointer to src pixels on the next row
+    ; 1st 4 pixels
+    ldr     r4, [r0, #0]        ; load 4 src pixels
+    ldr     r6, [r9, #0]        ; load 4 src pixels from next row
+    ldr     r5, [r2, #0]        ; load 4 ref pixels
+
+    ; x = (a + c + 1) >> 1 per byte, a = row N, c = row N+1
+    mvn     r6, r6              ; ~c = 255 - c in every byte
+    uhsub8  r4, r4, r6          ; (a - (255 - c)) >> 1 = x - 128 (mod 256)
+    eor     r4, r4, r10         ; flip bit 7 of every byte: adds 128 -> x
+
+    usub8   r6, r4, r5          ; x - ref per byte, sets the GE flags
+    pld     [r0, r1, lsl #1]    ; preload src at r0 + 2 * r1
+    sel     r7, r6, lr          ; keep bytes where x >= ref, else 0
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    pld     [r2, r3, lsl #1]    ; preload ref at r2 + 2 * r3
+    sel     r6, r6, lr          ; keep bytes where ref >= x, else 0
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; absolute differences of all 4 pixels
+    ; calculate total sum
+    adds    r8, r8, r4          ; add positive differences to sum
+    subs    r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; bytes 0 and 2 (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; bytes 1 and 3 (other two pixels) to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 2nd 4 pixels
+    ldr     r4, [r0, #4]        ; load 4 src pixels
+    ldr     r6, [r9, #4]        ; load 4 src pixels from next row
+    ldr     r5, [r2, #4]        ; load 4 ref pixels
+
+    ; x = (a + c + 1) >> 1 per byte, a = row N, c = row N+1
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; absolute differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; bytes 0 and 2 (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; bytes 1 and 3 (other two pixels) to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 3rd 4 pixels
+    ldr     r4, [r0, #8]        ; load 4 src pixels
+    ldr     r6, [r9, #8]        ; load 4 src pixels from next row
+    ldr     r5, [r2, #8]        ; load 4 ref pixels
+
+    ; x = (a + c + 1) >> 1 per byte, a = row N, c = row N+1
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; absolute differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; bytes 0 and 2 (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; bytes 1 and 3 (other two pixels) to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 4th 4 pixels
+    ldr     r4, [r0, #12]       ; load 4 src pixels
+    ldr     r6, [r9, #12]       ; load 4 src pixels from next row
+    ldr     r5, [r2, #12]       ; load 4 ref pixels
+
+    ; x = (a + c + 1) >> 1 per byte, a = row N, c = row N+1
+    mvn     r6, r6
+    uhsub8  r4, r4, r6
+    eor     r4, r4, r10
+
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    add     r0, r0, r1          ; set src_ptr to next row
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r6, r5, r4          ; calculate difference with reversed operands
+    add     r2, r2, r3          ; set ref_ptr to next row
+    sel     r6, r6, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; absolute differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; bytes 0 and 2 (two pixels) to halfwords
+    uxtb16  r7, r6, ror #8      ; bytes 1 and 3 (other two pixels) to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
+
+
+    subs    r12, r12, #1        ; one row done; Z flag feeds the bne below
+
+    bne     loop
+
+    ; store sse through the stack argument and return the variance in r0
+    ldr     r6, [sp, #40]       ; get address of sse (above the 40 saved bytes)
+    mul     r0, r8, r8          ; sum * sum
+    str     r11, [r6]           ; store sse
+    sub     r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))
+
+    ldmfd   sp!, {r4-r12, pc}   ; restore registers; loading pc returns
+
+    ENDP
+
+c80808080
+    DCD     0x80808080
+
+    END
+
--- a/aom_dsp/arm/variance_media.asm
+++ b/aom_dsp/arm/variance_media.asm
@@ -0,0 +1,361 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+
+    EXPORT  |aom_variance16x16_media|
+    EXPORT  |aom_variance8x8_media|
+    EXPORT  |aom_mse16x16_media|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    unsigned char *src_ptr
+; r1    int source_stride
+; r2    unsigned char *ref_ptr
+; r3    int  recon_stride
+; stack unsigned int *sse
+|aom_variance16x16_media| PROC
+    ; 16x16 variance of src vs. ref; sse is also stored through the stack arg.
+    stmfd   sp!, {r4-r12, lr}   ; save 10 registers (40 bytes) on the stack
+
+    pld     [r0, r1, lsl #0]    ; preload src at r0 + r1
+    pld     [r2, r3, lsl #0]    ; preload ref at r2 + r3
+
+    mov     r8, #0              ; initialize sum = 0
+    mov     r11, #0             ; initialize sse = 0
+    mov     r12, #16            ; set loop counter to 16 (=block height)
+
+loop16x16
+    ; 1st 4 pixels
+    ldr     r4, [r0, #0]        ; load 4 src pixels
+    ldr     r5, [r2, #0]        ; load 4 ref pixels
+
+    mov     lr, #0              ; constant zero (rewritten on every row)
+
+    usub8   r6, r4, r5          ; src - ref per byte, sets the GE flags
+    pld     [r0, r1, lsl #1]    ; preload src at r0 + 2 * r1
+    sel     r7, r6, lr          ; keep bytes where src >= ref, else 0
+    usub8   r9, r5, r4          ; calculate difference with reversed operands
+    pld     [r2, r3, lsl #1]    ; preload ref at r2 + 2 * r3
+    sel     r6, r9, lr          ; keep bytes where ref >= src, else 0
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; absolute differences of all 4 pixels
+    ; calculate total sum
+    adds    r8, r8, r4          ; add positive differences to sum
+    subs    r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; bytes 0 and 2 (two pixels) to halfwords
+    uxtb16  r10, r6, ror #8     ; bytes 1 and 3 (other two pixels) to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 2nd 4 pixels
+    ldr     r4, [r0, #4]        ; load 4 src pixels
+    ldr     r5, [r2, #4]        ; load 4 ref pixels
+    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r9, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; absolute differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; bytes 0 and 2 (two pixels) to halfwords
+    uxtb16  r10, r6, ror #8     ; bytes 1 and 3 (other two pixels) to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 3rd 4 pixels
+    ldr     r4, [r0, #8]        ; load 4 src pixels
+    ldr     r5, [r2, #8]        ; load 4 ref pixels
+    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r9, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; absolute differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; bytes 0 and 2 (two pixels) to halfwords
+    uxtb16  r10, r6, ror #8     ; bytes 1 and 3 (other two pixels) to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 4th 4 pixels
+    ldr     r4, [r0, #12]       ; load 4 src pixels
+    ldr     r5, [r2, #12]       ; load 4 ref pixels
+    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    add     r0, r0, r1          ; set src_ptr to next row
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r9, r5, r4          ; calculate difference with reversed operands
+    add     r2, r2, r3          ; set ref_ptr to next row
+    sel     r6, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; absolute differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; bytes 0 and 2 (two pixels) to halfwords
+    uxtb16  r10, r6, ror #8     ; bytes 1 and 3 (other two pixels) to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)
+
+
+    subs    r12, r12, #1        ; one row done; Z flag feeds the bne below
+
+    bne     loop16x16
+
+    ; store sse through the stack argument and return the variance in r0
+    ldr     r6, [sp, #40]       ; get address of sse (above the 40 saved bytes)
+    mul     r0, r8, r8          ; sum * sum
+    str     r11, [r6]           ; store sse
+    sub     r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))
+
+    ldmfd   sp!, {r4-r12, pc}   ; restore registers; loading pc returns
+
+    ENDP
+
+; r0    unsigned char *src_ptr
+; r1    int source_stride
+; r2    unsigned char *ref_ptr
+; r3    int  recon_stride
+; stack unsigned int *sse
+|aom_variance8x8_media| PROC
+    ; 8x8 variance of src vs. ref; sse is also stored through the stack arg.
+    push    {r4-r10, lr}        ; save 8 registers (32 bytes) on the stack
+
+    pld     [r0, r1, lsl #0]    ; preload src at r0 + r1
+    pld     [r2, r3, lsl #0]    ; preload ref at r2 + r3
+
+    mov     r12, #8             ; set loop counter to 8 (=block height)
+    mov     r4, #0              ; initialize sum = 0
+    mov     r5, #0              ; initialize sse = 0
+
+loop8x8
+    ; 1st 4 pixels
+    ldr     r6, [r0, #0x0]      ; load 4 src pixels
+    ldr     r7, [r2, #0x0]      ; load 4 ref pixels
+
+    mov     lr, #0              ; constant zero (rewritten on every row)
+
+    usub8   r8, r6, r7          ; src - ref per byte, sets the GE flags
+    pld     [r0, r1, lsl #1]    ; preload src at r0 + 2 * r1
+    sel     r10, r8, lr         ; keep bytes where src >= ref, else 0
+    usub8   r9, r7, r6          ; calculate difference with reversed operands
+    pld     [r2, r3, lsl #1]    ; preload ref at r2 + 2 * r3
+    sel     r8, r9, lr          ; keep bytes where ref >= src, else 0
+
+    ; calculate partial sums
+    usad8   r6, r10, lr         ; calculate sum of positive differences
+    usad8   r7, r8, lr          ; calculate sum of negative differences
+    orr     r8, r8, r10         ; absolute differences of all 4 pixels
+    ; calculate total sum
+    add    r4, r4, r6           ; add positive differences to sum
+    sub    r4, r4, r7           ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r7, r8              ; bytes 0 and 2 (two pixels) to halfwords
+    uxtb16  r10, r8, ror #8     ; bytes 1 and 3 (other two pixels) to halfwords
+    smlad   r5, r7, r7, r5      ; dual signed multiply, add and accumulate (1)
+
+    ; 2nd 4 pixels
+    ldr     r6, [r0, #0x4]      ; load 4 src pixels
+    ldr     r7, [r2, #0x4]      ; load 4 ref pixels
+    smlad   r5, r10, r10, r5    ; dual signed multiply, add and accumulate (2)
+
+    usub8   r8, r6, r7          ; calculate difference
+    add     r0, r0, r1          ; set src_ptr to next row
+    sel     r10, r8, lr         ; select bytes with positive difference
+    usub8   r9, r7, r6          ; calculate difference with reversed operands
+    add     r2, r2, r3          ; set ref_ptr to next row
+    sel     r8, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r6, r10, lr         ; calculate sum of positive differences
+    usad8   r7, r8, lr          ; calculate sum of negative differences
+    orr     r8, r8, r10         ; absolute differences of all 4 pixels
+
+    ; calculate total sum
+    add     r4, r4, r6          ; add positive differences to sum
+    sub     r4, r4, r7          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r7, r8              ; bytes 0 and 2 (two pixels) to halfwords
+    uxtb16  r10, r8, ror #8     ; bytes 1 and 3 (other two pixels) to halfwords
+    smlad   r5, r7, r7, r5      ; dual signed multiply, add and accumulate (1)
+    subs    r12, r12, #1        ; next row; Z flag feeds the bne below
+    smlad   r5, r10, r10, r5    ; dual signed multiply, add and accumulate (2)
+
+    bne     loop8x8
+
+    ; store sse through the stack argument and return the variance in r0
+    ldr     r8, [sp, #32]       ; get address of sse (above the 32 saved bytes)
+    mul     r1, r4, r4          ; sum * sum
+    str     r5, [r8]            ; store sse
+    sub     r0, r5, r1, ASR #6  ; return (sse - ((sum * sum) >> 6))
+
+    pop     {r4-r10, pc}        ; restore registers; loading pc returns
+
+    ENDP
+
+; r0    unsigned char *src_ptr
+; r1    int source_stride
+; r2    unsigned char *ref_ptr
+; r3    int  recon_stride
+; stack unsigned int *sse
+;
+;note: Based on aom_variance16x16_media. The sum is not needed for mse, so
+;      the sum accumulation is left out of this function.
+
+|aom_mse16x16_media| PROC
+    ; Sum of squared differences of a 16x16 block: stored to *sse and returned.
+    push    {r4-r9, lr}         ; save 7 registers (28 bytes) on the stack
+
+    pld     [r0, r1, lsl #0]    ; preload src at r0 + r1
+    pld     [r2, r3, lsl #0]    ; preload ref at r2 + r3
+
+    mov     r12, #16            ; set loop counter to 16 (=block height)
+    mov     r4, #0              ; initialize sse = 0
+
+loopmse
+    ; 1st 4 pixels
+    ldr     r5, [r0, #0x0]      ; load 4 src pixels
+    ldr     r6, [r2, #0x0]      ; load 4 ref pixels
+
+    mov     lr, #0              ; constant zero (rewritten on every row)
+
+    usub8   r8, r5, r6          ; src - ref per byte, sets the GE flags
+    pld     [r0, r1, lsl #1]    ; preload src at r0 + 2 * r1
+    sel     r7, r8, lr          ; keep bytes where src >= ref, else 0
+    usub8   r9, r6, r5          ; calculate difference with reversed operands
+    pld     [r2, r3, lsl #1]    ; preload ref at r2 + 2 * r3
+    sel     r8, r9, lr          ; keep bytes where ref >= src, else 0
+
+    ; absolute differences = positive part | negative part
+    ; (in every byte at least one of the two parts is zero);
+    ; no usad8 partial sums are taken: mse does not use the sum
+    orr     r8, r8, r7          ; absolute differences of all 4 pixels
+
+    ldr     r5, [r0, #0x4]      ; load 4 src pixels
+
+    ; calculate sse
+    uxtb16  r6, r8              ; bytes 0 and 2 (two pixels) to halfwords
+    uxtb16  r7, r8, ror #8      ; bytes 1 and 3 (other two pixels) to halfwords
+    smlad   r4, r6, r6, r4      ; dual signed multiply, add and accumulate (1)
+
+    ; 2nd 4 pixels
+    ldr     r6, [r2, #0x4]      ; load 4 ref pixels
+    smlad   r4, r7, r7, r4      ; dual signed multiply, add and accumulate (2)
+
+    usub8   r8, r5, r6          ; calculate difference
+    sel     r7, r8, lr          ; select bytes with positive difference
+    usub8   r9, r6, r5          ; calculate difference with reversed operands
+    sel     r8, r9, lr          ; select bytes with negative difference
+
+    ; absolute differences = positive part | negative part
+    ; (no usad8 partial sums: mse does not use the sum)
+    orr     r8, r8, r7          ; absolute differences of all 4 pixels
+
+    ldr     r5, [r0, #0x8]      ; load 4 src pixels
+    ; calculate sse
+    uxtb16  r6, r8              ; bytes 0 and 2 (two pixels) to halfwords
+    uxtb16  r7, r8, ror #8      ; bytes 1 and 3 (other two pixels) to halfwords
+    smlad   r4, r6, r6, r4      ; dual signed multiply, add and accumulate (1)
+
+    ; 3rd 4 pixels
+    ldr     r6, [r2, #0x8]      ; load 4 ref pixels
+    smlad   r4, r7, r7, r4      ; dual signed multiply, add and accumulate (2)
+
+    usub8   r8, r5, r6          ; calculate difference
+    sel     r7, r8, lr          ; select bytes with positive difference
+    usub8   r9, r6, r5          ; calculate difference with reversed operands
+    sel     r8, r9, lr          ; select bytes with negative difference
+
+    ; absolute differences = positive part | negative part
+    ; (in every byte at least one of the two parts is zero);
+    ; no usad8 partial sums are taken: mse does not use the sum
+    orr     r8, r8, r7          ; absolute differences of all 4 pixels
+
+    ldr     r5, [r0, #0xc]      ; load 4 src pixels
+
+    ; calculate sse
+    uxtb16  r6, r8              ; bytes 0 and 2 (two pixels) to halfwords
+    uxtb16  r7, r8, ror #8      ; bytes 1 and 3 (other two pixels) to halfwords
+    smlad   r4, r6, r6, r4      ; dual signed multiply, add and accumulate (1)
+
+    ; 4th 4 pixels
+    ldr     r6, [r2, #0xc]      ; load 4 ref pixels
+    smlad   r4, r7, r7, r4      ; dual signed multiply, add and accumulate (2)
+
+    usub8   r8, r5, r6          ; calculate difference
+    add     r0, r0, r1          ; set src_ptr to next row
+    sel     r7, r8, lr          ; select bytes with positive difference
+    usub8   r9, r6, r5          ; calculate difference with reversed operands
+    add     r2, r2, r3          ; set ref_ptr to next row
+    sel     r8, r9, lr          ; select bytes with negative difference
+
+    ; absolute differences = positive part | negative part
+    ; (in every byte at least one of the two parts is zero);
+    ; no usad8 partial sums are taken: mse does not use the sum
+    orr     r8, r8, r7          ; absolute differences of all 4 pixels
+
+    subs    r12, r12, #1        ; next row; Z flag feeds the bne below
+
+    ; calculate sse
+    uxtb16  r6, r8              ; bytes 0 and 2 (two pixels) to halfwords
+    uxtb16  r7, r8, ror #8      ; bytes 1 and 3 (other two pixels) to halfwords
+    smlad   r4, r6, r6, r4      ; dual signed multiply, add and accumulate (1)
+    smlad   r4, r7, r7, r4      ; dual signed multiply, add and accumulate (2)
+
+    bne     loopmse
+
+    ; store sse through the stack argument and return it in r0 as well
+    ldr     r1, [sp, #28]       ; get address of sse (above the 28 saved bytes)
+    mov     r0, r4              ; return sse
+    str     r4, [r1]            ; store sse
+
+    pop     {r4-r9, pc}         ; restore registers; loading pc returns
+
+    ENDP
+
+    END
--- a/aom_dsp/arm/variance_neon.c
+++ b/aom_dsp/arm/variance_neon.c
@@ -0,0 +1,400 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "./aom_dsp_rtcd.h"
+#include "./aom_config.h"
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+
+static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) {
+  const int32x4_t a = vpaddlq_s16(v_16x8);  // 8 x s16 -> 4 x s32 pair sums
+  const int64x2_t b = vpaddlq_s32(a);       // 4 x s32 -> 2 x s64 pair sums
+  const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
+                               vreinterpret_s32_s64(vget_high_s64(b)));
+  return vget_lane_s32(c, 0);  // lane 0 of c is returned as the total
+}
+
+static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) {
+  const int64x2_t b = vpaddlq_s32(v_32x4);  // 4 x s32 -> 2 x s64 pair sums
+  const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
+                               vreinterpret_s32_s64(vget_high_s64(b)));
+  return vget_lane_s32(c, 0);  // lane 0 of c is returned as the total
+}
+
+// w * h must be <= 1024 (128 diffs per int16 lane) or v_sum may overflow.
+static void variance_neon_w8(const uint8_t *a, int a_stride, const uint8_t *b,
+                             int b_stride, int w, int h, uint32_t *sse,
+                             int *sum) {  // outputs: *sse and *sum of (a - b)
+  int i, j;
+  int16x8_t v_sum = vdupq_n_s16(0);     // per-lane sum of differences
+  int32x4_t v_sse_lo = vdupq_n_s32(0);  // squares of diff lanes 0..3
+  int32x4_t v_sse_hi = vdupq_n_s32(0);  // squares of diff lanes 4..7
+
+  for (i = 0; i < h; ++i) {
+    for (j = 0; j < w; j += 8) {  // 8 pixels per step
+      const uint8x8_t v_a = vld1_u8(&a[j]);
+      const uint8x8_t v_b = vld1_u8(&b[j]);
+      const uint16x8_t v_diff = vsubl_u8(v_a, v_b);  // a - b widened to 16 bits
+      const int16x8_t sv_diff = vreinterpretq_s16_u16(v_diff);  // signed view
+      v_sum = vaddq_s16(v_sum, sv_diff);
+      v_sse_lo =
+          vmlal_s16(v_sse_lo, vget_low_s16(sv_diff), vget_low_s16(sv_diff));
+      v_sse_hi =
+          vmlal_s16(v_sse_hi, vget_high_s16(sv_diff), vget_high_s16(sv_diff));
+    }
+    a += a_stride;  // advance both pointers to the next row
+    b += b_stride;
+  }
+
+  *sum = horizontal_add_s16x8(v_sum);
+  *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi));
+}
+
+void aom_get8x8var_neon(const uint8_t *a, int a_stride, const uint8_t *b,
+                        int b_stride, unsigned int *sse, int *sum) {
+  variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, sum);  // w = 8, h = 8
+}
+
+void aom_get16x16var_neon(const uint8_t *a, int a_stride, const uint8_t *b,
+                          int b_stride, unsigned int *sse, int *sum) {
+  variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, sum);  // w = h = 16
+}
+
+unsigned int aom_variance8x8_neon(const uint8_t *a, int a_stride,
+                                  const uint8_t *b, int b_stride,
+                                  unsigned int *sse) {
+  int sum;  // sum of (a - b) over the block
+  variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, &sum);
+  return *sse - (((int64_t)sum * sum) >> 6);  // sse - sum^2 / (8 * 8)
+}
+
+unsigned int aom_variance16x16_neon(const uint8_t *a, int a_stride,
+                                    const uint8_t *b, int b_stride,
+                                    unsigned int *sse) {
+  int sum;  // sum of (a - b) over the block
+  variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, &sum);
+  return *sse - (((int64_t)sum * sum) >> 8);  // sse - sum^2 / (16 * 16)
+}
+
+unsigned int aom_variance32x32_neon(const uint8_t *a, int a_stride,
+                                    const uint8_t *b, int b_stride,
+                                    unsigned int *sse) {
+  int sum;  // sum of (a - b) over the block
+  variance_neon_w8(a, a_stride, b, b_stride, 32, 32, sse, &sum);
+  return *sse - (((int64_t)sum * sum) >> 10);  // sse - sum^2 / (32 * 32)
+}
+
+unsigned int aom_variance32x64_neon(const uint8_t *a, int a_stride,
+                                    const uint8_t *b, int b_stride,
+                                    unsigned int *sse) {
+  int sum1, sum2;       // partial sums, one per 32x32 half
+  uint32_t sse1, sse2;  // partial sse values, one per 32x32 half
+  variance_neon_w8(a, a_stride, b, b_stride, 32, 32, &sse1, &sum1);
+  variance_neon_w8(a + (32 * a_stride), a_stride, b + (32 * b_stride), b_stride,
+                   32, 32, &sse2, &sum2);  // second half starts 32 rows down
+  *sse = sse1 + sse2;
+  sum1 += sum2;
+  return *sse - (((int64_t)sum1 * sum1) >> 11);  // sse - sum^2 / (32 * 64)
+}
+
+unsigned int aom_variance64x32_neon(const uint8_t *a, int a_stride,
+                                    const uint8_t *b, int b_stride,
+                                    unsigned int *sse) {
+  int sum1, sum2;       // partial sums, one per 64x16 strip
+  uint32_t sse1, sse2;  // partial sse values, one per 64x16 strip
+  variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1);
+  variance_neon_w8(a + (16 * a_stride), a_stride, b + (16 * b_stride), b_stride,
+                   64, 16, &sse2, &sum2);  // second strip starts 16 rows down
+  *sse = sse1 + sse2;
+  sum1 += sum2;
+  return *sse - (((int64_t)sum1 * sum1) >> 11);  // sse - sum^2 / (64 * 32)
+}
+
+unsigned int aom_variance64x64_neon(const uint8_t *a, int a_stride,
+                                    const uint8_t *b, int b_stride,
+                                    unsigned int *sse) {
+  int sum1, sum2;       // sum1 accumulates; sum2 holds the latest strip
+  uint32_t sse1, sse2;  // sse1 accumulates; sse2 holds the latest strip
+
+  variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1);
+  variance_neon_w8(a + (16 * a_stride), a_stride, b + (16 * b_stride), b_stride,
+                   64, 16, &sse2, &sum2);  // strip starting at row 16
+  sse1 += sse2;
+  sum1 += sum2;
+
+  variance_neon_w8(a + (16 * 2 * a_stride), a_stride, b + (16 * 2 * b_stride),
+                   b_stride, 64, 16, &sse2, &sum2);  // strip starting at row 32
+  sse1 += sse2;
+  sum1 += sum2;
+
+  variance_neon_w8(a + (16 * 3 * a_stride), a_stride, b + (16 * 3 * b_stride),
+                   b_stride, 64, 16, &sse2, &sum2);  // strip starting at row 48
+  *sse = sse1 + sse2;
+  sum1 += sum2;
+  return *sse - (((int64_t)sum1 * sum1) >> 12);  // sse - sum^2 / (64 * 64)
+}
+
+unsigned int aom_variance16x8_neon(const unsigned char *src_ptr,
+                                   int source_stride,
+                                   const unsigned char *ref_ptr,
+                                   int recon_stride, unsigned int *sse) {
+  int i;  // 16x8 variance: stores sse in *sse, returns sse - sum^2 / 128
+  int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
+  uint32x2_t d0u32, d10u32;
+  int64x1_t d0s64, d1s64;
+  uint8x16_t q0u8, q1u8, q2u8, q3u8;
+  uint16x8_t q11u16, q12u16, q13u16, q14u16;
+  int32x4_t q8s32, q9s32, q10s32;
+  int64x2_t q0s64, q1s64, q5s64;
+
+  q8s32 = vdupq_n_s32(0);   // running sum of differences
+  q9s32 = vdupq_n_s32(0);   // running sum of squares (low halves)
+  q10s32 = vdupq_n_s32(0);  // running sum of squares (high halves)
+
+  for (i = 0; i < 4; i++) {  // two 16-pixel rows per iteration
+    q0u8 = vld1q_u8(src_ptr);
+    src_ptr += source_stride;
+    q1u8 = vld1q_u8(src_ptr);
+    src_ptr += source_stride;
+    __builtin_prefetch(src_ptr);
+
+    q2u8 = vld1q_u8(ref_ptr);
+    ref_ptr += recon_stride;
+    q3u8 = vld1q_u8(ref_ptr);
+    ref_ptr += recon_stride;
+    __builtin_prefetch(ref_ptr);
+
+    q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));  // src - ref
+    q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
+    q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
+    q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));
+
+    d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+    d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
+    q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));  // sum += diff
+    q9s32 = vmlal_s16(q9s32, d22s16, d22s16);                   // sse += diff^2
+    q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
+
+    d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+    d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+    q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
+    q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
+    q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
+
+    d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
+    d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
+    q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
+    q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
+    q10s32 = vmlal_s16(q10s32, d27s16, d27s16);
+
+    d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
+    d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
+    q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
+    q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
+    q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
+  }
+
+  q10s32 = vaddq_s32(q10s32, q9s32);  // merge the two sse accumulators
+  q0s64 = vpaddlq_s32(q8s32);
+  q1s64 = vpaddlq_s32(q10s32);
+
+  d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));  // total sum
+  d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));  // total sse
+
+  q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), vreinterpret_s32_s64(d0s64));
+  vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
+
+  d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);  // / 128
+  d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);  // sse - sum^2 / 128
+
+  return vget_lane_u32(d0u32, 0);
+}
+
+unsigned int aom_variance8x16_neon(const unsigned char *src_ptr,
+                                   int source_stride,
+                                   const unsigned char *ref_ptr,
+                                   int recon_stride, unsigned int *sse) {
+  int i;  // 8x16 variance: stores sse in *sse, returns sse - sum^2 / 128
+  uint8x8_t d0u8, d2u8, d4u8, d6u8;
+  int16x4_t d22s16, d23s16, d24s16, d25s16;
+  uint32x2_t d0u32, d10u32;
+  int64x1_t d0s64, d1s64;
+  uint16x8_t q11u16, q12u16;
+  int32x4_t q8s32, q9s32, q10s32;
+  int64x2_t q0s64, q1s64, q5s64;
+
+  q8s32 = vdupq_n_s32(0);   // running sum of differences
+  q9s32 = vdupq_n_s32(0);   // running sum of squares (low halves)
+  q10s32 = vdupq_n_s32(0);  // running sum of squares (high halves)
+
+  for (i = 0; i < 8; i++) {  // two 8-pixel rows per iteration
+    d0u8 = vld1_u8(src_ptr);
+    src_ptr += source_stride;
+    d2u8 = vld1_u8(src_ptr);
+    src_ptr += source_stride;
+    __builtin_prefetch(src_ptr);
+
+    d4u8 = vld1_u8(ref_ptr);
+    ref_ptr += recon_stride;
+    d6u8 = vld1_u8(ref_ptr);
+    ref_ptr += recon_stride;
+    __builtin_prefetch(ref_ptr);
+
+    q11u16 = vsubl_u8(d0u8, d4u8);  // src - ref, first row of the pair
+    q12u16 = vsubl_u8(d2u8, d6u8);  // src - ref, second row of the pair
+
+    d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+    d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
+    q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));  // sum += diff
+    q9s32 = vmlal_s16(q9s32, d22s16, d22s16);                   // sse += diff^2
+    q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
+
+    d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+    d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+    q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
+    q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
+    q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
+  }
+
+  q10s32 = vaddq_s32(q10s32, q9s32);  // merge the two sse accumulators
+  q0s64 = vpaddlq_s32(q8s32);
+  q1s64 = vpaddlq_s32(q10s32);
+
+  d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));  // total sum
+  d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));  // total sse
+
+  q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), vreinterpret_s32_s64(d0s64));
+  vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
+
+  d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);  // / 128
+  d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);  // sse - sum^2 / 128
+
+  return vget_lane_u32(d0u32, 0);
+}
+
+unsigned int aom_mse16x16_neon(const unsigned char *src_ptr, int source_stride,
+                               const unsigned char *ref_ptr, int recon_stride,
+                               unsigned int *sse) {
+  int i;  // 16x16 sum of squared differences: stored in *sse and returned
+  int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
+  int64x1_t d0s64;
+  uint8x16_t q0u8, q1u8, q2u8, q3u8;
+  int32x4_t q7s32, q8s32, q9s32, q10s32;
+  uint16x8_t q11u16, q12u16, q13u16, q14u16;
+  int64x2_t q1s64;
+
+  q7s32 = vdupq_n_s32(0);  // four independent accumulators of diff^2
+  q8s32 = vdupq_n_s32(0);
+  q9s32 = vdupq_n_s32(0);
+  q10s32 = vdupq_n_s32(0);
+
+  for (i = 0; i < 8; i++) {  // two 16-pixel rows per iteration
+    q0u8 = vld1q_u8(src_ptr);
+    src_ptr += source_stride;
+    q1u8 = vld1q_u8(src_ptr);
+    src_ptr += source_stride;
+    q2u8 = vld1q_u8(ref_ptr);
+    ref_ptr += recon_stride;
+    q3u8 = vld1q_u8(ref_ptr);
+    ref_ptr += recon_stride;
+
+    q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));  // src - ref
+    q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
+    q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
+    q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));
+
+    d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+    d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
+    q7s32 = vmlal_s16(q7s32, d22s16, d22s16);
+    q8s32 = vmlal_s16(q8s32, d23s16, d23s16);
+
+    d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+    d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+    q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
+    q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
+
+    d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
+    d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
+    q7s32 = vmlal_s16(q7s32, d26s16, d26s16);
+    q8s32 = vmlal_s16(q8s32, d27s16, d27s16);
+
+    d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
+    d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
+    q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
+    q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
+  }
+
+  q7s32 = vaddq_s32(q7s32, q8s32);  // fold the four accumulators into one
+  q9s32 = vaddq_s32(q9s32, q10s32);
+  q10s32 = vaddq_s32(q7s32, q9s32);
+
+  q1s64 = vpaddlq_s32(q10s32);
+  d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
+
+  vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d0s64), 0);
+  return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0);
+}
+
+unsigned int aom_get4x4sse_cs_neon(const unsigned char *src_ptr,
+                                   int source_stride,
+                                   const unsigned char *ref_ptr,
+                                   int recon_stride) {
+  int16x4_t d22s16, d24s16, d26s16, d28s16;  // one 4-lane diff per row
+  int64x1_t d0s64;
+  uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
+  int32x4_t q7s32, q8s32, q9s32, q10s32;
+  uint16x8_t q11u16, q12u16, q13u16, q14u16;
+  int64x2_t q1s64;
+
+  d0u8 = vld1_u8(src_ptr);  // loads 8 bytes; only the low 4 lanes are used
+  src_ptr += source_stride;
+  d4u8 = vld1_u8(ref_ptr);
+  ref_ptr += recon_stride;
+  d1u8 = vld1_u8(src_ptr);
+  src_ptr += source_stride;
+  d5u8 = vld1_u8(ref_ptr);
+  ref_ptr += recon_stride;
+  d2u8 = vld1_u8(src_ptr);
+  src_ptr += source_stride;
+  d6u8 = vld1_u8(ref_ptr);
+  ref_ptr += recon_stride;
+  d3u8 = vld1_u8(src_ptr);
+  src_ptr += source_stride;  // pointers are not read again after this point
+  d7u8 = vld1_u8(ref_ptr);
+  ref_ptr += recon_stride;
+
+  q11u16 = vsubl_u8(d0u8, d4u8);  // src - ref for rows 0..3
+  q12u16 = vsubl_u8(d1u8, d5u8);
+  q13u16 = vsubl_u8(d2u8, d6u8);
+  q14u16 = vsubl_u8(d3u8, d7u8);
+
+  d22s16 = vget_low_s16(vreinterpretq_s16_u16(q11u16));  // keep lanes 0..3
+  d24s16 = vget_low_s16(vreinterpretq_s16_u16(q12u16));
+  d26s16 = vget_low_s16(vreinterpretq_s16_u16(q13u16));
+  d28s16 = vget_low_s16(vreinterpretq_s16_u16(q14u16));
+
+  q7s32 = vmull_s16(d22s16, d22s16);  // diff^2 per row
+  q8s32 = vmull_s16(d24s16, d24s16);
+  q9s32 = vmull_s16(d26s16, d26s16);
+  q10s32 = vmull_s16(d28s16, d28s16);
+
+  q7s32 = vaddq_s32(q7s32, q8s32);  // add the four rows together
+  q9s32 = vaddq_s32(q9s32, q10s32);
+  q9s32 = vaddq_s32(q7s32, q9s32);
+
+  q1s64 = vpaddlq_s32(q9s32);
+  d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
+
+  return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0);
+}
--- a/aom_dsp/avg.c
+++ b/aom_dsp/avg.c
@@ -1,40 +1,41 @@
 /*
- *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
 #include <stdlib.h>

-#include "./vpx_dsp_rtcd.h"
-#include "vpx_ports/mem.h"
+#include "./aom_dsp_rtcd.h"
+#include "aom_ports/mem.h"

-unsigned int vpx_avg_8x8_c(const uint8_t *s, int p) {
+unsigned int aom_avg_8x8_c(const uint8_t *src, int stride) {
  int i, j;
  int sum = 0;
-  for (i = 0; i < 8; ++i, s += p)
-    for (j = 0; j < 8; sum += s[j], ++j) {
+  for (i = 0; i < 8; ++i, src += stride)
+    for (j = 0; j < 8; sum += src[j], ++j) {
    }

-  return (sum + 32) >> 6;
+  return ROUND_POWER_OF_TWO(sum, 6);
 }

-unsigned int vpx_avg_4x4_c(const uint8_t *s, int p) {
+unsigned int aom_avg_4x4_c(const uint8_t *src, int stride) {
  int i, j;
  int sum = 0;
-  for (i = 0; i < 4; ++i, s += p)
-    for (j = 0; j < 4; sum += s[j], ++j) {
+  for (i = 0; i < 4; ++i, src += stride)
+    for (j = 0; j < 4; sum += src[j], ++j) {
    }

-  return (sum + 8) >> 4;
+  return ROUND_POWER_OF_TWO(sum, 4);
 }

 // src_diff: first pass, 9 bit, dynamic range [-255, 255]
 //           second pass, 12 bit, dynamic range [-2040, 2040]
-static void hadamard_col8(const int16_t *src_diff, ptrdiff_t src_stride,
+static void hadamard_col8(const int16_t *src_diff, int src_stride,
                          int16_t *coeff) {
  int16_t b0 = src_diff[0 * src_stride] + src_diff[1 * src_stride];
  int16_t b1 = src_diff[0 * src_stride] - src_diff[1 * src_stride];
@@ -66,11 +67,10 @@ static void hadamard_col8(const int16_t *src_diff, ptrdiff_t src_stride,

 // The order of the output coeff of the hadamard is not important. For
 // optimization purposes the final transpose may be skipped.
-void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride,
-                        tran_low_t *coeff) {
+void aom_hadamard_8x8_c(const int16_t *src_diff, int src_stride,
+                        int16_t *coeff) {
  int idx;
  int16_t buffer[64];
-  int16_t buffer2[64];
  int16_t *tmp_buf = &buffer[0];
  for (idx = 0; idx < 8; ++idx) {
    hadamard_col8(src_diff, src_stride, tmp_buf);  // src_diff: 9 bit
@@ -81,38 +81,36 @@ void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride,

  tmp_buf = &buffer[0];
  for (idx = 0; idx < 8; ++idx) {
-    hadamard_col8(tmp_buf, 8, buffer2 + 8 * idx);  // tmp_buf: 12 bit
+    hadamard_col8(tmp_buf, 8, coeff);  // tmp_buf: 12 bit
                                       // dynamic range [-2040, 2040]
-    // buffer2: 15 bit
+    coeff += 8;                        // coeff: 15 bit
                                       // dynamic range [-16320, 16320]
    ++tmp_buf;
  }
-
-  for (idx = 0; idx < 64; ++idx) coeff[idx] = (tran_low_t)buffer2[idx];
 }

 // In place 16x16 2D Hadamard transform
-void vpx_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride,
-                          tran_low_t *coeff) {
+void aom_hadamard_16x16_c(const int16_t *src_diff, int src_stride,
+                          int16_t *coeff) {
  int idx;
  for (idx = 0; idx < 4; ++idx) {
    // src_diff: 9 bit, dynamic range [-255, 255]
    const int16_t *src_ptr =
        src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8;
-    vpx_hadamard_8x8_c(src_ptr, src_stride, coeff + idx * 64);
+    aom_hadamard_8x8_c(src_ptr, src_stride, coeff + idx * 64);
  }

  // coeff: 15 bit, dynamic range [-16320, 16320]
  for (idx = 0; idx < 64; ++idx) {
-    tran_low_t a0 = coeff[0];
-    tran_low_t a1 = coeff[64];
-    tran_low_t a2 = coeff[128];
-    tran_low_t a3 = coeff[192];
+    int16_t a0 = coeff[0];
+    int16_t a1 = coeff[64];
+    int16_t a2 = coeff[128];
+    int16_t a3 = coeff[192];

-    tran_low_t b0 = (a0 + a1) >> 1;  // (a0 + a1): 16 bit, [-32640, 32640]
-    tran_low_t b1 = (a0 - a1) >> 1;  // b0-b3: 15 bit, dynamic range
-    tran_low_t b2 = (a2 + a3) >> 1;  // [-16320, 16320]
-    tran_low_t b3 = (a2 - a3) >> 1;
+    int16_t b0 = (a0 + a1) >> 1;  // (a0 + a1): 16 bit, [-32640, 32640]
+    int16_t b1 = (a0 - a1) >> 1;  // b0-b3: 15 bit, dynamic range
+    int16_t b2 = (a2 + a3) >> 1;  // [-16320, 16320]
+    int16_t b3 = (a2 - a3) >> 1;

    coeff[0] = b0 + b2;  // 16 bit, [-32640, 32640]
    coeff[64] = b1 + b3;
@@ -125,7 +123,7 @@ void vpx_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride,

 // coeff: 16 bits, dynamic range [-32640, 32640].
 // length: value range {16, 64, 256, 1024}.
-int vpx_satd_c(const tran_low_t *coeff, int length) {
+int aom_satd_c(const int16_t *coeff, int length) {
  int i;
  int satd = 0;
  for (i = 0; i < length; ++i) satd += abs(coeff[i]);
@@ -136,7 +134,7 @@ int vpx_satd_c(const tran_low_t *coeff, int length) {

 // Integer projection onto row vectors.
 // height: value range {16, 32, 64}.
-void vpx_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref,
+void aom_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref,
                       const int ref_stride, const int height) {
  int idx;
  const int norm_factor = height >> 1;
@@ -152,7 +150,7 @@ void vpx_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref,
 }

 // width: value range {16, 32, 64}.
-int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width) {
+int16_t aom_int_pro_col_c(const uint8_t *ref, const int width) {
  int idx;
  int16_t sum = 0;
  // sum: 14 bit, dynamic range [0, 16320]
@@ -163,7 +161,7 @@ int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width) {
 // ref: [0 - 510]
 // src: [0 - 510]
 // bwl: {2, 3, 4}
-int vpx_vector_var_c(const int16_t *ref, const int16_t *src, const int bwl) {
+int aom_vector_var_c(const int16_t *ref, const int16_t *src, const int bwl) {
  int i;
  int width = 4 << bwl;
  int sse = 0, mean = 0, var;
@@ -179,44 +177,44 @@ int vpx_vector_var_c(const int16_t *ref, const int16_t *src, const int bwl) {
  return var;
 }

-void vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp,
-                      int *min, int *max) {
+void aom_minmax_8x8_c(const uint8_t *src, int src_stride, const uint8_t *ref,
+                      int ref_stride, int *min, int *max) {
  int i, j;
  *min = 255;
  *max = 0;
-  for (i = 0; i < 8; ++i, s += p, d += dp) {
+  for (i = 0; i < 8; ++i, src += src_stride, ref += ref_stride) {
    for (j = 0; j < 8; ++j) {
-      int diff = abs(s[j] - d[j]);
+      int diff = abs(src[j] - ref[j]);
      *min = diff < *min ? diff : *min;
      *max = diff > *max ? diff : *max;
    }
  }
 }

-#if CONFIG_VP9_HIGHBITDEPTH
-unsigned int vpx_highbd_avg_8x8_c(const uint8_t *s8, int p) {
+#if CONFIG_AOM_HIGHBITDEPTH
+unsigned int aom_highbd_avg_8x8_c(const uint8_t *src, int stride) {
  int i, j;
  int sum = 0;
-  const uint16_t *s = CONVERT_TO_SHORTPTR(s8);
-  for (i = 0; i < 8; ++i, s += p)
+  const uint16_t *s = CONVERT_TO_SHORTPTR(src);
+  for (i = 0; i < 8; ++i, s += stride)
    for (j = 0; j < 8; sum += s[j], ++j) {
    }

-  return (sum + 32) >> 6;
+  return ROUND_POWER_OF_TWO(sum, 6);
 }

-unsigned int vpx_highbd_avg_4x4_c(const uint8_t *s8, int p) {
+unsigned int aom_highbd_avg_4x4_c(const uint8_t *src, int stride) {
  int i, j;
  int sum = 0;
-  const uint16_t *s = CONVERT_TO_SHORTPTR(s8);
-  for (i = 0; i < 4; ++i, s += p)
+  const uint16_t *s = CONVERT_TO_SHORTPTR(src);
+  for (i = 0; i < 4; ++i, s += stride)
    for (j = 0; j < 4; sum += s[j], ++j) {
    }

-  return (sum + 8) >> 4;
+  return ROUND_POWER_OF_TWO(sum, 4);
 }

-void vpx_highbd_minmax_8x8_c(const uint8_t *s8, int p, const uint8_t *d8,
+void aom_highbd_minmax_8x8_c(const uint8_t *s8, int p, const uint8_t *d8,
                             int dp, int *min, int *max) {
  int i, j;
  const uint16_t *s = CONVERT_TO_SHORTPTR(s8);
@@ -231,4 +229,4 @@ void vpx_highbd_minmax_8x8_c(const uint8_t *s8, int p, const uint8_t *d8,
    }
  }
 }
-#endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif  // CONFIG_AOM_HIGHBITDEPTH
--- a/aom_dsp/bitreader.h
+++ b/aom_dsp/bitreader.h
@@ -0,0 +1,240 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_BITREADER_H_
+#define AOM_DSP_BITREADER_H_
+
+#include <assert.h>
+#include <limits.h>
+
+#include "./aom_config.h"
+#if CONFIG_EC_ADAPT && !CONFIG_EC_MULTISYMBOL
+#error "CONFIG_EC_ADAPT is enabled without enabling CONFIG_EC_MULTISYMBOL."
+#endif
+
+#include "aom/aomdx.h"
+#include "aom/aom_integer.h"
+#if CONFIG_ANS
+#include "aom_dsp/ansreader.h"
+#elif CONFIG_DAALA_EC
+#include "aom_dsp/daalaboolreader.h"
+#else
+#include "aom_dsp/dkboolreader.h"
+#endif
+#include "aom_dsp/prob.h"
+#include "av1/common/odintrin.h"
+
+#if CONFIG_ACCOUNTING
+#include "av1/common/accounting.h"
+#define ACCT_STR_NAME acct_str
+#define ACCT_STR_PARAM , const char *ACCT_STR_NAME
+#define ACCT_STR_ARG(s) , s
+#else
+#define ACCT_STR_PARAM
+#define ACCT_STR_ARG(s)
+#endif
+
+#define aom_read(r, prob, ACCT_STR_NAME) \
+  aom_read_(r, prob ACCT_STR_ARG(ACCT_STR_NAME))  // label kept only if enabled
+#define aom_read_bit(r, ACCT_STR_NAME) \
+  aom_read_bit_(r ACCT_STR_ARG(ACCT_STR_NAME))
+#define aom_read_tree(r, tree, probs, ACCT_STR_NAME) \
+  aom_read_tree_(r, tree, probs ACCT_STR_ARG(ACCT_STR_NAME))
+#define aom_read_literal(r, bits, ACCT_STR_NAME) \
+  aom_read_literal_(r, bits ACCT_STR_ARG(ACCT_STR_NAME))
+#define aom_read_tree_bits(r, tree, probs, ACCT_STR_NAME) \
+  aom_read_tree_bits_(r, tree, probs ACCT_STR_ARG(ACCT_STR_NAME))
+#define aom_read_symbol(r, cdf, nsymbs, ACCT_STR_NAME) \
+  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if CONFIG_ANS
+typedef struct AnsDecoder aom_reader;
+#elif CONFIG_DAALA_EC
+typedef struct daala_reader aom_reader;
+#else
+typedef struct aom_dk_reader aom_reader;
+#endif
+
+static INLINE int aom_reader_init(aom_reader *r, const uint8_t *buffer,
+                                  size_t size, aom_decrypt_cb decrypt_cb,
+                                  void *decrypt_state) {
+#if CONFIG_ANS
+  (void)decrypt_cb;  // decrypt arguments are not forwarded to the ANS reader
+  (void)decrypt_state;
+  assert(size <= INT_MAX);
+  return ans_read_init(r, buffer, size);
+#elif CONFIG_DAALA_EC
+  (void)decrypt_cb;  // decrypt arguments are not forwarded to the Daala reader
+  (void)decrypt_state;
+  return aom_daala_reader_init(r, buffer, size);
+#else
+  return aom_dk_reader_init(r, buffer, size, decrypt_cb, decrypt_state);
+#endif
+}
+
+static INLINE const uint8_t *aom_reader_find_end(aom_reader *r) {
+#if CONFIG_ANS
+  (void)r;
+  assert(0 && "Use the raw buffer size with ANS");  // unsupported for ANS
+  return NULL;  // reached only when assertions are compiled out
+#elif CONFIG_DAALA_EC
+  return aom_daala_reader_find_end(r);
+#else
+  return aom_dk_reader_find_end(r);
+#endif
+}
+
+static INLINE int aom_reader_has_error(aom_reader *r) {
+#if CONFIG_ANS
+  return ans_reader_has_error(r);  // forwards to the configured backend
+#elif CONFIG_DAALA_EC
+  return aom_daala_reader_has_error(r);
+#else
+  return aom_dk_reader_has_error(r);
+#endif
+}
+
+// Returns the position in the bit reader in bits (not implemented for ANS).
+static INLINE uint32_t aom_reader_tell(const aom_reader *r) {
+#if CONFIG_ANS
+  (void)r;
+  assert(0 && "aom_reader_tell() is unimplemented for ANS");
+  return 0;  // reached only when assertions are compiled out
+#elif CONFIG_DAALA_EC
+  return aom_daala_reader_tell(r);
+#else
+  return aom_dk_reader_tell(r);
+#endif
+}
+
+// Returns the position in the bit reader in 1/8th bits (not for ANS).
+static INLINE uint32_t aom_reader_tell_frac(const aom_reader *r) {
+#if CONFIG_ANS
+  (void)r;
+  assert(0 && "aom_reader_tell_frac() is unimplemented for ANS");
+  return 0;  // reached only when assertions are compiled out
+#elif CONFIG_DAALA_EC
+  return aom_daala_reader_tell_frac(r);
+#else
+  return aom_dk_reader_tell_frac(r);
+#endif
+}
+
+#if CONFIG_ACCOUNTING
+static INLINE void aom_process_accounting(const aom_reader *r ACCT_STR_PARAM) {
+  if (r->accounting != NULL) {  // no-op when the reader has no accounting
+    uint32_t tell_frac;
+    tell_frac = aom_reader_tell_frac(r);  // current position in 1/8th bits
+    aom_accounting_record(r->accounting, ACCT_STR_NAME,
+                          tell_frac - r->accounting->last_tell_frac);
+    r->accounting->last_tell_frac = tell_frac;  // next delta starts from here
+  }
+}
+#endif
+
+static INLINE int aom_read_(aom_reader *r, int prob ACCT_STR_PARAM) {
+  int ret;  // value returned by the configured backend for 'prob'
+#if CONFIG_ANS
+  ret = uabs_read(r, prob);
+#elif CONFIG_DAALA_EC
+  ret = aom_daala_read(r, prob);
+#else
+  ret = aom_dk_read(r, prob);
+#endif
+#if CONFIG_ACCOUNTING
+  if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);  // skip if NULL
+#endif
+  return ret;
+}
+
+static INLINE int aom_read_bit_(aom_reader *r ACCT_STR_PARAM) {
+  int ret;
+#if CONFIG_ANS
+  ret = uabs_read_bit(r);  // Non trivial optimization at half probability
+#else
+  ret = aom_read(r, 128, NULL);  // aom_prob_half; NULL label: not recorded
+#endif
+#if CONFIG_ACCOUNTING
+  if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);  // skip if NULL
+#endif
+  return ret;
+}
+
+static INLINE int aom_read_literal_(aom_reader *r, int bits ACCT_STR_PARAM) {
+  int literal = 0, bit;  // bits are read most significant first
+
+  for (bit = bits - 1; bit >= 0; bit--) literal |= aom_read_bit(r, NULL) << bit;
+#if CONFIG_ACCOUNTING
+  if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);  // skip if NULL
+#endif
+  return literal;
+}
+
+static INLINE int aom_read_tree_bits_(aom_reader *r, const aom_tree_index *tree,
+                                      const aom_prob *probs ACCT_STR_PARAM) {
+  aom_tree_index i = 0;  // current tree index; loop ends at an entry <= 0
+
+  while ((i = tree[i + aom_read(r, probs[i >> 1], NULL)]) > 0) continue;
+#if CONFIG_ACCOUNTING
+  if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);  // skip if NULL
+#endif
+  return -i;  // the final entry is negated to form the result
+}
+
+static INLINE int aom_read_tree_(aom_reader *r, const aom_tree_index *tree,
+                                 const aom_prob *probs ACCT_STR_PARAM) {
+  int ret;
+#if CONFIG_DAALA_EC
+  ret = daala_read_tree_bits(r, tree, probs);  // Daala-specific tree reader
+#else
+  ret = aom_read_tree_bits(r, tree, probs, NULL);  // NULL label: not recorded
+#endif
+#if CONFIG_ACCOUNTING
+  if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);  // skip if NULL
+#endif
+  return ret;
+}
+
+#if CONFIG_EC_MULTISYMBOL
+static INLINE int aom_read_symbol_(aom_reader *r, aom_cdf_prob *cdf,
+                                   int nsymbs ACCT_STR_PARAM) {
+  int ret;  // value returned by the backend's symbol reader
+#if CONFIG_RANS
+  (void)nsymbs;  // not passed to rans_read()
+  ret = rans_read(r, cdf);
+#elif CONFIG_DAALA_EC
+  ret = daala_read_symbol(r, cdf, nsymbs);
+#else
+#error \
+    "CONFIG_EC_MULTISYMBOL is selected without a valid backing entropy " \
+  "coder. Enable daala_ec or ans for a valid configuration."
+#endif
+
+#if CONFIG_EC_ADAPT
+  update_cdf(cdf, ret, nsymbs);  // called with the value just read
+#endif
+
+#if CONFIG_ACCOUNTING
+  if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);  // skip if NULL
+#endif
+  return ret;
+}
+#endif  // CONFIG_EC_MULTISYMBOL
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_DSP_BITREADER_H_
--- a/aom_dsp/bitreader_buffer.c
+++ b/aom_dsp/bitreader_buffer.c
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include "./aom_config.h"
+#include "./bitreader_buffer.h"
+
+size_t aom_rb_bytes_read(struct aom_read_bit_buffer *rb) {
+  return (rb->bit_offset + 7) >> 3;  // bit offset rounded up to whole bytes
+}
+
+int aom_rb_read_bit(struct aom_read_bit_buffer *rb) {  // next bit, MSB first
+  const size_t off = rb->bit_offset;
+  const size_t p = off >> 3;           // byte index of the current bit
+  const int q = 7 - (int)(off & 0x7);  // shift that selects it, MSB first
+  if (rb->bit_buffer + p < rb->bit_buffer_end) {
+    const int bit = (rb->bit_buffer[p] >> q) & 1;
+    rb->bit_offset = off + 1;
+    return bit;
+  } else {  // out of data: notify the handler, if one is set, and return 0
+    if (rb->error_handler) rb->error_handler(rb->error_handler_data);
+    return 0;
+  }
+}
+
+int aom_rb_read_literal(struct aom_read_bit_buffer *rb, int bits) {
+  int value = 0, bit;  // bits are read most significant first
+  for (bit = bits - 1; bit >= 0; bit--) value |= aom_rb_read_bit(rb) << bit;
+  return value;
+}
+
+int aom_rb_read_signed_literal(struct aom_read_bit_buffer *rb, int bits) {
+  const int value = aom_rb_read_literal(rb, bits);  // magnitude comes first
+  return aom_rb_read_bit(rb) ? -value : value;      // then a bit: 1 = negate
+}
+
+int aom_rb_read_inv_signed_literal(struct aom_read_bit_buffer *rb, int bits) {
+  const int nbits = sizeof(unsigned) * 8 - bits - 1;  // unused high bits
+  const unsigned value = (unsigned)aom_rb_read_literal(rb, bits + 1) << nbits;
+  return ((int)value) >> nbits;  // sign-extends where >> on int is arithmetic
+}
--- a/aom_dsp/bitreader_buffer.h
+++ b/aom_dsp/bitreader_buffer.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_BITREADER_BUFFER_H_
+#define AOM_DSP_BITREADER_BUFFER_H_
+
+#include <limits.h>
+
+#include "aom/aom_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Callback type for read errors. aom_rb_read_bit() calls it with the
+// reader's error_handler_data when a read would go past the end of the data.
+typedef void (*aom_rb_error_handler)(void *data);
+
+// State for reading individual bits out of a byte buffer.
+struct aom_read_bit_buffer {
+  const uint8_t *bit_buffer;      // first byte of the data
+  const uint8_t *bit_buffer_end;  // reads stop at this address (exclusive)
+  size_t bit_offset;              // position, in bits, of the next bit to read
+
+  void *error_handler_data;            // argument handed to error_handler
+  aom_rb_error_handler error_handler;  // called on a read past bit_buffer_end
+};
+
+// Returns rb->bit_offset rounded up to a whole number of bytes.
+size_t aom_rb_bytes_read(struct aom_read_bit_buffer *rb);
+
+// Reads one bit (most significant bit of each byte first) and advances the
+// offset. Past the end of the buffer it calls rb->error_handler and
+// returns 0.
+int aom_rb_read_bit(struct aom_read_bit_buffer *rb);
+
+// Reads |bits| bits, first bit read being the most significant.
+int aom_rb_read_literal(struct aom_read_bit_buffer *rb, int bits);
+
+// Reads a |bits|-bit magnitude followed by a sign bit (1 means negative).
+int aom_rb_read_signed_literal(struct aom_read_bit_buffer *rb, int bits);
+
+// Reads |bits| + 1 bits and sign-extends the result from its leading bit.
+int aom_rb_read_inv_signed_literal(struct aom_read_bit_buffer *rb, int bits);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_DSP_BITREADER_BUFFER_H_
--- a/aom_dsp/bitwriter.h
+++ b/aom_dsp/bitwriter.h
@@ -0,0 +1,179 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_BITWRITER_H_
+#define AOM_DSP_BITWRITER_H_
+
+#include <assert.h>
+#include "./aom_config.h"
+#if CONFIG_EC_ADAPT && !CONFIG_EC_MULTISYMBOL
+#error "CONFIG_EC_ADAPT is enabled without enabling CONFIG_EC_MULTISYMBOL"
+#endif
+
+#if CONFIG_ANS
+#include "aom_dsp/buf_ans.h"
+#elif CONFIG_DAALA_EC
+#include "aom_dsp/daalaboolwriter.h"
+#else
+#include "aom_dsp/dkboolwriter.h"
+#endif
+#include "aom_dsp/prob.h"
+
+#if CONFIG_RD_DEBUG
+#include "av1/encoder/cost.h"
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// aom_writer aliases the writer struct of the backend chosen at build time.
+// CONFIG_ANS takes priority over CONFIG_DAALA_EC; aom_dk_writer is used when
+// neither is enabled.
+#if CONFIG_ANS
+typedef struct BufAnsCoder aom_writer;
+#elif CONFIG_DAALA_EC
+typedef struct daala_writer aom_writer;
+#else
+typedef struct aom_dk_writer aom_writer;
+#endif
+
+// Running cost total; aom_write_record() adds to |cost| when CONFIG_RD_DEBUG
+// is enabled.
+typedef struct TOKEN_STATS { int64_t cost; } TOKEN_STATS;
+
+// Starts an encode by forwarding |bc| and |buffer| to the selected backend
+// (aom_daala_start_encode or aom_dk_start_encode). With CONFIG_ANS this
+// wrapper is not usable: both arguments are ignored and the assert fires.
+static INLINE void aom_start_encode(aom_writer *bc, uint8_t *buffer) {
+#if CONFIG_ANS
+  (void)bc;
+  (void)buffer;
+  assert(0 && "buf_ans requires a more complicated startup procedure");
+#elif CONFIG_DAALA_EC
+  aom_daala_start_encode(bc, buffer);
+#else
+  aom_dk_start_encode(bc, buffer);
+#endif
+}
+
+// Finishes an encode by forwarding |bc| to the selected backend
+// (aom_daala_stop_encode or aom_dk_stop_encode). With CONFIG_ANS this
+// wrapper is not usable: |bc| is ignored and the assert fires.
+static INLINE void aom_stop_encode(aom_writer *bc) {
+#if CONFIG_ANS
+  (void)bc;
+  assert(0 && "buf_ans requires a more complicated shutdown procedure");
+#elif CONFIG_DAALA_EC
+  aom_daala_stop_encode(bc);
+#else
+  aom_dk_stop_encode(bc);
+#endif
+}
+
+// Writes |bit| with |probability| through the selected backend:
+// buf_uabs_write (CONFIG_ANS), aom_daala_write (CONFIG_DAALA_EC) or
+// aom_dk_write otherwise. Arguments are passed through unchanged.
+static INLINE void aom_write(aom_writer *br, int bit, int probability) {
+#if CONFIG_ANS
+  buf_uabs_write(br, bit, probability);
+#elif CONFIG_DAALA_EC
+  aom_daala_write(br, bit, probability);
+#else
+  aom_dk_write(br, bit, probability);
+#endif
+}
+
+// Same as aom_write(). In addition, when CONFIG_RD_DEBUG is enabled, adds
+// av1_cost_bit(probability, bit) to token_stats->cost; otherwise
+// |token_stats| is unused.
+static INLINE void aom_write_record(aom_writer *br, int bit, int probability,
+                                    TOKEN_STATS *token_stats) {
+  aom_write(br, bit, probability);
+#if CONFIG_RD_DEBUG
+  token_stats->cost += av1_cost_bit(probability, bit);
+#else
+  (void)token_stats;
+#endif
+}
+
+// Writes |bit| using the fixed probability 128.
+static INLINE void aom_write_bit(aom_writer *w, int bit) {
+  const int prob_half = 128;  // aom_prob_half
+  aom_write(w, bit, prob_half);
+}
+
+// Writes |bit| using the fixed probability 128 via aom_write_record(), which
+// receives |token_stats| unchanged.
+static INLINE void aom_write_bit_record(aom_writer *w, int bit,
+                                        TOKEN_STATS *token_stats) {
+  const int prob_half = 128;  // aom_prob_half
+  aom_write_record(w, bit, prob_half, token_stats);
+}
+
+// Writes the low |bits| bits of |data|, most significant first, with one
+// aom_write_bit() call per bit. Nothing is written when |bits| <= 0.
+static INLINE void aom_write_literal(aom_writer *w, int data, int bits) {
+  int shift = bits;
+  while (--shift >= 0) {
+    aom_write_bit(w, (data >> shift) & 1);
+  }
+}
+
+// Walks |tr| from index |i|, consuming the low |len| bits of |bits| from most
+// to least significant. Each bit is written with probability probs[i >> 1]
+// and then selects the next index, tr[i + bit]. The loop body runs once
+// before |len| is tested, so callers must pass |len| >= 1.
+static INLINE void aom_write_tree_bits(aom_writer *w, const aom_tree_index *tr,
+                                       const aom_prob *probs, int bits, int len,
+                                       aom_tree_index i) {
+  do {
+    const int branch = (bits >> (len - 1)) & 1;
+    aom_write(w, branch, probs[i >> 1]);
+    i = tr[i + branch];
+    --len;
+  } while (len != 0);
+}
+
+// Same tree walk as aom_write_tree_bits(), but every bit goes through
+// aom_write_record() so |token_stats| is passed along with each write. The
+// loop body runs once before |len| is tested, so callers must pass
+// |len| >= 1.
+static INLINE void aom_write_tree_bits_record(aom_writer *w,
+                                              const aom_tree_index *tr,
+                                              const aom_prob *probs, int bits,
+                                              int len, aom_tree_index i,
+                                              TOKEN_STATS *token_stats) {
+  do {
+    const int branch = (bits >> (len - 1)) & 1;
+    aom_write_record(w, branch, probs[i >> 1], token_stats);
+    i = tr[i + branch];
+    --len;
+  } while (len != 0);
+}
+
+// Writes a tree-coded value: daala_write_tree_bits() under CONFIG_DAALA_EC,
+// aom_write_tree_bits() otherwise. All arguments are passed through
+// unchanged.
+static INLINE void aom_write_tree(aom_writer *w, const aom_tree_index *tree,
+                                  const aom_prob *probs, int bits, int len,
+                                  aom_tree_index i) {
+#if CONFIG_DAALA_EC
+  daala_write_tree_bits(w, tree, probs, bits, len, i);
+#else
+  aom_write_tree_bits(w, tree, probs, bits, len, i);
+#endif
+}
+
+// Writes a tree-coded value, passing |token_stats| to
+// aom_write_tree_bits_record() on the non-Daala path.
+// NOTE(review): under CONFIG_DAALA_EC |token_stats| is ignored, so nothing
+// is recorded for tree writes in that configuration.
+static INLINE void aom_write_tree_record(aom_writer *w,
+                                         const aom_tree_index *tree,
+                                         const aom_prob *probs, int bits,
+                                         int len, aom_tree_index i,
+                                         TOKEN_STATS *token_stats) {
+#if CONFIG_DAALA_EC
+  (void)token_stats;
+  daala_write_tree_bits(w, tree, probs, bits, len, i);
+#else
+  aom_write_tree_bits_record(w, tree, probs, bits, len, i, token_stats);
+#endif
+}
+
+#if CONFIG_EC_MULTISYMBOL
+// Writes symbol |symb| using |cdf|.
+//  - CONFIG_RANS: builds a rans_sym from |cdf| and hands it to
+//    buf_rans_write(); |nsymbs| is not used on this path.
+//  - CONFIG_DAALA_EC: forwards to daala_write_symbol().
+//  - Neither: compilation fails with the #error below.
+// When CONFIG_EC_ADAPT is enabled, update_cdf(cdf, symb, nsymbs) is called
+// after the symbol has been written.
+static INLINE void aom_write_symbol(aom_writer *w, int symb, aom_cdf_prob *cdf,
+                                    int nsymbs) {
+#if CONFIG_RANS
+  struct rans_sym s;
+  (void)nsymbs;
+  assert(cdf);
+  // cum_prob is the cdf entry just below |symb| (0 for the first symbol);
+  // prob is the difference between this symbol's entry and that value.
+  s.cum_prob = symb > 0 ? cdf[symb - 1] : 0;
+  s.prob = cdf[symb] - s.cum_prob;
+  buf_rans_write(w, &s);
+#elif CONFIG_DAALA_EC
+  daala_write_symbol(w, symb, cdf, nsymbs);
+#else
+#error \
+    "CONFIG_EC_MULTISYMBOL is selected without a valid backing entropy " \
+  "coder. Enable daala_ec or ans for a valid configuration."
+#endif
+
+#if CONFIG_EC_ADAPT
+  update_cdf(cdf, symb, nsymbs);
+#endif
+}
+#endif  // CONFIG_EC_MULTISYMBOL
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_DSP_BITWRITER_H_
--- a/aom_dsp/bitwriter_buffer.c
+++ b/aom_dsp/bitwriter_buffer.c
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <limits.h>
+#include <stdlib.h>
+
+#include "./aom_config.h"
+#include "./bitwriter_buffer.h"
+
+// Returns the number of bytes touched so far: the full bytes covered by
+// wb->bit_offset plus one more if a partial byte has been started.
+size_t aom_wb_bytes_written(const struct aom_write_bit_buffer *wb) {
+  const size_t full_bytes = wb->bit_offset / CHAR_BIT;
+  const int has_partial = wb->bit_offset % CHAR_BIT > 0;
+  return full_bytes + has_partial;
+}
+
+// Stores |bit| at the current bit offset, filling each byte from its most
+// significant bit downwards, then advances the offset by one. The first
+// write into a byte overwrites the whole byte, so its lower bits start out
+// cleared; later writes only replace the one targeted bit.
+void aom_wb_write_bit(struct aom_write_bit_buffer *wb, int bit) {
+  const int off = (int)wb->bit_offset;
+  const int byte_idx = off / CHAR_BIT;
+  const int shift = CHAR_BIT - 1 - off % CHAR_BIT;
+  if (shift != CHAR_BIT - 1) {
+    // Mid-byte: clear the target bit, then set it from |bit|.
+    wb->bit_buffer[byte_idx] &= ~(1 << shift);
+    wb->bit_buffer[byte_idx] |= bit << shift;
+  } else {
+    // First bit of a byte: initialise the byte with just this bit.
+    wb->bit_buffer[byte_idx] = bit << shift;
+  }
+  wb->bit_offset = off + 1;
+}
+
+// Writes the low |bits| bits of |data|, most significant first, with one
+// aom_wb_write_bit() call per bit. Nothing is written when |bits| <= 0.
+void aom_wb_write_literal(struct aom_write_bit_buffer *wb, int data, int bits) {
+  int shift = bits;
+  while (--shift >= 0) {
+    aom_wb_write_bit(wb, (data >> shift) & 1);
+  }
+}
+
+// Writes the low |bits| + 1 bits of |data| with aom_wb_write_literal(); the
+// extra leading bit carries the sign of |data|.
+void aom_wb_write_inv_signed_literal(struct aom_write_bit_buffer *wb, int data,
+                                     int bits) {
+  const int total_bits = bits + 1;
+  aom_wb_write_literal(wb, data, total_bits);
+}
--- a/Show More
+++ b/Show More