Merge "Add min_tx_size variable to recursive transform block partition system" into nextgenv2

Merge "Fix the bug that PVQ commit broke dering" into nextgenv2
Merge changes Ib9428dc9,Ide04717a,If1dba7d8,I6da97880 into nextgenv2
2016-11-08 19:14:33 +00:00 · 2016-11-08 18:00:53 +00:00 · 2016-11-08 17:42:04 +00:00 · 2016-11-08 09:36:54 -08:00 · 2016-11-08 08:15:57 -08:00 · 2016-11-07 21:11:31 -08:00
1076 changed files with 176186 additions and 146950 deletions
--- a/.clang-format
+++ b/.clang-format
@@ -1,10 +1,11 @@
 ---
 Language:        Cpp
 # BasedOnStyle:  Google
-# Generated with clang-format 3.7.1
+# Generated with clang-format 3.8.1
 AccessModifierOffset: -1
-AlignAfterOpenBracket: true
+AlignAfterOpenBracket: Align
 AlignConsecutiveAssignments: false
 AlignConsecutiveDeclarations: false
 AlignEscapedNewlinesLeft: true
 AlignOperands:   true
 AlignTrailingComments: true
@@ -15,10 +16,23 @@ AllowShortFunctionsOnASingleLine: All
 AllowShortIfStatementsOnASingleLine: true
 AllowShortLoopsOnASingleLine: true
 AlwaysBreakAfterDefinitionReturnType: None
 AlwaysBreakAfterReturnType: None
 AlwaysBreakBeforeMultilineStrings: true
 AlwaysBreakTemplateDeclarations: true
 BinPackArguments: true
 BinPackParameters: true
 BraceWrapping:
  AfterClass:      false
  AfterControlStatement: false
  AfterEnum:       false
  AfterFunction:   false
  AfterNamespace:  false
  AfterObjCDeclaration: false
  AfterStruct:     false
  AfterUnion:      false
  BeforeCatch:     false
  BeforeElse:      false
  IndentBraces:    false
 BreakBeforeBinaryOperators: None
 BreakBeforeBraces: Attach
 BreakBeforeTernaryOperators: true
@@ -33,6 +47,13 @@ DerivePointerAlignment: false
 DisableFormat:   false
 ExperimentalAutoDetectBinPacking: false
 ForEachMacros:   [ foreach, Q_FOREACH, BOOST_FOREACH ]
 IncludeCategories:
  - Regex:           '^<.*\.h>'
    Priority:        1
  - Regex:           '^<.*'
    Priority:        2
  - Regex:           '.*'
    Priority:        3
 IndentCaseLabels: true
 IndentWidth:     2
 IndentWrappedFunctionNames: false
@@ -51,6 +72,8 @@ PenaltyBreakString: 1000
 PenaltyExcessCharacter: 1000000
 PenaltyReturnTypeOnItsOwnLine: 200
 PointerAlignment: Right
 ReflowComments:  true
 SortIncludes:    false
 SpaceAfterCStyleCast: false
 SpaceBeforeAssignmentOperators: true
 SpaceBeforeParens: ControlStatements
--- a/.gitignore
+++ b/.gitignore
@@ -29,37 +29,36 @@
 /examples/decode_with_drops
 /examples/decode_with_partial_drops
 /examples/example_xma
 /examples/lossless_encoder
 /examples/postproc
 /examples/resize_util
 /examples/set_maps
 /examples/simple_decoder
 /examples/simple_encoder
 /examples/twopass_encoder
-/examples/vp8_multi_resolution_encoder
+/examples/aom_cx_set_ref
-/examples/vp8cx_set_ref
+/examples/av1_spatial_scalable_encoder
-/examples/vp9_lossless_encoder
+/examples/aom_temporal_scalable_patterns
-/examples/vp9_spatial_scalable_encoder
+/examples/aom_temporal_svc_encoder
 /examples/vpx_temporal_scalable_patterns
 /examples/vpx_temporal_svc_encoder
 /ivfdec
 /ivfdec.dox
 /ivfenc
 /ivfenc.dox
-/libvpx.so*
+/libaom.so*
-/libvpx.ver
+/libaom.ver
 /samples.dox
 /test_intra_pred_speed
-/test_libvpx
+/test_libaom
-/vp8_api1_migration.dox
+/aom_api1_migration.dox
-/vp[89x]_rtcd.h
+/av1_rtcd.h
-/vpx.pc
+/aom.pc
-/vpx_config.c
+/aom_config.c
-/vpx_config.h
+/aom_config.h
-/vpx_dsp_rtcd.h
+/aom_dsp_rtcd.h
-/vpx_scale_rtcd.h
+/aom_scale_rtcd.h
-/vpx_version.h
+/aom_version.h
-/vpxdec
+/aomdec
-/vpxdec.dox
+/aomdec.dox
-/vpxenc
+/aomenc
-/vpxenc.dox
+/aomenc.dox
 TAGS
--- a/16
+++ b/16
@@ -56,13 +56,16 @@ James Zern <jzern@google.com>
 Jan Gerber <j@mailb.org>
 Jan Kratochvil <jan.kratochvil@redhat.com>
 Janne Salonen <jsalonen@google.com>
 Jean-Marc Valin <jmvalin@jmvalin.ca>
 Jeff Faust <jfaust@google.com>
 Jeff Muizelaar <jmuizelaar@mozilla.com>
 Jeff Petkau <jpet@chromium.org>
 Jia Jia <jia.jia@linaro.org>
 Jian Zhou <zhoujian@google.com>
 Jim Bankoski <jimbankoski@google.com>
 Jingning Han <jingning@google.com>
 Joey Parrish <joeyparrish@google.com>
 Johann Koenig <johannkoenig@chromium.org>
 Johann Koenig <johannkoenig@google.com>
 John Koleszar <jkoleszar@google.com>
 Johnny Klonaris <google@jawknee.com>
@@ -89,6 +92,7 @@ Mike Hommey <mhommey@mozilla.com>
 Mikhal Shemer <mikhal@google.com>
 Minghai Shang <minghai@google.com>
 Morton Jonuschat <yabawock@gmail.com>
 Nathan E. Egge <negge@dgql.org>
 Nico Weber <thakis@chromium.org>
 Parag Salasakar <img.mips1@gmail.com>
 Pascal Massimino <pascal.massimino@gmail.com>
@@ -97,6 +101,7 @@ Paul Wilkins <paulwilkins@google.com>
 Pavol Rusnak <stick@gk2.sk>
 Paweł Hajdan <phajdan@google.com>
 Pengchong Jin <pengchong@google.com>
 Peter de Rivaz <peter.derivaz@argondesign.com>
 Peter de Rivaz <peter.derivaz@gmail.com>
 Philip Jägenstedt <philipj@opera.com>
 Priit Laes <plaes@plaes.org>
@@ -107,13 +112,16 @@ Rob Bradford <rob@linux.intel.com>
 Ronald S. Bultje <rsbultje@gmail.com>
 Rui Ueyama <ruiu@google.com>
 Sami Pietilä <samipietila@google.com>
 Sasi Inguva <isasi@google.com>
 Scott Graham <scottmg@chromium.org>
 Scott LaVarnway <slavarnway@google.com>
 Sean McGovern <gseanmcg@gmail.com>
 Sergey Kolomenkin <kolomenkin@gmail.com>
 Sergey Ulanov <sergeyu@chromium.org>
 Shimon Doodkin <helpmepro1@gmail.com>
 Shunyao Li <shunyaoli@google.com>
 Stefan Holmer <holmer@google.com>
 Steinar Midtskogen <stemidts@cisco.com>
 Suman Sunkara <sunkaras@google.com>
 Taekhyun Kim <takim@nvidia.com>
 Takanori MATSUURA <t.matsuu@gmail.com>
@@ -121,14 +129,16 @@ Tamar Levy <tamar.levy@intel.com>
 Tao Bai <michaelbai@chromium.org>
 Tero Rintaluoma <teror@google.com>
 Thijs Vermeir <thijsvermeir@gmail.com>
 Thomas Daede <tdaede@mozilla.com>
 Thomas Davies <thdavies@cisco.com>
 Thomas <thdavies@cisco.com>
 Tim Kopp <tkopp@google.com>
 Timothy B. Terriberry <tterribe@xiph.org>
 Tom Finegan <tomfinegan@google.com>
 Tristan Matthews <le.businessman@gmail.com>
 Tristan Matthews <tmatth@videolan.org>
 Vignesh Venkatasubramanian <vigneshv@google.com>
 Yaowu Xu <yaowu@google.com>
 Yongzhe Wang <yongzhe@google.com>
 Yunqing Wang <yunqingwang@google.com>
 Zoe Liu <zoeliu@google.com>
 Google Inc.
 The Mozilla Foundation
 The Xiph.Org Foundation
--- a/4
+++ b/4
@@ -1,7 +1,9 @@
 Next Release
  - Incompatible changes:
-    The VP9 encoder's default keyframe interval changed to 128 from 9999.
+    The AV1 encoder's default keyframe interval changed to 128 from 9999.
 2016-04-07 v0.1.0 "AOMedia Codec 1"
  This release is the first Alliance for Open Media codec.
 2015-11-09 v1.5.0 "Javan Whistling Duck"
  This release improves upon the VP9 encoder and speeds up the encoding and
  decoding processes.
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -0,0 +1,270 @@
 ##
 ## Copyright (c) 2016, Alliance for Open Media. All rights reserved
 ##
 ## This source code is subject to the terms of the BSD 2 Clause License and
 ## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 ## was not distributed with this source code in the LICENSE file, you can
 ## obtain it at www.aomedia.org/license/software. If the Alliance for Open
 ## Media Patent License 1.0 was not distributed with this source code in the
 ## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 ##
 cmake_minimum_required(VERSION 3.2)
 project(AOM C CXX)
 set(AOM_ROOT "${CMAKE_CURRENT_SOURCE_DIR}")
 set(AOM_CONFIG_DIR "${CMAKE_CURRENT_BINARY_DIR}")
 include("${AOM_ROOT}/build/cmake/aom_configure.cmake")
 set(AOM_SRCS
    "${AOM_CONFIG_DIR}/aom_config.c"
    "${AOM_CONFIG_DIR}/aom_config.h"
    "${AOM_ROOT}/aom/aom.h"
    "${AOM_ROOT}/aom/aom_codec.h"
    "${AOM_ROOT}/aom/aom_decoder.h"
    "${AOM_ROOT}/aom/aom_encoder.h"
    "${AOM_ROOT}/aom/aom_frame_buffer.h"
    "${AOM_ROOT}/aom/aom_image.h"
    "${AOM_ROOT}/aom/aom_integer.h"
    "${AOM_ROOT}/aom/aomcx.h"
    "${AOM_ROOT}/aom/aomdx.h"
    "${AOM_ROOT}/aom/internal/aom_codec_internal.h"
    "${AOM_ROOT}/aom/src/aom_codec.c"
    "${AOM_ROOT}/aom/src/aom_decoder.c"
    "${AOM_ROOT}/aom/src/aom_encoder.c"
    "${AOM_ROOT}/aom/src/aom_image.c")
 set(AOM_DSP_SRCS
    "${AOM_ROOT}/aom_dsp/aom_convolve.c"
    "${AOM_ROOT}/aom_dsp/aom_convolve.h"
    "${AOM_ROOT}/aom_dsp/aom_dsp_common.h"
    "${AOM_ROOT}/aom_dsp/aom_dsp_rtcd.c"
    "${AOM_ROOT}/aom_dsp/aom_filter.h"
    "${AOM_ROOT}/aom_dsp/aom_simd.c"
    "${AOM_ROOT}/aom_dsp/aom_simd.h"
    "${AOM_ROOT}/aom_dsp/aom_simd_inline.h"
    "${AOM_ROOT}/aom_dsp/avg.c"
    "${AOM_ROOT}/aom_dsp/bitreader.h"
    "${AOM_ROOT}/aom_dsp/bitreader_buffer.c"
    "${AOM_ROOT}/aom_dsp/bitreader_buffer.h"
    "${AOM_ROOT}/aom_dsp/bitwriter.h"
    "${AOM_ROOT}/aom_dsp/bitwriter_buffer.c"
    "${AOM_ROOT}/aom_dsp/bitwriter_buffer.h"
    "${AOM_ROOT}/aom_dsp/blend.h"
    "${AOM_ROOT}/aom_dsp/blend_a64_hmask.c"
    "${AOM_ROOT}/aom_dsp/blend_a64_mask.c"
    "${AOM_ROOT}/aom_dsp/blend_a64_vmask.c"
    "${AOM_ROOT}/aom_dsp/dkboolreader.c"
    "${AOM_ROOT}/aom_dsp/dkboolreader.h"
    "${AOM_ROOT}/aom_dsp/dkboolwriter.c"
    "${AOM_ROOT}/aom_dsp/dkboolwriter.h"
    "${AOM_ROOT}/aom_dsp/fwd_txfm.c"
    "${AOM_ROOT}/aom_dsp/fwd_txfm.h"
    "${AOM_ROOT}/aom_dsp/intrapred.c"
    "${AOM_ROOT}/aom_dsp/inv_txfm.c"
    "${AOM_ROOT}/aom_dsp/inv_txfm.h"
    "${AOM_ROOT}/aom_dsp/loopfilter.c"
    "${AOM_ROOT}/aom_dsp/prob.c"
    "${AOM_ROOT}/aom_dsp/prob.h"
    "${AOM_ROOT}/aom_dsp/psnr.c"
    "${AOM_ROOT}/aom_dsp/psnr.h"
    "${AOM_ROOT}/aom_dsp/quantize.c"
    "${AOM_ROOT}/aom_dsp/quantize.h"
    "${AOM_ROOT}/aom_dsp/sad.c"
    "${AOM_ROOT}/aom_dsp/simd/v128_intrinsics.h"
    "${AOM_ROOT}/aom_dsp/simd/v128_intrinsics_c.h"
    "${AOM_ROOT}/aom_dsp/simd/v256_intrinsics.h"
    "${AOM_ROOT}/aom_dsp/simd/v256_intrinsics_c.h"
    "${AOM_ROOT}/aom_dsp/simd/v64_intrinsics.h"
    "${AOM_ROOT}/aom_dsp/simd/v64_intrinsics_c.h"
    "${AOM_ROOT}/aom_dsp/subtract.c"
    "${AOM_ROOT}/aom_dsp/txfm_common.h"
    "${AOM_ROOT}/aom_dsp/variance.c"
    "${AOM_ROOT}/aom_dsp/variance.h")
 set(AOM_MEM_SRCS
    "${AOM_ROOT}/aom_mem/aom_mem.c"
    "${AOM_ROOT}/aom_mem/aom_mem.h"
    "${AOM_ROOT}/aom_mem/include/aom_mem_intrnl.h")
 set(AOM_SCALE_SRCS
    "${AOM_ROOT}/aom_scale/aom_scale.h"
    "${AOM_ROOT}/aom_scale/aom_scale_rtcd.c"
    "${AOM_ROOT}/aom_scale/generic/aom_scale.c"
    "${AOM_ROOT}/aom_scale/generic/gen_scalers.c"
    "${AOM_ROOT}/aom_scale/generic/yv12config.c"
    "${AOM_ROOT}/aom_scale/generic/yv12extend.c"
    "${AOM_ROOT}/aom_scale/yv12config.h")
 # TODO(tomfinegan): Extract aom_ports from aom_util if possible.
 set(AOM_UTIL_SRCS
    "${AOM_ROOT}/aom_ports/aom_once.h"
    "${AOM_ROOT}/aom_ports/aom_timer.h"
    "${AOM_ROOT}/aom_ports/bitops.h"
    "${AOM_ROOT}/aom_ports/emmintrin_compat.h"
    "${AOM_ROOT}/aom_ports/mem.h"
    "${AOM_ROOT}/aom_ports/mem_ops.h"
    "${AOM_ROOT}/aom_ports/mem_ops_aligned.h"
    "${AOM_ROOT}/aom_ports/msvc.h"
    "${AOM_ROOT}/aom_ports/system_state.h"
    "${AOM_ROOT}/aom_util/aom_thread.c"
    "${AOM_ROOT}/aom_util/aom_thread.h"
    "${AOM_ROOT}/aom_util/endian_inl.h")
 set(AOM_AV1_COMMON_SRCS
    "${AOM_ROOT}/av1/av1_iface_common.h"
    "${AOM_ROOT}/av1/common/alloccommon.c"
    "${AOM_ROOT}/av1/common/alloccommon.h"
    "${AOM_ROOT}/av1/common/av1_fwd_txfm.c"
    "${AOM_ROOT}/av1/common/av1_fwd_txfm.h"
    "${AOM_ROOT}/av1/common/av1_inv_txfm.c"
    "${AOM_ROOT}/av1/common/av1_inv_txfm.h"
    "${AOM_ROOT}/av1/common/av1_rtcd.c"
    "${AOM_ROOT}/av1/common/blockd.c"
    "${AOM_ROOT}/av1/common/blockd.h"
    "${AOM_ROOT}/av1/common/common.h"
    "${AOM_ROOT}/av1/common/common_data.h"
    "${AOM_ROOT}/av1/common/convolve.c"
    "${AOM_ROOT}/av1/common/convolve.h"
    "${AOM_ROOT}/av1/common/debugmodes.c"
    "${AOM_ROOT}/av1/common/entropy.c"
    "${AOM_ROOT}/av1/common/entropy.h"
    "${AOM_ROOT}/av1/common/entropymode.c"
    "${AOM_ROOT}/av1/common/entropymode.h"
    "${AOM_ROOT}/av1/common/entropymv.c"
    "${AOM_ROOT}/av1/common/entropymv.h"
    "${AOM_ROOT}/av1/common/enums.h"
    "${AOM_ROOT}/av1/common/filter.c"
    "${AOM_ROOT}/av1/common/filter.h"
    "${AOM_ROOT}/av1/common/frame_buffers.c"
    "${AOM_ROOT}/av1/common/frame_buffers.h"
    "${AOM_ROOT}/av1/common/idct.c"
    "${AOM_ROOT}/av1/common/idct.h"
    "${AOM_ROOT}/av1/common/loopfilter.c"
    "${AOM_ROOT}/av1/common/loopfilter.h"
    "${AOM_ROOT}/av1/common/mv.h"
    "${AOM_ROOT}/av1/common/mvref_common.c"
    "${AOM_ROOT}/av1/common/mvref_common.h"
    "${AOM_ROOT}/av1/common/odintrin.c"
    "${AOM_ROOT}/av1/common/odintrin.h"
    "${AOM_ROOT}/av1/common/onyxc_int.h"
    "${AOM_ROOT}/av1/common/pred_common.c"
    "${AOM_ROOT}/av1/common/pred_common.h"
    "${AOM_ROOT}/av1/common/quant_common.c"
    "${AOM_ROOT}/av1/common/quant_common.h"
    "${AOM_ROOT}/av1/common/reconinter.c"
    "${AOM_ROOT}/av1/common/reconinter.h"
    "${AOM_ROOT}/av1/common/reconintra.c"
    "${AOM_ROOT}/av1/common/reconintra.h"
    "${AOM_ROOT}/av1/common/scale.c"
    "${AOM_ROOT}/av1/common/scale.h"
    "${AOM_ROOT}/av1/common/scan.c"
    "${AOM_ROOT}/av1/common/scan.h"
    "${AOM_ROOT}/av1/common/seg_common.c"
    "${AOM_ROOT}/av1/common/seg_common.h"
    "${AOM_ROOT}/av1/common/thread_common.c"
    "${AOM_ROOT}/av1/common/thread_common.h"
    "${AOM_ROOT}/av1/common/tile_common.c"
    "${AOM_ROOT}/av1/common/tile_common.h")
 set(AOM_AV1_DECODER_SRCS
    "${AOM_ROOT}/av1/av1_dx_iface.c"
    "${AOM_ROOT}/av1/decoder/decodeframe.c"
    "${AOM_ROOT}/av1/decoder/decodeframe.h"
    "${AOM_ROOT}/av1/decoder/decodemv.c"
    "${AOM_ROOT}/av1/decoder/decodemv.h"
    "${AOM_ROOT}/av1/decoder/decoder.c"
    "${AOM_ROOT}/av1/decoder/decoder.h"
    "${AOM_ROOT}/av1/decoder/detokenize.c"
    "${AOM_ROOT}/av1/decoder/detokenize.h"
    "${AOM_ROOT}/av1/decoder/dsubexp.c"
    "${AOM_ROOT}/av1/decoder/dsubexp.h"
    "${AOM_ROOT}/av1/decoder/dthread.c"
    "${AOM_ROOT}/av1/decoder/dthread.h")
 set(AOM_AV1_ENCODER_SRCS
    "${AOM_ROOT}/av1/av1_cx_iface.c"
    "${AOM_ROOT}/av1/encoder/aq_complexity.c"
    "${AOM_ROOT}/av1/encoder/aq_complexity.h"
    "${AOM_ROOT}/av1/encoder/aq_cyclicrefresh.c"
    "${AOM_ROOT}/av1/encoder/aq_cyclicrefresh.h"
    "${AOM_ROOT}/av1/encoder/aq_variance.c"
    "${AOM_ROOT}/av1/encoder/aq_variance.h"
    "${AOM_ROOT}/av1/encoder/bitstream.c"
    "${AOM_ROOT}/av1/encoder/bitstream.h"
    "${AOM_ROOT}/av1/encoder/block.h"
    "${AOM_ROOT}/av1/encoder/context_tree.c"
    "${AOM_ROOT}/av1/encoder/context_tree.h"
    "${AOM_ROOT}/av1/encoder/cost.c"
    "${AOM_ROOT}/av1/encoder/cost.h"
    "${AOM_ROOT}/av1/encoder/dct.c"
    "${AOM_ROOT}/av1/encoder/encodeframe.c"
    "${AOM_ROOT}/av1/encoder/encodeframe.h"
    "${AOM_ROOT}/av1/encoder/encodemb.c"
    "${AOM_ROOT}/av1/encoder/encodemb.h"
    "${AOM_ROOT}/av1/encoder/encodemv.c"
    "${AOM_ROOT}/av1/encoder/encodemv.h"
    "${AOM_ROOT}/av1/encoder/encoder.c"
    "${AOM_ROOT}/av1/encoder/encoder.h"
    "${AOM_ROOT}/av1/encoder/ethread.c"
    "${AOM_ROOT}/av1/encoder/ethread.h"
    "${AOM_ROOT}/av1/encoder/extend.c"
    "${AOM_ROOT}/av1/encoder/extend.h"
    "${AOM_ROOT}/av1/encoder/firstpass.c"
    "${AOM_ROOT}/av1/encoder/firstpass.h"
    "${AOM_ROOT}/av1/encoder/hybrid_fwd_txfm.c"
    "${AOM_ROOT}/av1/encoder/hybrid_fwd_txfm.h"
    "${AOM_ROOT}/av1/encoder/lookahead.c"
    "${AOM_ROOT}/av1/encoder/lookahead.h"
    "${AOM_ROOT}/av1/encoder/mbgraph.c"
    "${AOM_ROOT}/av1/encoder/mbgraph.h"
    "${AOM_ROOT}/av1/encoder/mcomp.c"
    "${AOM_ROOT}/av1/encoder/mcomp.h"
    "${AOM_ROOT}/av1/encoder/picklpf.c"
    "${AOM_ROOT}/av1/encoder/picklpf.h"
    "${AOM_ROOT}/av1/encoder/quantize.c"
    "${AOM_ROOT}/av1/encoder/quantize.h"
    "${AOM_ROOT}/av1/encoder/ratectrl.c"
    "${AOM_ROOT}/av1/encoder/ratectrl.h"
    "${AOM_ROOT}/av1/encoder/rd.c"
    "${AOM_ROOT}/av1/encoder/rd.h"
    "${AOM_ROOT}/av1/encoder/rdopt.c"
    "${AOM_ROOT}/av1/encoder/rdopt.h"
    "${AOM_ROOT}/av1/encoder/resize.c"
    "${AOM_ROOT}/av1/encoder/resize.h"
    "${AOM_ROOT}/av1/encoder/segmentation.c"
    "${AOM_ROOT}/av1/encoder/segmentation.h"
    "${AOM_ROOT}/av1/encoder/speed_features.c"
    "${AOM_ROOT}/av1/encoder/speed_features.h"
    "${AOM_ROOT}/av1/encoder/subexp.c"
    "${AOM_ROOT}/av1/encoder/subexp.h"
    "${AOM_ROOT}/av1/encoder/temporal_filter.c"
    "${AOM_ROOT}/av1/encoder/temporal_filter.h"
    "${AOM_ROOT}/av1/encoder/tokenize.c"
    "${AOM_ROOT}/av1/encoder/tokenize.h"
    "${AOM_ROOT}/av1/encoder/treewriter.c"
    "${AOM_ROOT}/av1/encoder/treewriter.h")
 # Targets
 add_library(aom_dsp ${AOM_DSP_SRCS})
 include_directories(${AOM_ROOT} ${AOM_CONFIG_DIR})
 add_library(aom_mem ${AOM_MEM_SRCS})
 add_library(aom_scale ${AOM_SCALE_SRCS})
 include_directories(${AOM_ROOT} ${AOM_CONFIG_DIR})
 add_library(aom_util ${AOM_UTIL_SRCS})
 add_library(aom_av1_decoder ${AOM_AV1_DECODER_SRCS})
 add_library(aom_av1_encoder ${AOM_AV1_ENCODER_SRCS})
 add_library(aom ${AOM_SRCS})
 target_link_libraries(aom LINK_PUBLIC
                      aom_dsp
                      aom_mem
                      aom_scale
                      aom_util
                      aom_av1_decoder
                      aom_av1_encoder)
 add_executable(simple_decoder examples/simple_decoder.c)
 include_directories(${AOM_ROOT})
 target_link_libraries(simple_decoder LINK_PUBLIC aom)
 add_executable(simple_encoder examples/simple_encoder.c)
 include_directories(${AOM_ROOT})
 target_link_libraries(simple_encoder LINK_PUBLIC aom)
--- a/42
+++ b/42
@@ -1,31 +1,27 @@
-Copyright (c) 2010, The WebM Project authors. All rights reserved.
+Copyright (c) 2016, Alliance for Open Media. All rights reserved.
 Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
+modification, are permitted provided that the following conditions
-met:
+are met:
-  * Redistributions of source code must retain the above copyright
+1. Redistributions of source code must retain the above copyright
-    notice, this list of conditions and the following disclaimer.
+   notice, this list of conditions and the following disclaimer.
-  * Redistributions in binary form must reproduce the above copyright
+2. Redistributions in binary form must reproduce the above copyright
-    notice, this list of conditions and the following disclaimer in
+   notice, this list of conditions and the following disclaimer in
-    the documentation and/or other materials provided with the
+   the documentation and/or other materials provided with the
-    distribution.
+   distribution.
  * Neither the name of Google, nor the WebM Project, nor the names
    of its contributors may be used to endorse or promote products
    derived from this software without specific prior written
    permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 POSSIBILITY OF SUCH DAMAGE.
--- a/127
+++ b/127
@@ -1,23 +1,108 @@
-Additional IP Rights Grant (Patents)
+Alliance for Open Media Patent License 1.0
 ------------------------------------
-"These implementations" means the copyrightable works that implement the WebM
+1. License Terms.
-codecs distributed by Google as part of the WebM Project.
+
 1.1. Patent License. Subject to the terms and conditions of this License, each
     Licensor, on behalf of itself and successors in interest and assigns,
     grants Licensee a non-sublicensable, perpetual, worldwide, non-exclusive,
     no-charge, royalty-free, irrevocable (except as expressly stated in this
     License) patent license to its Necessary Claims to make, use, sell, offer
     for sale, import or distribute any Implementation.
 1.2. Conditions.
 1.2.1. Availability. As a condition to the grant of rights to Licensee to make,
       sell, offer for sale, import or distribute an Implementation under
       Section 1.1, Licensee must make its Necessary Claims available under
       this License, and must reproduce this License with any Implementation
       as follows:
       a. For distribution in source code, by including this License in the
          root directory of the source code with its Implementation.
       b. For distribution in any other form (including binary, object form,
          and/or hardware description code (e.g., HDL, RTL, Gate Level Netlist,
          GDSII, etc.)), by including this License in the documentation, legal
          notices, and/or other written materials provided with the
          Implementation.
 1.2.2. Additional Conditions. This license is directly from Licensor to
       Licensee.  Licensee acknowledges as a condition of benefiting from it
       that no rights from Licensor are received from suppliers, distributors,
       or otherwise in connection with this License.
 1.3. Defensive Termination. If any Licensee, its Affiliates, or its agents
     initiates patent litigation or files, maintains, or voluntarily
     participates in a lawsuit against another entity or any person asserting
     that any Implementation infringes Necessary Claims, any patent licenses
     granted under this License directly to the Licensee are immediately
     terminated as of the date of the initiation of action unless 1) that suit
     was in response to a corresponding suit regarding an Implementation first
     brought against an initiating entity, or 2) that suit was brought to
     enforce the terms of this License (including intervention in a third-party
     action by a Licensee).
 1.4. Disclaimers. The Reference Implementation and Specification are provided
     "AS IS" and without warranty. The entire risk as to implementing or
     otherwise using the Reference Implementation or Specification is assumed
     by the implementer and user. Licensor expressly disclaims any warranties
     (express, implied, or otherwise), including implied warranties of
     merchantability, non-infringement, fitness for a particular purpose, or
     title, related to the material. IN NO EVENT WILL LICENSOR BE LIABLE TO
     ANY OTHER PARTY FOR LOST PROFITS OR ANY FORM OF INDIRECT, SPECIAL,
     INCIDENTAL, OR CONSEQUENTIAL DAMAGES OF ANY CHARACTER FROM ANY CAUSES OF
     ACTION OF ANY KIND WITH RESPECT TO THIS LICENSE, WHETHER BASED ON BREACH
     OF CONTRACT, TORT (INCLUDING NEGLIGENCE), OR OTHERWISE, AND WHETHER OR
      NOT THE OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 2. Definitions.
 2.1. Affiliate.  "Affiliate" means an entity that directly or indirectly
     Controls, is Controlled by, or is under common Control of that party.
 2.2. Control. "Control" means direct or indirect control of more than 50% of
     the voting power to elect directors of that corporation, or for any other
     entity, the power to direct management of such entity.
 2.3. Decoder.  "Decoder" means any decoder that conforms fully with all
     non-optional portions of the Specification.
 2.4. Encoder.  "Encoder" means any encoder that produces a bitstream that can
     be decoded by a Decoder only to the extent it produces such a bitstream.
 2.5. Final Deliverable.  "Final Deliverable" means the final version of a
     deliverable approved by the Alliance for Open Media as a Final
     Deliverable.
 2.6. Implementation.  "Implementation" means any implementation, including the
     Reference Implementation, that is an Encoder and/or a Decoder. An
     Implementation also includes components of an Implementation only to the
     extent they are used as part of an Implementation.
 2.7. License. "License" means this license.
 2.8. Licensee. "Licensee" means any person or entity who exercises patent
     rights granted under this License.
 2.9. Licensor.  "Licensor" means (i) any Licensee that makes, sells, offers
     for sale, imports or distributes any Implementation, or (ii) a person
     or entity that has a licensing obligation to the Implementation as a
     result of its membership and/or participation in the Alliance for Open
     Media working group that developed the Specification.
 2.10. Necessary Claims.  "Necessary Claims" means all claims of patents or
      patent applications, (a) that currently or at any time in the future,
      are owned or controlled by the Licensor, and (b) (i) would be an
      Essential Claim as defined by the W3C Policy as of February 5, 2004
      (https://www.w3.org/Consortium/Patent-Policy-20040205/#def-essential)
      as if the Specification was a W3C Recommendation; or (ii) are infringed
      by the Reference Implementation.
 2.11. Reference Implementation. "Reference Implementation" means an Encoder
      and/or Decoder released by the Alliance for Open Media as a Final
      Deliverable.
 2.12. Specification. "Specification" means the specification designated by
      the Alliance for Open Media as a Final Deliverable for which this
      License was issued.
 Google hereby grants to you a perpetual, worldwide, non-exclusive, no-charge,
 royalty-free, irrevocable (except as stated in this section) patent license to
 make, have made, use, offer to sell, sell, import, transfer, and otherwise
 run, modify and propagate the contents of these implementations of WebM, where
 such license applies only to those patent claims, both currently owned by
 Google and acquired in the future, licensable by Google that are necessarily
 infringed by these implementations of WebM. This grant does not include claims
 that would be infringed only as a consequence of further modification of these
 implementations. If you or your agent or exclusive licensee institute or order
 or agree to the institution of patent litigation or any other patent
 enforcement activity against any entity (including a cross-claim or
 counterclaim in a lawsuit) alleging that any of these implementations of WebM
 or any code incorporated within any of these implementations of WebM
 constitute direct or contributory patent infringement, or inducement of
 patent infringement, then any patent rights granted to you under this License
 for these implementations of WebM shall terminate as of the date such
 litigation is filed.
--- a/29
+++ b/29
@@ -1,6 +1,6 @@
 README - 23 March 2015
-Welcome to the WebM VP8/VP9 Codec SDK!
+Welcome to the WebM VP8/AV1 Codec SDK!
 COMPILING THE APPLICATIONS/LIBRARIES:
  The build system used is similar to autotools. Building generally consists of
@@ -33,13 +33,13 @@ COMPILING THE APPLICATIONS/LIBRARIES:
    $ mkdir build
    $ cd build
-    $ ../libvpx/configure <options>
+    $ ../libaom/configure <options>
    $ make
  3. Configuration options
  The 'configure' script supports a number of options. The --help option can be
  used to get a list of supported options:
-    $ ../libvpx/configure --help
+    $ ../libaom/configure --help
  4. Cross development
  For cross development, the most notable option is the --target option. The
@@ -108,7 +108,7 @@ COMPILING THE APPLICATIONS/LIBRARIES:
  toolchain, the following command could be used (note, POSIX SH syntax, adapt
  to your shell as necessary):
-    $ CROSS=mipsel-linux-uclibc- ../libvpx/configure
+    $ CROSS=mipsel-linux-uclibc- ../libaom/configure
  In addition, the executables to be invoked can be overridden by specifying the
  environment variables: CC, AR, LD, AS, STRIP, NM. Additional flags can be
@@ -119,13 +119,28 @@ COMPILING THE APPLICATIONS/LIBRARIES:
  This defaults to config.log. This should give a good indication of what went
  wrong. If not, contact us for support.
-VP8/VP9 TEST VECTORS:
+VP8/AV1 TEST VECTORS:
  The test vectors can be downloaded and verified using the build system after
  running configure. To specify an alternate directory the
-  LIBVPX_TEST_DATA_PATH environment variable can be used.
+  LIBAOM_TEST_DATA_PATH environment variable can be used.
  $ ./configure --enable-unit-tests
-  $ LIBVPX_TEST_DATA_PATH=../libvpx-test-data make testdata
+  $ LIBAOM_TEST_DATA_PATH=../libaom-test-data make testdata
 CODE STYLE:
  The coding style used by this project is enforced with clang-format using the
  configuration contained in the .clang-format file in the root of the
  repository.
  Before pushing changes for review you can format your code with:
  # Apply clang-format to modified .c, .h and .cc files
  $ clang-format -i --style=file \
    $(git diff --name-only --diff-filter=ACMR '*.[hc]' '*.cc')
  Check the .clang-format file for the version used to generate it if there is
  any difference between your local formatting and the review system.
  See also: http://clang.llvm.org/docs/ClangFormat.html
 SUPPORT
  This library is an open source project supported by its community. Please
--- a/aom/aom.h
+++ b/aom/aom.h
@@ -0,0 +1,160 @@
 /*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
 /*!\defgroup aom AOM
 * \ingroup codecs
 * AOM is aom's newest video compression algorithm that uses motion
 * compensated prediction, Discrete Cosine Transform (DCT) coding of the
 * prediction error signal and context dependent entropy coding techniques
 * based on arithmetic principles. It features:
 *  - YUV 4:2:0 image format
 *  - Macro-block based coding (16x16 luma plus two 8x8 chroma)
 *  - 1/4 (1/8) pixel accuracy motion compensated prediction
 *  - 4x4 DCT transform
 *  - 128 level linear quantizer
 *  - In loop deblocking filter
 *  - Context-based entropy coding
 *
 * @{
 */
 /*!\file
 * \brief Provides controls common to both the AOM encoder and decoder.
 */
 #ifndef AOM_AOM_H_
 #define AOM_AOM_H_
 #include "./aom_codec.h"
 #include "./aom_image.h"
 #ifdef __cplusplus
 extern "C" {
 #endif
 /*!\brief Control identifiers common to the AOM encoder and decoder
 *
 * Each enumerator names one control function of the AOM interface.
 */
 enum aom_com_control_id {
  /*!\brief pass in an external frame into decoder to be used as reference frame
   */
  AOM_SET_REFERENCE = 1,
  /*!\brief get a copy of reference frame from the decoder */
  AOM_COPY_REFERENCE = 2,
  /*!\brief set the decoder's post processing settings */
  AOM_SET_POSTPROC = 3,
  /*!\brief set the reference frames to color for each macroblock */
  AOM_SET_DBG_COLOR_REF_FRAME = 4,
  /*!\brief set which macro block modes to color */
  AOM_SET_DBG_COLOR_MB_MODES = 5,
  /*!\brief set which blocks modes to color */
  AOM_SET_DBG_COLOR_B_MODES = 6,
  /*!\brief set which motion vector modes to draw */
  AOM_SET_DBG_DISPLAY_MV = 7,
  /* TODO(jkoleszar): The encoder incorrectly reuses some of these values (5+)
   * for its control ids. These should be migrated to something like the
   * AOM_DECODER_CTRL_ID_START range next time we're ready to break the ABI.
   */
  /*!\brief get a pointer to a reference frame */
  AV1_GET_REFERENCE = 128,
  /* No initializer: takes the value following AV1_GET_REFERENCE (129). */
  AOM_COMMON_CTRL_ID_MAX,
  /*!\brief get a pointer to the new frame */
  AV1_GET_NEW_FRAME_IMAGE = 192,
  AOM_DECODER_CTRL_ID_START = 256
 };
 /*!\brief post process flags
 *
 * Bit values describing the AOM decoder post processing options. Every
 * non-zero enumerator occupies a bit of its own.
 */
 enum aom_postproc_level {
  AOM_NOFILTERING = 0x000,
  AOM_DEBLOCK = 0x001,      /* bit 0 */
  AOM_DEMACROBLOCK = 0x002, /* bit 1 */
  AOM_ADDNOISE = 0x004,     /* bit 2 */
  /*!\brief print frame information (bit 3) */
  AOM_DEBUG_TXT_FRAME_INFO = 0x008,
  /*!\brief print macro block modes over each macro block (bit 4) */
  AOM_DEBUG_TXT_MBLK_MODES = 0x010,
  /*!\brief print dc diff for each macro block (bit 5) */
  AOM_DEBUG_TXT_DC_DIFF = 0x020,
  /*!\brief print video rate info (encoder only) (bit 6) */
  AOM_DEBUG_TXT_RATE_INFO = 0x040,
  AOM_MFQE = 0x400 /* bit 10 */
 };
 /*!\brief post process configuration
 *
 * This defines a structure that describes the post processing settings; it
 * is the argument type of the AOM_SET_POSTPROC control. For the best
 * objective measure (using the PSNR metric) set post_proc_flag
 * to AOM_DEBLOCK and deblocking_level to 1.
 */
 typedef struct aom_postproc_cfg {
  /*!\brief the types of post processing to be done, should be combination of
   * "aom_postproc_level" */
  int post_proc_flag;
  int deblocking_level; /**< the strength of deblocking, valid range [0, 16] */
  int noise_level; /**< the strength of additive noise, valid range [0, 16] */
 } aom_postproc_cfg_t;
 /*!\brief reference frame type
 *
 * Enumerates the AOM reference frame types. The three values are the
 * distinct single-bit constants 1, 2 and 4.
 */
 typedef enum aom_ref_frame_type {
  AOM_LAST_FRAME = 0x1,
  AOM_GOLD_FRAME = 0x2,
  AOM_ALTR_FRAME = 0x4
 } aom_ref_frame_type_t;
 /*!\brief reference frame data struct
 *
 * Defines the data struct used to access aom reference frames. A pointer to
 * this struct is the argument type of the AOM_SET_REFERENCE and
 * AOM_COPY_REFERENCE controls.
 */
 typedef struct aom_ref_frame {
  aom_ref_frame_type_t frame_type; /**< which reference frame */
  aom_image_t img;                 /**< reference frame data in image format */
 } aom_ref_frame_t;
 /*!\brief AV1 specific reference frame data struct
 *
 * Defines the data struct used to access av1 reference frames. A pointer to
 * this struct is the argument type of the AV1_GET_REFERENCE control: the
 * caller fills in idx and the img member is populated in return.
 */
 typedef struct av1_ref_frame {
  int idx;         /**< frame index to get (input) */
  aom_image_t img; /**< img structure to populate (output) */
 } av1_ref_frame_t;
 /*!\cond */
 /*!\brief aom decoder control function parameter type
 *
 * Defines the data type that each AOM control function requires. Every
 * AOM_CTRL_USE_TYPE() line binds one control id to its argument type and is
 * followed by the definition of an empty AOM_CTRL_<id> marker macro.
 * NOTE(review): the marker presumably lets applications test for a control
 * with #ifdef; confirm against the users of these macros.
 */
 AOM_CTRL_USE_TYPE(AOM_SET_REFERENCE, aom_ref_frame_t *)
 #define AOM_CTRL_AOM_SET_REFERENCE
 AOM_CTRL_USE_TYPE(AOM_COPY_REFERENCE, aom_ref_frame_t *)
 #define AOM_CTRL_AOM_COPY_REFERENCE
 AOM_CTRL_USE_TYPE(AOM_SET_POSTPROC, aom_postproc_cfg_t *)
 #define AOM_CTRL_AOM_SET_POSTPROC
 AOM_CTRL_USE_TYPE(AOM_SET_DBG_COLOR_REF_FRAME, int)
 #define AOM_CTRL_AOM_SET_DBG_COLOR_REF_FRAME
 AOM_CTRL_USE_TYPE(AOM_SET_DBG_COLOR_MB_MODES, int)
 #define AOM_CTRL_AOM_SET_DBG_COLOR_MB_MODES
 AOM_CTRL_USE_TYPE(AOM_SET_DBG_COLOR_B_MODES, int)
 #define AOM_CTRL_AOM_SET_DBG_COLOR_B_MODES
 AOM_CTRL_USE_TYPE(AOM_SET_DBG_DISPLAY_MV, int)
 #define AOM_CTRL_AOM_SET_DBG_DISPLAY_MV
 AOM_CTRL_USE_TYPE(AV1_GET_REFERENCE, av1_ref_frame_t *)
 #define AOM_CTRL_AV1_GET_REFERENCE
 AOM_CTRL_USE_TYPE(AV1_GET_NEW_FRAME_IMAGE, aom_image_t *)
 #define AOM_CTRL_AV1_GET_NEW_FRAME_IMAGE
 /*!\endcond */
 /*! @} - end defgroup aom */
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 #endif  // AOM_AOM_H_
--- a/aom/aom_codec.h
+++ b/aom/aom_codec.h
@@ -0,0 +1,487 @@
 /*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
 /*!\defgroup codec Common Algorithm Interface
 * This abstraction allows applications to easily support multiple video
 * formats with minimal code duplication. This section describes the interface
 * common to all codecs (both encoders and decoders).
 * @{
 */
 /*!\file
 * \brief Describes the codec algorithm interface to applications.
 *
 * This file describes the interface between an application and a
 * video codec algorithm.
 *
 * An application instantiates a specific codec instance by using
 * aom_codec_init() and a pointer to the algorithm's interface structure:
 *     <pre>
 *     my_app.c:
 *       extern aom_codec_iface_t my_codec;
 *       {
 *           aom_codec_ctx_t algo;
 *           res = aom_codec_init(&algo, &my_codec);
 *       }
 *     </pre>
 *
 * Once initialized, the instance is managed using other functions from
 * the aom_codec_* family.
 */
 #ifndef AOM_AOM_CODEC_H_
 #define AOM_AOM_CODEC_H_
 #ifdef __cplusplus
 extern "C" {
 #endif
 #include "./aom_integer.h"
 #include "./aom_image.h"
 /*!\brief Decorator indicating a function is deprecated
 *
 * Expands to the GNU "deprecated" attribute when a GNU-compatible compiler
 * is detected and to nothing on every other toolchain. A definition that
 * already exists when this point is reached is left untouched.
 */
 #if !defined(DEPRECATED)
 #if defined(__GNUC__) && __GNUC__
 #define DEPRECATED __attribute__((deprecated))
 #else
 #define DEPRECATED
 #endif
 #endif /* DEPRECATED */
 /*!\brief Prefix-position counterpart of #DEPRECATED
 *
 * Expands to __declspec(deprecated) only for a Microsoft compiler that does
 * not also identify itself as GNU-compatible; it is empty everywhere else.
 * A pre-existing definition is left untouched.
 */
 #if !defined(DECLSPEC_DEPRECATED)
 #if defined(_MSC_VER) && !(defined(__GNUC__) && __GNUC__)
 #define DECLSPEC_DEPRECATED __declspec(deprecated)
 #else
 #define DECLSPEC_DEPRECATED
 #endif
 #endif /* DECLSPEC_DEPRECATED */
 /*!\brief Decorator indicating a function is potentially unused
 *
 * Uses the GNU "unused" attribute for GCC and clang and is empty otherwise.
 * A pre-existing definition of UNUSED is left untouched.
 */
 #ifndef UNUSED
 #if defined(__GNUC__) || defined(__clang__)
 #define UNUSED __attribute__((unused))
 #else
 #define UNUSED
 #endif
 #endif /* UNUSED */
 /*!\brief Decorator indicating that given struct/union/enum is packed
 *
 * Only GNU-compatible compilers receive an attribute; every other
 * toolchain gets an empty expansion. A pre-existing definition is kept.
 */
 #if !defined(ATTRIBUTE_PACKED)
 #if defined(__GNUC__) && __GNUC__
 #define ATTRIBUTE_PACKED __attribute__((packed))
 #else
 #define ATTRIBUTE_PACKED
 #endif
 #endif /* ATTRIBUTE_PACKED */
 /*!\brief Current ABI version number
 *
 * \internal
 * If this file is altered in any way that changes the ABI, this value
 * must be bumped.  Examples include, but are not limited to, changing
 * types, removing or reassigning enums, adding/removing/rearranging
 * fields to structures
 *
 * The value is expressed relative to AOM_IMAGE_ABI_VERSION, so a change to
 * that constant changes this one as well.
 */
 #define AOM_CODEC_ABI_VERSION (3 + AOM_IMAGE_ABI_VERSION) /**<\hideinitializer*/
 /*!\brief Algorithm return codes
 *
 * The numeric values are consecutive, starting at zero for success.
 */
 typedef enum {
  /*!\brief Operation completed without error */
  AOM_CODEC_OK = 0,
  /*!\brief Unspecified error */
  AOM_CODEC_ERROR = 1,
  /*!\brief Memory operation failed */
  AOM_CODEC_MEM_ERROR = 2,
  /*!\brief ABI version mismatch */
  AOM_CODEC_ABI_MISMATCH = 3,
  /*!\brief Algorithm does not have required capability */
  AOM_CODEC_INCAPABLE = 4,
  /*!\brief The given bitstream is not supported.
   *
   * The bitstream was unable to be parsed at the highest level. The decoder
   * is unable to proceed. This error \ref SHOULD be treated as fatal to the
   * stream. */
  AOM_CODEC_UNSUP_BITSTREAM = 5,
  /*!\brief Encoded bitstream uses an unsupported feature
   *
   * The decoder does not implement a feature required by the encoder. This
   * return code should only be used for features that prevent future
   * pictures from being properly decoded. This error \ref MAY be treated as
   * fatal to the stream or \ref MAY be treated as fatal to the current GOP.
   */
  AOM_CODEC_UNSUP_FEATURE = 6,
  /*!\brief The coded data for this stream is corrupt or incomplete
   *
   * There was a problem decoding the current frame.  This return code
   * should only be used for failures that prevent future pictures from
   * being properly decoded. This error \ref MAY be treated as fatal to the
   * stream or \ref MAY be treated as fatal to the current GOP. If decoding
   * is continued for the current GOP, artifacts may be present.
   */
  AOM_CODEC_CORRUPT_FRAME = 7,
  /*!\brief An application-supplied parameter is not valid. */
  AOM_CODEC_INVALID_PARAM = 8,
  /*!\brief An iterator reached the end of list. */
  AOM_CODEC_LIST_END = 9
 } aom_codec_err_t;
 /*! \brief Codec capabilities bitfield
 *
 *  A codec advertises the capabilities it supports through its
 *  ::aom_codec_iface_t interface structure. Capabilities are extra interfaces
 *  or functionality, and are not required to be supported.
 *
 *  The available flags are specified by AOM_CODEC_CAP_* defines.
 */
 typedef long aom_codec_caps_t;
 /*!\brief Is a decoder */
 #define AOM_CODEC_CAP_DECODER 0x1
 /*!\brief Is an encoder */
 #define AOM_CODEC_CAP_ENCODER 0x2
 /*! \brief Initialization-time Feature Enabling
 *
 *  Certain codec features must be known at initialization time, to allow for
 *  proper memory allocation.
 *
 *  The available flags are specified by AOM_CODEC_USE_* defines.
 */
 typedef long aom_codec_flags_t;
 /*!\brief Codec interface structure.
 *
 * Contains function pointers and other data private to the codec
 * implementation. This structure is opaque to the application.
 */
 typedef const struct aom_codec_iface aom_codec_iface_t;
 /*!\brief Codec private data structure.
 *
 * Contains data private to the codec implementation. This structure is opaque
 * to the application.
 */
 typedef struct aom_codec_priv aom_codec_priv_t;
 /*!\brief Iterator
 *
 * Opaque storage used for iterating over lists.
 */
 typedef const void *aom_codec_iter_t;
 /*!\brief Codec context structure
 *
 * All codecs \ref MUST support this context structure fully. In general,
 * this data should be considered private to the codec algorithm, and
 * not be manipulated or examined by the calling application. Applications
 * may reference the 'name' member to get a printable description of the
 * algorithm.
 */
 typedef struct aom_codec_ctx {
  const char *name;             /**< Printable interface name */
  aom_codec_iface_t *iface;     /**< Interface pointers */
  aom_codec_err_t err;          /**< Last returned error */
  const char *err_detail;       /**< Detailed info, if available */
  aom_codec_flags_t init_flags; /**< Flags passed at init time */
  union {
    /** Decoder Configuration Pointer */
    const struct aom_codec_dec_cfg *dec;
    /** Encoder Configuration Pointer */
    const struct aom_codec_enc_cfg *enc;
    /** Untyped view of the same configuration pointer */
    const void *raw;
  } config;               /**< Configuration pointer aliasing union */
  aom_codec_priv_t *priv; /**< Algorithm private storage */
 } aom_codec_ctx_t;
 /*!\brief Bit depth for codec
 *
 * Selects the bit depth of the codec. Each enumerator's numeric value is
 * the bit count it stands for.
 */
 typedef enum aom_bit_depth {
  /*!\brief  8 bits */
  AOM_BITS_8 = 8,
  /*!\brief 10 bits */
  AOM_BITS_10 = 10,
  /*!\brief 12 bits */
  AOM_BITS_12 = 12
 } aom_bit_depth_t;
 /*!\brief Superblock size selection.
 *
 * Chooses the superblock size used for encoding: fixed at 64x64 pixels,
 * fixed at 128x128 pixels, or picked dynamically by the encoder for each
 * frame.
 */
 typedef enum aom_superblock_size {
  /*!\brief Always use 64x64 superblocks. */
  AOM_SUPERBLOCK_SIZE_64X64 = 0,
  /*!\brief Always use 128x128 superblocks. */
  AOM_SUPERBLOCK_SIZE_128X128 = 1,
  /*!\brief Select superblock size dynamically. */
  AOM_SUPERBLOCK_SIZE_DYNAMIC = 2
 } aom_superblock_size_t;
 /*
 * Library Version Number Interface
 *
 * For example, see the following sample return values:
 *     aom_codec_version()           (1<<16 | 2<<8 | 3)
 *     aom_codec_version_str()       "v1.2.3-rc1-16-gec6a1ba"
 *     aom_codec_version_extra_str() "rc1-16-gec6a1ba"
 */
 /*!\brief Return the version information (as an integer)
 *
 * Returns a packed encoding of the library version number. This will only
 * include
 * the major.minor.patch component of the version number. Note that this encoded
 * value should be accessed through the macros provided, as the encoding may
 * change
 * in the future.
 *
 */
 int aom_codec_version(void);
 /* Field extractors for the packed version integer: major in bits 16..23,
 * minor in bits 8..15, patch in bits 0..7. The argument is parenthesized so
 * that an expression such as (1 << 16 | 2 << 8 | 3) is shifted as a whole
 * instead of having only its last operand shifted.
 */
 #define AOM_VERSION_MAJOR(v) \
  (((v) >> 16) & 0xff) /**< extract major from packed version */
 #define AOM_VERSION_MINOR(v) \
  (((v) >> 8) & 0xff) /**< extract minor from packed version */
 #define AOM_VERSION_PATCH(v) \
  (((v) >> 0) & 0xff) /**< extract patch from packed version */
 /*!\brief Return the version major number */
 #define aom_codec_version_major() AOM_VERSION_MAJOR(aom_codec_version())
 /*!\brief Return the version minor number */
 #define aom_codec_version_minor() AOM_VERSION_MINOR(aom_codec_version())
 /*!\brief Return the version patch number */
 #define aom_codec_version_patch() AOM_VERSION_PATCH(aom_codec_version())
 /*!\brief Return the version information (as a string)
 *
 * Returns a printable string containing the full library version number. This
 * may
 * contain additional text following the three digit version number, as to
 * indicate
 * release candidates, prerelease versions, etc.
 *
 */
 const char *aom_codec_version_str(void);
 /*!\brief Return the version information (as a string)
 *
 * Returns a printable "extra string". This is the component of the string
 * returned
 * by aom_codec_version_str() following the three digit version number.
 *
 */
 const char *aom_codec_version_extra_str(void);
 /*!\brief Return the build configuration
 *
 * Returns a printable string containing an encoded version of the build
 * configuration. This may be useful to aom support.
 *
 */
 const char *aom_codec_build_config(void);
 /*!\brief Return the name for a given interface
 *
 * Returns a human readable string for name of the given codec interface.
 *
 * \param[in]    iface     Interface pointer
 *
 */
 const char *aom_codec_iface_name(aom_codec_iface_t *iface);
 /*!\brief Convert error number to printable string
 *
 * Returns a human readable string for the last error returned by the
 * algorithm. The returned error will be one line and will not contain
 * any newline characters.
 *
 *
 * \param[in]    err     Error number.
 *
 */
 const char *aom_codec_err_to_string(aom_codec_err_t err);
 /*!\brief Retrieve error synopsis for codec context
 *
 * Returns a human readable string for the last error returned by the
 * algorithm. The returned error will be one line and will not contain
 * any newline characters.
 *
 *
 * \param[in]    ctx     Pointer to this instance's context.
 *
 */
 const char *aom_codec_error(aom_codec_ctx_t *ctx);
 /*!\brief Retrieve detailed error information for codec context
 *
 * Returns a human readable string providing detailed information about
 * the last error.
 *
 * \param[in]    ctx     Pointer to this instance's context.
 *
 * \retval NULL
 *     No detailed information is available.
 */
 const char *aom_codec_error_detail(aom_codec_ctx_t *ctx);
 /* REQUIRED FUNCTIONS
 *
 * The following functions are required to be implemented for all codecs.
 * They represent the base case functionality expected of all codecs.
 */
 /*!\brief Destroy a codec instance
 *
 * Destroys a codec context, freeing any associated memory buffers.
 *
 * \param[in] ctx   Pointer to this instance's context
 *
 * \retval #AOM_CODEC_OK
 *     The codec instance was destroyed.
 * \retval #AOM_CODEC_MEM_ERROR
 *     Memory allocation failed. NOTE(review): this wording matches the
 *     initialization functions; confirm that destroy can actually return it.
 */
 aom_codec_err_t aom_codec_destroy(aom_codec_ctx_t *ctx);
 /*!\brief Get the capabilities of an algorithm.
 *
 * Retrieves the capabilities bitfield from the algorithm's interface.
 *
 * \param[in] iface   Pointer to the algorithm interface
 *
 */
 aom_codec_caps_t aom_codec_get_caps(aom_codec_iface_t *iface);
 /*!\brief Control algorithm
 *
 * This function is used to exchange algorithm specific data with the codec
 * instance. This can be used to implement features specific to a particular
 * algorithm.
 *
 * This wrapper function dispatches the request to the helper function
 * associated with the given ctrl_id. It tries to call this function
 * transparently, but will return #AOM_CODEC_ERROR if the request could not
 * be dispatched.
 *
 * Note that this function should not be used directly. Call the
 * #aom_codec_control wrapper macro instead.
 *
 * \param[in]     ctx              Pointer to this instance's context
 * \param[in]     ctrl_id          Algorithm specific control identifier
 *
 * \retval #AOM_CODEC_OK
 *     The control request was processed.
 * \retval #AOM_CODEC_ERROR
 *     The control request was not processed.
 * \retval #AOM_CODEC_INVALID_PARAM
 *     The data was not valid.
 */
 aom_codec_err_t aom_codec_control_(aom_codec_ctx_t *ctx, int ctrl_id, ...);
 #if defined(AOM_DISABLE_CTRL_TYPECHECKS) && AOM_DISABLE_CTRL_TYPECHECKS
 /* Type checking disabled: aom_codec_control() forwards directly to the
 * variadic aom_codec_control_() and the declaration macros expand to
 * nothing. Their parameter lists mirror the type-checked variants below so
 * that the same invocations preprocess in both configurations; in
 * particular AOM_CTRL_VOID takes the control id only.
 */
 #define aom_codec_control(ctx, id, data) aom_codec_control_(ctx, id, data)
 #define AOM_CTRL_USE_TYPE(id, typ)
 #define AOM_CTRL_USE_TYPE_DEPRECATED(id, typ)
 #define AOM_CTRL_VOID(id)
 #else
 /*!\brief aom_codec_control wrapper macro
 *
 * This macro allows for type safe conversions across the variadic parameter
 * to aom_codec_control_().
 *
 * \internal
 * It works by dispatching the call to the control function through a wrapper
 * function named with the id parameter.
 */
 #define aom_codec_control(ctx, id, data) \
  aom_codec_control_##id(ctx, id, data) /**<\hideinitializer*/
 /*!\brief aom_codec_control type definition macro
 *
 * This macro allows for type safe conversions across the variadic parameter
 * to aom_codec_control_(). It defines the type of the argument for a given
 * control identifier.
 *
 * \internal
 * It defines a static function with
 * the correctly typed arguments as a wrapper to the type-unsafe internal
 * function. The function is first declared with UNUSED so that a translation
 * unit that never calls the wrapper is not flagged for it.
 */
 #define AOM_CTRL_USE_TYPE(id, typ)                                           \
  static aom_codec_err_t aom_codec_control_##id(aom_codec_ctx_t *, int, typ) \
      UNUSED;                                                                \
                                                                             \
  static aom_codec_err_t aom_codec_control_##id(aom_codec_ctx_t *ctx,        \
                                                int ctrl_id, typ data) {     \
    return aom_codec_control_(ctx, ctrl_id, data);                           \
  } /**<\hideinitializer*/
 /*!\brief aom_codec_control deprecated type definition macro
 *
 * Like #AOM_CTRL_USE_TYPE, but indicates that the specified control is
 * deprecated and should not be used. Consult the documentation for your
 * codec for more information.
 *
 * \internal
 * It defines a static function with the correctly typed arguments as a
 * wrapper to the type-unsafe internal function.
 */
 #define AOM_CTRL_USE_TYPE_DEPRECATED(id, typ)                        \
  DECLSPEC_DEPRECATED static aom_codec_err_t aom_codec_control_##id( \
      aom_codec_ctx_t *, int, typ) DEPRECATED UNUSED;                \
                                                                     \
  DECLSPEC_DEPRECATED static aom_codec_err_t aom_codec_control_##id( \
      aom_codec_ctx_t *ctx, int ctrl_id, typ data) {                 \
    return aom_codec_control_(ctx, ctrl_id, data);                   \
  } /**<\hideinitializer*/
 /*!\brief aom_codec_control void type definition macro
 *
 * This macro allows for type safe conversions across the variadic parameter
 * to aom_codec_control_(). It indicates that a given control identifier takes
 * no argument.
 *
 * \internal
 * It defines a static function without a data argument as a wrapper to the
 * type-unsafe internal function.
 */
 #define AOM_CTRL_VOID(id)                                               \
  static aom_codec_err_t aom_codec_control_##id(aom_codec_ctx_t *, int) \
      UNUSED;                                                           \
                                                                        \
  static aom_codec_err_t aom_codec_control_##id(aom_codec_ctx_t *ctx,   \
                                                int ctrl_id) {          \
    return aom_codec_control_(ctx, ctrl_id);                            \
  } /**<\hideinitializer*/
 #endif
 /*!@} - end defgroup codec*/
 #ifdef __cplusplus
 }
 #endif
 #endif  // AOM_AOM_CODEC_H_
--- a/aom/aom_codec.mk
+++ b/aom/aom_codec.mk
@@ -0,0 +1,42 @@
 ##
 ## Copyright (c) 2016, Alliance for Open Media. All rights reserved
 ##
 ## This source code is subject to the terms of the BSD 2 Clause License and
 ## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 ## was not distributed with this source code in the LICENSE file, you can
 ## obtain it at www.aomedia.org/license/software. If the Alliance for Open
 ## Media Patent License 1.0 was not distributed with this source code in the
 ## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 ##
 ## NOTE(review): the variables below are consumed by the including makefile,
 ## which is not visible here. The "-$(CONFIG_...)" suffix presumably expands
 ## to "yes" when that component is enabled, matching the unconditional
 ## "-yes" lists further down; confirm against the build system.
 API_EXPORTS += exports
 ## Headers listed for the AV1 encoder configuration.
 API_SRCS-$(CONFIG_AV1_ENCODER) += aom.h
 API_SRCS-$(CONFIG_AV1_ENCODER) += aomcx.h
 API_DOC_SRCS-$(CONFIG_AV1_ENCODER) += aom.h
 API_DOC_SRCS-$(CONFIG_AV1_ENCODER) += aomcx.h
 ## Headers listed for the AV1 decoder configuration.
 API_SRCS-$(CONFIG_AV1_DECODER) += aom.h
 API_SRCS-$(CONFIG_AV1_DECODER) += aomdx.h
 API_DOC_SRCS-$(CONFIG_AV1_DECODER) += aom.h
 API_DOC_SRCS-$(CONFIG_AV1_DECODER) += aomdx.h
 ## Documentation sources that do not depend on a codec switch.
 API_DOC_SRCS-yes += aom_codec.h
 API_DOC_SRCS-yes += aom_decoder.h
 API_DOC_SRCS-yes += aom_encoder.h
 API_DOC_SRCS-yes += aom_frame_buffer.h
 API_DOC_SRCS-yes += aom_image.h
 ## API sources and headers that do not depend on a codec switch.
 API_SRCS-yes += src/aom_decoder.c
 API_SRCS-yes += aom_decoder.h
 API_SRCS-yes += src/aom_encoder.c
 API_SRCS-yes += aom_encoder.h
 API_SRCS-yes += internal/aom_codec_internal.h
 API_SRCS-yes += src/aom_codec.c
 API_SRCS-yes += src/aom_image.c
 API_SRCS-yes += aom_codec.h
 API_SRCS-yes += aom_codec.mk
 API_SRCS-yes += aom_frame_buffer.h
 API_SRCS-yes += aom_image.h
 API_SRCS-yes += aom_integer.h
--- a/aom/aom_decoder.h
+++ b/aom/aom_decoder.h
@@ -0,0 +1,366 @@
 /*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
 #ifndef AOM_AOM_DECODER_H_
 #define AOM_AOM_DECODER_H_
 /*!\defgroup decoder Decoder Algorithm Interface
 * \ingroup codec
 * This abstraction allows applications using this decoder to easily support
 * multiple video formats with minimal code duplication. This section describes
 * the interface common to all decoders.
 * @{
 */
 /*!\file
 * \brief Describes the decoder algorithm interface to applications.
 *
 * This file describes the interface between an application and a
 * video decoder algorithm.
 *
 */
 #ifdef __cplusplus
 extern "C" {
 #endif
 #include "./aom_codec.h"
 #include "./aom_frame_buffer.h"
 /*!\brief Current ABI version number
 *
 * \internal
 * If this file is altered in any way that changes the ABI, this value
 * must be bumped.  Examples include, but are not limited to, changing
 * types, removing or reassigning enums, adding/removing/rearranging
 * fields to structures
 *
 * The value is expressed relative to AOM_CODEC_ABI_VERSION, so a change to
 * that constant changes this one as well.
 */
 #define AOM_DECODER_ABI_VERSION \
  (3 + AOM_CODEC_ABI_VERSION) /**<\hideinitializer*/
 /*! \brief Decoder capabilities bitfield
 *
 *  Each decoder advertises the capabilities it supports as part of its
 *  ::aom_codec_iface_t interface structure. Capabilities are extra interfaces
 *  or functionality, and are not required to be supported by a decoder.
 *
 *  The available flags are specified by AOM_CODEC_CAP_* defines.
 */
 /*!\brief Will issue put_slice callbacks */
 #define AOM_CODEC_CAP_PUT_SLICE 0x10000
 /*!\brief Will issue put_frame callbacks */
 #define AOM_CODEC_CAP_PUT_FRAME 0x20000
 /*!\brief Can postprocess decoded frame */
 #define AOM_CODEC_CAP_POSTPROC 0x40000
 /*!\brief Can conceal errors due to packet loss */
 #define AOM_CODEC_CAP_ERROR_CONCEALMENT 0x80000
 /*!\brief Can receive encoded frames one fragment at a time */
 #define AOM_CODEC_CAP_INPUT_FRAGMENTS 0x100000
 /*!\brief Can support frame-based multi-threading */
 #define AOM_CODEC_CAP_FRAME_THREADING 0x200000
 /*!\brief Can support external frame buffers */
 #define AOM_CODEC_CAP_EXTERNAL_FRAME_BUFFER 0x400000
 /*! \brief Initialization-time Feature Enabling
 *
 *  Certain codec features must be known at initialization time, to allow for
 *  proper memory allocation.
 *
 *  The available flags are specified by AOM_CODEC_USE_* defines.
 */
 /*!\brief Postprocess decoded frame */
 #define AOM_CODEC_USE_POSTPROC 0x10000
 /*!\brief Conceal errors in decoded frames */
 #define AOM_CODEC_USE_ERROR_CONCEALMENT 0x20000
 /*!\brief The input frame should be passed to the decoder one fragment at a
 * time */
 #define AOM_CODEC_USE_INPUT_FRAGMENTS 0x40000
 /*!\brief Enable frame-based multi-threading */
 #define AOM_CODEC_USE_FRAME_THREADING 0x80000
 /*!\brief Stream properties
 *
 * Used to query or set properties of the decoded stream. An algorithm may
 * extend the structure with data specific to its bitstream by setting the
 * sz member appropriately.
 */
 typedef struct aom_codec_stream_info {
  /*!\brief Size of this structure */
  unsigned int sz;
  /*!\brief Width (or 0 for unknown/default) */
  unsigned int w;
  /*!\brief Height (or 0 for unknown/default) */
  unsigned int h;
  /*!\brief Current frame is a keyframe */
  unsigned int is_kf;
 } aom_codec_stream_info_t;
 /* REQUIRED FUNCTIONS
 *
 * The following functions are required to be implemented for all decoders.
 * They represent the base case functionality expected of all decoders.
 */
 /*!\brief Initialization Configurations
 *
 * Carries the init time configuration options that are handed to the
 * decoder.
 */
 typedef struct aom_codec_dec_cfg {
  /*!\brief Maximum number of threads to use, default 1 */
  unsigned int threads;
  /*!\brief Width */
  unsigned int w;
  /*!\brief Height */
  unsigned int h;
 } aom_codec_dec_cfg_t; /**< alias for struct aom_codec_dec_cfg */
 /*!\brief Initialize a decoder instance
 *
 * Initializes a decoder context using the given interface. Applications
 * should call the aom_codec_dec_init convenience macro instead of this
 * function directly, to ensure that the ABI version number parameter
 * is properly initialized.
 *
 * If the library was configured with --disable-multithread, this call
 * is not thread safe and should be guarded with a lock if being used
 * in a multithreaded context.
 *
 * \param[in]    ctx     Pointer to this instance's context.
 * \param[in]    iface   Pointer to the algorithm interface to use.
 * \param[in]    cfg     Configuration to use, if known. May be NULL.
 * \param[in]    flags   Bitfield of AOM_CODEC_USE_* flags
 * \param[in]    ver     ABI version number. Must be set to
 *                       AOM_DECODER_ABI_VERSION
 * \retval #AOM_CODEC_OK
 *     The decoder algorithm initialized.
 * \retval #AOM_CODEC_MEM_ERROR
 *     Memory allocation failed.
 */
 aom_codec_err_t aom_codec_dec_init_ver(aom_codec_ctx_t *ctx,
                                       aom_codec_iface_t *iface,
                                       const aom_codec_dec_cfg_t *cfg,
                                       aom_codec_flags_t flags, int ver);
 /*!\brief Convenience macro for aom_codec_dec_init_ver()
 *
 * Ensures the ABI version parameter is properly set.
 */
 #define aom_codec_dec_init(ctx, iface, cfg, flags) \
  aom_codec_dec_init_ver(ctx, iface, cfg, flags, AOM_DECODER_ABI_VERSION)
 /*!\brief Parse stream info from a buffer
 *
 * Performs high level parsing of the bitstream. Construction of a decoder
 * context is not necessary. Can be used to determine if the bitstream is
 * of the proper format, and to extract information from the stream.
 *
 * \param[in]      iface   Pointer to the algorithm interface
 * \param[in]      data    Pointer to a block of data to parse
 * \param[in]      data_sz Size of the data buffer
 * \param[in,out]  si      Pointer to stream info to update. The size member
 *                         \ref MUST be properly initialized, but \ref MAY be
 *                         clobbered by the algorithm. This parameter \ref MAY
 *                         be NULL.
 *
 * \retval #AOM_CODEC_OK
 *     Bitstream is parsable and stream information updated
 */
 aom_codec_err_t aom_codec_peek_stream_info(aom_codec_iface_t *iface,
                                           const uint8_t *data,
                                           unsigned int data_sz,
                                           aom_codec_stream_info_t *si);
 /*!\brief Return information about the current stream.
 *
 * Returns information about the stream that has been parsed during decoding.
 *
 * \param[in]      ctx     Pointer to this instance's context
 * \param[in,out]  si      Pointer to stream info to update. The size member
 *                         \ref MUST be properly initialized, but \ref MAY be
 *                         clobbered by the algorithm. This parameter \ref MAY
 *                         be NULL.
 *
 * \retval #AOM_CODEC_OK
 *     Bitstream is parsable and stream information updated
 */
 aom_codec_err_t aom_codec_get_stream_info(aom_codec_ctx_t *ctx,
                                          aom_codec_stream_info_t *si);
 /*!\brief Decode data
 *
 * Processes a buffer of coded data. If the processing results in a new
 * decoded frame becoming available, PUT_SLICE and PUT_FRAME events may be
 * generated, as appropriate. Encoded data \ref MUST be passed in DTS (decode
 * time stamp) order. Frames produced will always be in PTS (presentation
 * time stamp) order.
 * If the decoder is configured with AOM_CODEC_USE_INPUT_FRAGMENTS enabled,
 * data and data_sz can contain a fragment of the encoded frame. Fragment
 * \#n must contain at least partition \#n, but can also contain subsequent
 * partitions (\#n+1 - \#n+i), and if so, fragments \#n+1, .., \#n+i must
 * be empty. When no more data is available, this function should be called
 * with NULL as data and 0 as data_sz. The memory passed to this function
 * must be available until the frame has been decoded.
 *
 * \param[in] ctx          Pointer to this instance's context
 * \param[in] data         Pointer to this block of new coded data. If
 *                         NULL, an AOM_CODEC_CB_PUT_FRAME event is posted
 *                         for the previously decoded frame.
 * \param[in] data_sz      Size of the coded data, in bytes.
 * \param[in] user_priv    Application specific data to associate with
 *                         this frame.
 * \param[in] deadline     Soft deadline the decoder should attempt to meet,
 *                         in us. Set to zero for unlimited.
 *
 * \return Returns #AOM_CODEC_OK if the coded data was processed completely
 *         and future pictures can be decoded without error. Otherwise,
 *         see the descriptions of the other error codes in ::aom_codec_err_t
 *         for recoverability capabilities.
 */
 aom_codec_err_t aom_codec_decode(aom_codec_ctx_t *ctx, const uint8_t *data,
                                 unsigned int data_sz, void *user_priv,
                                 long deadline);
 /*!\brief Decoded frames iterator
 *
 * Iterates over a list of the frames available for display. The iterator
 * storage should be initialized to NULL to start the iteration. Iteration is
 * complete when this function returns NULL.
 *
 * The list of available frames becomes valid upon completion of the
 * aom_codec_decode call, and remains valid until the next call to
 * aom_codec_decode.
 *
 * \param[in]     ctx      Pointer to this instance's context
 * \param[in,out] iter     Iterator storage, initialized to NULL
 *
 * \return Returns a pointer to an image, if one is ready for display. Frames
 *         produced will always be in PTS (presentation time stamp) order.
 */
 aom_image_t *aom_codec_get_frame(aom_codec_ctx_t *ctx, aom_codec_iter_t *iter);
 /*!\defgroup cap_put_frame Frame-Based Decoding Functions
 *
 * The following functions are required to be implemented for all decoders
 * that advertise the AOM_CODEC_CAP_PUT_FRAME capability. Calling these
 * functions
 * for codecs that don't advertise this capability will result in an error
 * code being returned, usually AOM_CODEC_ERROR.
 * @{
 */
 /*!\brief put frame callback prototype
 *
 * This callback is invoked by the decoder to notify the application of
 * the availability of decoded image data.
 */
 typedef void (*aom_codec_put_frame_cb_fn_t)(void *user_priv,
                                            const aom_image_t *img);
 /*!\brief Register for notification of frame completion.
 *
 * Registers a given function to be called when a decoded frame is
 * available.
 *
 * \param[in] ctx          Pointer to this instance's context
 * \param[in] cb           Pointer to the callback function
 * \param[in] user_priv    User's private data
 *
 * \retval #AOM_CODEC_OK
 *     Callback successfully registered.
 * \retval #AOM_CODEC_ERROR
 *     Decoder context not initialized, or algorithm not capable of
 *     posting frame completion.
 */
 aom_codec_err_t aom_codec_register_put_frame_cb(aom_codec_ctx_t *ctx,
                                                aom_codec_put_frame_cb_fn_t cb,
                                                void *user_priv);
 /*!@} - end defgroup cap_put_frame */
 /*!\defgroup cap_put_slice Slice-Based Decoding Functions
 *
 * The following functions are required to be implemented for all decoders
 * that advertise the AOM_CODEC_CAP_PUT_SLICE capability. Calling these
 * functions
 * for codecs that don't advertise this capability will result in an error
 * code being returned, usually AOM_CODEC_ERROR.
 * @{
 */
 /*!\brief put slice callback prototype
 *
 * This callback is invoked by the decoder to notify the application of
 * the availability of partially decoded image data.
 */
 typedef void (*aom_codec_put_slice_cb_fn_t)(void *user_priv,
                                            const aom_image_t *img,
                                            const aom_image_rect_t *valid,
                                            const aom_image_rect_t *update);
 /*!\brief Register for notification of slice completion.
 *
 * Registers a given function to be called when a decoded slice is
 * available.
 *
 * \param[in] ctx          Pointer to this instance's context
 * \param[in] cb           Pointer to the callback function
 * \param[in] user_priv    User's private data
 *
 * \retval #AOM_CODEC_OK
 *     Callback successfully registered.
 * \retval #AOM_CODEC_ERROR
 *     Decoder context not initialized, or algorithm not capable of
 *     posting slice completion.
 */
 aom_codec_err_t aom_codec_register_put_slice_cb(aom_codec_ctx_t *ctx,
                                                aom_codec_put_slice_cb_fn_t cb,
                                                void *user_priv);
 /*!@} - end defgroup cap_put_slice*/
 /*!\defgroup cap_external_frame_buffer External Frame Buffer Functions
 *
 * The following section is required to be implemented for all decoders
 * that advertise the AOM_CODEC_CAP_EXTERNAL_FRAME_BUFFER capability.
 * Calling this function for codecs that don't advertise this capability
 * will result in an error code being returned, usually AOM_CODEC_ERROR.
 *
 * \note
 * Currently this only works with AV1.
 * @{
 */
 /*!\brief Pass in external frame buffers for the decoder to use.
 *
 * Registers functions to be called when libaom needs a frame buffer
 * to decode the current frame and a function to be called when libaom does
 * not internally reference the frame buffer. This set function must
 * be called before the first call to decode or libaom will assume the
 * default behavior of allocating frame buffers internally.
 *
 * \param[in] ctx          Pointer to this instance's context
 * \param[in] cb_get       Pointer to the get callback function
 * \param[in] cb_release   Pointer to the release callback function
 * \param[in] cb_priv      Callback's private data
 *
 * \retval #AOM_CODEC_OK
 *     External frame buffers will be used by libaom.
 * \retval #AOM_CODEC_INVALID_PARAM
 *     One or more of the callbacks were NULL.
 * \retval #AOM_CODEC_ERROR
 *     Decoder context not initialized, or algorithm not capable of
 *     using external frame buffers.
 *
 * \note
 * When decoding AV1, the application may be required to pass in at least
 * #AOM_MAXIMUM_WORK_BUFFERS external frame
 * buffers.
 */
 aom_codec_err_t aom_codec_set_frame_buffer_functions(
    aom_codec_ctx_t *ctx, aom_get_frame_buffer_cb_fn_t cb_get,
    aom_release_frame_buffer_cb_fn_t cb_release, void *cb_priv);
 /*!@} - end defgroup cap_external_frame_buffer */
 /*!@} - end defgroup decoder*/
 #ifdef __cplusplus
 }
 #endif
 #endif  // AOM_AOM_DECODER_H_
--- a/aom/aom_encoder.h
+++ b/aom/aom_encoder.h
@@ -0,0 +1,837 @@
 /*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
 #ifndef AOM_AOM_ENCODER_H_
 #define AOM_AOM_ENCODER_H_
 /*!\defgroup encoder Encoder Algorithm Interface
 * \ingroup codec
 * This abstraction allows applications using this encoder to easily support
 * multiple video formats with minimal code duplication. This section describes
 * the interface common to all encoders.
 * @{
 */
 /*!\file
 * \brief Describes the encoder algorithm interface to applications.
 *
 * This file describes the interface between an application and a
 * video encoder algorithm.
 *
 */
 #ifdef __cplusplus
 extern "C" {
 #endif
 #include "./aom_codec.h"
 /*!\brief Current ABI version number
 *
 * \internal
 * If this file is altered in any way that changes the ABI, this value
 * must be bumped.  Examples include, but are not limited to, changing
 * types, removing or reassigning enums, adding/removing/rearranging
 * fields to structures
 */
 #define AOM_ENCODER_ABI_VERSION \
  (5 + AOM_CODEC_ABI_VERSION) /**<\hideinitializer*/
 /*! \brief Encoder capabilities bitfield
 *
 *  Each encoder advertises the capabilities it supports as part of its
 *  ::aom_codec_iface_t interface structure. Capabilities are extra
 *  interfaces or functionality, and are not required to be supported
 *  by an encoder.
 *
 *  The available flags are specified by AOM_CODEC_CAP_* defines.
 */
 #define AOM_CODEC_CAP_PSNR 0x10000 /**< Can issue PSNR packets */
 /*! Can output one partition at a time. Each partition is returned in its
 *  own AOM_CODEC_CX_FRAME_PKT, with the FRAME_IS_FRAGMENT flag set for
 *  every partition but the last. In this mode all frames are always
 *  returned partition by partition.
 */
 #define AOM_CODEC_CAP_OUTPUT_PARTITION 0x20000
 /*! Can support input images at greater than 8 bitdepth.
 */
 #define AOM_CODEC_CAP_HIGHBITDEPTH 0x40000
 /*! \brief Initialization-time Feature Enabling
 *
 *  Certain codec features must be known at initialization time, to allow
 *  for proper memory allocation.
 *
 *  The available flags are specified by AOM_CODEC_USE_* defines.
 */
 #define AOM_CODEC_USE_PSNR 0x10000 /**< Calculate PSNR on each frame */
 /*!\brief Make the encoder output one partition at a time. */
 #define AOM_CODEC_USE_OUTPUT_PARTITION 0x20000
 #define AOM_CODEC_USE_HIGHBITDEPTH 0x40000 /**< Use high bitdepth */
 /*!\brief Generic fixed size buffer structure
 *
 * This structure is able to hold a reference to any fixed size buffer.
 */
 typedef struct aom_fixed_buf {
  void *buf;       /**< Pointer to the data */
  size_t sz;       /**< Length of the buffer, in chars */
 } aom_fixed_buf_t; /**< alias for struct aom_fixed_buf */
 /*!\brief Time Stamp Type
 *
 * An integer, which when multiplied by the stream's time base, provides
 * the absolute time of a sample.
 */
 typedef int64_t aom_codec_pts_t;
 /*!\brief Compressed Frame Flags
 *
 * This type represents a bitfield containing information about a compressed
 * frame that may be useful to an application. The most significant 16 bits
 * can be used by an algorithm to provide additional detail, for example to
 * support frame types that are codec specific (MPEG-1 D-frames for example)
 */
 typedef uint32_t aom_codec_frame_flags_t;
 #define AOM_FRAME_IS_KEY 0x1 /**< frame is the start of a GOP */
 /*!\brief frame can be dropped without affecting the stream (no future frame
 * depends on this one) */
 #define AOM_FRAME_IS_DROPPABLE 0x2
 /*!\brief frame should be decoded but will not be shown */
 #define AOM_FRAME_IS_INVISIBLE 0x4
 /*!\brief this is a fragment of the encoded frame */
 #define AOM_FRAME_IS_FRAGMENT 0x8
 /*!\brief Error Resilient flags
 *
 * These flags define which error resilient features to enable in the
 * encoder. The flags are specified through the
 * aom_codec_enc_cfg::g_error_resilient variable.
 */
 typedef uint32_t aom_codec_er_flags_t;
 /*!\brief Improve resiliency against losses of whole frames */
 #define AOM_ERROR_RESILIENT_DEFAULT 0x1
 /*!\brief The frame partitions are independently decodable by the bool decoder,
 * meaning that partitions can be decoded even though earlier partitions have
 * been lost. Note that intra prediction is still done over the partition
 * boundary. */
 #define AOM_ERROR_RESILIENT_PARTITIONS 0x2
 /*!\brief Encoder output packet variants
 *
 * This enumeration lists the different kinds of data packets that can be
 * returned by calls to aom_codec_get_cx_data(). Algorithms \ref MAY
 * extend this list to provide additional functionality.
 */
 enum aom_codec_cx_pkt_kind {
  AOM_CODEC_CX_FRAME_PKT,    /**< Compressed video frame */
  AOM_CODEC_STATS_PKT,       /**< Two-pass statistics for this frame */
  AOM_CODEC_FPMB_STATS_PKT,  /**< First pass mb statistics for this frame */
  AOM_CODEC_PSNR_PKT,        /**< PSNR statistics for this frame */
  AOM_CODEC_CUSTOM_PKT = 256 /**< Algorithm extensions */
 };
 /*!\brief Encoder output packet
 *
 * This structure contains the different kinds of output data the encoder
 * may produce while compressing a frame; \c kind identifies the variant.
 */
 typedef struct aom_codec_cx_pkt {
  enum aom_codec_cx_pkt_kind kind; /**< packet variant */
  union {
    struct {
      void *buf; /**< compressed data buffer */
      size_t sz; /**< length of compressed data */
      /*!\brief time stamp to show frame (in timebase units) */
      aom_codec_pts_t pts;
      /*!\brief duration to show frame (in timebase units) */
      unsigned long duration;
      aom_codec_frame_flags_t flags; /**< flags for this frame */
      /*!\brief the partition id defines the decoding order of the partitions.
       * Only applicable when "output partition" mode is enabled. First
       * partition has id 0.*/
      int partition_id;
    } frame;                            /**< data for compressed frame packet */
    aom_fixed_buf_t twopass_stats;      /**< data for two-pass packet */
    aom_fixed_buf_t firstpass_mb_stats; /**< data for first pass mb packet */
    struct aom_psnr_pkt {
      unsigned int samples[4]; /**< Number of samples, total/y/u/v */
      uint64_t sse[4];         /**< sum squared error, total/y/u/v */
      double psnr[4];          /**< PSNR, total/y/u/v */
    } psnr;                    /**< data for PSNR packet */
    aom_fixed_buf_t raw;       /**< data for arbitrary packets */
    /* The pad member reserves a fixed amount of space in this union so
     * codecs can extend this interface without having to manage storage
     * for raw packets: a custom packet smaller than 128 bytes can be
     * stored in the packet list directly.
     */
    char pad[128 - sizeof(enum aom_codec_cx_pkt_kind)]; /**< fixed sz */
  } data;                                               /**< packet data */
 } aom_codec_cx_pkt_t; /**< alias for struct aom_codec_cx_pkt */
 /*!\brief Rational Number
 *
 * This structure holds a fractional value.
 */
 typedef struct aom_rational {
  int num;        /**< fraction numerator */
  int den;        /**< fraction denominator */
 } aom_rational_t; /**< alias for struct aom_rational */
 /*!\brief Multi-pass Encoding Pass */
 enum aom_enc_pass {
  AOM_RC_ONE_PASS,   /**< Single pass mode */
  AOM_RC_FIRST_PASS, /**< First pass of multi-pass mode */
  AOM_RC_LAST_PASS   /**< Final pass of multi-pass mode */
 };
 /*!\brief Rate control mode */
 enum aom_rc_mode {
  AOM_VBR, /**< Variable Bit Rate (VBR) mode */
  AOM_CBR, /**< Constant Bit Rate (CBR) mode */
  AOM_CQ,  /**< Constrained Quality (CQ) mode */
  AOM_Q,   /**< Constant Quality (Q) mode */
 };
 /*!\brief Keyframe placement mode.
 *
 * This enumeration determines whether keyframes are placed automatically by
 * the encoder or whether this behavior is disabled. Older releases of this
 * SDK were implemented such that AOM_KF_FIXED meant keyframes were disabled.
 * That name is confusing, so new code should use AOM_KF_AUTO and
 * AOM_KF_DISABLED (which has the same value, 0, as AOM_KF_FIXED).
 */
 enum aom_kf_mode {
  AOM_KF_FIXED,       /**< deprecated, implies AOM_KF_DISABLED */
  AOM_KF_AUTO,        /**< Encoder determines optimal placement automatically */
  AOM_KF_DISABLED = 0 /**< Encoder does not place keyframes. */
 };
 /*!\brief Encoded Frame Flags
 *
 * This type indicates a bitfield to be passed to aom_codec_encode(), defining
 * per-frame boolean values. By convention, bits common to all codecs will be
 * named AOM_EFLAG_*, and bits specific to an algorithm will be named
 * /algo/_eflag_*. The lower order 16 bits are reserved for common use.
 */
 typedef long aom_enc_frame_flags_t;
 #define AOM_EFLAG_FORCE_KF (1 << 0) /**< Force this frame to be a keyframe */
 /*!\brief Encoder configuration structure
 *
 * This structure contains the encoder settings that have common representations
 * across all codecs. This doesn't imply that all codecs support all features,
 * however.
 */
 typedef struct aom_codec_enc_cfg {
  /*
   * generic settings (g)
   */
  /*!\brief Algorithm specific "usage" value
   *
   * Algorithms may define multiple values for usage, which may convey the
   * intent of how the application intends to use the stream. If this value
   * is non-zero, consult the documentation for the codec to determine its
   * meaning.
   */
  unsigned int g_usage;
  /*!\brief Maximum number of threads to use
   *
   * For multi-threaded implementations, use no more than this number of
   * threads. The codec may use fewer threads than allowed. The value
   * 0 is equivalent to the value 1.
   */
  unsigned int g_threads;
  /*!\brief Bitstream profile to use
   *
   * Some codecs support a notion of multiple bitstream profiles. Typically
   * this maps to a set of features that are turned on or off. Often the
   * profile to use is determined by the features of the intended decoder.
   * Consult the documentation for the codec to determine the valid values
   * for this parameter, or set to zero for a sane default.
   */
  unsigned int g_profile; /**< profile of bitstream to use */
  /*!\brief Width of the frame
   *
   * This value identifies the presentation resolution of the frame,
   * in pixels. Note that the frames passed as input to the encoder must
   * have this resolution. Frames will be presented by the decoder in this
   * resolution, independent of any spatial resampling the encoder may do.
   */
  unsigned int g_w;
  /*!\brief Height of the frame
   *
   * This value identifies the presentation resolution of the frame,
   * in pixels. Note that the frames passed as input to the encoder must
   * have this resolution. Frames will be presented by the decoder in this
   * resolution, independent of any spatial resampling the encoder may do.
   */
  unsigned int g_h;
  /*!\brief Bit-depth of the codec
   *
   * This value identifies the bit_depth of the codec,
   * Only certain bit-depths are supported as identified in the
   * aom_bit_depth_t enum.
   */
  aom_bit_depth_t g_bit_depth;
  /*!\brief Bit-depth of the input frames
   *
   * This value identifies the bit_depth of the input frames in bits.
   * Note that the frames passed as input to the encoder must have
   * this bit-depth.
   */
  unsigned int g_input_bit_depth;
  /*!\brief Stream timebase units
   *
   * Indicates the smallest interval of time, in seconds, used by the stream.
   * For fixed frame rate material, or variable frame rate material where
   * frames are timed at a multiple of a given clock (ex: video capture),
   * the \ref RECOMMENDED method is to set the timebase to the reciprocal
   * of the frame rate (ex: 1001/30000 for 29.970 Hz NTSC). This allows the
   * pts to correspond to the frame number, which can be handy. For
   * re-encoding video from containers with absolute time timestamps, the
   * \ref RECOMMENDED method is to set the timebase to that of the parent
   * container or multimedia framework (ex: 1/1000 for ms, as in FLV).
   */
  struct aom_rational g_timebase;
  /*!\brief Enable error resilient modes.
   *
   * The error resilient bitfield indicates to the encoder which features
   * it should enable to take measures for streaming over lossy or noisy
   * links.
   */
  aom_codec_er_flags_t g_error_resilient;
  /*!\brief Multi-pass Encoding Mode
   *
   * This value should be set to the current phase for multi-pass encoding.
   * For single pass, set to #AOM_RC_ONE_PASS.
   */
  enum aom_enc_pass g_pass;
  /*!\brief Allow lagged encoding
   *
   * If set, this value allows the encoder to consume a number of input
   * frames before producing output frames. This allows the encoder to
   * base decisions for the current frame on future frames. This does
   * increase the latency of the encoding pipeline, so it is not appropriate
   * in all situations (ex: realtime encoding).
   *
   * Note that this is a maximum value -- the encoder may produce frames
   * sooner than the given limit. Set this value to 0 to disable this
   * feature.
   */
  unsigned int g_lag_in_frames;
  /*
   * rate control settings (rc)
   */
  /*!\brief Temporal resampling configuration, if supported by the codec.
   *
   * Temporal resampling allows the codec to "drop" frames as a strategy to
   * meet its target data rate. This can cause temporal discontinuities in
   * the encoded video, which may appear as stuttering during playback. This
   * trade-off is often acceptable, but for many applications is not. It can
   * be disabled in these cases.
   *
   * Note that not all codecs support this feature. All aom AVx codecs do.
   * For other codecs, consult the documentation for that algorithm.
   *
   * This threshold is described as a percentage of the target data buffer.
   * When the data buffer falls below this percentage of fullness, a
   * dropped frame is indicated. Set the threshold to zero (0) to disable
   * this feature.
   */
  unsigned int rc_dropframe_thresh;
  /*!\brief Enable/disable spatial resampling, if supported by the codec.
   *
   * Spatial resampling allows the codec to compress a lower resolution
   * version of the frame, which is then upscaled by the decoder to the
   * correct presentation resolution. This increases visual quality at
   * low data rates, at the expense of CPU time on the encoder/decoder.
   */
  unsigned int rc_resize_allowed;
  /*!\brief Internal coded frame width.
   *
   * If spatial resampling is enabled this specifies the width of the
   * encoded frame.
   */
  unsigned int rc_scaled_width;
  /*!\brief Internal coded frame height.
   *
   * If spatial resampling is enabled this specifies the height of the
   * encoded frame.
   */
  unsigned int rc_scaled_height;
  /*!\brief Spatial resampling up watermark.
   *
   * This threshold is described as a percentage of the target data buffer.
   * When the data buffer rises above this percentage of fullness, the
   * encoder will step up to a higher resolution version of the frame.
   */
  unsigned int rc_resize_up_thresh;
  /*!\brief Spatial resampling down watermark.
   *
   * This threshold is described as a percentage of the target data buffer.
   * When the data buffer falls below this percentage of fullness, the
   * encoder will step down to a lower resolution version of the frame.
   */
  unsigned int rc_resize_down_thresh;
  /*!\brief Rate control algorithm to use.
   *
   * Indicates whether the end usage of this stream is to be streamed over
   * a bandwidth constrained link, indicating that Constant Bit Rate (CBR)
   * mode should be used, or whether it will be played back on a high
   * bandwidth link, as from a local disk, where higher variations in
   * bitrate are acceptable.
   */
  enum aom_rc_mode rc_end_usage;
  /*!\brief Two-pass stats buffer.
   *
   * A buffer containing all of the stats packets produced in the first
   * pass, concatenated.
   */
  aom_fixed_buf_t rc_twopass_stats_in;
  /*!\brief first pass mb stats buffer.
   *
   * A buffer containing all of the first pass mb stats packets produced
   * in the first pass, concatenated.
   */
  aom_fixed_buf_t rc_firstpass_mb_stats_in;
  /*!\brief Target data rate
   *
   * Target bandwidth to use for this stream, in kilobits per second.
   */
  unsigned int rc_target_bitrate;
  /*
   * quantizer settings
   */
  /*!\brief Minimum (Best Quality) Quantizer
   *
   * The quantizer is the most direct control over the quality of the
   * encoded image. The range of valid values for the quantizer is codec
   * specific. Consult the documentation for the codec to determine the
   * values to use. To determine the range programmatically, call
   * aom_codec_enc_config_default() with a usage value of 0.
   */
  unsigned int rc_min_quantizer;
  /*!\brief Maximum (Worst Quality) Quantizer
   *
   * The quantizer is the most direct control over the quality of the
   * encoded image. The range of valid values for the quantizer is codec
   * specific. Consult the documentation for the codec to determine the
   * values to use. To determine the range programmatically, call
   * aom_codec_enc_config_default() with a usage value of 0.
   */
  unsigned int rc_max_quantizer;
  /*
   * bitrate tolerance
   */
  /*!\brief Rate control adaptation undershoot control
   *
   * This value, expressed as a percentage of the target bitrate,
   * controls the maximum allowed adaptation speed of the codec.
   * This factor controls the maximum amount of bits that can
   * be subtracted from the target bitrate in order to compensate
   * for prior overshoot.
   *
   * Valid values in the range 0-1000.
   */
  unsigned int rc_undershoot_pct;
  /*!\brief Rate control adaptation overshoot control
   *
   * This value, expressed as a percentage of the target bitrate,
   * controls the maximum allowed adaptation speed of the codec.
   * This factor controls the maximum amount of bits that can
   * be added to the target bitrate in order to compensate for
   * prior undershoot.
   *
   * Valid values in the range 0-1000.
   */
  unsigned int rc_overshoot_pct;
  /*
   * decoder buffer model parameters
   */
  /*!\brief Decoder Buffer Size
   *
   * This value indicates the amount of data that may be buffered by the
   * decoding application. Note that this value is expressed in units of
   * time (milliseconds). For example, a value of 5000 indicates that the
   * client will buffer (at least) 5000ms worth of encoded data. Use the
   * target bitrate (#rc_target_bitrate) to convert to bits/bytes, if
   * necessary.
   */
  unsigned int rc_buf_sz;
  /*!\brief Decoder Buffer Initial Size
   *
   * This value indicates the amount of data that will be buffered by the
   * decoding application prior to beginning playback. This value is
   * expressed in units of time (milliseconds). Use the target bitrate
   * (#rc_target_bitrate) to convert to bits/bytes, if necessary.
   */
  unsigned int rc_buf_initial_sz;
  /*!\brief Decoder Buffer Optimal Size
   *
   * This value indicates the amount of data that the encoder should try
   * to maintain in the decoder's buffer. This value is expressed in units
   * of time (milliseconds). Use the target bitrate (#rc_target_bitrate)
   * to convert to bits/bytes, if necessary.
   */
  unsigned int rc_buf_optimal_sz;
  /*
   * 2 pass rate control parameters
   */
  /*!\brief Two-pass mode CBR/VBR bias
   *
   * Bias, expressed on a scale of 0 to 100, for determining target size
   * for the current frame. The value 0 indicates the optimal CBR mode
   * value should be used. The value 100 indicates the optimal VBR mode
   * value should be used. Values in between indicate which way the
   * encoder should "lean."
   */
  unsigned int rc_2pass_vbr_bias_pct;
  /*!\brief Two-pass mode per-GOP minimum bitrate
   *
   * This value, expressed as a percentage of the target bitrate, indicates
   * the minimum bitrate to be used for a single GOP (aka "section")
   */
  unsigned int rc_2pass_vbr_minsection_pct;
  /*!\brief Two-pass mode per-GOP maximum bitrate
   *
   * This value, expressed as a percentage of the target bitrate, indicates
   * the maximum bitrate to be used for a single GOP (aka "section")
   */
  unsigned int rc_2pass_vbr_maxsection_pct;
  /*
   * keyframing settings (kf)
   */
  /*!\brief Keyframe placement mode
   *
   * This value indicates whether the encoder should place keyframes at a
   * fixed interval, or determine the optimal placement automatically
   * (as governed by the #kf_min_dist and #kf_max_dist parameters)
   */
  enum aom_kf_mode kf_mode;
  /*!\brief Keyframe minimum interval
   *
   * This value, expressed as a number of frames, prevents the encoder from
   * placing a keyframe nearer than kf_min_dist to the previous keyframe. At
   * least kf_min_dist non-keyframe frames will be coded before the next
   * keyframe. Set kf_min_dist equal to kf_max_dist for a fixed interval.
   */
  unsigned int kf_min_dist;
  /*!\brief Keyframe maximum interval
   *
   * This value, expressed as a number of frames, forces the encoder to code
   * a keyframe if one has not been coded in the last kf_max_dist frames.
   * A value of 0 implies all frames will be keyframes. Set kf_min_dist
   * equal to kf_max_dist for a fixed interval.
   */
  unsigned int kf_max_dist;
 } aom_codec_enc_cfg_t; /**< alias for struct aom_codec_enc_cfg */
 /*!\brief Initialize an encoder instance
 *
 * Initializes an encoder context using the given interface. Applications
 * should call the aom_codec_enc_init convenience macro instead of this
 * function directly, to ensure that the ABI version number parameter
 * is properly initialized.
 *
 * If the library was configured with --disable-multithread, this call
 * is not thread safe and should be guarded with a lock if being used
 * in a multithreaded context.
 *
 * \param[in]    ctx     Pointer to this instance's context.
 * \param[in]    iface   Pointer to the algorithm interface to use.
 * \param[in]    cfg     Configuration to use, if known. May be NULL.
 * \param[in]    flags   Bitfield of AOM_CODEC_USE_* flags
 * \param[in]    ver     ABI version number. Must be set to
 *                       AOM_ENCODER_ABI_VERSION
 * \retval #AOM_CODEC_OK
 *     The encoder algorithm initialized.
 * \retval #AOM_CODEC_MEM_ERROR
 *     Memory allocation failed.
 */
 aom_codec_err_t aom_codec_enc_init_ver(aom_codec_ctx_t *ctx,
                                       aom_codec_iface_t *iface,
                                       const aom_codec_enc_cfg_t *cfg,
                                       aom_codec_flags_t flags, int ver);
 /*!\brief Convenience macro for aom_codec_enc_init_ver()
 *
 * Ensures the ABI version parameter is properly set.
 */
 #define aom_codec_enc_init(ctx, iface, cfg, flags) \
  aom_codec_enc_init_ver(ctx, iface, cfg, flags, AOM_ENCODER_ABI_VERSION)
 /*!\brief Initialize multi-encoder instance
 *
 * Initializes multi-encoder context using the given interface.
 * Applications should call the aom_codec_enc_init_multi convenience macro
 * instead of this function directly, to ensure that the ABI version number
 * parameter is properly initialized.
 *
 * \param[in]    ctx     Pointer to this instance's context.
 * \param[in]    iface   Pointer to the algorithm interface to use.
 * \param[in]    cfg     Configuration to use, if known. May be NULL.
 * \param[in]    num_enc Total number of encoders.
 * \param[in]    flags   Bitfield of AOM_CODEC_USE_* flags
 * \param[in]    dsf     Pointer to down-sampling factors.
 * \param[in]    ver     ABI version number. Must be set to
 *                       AOM_ENCODER_ABI_VERSION
 * \retval #AOM_CODEC_OK
 *     The encoder algorithm initialized.
 * \retval #AOM_CODEC_MEM_ERROR
 *     Memory allocation failed.
 */
 aom_codec_err_t aom_codec_enc_init_multi_ver(
    aom_codec_ctx_t *ctx, aom_codec_iface_t *iface, aom_codec_enc_cfg_t *cfg,
    int num_enc, aom_codec_flags_t flags, aom_rational_t *dsf, int ver);
 /*!\brief Convenience macro for aom_codec_enc_init_multi_ver()
 *
 * Ensures the ABI version parameter is properly set.
 */
 #define aom_codec_enc_init_multi(ctx, iface, cfg, num_enc, flags, dsf) \
  aom_codec_enc_init_multi_ver(ctx, iface, cfg, num_enc, flags, dsf,   \
                               AOM_ENCODER_ABI_VERSION)
 /*!\brief Get a default configuration
 *
 * Initializes an encoder configuration structure with default values. Supports
 * the notion of "usages" so that an algorithm may offer different default
 * settings depending on the user's intended goal. This function \ref SHOULD
 * be called by all applications to initialize the configuration structure
 * before specializing the configuration with application specific values.
 *
 * \param[in]    iface     Pointer to the algorithm interface to use.
 * \param[out]   cfg       Configuration buffer to populate.
 * \param[in]    reserved  Must be set to 0 for VP8 and AV1.
 *
 * \retval #AOM_CODEC_OK
 *     The configuration was populated.
 * \retval #AOM_CODEC_INCAPABLE
 *     Interface is not an encoder interface.
 * \retval #AOM_CODEC_INVALID_PARAM
 *     A parameter was NULL, or the usage value was not recognized.
 */
 aom_codec_err_t aom_codec_enc_config_default(aom_codec_iface_t *iface,
                                             aom_codec_enc_cfg_t *cfg,
                                             unsigned int reserved);
 /*!\brief Set or change configuration
 *
 * Reconfigures an encoder instance according to the given configuration.
 *
 * \param[in]    ctx     Pointer to this instance's context
 * \param[in]    cfg     Configuration buffer to use
 *
 * \retval #AOM_CODEC_OK
 *     The new configuration was set.
 * \retval #AOM_CODEC_INCAPABLE
 *     Interface is not an encoder interface.
 * \retval #AOM_CODEC_INVALID_PARAM
 *     A parameter was NULL, or the usage value was not recognized.
 */
 aom_codec_err_t aom_codec_enc_config_set(aom_codec_ctx_t *ctx,
                                         const aom_codec_enc_cfg_t *cfg);
 /*!\brief Get global stream headers
 *
 * Retrieves a stream level global header packet, if supported by the codec.
 *
 * \param[in]    ctx     Pointer to this instance's context
 *
 * \retval NULL
 *     Encoder does not support global header
 * \retval Non-NULL
 *     Pointer to buffer containing global header packet
 */
 aom_fixed_buf_t *aom_codec_get_global_headers(aom_codec_ctx_t *ctx);
 /*!\brief deadline parameter analogous to AVx REALTIME mode. */
 #define AOM_DL_REALTIME (1)
 /*!\brief deadline parameter analogous to AVx GOOD QUALITY mode. */
 #define AOM_DL_GOOD_QUALITY (1000000)
 /*!\brief deadline parameter analogous to AVx BEST QUALITY mode. */
 #define AOM_DL_BEST_QUALITY (0)
 /*!\brief Encode a frame
 *
 * Encodes a video frame at the given "presentation time." The presentation
 * time stamp (PTS) \ref MUST be strictly increasing.
 *
 * The encoder supports the notion of a soft real-time deadline. Given a
 * non-zero value to the deadline parameter, the encoder will make a "best
 * effort" guarantee to return before the given time slice expires. It is
 * implicit that limiting the available time to encode will degrade the
 * output quality. The encoder can be given an unlimited time to produce the
 * best possible frame by specifying a deadline of '0'. This deadline
 * supersedes the AVx notion of "best quality, good quality, realtime".
 * Applications that wish to map these former settings to the new deadline
 * based system can use the symbols #AOM_DL_REALTIME, #AOM_DL_GOOD_QUALITY,
 * and #AOM_DL_BEST_QUALITY.
 *
 * When the last frame has been passed to the encoder, this function should
 * continue to be called, with the img parameter set to NULL. This will
 * signal the end-of-stream condition to the encoder and allow it to encode
 * any held buffers. Encoding is complete when aom_codec_encode() is called
 * and aom_codec_get_cx_data() returns no data.
 *
 * \param[in]    ctx       Pointer to this instance's context
 * \param[in]    img       Image data to encode, NULL to flush.
 * \param[in]    pts       Presentation time stamp, in timebase units.
 * \param[in]    duration  Duration to show frame, in timebase units.
 * \param[in]    flags     Flags to use for encoding this frame.
 * \param[in]    deadline  Time to spend encoding, in microseconds. (0=infinite)
 *
 * \retval #AOM_CODEC_OK
 *     The encode call succeeded.
 * \retval #AOM_CODEC_INCAPABLE
 *     Interface is not an encoder interface.
 * \retval #AOM_CODEC_INVALID_PARAM
 *     A parameter was NULL, the image format is unsupported, etc.
 */
 aom_codec_err_t aom_codec_encode(aom_codec_ctx_t *ctx, const aom_image_t *img,
                                 aom_codec_pts_t pts, unsigned long duration,
                                 aom_enc_frame_flags_t flags,
                                 unsigned long deadline);
 /*!\brief Set compressed data output buffer
 *
 * Sets the buffer that the codec should output the compressed data
 * into. This call effectively sets the buffer pointer returned in the
 * next AOM_CODEC_CX_FRAME_PKT packet. Subsequent packets will be
 * appended into this buffer. The buffer is preserved across frames,
 * so applications must periodically call this function after flushing
 * the accumulated compressed data to disk or to the network to reset
 * the pointer to the buffer's head.
 *
 * `pad_before` bytes will be skipped before writing the compressed
 * data, and `pad_after` bytes will be appended to the packet. The size
 * of the packet will be the sum of the size of the actual compressed
 * data, pad_before, and pad_after. The padding bytes will be preserved
 * (not overwritten).
 *
 * Note that calling this function does not guarantee that the returned
 * compressed data will be placed into the specified buffer. In the
 * event that the encoded data will not fit into the buffer provided,
 * the returned packet \ref MAY point to an internal buffer, as it would
 * if this call were never used. In this event, the output packet will
 * NOT have any padding, and the application must free space and copy it
 * to the proper place. This is of particular note in configurations
 * that may output multiple packets for a single encoded frame (e.g., lagged
 * encoding) or if the application does not reset the buffer periodically.
 *
 * Applications may restore the default behavior of the codec providing
 * the compressed data buffer by calling this function with a NULL
 * buffer.
 *
 * Applications \ref MUSTNOT call this function during iteration of
 * aom_codec_get_cx_data().
 *
 * \param[in]    ctx         Pointer to this instance's context
 * \param[in]    buf         Buffer to store compressed data into
 * \param[in]    pad_before  Bytes to skip before writing compressed data
 * \param[in]    pad_after   Bytes to skip after writing compressed data
 *
 * \retval #AOM_CODEC_OK
 *     The buffer was set successfully.
 * \retval #AOM_CODEC_INVALID_PARAM
 *     A parameter was NULL, the image format is unsupported, etc.
 */
 aom_codec_err_t aom_codec_set_cx_data_buf(aom_codec_ctx_t *ctx,
                                          const aom_fixed_buf_t *buf,
                                          unsigned int pad_before,
                                          unsigned int pad_after);
 /*!\brief Encoded data iterator
 *
 * Iterates over a list of data packets to be passed from the encoder to the
 * application. The different kinds of packets available are enumerated in
 * #aom_codec_cx_pkt_kind.
 *
 * #AOM_CODEC_CX_FRAME_PKT packets should be passed to the application's
 * muxer. Multiple compressed frames may be in the list.
 * #AOM_CODEC_STATS_PKT packets should be appended to a global buffer.
 *
 * The application \ref MUST silently ignore any packet kinds that it does
 * not recognize or support.
 *
 * The data buffers returned from this function are only guaranteed to be
 * valid until the application makes another call to any aom_codec_* function.
 *
 * \param[in]     ctx      Pointer to this instance's context
 * \param[in,out] iter     Iterator storage, initialized to NULL
 *
 * \return Returns a pointer to an output data packet (compressed frame data,
 *         two-pass statistics, etc.) or NULL to signal end-of-list.
 *
 */
 const aom_codec_cx_pkt_t *aom_codec_get_cx_data(aom_codec_ctx_t *ctx,
                                                aom_codec_iter_t *iter);
 /*!\brief Get Preview Frame
 *
 * Returns an image that can be used as a preview. Shows the image as it would
 * exist at the decompressor. The application \ref MUSTNOT write into this
 * image buffer.
 *
 * \param[in]     ctx      Pointer to this instance's context
 *
 * \return Returns a pointer to a preview image, or NULL if no image is
 *         available.
 *
 */
 const aom_image_t *aom_codec_get_preview_frame(aom_codec_ctx_t *ctx);
 /*!@} - end defgroup encoder*/
 #ifdef __cplusplus
 }
 #endif
 #endif  // AOM_AOM_ENCODER_H_
--- a/aom/aom_frame_buffer.h
+++ b/aom/aom_frame_buffer.h
@@ -1,15 +1,16 @@
 /*
- *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
- *  Use of this source code is governed by a BSD-style license
+ * This source code is subject to the terms of the BSD 2 Clause License and
- *  that can be found in the LICENSE file in the root of the source
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- *  tree. An additional intellectual property rights grant can be found
+ * was not distributed with this source code in the LICENSE file, you can
- *  in the file PATENTS.  All contributing project authors may
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- *  be found in the AUTHORS file in the root of the source tree.
+ * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
-#ifndef VPX_VPX_FRAME_BUFFER_H_
+#ifndef AOM_AOM_FRAME_BUFFER_H_
-#define VPX_VPX_FRAME_BUFFER_H_
+#define AOM_AOM_FRAME_BUFFER_H_
 /*!\file
 * \brief Describes the decoder external frame buffer interface.
@@ -19,28 +20,28 @@
 extern "C" {
 #endif
-#include "./vpx_integer.h"
+#include "./aom_integer.h"
-/*!\brief The maximum number of work buffers used by libvpx.
+/*!\brief The maximum number of work buffers used by libaom.
 *  Support maximum 4 threads to decode video in parallel.
 *  Each thread will use one work buffer.
 * TODO(hkuang): Add support to set number of worker threads dynamically.
 */
-#define VPX_MAXIMUM_WORK_BUFFERS 8
+#define AOM_MAXIMUM_WORK_BUFFERS 8
-/*!\brief The maximum number of reference buffers that a VP9 encoder may use.
+/*!\brief The maximum number of reference buffers that an AV1 encoder may use.
 */
-#define VPX_MAXIMUM_REF_BUFFERS 8
+#define AOM_MAXIMUM_REF_BUFFERS 8
 /*!\brief External frame buffer
 *
 * This structure holds allocated frame buffers used by the decoder.
 */
-typedef struct vpx_codec_frame_buffer {
+typedef struct aom_codec_frame_buffer {
-  uint8_t *data;  /**< Pointer to the data buffer */
+  uint8_t *data; /**< Pointer to the data buffer */
-  size_t size;  /**< Size of data in bytes */
+  size_t size;   /**< Size of data in bytes */
-  void *priv;  /**< Frame's private data */
+  void *priv;    /**< Frame's private data */
-} vpx_codec_frame_buffer_t;
+} aom_codec_frame_buffer_t;
 /*!\brief get frame buffer callback prototype
 *
@@ -51,17 +52,17 @@ typedef struct vpx_codec_frame_buffer {
 * to the allocated size. The application does not need to align the allocated
 * data. The callback is triggered when the decoder needs a frame buffer to
 * decode a compressed image into. This function may be called more than once
- * for every call to vpx_codec_decode. The application may set fb->priv to
+ * for every call to aom_codec_decode. The application may set fb->priv to
 * some data which will be passed back in the ximage and the release function
 * call. |fb| is guaranteed to not be NULL. On success the callback must
 * return 0. Any failure the callback must return a value less than 0.
 *
 * \param[in] priv         Callback's private data
 * \param[in] min_size     Size in bytes needed by the buffer
- * \param[in,out] fb       Pointer to vpx_codec_frame_buffer_t
+ * \param[in,out] fb       Pointer to aom_codec_frame_buffer_t
 */
-typedef int (*vpx_get_frame_buffer_cb_fn_t)(
+typedef int (*aom_get_frame_buffer_cb_fn_t)(void *priv, size_t min_size,
-    void *priv, size_t min_size, vpx_codec_frame_buffer_t *fb);
+                                            aom_codec_frame_buffer_t *fb);
 /*!\brief release frame buffer callback prototype
 *
@@ -71,13 +72,13 @@ typedef int (*vpx_get_frame_buffer_cb_fn_t)(
 * a value less than 0.
 *
 * \param[in] priv         Callback's private data
- * \param[in] fb           Pointer to vpx_codec_frame_buffer_t
+ * \param[in] fb           Pointer to aom_codec_frame_buffer_t
 */
-typedef int (*vpx_release_frame_buffer_cb_fn_t)(
+typedef int (*aom_release_frame_buffer_cb_fn_t)(void *priv,
-    void *priv, vpx_codec_frame_buffer_t *fb);
+                                                aom_codec_frame_buffer_t *fb);
 #ifdef __cplusplus
 }  // extern "C"
 #endif
-#endif  // VPX_VPX_FRAME_BUFFER_H_
+#endif  // AOM_AOM_FRAME_BUFFER_H_
--- a/aom/aom_image.h
+++ b/aom/aom_image.h
@@ -0,0 +1,225 @@
 /*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
 /*!\file
 * \brief Describes the aom image descriptor and associated operations
 *
 */
 #ifndef AOM_AOM_IMAGE_H_
 #define AOM_AOM_IMAGE_H_
 #ifdef __cplusplus
 extern "C" {
 #endif
 /*!\brief Current ABI version number
 *
 * \internal
 * If this file is altered in any way that changes the ABI, this value
 * must be bumped.  Examples include, but are not limited to, changing
 * types, removing or reassigning enums, adding/removing/rearranging
 * fields to structures
 */
 #define AOM_IMAGE_ABI_VERSION (4) /**<\hideinitializer*/
 #define AOM_IMG_FMT_PLANAR 0x100       /**< Image is a planar format. */
 #define AOM_IMG_FMT_UV_FLIP 0x200      /**< V plane precedes U in memory. */
 #define AOM_IMG_FMT_HAS_ALPHA 0x400    /**< Image has an alpha channel. */
 #define AOM_IMG_FMT_HIGHBITDEPTH 0x800 /**< Image uses 16bit framebuffer. */
 /*!\brief List of supported image formats */
 typedef enum aom_img_fmt {
  AOM_IMG_FMT_NONE,
  AOM_IMG_FMT_RGB24,     /**< 24 bit per pixel packed RGB */
  AOM_IMG_FMT_RGB32,     /**< 32 bit per pixel packed 0RGB */
  AOM_IMG_FMT_RGB565,    /**< 16 bit per pixel, 565 */
  AOM_IMG_FMT_RGB555,    /**< 16 bit per pixel, 555 */
  AOM_IMG_FMT_UYVY,      /**< UYVY packed YUV */
  AOM_IMG_FMT_YUY2,      /**< YUYV packed YUV */
  AOM_IMG_FMT_YVYU,      /**< YVYU packed YUV */
  AOM_IMG_FMT_BGR24,     /**< 24 bit per pixel packed BGR */
  AOM_IMG_FMT_RGB32_LE,  /**< 32 bit packed BGR0 */
  AOM_IMG_FMT_ARGB,      /**< 32 bit packed ARGB, alpha=255 */
  AOM_IMG_FMT_ARGB_LE,   /**< 32 bit packed BGRA, alpha=255 */
  AOM_IMG_FMT_RGB565_LE, /**< 16 bit per pixel, gggbbbbb rrrrrggg */
  AOM_IMG_FMT_RGB555_LE, /**< 16 bit per pixel, gggbbbbb 0rrrrrgg */
  AOM_IMG_FMT_YV12 =
      AOM_IMG_FMT_PLANAR | AOM_IMG_FMT_UV_FLIP | 1, /**< planar YVU */
  AOM_IMG_FMT_I420 = AOM_IMG_FMT_PLANAR | 2, /**< planar, no UV flip */
  AOM_IMG_FMT_AOMYV12 = AOM_IMG_FMT_PLANAR | AOM_IMG_FMT_UV_FLIP |
                        3, /**< planar 4:2:0 format with aom color space */
  AOM_IMG_FMT_AOMI420 = AOM_IMG_FMT_PLANAR | 4, /**< planar, no UV flip */
  AOM_IMG_FMT_I422 = AOM_IMG_FMT_PLANAR | 5,    /**< planar, no UV flip */
  AOM_IMG_FMT_I444 = AOM_IMG_FMT_PLANAR | 6,    /**< planar, no UV flip */
  AOM_IMG_FMT_I440 = AOM_IMG_FMT_PLANAR | 7,    /**< planar, no UV flip */
  /** Planar with the alpha-channel flag set; shares id 6 with I444. */
  AOM_IMG_FMT_444A = AOM_IMG_FMT_PLANAR | AOM_IMG_FMT_HAS_ALPHA | 6,
  /** I420 with the high-bit-depth (16bit framebuffer) flag set. */
  AOM_IMG_FMT_I42016 = AOM_IMG_FMT_I420 | AOM_IMG_FMT_HIGHBITDEPTH,
  /** I422 with the high-bit-depth (16bit framebuffer) flag set. */
  AOM_IMG_FMT_I42216 = AOM_IMG_FMT_I422 | AOM_IMG_FMT_HIGHBITDEPTH,
  /** I444 with the high-bit-depth (16bit framebuffer) flag set. */
  AOM_IMG_FMT_I44416 = AOM_IMG_FMT_I444 | AOM_IMG_FMT_HIGHBITDEPTH,
  /** I440 with the high-bit-depth (16bit framebuffer) flag set. */
  AOM_IMG_FMT_I44016 = AOM_IMG_FMT_I440 | AOM_IMG_FMT_HIGHBITDEPTH
 } aom_img_fmt_t; /**< alias for enum aom_img_fmt */
 /*!\brief List of supported color spaces */
 typedef enum aom_color_space {
  AOM_CS_UNKNOWN = 0,   /**< Unknown */
  AOM_CS_BT_601 = 1,    /**< BT.601 */
  AOM_CS_BT_709 = 2,    /**< BT.709 */
  AOM_CS_SMPTE_170 = 3, /**< SMPTE.170 */
  AOM_CS_SMPTE_240 = 4, /**< SMPTE.240 */
  AOM_CS_BT_2020 = 5,   /**< BT.2020 */
  AOM_CS_RESERVED = 6,  /**< Reserved */
  AOM_CS_SRGB = 7       /**< sRGB */
 } aom_color_space_t;    /**< alias for enum aom_color_space */
 /*!\brief List of supported color ranges */
 typedef enum aom_color_range {
  AOM_CR_STUDIO_RANGE = 0, /**< Y [16..235], UV [16..240] */
  AOM_CR_FULL_RANGE = 1    /**< YUV/RGB [0..255] */
 } aom_color_range_t;       /**< alias for enum aom_color_range */
 /**\brief Image Descriptor */
 typedef struct aom_image {
  aom_img_fmt_t fmt;       /**< Image Format */
  aom_color_space_t cs;    /**< Color Space */
  aom_color_range_t range; /**< Color Range */
  /* Image storage dimensions */
  unsigned int w;         /**< Stored image width */
  unsigned int h;         /**< Stored image height */
  unsigned int bit_depth; /**< Stored image bit-depth */
  /* Image display dimensions */
  unsigned int d_w; /**< Displayed image width */
  unsigned int d_h; /**< Displayed image height */
  /* Image intended rendering dimensions */
  unsigned int r_w; /**< Intended rendering image width */
  unsigned int r_h; /**< Intended rendering image height */
  /* Chroma subsampling info */
  unsigned int x_chroma_shift; /**< subsampling order, X */
  unsigned int y_chroma_shift; /**< subsampling order, Y */
 /* Image data pointers. */
 #define AOM_PLANE_PACKED 0  /**< To be used for all packed formats */
 #define AOM_PLANE_Y 0       /**< Y (Luminance) plane */
 #define AOM_PLANE_U 1       /**< U (Chroma) plane */
 #define AOM_PLANE_V 2       /**< V (Chroma) plane */
 #define AOM_PLANE_ALPHA 3   /**< A (Transparency) plane */
  unsigned char *planes[4]; /**< pointer to the top left pixel for each plane */
  int stride[4];            /**< stride between rows for each plane */
  int bps; /**< bits per sample (for packed formats) */
  /*!\brief The following member may be set by the application to associate
   * data with this image.
   */
  void *user_priv;
  /* The following members should be treated as private. */
  unsigned char *img_data; /**< private */
  int img_data_owner;      /**< private */
  int self_allocd;         /**< private */
  void *fb_priv; /**< Frame buffer data associated with the image. */
 } aom_image_t;   /**< alias for struct aom_image */
 /**\brief Representation of a rectangle on a surface */
 typedef struct aom_image_rect {
  unsigned int x;   /**< leftmost column */
  unsigned int y;   /**< topmost row */
  unsigned int w;   /**< width */
  unsigned int h;   /**< height */
 } aom_image_rect_t; /**< alias for struct aom_image_rect */
 /*!\brief Open a descriptor, allocating storage for the underlying image
 *
 * Returns a descriptor for storing an image of the given format. The
 * storage for the descriptor is allocated on the heap.
 *
 * \param[in]    img       Pointer to storage for descriptor. If this parameter
 *                         is NULL, the storage for the descriptor will be
 *                         allocated on the heap.
 * \param[in]    fmt       Format for the image
 * \param[in]    d_w       Width of the image
 * \param[in]    d_h       Height of the image
 * \param[in]    align     Alignment, in bytes, of the image buffer and
 *                         each row in the image(stride).
 *
 * \return Returns a pointer to the initialized image descriptor. If the img
 *         parameter is non-null, the value of the img parameter will be
 *         returned.
 */
 aom_image_t *aom_img_alloc(aom_image_t *img, aom_img_fmt_t fmt,
                           unsigned int d_w, unsigned int d_h,
                           unsigned int align);
 /*!\brief Open a descriptor, using existing storage for the underlying image
 *
 * Returns a descriptor for storing an image of the given format. The
 * storage for descriptor has been allocated elsewhere, and a descriptor is
 * desired to "wrap" that storage.
 *
 * \param[in]    img       Pointer to storage for descriptor. If this parameter
 *                         is NULL, the storage for the descriptor will be
 *                         allocated on the heap.
 * \param[in]    fmt       Format for the image
 * \param[in]    d_w       Width of the image
 * \param[in]    d_h       Height of the image
 * \param[in]    align     Alignment, in bytes, of each row in the image.
 * \param[in]    img_data  Storage to use for the image
 *
 * \return Returns a pointer to the initialized image descriptor. If the img
 *         parameter is non-null, the value of the img parameter will be
 *         returned.
 */
 aom_image_t *aom_img_wrap(aom_image_t *img, aom_img_fmt_t fmt, unsigned int d_w,
                          unsigned int d_h, unsigned int align,
                          unsigned char *img_data);
 /*!\brief Set the rectangle identifying the displayed portion of the image
 *
 * Updates the displayed rectangle (aka viewport) on the image surface to
 * match the specified coordinates and size.
 *
 * \param[in]    img       Image descriptor
 * \param[in]    x         leftmost column
 * \param[in]    y         topmost row
 * \param[in]    w         width
 * \param[in]    h         height
 *
 * \return 0 if the requested rectangle is valid, nonzero otherwise.
 */
 int aom_img_set_rect(aom_image_t *img, unsigned int x, unsigned int y,
                     unsigned int w, unsigned int h);
 /*!\brief Flip the image vertically (top for bottom)
 *
 * Adjusts the image descriptor's pointers and strides to make the image
 * be referenced upside-down.
 *
 * \param[in]    img       Image descriptor
 */
 void aom_img_flip(aom_image_t *img);
 /*!\brief Close an image descriptor
 *
 * Frees all allocated storage associated with an image descriptor.
 *
 * \param[in]    img       Image descriptor
 */
 void aom_img_free(aom_image_t *img);
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 #endif  // AOM_AOM_IMAGE_H_
--- a/aom/aom_integer.h
+++ b/aom/aom_integer.h
@@ -0,0 +1,64 @@
 /*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
 #ifndef AOM_AOM_INTEGER_H_
 #define AOM_AOM_INTEGER_H_
 /* get ptrdiff_t, size_t, wchar_t, NULL */
 #include <stddef.h>
 #if defined(_MSC_VER)
 #define AOM_FORCE_INLINE __forceinline
 #define AOM_INLINE __inline
 #else
 #define AOM_FORCE_INLINE __inline__ __attribute__((always_inline))
 // TODO(jbb): Allow a way to force inline off for older compilers.
 #define AOM_INLINE inline
 #endif
 #if defined(AOM_EMULATE_INTTYPES)
 typedef signed char int8_t;
 typedef signed short int16_t;
 typedef signed int int32_t;
 typedef unsigned char uint8_t;
 typedef unsigned short uint16_t;
 typedef unsigned int uint32_t;
 #ifndef _UINTPTR_T_DEFINED
 typedef size_t uintptr_t;
 #endif
 #else
 /* Most platforms have the C99 standard integer types. */
 #if defined(__cplusplus)
 #if !defined(__STDC_FORMAT_MACROS)
 #define __STDC_FORMAT_MACROS
 #endif
 #if !defined(__STDC_LIMIT_MACROS)
 #define __STDC_LIMIT_MACROS
 #endif
 #endif  // __cplusplus
 #include <stdint.h>
 #endif
 /* VS2010 defines stdint.h, but not inttypes.h */
 #if defined(_MSC_VER) && _MSC_VER < 1800
 #define PRId64 "I64d"
 #else
 #include <inttypes.h>
 #endif
 #endif  // AOM_AOM_INTEGER_H_
--- a/aom/aomcx.h
+++ b/aom/aomcx.h
@@ -0,0 +1,759 @@
 /*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
 #ifndef AOM_AOMCX_H_
 #define AOM_AOMCX_H_
 /*!\defgroup aom_encoder AOMedia AOM/AV1 Encoder
 * \ingroup aom
 *
 * @{
 */
 #include "./aom.h"
 #include "./aom_encoder.h"
 /*!\file
 * \brief Provides definitions for using AOM or AV1 encoder algorithm within the
 *        aom Codec Interface.
 */
 #ifdef __cplusplus
 extern "C" {
 #endif
 /*!\name Algorithm interface for AV1
 *
 * This interface provides the capability to encode raw AV1 streams.
 * @{
 */
 extern aom_codec_iface_t aom_codec_av1_cx_algo;
 extern aom_codec_iface_t *aom_codec_av1_cx(void);
 /*!@} - end algorithm interface member group*/
 /*
 * Algorithm Flags
 */
 /*!\brief Don't reference the last frame
 *
 * When this flag is set, the encoder will not use the last frame as a
 * predictor. When not set, the encoder will choose whether to use the
 * last frame or not automatically.
 */
 #define AOM_EFLAG_NO_REF_LAST (1 << 16)
 /*!\brief Don't reference the golden frame
 *
 * When this flag is set, the encoder will not use the golden frame as a
 * predictor. When not set, the encoder will choose whether to use the
 * golden frame or not automatically.
 */
 #define AOM_EFLAG_NO_REF_GF (1 << 17)
 /*!\brief Don't reference the alternate reference frame
 *
 * When this flag is set, the encoder will not use the alt ref frame as a
 * predictor. When not set, the encoder will choose whether to use the
 * alt ref frame or not automatically.
 */
 #define AOM_EFLAG_NO_REF_ARF (1 << 21)
 /*!\brief Don't update the last frame
 *
 * When this flag is set, the encoder will not update the last frame with
 * the contents of the current frame.
 */
 #define AOM_EFLAG_NO_UPD_LAST (1 << 18)
 /*!\brief Don't update the golden frame
 *
 * When this flag is set, the encoder will not update the golden frame with
 * the contents of the current frame.
 */
 #define AOM_EFLAG_NO_UPD_GF (1 << 22)
 /*!\brief Don't update the alternate reference frame
 *
 * When this flag is set, the encoder will not update the alt ref frame with
 * the contents of the current frame.
 */
 #define AOM_EFLAG_NO_UPD_ARF (1 << 23)
 /*!\brief Force golden frame update
 *
 * When this flag is set, the encoder copies the contents of the current frame
 * to the golden frame buffer.
 */
 #define AOM_EFLAG_FORCE_GF (1 << 19)
 /*!\brief Force alternate reference frame update
 *
 * When this flag is set, the encoder copies the contents of the current frame
 * to the alternate reference frame buffer.
 */
 #define AOM_EFLAG_FORCE_ARF (1 << 24)
 /*!\brief Disable entropy update
 *
 * When this flag is set, the encoder will not update its internal entropy
 * model based on the entropy of this frame.
 */
 #define AOM_EFLAG_NO_UPD_ENTROPY (1 << 20)
 /*!\brief AVx encoder control functions
 *
 * This set of macros defines the control functions available for AVx
 * encoder interface.
 *
 * \sa #aom_codec_control
 */
 enum aome_enc_control_id {
  /*!\brief Codec control function to set which reference frame encoder can use.
   *
   * Supported in codecs: VP8, AV1
   */
  AOME_USE_REFERENCE = 7,
  /*!\brief Codec control function to pass an ROI map to encoder.
   *
   * Supported in codecs: VP8, AV1
   */
  AOME_SET_ROI_MAP = 8,
  /*!\brief Codec control function to pass an Active map to encoder.
   *
   * Supported in codecs: VP8, AV1
   */
  AOME_SET_ACTIVEMAP,
  /*!\brief Codec control function to set encoder scaling mode.
   *
   * Supported in codecs: VP8, AV1
   */
  AOME_SET_SCALEMODE = 11,
  /*!\brief Codec control function to set encoder internal speed settings.
   *
   * Changes in this value influences, among others, the encoder's selection
   * of motion estimation methods. Values greater than 0 will increase encoder
   * speed at the expense of quality.
   *
   * \note Valid range for VP8: -16..16
   * \note Valid range for AV1: -8..8
   *
   * Supported in codecs: VP8, AV1
   */
  AOME_SET_CPUUSED = 13,
  /*!\brief Codec control function to enable automatic set and use arf frames.
   *
   * Supported in codecs: VP8, AV1
   */
  AOME_SET_ENABLEAUTOALTREF,
 #if CONFIG_EXT_REFS
  /*!\brief Codec control function to enable automatic set and use
   * bwd-pred frames.
   *
   * Supported in codecs: AV1
   */
  AOME_SET_ENABLEAUTOBWDREF,
 #endif  // CONFIG_EXT_REFS
  /*!\brief control function to set noise sensitivity
   *
   * 0: off, 1: OnYOnly, 2: OnYUV,
   * 3: OnYUVAggressive, 4: Adaptive
   *
   * Supported in codecs: VP8
   */
  AOME_SET_NOISE_SENSITIVITY,
  /*!\brief Codec control function to set sharpness.
   *
   * Supported in codecs: VP8, AV1
   */
  AOME_SET_SHARPNESS,
  /*!\brief Codec control function to set the threshold for MBs treated static.
   *
   * Supported in codecs: VP8, AV1
   */
  AOME_SET_STATIC_THRESHOLD,
  /*!\brief Codec control function to set the number of token partitions.
   *
   * Supported in codecs: VP8
   */
  AOME_SET_TOKEN_PARTITIONS,
  /*!\brief Codec control function to get last quantizer chosen by the encoder.
   *
   * Return value uses internal quantizer scale defined by the codec.
   *
   * Supported in codecs: VP8, AV1
   */
  AOME_GET_LAST_QUANTIZER,
  /*!\brief Codec control function to get last quantizer chosen by the encoder.
   *
   * Return value uses the 0..63 scale as used by the rc_*_quantizer config
   * parameters.
   *
   * Supported in codecs: VP8, AV1
   */
  AOME_GET_LAST_QUANTIZER_64,
  /*!\brief Codec control function to set the max no of frames to create arf.
   *
   * Supported in codecs: VP8, AV1
   */
  AOME_SET_ARNR_MAXFRAMES,
  /*!\brief Codec control function to set the filter strength for the arf.
   *
   * Supported in codecs: VP8, AV1
   */
  AOME_SET_ARNR_STRENGTH,
  /*!\deprecated control function to set the filter type to use for the arf. */
  AOME_SET_ARNR_TYPE,
  /*!\brief Codec control function to set visual tuning.
   *
   * Supported in codecs: VP8, AV1
   */
  AOME_SET_TUNING,
  /*!\brief Codec control function to set constrained quality level.
   *
   * \attention For this value to be used aom_codec_enc_cfg_t::g_usage must be
   *            set to #AOM_CQ.
   * \note Valid range: 0..63
   *
   * Supported in codecs: VP8, AV1
   */
  AOME_SET_CQ_LEVEL,
  /*!\brief Codec control function to set Max data rate for Intra frames.
   *
   * This value controls additional clamping on the maximum size of a
   * keyframe. It is expressed as a percentage of the average
   * per-frame bitrate, with the special (and default) value 0 meaning
   * unlimited, or no additional clamping beyond the codec's built-in
   * algorithm.
   *
   * For example, to allocate no more than 4.5 frames worth of bitrate
   * to a keyframe, set this to 450.
   *
   * Supported in codecs: VP8, AV1
   */
  AOME_SET_MAX_INTRA_BITRATE_PCT,
  /*!\brief Codec control function to set reference and update frame flags.
   *
   *  Supported in codecs: VP8
   */
  AOME_SET_FRAME_FLAGS,
  /*!\brief Codec control function to set max data rate for Inter frames.
   *
   * This value controls additional clamping on the maximum size of an
   * inter frame. It is expressed as a percentage of the average
   * per-frame bitrate, with the special (and default) value 0 meaning
   * unlimited, or no additional clamping beyond the codec's built-in
   * algorithm.
   *
   * For example, to allow no more than 4.5 frames worth of bitrate
   * to an inter frame, set this to 450.
   *
   * Supported in codecs: AV1
   */
  AV1E_SET_MAX_INTER_BITRATE_PCT,
  /*!\brief Boost percentage for Golden Frame in CBR mode.
   *
   * This value controls the amount of boost given to Golden Frame in
   * CBR mode. It is expressed as a percentage of the average
   * per-frame bitrate, with the special (and default) value 0 meaning
   * the feature is off, i.e., no golden frame boost in CBR mode and
   * average bitrate target is used.
   *
   * For example, to allow 100% more bits, i.e, 2X, in a golden frame
   * than average frame, set this to 100.
   *
   * Supported in codecs: AV1
   */
  AV1E_SET_GF_CBR_BOOST_PCT,
  /*!\brief Codec control function to set encoder screen content mode.
   *
   * 0: off, 1: On, 2: On with more aggressive rate control.
   *
   * Supported in codecs: VP8
   */
  AOME_SET_SCREEN_CONTENT_MODE,
  /*!\brief Codec control function to set lossless encoding mode.
   *
   * AV1 can operate in lossless encoding mode, in which the bitstream
   * produced will be able to decode and reconstruct a perfect copy of
   * input source. This control function provides a mean to switch encoder
   * into lossless coding mode(1) or normal coding mode(0) that may be lossy.
   *                          0 = lossy coding mode
   *                          1 = lossless coding mode
   *
   *  By default, encoder operates in normal coding mode (maybe lossy).
   *
   * Supported in codecs: AV1
   */
  AV1E_SET_LOSSLESS,
 #if CONFIG_AOM_QM
  /*!\brief Codec control function to encode with quantisation matrices.
   *
   * AOM can operate with default quantisation matrices dependent on
   * quantisation level and block type.
   *                          0 = do not use quantisation matrices
   *                          1 = use quantisation matrices
   *
   *  By default, the encoder operates without quantisation matrices.
   *
   * Supported in codecs: AOM
   */
  AV1E_SET_ENABLE_QM,
  /*!\brief Codec control function to set the min quant matrix flatness.
   *
   * AOM can operate with different ranges of quantisation matrices.
   * As quantisation levels increase, the matrices get flatter. This
   * control sets the minimum level of flatness from which the matrices
   * are determined.
   *
   *  By default, the encoder sets this minimum at half the available
   *  range.
   *
   * Supported in codecs: AOM
   */
  AV1E_SET_QM_MIN,
  /*!\brief Codec control function to set the max quant matrix flatness.
   *
   * AOM can operate with different ranges of quantisation matrices.
   * As quantisation levels increase, the matrices get flatter. This
   * control sets the maximum level of flatness possible.
   *
   * By default, the encoder sets this maximum at the top of the
   * available range.
   *
   * Supported in codecs: AOM
   */
  AV1E_SET_QM_MAX,
 #endif
  /*!\brief Codec control function to set number of tile columns.
   *
   * In encoding and decoding, AV1 allows an input image frame be partitioned
   * into separated vertical tile columns, which can be encoded or decoded
   * independently. This enables easy implementation of parallel encoding and
   * decoding. This control requests the encoder to use column tiles in
   * encoding an input frame, with number of tile columns (in Log2 unit) as
   * the parameter:
   *             0 = 1 tile column
   *             1 = 2 tile columns
   *             2 = 4 tile columns
   *             .....
   *             n = 2**n tile columns
   * The requested tile columns will be capped by encoder based on image size
   * limitation (The minimum width of a tile column is 256 pixel, the maximum
   * is 4096).
   *
   * By default, the value is 0, i.e. one single column tile for entire image.
   *
   * Supported in codecs: AV1
   */
  AV1E_SET_TILE_COLUMNS,
  /*!\brief Codec control function to set number of tile rows.
   *
   * In encoding and decoding, AV1 allows an input image frame be partitioned
   * into separated horizontal tile rows. Tile rows are encoded or decoded
   * sequentially. Even though encoding/decoding of later tile rows depends on
   * earlier ones, this allows the encoder to output data packets for tile rows
   * prior to completely processing all tile rows in a frame, thereby reducing
   * the latency in processing between input and output. The parameter
   * for this control describes the number of tile rows, which has a valid
   * range [0, 2]:
   *            0 = 1 tile row
   *            1 = 2 tile rows
   *            2 = 4 tile rows
   *
   * By default, the value is 0, i.e. one single row tile for entire image.
   *
   * Supported in codecs: AV1
   */
  AV1E_SET_TILE_ROWS,
  /*!\brief Codec control function to enable frame parallel decoding feature.
   *
   * AV1 has a bitstream feature to reduce decoding dependency between frames
   * by turning off backward update of probability context used in encoding
   * and decoding. This allows staged parallel processing of more than one
   * video frames in the decoder. This control function provides a mean to
   * turn this feature on or off for bitstreams produced by encoder.
   *
   * By default, this feature is off.
   *
   * Supported in codecs: AV1
   */
  AV1E_SET_FRAME_PARALLEL_DECODING,
  /*!\brief Codec control function to set adaptive quantization mode.
   *
   * AV1 has a segment based feature that allows encoder to adaptively change
   * quantization parameter for each segment within a frame to improve the
   * subjective quality. This control makes encoder operate in one of the
   * several AQ_modes supported.
   *
   * By default, encoder operates with AQ_Mode 0(adaptive quantization off).
   *
   * Supported in codecs: AV1
   */
  AV1E_SET_AQ_MODE,
  /*!\brief Codec control function to enable/disable periodic Q boost.
   *
   * One AV1 encoder speed feature is to enable quality boost by lowering
   * frame level Q periodically. This control function provides a mean to
   * turn on/off this feature.
   *               0 = off
   *               1 = on
   *
   * By default, the encoder is allowed to use this feature for appropriate
   * encoding modes.
   *
   * Supported in codecs: AV1
   */
  AV1E_SET_FRAME_PERIODIC_BOOST,
  /*!\brief Codec control function to set noise sensitivity.
   *
   *  0: off, 1: On(YOnly)
   *
   * Supported in codecs: AV1
   */
  AV1E_SET_NOISE_SENSITIVITY,
  /*!\brief Codec control function to set content type.
   * \note Valid parameter range:
   *              AOM_CONTENT_DEFAULT = Regular video content (Default)
   *              AOM_CONTENT_SCREEN  = Screen capture content
   *
   * Supported in codecs: AV1
   */
  AV1E_SET_TUNE_CONTENT,
  /*!\brief Codec control function to set color space info.
   * \note Valid ranges: 0..7, default is "UNKNOWN".
   *                     0 = UNKNOWN,
   *                     1 = BT_601
   *                     2 = BT_709
   *                     3 = SMPTE_170
   *                     4 = SMPTE_240
   *                     5 = BT_2020
   *                     6 = RESERVED
   *                     7 = SRGB
   *
   * Supported in codecs: AV1
   */
  AV1E_SET_COLOR_SPACE,
  /*!\brief Codec control function to set minimum interval between GF/ARF frames
   *
   * By default the value is set as 4.
   *
   * Supported in codecs: AV1
   */
  AV1E_SET_MIN_GF_INTERVAL,
  /*!\brief Codec control function to set maximum interval between GF/ARF frames
   *
   * By default the value is set as 16.
   *
   * Supported in codecs: AV1
   */
  AV1E_SET_MAX_GF_INTERVAL,
  /*!\brief Codec control function to get an Active map back from the encoder.
   *
   * Supported in codecs: AV1
   */
  AV1E_GET_ACTIVEMAP,
  /*!\brief Codec control function to set color range bit.
   * \note Valid ranges: 0..1, default is 0
   *                     0 = Limited range (16..235 or HBD equivalent)
   *                     1 = Full range (0..255 or HBD equivalent)
   *
   * Supported in codecs: AV1
   */
  AV1E_SET_COLOR_RANGE,
  /*!\brief Codec control function to set intended rendering image size.
   *
   * By default, this is identical to the image size in pixels.
   *
   * Supported in codecs: AV1
   */
  AV1E_SET_RENDER_SIZE,
  /*!\brief Codec control function to set target level.
   *
   * 255: off (default); 0: only keep level stats; 10: target for level 1.0;
   * 11: target for level 1.1; ... 62: target for level 6.2
   *
   * Supported in codecs: AV1
   */
  AV1E_SET_TARGET_LEVEL,
  /*!\brief Codec control function to get bitstream level.
   *
   * Supported in codecs: AV1
   */
  AV1E_GET_LEVEL,
  /*!\brief Codec control function to set intended superblock size.
   *
   * By default, the superblock size is determined separately for each
   * frame by the encoder.
   *
   * Supported in codecs: AV1
   */
  AV1E_SET_SUPERBLOCK_SIZE,
 };
 /*!\brief aom 1-D scaling mode
 *
 * This set of constants defines the 1-D aom scaling modes, i.e. the scaling
 * applied along a single image dimension.
 *
 * NOTE(review): the enumerator names suggest scaling ratios of 1/1, 4/5, 3/5
 * and 1/2 respectively -- confirm against the scaler implementation.
 */
 typedef enum aom_scaling_mode_1d {
  AOME_NORMAL = 0,
  AOME_FOURFIVE = 1,
  AOME_THREEFIVE = 2,
  AOME_ONETWO = 3
 } AOM_SCALING_MODE;
 /*!\brief aom region of interest map
 *
 * This defines the data structure for the region of interest map: a
 * per-region segment id array plus per-segment parameter tables, each of
 * which holds 4 entries.
 */
 typedef struct aom_roi_map {
  /*! An id between 0 and 3 for each 16x16 region within a frame. */
  unsigned char *roi_map;
  unsigned int rows; /**< Number of rows. */
  unsigned int cols; /**< Number of columns. */
  // TODO(paulwilkins): broken for AV1 which has 8 segments, while the
  // q and loop filter delta tables below only have room for 4
  // (see MAX_MB_SEGMENTS).
  int delta_q[4];  /**< Quantizer deltas, one per segment. */
  int delta_lf[4]; /**< Loop filter deltas, one per segment. */
  /*! Static breakout threshold for each segment. */
  unsigned int static_threshold[4];
 } aom_roi_map_t;
 /*!\brief aom active region map
 *
 * This defines the data structure for the active region map.
 */
 typedef struct aom_active_map {
  /*!\brief specifies on (1) or off (0) for each 16x16 region within a frame */
  unsigned char *active_map;
  unsigned int rows; /**< number of rows */
  unsigned int cols; /**< number of cols */
 } aom_active_map_t;
 /*!\brief aom image scaling mode
 *
 * This defines the data structure for the image scaling mode: one 1-D
 * scaling mode for each of the two image dimensions.
 */
 typedef struct aom_scaling_mode {
  AOM_SCALING_MODE h_scaling_mode; /**< horizontal scaling mode */
  AOM_SCALING_MODE v_scaling_mode; /**< vertical scaling mode   */
 } aom_scaling_mode_t;
 /*!\brief VP8 token partition mode
 *
 * This defines the VP8 partitioning mode for compressed data, i.e., the
 * number of sub-streams in the bitstream. Used for parallelized decoding.
 *
 * NOTE(review): the enumerator names indicate 1, 2, 4 and 8 partitions, so
 * the numeric value looks like log2 of the partition count -- confirm.
 */
 typedef enum {
  AOM_ONE_TOKENPARTITION = 0,
  AOM_TWO_TOKENPARTITION = 1,
  AOM_FOUR_TOKENPARTITION = 2,
  AOM_EIGHT_TOKENPARTITION = 3
 } aome_token_partitions;
 /*!\brief AV1 encoder content type
 *
 * Value set accepted by the AV1E_SET_TUNE_CONTENT control.
 */
 typedef enum {
  AOM_CONTENT_DEFAULT, /**< Regular video content (default) */
  AOM_CONTENT_SCREEN,  /**< Screen capture content */
  AOM_CONTENT_INVALID  /**< NOTE(review): presumably an end marker, not a
                            selectable content type -- confirm */
 } aom_tune_content;
 /*!\brief Model tuning parameters (inherited from VP8)
 *
 * Selects the kind of input material the encoder tunes itself for.
 */
 typedef enum {
  AOM_TUNE_PSNR = 0,
  AOM_TUNE_SSIM = 1
 } aom_tune_metric;
 /*!\cond */
 /*!\brief AOM encoder control function parameter type
 *
 * Defines the data types that the AOME / AV1E control functions take. Note
 * that additional common controls are defined in aom.h
 *
 * Each control is declared by an AOM_CTRL_USE_TYPE(id, type) line followed by
 * a matching AOM_CTRL_<id> marker macro. The id must be spelled exactly like
 * the enumerator in aome_enc_control_id, because the typed wrapper is keyed
 * on that name.
 */
 AOM_CTRL_USE_TYPE_DEPRECATED(AOME_USE_REFERENCE, int)
 #define AOM_CTRL_AOME_USE_REFERENCE
 AOM_CTRL_USE_TYPE(AOME_SET_FRAME_FLAGS, int)
 #define AOM_CTRL_AOME_SET_FRAME_FLAGS
 AOM_CTRL_USE_TYPE(AOME_SET_ROI_MAP, aom_roi_map_t *)
 #define AOM_CTRL_AOME_SET_ROI_MAP
 AOM_CTRL_USE_TYPE(AOME_SET_ACTIVEMAP, aom_active_map_t *)
 #define AOM_CTRL_AOME_SET_ACTIVEMAP
 AOM_CTRL_USE_TYPE(AOME_SET_SCALEMODE, aom_scaling_mode_t *)
 #define AOM_CTRL_AOME_SET_SCALEMODE
 AOM_CTRL_USE_TYPE(AOME_SET_CPUUSED, int)
 #define AOM_CTRL_AOME_SET_CPUUSED
 AOM_CTRL_USE_TYPE(AOME_SET_ENABLEAUTOALTREF, unsigned int)
 #define AOM_CTRL_AOME_SET_ENABLEAUTOALTREF
 #if CONFIG_EXT_REFS
 AOM_CTRL_USE_TYPE(AOME_SET_ENABLEAUTOBWDREF, unsigned int)
 #define AOM_CTRL_AOME_SET_ENABLEAUTOBWDREF
 #endif  // CONFIG_EXT_REFS
 AOM_CTRL_USE_TYPE(AOME_SET_NOISE_SENSITIVITY, unsigned int)
 #define AOM_CTRL_AOME_SET_NOISE_SENSITIVITY
 AOM_CTRL_USE_TYPE(AOME_SET_SHARPNESS, unsigned int)
 #define AOM_CTRL_AOME_SET_SHARPNESS
 AOM_CTRL_USE_TYPE(AOME_SET_STATIC_THRESHOLD, unsigned int)
 #define AOM_CTRL_AOME_SET_STATIC_THRESHOLD
 AOM_CTRL_USE_TYPE(AOME_SET_TOKEN_PARTITIONS, int) /* aome_token_partitions */
 #define AOM_CTRL_AOME_SET_TOKEN_PARTITIONS
 AOM_CTRL_USE_TYPE(AOME_SET_ARNR_MAXFRAMES, unsigned int)
 #define AOM_CTRL_AOME_SET_ARNR_MAXFRAMES
 AOM_CTRL_USE_TYPE(AOME_SET_ARNR_STRENGTH, unsigned int)
 #define AOM_CTRL_AOME_SET_ARNR_STRENGTH
 AOM_CTRL_USE_TYPE_DEPRECATED(AOME_SET_ARNR_TYPE, unsigned int)
 #define AOM_CTRL_AOME_SET_ARNR_TYPE
 AOM_CTRL_USE_TYPE(AOME_SET_TUNING, int) /* aom_tune_metric */
 #define AOM_CTRL_AOME_SET_TUNING
 AOM_CTRL_USE_TYPE(AOME_SET_CQ_LEVEL, unsigned int)
 #define AOM_CTRL_AOME_SET_CQ_LEVEL
 AOM_CTRL_USE_TYPE(AV1E_SET_TILE_COLUMNS, int)
 #define AOM_CTRL_AV1E_SET_TILE_COLUMNS
 AOM_CTRL_USE_TYPE(AV1E_SET_TILE_ROWS, int)
 #define AOM_CTRL_AV1E_SET_TILE_ROWS
 AOM_CTRL_USE_TYPE(AOME_GET_LAST_QUANTIZER, int *)
 #define AOM_CTRL_AOME_GET_LAST_QUANTIZER
 AOM_CTRL_USE_TYPE(AOME_GET_LAST_QUANTIZER_64, int *)
 #define AOM_CTRL_AOME_GET_LAST_QUANTIZER_64
 AOM_CTRL_USE_TYPE(AOME_SET_MAX_INTRA_BITRATE_PCT, unsigned int)
 #define AOM_CTRL_AOME_SET_MAX_INTRA_BITRATE_PCT
 /* The enumerator is AV1E_SET_MAX_INTER_BITRATE_PCT (AV1E_, not AOME_); the
 * declaration must use the same spelling or no typed wrapper exists for the
 * control that is actually defined.
 */
 AOM_CTRL_USE_TYPE(AV1E_SET_MAX_INTER_BITRATE_PCT, unsigned int)
 #define AOM_CTRL_AV1E_SET_MAX_INTER_BITRATE_PCT
 AOM_CTRL_USE_TYPE(AOME_SET_SCREEN_CONTENT_MODE, unsigned int)
 #define AOM_CTRL_AOME_SET_SCREEN_CONTENT_MODE
 AOM_CTRL_USE_TYPE(AV1E_SET_GF_CBR_BOOST_PCT, unsigned int)
 #define AOM_CTRL_AV1E_SET_GF_CBR_BOOST_PCT
 AOM_CTRL_USE_TYPE(AV1E_SET_LOSSLESS, unsigned int)
 #define AOM_CTRL_AV1E_SET_LOSSLESS
 #if CONFIG_AOM_QM
 AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_QM, unsigned int)
 #define AOM_CTRL_AV1E_SET_ENABLE_QM
 AOM_CTRL_USE_TYPE(AV1E_SET_QM_MIN, unsigned int)
 #define AOM_CTRL_AV1E_SET_QM_MIN
 AOM_CTRL_USE_TYPE(AV1E_SET_QM_MAX, unsigned int)
 #define AOM_CTRL_AV1E_SET_QM_MAX
 #endif
 AOM_CTRL_USE_TYPE(AV1E_SET_FRAME_PARALLEL_DECODING, unsigned int)
 #define AOM_CTRL_AV1E_SET_FRAME_PARALLEL_DECODING
 AOM_CTRL_USE_TYPE(AV1E_SET_AQ_MODE, unsigned int)
 #define AOM_CTRL_AV1E_SET_AQ_MODE
 AOM_CTRL_USE_TYPE(AV1E_SET_FRAME_PERIODIC_BOOST, unsigned int)
 #define AOM_CTRL_AV1E_SET_FRAME_PERIODIC_BOOST
 AOM_CTRL_USE_TYPE(AV1E_SET_NOISE_SENSITIVITY, unsigned int)
 #define AOM_CTRL_AV1E_SET_NOISE_SENSITIVITY
 AOM_CTRL_USE_TYPE(AV1E_SET_TUNE_CONTENT, int) /* aom_tune_content */
 #define AOM_CTRL_AV1E_SET_TUNE_CONTENT
 AOM_CTRL_USE_TYPE(AV1E_SET_COLOR_SPACE, int)
 #define AOM_CTRL_AV1E_SET_COLOR_SPACE
 AOM_CTRL_USE_TYPE(AV1E_SET_MIN_GF_INTERVAL, unsigned int)
 #define AOM_CTRL_AV1E_SET_MIN_GF_INTERVAL
 AOM_CTRL_USE_TYPE(AV1E_SET_MAX_GF_INTERVAL, unsigned int)
 #define AOM_CTRL_AV1E_SET_MAX_GF_INTERVAL
 AOM_CTRL_USE_TYPE(AV1E_GET_ACTIVEMAP, aom_active_map_t *)
 #define AOM_CTRL_AV1E_GET_ACTIVEMAP
 AOM_CTRL_USE_TYPE(AV1E_SET_COLOR_RANGE, int)
 #define AOM_CTRL_AV1E_SET_COLOR_RANGE
 /*!\brief Render size control
 *
 * TODO(rbultje) : add support of the control in ffmpeg
 */
 #define AOM_CTRL_AV1E_SET_RENDER_SIZE
 AOM_CTRL_USE_TYPE(AV1E_SET_RENDER_SIZE, int *)
 AOM_CTRL_USE_TYPE(AV1E_SET_SUPERBLOCK_SIZE, unsigned int)
 #define AOM_CTRL_AV1E_SET_SUPERBLOCK_SIZE
 AOM_CTRL_USE_TYPE(AV1E_SET_TARGET_LEVEL, unsigned int)
 #define AOM_CTRL_AV1E_SET_TARGET_LEVEL
 AOM_CTRL_USE_TYPE(AV1E_GET_LEVEL, int *)
 #define AOM_CTRL_AV1E_GET_LEVEL
 /*!\endcond */
 /*! @} - end defgroup vp8_encoder */
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 #endif  // AOM_AOMCX_H_
--- a/aom/aomdx.h
+++ b/aom/aomdx.h
@@ -0,0 +1,191 @@
 /*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
 /*!\defgroup aom_decoder AOMedia AOM/AV1 Decoder
 * \ingroup aom
 *
 * @{
 */
 /*!\file
 * \brief Provides definitions for using AOM or AV1 within the aom Decoder
 *        interface.
 */
 #ifndef AOM_AOMDX_H_
 #define AOM_AOMDX_H_
 #ifdef __cplusplus
 extern "C" {
 #endif
 /* Include controls common to both the encoder and decoder */
 #include "./aom.h"
 /*!\name Algorithm interface for AV1
 *
 * This interface provides the capability to decode AV1 streams. It is
 * exposed both as an interface object and as an accessor function that
 * returns a pointer to an interface.
 * NOTE(review): presumably the accessor returns the address of
 * aom_codec_av1_dx_algo -- confirm in the implementation.
 * @{
 */
 extern aom_codec_iface_t aom_codec_av1_dx_algo;
 extern aom_codec_iface_t *aom_codec_av1_dx(void);
 /*!@} - end algorithm interface member group*/
 /** Data structure that stores bit accounting for debug.
 *
 * Only forward-declared here; a pointer to it is what the AV1_GET_ACCOUNTING
 * control exchanges (its parameter type is Accounting **).
 */
 typedef struct Accounting Accounting;
 /*!\enum aom_dec_control_id
 * \brief AOM decoder control functions
 *
 * This set of enumerators defines the control functions available for the
 * AOM decoder interface. Numbering starts at AOM_DECODER_CTRL_ID_START.
 *
 * \sa #aom_codec_control
 */
 enum aom_dec_control_id {
  /** control function to get info on which reference frames were updated
   *  by the last decode
   */
  AOMD_GET_LAST_REF_UPDATES = AOM_DECODER_CTRL_ID_START,
  /** check if the indicated frame is corrupted */
  AOMD_GET_FRAME_CORRUPTED,
  /** control function to get info on which reference frames were used
   *  by the last decode
   */
  AOMD_GET_LAST_REF_USED,
  /** decryption function to decrypt encoded buffer data immediately
   * before decoding. Takes an aom_decrypt_init, which contains
   * a callback function and opaque context pointer.
   */
  AOMD_SET_DECRYPTOR,
  /** control function to get the dimensions that the current frame is decoded
   * at. This may be different to the intended display size for the frame as
   * specified in the wrapper or frame header (see AV1D_GET_DISPLAY_SIZE). */
  AV1D_GET_FRAME_SIZE,
  /** control function to get the current frame's intended display dimensions
   * (as specified in the wrapper or frame header). This may be different to
   * the decoded dimensions of this frame (see AV1D_GET_FRAME_SIZE). */
  AV1D_GET_DISPLAY_SIZE,
  /** control function to get the bit depth of the stream. */
  AV1D_GET_BIT_DEPTH,
  /** control function to set the byte alignment of the planes in the reference
   * buffers. Valid values are power of 2, from 32 to 1024. A value of 0 sets
   * legacy alignment. I.e. Y plane is aligned to 32 bytes, U plane directly
   * follows Y plane, and V plane directly follows U plane. Default value is 0.
   */
  AV1_SET_BYTE_ALIGNMENT,
  /** control function to invert the decoding order so that it runs from right
   * to left. The function is used in a test to confirm the decoding
   * independence of tile columns. The function may be used in applications
   * where this order of decoding is desired.
   *
   * TODO(yaowu): Rework the unit test that uses this control, and in a future
   *              release, this test-only control shall be removed.
   */
  AV1_INVERT_TILE_DECODE_ORDER,
  /** control function to set the skip loop filter flag. Valid values are
   * integers. The decoder will skip the loop filter when its value is set to
   * nonzero. If the loop filter is skipped the decoder may accumulate decode
   * artifacts. The default value is 0.
   */
  AV1_SET_SKIP_LOOP_FILTER,
  /** control function to retrieve a pointer to the Accounting struct.  When
   * compiled without --enable-accounting, this returns AOM_CODEC_INCAPABLE.
   * If called before a frame has been decoded, this returns AOM_CODEC_ERROR.
   * The caller should ensure that AOM_CODEC_OK is returned before attempting
   * to dereference the Accounting pointer.
   */
  AV1_GET_ACCOUNTING,
  /** NOTE(review): despite its name this is not the last enumerator; the two
   * tile-range controls declared after it take larger values -- confirm that
   * this ordering is intended.
   */
  AOM_DECODER_CTRL_ID_MAX,
  /** control function to set the range of tile decoding. A value that is
   * greater than or equal to zero indicates only the specific row/column is
   * decoded. A value that is -1 indicates the whole row/column is decoded.
   * A special case is both values are -1 that means the whole frame is
   * decoded.
   */
  AV1_SET_DECODE_TILE_ROW,
  AV1_SET_DECODE_TILE_COL
 };
 /*!\brief Decryption callback
 *
 * Decrypt count bytes of data from input -> output, using the decrypt_state
 * passed in AOMD_SET_DECRYPTOR.
 *
 * \param[in]  decrypt_state  Opaque state supplied via aom_decrypt_init
 * \param[in]  input          Encrypted source bytes
 * \param[out] output         Destination for the decrypted bytes
 * \param[in]  count          Number of bytes to decrypt
 */
 typedef void (*aom_decrypt_cb)(void *decrypt_state, const unsigned char *input,
                               unsigned char *output, int count);
 /*!\brief Structure to hold decryption state
 *
 * Defines a structure to hold the decryption state and access function.
 *
 * The header used to follow this with "typedef aom_decrypt_init
 * aom_decrypt_init;", a self-alias left over from a rename. It added no new
 * name, and redefining a typedef is only permitted from C11 onwards, so it
 * has been dropped.
 */
 typedef struct aom_decrypt_init {
  /*! Decrypt callback. */
  aom_decrypt_cb decrypt_cb;
  /*! Decryption state. */
  void *decrypt_state;
 } aom_decrypt_init;
 /*!\cond */
 /*!\brief AOM decoder control function parameter type
 *
 * Defines the data types that AOMD / AV1D control functions take. Note that
 * additional common controls are defined in aom.h
 *
 * NOTE(review): AV1_SET_BYTE_ALIGNMENT and AV1_SET_SKIP_LOOP_FILTER are
 * declared in aom_dec_control_id but have no AOM_CTRL_USE_TYPE entry in this
 * list -- confirm whether that omission is intentional.
 */
 AOM_CTRL_USE_TYPE(AOMD_GET_LAST_REF_UPDATES, int *)
 #define AOM_CTRL_AOMD_GET_LAST_REF_UPDATES
 AOM_CTRL_USE_TYPE(AOMD_GET_FRAME_CORRUPTED, int *)
 #define AOM_CTRL_AOMD_GET_FRAME_CORRUPTED
 AOM_CTRL_USE_TYPE(AOMD_GET_LAST_REF_USED, int *)
 #define AOM_CTRL_AOMD_GET_LAST_REF_USED
 AOM_CTRL_USE_TYPE(AOMD_SET_DECRYPTOR, aom_decrypt_init *)
 #define AOM_CTRL_AOMD_SET_DECRYPTOR
 AOM_CTRL_USE_TYPE(AV1D_GET_DISPLAY_SIZE, int *)
 #define AOM_CTRL_AV1D_GET_DISPLAY_SIZE
 AOM_CTRL_USE_TYPE(AV1D_GET_BIT_DEPTH, unsigned int *)
 #define AOM_CTRL_AV1D_GET_BIT_DEPTH
 AOM_CTRL_USE_TYPE(AV1D_GET_FRAME_SIZE, int *)
 #define AOM_CTRL_AV1D_GET_FRAME_SIZE
 AOM_CTRL_USE_TYPE(AV1_INVERT_TILE_DECODE_ORDER, int)
 #define AOM_CTRL_AV1_INVERT_TILE_DECODE_ORDER
 AOM_CTRL_USE_TYPE(AV1_GET_ACCOUNTING, Accounting **)
 #define AOM_CTRL_AV1_GET_ACCOUNTING
 AOM_CTRL_USE_TYPE(AV1_SET_DECODE_TILE_ROW, int)
 #define AOM_CTRL_AV1_SET_DECODE_TILE_ROW
 AOM_CTRL_USE_TYPE(AV1_SET_DECODE_TILE_COL, int)
 #define AOM_CTRL_AV1_SET_DECODE_TILE_COL
 /*!\endcond */
 /*! @} - end defgroup aom_decoder */
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 #endif  // AOM_AOMDX_H_
--- a/aom/exports_com
+++ b/aom/exports_com
@@ -0,0 +1,16 @@
 text aom_codec_build_config
 text aom_codec_control_
 text aom_codec_destroy
 text aom_codec_err_to_string
 text aom_codec_error
 text aom_codec_error_detail
 text aom_codec_get_caps
 text aom_codec_iface_name
 text aom_codec_version
 text aom_codec_version_extra_str
 text aom_codec_version_str
 text aom_img_alloc
 text aom_img_flip
 text aom_img_free
 text aom_img_set_rect
 text aom_img_wrap
--- a/aom/exports_dec
+++ b/aom/exports_dec
@@ -0,0 +1,8 @@
 text aom_codec_dec_init_ver
 text aom_codec_decode
 text aom_codec_get_frame
 text aom_codec_get_stream_info
 text aom_codec_peek_stream_info
 text aom_codec_register_put_frame_cb
 text aom_codec_register_put_slice_cb
 text aom_codec_set_frame_buffer_functions
--- a/aom/exports_enc
+++ b/aom/exports_enc
@@ -0,0 +1,9 @@
 text aom_codec_enc_config_default
 text aom_codec_enc_config_set
 text aom_codec_enc_init_multi_ver
 text aom_codec_enc_init_ver
 text aom_codec_encode
 text aom_codec_get_cx_data
 text aom_codec_get_global_headers
 text aom_codec_get_preview_frame
 text aom_codec_set_cx_data_buf
--- a/aom/internal/aom_codec_internal.h
+++ b/aom/internal/aom_codec_internal.h
@@ -0,0 +1,465 @@
 /*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
 /*!\file
 * \brief Describes the decoder algorithm interface for algorithm
 *        implementations.
 *
 * This file defines the private structures and data types that are only
 * relevant to implementing an algorithm, as opposed to using it.
 *
 * To create a decoder algorithm class, an interface structure is put
 * into the global namespace:
 *     <pre>
 *     my_codec.c:
 *       aom_codec_iface_t my_codec = {
 *           "My Codec v1.0",
 *           AOM_CODEC_ALG_ABI_VERSION,
 *           ...
 *       };
 *     </pre>
 *
 * An application instantiates a specific decoder instance by using
 * aom_codec_init() and a pointer to the algorithm's interface structure:
 *     <pre>
 *     my_app.c:
 *       extern aom_codec_iface_t my_codec;
 *       {
 *           aom_codec_ctx_t algo;
 *           res = aom_codec_init(&algo, &my_codec);
 *       }
 *     </pre>
 *
 * Once initialized, the instance is managed using other functions from
 * the aom_codec_* family.
 */
 #ifndef AOM_INTERNAL_AOM_CODEC_INTERNAL_H_
 #define AOM_INTERNAL_AOM_CODEC_INTERNAL_H_
 #include "./aom_config.h"
 #include "../aom_decoder.h"
 #include "../aom_encoder.h"
 #include <stdarg.h>
 #ifdef __cplusplus
 extern "C" {
 #endif
 /*!\brief Current ABI version number
 *
 * \internal
 * If this file is altered in any way that changes the ABI, this value
 * must be bumped.  Examples include, but are not limited to, changing
 * types, removing or reassigning enums, and adding/removing/rearranging
 * fields in structures.
 */
 #define AOM_CODEC_INTERNAL_ABI_VERSION (5) /**<\hideinitializer*/
 /*! Algorithm-private context type, only forward-declared at this point. It
 * is the ctx argument of most function pointer prototypes in this file. */
 typedef struct aom_codec_alg_priv aom_codec_alg_priv_t;
 /*! Forward-declared type of the second argument of aom_codec_init_fn_t.
 * NOTE(review): the name suggests multi-resolution encoder configuration --
 * confirm. */
 typedef struct aom_codec_priv_enc_mr_cfg aom_codec_priv_enc_mr_cfg_t;
 /*!\brief init function pointer prototype
 *
 * Performs algorithm-specific initialization of the decoder context. This
 * function is called by the generic aom_codec_init() wrapper function, so
 * plugins implementing this interface may trust the input parameters to be
 * properly initialized.
 *
 * \param[in] ctx   Pointer to this instance's context
 * \param[in] data  NOTE(review): not described in the original comment;
 *                  the type name suggests multi-resolution encoder
 *                  configuration -- confirm, including whether it may be
 *                  NULL.
 * \retval #AOM_CODEC_OK
 *     The input stream was recognized and decoder initialized.
 * \retval #AOM_CODEC_MEM_ERROR
 *     Memory operation failed.
 */
 typedef aom_codec_err_t (*aom_codec_init_fn_t)(
    aom_codec_ctx_t *ctx, aom_codec_priv_enc_mr_cfg_t *data);
 /*!\brief destroy function pointer prototype
 *
 * Performs algorithm-specific destruction of the decoder context. This
 * function is called by the generic aom_codec_destroy() wrapper function,
 * so plugins implementing this interface may trust the input parameters
 * to be properly initialized.
 *
 * Unlike the init prototype, this one takes the algorithm-private context
 * (aom_codec_alg_priv_t) rather than the public aom_codec_ctx_t.
 *
 * \param[in] ctx   Pointer to this instance's context
 * \retval #AOM_CODEC_OK
 *     NOTE(review): the original text here ("The input stream was recognized
 *     and decoder initialized.") looks copied from the init prototype;
 *     presumably this means the context was destroyed -- confirm.
 * \retval #AOM_CODEC_MEM_ERROR
 *     Memory operation failed.
 */
 typedef aom_codec_err_t (*aom_codec_destroy_fn_t)(aom_codec_alg_priv_t *ctx);
 /*!\brief parse stream info function pointer prototype
 *
 * Performs high level parsing of the bitstream. This function is called by the
 * generic aom_codec_peek_stream_info() wrapper function, so plugins
 * implementing this interface may trust the input parameters to be properly
 * initialized.
 *
 * This prototype takes no codec context: it works from the supplied data
 * buffer alone.
 *
 * \param[in]      data    Pointer to a block of data to parse
 * \param[in]      data_sz Size of the data buffer
 * \param[in,out]  si      Pointer to stream info to update. The size member
 *                         \ref MUST be properly initialized, but \ref MAY be
 *                         clobbered by the algorithm. This parameter \ref MAY
 *                         be NULL.
 *
 * \retval #AOM_CODEC_OK
 *     Bitstream is parsable and stream information updated
 */
 typedef aom_codec_err_t (*aom_codec_peek_si_fn_t)(const uint8_t *data,
                                                  unsigned int data_sz,
                                                  aom_codec_stream_info_t *si);
 /*!\brief get stream info function pointer prototype
 *
 * Returns information about the stream that has been parsed during decoding.
 * In contrast to the peek prototype, this one is given the instance context
 * instead of a raw data buffer.
 *
 * \param[in]      ctx     Pointer to this instance's context
 * \param[in,out]  si      Pointer to stream info to update. The size member
 *                         \ref MUST be properly initialized, but \ref MAY be
 *                         clobbered by the algorithm. This parameter \ref MAY
 *                         be NULL.
 *
 * \retval #AOM_CODEC_OK
 *     Bitstream is parsable and stream information updated
 */
 typedef aom_codec_err_t (*aom_codec_get_si_fn_t)(aom_codec_alg_priv_t *ctx,
                                                 aom_codec_stream_info_t *si);
 /*!\brief control function pointer prototype
 *
 * This function is used to exchange algorithm specific data with the decoder
 * instance. This can be used to implement features specific to a particular
 * algorithm.
 *
 * This function is called by the generic aom_codec_control() wrapper
 * function, so plugins implementing this interface may trust the input
 * parameters to be properly initialized. However,  this interface does not
 * provide type safety for the exchanged data or assign meanings to the
 * control codes. Those details should be specified in the algorithm's
 * header file. In particular, the control identifier is guaranteed to exist
 * in the algorithm's control mapping table, and the exchanged data may be
 * NULL.
 *
 * The control identifier is not a parameter of this prototype, and the data
 * to exchange arrives through the variable argument list rather than as a
 * typed parameter.
 *
 * \param[in]     ctx              Pointer to this instance's context
 * \param[in,out] ap               Variable argument list carrying the data
 *                                 to exchange with the algorithm instance.
 *
 * \retval #AOM_CODEC_OK
 *     The internal state data was deserialized.
 */
 typedef aom_codec_err_t (*aom_codec_control_fn_t)(aom_codec_alg_priv_t *ctx,
                                                  va_list ap);
 /*!\brief control function pointer mapping
 *
 * This structure stores the mapping between control identifiers and
 * implementing functions. Each algorithm provides a list of these
 * mappings. This list is searched by the aom_codec_control() wrapper
 * function to determine which function to invoke. The special
 * value {0, NULL} is used to indicate end-of-list, and must be
 * present. The special value {0, <non-null>} can be used as a catch-all
 * mapping. This implies that ctrl_id values chosen by the algorithm
 * \ref MUST be non-zero.
 *
 * Note that the typedef itself is const-qualified.
 */
 typedef const struct aom_codec_ctrl_fn_map {
  int ctrl_id;               /**< Control identifier; 0 is reserved (above). */
  aom_codec_control_fn_t fn; /**< Function implementing the control. */
 } aom_codec_ctrl_fn_map_t;
 /*!\brief decode data function pointer prototype
 *
 * Processes a buffer of coded data. If the processing results in a new
 * decoded frame becoming available, #AOM_CODEC_CB_PUT_SLICE and
 * #AOM_CODEC_CB_PUT_FRAME events are generated as appropriate. This
 * function is called by the generic aom_codec_decode() wrapper function,
 * so plugins implementing this interface may trust the input parameters
 * to be properly initialized.
 *
 * \param[in] ctx          Pointer to this instance's context
 * \param[in] data         Pointer to this block of new coded data. If
 *                         NULL, a #AOM_CODEC_CB_PUT_FRAME event is posted
 *                         for the previously decoded frame.
 * \param[in] data_sz      Size of the coded data, in bytes.
 * \param[in] user_priv    NOTE(review): not described in the original
 *                         comment; presumably application data associated
 *                         with this buffer -- confirm.
 * \param[in] deadline     NOTE(review): not described in the original
 *                         comment; presumably a soft decoding time budget
 *                         -- confirm units and meaning of 0.
 *
 * \return Returns #AOM_CODEC_OK if the coded data was processed completely
 *         and future pictures can be decoded without error. Otherwise,
 *         see the descriptions of the other error codes in ::aom_codec_err_t
 *         for recoverability capabilities.
 */
 typedef aom_codec_err_t (*aom_codec_decode_fn_t)(aom_codec_alg_priv_t *ctx,
                                                 const uint8_t *data,
                                                 unsigned int data_sz,
                                                 void *user_priv,
                                                 long deadline);
 /*!\brief Decoded frames iterator
 *
 * Iterates over a list of the frames available for display. The iterator
 * storage should be initialized to NULL to start the iteration. Iteration is
 * complete when this function returns NULL.
 *
 * The list of available frames becomes valid upon completion of the
 * aom_codec_decode call, and remains valid until the next call to
 * aom_codec_decode.
 *
 * \param[in]     ctx      Pointer to this instance's context
 * \param[in,out] iter     Iterator storage, initialized to NULL
 *
 * \return Returns a pointer to an image, if one is ready for display. Frames
 *         produced will always be in PTS (presentation time stamp) order.
 */
 typedef aom_image_t *(*aom_codec_get_frame_fn_t)(aom_codec_alg_priv_t *ctx,
                                                 aom_codec_iter_t *iter);
 /*!\brief Pass in external frame buffers for the decoder to use.
 *
 * Registers functions to be called when libaom needs a frame buffer
 * to decode the current frame and a function to be called when libaom does
 * not internally reference the frame buffer. This set function must
 * be called before the first call to decode or libaom will assume the
 * default behavior of allocating frame buffers internally.
 *
 * This hook is reached through aom_codec_set_frame_buffer_functions(), which
 * rejects NULL callbacks and interfaces that do not advertise
 * #AOM_CODEC_CAP_EXTERNAL_FRAME_BUFFER before forwarding the call.
 *
 * \param[in] ctx          Pointer to this instance's context
 * \param[in] cb_get       Pointer to the get callback function
 * \param[in] cb_release   Pointer to the release callback function
 * \param[in] cb_priv      Callback's private data
 *
 * \retval #AOM_CODEC_OK
 *     External frame buffers will be used by libaom.
 * \retval #AOM_CODEC_INVALID_PARAM
 *     One or more of the callbacks were NULL.
 * \retval #AOM_CODEC_ERROR
 *     Decoder context not initialized, or algorithm not capable of
 *     using external frame buffers.
 *
 * \note
 * When decoding AV1, the application may be required to pass in at least
 * #AOM_MAXIMUM_WORK_BUFFERS external frame
 * buffers.
 */
 typedef aom_codec_err_t (*aom_codec_set_fb_fn_t)(
    aom_codec_alg_priv_t *ctx, aom_get_frame_buffer_cb_fn_t cb_get,
    aom_release_frame_buffer_cb_fn_t cb_release, void *cb_priv);
 /*!\brief Encode function pointer prototype
 *
 * Called by the generic aom_codec_encode() wrapper, which forwards its pts,
 * duration, flags and deadline arguments unchanged. A non-zero return value is
 * treated as failure by the wrapper.
 */
 typedef aom_codec_err_t (*aom_codec_encode_fn_t)(aom_codec_alg_priv_t *ctx,
                                                 const aom_image_t *img,
                                                 aom_codec_pts_t pts,
                                                 unsigned long duration,
                                                 aom_enc_frame_flags_t flags,
                                                 unsigned long deadline);
 /*!\brief Compressed data iterator prototype
 *
 * Called by aom_codec_get_cx_data() with the caller's iterator storage.
 */
 typedef const aom_codec_cx_pkt_t *(*aom_codec_get_cx_data_fn_t)(
    aom_codec_alg_priv_t *ctx, aom_codec_iter_t *iter);
 /*!\brief Encoder reconfiguration prototype
 *
 * Called by aom_codec_enc_config_set() with the new configuration.
 */
 typedef aom_codec_err_t (*aom_codec_enc_config_set_fn_t)(
    aom_codec_alg_priv_t *ctx, const aom_codec_enc_cfg_t *cfg);
 /*!\brief Global header retrieval prototype
 *
 * Optional: aom_codec_get_global_headers() reports #AOM_CODEC_INCAPABLE when
 * the interface leaves this pointer NULL.
 */
 typedef aom_fixed_buf_t *(*aom_codec_get_global_headers_fn_t)(
    aom_codec_alg_priv_t *ctx);
 /*!\brief Preview frame retrieval prototype
 *
 * Optional: aom_codec_get_preview_frame() reports #AOM_CODEC_INCAPABLE when
 * the interface leaves this pointer NULL.
 */
 typedef aom_image_t *(*aom_codec_get_preview_frame_fn_t)(
    aom_codec_alg_priv_t *ctx);
 /*!\brief Multi-resolution shared memory prototype
 *
 * Called once by aom_codec_enc_init_multi_ver() before any encoder is
 * initialized; the pointer stored in *mem_loc is then handed to every encoder
 * instance as mr_low_res_mode_info.
 */
 typedef aom_codec_err_t (*aom_codec_enc_mr_get_mem_loc_fn_t)(
    const aom_codec_enc_cfg_t *cfg, void **mem_loc);
 /*!\brief usage configuration mapping
 *
 * This structure stores the mapping between usage identifiers and
 * configuration structures. Each algorithm provides a list of these
 * mappings. This list is searched by the aom_codec_enc_config_default()
 * wrapper function to determine which config to return. The special value
 * {-1, {0}} is used to indicate end-of-list, and must be present. At least
 * one mapping must be present, in addition to the end-of-list.
 *
 * NOTE(review): aom_codec_enc_config_default() bounds its search with
 * aom_codec_enc_iface::cfg_map_count rather than with the end-of-list entry;
 * confirm whether the sentinel is still required.
 */
 typedef const struct aom_codec_enc_cfg_map {
  int usage;               /**< Usage identifier matched against the request */
  aom_codec_enc_cfg_t cfg; /**< Configuration copied out on a match */
 } aom_codec_enc_cfg_map_t;
 /*!\brief Codec algorithm interface
 *
 * All decoders \ref MUST expose a variable of this type. The same structure
 * also carries the encoder entry points in its enc member.
 */
 struct aom_codec_iface {
  const char *name;                   /**< Identification String  */
  int abi_version;                    /**< Implemented ABI version */
  aom_codec_caps_t caps;              /**< Decoder capabilities */
  aom_codec_init_fn_t init;           /**< \copydoc ::aom_codec_init_fn_t */
  aom_codec_destroy_fn_t destroy;     /**< \copydoc ::aom_codec_destroy_fn_t */
  aom_codec_ctrl_fn_map_t *ctrl_maps; /**< \copydoc ::aom_codec_ctrl_fn_map_t */
  struct aom_codec_dec_iface {
    aom_codec_peek_si_fn_t peek_si; /**< \copydoc ::aom_codec_peek_si_fn_t */
    aom_codec_get_si_fn_t get_si;   /**< \copydoc ::aom_codec_get_si_fn_t */
    aom_codec_decode_fn_t decode;   /**< \copydoc ::aom_codec_decode_fn_t */
    aom_codec_get_frame_fn_t
        get_frame;                   /**< \copydoc ::aom_codec_get_frame_fn_t */
    aom_codec_set_fb_fn_t set_fb_fn; /**< \copydoc ::aom_codec_set_fb_fn_t */
  } dec;
  struct aom_codec_enc_iface {
    int cfg_map_count; /**< Number of entries searched in cfg_maps */
    aom_codec_enc_cfg_map_t
        *cfg_maps;                /**< \copydoc ::aom_codec_enc_cfg_map_t */
    aom_codec_encode_fn_t encode; /**< \copydoc ::aom_codec_encode_fn_t */
    aom_codec_get_cx_data_fn_t
        get_cx_data; /**< \copydoc ::aom_codec_get_cx_data_fn_t */
    aom_codec_enc_config_set_fn_t
        cfg_set; /**< \copydoc ::aom_codec_enc_config_set_fn_t */
    aom_codec_get_global_headers_fn_t
        get_glob_hdrs; /**< \copydoc ::aom_codec_get_global_headers_fn_t */
    aom_codec_get_preview_frame_fn_t
        get_preview; /**< \copydoc ::aom_codec_get_preview_frame_fn_t */
    aom_codec_enc_mr_get_mem_loc_fn_t
        mr_get_mem_loc; /**< \copydoc ::aom_codec_enc_mr_get_mem_loc_fn_t */
  } enc;
 };
 /*!\brief Callback function pointer / user data pair storage
 *
 * One pair is kept per callback kind; the members are filled in by
 * aom_codec_register_put_frame_cb() and aom_codec_register_put_slice_cb().
 */
 typedef struct aom_codec_priv_cb_pair {
  union {
    aom_codec_put_frame_cb_fn_t put_frame; /**< Frame callback */
    aom_codec_put_slice_cb_fn_t put_slice; /**< Slice callback */
  } u;
  void *user_priv; /**< Opaque pointer stored alongside the callback */
 } aom_codec_priv_cb_pair_t;
 /*!\brief Instance private storage
 *
 * This structure is allocated by the algorithm's init function. It can be
 * extended in one of two ways. First, a second, algorithm specific structure
 * can be allocated and the priv member pointed to it. Alternatively, this
 * structure can be made the first member of the algorithm specific structure,
 * and the pointer cast to the proper type.
 */
 struct aom_codec_priv {
  /* Returned by aom_codec_error_detail() while the context reports an error. */
  const char *err_detail;
  aom_codec_flags_t init_flags;
  struct {
    /* Stored by aom_codec_register_put_frame_cb(). */
    aom_codec_priv_cb_pair_t put_frame_cb;
    /* Stored by aom_codec_register_put_slice_cb(). */
    aom_codec_priv_cb_pair_t put_slice_cb;
  } dec;
  struct {
    /* Destination buffer and padding set by aom_codec_set_cx_data_buf(). */
    aom_fixed_buf_t cx_data_dst_buf;
    unsigned int cx_data_pad_before;
    unsigned int cx_data_pad_after;
    /* Packet handed back by aom_codec_get_cx_data() when it has copied the
     * codec's data into cx_data_dst_buf. */
    aom_codec_cx_pkt_t cx_data_pkt;
    /* Read by aom_codec_encode(): 1 selects the single-encoder path, any other
     * value the multi-resolution path. */
    unsigned int total_encoders;
  } enc;
 };
 /*
 * Multi-resolution encoding internal configuration
 *
 * One instance is filled in per encoder by aom_codec_enc_init_multi_ver() and
 * passed to the algorithm's init function.
 */
 struct aom_codec_priv_enc_mr_cfg {
  unsigned int mr_total_resolutions; /* number of encoders requested */
  unsigned int mr_encoder_id;        /* num_enc - 1 - index of this encoder */
  struct aom_rational mr_down_sampling_factor; /* copy of the caller's dsf */
  void *mr_low_res_mode_info; /* pointer obtained from mr_get_mem_loc() */
 };
 /* Redefine the AOM_CTRL_USE_TYPE hooks so that each (id, typ) pair expands to
 * a static helper named id##__value() that reads one argument of type typ
 * from a va_list. The deprecated variant expands to the same helper.
 */
 #undef AOM_CTRL_USE_TYPE
 #define AOM_CTRL_USE_TYPE(id, typ) \
  static AOM_INLINE typ id##__value(va_list args) { return va_arg(args, typ); }
 #undef AOM_CTRL_USE_TYPE_DEPRECATED
 #define AOM_CTRL_USE_TYPE_DEPRECATED(id, typ) \
  static AOM_INLINE typ id##__value(va_list args) { return va_arg(args, typ); }
 /* Fetches the argument of control `id` from the va_list `arg` through the
 * id##__value() helper generated above.
 */
 #define CAST(id, arg) id##__value(arg)
 /* CODEC_INTERFACE convenience macro
 *
 * By convention, each codec interface is a struct with extern linkage, where
 * the symbol is suffixed with _algo. A getter function is also defined to
 * return a pointer to the struct, since in some cases it's easier to work
 * with text symbols than data symbols (see issue #169). This function has
 * the same name as the struct, less the _algo suffix. The CODEC_INTERFACE
 * macro is provided to define this getter function automatically.
 *
 * The expansion ends with the declarator of the id##_algo object and no
 * semicolon, so the invocation is expected to be followed directly by the
 * object's initializer.
 */
 #define CODEC_INTERFACE(id)                          \
  aom_codec_iface_t *id(void) { return &id##_algo; } \
  aom_codec_iface_t id##_algo
 /* Internal Utility Functions
 *
 * The following functions are intended to be used inside algorithms as
 * utilities for manipulating aom_codec_* data structures.
 */
 /* Header of a packet list: cnt packets are stored out of a capacity of max.
 * pkts is declared with one element; storage for the remaining packets comes
 * from the enclosing aom_codec_pkt_list_decl() union.
 */
 struct aom_codec_pkt_list {
  unsigned int cnt;
  unsigned int max;
  struct aom_codec_cx_pkt pkts[1];
 };
 /* Declares an anonymous union that overlays a list header with a header
 * followed by n additional packet slots.
 */
 #define aom_codec_pkt_list_decl(n)     \
  union {                              \
    struct aom_codec_pkt_list head;    \
    struct {                           \
      struct aom_codec_pkt_list head;  \
      struct aom_codec_cx_pkt pkts[n]; \
    } alloc;                           \
  }
 /* Empties the list pointed to by m and sets its capacity to the number of
 * elements in alloc.pkts. Expands to a comma expression without enclosing
 * parentheses, so use it as a statement of its own.
 */
 #define aom_codec_pkt_list_init(m) \
  (m)->alloc.head.cnt = 0,         \
  (m)->alloc.head.max = sizeof((m)->alloc.pkts) / sizeof((m)->alloc.pkts[0])
 /* Appends a copy of the packet; returns 0 on success and 1 when the list is
 * already full.
 */
 int aom_codec_pkt_list_add(struct aom_codec_pkt_list *,
                           const struct aom_codec_cx_pkt *);
 /* Returns the next stored packet and advances *iter, or NULL once every
 * stored packet has been returned. *iter must be NULL on the first call.
 */
 const aom_codec_cx_pkt_t *aom_codec_pkt_list_get(
    struct aom_codec_pkt_list *list, aom_codec_iter_t *iter);
 #include <stdio.h>
 #include <setjmp.h>
 /* Error state written by aom_internal_error(). */
 struct aom_internal_error_info {
  aom_codec_err_t error_code; /* last error passed to aom_internal_error() */
  int has_detail;             /* 1 when detail holds a formatted message */
  char detail[80];            /* NUL-terminated, truncated to fit */
  int setjmp;  /* non-zero: aom_internal_error() longjmp()s to jmp */
  jmp_buf jmp; /* jump target used when setjmp is non-zero */
 };
 /* CLANG_ANALYZER_NORETURN expands to the analyzer_noreturn attribute when the
 * compiler reports support for it through __has_feature, and to nothing
 * otherwise.
 */
 #define CLANG_ANALYZER_NORETURN
 #if defined(__has_feature)
 #if __has_feature(attribute_analyzer_noreturn)
 #undef CLANG_ANALYZER_NORETURN
 #define CLANG_ANALYZER_NORETURN __attribute__((analyzer_noreturn))
 #endif
 #endif
 /* Records `error` and an optional printf-style message in `info`, then
 * longjmp()s to info->jmp when info->setjmp is non-zero. When info->setjmp is
 * zero the function returns normally.
 */
 void aom_internal_error(struct aom_internal_error_info *info,
                        aom_codec_err_t error, const char *fmt,
                        ...) CLANG_ANALYZER_NORETURN;
 /* AOM_CHECK_MEM_ERROR(error_info, lval, expr)
 *
 * Assigns expr to lval and reports AOM_CODEC_MEM_ERROR through
 * aom_internal_error() when the result is zero/NULL. With CONFIG_DEBUG the
 * message also carries the file name and line number of the call site.
 * lval is expanded more than once, so it should be free of side effects.
 */
 #if CONFIG_DEBUG
 #define AOM_CHECK_MEM_ERROR(error_info, lval, expr)                         \
  do {                                                                      \
    lval = (expr);                                                          \
    if (!lval)                                                              \
      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR,                   \
                         "Failed to allocate " #lval " at %s:%d", __FILE__, \
                         __LINE__);                                         \
  } while (0)
 #else
 #define AOM_CHECK_MEM_ERROR(error_info, lval, expr)       \
  do {                                                    \
    lval = (expr);                                        \
    if (!lval)                                            \
      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
                         "Failed to allocate " #lval);    \
  } while (0)
 #endif
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 #endif  // AOM_INTERNAL_AOM_CODEC_INTERNAL_H_
--- a/aom/src/aom_codec.c
+++ b/aom/src/aom_codec.c
@@ -0,0 +1,134 @@
 /*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
 /*!\file
 * \brief Provides the high level interface common to all codec algorithms.
 *
 */
 #include <stdarg.h>
 #include <stdlib.h>
 #include "aom/aom_integer.h"
 #include "aom/internal/aom_codec_internal.h"
 #include "aom_version.h"
 /* Stores status `var` in ctx->err when ctx is non-NULL and evaluates to that
 * status either way. `ctx` may be evaluated twice; `var` is evaluated once.
 * Both arguments are parenthesized so that expressions such as &obj expand
 * correctly.
 */
 #define SAVE_STATUS(ctx, var) ((ctx) ? ((ctx)->err = (var)) : (var))
 /* Returns the VERSION_PACKED constant of this build. */
 int aom_codec_version(void) {
  return VERSION_PACKED;
 }
 /* Returns the VERSION_STRING_NOSP constant of this build. */
 const char *aom_codec_version_str(void) {
  return VERSION_STRING_NOSP;
 }
 /* Returns the VERSION_EXTRA constant of this build. */
 const char *aom_codec_version_extra_str(void) {
  return VERSION_EXTRA;
 }
 /* Returns the interface's name, or a placeholder string for a NULL iface. */
 const char *aom_codec_iface_name(aom_codec_iface_t *iface) {
  if (!iface) return "<invalid interface>";
  return iface->name;
 }
 /* Maps an error code to a static, human readable description. Codes without
 * a dedicated message yield "Unrecognized error code".
 */
 const char *aom_codec_err_to_string(aom_codec_err_t err) {
  const char *text = "Unrecognized error code";
  switch (err) {
    case AOM_CODEC_OK: text = "Success"; break;
    case AOM_CODEC_ERROR: text = "Unspecified internal error"; break;
    case AOM_CODEC_MEM_ERROR: text = "Memory allocation error"; break;
    case AOM_CODEC_ABI_MISMATCH: text = "ABI version mismatch"; break;
    case AOM_CODEC_INCAPABLE:
      text = "Codec does not implement requested capability";
      break;
    case AOM_CODEC_UNSUP_BITSTREAM:
      text = "Bitstream not supported by this decoder";
      break;
    case AOM_CODEC_UNSUP_FEATURE:
      text = "Bitstream required feature not supported by this decoder";
      break;
    case AOM_CODEC_CORRUPT_FRAME: text = "Corrupt frame detected"; break;
    case AOM_CODEC_INVALID_PARAM: text = "Invalid parameter"; break;
    case AOM_CODEC_LIST_END: text = "End of iterated list"; break;
  }
  return text;
 }
 /* Describes the context's last status; a NULL context is reported as an
 * invalid parameter.
 */
 const char *aom_codec_error(aom_codec_ctx_t *ctx) {
  if (!ctx) return aom_codec_err_to_string(AOM_CODEC_INVALID_PARAM);
  return aom_codec_err_to_string(ctx->err);
 }
 /* Returns the detail string for the context's current error, or NULL when
 * the context is NULL or reports no error. The string held by the private
 * storage takes precedence over the one held by the context itself.
 */
 const char *aom_codec_error_detail(aom_codec_ctx_t *ctx) {
  if (!ctx || !ctx->err) return NULL;
  if (ctx->priv) return ctx->priv->err_detail;
  return ctx->err_detail;
 }
 /* Invokes the algorithm's destroy hook and detaches the context from it.
 *
 * Returns AOM_CODEC_INVALID_PARAM for a NULL context, AOM_CODEC_ERROR when
 * the context has no interface or private storage, AOM_CODEC_OK otherwise.
 * The status is also recorded in ctx->err when ctx is non-NULL.
 */
 aom_codec_err_t aom_codec_destroy(aom_codec_ctx_t *ctx) {
  aom_codec_err_t status = AOM_CODEC_INVALID_PARAM;
  if (ctx) {
    if (ctx->iface && ctx->priv) {
      ctx->iface->destroy((aom_codec_alg_priv_t *)ctx->priv);
      ctx->iface = NULL;
      ctx->name = NULL;
      ctx->priv = NULL;
      status = AOM_CODEC_OK;
    } else {
      status = AOM_CODEC_ERROR;
    }
  }
  return SAVE_STATUS(ctx, status);
 }
 /* Returns the capability bits of the interface, or 0 for a NULL iface. */
 aom_codec_caps_t aom_codec_get_caps(aom_codec_iface_t *iface) {
  if (!iface) return 0;
  return iface->caps;
 }
 /* Dispatches a control request to the first matching handler in the
 * interface's control map.
 *
 * The map is walked until an entry without a handler is reached. An entry
 * matches when its ctrl_id equals the requested one or is 0. The handler
 * receives the variadic arguments as a va_list. Returns
 * AOM_CODEC_INVALID_PARAM for a NULL context or a zero ctrl_id,
 * AOM_CODEC_ERROR when the context is not initialized, has no control map or
 * no entry matches, and the handler's result otherwise.
 */
 aom_codec_err_t aom_codec_control_(aom_codec_ctx_t *ctx, int ctrl_id, ...) {
  aom_codec_err_t status;
  if (!ctx || !ctrl_id) {
    status = AOM_CODEC_INVALID_PARAM;
  } else if (!ctx->iface || !ctx->priv || !ctx->iface->ctrl_maps) {
    status = AOM_CODEC_ERROR;
  } else {
    aom_codec_ctrl_fn_map_t *map = ctx->iface->ctrl_maps;
    status = AOM_CODEC_ERROR;
    while (map && map->fn) {
      if (map->ctrl_id == 0 || map->ctrl_id == ctrl_id) {
        va_list args;
        va_start(args, ctrl_id);
        status = map->fn((aom_codec_alg_priv_t *)ctx->priv, args);
        va_end(args);
        break;
      }
      ++map;
    }
  }
  return SAVE_STATUS(ctx, status);
 }
 /* Records an error code and an optional printf-style message in `info`.
 *
 * The message is truncated to fit info->detail and is always NUL-terminated.
 * When info->setjmp is non-zero, control is transferred to info->jmp with the
 * error code as the longjmp value; otherwise the function returns.
 */
 void aom_internal_error(struct aom_internal_error_info *info,
                        aom_codec_err_t error, const char *fmt, ...) {
  info->error_code = error;
  info->has_detail = fmt ? 1 : 0;
  if (fmt) {
    const size_t last = sizeof(info->detail) - 1;
    va_list args;
    va_start(args, fmt);
    vsnprintf(info->detail, last, fmt, args);
    va_end(args);
    info->detail[last] = '\0';
  }
  if (info->setjmp) {
    longjmp(info->jmp, info->error_code);
  }
 }
--- a/aom/src/aom_decoder.c
+++ b/aom/src/aom_decoder.c
@@ -0,0 +1,189 @@
 /*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
 /*!\file
 * \brief Provides the high level interface to wrap decoder algorithms.
 *
 */
 #include <string.h>
 #include "aom/internal/aom_codec_internal.h"
 /* Stores status `var` in ctx->err when ctx is non-NULL and evaluates to that
 * status either way. `ctx` may be evaluated twice; `var` is evaluated once.
 * Both arguments are parenthesized so that expressions such as &obj expand
 * correctly.
 */
 #define SAVE_STATUS(ctx, var) ((ctx) ? ((ctx)->err = (var)) : (var))
 /* Reinterprets the context's private storage pointer as the algorithm's
 * private type.
 */
 static aom_codec_alg_priv_t *get_alg_priv(aom_codec_ctx_t *ctx) {
  aom_codec_priv_t *const priv = ctx->priv;
  return (aom_codec_alg_priv_t *)priv;
 }
 /* Initializes a decoder context for the given interface.
 *
 * Checks, in order: the caller's ABI version, NULL ctx/iface, the interface's
 * internal ABI version, and that every requested flag (and decoding itself)
 * is backed by a capability bit. On success the context is zeroed, populated
 * and handed to the algorithm's init hook. If init fails, its detail string
 * is copied to the context before the context is destroyed. The resulting
 * status is stored in ctx->err when ctx is non-NULL.
 */
 aom_codec_err_t aom_codec_dec_init_ver(aom_codec_ctx_t *ctx,
                                       aom_codec_iface_t *iface,
                                       const aom_codec_dec_cfg_t *cfg,
                                       aom_codec_flags_t flags, int ver) {
  aom_codec_err_t status;
  if (ver != AOM_DECODER_ABI_VERSION) {
    status = AOM_CODEC_ABI_MISMATCH;
  } else if (!ctx || !iface) {
    status = AOM_CODEC_INVALID_PARAM;
  } else if (iface->abi_version != AOM_CODEC_INTERNAL_ABI_VERSION) {
    status = AOM_CODEC_ABI_MISMATCH;
  } else if (((flags & AOM_CODEC_USE_POSTPROC) &&
              !(iface->caps & AOM_CODEC_CAP_POSTPROC)) ||
             ((flags & AOM_CODEC_USE_ERROR_CONCEALMENT) &&
              !(iface->caps & AOM_CODEC_CAP_ERROR_CONCEALMENT)) ||
             ((flags & AOM_CODEC_USE_INPUT_FRAGMENTS) &&
              !(iface->caps & AOM_CODEC_CAP_INPUT_FRAGMENTS)) ||
             !(iface->caps & AOM_CODEC_CAP_DECODER)) {
    status = AOM_CODEC_INCAPABLE;
  } else {
    memset(ctx, 0, sizeof(*ctx));
    ctx->iface = iface;
    ctx->name = iface->name;
    ctx->priv = NULL;
    ctx->init_flags = flags;
    ctx->config.dec = cfg;
    status = ctx->iface->init(ctx, NULL);
    if (status) {
      /* Keep the detail string reachable after priv is torn down. */
      if (ctx->priv)
        ctx->err_detail = ctx->priv->err_detail;
      else
        ctx->err_detail = NULL;
      aom_codec_destroy(ctx);
    }
  }
  return SAVE_STATUS(ctx, status);
 }
 /* Parses stream information from a buffer without a decoder instance.
 *
 * Returns AOM_CODEC_INVALID_PARAM when any argument is NULL/zero or si->sz is
 * smaller than the stream info structure. Otherwise si->w and si->h are reset
 * to 0 and the interface's peek_si hook decides the result.
 */
 aom_codec_err_t aom_codec_peek_stream_info(aom_codec_iface_t *iface,
                                           const uint8_t *data,
                                           unsigned int data_sz,
                                           aom_codec_stream_info_t *si) {
  if (!iface || !data || !data_sz || !si ||
      si->sz < sizeof(aom_codec_stream_info_t)) {
    return AOM_CODEC_INVALID_PARAM;
  }
  /* Set default/unknown values */
  si->w = 0;
  si->h = 0;
  return iface->dec.peek_si(data, data_sz, si);
 }
 /* Queries stream information from an initialized decoder.
 *
 * Returns AOM_CODEC_INVALID_PARAM for NULL arguments or an undersized si->sz,
 * AOM_CODEC_ERROR when the context is not initialized, and the get_si hook's
 * result otherwise (after resetting si->w and si->h to 0). The status is
 * stored in ctx->err when ctx is non-NULL.
 */
 aom_codec_err_t aom_codec_get_stream_info(aom_codec_ctx_t *ctx,
                                          aom_codec_stream_info_t *si) {
  aom_codec_err_t status;
  if (!ctx || !si || si->sz < sizeof(aom_codec_stream_info_t)) {
    status = AOM_CODEC_INVALID_PARAM;
  } else if (ctx->iface && ctx->priv) {
    /* Set default/unknown values */
    si->w = 0;
    si->h = 0;
    status = ctx->iface->dec.get_si(get_alg_priv(ctx), si);
  } else {
    status = AOM_CODEC_ERROR;
  }
  return SAVE_STATUS(ctx, status);
 }
 /* Feeds a buffer of coded data to the decoder.
 *
 * data and data_sz must either both be set or both be NULL/0; any other
 * combination, or a NULL context, yields AOM_CODEC_INVALID_PARAM. An
 * uninitialized context yields AOM_CODEC_ERROR. Otherwise the call is
 * forwarded to the interface's decode hook. The status is stored in ctx->err
 * when ctx is non-NULL.
 */
 aom_codec_err_t aom_codec_decode(aom_codec_ctx_t *ctx, const uint8_t *data,
                                 unsigned int data_sz, void *user_priv,
                                 long deadline) {
  aom_codec_err_t status;
  /* A NULL data pointer is only acceptable together with a zero size, and a
   * zero size only together with a NULL pointer. */
  const int args_mismatch = (!data) != (!data_sz);
  if (!ctx || args_mismatch) {
    status = AOM_CODEC_INVALID_PARAM;
  } else if (ctx->iface && ctx->priv) {
    status = ctx->iface->dec.decode(get_alg_priv(ctx), data, data_sz,
                                    user_priv, deadline);
  } else {
    status = AOM_CODEC_ERROR;
  }
  return SAVE_STATUS(ctx, status);
 }
 /* Returns the next decoded frame from the interface's get_frame hook, or
 * NULL when ctx/iter is NULL or the context is not initialized.
 */
 aom_image_t *aom_codec_get_frame(aom_codec_ctx_t *ctx, aom_codec_iter_t *iter) {
  if (!ctx || !iter || !ctx->iface || !ctx->priv) return NULL;
  return ctx->iface->dec.get_frame(get_alg_priv(ctx), iter);
 }
 /* Stores a frame callback and its user pointer in the context's private
 * storage.
 *
 * Returns AOM_CODEC_INVALID_PARAM for a NULL ctx or cb, AOM_CODEC_ERROR when
 * the context is not initialized or the interface lacks
 * AOM_CODEC_CAP_PUT_FRAME, AOM_CODEC_OK otherwise. The status is stored in
 * ctx->err when ctx is non-NULL.
 */
 aom_codec_err_t aom_codec_register_put_frame_cb(aom_codec_ctx_t *ctx,
                                                aom_codec_put_frame_cb_fn_t cb,
                                                void *user_priv) {
  aom_codec_err_t status;
  if (!ctx || !cb) {
    status = AOM_CODEC_INVALID_PARAM;
  } else if (ctx->iface && ctx->priv &&
             (ctx->iface->caps & AOM_CODEC_CAP_PUT_FRAME)) {
    aom_codec_priv_cb_pair_t *const slot = &ctx->priv->dec.put_frame_cb;
    slot->u.put_frame = cb;
    slot->user_priv = user_priv;
    status = AOM_CODEC_OK;
  } else {
    status = AOM_CODEC_ERROR;
  }
  return SAVE_STATUS(ctx, status);
 }
 /* Stores a slice callback and its user pointer in the context's private
 * storage.
 *
 * Returns AOM_CODEC_INVALID_PARAM for a NULL ctx or cb, AOM_CODEC_ERROR when
 * the context is not initialized or the interface lacks
 * AOM_CODEC_CAP_PUT_SLICE, AOM_CODEC_OK otherwise. The status is stored in
 * ctx->err when ctx is non-NULL.
 */
 aom_codec_err_t aom_codec_register_put_slice_cb(aom_codec_ctx_t *ctx,
                                                aom_codec_put_slice_cb_fn_t cb,
                                                void *user_priv) {
  aom_codec_err_t status;
  if (!ctx || !cb) {
    status = AOM_CODEC_INVALID_PARAM;
  } else if (ctx->iface && ctx->priv &&
             (ctx->iface->caps & AOM_CODEC_CAP_PUT_SLICE)) {
    aom_codec_priv_cb_pair_t *const slot = &ctx->priv->dec.put_slice_cb;
    slot->u.put_slice = cb;
    slot->user_priv = user_priv;
    status = AOM_CODEC_OK;
  } else {
    status = AOM_CODEC_ERROR;
  }
  return SAVE_STATUS(ctx, status);
 }
 /* Registers external frame buffer callbacks with the decoder.
 *
 * Returns AOM_CODEC_INVALID_PARAM when ctx or either callback is NULL,
 * AOM_CODEC_ERROR when the context is not initialized or the interface lacks
 * AOM_CODEC_CAP_EXTERNAL_FRAME_BUFFER, and the set_fb_fn hook's result
 * otherwise. The status is stored in ctx->err when ctx is non-NULL.
 */
 aom_codec_err_t aom_codec_set_frame_buffer_functions(
    aom_codec_ctx_t *ctx, aom_get_frame_buffer_cb_fn_t cb_get,
    aom_release_frame_buffer_cb_fn_t cb_release, void *cb_priv) {
  aom_codec_err_t status;
  if (!ctx || !cb_get || !cb_release) {
    status = AOM_CODEC_INVALID_PARAM;
  } else if (ctx->iface && ctx->priv &&
             (ctx->iface->caps & AOM_CODEC_CAP_EXTERNAL_FRAME_BUFFER)) {
    status = ctx->iface->dec.set_fb_fn(get_alg_priv(ctx), cb_get, cb_release,
                                       cb_priv);
  } else {
    status = AOM_CODEC_ERROR;
  }
  return SAVE_STATUS(ctx, status);
 }
--- a/aom/src/aom_encoder.c
+++ b/aom/src/aom_encoder.c
@@ -0,0 +1,380 @@
 /*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
 /*!\file
 * \brief Provides the high level interface to wrap encoder algorithms.
 *
 */
 #include <limits.h>
 #include <string.h>
 #include "aom_config.h"
 #include "aom/internal/aom_codec_internal.h"
 /* Stores status `var` in ctx->err when ctx is non-NULL and evaluates to that
 * status either way. `ctx` may be evaluated twice; `var` is evaluated once.
 * Both arguments are parenthesized so that expressions such as &obj expand
 * correctly.
 */
 #define SAVE_STATUS(ctx, var) ((ctx) ? ((ctx)->err = (var)) : (var))
 /* Reinterprets the context's private storage pointer as the algorithm's
 * private type.
 */
 static aom_codec_alg_priv_t *get_alg_priv(aom_codec_ctx_t *ctx) {
  aom_codec_priv_t *const priv = ctx->priv;
  return (aom_codec_alg_priv_t *)priv;
 }
 /* Initializes an encoder context for the given interface.
 *
 * Checks, in order: the caller's ABI version, NULL ctx/iface/cfg, the
 * interface's internal ABI version, and that encoding plus every requested
 * flag is backed by a capability bit. On success the context fields are
 * populated and the algorithm's init hook is run. If init fails, its detail
 * string is copied to the context before the context is destroyed. The
 * resulting status is stored in ctx->err when ctx is non-NULL.
 */
 aom_codec_err_t aom_codec_enc_init_ver(aom_codec_ctx_t *ctx,
                                       aom_codec_iface_t *iface,
                                       const aom_codec_enc_cfg_t *cfg,
                                       aom_codec_flags_t flags, int ver) {
  aom_codec_err_t status;
  if (ver != AOM_ENCODER_ABI_VERSION) {
    status = AOM_CODEC_ABI_MISMATCH;
  } else if (!ctx || !iface || !cfg) {
    status = AOM_CODEC_INVALID_PARAM;
  } else if (iface->abi_version != AOM_CODEC_INTERNAL_ABI_VERSION) {
    status = AOM_CODEC_ABI_MISMATCH;
  } else if (!(iface->caps & AOM_CODEC_CAP_ENCODER) ||
             ((flags & AOM_CODEC_USE_PSNR) &&
              !(iface->caps & AOM_CODEC_CAP_PSNR)) ||
             ((flags & AOM_CODEC_USE_OUTPUT_PARTITION) &&
              !(iface->caps & AOM_CODEC_CAP_OUTPUT_PARTITION))) {
    status = AOM_CODEC_INCAPABLE;
  } else {
    ctx->iface = iface;
    ctx->name = iface->name;
    ctx->priv = NULL;
    ctx->init_flags = flags;
    ctx->config.enc = cfg;
    status = ctx->iface->init(ctx, NULL);
    if (status) {
      /* Keep the detail string reachable after priv is torn down. */
      if (ctx->priv)
        ctx->err_detail = ctx->priv->err_detail;
      else
        ctx->err_detail = NULL;
      aom_codec_destroy(ctx);
    }
  }
  return SAVE_STATUS(ctx, status);
 }
 /* Initializes num_enc encoder contexts for multi-resolution encoding.
 *
 * ctx, cfg and dsf each point to arrays of num_enc elements; element i of
 * each array describes encoder i. Encoder i receives the id
 * num_enc - 1 - i, and every encoder with a non-zero id has its kf_mode
 * forced to AOM_KF_DISABLED so that key frames follow the lowest-resolution
 * encoder.
 *
 * Returns AOM_CODEC_ABI_MISMATCH, AOM_CODEC_INVALID_PARAM (NULL arrays,
 * num_enc outside 1..16, or an out-of-range down-sampling factor) or
 * AOM_CODEC_INCAPABLE (missing capability or no mr_get_mem_loc hook) for bad
 * arguments, or the failing hook's status. If any encoder fails to
 * initialize or validate, every encoder initialized so far is destroyed
 * again.
 *
 * On success the status is stored in the last context, as before. On failure
 * it is stored in the first context, so the write always stays inside the
 * caller's array.
 *
 * NOTE(review): the memory returned through mr_get_mem_loc is not released
 * here when a later step fails; confirm who owns it.
 */
 aom_codec_err_t aom_codec_enc_init_multi_ver(
    aom_codec_ctx_t *ctx, aom_codec_iface_t *iface, aom_codec_enc_cfg_t *cfg,
    int num_enc, aom_codec_flags_t flags, aom_rational_t *dsf, int ver) {
  aom_codec_ctx_t *status_ctx = ctx;
  aom_codec_err_t res = AOM_CODEC_OK;
  if (ver != AOM_ENCODER_ABI_VERSION)
    res = AOM_CODEC_ABI_MISMATCH;
  else if (!ctx || !iface || !cfg || !dsf || (num_enc > 16 || num_enc < 1))
    res = AOM_CODEC_INVALID_PARAM;
  else if (iface->abi_version != AOM_CODEC_INTERNAL_ABI_VERSION)
    res = AOM_CODEC_ABI_MISMATCH;
  else if (!(iface->caps & AOM_CODEC_CAP_ENCODER))
    res = AOM_CODEC_INCAPABLE;
  else if ((flags & AOM_CODEC_USE_PSNR) && !(iface->caps & AOM_CODEC_CAP_PSNR))
    res = AOM_CODEC_INCAPABLE;
  else if ((flags & AOM_CODEC_USE_OUTPUT_PARTITION) &&
           !(iface->caps & AOM_CODEC_CAP_OUTPUT_PARTITION))
    res = AOM_CODEC_INCAPABLE;
  else if (!iface->enc.mr_get_mem_loc)
    /* The interface does not implement multi-resolution encoding. */
    res = AOM_CODEC_INCAPABLE;
  else {
    void *mem_loc = NULL;
    res = iface->enc.mr_get_mem_loc(cfg, &mem_loc);
    if (!res) {
      int i;
      for (i = 0; i < num_enc; i++) {
        aom_codec_ctx_t *const cur_ctx = &ctx[i];
        aom_codec_enc_cfg_t *const cur_cfg = &cfg[i];
        const aom_rational_t *const cur_dsf = &dsf[i];
        const char *error_detail = NULL;
        /* Validate down-sampling factor. */
        if (cur_dsf->num < 1 || cur_dsf->num > 4096 || cur_dsf->den < 1 ||
            cur_dsf->den > cur_dsf->num) {
          res = AOM_CODEC_INVALID_PARAM;
        } else {
          aom_codec_priv_enc_mr_cfg_t mr_cfg;
          mr_cfg.mr_low_res_mode_info = mem_loc;
          mr_cfg.mr_total_resolutions = num_enc;
          mr_cfg.mr_encoder_id = num_enc - 1 - i;
          mr_cfg.mr_down_sampling_factor.num = cur_dsf->num;
          mr_cfg.mr_down_sampling_factor.den = cur_dsf->den;
          /* Force Key-frame synchronization. Namely, encoder at higher
           * resolution always use the same frame_type chosen by the
           * lowest-resolution encoder.
           */
          if (mr_cfg.mr_encoder_id) cur_cfg->kf_mode = AOM_KF_DISABLED;
          cur_ctx->iface = iface;
          cur_ctx->name = iface->name;
          cur_ctx->priv = NULL;
          cur_ctx->init_flags = flags;
          cur_ctx->config.enc = cur_cfg;
          res = cur_ctx->iface->init(cur_ctx, &mr_cfg);
          if (res) {
            error_detail = cur_ctx->priv ? cur_ctx->priv->err_detail : NULL;
            /* Destroy current ctx */
            cur_ctx->err_detail = error_detail;
            aom_codec_destroy(cur_ctx);
          }
        }
        if (res) {
          /* Destroy already allocated high-level ctx */
          while (i) {
            i--;
            ctx[i].err_detail = error_detail;
            aom_codec_destroy(&ctx[i]);
          }
          break;
        }
      }
      /* All encoders are up: report on the last context, as before. */
      if (!res) status_ctx = &ctx[num_enc - 1];
    }
  }
  return SAVE_STATUS(status_ctx, res);
 }
 /* Copies the interface's default configuration for `usage` into *cfg.
 *
 * Returns AOM_CODEC_INVALID_PARAM for NULL arguments, a usage above INT_MAX
 * or a usage with no mapping, AOM_CODEC_INCAPABLE when the interface is not
 * an encoder, and AOM_CODEC_OK once a mapping has been copied (with
 * cfg->g_usage set to the requested usage).
 */
 aom_codec_err_t aom_codec_enc_config_default(aom_codec_iface_t *iface,
                                             aom_codec_enc_cfg_t *cfg,
                                             unsigned int usage) {
  int idx;
  if (!iface || !cfg || usage > INT_MAX) return AOM_CODEC_INVALID_PARAM;
  if (!(iface->caps & AOM_CODEC_CAP_ENCODER)) return AOM_CODEC_INCAPABLE;
  for (idx = 0; idx < iface->enc.cfg_map_count; ++idx) {
    aom_codec_enc_cfg_map_t *const entry = &iface->enc.cfg_maps[idx];
    if (entry->usage != (int)usage) continue;
    *cfg = entry->cfg;
    cfg->g_usage = usage;
    return AOM_CODEC_OK;
  }
  return AOM_CODEC_INVALID_PARAM;
 }
 /* FLOATING_POINT_INIT() / FLOATING_POINT_RESTORE() bracket code that must run
 * in a normalized floating point environment. They must be used as a pair in
 * the same scope: on x86 the first opens a do { block that the second closes.
 * On other targets both are empty functions, declared with (void) so that
 * they carry a proper prototype.
 */
 #if ARCH_X86 || ARCH_X86_64
 /* On X86, disable the x87 unit's internal 80 bit precision for better
 * consistency with the SSE unit's 64 bit precision.
 */
 #include "aom_ports/x86.h"
 #define FLOATING_POINT_INIT() \
  do {                        \
    unsigned short x87_orig_mode = x87_set_double_precision();
 #define FLOATING_POINT_RESTORE()       \
  x87_set_control_word(x87_orig_mode); \
  }                                    \
  while (0)
 #else
 static void FLOATING_POINT_INIT(void) {}
 static void FLOATING_POINT_RESTORE(void) {}
 #endif
 /* Encodes one frame, or flushes the encoder when img is NULL.
 *
 * Returns AOM_CODEC_INVALID_PARAM for a NULL context or an image with a zero
 * duration, AOM_CODEC_ERROR for an uninitialized context,
 * AOM_CODEC_INCAPABLE when the interface is not an encoder, and the encode
 * hook's status otherwise.
 *
 * When the context's total_encoders is not 1, ctx (and img, if non-NULL) are
 * treated as arrays of total_encoders elements and the levels are encoded
 * from the highest index down to 0, stopping at the first failure.
 *
 * The status is always stored in the context the caller passed in. Levels are
 * addressed by index from that pointer, so the pointer is never moved and a
 * failing level cannot redirect the status write to another element or past
 * the end of the array.
 */
 aom_codec_err_t aom_codec_encode(aom_codec_ctx_t *ctx, const aom_image_t *img,
                                 aom_codec_pts_t pts, unsigned long duration,
                                 aom_enc_frame_flags_t flags,
                                 unsigned long deadline) {
  aom_codec_err_t res = AOM_CODEC_OK;
  if (!ctx || (img && !duration))
    res = AOM_CODEC_INVALID_PARAM;
  else if (!ctx->iface || !ctx->priv)
    res = AOM_CODEC_ERROR;
  else if (!(ctx->iface->caps & AOM_CODEC_CAP_ENCODER))
    res = AOM_CODEC_INCAPABLE;
  else {
    const unsigned int num_enc = ctx->priv->enc.total_encoders;
    /* Execute in a normalized floating point environment, if the platform
     * requires it.
     */
    FLOATING_POINT_INIT();
    if (num_enc == 1) {
      res = ctx->iface->enc.encode(get_alg_priv(ctx), img, pts, duration, flags,
                                   deadline);
    } else {
      /* Multi-resolution encoding:
       * Encode multi-levels in reverse order. For example,
       * if mr_total_resolutions = 3, first encode level 2,
       * then encode level 1, and finally encode level 0.
       */
      unsigned int level = num_enc;
      while (level-- > 0) {
        aom_codec_ctx_t *const level_ctx = &ctx[level];
        const aom_image_t *const level_img = img ? &img[level] : NULL;
        res = level_ctx->iface->enc.encode(get_alg_priv(level_ctx), level_img,
                                           pts, duration, flags, deadline);
        if (res) break;
      }
    }
    FLOATING_POINT_RESTORE();
  }
  return SAVE_STATUS(ctx, res);
 }
 /* Returns the next compressed data packet from the encoder, or NULL.
 *
 * With a non-NULL context, ctx->err is set to AOM_CODEC_INVALID_PARAM (NULL
 * iter), AOM_CODEC_ERROR (uninitialized context) or AOM_CODEC_INCAPABLE (not
 * an encoder) and NULL is returned; otherwise the interface's get_cx_data
 * hook supplies the packet. ctx->err is left untouched on the success path.
 *
 * Frame packets are additionally relocated into the buffer registered with
 * aom_codec_set_cx_data_buf() when they fit, in which case a pointer to the
 * context's own scratch packet is returned.
 */
 const aom_codec_cx_pkt_t *aom_codec_get_cx_data(aom_codec_ctx_t *ctx,
                                                aom_codec_iter_t *iter) {
  const aom_codec_cx_pkt_t *pkt = NULL;
  if (ctx) {
    if (!iter)
      ctx->err = AOM_CODEC_INVALID_PARAM;
    else if (!ctx->iface || !ctx->priv)
      ctx->err = AOM_CODEC_ERROR;
    else if (!(ctx->iface->caps & AOM_CODEC_CAP_ENCODER))
      ctx->err = AOM_CODEC_INCAPABLE;
    else
      pkt = ctx->iface->enc.get_cx_data(get_alg_priv(ctx), iter);
  }
  /* pkt can only be non-NULL here if ctx and ctx->priv passed the checks
   * above, so dereferencing ctx->priv below is safe. */
  if (pkt && pkt->kind == AOM_CODEC_CX_FRAME_PKT) {
    // If the application has specified a destination area for the
    // compressed data, and the codec has not placed the data there,
    // and it fits, copy it.
    aom_codec_priv_t *const priv = ctx->priv;
    char *const dst_buf = (char *)priv->enc.cx_data_dst_buf.buf;
    if (dst_buf && pkt->data.raw.buf != dst_buf &&
        pkt->data.raw.sz + priv->enc.cx_data_pad_before +
                priv->enc.cx_data_pad_after <=
            priv->enc.cx_data_dst_buf.sz) {
      aom_codec_cx_pkt_t *modified_pkt = &priv->enc.cx_data_pkt;
      /* The payload is placed after the leading padding; the reported size
       * covers the payload plus both paddings. */
      memcpy(dst_buf + priv->enc.cx_data_pad_before, pkt->data.raw.buf,
             pkt->data.raw.sz);
      *modified_pkt = *pkt;
      modified_pkt->data.raw.buf = dst_buf;
      modified_pkt->data.raw.sz +=
          priv->enc.cx_data_pad_before + priv->enc.cx_data_pad_after;
      pkt = modified_pkt;
    }
    /* Whether the data was copied above or written there by the codec,
     * consume the used part of the destination buffer so that the next packet
     * lands behind this one. */
    if (dst_buf == pkt->data.raw.buf) {
      priv->enc.cx_data_dst_buf.buf = dst_buf + pkt->data.raw.sz;
      priv->enc.cx_data_dst_buf.sz -= pkt->data.raw.sz;
    }
  }
  return pkt;
 }
 /* Registers (buf non-NULL) or clears (buf NULL) the destination buffer and
 * padding used by aom_codec_get_cx_data().
 *
 * Returns AOM_CODEC_INVALID_PARAM when ctx or its private storage is NULL,
 * AOM_CODEC_OK otherwise. ctx->err is not modified.
 */
 aom_codec_err_t aom_codec_set_cx_data_buf(aom_codec_ctx_t *ctx,
                                          const aom_fixed_buf_t *buf,
                                          unsigned int pad_before,
                                          unsigned int pad_after) {
  aom_codec_priv_t *priv;
  if (!ctx || !ctx->priv) return AOM_CODEC_INVALID_PARAM;
  priv = ctx->priv;
  if (!buf) {
    priv->enc.cx_data_dst_buf.buf = NULL;
    priv->enc.cx_data_dst_buf.sz = 0;
    priv->enc.cx_data_pad_before = 0;
    priv->enc.cx_data_pad_after = 0;
  } else {
    priv->enc.cx_data_dst_buf = *buf;
    priv->enc.cx_data_pad_before = pad_before;
    priv->enc.cx_data_pad_after = pad_after;
  }
  return AOM_CODEC_OK;
 }
 /* Returns the encoder's preview frame, or NULL.
 *
 * For a non-NULL context that cannot serve the request, ctx->err is set to
 * AOM_CODEC_ERROR (uninitialized) or AOM_CODEC_INCAPABLE (not an encoder, or
 * no get_preview hook). ctx->err is left untouched on success.
 */
 const aom_image_t *aom_codec_get_preview_frame(aom_codec_ctx_t *ctx) {
  aom_image_t *frame = NULL;
  if (!ctx) return frame;
  if (!ctx->iface || !ctx->priv) {
    ctx->err = AOM_CODEC_ERROR;
  } else if (!(ctx->iface->caps & AOM_CODEC_CAP_ENCODER) ||
             !ctx->iface->enc.get_preview) {
    ctx->err = AOM_CODEC_INCAPABLE;
  } else {
    frame = ctx->iface->enc.get_preview(get_alg_priv(ctx));
  }
  return frame;
 }
 /* Returns the encoder's global headers buffer, or NULL.
 *
 * For a non-NULL context that cannot serve the request, ctx->err is set to
 * AOM_CODEC_ERROR (uninitialized) or AOM_CODEC_INCAPABLE (not an encoder, or
 * no get_glob_hdrs hook). ctx->err is left untouched on success.
 */
 aom_fixed_buf_t *aom_codec_get_global_headers(aom_codec_ctx_t *ctx) {
  aom_fixed_buf_t *headers = NULL;
  if (!ctx) return headers;
  if (!ctx->iface || !ctx->priv) {
    ctx->err = AOM_CODEC_ERROR;
  } else if (!(ctx->iface->caps & AOM_CODEC_CAP_ENCODER) ||
             !ctx->iface->enc.get_glob_hdrs) {
    ctx->err = AOM_CODEC_INCAPABLE;
  } else {
    headers = ctx->iface->enc.get_glob_hdrs(get_alg_priv(ctx));
  }
  return headers;
 }
 /* Applies a new configuration to an initialized encoder.
 *
 * Returns AOM_CODEC_INVALID_PARAM when ctx, its interface, its private
 * storage or cfg is NULL, AOM_CODEC_INCAPABLE when the interface is not an
 * encoder, and the cfg_set hook's result otherwise. The status is stored in
 * ctx->err when ctx is non-NULL.
 */
 aom_codec_err_t aom_codec_enc_config_set(aom_codec_ctx_t *ctx,
                                         const aom_codec_enc_cfg_t *cfg) {
  aom_codec_err_t status;
  if (!ctx || !ctx->iface || !ctx->priv || !cfg) {
    status = AOM_CODEC_INVALID_PARAM;
  } else if (ctx->iface->caps & AOM_CODEC_CAP_ENCODER) {
    status = ctx->iface->enc.cfg_set(get_alg_priv(ctx), cfg);
  } else {
    status = AOM_CODEC_INCAPABLE;
  }
  return SAVE_STATUS(ctx, status);
 }
 /* Appends a copy of *pkt to the list. Returns 0 on success, 1 when the list
 * already holds list->max packets.
 */
 int aom_codec_pkt_list_add(struct aom_codec_pkt_list *list,
                           const struct aom_codec_cx_pkt *pkt) {
  if (list->cnt >= list->max) return 1;
  list->pkts[list->cnt] = *pkt;
  list->cnt++;
  return 0;
 }
 /* Iterates over the packets stored in the list.
 *
 * A NULL *iter starts at the first packet. Each call returns the packet *iter
 * refers to and advances *iter by one; once *iter is past the last stored
 * packet, NULL is returned and *iter is left unchanged.
 */
 const aom_codec_cx_pkt_t *aom_codec_pkt_list_get(
    struct aom_codec_pkt_list *list, aom_codec_iter_t *iter) {
  const aom_codec_cx_pkt_t *cur;
  if (!(*iter)) *iter = list->pkts;
  cur = (const aom_codec_cx_pkt_t *)*iter;
  if ((size_t)(cur - list->pkts) >= list->cnt) return NULL;
  *iter = cur + 1;
  return cur;
 }
--- a/aom/src/aom_image.c
+++ b/aom/src/aom_image.c
@@ -0,0 +1,240 @@
 /*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
 #include <stdlib.h>
 #include <string.h>
 #include "aom/aom_image.h"
 #include "aom/aom_integer.h"
 #include "aom_mem/aom_mem.h"
 /* Shared implementation behind aom_img_alloc() and aom_img_wrap().
  *
  * img          Descriptor to fill in, or NULL to have one calloc'd here.
  * fmt          Pixel format; selects bits per sample and chroma subsampling.
  * d_w, d_h     Requested display size. The stored size (img->w, img->h) is
  *              rounded up to a multiple of the chroma subsampling factor.
  * buf_align    Alignment handed to aom_memalign() when the pixel buffer is
  *              allocated here. 0 is treated as 1; must be a power of two.
  * stride_align Row stride is rounded up to a multiple of this value. 0 is
  *              treated as 1; must be a power of two.
  * img_data     Existing pixel buffer to wrap, or NULL to allocate one that
  *              the image then owns.
  *
  * Returns the initialized descriptor, or NULL on failure. On failure,
  * everything allocated here is released through aom_img_free().
  */
 static aom_image_t *img_alloc_helper(aom_image_t *img, aom_img_fmt_t fmt,
                                     unsigned int d_w, unsigned int d_h,
                                     unsigned int buf_align,
                                     unsigned int stride_align,
                                     unsigned char *img_data) {
  unsigned int h, w, s, xcs, ycs, bps;
  unsigned int stride_in_bytes;
  int align;
  /* Clear a caller-supplied descriptor before any check can jump to `fail`.
   * The failure path calls aom_img_free(), which inspects img_data,
   * img_data_owner and self_allocd; on an uninitialized struct those would be
   * garbage and could lead to freeing an invalid pointer. */
  if (img) memset(img, 0, sizeof(aom_image_t));
  /* Treat align==0 like align==1 */
  if (!buf_align) buf_align = 1;
  /* Validate alignment (must be power of 2) */
  if (buf_align & (buf_align - 1)) goto fail;
  /* Treat align==0 like align==1 */
  if (!stride_align) stride_align = 1;
  /* Validate alignment (must be power of 2) */
  if (stride_align & (stride_align - 1)) goto fail;
  /* Get sample size for this format */
  switch (fmt) {
    case AOM_IMG_FMT_RGB32:
    case AOM_IMG_FMT_RGB32_LE:
    case AOM_IMG_FMT_ARGB:
    case AOM_IMG_FMT_ARGB_LE: bps = 32; break;
    case AOM_IMG_FMT_RGB24:
    case AOM_IMG_FMT_BGR24: bps = 24; break;
    case AOM_IMG_FMT_RGB565:
    case AOM_IMG_FMT_RGB565_LE:
    case AOM_IMG_FMT_RGB555:
    case AOM_IMG_FMT_RGB555_LE:
    case AOM_IMG_FMT_UYVY:
    case AOM_IMG_FMT_YUY2:
    case AOM_IMG_FMT_YVYU: bps = 16; break;
    case AOM_IMG_FMT_I420:
    case AOM_IMG_FMT_YV12:
    case AOM_IMG_FMT_AOMI420:
    case AOM_IMG_FMT_AOMYV12: bps = 12; break;
    case AOM_IMG_FMT_I422:
    case AOM_IMG_FMT_I440: bps = 16; break;
    case AOM_IMG_FMT_I444: bps = 24; break;
    case AOM_IMG_FMT_I42016: bps = 24; break;
    case AOM_IMG_FMT_I42216:
    case AOM_IMG_FMT_I44016: bps = 32; break;
    case AOM_IMG_FMT_I44416: bps = 48; break;
    default: bps = 16; break;
  }
  /* Get chroma shift values for this format */
  switch (fmt) {
    case AOM_IMG_FMT_I420:
    case AOM_IMG_FMT_YV12:
    case AOM_IMG_FMT_AOMI420:
    case AOM_IMG_FMT_AOMYV12:
    case AOM_IMG_FMT_I422:
    case AOM_IMG_FMT_I42016:
    case AOM_IMG_FMT_I42216: xcs = 1; break;
    default: xcs = 0; break;
  }
  switch (fmt) {
    case AOM_IMG_FMT_I420:
    case AOM_IMG_FMT_I440:
    case AOM_IMG_FMT_YV12:
    case AOM_IMG_FMT_AOMI420:
    case AOM_IMG_FMT_AOMYV12:
    case AOM_IMG_FMT_I42016:
    case AOM_IMG_FMT_I44016: ycs = 1; break;
    default: ycs = 0; break;
  }
  /* Calculate storage sizes given the chroma subsampling */
  align = (1 << xcs) - 1;
  w = (d_w + align) & ~align;
  align = (1 << ycs) - 1;
  h = (d_h + align) & ~align;
  /* Row stride in samples for planar formats, in bytes for packed ones. */
  s = (fmt & AOM_IMG_FMT_PLANAR) ? w : bps * w / 8;
  s = (s + stride_align - 1) & ~(stride_align - 1);
  stride_in_bytes = (fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? s * 2 : s;
  /* Allocate the descriptor if the caller did not supply one */
  if (!img) {
    img = (aom_image_t *)calloc(1, sizeof(aom_image_t));
    if (!img) goto fail;
    img->self_allocd = 1;
  }
  img->img_data = img_data;
  if (!img_data) {
    /* Computed in 64 bits so an oversized request is detected rather than
     * silently truncated when narrowed to size_t. */
    const uint64_t alloc_size = (fmt & AOM_IMG_FMT_PLANAR)
                                    ? (uint64_t)h * s * bps / 8
                                    : (uint64_t)h * s;
    if (alloc_size != (size_t)alloc_size) goto fail;
    img->img_data = (uint8_t *)aom_memalign(buf_align, (size_t)alloc_size);
    img->img_data_owner = 1;
  }
  if (!img->img_data) goto fail;
  img->fmt = fmt;
  img->bit_depth = (fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 16 : 8;
  img->w = w;
  img->h = h;
  img->x_chroma_shift = xcs;
  img->y_chroma_shift = ycs;
  img->bps = bps;
  /* Calculate strides */
  img->stride[AOM_PLANE_Y] = img->stride[AOM_PLANE_ALPHA] = stride_in_bytes;
  img->stride[AOM_PLANE_U] = img->stride[AOM_PLANE_V] = stride_in_bytes >> xcs;
  /* Default viewport to entire image */
  if (!aom_img_set_rect(img, 0, 0, d_w, d_h)) return img;
 fail:
  aom_img_free(img);
  return NULL;
 }
 /* Creates an image whose pixel buffer is allocated by the library.
  *
  * `align` is applied both to the buffer address and to the row stride.
  * Returns the image descriptor, or NULL on failure.
  */
 aom_image_t *aom_img_alloc(aom_image_t *img, aom_img_fmt_t fmt,
                            unsigned int d_w, unsigned int d_h,
                            unsigned int align) {
  /* A NULL buffer tells the helper to allocate the pixel storage itself. */
  unsigned char *const no_buffer = NULL;
  return img_alloc_helper(img, fmt, d_w, d_h, align, align, no_buffer);
 }
 /* Builds an image descriptor around an existing pixel buffer.
  *
  * Only the row stride is aligned (to `stride_align`). Returns the image
  * descriptor, or NULL on failure.
  */
 aom_image_t *aom_img_wrap(aom_image_t *img, aom_img_fmt_t fmt, unsigned int d_w,
                           unsigned int d_h, unsigned int stride_align,
                           unsigned char *img_data) {
  /* A buffer alignment of 1 means this function does not change the
   * alignment of the buffer. */
  const unsigned int buf_align = 1;
  return img_alloc_helper(img, fmt, d_w, d_h, buf_align, stride_align,
                          img_data);
 }
 /* Sets the visible rectangle of `img` to the w x h region whose top-left
  * corner is (x, y), and recomputes every plane pointer for that origin.
  *
  * Returns 0 on success, or -1 (leaving `img` unmodified) when the rectangle
  * does not fit inside the img->w x img->h storage area.
  */
 int aom_img_set_rect(aom_image_t *img, unsigned int x, unsigned int y,
                      unsigned int w, unsigned int h) {
  unsigned char *data;
  /* Written so that no sum can wrap: a plain `x + w <= img->w` accepts a huge
   * x or w whose unsigned sum wraps around to a small value, which would then
   * produce plane pointers outside the buffer. */
  if (x > img->w || w > img->w - x || y > img->h || h > img->h - y) return -1;
  img->d_w = w;
  img->d_h = h;
  /* Calculate plane pointers */
  if (!(img->fmt & AOM_IMG_FMT_PLANAR)) {
    img->planes[AOM_PLANE_PACKED] =
        img->img_data + x * img->bps / 8 + y * img->stride[AOM_PLANE_PACKED];
  } else {
    const int bytes_per_sample =
        (img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 2 : 1;
    /* With AOM_IMG_FMT_UV_FLIP the V plane is laid out ahead of the U plane. */
    const int uv_flipped = (img->fmt & AOM_IMG_FMT_UV_FLIP) != 0;
    const int first_chroma = uv_flipped ? AOM_PLANE_V : AOM_PLANE_U;
    const int second_chroma = uv_flipped ? AOM_PLANE_U : AOM_PLANE_V;
    /* Planes are laid out back to back: [alpha], Y, then the two chroma
     * planes. `data` walks from the start of one plane to the next. */
    data = img->img_data;
    if (img->fmt & AOM_IMG_FMT_HAS_ALPHA) {
      img->planes[AOM_PLANE_ALPHA] =
          data + x * bytes_per_sample + y * img->stride[AOM_PLANE_ALPHA];
      data += img->h * img->stride[AOM_PLANE_ALPHA];
    }
    img->planes[AOM_PLANE_Y] =
        data + x * bytes_per_sample + y * img->stride[AOM_PLANE_Y];
    data += img->h * img->stride[AOM_PLANE_Y];
    img->planes[first_chroma] =
        data + (x >> img->x_chroma_shift) * bytes_per_sample +
        (y >> img->y_chroma_shift) * img->stride[first_chroma];
    data += (img->h >> img->y_chroma_shift) * img->stride[first_chroma];
    img->planes[second_chroma] =
        data + (x >> img->x_chroma_shift) * bytes_per_sample +
        (y >> img->y_chroma_shift) * img->stride[second_chroma];
  }
  return 0;
 }
 /* Flips the image vertically in place: every plane pointer is moved to the
  * plane's last displayed row and every stride is negated, so walking forward
  * by the stride visits the rows bottom-up.
  */
 void aom_img_flip(aom_image_t *img) {
  /* Note: the row counts are converted to a signed type before being
   * multiplied by the stride. Section 6.3.1.8 of the ISO C99 standard says
   * that with an unsigned row count the stride would be converted to
   * unsigned instead, causing errors when the pointer being adjusted is a
   * larger type than the product.
   */
  const int luma_last_row = (signed)(img->d_h - 1);
  const int chroma_last_row =
      (signed)((img->d_h >> img->y_chroma_shift) - 1);

  img->planes[AOM_PLANE_Y] += luma_last_row * img->stride[AOM_PLANE_Y];
  img->stride[AOM_PLANE_Y] = -img->stride[AOM_PLANE_Y];

  img->planes[AOM_PLANE_U] += chroma_last_row * img->stride[AOM_PLANE_U];
  img->stride[AOM_PLANE_U] = -img->stride[AOM_PLANE_U];

  img->planes[AOM_PLANE_V] += chroma_last_row * img->stride[AOM_PLANE_V];
  img->stride[AOM_PLANE_V] = -img->stride[AOM_PLANE_V];

  img->planes[AOM_PLANE_ALPHA] += luma_last_row * img->stride[AOM_PLANE_ALPHA];
  img->stride[AOM_PLANE_ALPHA] = -img->stride[AOM_PLANE_ALPHA];
 }
 /* Releases what the image owns: the pixel buffer when img_data_owner is set,
  * and the descriptor itself when self_allocd is set. NULL is accepted and
  * ignored.
  */
 void aom_img_free(aom_image_t *img) {
  if (!img) return;
  if (img->img_data_owner && img->img_data) aom_free(img->img_data);
  if (img->self_allocd) free(img);
 }
--- a/aom_dsp/add_noise.c
+++ b/aom_dsp/add_noise.c
@@ -11,22 +11,20 @@
 #include <math.h>
 #include <stdlib.h>
-#include "./vpx_config.h"
+#include "./aom_config.h"
-#include "./vpx_dsp_rtcd.h"
+#include "./aom_dsp_rtcd.h"
-#include "vpx/vpx_integer.h"
+#include "aom/aom_integer.h"
-#include "vpx_ports/mem.h"
+#include "aom_ports/mem.h"
-void vpx_plane_add_noise_c(uint8_t *start, char *noise,
+void aom_plane_add_noise_c(uint8_t *start, char *noise, char blackclamp[16],
-                           char blackclamp[16],
+                           char whiteclamp[16], char bothclamp[16],
                           char whiteclamp[16],
                           char bothclamp[16],
                           unsigned int width, unsigned int height, int pitch) {
  unsigned int i, j;
  for (i = 0; i < height; ++i) {
    uint8_t *pos = start + i * pitch;
-    char  *ref = (char *)(noise + (rand() & 0xff));  // NOLINT
+    char *ref = (char *)(noise + (rand() & 0xff));  // NOLINT
    for (j = 0; j < width; ++j) {
      int v = pos[j];
@@ -45,13 +43,13 @@ static double gaussian(double sigma, double mu, double x) {
         (exp(-(x - mu) * (x - mu) / (2 * sigma * sigma)));
 }
-int vpx_setup_noise(double sigma, int size, char *noise) {
+int aom_setup_noise(double sigma, int size, char *noise) {
  char char_dist[256];
  int next = 0, i, j;
  // set up a 256 entry lookup that matches gaussian distribution
  for (i = -32; i < 32; ++i) {
-    const int a_i = (int) (0.5 + 256 * gaussian(sigma, 0, i));
+    const int a_i = (int)(0.5 + 256 * gaussian(sigma, 0, i));
    if (a_i) {
      for (j = 0; j < a_i; ++j) {
        char_dist[next + j] = (char)i;
--- a/aom_dsp/ans.c
+++ b/aom_dsp/ans.c
@@ -0,0 +1,64 @@
 /*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
 #include <assert.h>
 #include "./aom_config.h"
 #include "aom/aom_integer.h"
 #include "aom_dsp/ans.h"
 #include "aom_dsp/prob.h"
 // Returns the index of the largest entry among the first num_syms entries of
 // pdf_tab. Ties go to the lowest index. Returns -1 when num_syms <= 0.
 static int find_largest(const aom_cdf_prob *const pdf_tab, int num_syms) {
  int best_idx = -1;
  int best_p = -1;
  int idx = 0;
  while (idx < num_syms) {
    const int cur = pdf_tab[idx];
    // Strict comparison keeps the first of several equal maxima.
    if (cur > best_p) {
      best_idx = idx;
      best_p = cur;
    }
    ++idx;
  }
  return best_idx;
 }
 // Builds a PDF of in_syms + 1 entries in out_pdf.
 //   out_pdf[0]     = node_prob rescaled from ANS_P8_SHIFT bits to
 //                    RANS_PROB_BITS bits.
 //   out_pdf[i + 1] = src_pdf[i] scaled by (ANS_P8_PRECISION - node_prob),
 //                    rounded, then clamped to [1, RANS_PRECISION - in_syms].
 // The largest entries are then nudged so the whole table sums to
 // RANS_PRECISION. src_pdf and out_pdf must not be the same array.
 void aom_rans_merge_prob8_pdf(aom_cdf_prob *const out_pdf,
                               const AnsP8 node_prob,
                               const aom_cdf_prob *const src_pdf, int in_syms) {
  const AnsP8 p1 = ANS_P8_PRECISION - node_prob;
  const int round_fact = ANS_P8_PRECISION >> 1;
  const int out_syms = in_syms + 1;
  const int upper = (int)RANS_PRECISION - in_syms;
  // Probability mass still to be handed out (or taken back if negative).
  int adjustment = RANS_PRECISION;
  int k;
  assert(src_pdf != out_pdf);
  out_pdf[0] = node_prob << (RANS_PROB_BITS - ANS_P8_SHIFT);
  adjustment -= out_pdf[0];
  for (k = 1; k <= in_syms; ++k) {
    int scaled = (p1 * src_pdf[k - 1] + round_fact) >> ANS_P8_SHIFT;
    if (scaled >= upper) scaled = upper;
    if (scaled <= 1) scaled = 1;
    out_pdf[k] = scaled;
    adjustment -= scaled;
  }
  // Adjust probabilities so they sum to the total probability
  if (adjustment > 0) {
    // Too little mass assigned: give the whole surplus to the largest entry.
    out_pdf[find_largest(out_pdf, out_syms)] += adjustment;
    return;
  }
  // Too much mass assigned: take it back one unit at a time, always from
  // whichever entry is currently the largest.
  for (; adjustment < 0; ++adjustment) {
    const int largest = find_largest(out_pdf, out_syms);
    --out_pdf[largest];
    assert(out_pdf[largest] > 0);
  }
 }
--- a/aom_dsp/ans.h
+++ b/aom_dsp/ans.h
@@ -0,0 +1,44 @@
 /*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
 #ifndef AOM_DSP_ANS_H_
 #define AOM_DSP_ANS_H_
 // Constants, types and utilities for Asymmetric Numeral Systems
 // http://arxiv.org/abs/1311.2540v2
 #include <assert.h>
 #include "./aom_config.h"
 #include "aom/aom_integer.h"
 #include "aom_dsp/prob.h"
 #ifdef __cplusplus
 extern "C" {
 #endif  // __cplusplus
 // 8-bit probability value, expressed out of ANS_P8_PRECISION.
 typedef uint8_t AnsP8;
 // ANS_P8_PRECISION equals 1 << ANS_P8_SHIFT.
 #define ANS_P8_PRECISION 256u
 #define ANS_P8_SHIFT 8
 // rANS probabilities are expressed out of RANS_PRECISION (1 << 15 = 32768).
 #define RANS_PROB_BITS 15
 #define RANS_PRECISION (1u << RANS_PROB_BITS)
 // L_BASE % PRECISION must be 0. Increasing L_BASE beyond 2**15 will cause uabs
 // to overflow.
 #define L_BASE (RANS_PRECISION)
 // The coder state is renormalized one byte (IO_BASE = 256 values) at a time.
 #define IO_BASE 256
 // Range I = { L_BASE, L_BASE + 1, ..., L_BASE * IO_BASE - 1 }
 // Writes a PDF of in_syms + 1 entries to out_pdf: entry 0 is node_prob
 // rescaled to RANS_PROB_BITS bits, and the rest are the src_pdf entries
 // scaled by (ANS_P8_PRECISION - node_prob). The result sums to
 // RANS_PRECISION. src_pdf and out_pdf must be different arrays.
 void aom_rans_merge_prob8_pdf(aom_cdf_prob *const out_pdf,
                               const AnsP8 node_prob,
                               const aom_cdf_prob *const src_pdf, int in_syms);
 #ifdef __cplusplus
 }  // extern "C"
 #endif  // __cplusplus
 #endif  // AOM_DSP_ANS_H_
--- a/aom_dsp/ansreader.h
+++ b/aom_dsp/ansreader.h
@@ -0,0 +1,146 @@
 /*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
 #ifndef AOM_DSP_ANSREADER_H_
 #define AOM_DSP_ANSREADER_H_
 // A uABS and rANS decoder implementation of Asymmetric Numeral Systems
 // http://arxiv.org/abs/1311.2540v2
 #include <assert.h>
 #include "./aom_config.h"
 #include "aom/aom_integer.h"
 #include "aom_dsp/prob.h"
 #include "aom_dsp/ans.h"
 #include "aom_ports/mem_ops.h"
 #if CONFIG_ACCOUNTING
 #include "av1/common/accounting.h"
 #endif
 #ifdef __cplusplus
 extern "C" {
 #endif  // __cplusplus
 // Decoder-side state shared by the uABS and rANS readers in this header.
 struct AnsDecoder {
  // Input bytes. The readers consume them from the end of the buffer towards
  // its start.
  const uint8_t *buf;
  // Number of bytes not yet consumed; the next byte read is
  // buf[buf_offset - 1].
  int buf_offset;
  // Current coder state. Readers refill it from buf while it is below L_BASE.
  uint32_t state;
 #if CONFIG_ACCOUNTING
  // Reset to NULL by ans_read_init().
  Accounting *accounting;
 #endif
 };
 // Decodes one binary symbol with uABS and returns it (0 or 1). p0 is the
 // probability, out of ANS_P8_PRECISION, associated with the value 0. The
 // state is first refilled from the buffer while it is below L_BASE and input
 // remains.
 static INLINE int uabs_read(struct AnsDecoder *ans, AnsP8 p0) {
  const AnsP8 p1 = ANS_P8_PRECISION - p0;
  unsigned x = ans->state;
  unsigned scaled, quot;
  int bit;
  while (x < L_BASE && ans->buf_offset > 0) {
    --ans->buf_offset;
    x = x * IO_BASE + ans->buf[ans->buf_offset];
  }
  scaled = x * p1;
  quot = scaled / ANS_P8_PRECISION;
  // The low 8 bits of the product decide which symbol was encoded.
  bit = (scaled & 0xFF) >= p0;
  ans->state = bit ? quot : x - quot;
  return bit;
 }
 // Reads one raw bit: refills the state from the buffer while it is below
 // L_BASE and input remains, then returns the state's lowest bit and shifts
 // that bit out of the state.
 static INLINE int uabs_read_bit(struct AnsDecoder *ans) {
  unsigned x = ans->state;
  int bit;
  while (x < L_BASE && ans->buf_offset > 0) {
    --ans->buf_offset;
    x = x * IO_BASE + ans->buf[ans->buf_offset];
  }
  bit = (int)(x & 1);
  ans->state = x >> 1;
  return bit;
 }
 // Result of a CDF lookup performed by fetch_sym().
 struct rans_dec_sym {
  // Index of the symbol within the CDF table.
  uint8_t val;
  // Width of the symbol's interval: its CDF entry minus cum_prob.
  aom_cdf_prob prob;
  // CDF value just below the symbol's interval (0 for the first symbol).
  aom_cdf_prob cum_prob;  // not-inclusive
 };
 // Finds the first CDF entry that is strictly greater than rem and describes
 // the matching symbol in *out. The scan has no explicit bound: it relies on
 // the table containing an entry larger than rem.
 static INLINE void fetch_sym(struct rans_dec_sym *out, const aom_cdf_prob *cdf,
                              aom_cdf_prob rem) {
  int idx = 0;
  aom_cdf_prob lo = 0;
  aom_cdf_prob hi = cdf[0];
  // TODO(skal): if critical, could be a binary search.
  // Or, better, an O(1) alias-table.
  while (rem >= hi) {
    lo = hi;
    ++idx;
    hi = cdf[idx];
  }
  out->val = idx;
  out->prob = hi - lo;
  out->cum_prob = lo;
 }
 static INLINE int rans_read(struct AnsDecoder *ans, const aom_cdf_prob *tab) {
  unsigned rem;
  unsigned quo;
  struct rans_dec_sym sym;
  while (ans->state < L_BASE && ans->buf_offset > 0) {
    ans->state = ans->state * IO_BASE + ans->buf[--ans->buf_offset];
  }
  quo = ans->state / RANS_PRECISION;
  rem = ans->state % RANS_PRECISION;
  fetch_sym(&sym, tab, rem);
  ans->state = quo * sym.prob + rem - sym.cum_prob;
  return sym.val;
 }
 // Prepares `ans` to decode the `offset` bytes at `buf`. The initial state is
 // stored little-endian at the end of the buffer; the top bits of the last
 // byte give its width:
 //   00xxxxxx -> 1 byte  (6-bit state)
 //   01xxxxxx -> 2 bytes (14-bit state)
 //   10xxxxxx -> 3 bytes (22-bit state)
 //   111xxxxx -> 4 bytes (29-bit state)
 // Returns 0 on success and 1 on error: too few bytes, a final byte of the
 // form 110xxxxx, or a state outside the valid range.
 static INLINE int ans_read_init(struct AnsDecoder *const ans,
                                 const uint8_t *const buf, int offset) {
  if (offset < 1) return 1;
  ans->buf = buf;
  switch (buf[offset - 1] >> 6) {
    case 0:
      ans->buf_offset = offset - 1;
      ans->state = buf[offset - 1] & 0x3F;
      break;
    case 1:
      if (offset < 2) return 1;
      ans->buf_offset = offset - 2;
      ans->state = mem_get_le16(buf + offset - 2) & 0x3FFF;
      break;
    case 2:
      if (offset < 3) return 1;
      ans->buf_offset = offset - 3;
      ans->state = mem_get_le24(buf + offset - 3) & 0x3FFFFF;
      break;
    default:
      // 110xxxxx implies this byte is a superframe marker
      if ((buf[offset - 1] & 0xE0) != 0xE0) return 1;
      if (offset < 4) return 1;
      ans->buf_offset = offset - 4;
      ans->state = mem_get_le32(buf + offset - 4) & 0x1FFFFFFF;
      break;
  }
 #if CONFIG_ACCOUNTING
  ans->accounting = NULL;
 #endif
  // The stored value is relative to L_BASE.
  ans->state += L_BASE;
  if (ans->state >= L_BASE * IO_BASE) return 1;
  return 0;
 }
 // Returns nonzero when the decoder state equals L_BASE, the value
 // ans_write_init() gives the encoder state before any symbol is written.
 static INLINE int ans_read_end(struct AnsDecoder *const ans) {
  const uint32_t final_state = ans->state;
  return final_state == L_BASE;
 }
 // Returns nonzero when the input is exhausted while the state is still below
 // L_BASE, i.e. a refill was needed but no bytes were left.
 static INLINE int ans_reader_has_error(const struct AnsDecoder *const ans) {
  const int input_exhausted = (ans->buf_offset == 0);
  return input_exhausted && ans->state < L_BASE;
 }
 #ifdef __cplusplus
 }  // extern "C"
 #endif  // __cplusplus
 #endif  // AOM_DSP_ANSREADER_H_
--- a/aom_dsp/answriter.h
+++ b/aom_dsp/answriter.h
@@ -0,0 +1,120 @@
 /*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
 #ifndef AOM_DSP_ANSWRITER_H_
 #define AOM_DSP_ANSWRITER_H_
 // A uABS and rANS encoder implementation of Asymmetric Numeral Systems
 // http://arxiv.org/abs/1311.2540v2
 #include <assert.h>
 #include "./aom_config.h"
 #include "aom/aom_integer.h"
 #include "aom_dsp/ans.h"
 #include "aom_dsp/prob.h"
 #include "aom_ports/mem_ops.h"
 #include "av1/common/odintrin.h"
 // ANS_DIVREM(quotient, remainder, dividend, divisor) stores dividend / divisor
 // in `quotient` and the matching remainder in `remainder`. OD_DIVU_SMALL is
 // used when RANS_PRECISION is within OD_DIVU_DMAX; otherwise the native / and
 // % operators are used. Arguments are evaluated more than once, so they must
 // be free of side effects.
 // NOTE(review): OD_DIVU_SMALL comes from av1/common/odintrin.h; presumably it
 // is only valid for divisors up to OD_DIVU_DMAX -- confirm there.
 #if RANS_PRECISION <= OD_DIVU_DMAX
 #define ANS_DIVREM(quotient, remainder, dividend, divisor) \
  do {                                                     \
    quotient = OD_DIVU_SMALL((dividend), (divisor));       \
    remainder = (dividend) - (quotient) * (divisor);       \
  } while (0)
 #else
 #define ANS_DIVREM(quotient, remainder, dividend, divisor) \
  do {                                                     \
    quotient = (dividend) / (divisor);                     \
    remainder = (dividend) % (divisor);                    \
  } while (0)
 #endif
 // Division by an 8-bit probability; always goes through OD_DIVU_SMALL.
 // Both helper macros are #undef'd again at the end of this header.
 #define ANS_DIV8(dividend, divisor) OD_DIVU_SMALL((dividend), (divisor))
 #ifdef __cplusplus
 extern "C" {
 #endif  // __cplusplus
 // Encoder-side state shared by the uABS and rANS writers in this header.
 struct AnsCoder {
  // Output buffer; the writers append bytes at buf[buf_offset].
  uint8_t *buf;
  // Number of bytes written to buf so far.
  int buf_offset;
  // Current coder state; starts at L_BASE (see ans_write_init()).
  uint32_t state;
 };
 // Resets the coder to write into `buf` from its first byte, with the state at
 // its starting value L_BASE.
 static INLINE void ans_write_init(struct AnsCoder *const ans,
                                   uint8_t *const buf) {
  ans->state = L_BASE;
  ans->buf_offset = 0;
  ans->buf = buf;
 }
 // Serializes the final coder state (relative to L_BASE) at the current write
 // position and returns the total number of bytes in the buffer. The state is
 // written little-endian in the smallest of four widths, tagged in the top
 // bits of the last byte:
 //   00 -> 1 byte  (state < 2^6)
 //   01 -> 2 bytes (state < 2^14)
 //   10 -> 3 bytes (state < 2^22)
 //   111 -> 4 bytes (state < 2^29)
 // ans->buf_offset itself is not advanced.
 static INLINE int ans_write_end(struct AnsCoder *const ans) {
  uint32_t state;
  assert(ans->state >= L_BASE);
  assert(ans->state < L_BASE * IO_BASE);
  state = ans->state - L_BASE;
  // The tag constants are unsigned on purpose: 0x07 << 29 does not fit in a
  // signed int, so shifting the signed literal is undefined behaviour.
  if (state < (1u << 6)) {
    ans->buf[ans->buf_offset] = (0x00u << 6) + state;
    return ans->buf_offset + 1;
  } else if (state < (1u << 14)) {
    mem_put_le16(ans->buf + ans->buf_offset, (0x01u << 14) + state);
    return ans->buf_offset + 2;
  } else if (state < (1u << 22)) {
    mem_put_le24(ans->buf + ans->buf_offset, (0x02u << 22) + state);
    return ans->buf_offset + 3;
  } else if (state < (1u << 29)) {
    mem_put_le32(ans->buf + ans->buf_offset, (0x07u << 29) + state);
    return ans->buf_offset + 4;
  } else {
    assert(0 && "State is too large to be serialized");
    return ans->buf_offset;
  }
 }
 // uABS encode with normalization. Writes the binary symbol `val`; p0 is the
 // probability, out of ANS_P8_PRECISION, associated with the value 0. Low
 // bytes of the state are flushed to the buffer until the state is small
 // enough to absorb the symbol.
 static INLINE void uabs_write(struct AnsCoder *ans, int val, AnsP8 p0) {
  const AnsP8 p1 = ANS_P8_PRECISION - p0;
  // Probability of the symbol actually being written.
  const unsigned l_s = val ? p1 : p0;
  const unsigned renorm_limit = L_BASE / ANS_P8_PRECISION * IO_BASE * l_s;
  while (ans->state >= renorm_limit) {
    ans->buf[ans->buf_offset] = ans->state % IO_BASE;
    ans->buf_offset++;
    ans->state /= IO_BASE;
  }
  if (val)
    ans->state = ANS_DIV8((ans->state + 1) * ANS_P8_PRECISION + p1 - 1, p1) - 1;
  else
    ans->state = ANS_DIV8(ans->state * ANS_P8_PRECISION, p0);
 }
 // Symbol description passed to rans_write().
 struct rans_sym {
  // Width of the symbol's interval, out of RANS_PRECISION.
  aom_cdf_prob prob;
  // Offset added to the state when the symbol is written.
  aom_cdf_prob cum_prob;  // not-inclusive
 };
 // rANS encode with normalization.
 // sym->prob takes the place of l_s from the paper
 // RANS_PRECISION is m
 // Low bytes of the state are flushed to the buffer until the state is small
 // enough to absorb the symbol.
 static INLINE void rans_write(struct AnsCoder *ans,
                               const struct rans_sym *const sym) {
  const aom_cdf_prob freq = sym->prob;
  const unsigned renorm_limit = L_BASE / RANS_PRECISION * IO_BASE * freq;
  unsigned q, r;
  while (ans->state >= renorm_limit) {
    ans->buf[ans->buf_offset] = ans->state % IO_BASE;
    ans->buf_offset++;
    ans->state /= IO_BASE;
  }
  ANS_DIVREM(q, r, ans->state, freq);
  ans->state = q * RANS_PRECISION + r + sym->cum_prob;
 }
 #undef ANS_DIV8
 #undef ANS_DIVREM
 #ifdef __cplusplus
 }  // extern "C"
 #endif  // __cplusplus
 #endif  // AOM_DSP_ANSWRITER_H_
--- a/aom_dsp/aom_convolve.c
+++ b/aom_dsp/aom_convolve.c
@@ -1,28 +1,29 @@
 /*
- *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
- *  Use of this source code is governed by a BSD-style license
+ * This source code is subject to the terms of the BSD 2 Clause License and
- *  that can be found in the LICENSE file in the root of the source
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- *  tree. An additional intellectual property rights grant can be found
+ * was not distributed with this source code in the LICENSE file, you can
- *  in the file PATENTS.  All contributing project authors may
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- *  be found in the AUTHORS file in the root of the source tree.
+ * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
 #include <assert.h>
 #include <string.h>
-#include "./vpx_config.h"
+#include "./aom_config.h"
-#include "./vpx_dsp_rtcd.h"
+#include "./aom_dsp_rtcd.h"
-#include "vpx/vpx_integer.h"
+#include "aom/aom_integer.h"
-#include "vpx_dsp/vpx_convolve.h"
+#include "aom_dsp/aom_convolve.h"
-#include "vpx_dsp/vpx_dsp_common.h"
+#include "aom_dsp/aom_dsp_common.h"
-#include "vpx_dsp/vpx_filter.h"
+#include "aom_dsp/aom_filter.h"
-#include "vpx_ports/mem.h"
+#include "aom_ports/mem.h"
 static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
-                           const InterpKernel *x_filters,
+                           const InterpKernel *x_filters, int x0_q4,
-                           int x0_q4, int x_step_q4, int w, int h) {
+                           int x_step_q4, int w, int h) {
  int x, y;
  src -= SUBPEL_TAPS / 2 - 1;
  for (y = 0; y < h; ++y) {
@@ -31,8 +32,7 @@ static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride,
      const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
      int k, sum = 0;
-      for (k = 0; k < SUBPEL_TAPS; ++k)
+      for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
        sum += src_x[k] * x_filter[k];
      dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
      x_q4 += x_step_q4;
    }
@@ -43,8 +43,8 @@ static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride,
 static void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride,
                               uint8_t *dst, ptrdiff_t dst_stride,
-                               const InterpKernel *x_filters,
+                               const InterpKernel *x_filters, int x0_q4,
-                               int x0_q4, int x_step_q4, int w, int h) {
+                               int x_step_q4, int w, int h) {
  int x, y;
  src -= SUBPEL_TAPS / 2 - 1;
  for (y = 0; y < h; ++y) {
@@ -53,10 +53,9 @@ static void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride,
      const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
      int k, sum = 0;
-      for (k = 0; k < SUBPEL_TAPS; ++k)
+      for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
-        sum += src_x[k] * x_filter[k];
+      dst[x] = ROUND_POWER_OF_TWO(
-      dst[x] = ROUND_POWER_OF_TWO(dst[x] +
+          dst[x] + clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1);
          clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1);
      x_q4 += x_step_q4;
    }
    src += src_stride;
@@ -66,8 +65,8 @@ static void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride,
 static void convolve_vert(const uint8_t *src, ptrdiff_t src_stride,
                          uint8_t *dst, ptrdiff_t dst_stride,
-                          const InterpKernel *y_filters,
+                          const InterpKernel *y_filters, int y0_q4,
-                          int y0_q4, int y_step_q4, int w, int h) {
+                          int y_step_q4, int w, int h) {
  int x, y;
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
@@ -89,8 +88,8 @@ static void convolve_vert(const uint8_t *src, ptrdiff_t src_stride,
 static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
-                              const InterpKernel *y_filters,
+                              const InterpKernel *y_filters, int y0_q4,
-                              int y0_q4, int y_step_q4, int w, int h) {
+                              int y_step_q4, int w, int h) {
  int x, y;
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
@@ -102,8 +101,10 @@ static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride,
      int k, sum = 0;
      for (k = 0; k < SUBPEL_TAPS; ++k)
        sum += src_y[k * src_stride] * y_filter[k];
-      dst[y * dst_stride] = ROUND_POWER_OF_TWO(dst[y * dst_stride] +
+      dst[y * dst_stride] = ROUND_POWER_OF_TWO(
-          clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1);
+          dst[y * dst_stride] +
              clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)),
          1);
      y_q4 += y_step_q4;
    }
    ++src;
@@ -111,13 +112,11 @@ static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride,
  }
 }
-static void convolve(const uint8_t *src, ptrdiff_t src_stride,
+static void convolve(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
-                     uint8_t *dst, ptrdiff_t dst_stride,
+                     ptrdiff_t dst_stride, const InterpKernel *const x_filters,
                     const InterpKernel *const x_filters,
                     int x0_q4, int x_step_q4,
-                     const InterpKernel *const y_filters,
+                     const InterpKernel *const y_filters, int y0_q4,
-                     int y0_q4, int y_step_q4,
+                     int y_step_q4, int w, int h) {
                     int w, int h) {
  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
  // 2d filtering proceeds in 2 steps:
  //   (1) Interpolate horizontally into an intermediate buffer, temp.
@@ -132,7 +131,7 @@ static void convolve(const uint8_t *src, ptrdiff_t src_stride,
  // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
  uint8_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
  int intermediate_height =
-          (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
+      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
  assert(w <= MAX_SB_SIZE);
  assert(h <= MAX_SB_SIZE);
@@ -140,12 +139,11 @@ static void convolve(const uint8_t *src, ptrdiff_t src_stride,
  assert(y_step_q4 <= 32);
  assert(x_step_q4 <= 32);
-  convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride,
+  convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp,
-                 temp, MAX_SB_SIZE,
+                 MAX_SB_SIZE, x_filters, x0_q4, x_step_q4, w,
-                 x_filters, x0_q4, x_step_q4, w, intermediate_height);
+                 intermediate_height);
-  convolve_vert(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE,
+  convolve_vert(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE, dst,
-                dst, dst_stride,
+                dst_stride, y_filters, y0_q4, y_step_q4, w, h);
                y_filters, y0_q4, y_step_q4, w, h);
 }
 static const InterpKernel *get_filter_base(const int16_t *filter) {
@@ -158,70 +156,69 @@ static int get_filter_offset(const int16_t *f, const InterpKernel *base) {
  return (int)((const InterpKernel *)(intptr_t)f - base);
 }
-void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+void aom_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
                           const int16_t *filter_x, int x_step_q4,
-                           const int16_t *filter_y, int y_step_q4,
+                           const int16_t *filter_y, int y_step_q4, int w,
-                           int w, int h) {
+                           int h) {
  const InterpKernel *const filters_x = get_filter_base(filter_x);
  const int x0_q4 = get_filter_offset(filter_x, filters_x);
  (void)filter_y;
  (void)y_step_q4;
-  convolve_horiz(src, src_stride, dst, dst_stride, filters_x,
+  convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4,
-                 x0_q4, x_step_q4, w, h);
+                 w, h);
 }
-void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+void aom_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
                               uint8_t *dst, ptrdiff_t dst_stride,
                               const int16_t *filter_x, int x_step_q4,
-                               const int16_t *filter_y, int y_step_q4,
+                               const int16_t *filter_y, int y_step_q4, int w,
-                               int w, int h) {
+                               int h) {
  const InterpKernel *const filters_x = get_filter_base(filter_x);
  const int x0_q4 = get_filter_offset(filter_x, filters_x);
  (void)filter_y;
  (void)y_step_q4;
-  convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x,
+  convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4,
-                     x0_q4, x_step_q4, w, h);
+                     x_step_q4, w, h);
 }
-void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+void aom_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
                          uint8_t *dst, ptrdiff_t dst_stride,
                          const int16_t *filter_x, int x_step_q4,
-                          const int16_t *filter_y, int y_step_q4,
+                          const int16_t *filter_y, int y_step_q4, int w,
-                          int w, int h) {
+                          int h) {
  const InterpKernel *const filters_y = get_filter_base(filter_y);
  const int y0_q4 = get_filter_offset(filter_y, filters_y);
  (void)filter_x;
  (void)x_step_q4;
-  convolve_vert(src, src_stride, dst, dst_stride, filters_y,
+  convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, y_step_q4,
-                y0_q4, y_step_q4, w, h);
+                w, h);
 }
-void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+void aom_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
                              const int16_t *filter_x, int x_step_q4,
-                              const int16_t *filter_y, int y_step_q4,
+                              const int16_t *filter_y, int y_step_q4, int w,
-                              int w, int h) {
+                              int h) {
  const InterpKernel *const filters_y = get_filter_base(filter_y);
  const int y0_q4 = get_filter_offset(filter_y, filters_y);
  (void)filter_x;
  (void)x_step_q4;
-  convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y,
+  convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4,
-                    y0_q4, y_step_q4, w, h);
+                    y_step_q4, w, h);
 }
-void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride,
+void aom_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
-                     uint8_t *dst, ptrdiff_t dst_stride,
+                     ptrdiff_t dst_stride, const int16_t *filter_x,
-                     const int16_t *filter_x, int x_step_q4,
+                     int x_step_q4, const int16_t *filter_y, int y_step_q4,
                     const int16_t *filter_y, int y_step_q4,
                     int w, int h) {
  const InterpKernel *const filters_x = get_filter_base(filter_x);
  const int x0_q4 = get_filter_offset(filter_x, filters_x);
@@ -229,36 +226,35 @@ void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride,
  const InterpKernel *const filters_y = get_filter_base(filter_y);
  const int y0_q4 = get_filter_offset(filter_y, filters_y);
-  convolve(src, src_stride, dst, dst_stride,
+  convolve(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4,
           filters_x, x0_q4, x_step_q4,
           filters_y, y0_q4, y_step_q4, w, h);
 }
-void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
+void aom_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
-                         uint8_t *dst, ptrdiff_t dst_stride,
+                         ptrdiff_t dst_stride, const int16_t *filter_x,
-                         const int16_t *filter_x, int x_step_q4,
+                         int x_step_q4, const int16_t *filter_y, int y_step_q4,
                         const int16_t *filter_y, int y_step_q4,
                         int w, int h) {
  /* Fixed size intermediate buffer places limits on parameters. */
  DECLARE_ALIGNED(16, uint8_t, temp[MAX_SB_SIZE * MAX_SB_SIZE]);
  assert(w <= MAX_SB_SIZE);
  assert(h <= MAX_SB_SIZE);
-  vpx_convolve8_c(src, src_stride, temp, MAX_SB_SIZE,
+  aom_convolve8_c(src, src_stride, temp, MAX_SB_SIZE, filter_x, x_step_q4,
-                  filter_x, x_step_q4, filter_y, y_step_q4, w, h);
+                  filter_y, y_step_q4, w, h);
-  vpx_convolve_avg_c(temp, MAX_SB_SIZE, dst, dst_stride,
+  aom_convolve_avg_c(temp, MAX_SB_SIZE, dst, dst_stride, NULL, 0, NULL, 0, w,
-                     NULL, 0, NULL, 0, w, h);
+                     h);
 }
-void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride,
+void aom_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
-                         uint8_t *dst, ptrdiff_t dst_stride,
+                         ptrdiff_t dst_stride, const int16_t *filter_x,
-                         const int16_t *filter_x, int filter_x_stride,
+                         int filter_x_stride, const int16_t *filter_y,
-                         const int16_t *filter_y, int filter_y_stride,
+                         int filter_y_stride, int w, int h) {
                         int w, int h) {
  int r;
-  (void)filter_x;  (void)filter_x_stride;
+  (void)filter_x;
-  (void)filter_y;  (void)filter_y_stride;
+  (void)filter_x_stride;
  (void)filter_y;
  (void)filter_y_stride;
  for (r = h; r > 0; --r) {
    memcpy(dst, src, w);
@@ -267,85 +263,80 @@ void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride,
  }
 }
-void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride,
+void aom_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
-                        uint8_t *dst, ptrdiff_t dst_stride,
+                        ptrdiff_t dst_stride, const int16_t *filter_x,
-                        const int16_t *filter_x, int filter_x_stride,
+                        int filter_x_stride, const int16_t *filter_y,
-                        const int16_t *filter_y, int filter_y_stride,
+                        int filter_y_stride, int w, int h) {
                        int w, int h) {
  int x, y;
-  (void)filter_x;  (void)filter_x_stride;
+  (void)filter_x;
-  (void)filter_y;  (void)filter_y_stride;
+  (void)filter_x_stride;
  (void)filter_y;
  (void)filter_y_stride;
  for (y = 0; y < h; ++y) {
-    for (x = 0; x < w; ++x)
+    for (x = 0; x < w; ++x) dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1);
      dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1);
    src += src_stride;
    dst += dst_stride;
  }
 }
-void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+void aom_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
-                        uint8_t *dst, ptrdiff_t dst_stride,
+                        ptrdiff_t dst_stride, const int16_t *filter_x,
-                        const int16_t *filter_x, int x_step_q4,
+                        int x_step_q4, const int16_t *filter_y, int y_step_q4,
                        const int16_t *filter_y, int y_step_q4,
                        int w, int h) {
-  vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
+  aom_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
                        filter_y, y_step_q4, w, h);
 }
-void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+void aom_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
-                       uint8_t *dst, ptrdiff_t dst_stride,
+                       ptrdiff_t dst_stride, const int16_t *filter_x,
-                       const int16_t *filter_x, int x_step_q4,
+                       int x_step_q4, const int16_t *filter_y, int y_step_q4,
                       const int16_t *filter_y, int y_step_q4,
                       int w, int h) {
-  vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
+  aom_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
                       filter_y, y_step_q4, w, h);
 }
-void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride,
+void aom_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
-                     uint8_t *dst, ptrdiff_t dst_stride,
+                     ptrdiff_t dst_stride, const int16_t *filter_x,
-                     const int16_t *filter_x, int x_step_q4,
+                     int x_step_q4, const int16_t *filter_y, int y_step_q4,
                     const int16_t *filter_y, int y_step_q4,
                     int w, int h) {
-  vpx_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
+  aom_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
                  filter_y, y_step_q4, w, h);
 }
-void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+void aom_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride,
                            const int16_t *filter_x, int x_step_q4,
-                            const int16_t *filter_y, int y_step_q4,
+                            const int16_t *filter_y, int y_step_q4, int w,
-                            int w, int h) {
+                            int h) {
-  vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x,
+  aom_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x,
                            x_step_q4, filter_y, y_step_q4, w, h);
 }
-void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+void aom_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
                           const int16_t *filter_x, int x_step_q4,
-                           const int16_t *filter_y, int y_step_q4,
+                           const int16_t *filter_y, int y_step_q4, int w,
-                           int w, int h) {
+                           int h) {
-  vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x,
+  aom_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x,
                           x_step_q4, filter_y, y_step_q4, w, h);
 }
-void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride,
+void aom_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
-                     uint8_t *dst, ptrdiff_t dst_stride,
+                         ptrdiff_t dst_stride, const int16_t *filter_x,
-                     const int16_t *filter_x, int x_step_q4,
+                         int x_step_q4, const int16_t *filter_y, int y_step_q4,
-                     const int16_t *filter_y, int y_step_q4,
+                         int w, int h) {
-                     int w, int h) {
+  aom_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
  vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
                      filter_y, y_step_q4, w, h);
 }
-#if CONFIG_VP9_HIGHBITDEPTH
+#if CONFIG_AOM_HIGHBITDEPTH
 static void highbd_convolve_horiz(const uint8_t *src8, ptrdiff_t src_stride,
                                  uint8_t *dst8, ptrdiff_t dst_stride,
-                                  const InterpKernel *x_filters,
+                                  const InterpKernel *x_filters, int x0_q4,
-                                  int x0_q4, int x_step_q4,
+                                  int x_step_q4, int w, int h, int bd) {
                                  int w, int h, int bd) {
  int x, y;
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
@@ -356,8 +347,7 @@ static void highbd_convolve_horiz(const uint8_t *src8, ptrdiff_t src_stride,
      const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
      int k, sum = 0;
-      for (k = 0; k < SUBPEL_TAPS; ++k)
+      for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
        sum += src_x[k] * x_filter[k];
      dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
      x_q4 += x_step_q4;
    }
@@ -368,9 +358,8 @@ static void highbd_convolve_horiz(const uint8_t *src8, ptrdiff_t src_stride,
 static void highbd_convolve_avg_horiz(const uint8_t *src8, ptrdiff_t src_stride,
                                      uint8_t *dst8, ptrdiff_t dst_stride,
-                                      const InterpKernel *x_filters,
+                                      const InterpKernel *x_filters, int x0_q4,
-                                      int x0_q4, int x_step_q4,
+                                      int x_step_q4, int w, int h, int bd) {
                                      int w, int h, int bd) {
  int x, y;
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
@@ -381,10 +370,10 @@ static void highbd_convolve_avg_horiz(const uint8_t *src8, ptrdiff_t src_stride,
      const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
      int k, sum = 0;
-      for (k = 0; k < SUBPEL_TAPS; ++k)
+      for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
-        sum += src_x[k] * x_filter[k];
+      dst[x] = ROUND_POWER_OF_TWO(
-      dst[x] = ROUND_POWER_OF_TWO(dst[x] +
+          dst[x] + clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd),
-          clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd), 1);
+          1);
      x_q4 += x_step_q4;
    }
    src += src_stride;
@@ -394,9 +383,8 @@ static void highbd_convolve_avg_horiz(const uint8_t *src8, ptrdiff_t src_stride,
 static void highbd_convolve_vert(const uint8_t *src8, ptrdiff_t src_stride,
                                 uint8_t *dst8, ptrdiff_t dst_stride,
-                                 const InterpKernel *y_filters,
+                                 const InterpKernel *y_filters, int y0_q4,
-                                 int y0_q4, int y_step_q4, int w, int h,
+                                 int y_step_q4, int w, int h, int bd) {
                                 int bd) {
  int x, y;
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
@@ -409,8 +397,8 @@ static void highbd_convolve_vert(const uint8_t *src8, ptrdiff_t src_stride,
      int k, sum = 0;
      for (k = 0; k < SUBPEL_TAPS; ++k)
        sum += src_y[k * src_stride] * y_filter[k];
-      dst[y * dst_stride] = clip_pixel_highbd(
+      dst[y * dst_stride] =
-          ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
+          clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
      y_q4 += y_step_q4;
    }
    ++src;
@@ -420,9 +408,8 @@ static void highbd_convolve_vert(const uint8_t *src8, ptrdiff_t src_stride,
 static void highbd_convolve_avg_vert(const uint8_t *src8, ptrdiff_t src_stride,
                                     uint8_t *dst8, ptrdiff_t dst_stride,
-                                     const InterpKernel *y_filters,
+                                     const InterpKernel *y_filters, int y0_q4,
-                                     int y0_q4, int y_step_q4, int w, int h,
+                                     int y_step_q4, int w, int h, int bd) {
                                     int bd) {
  int x, y;
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
@@ -435,8 +422,10 @@ static void highbd_convolve_avg_vert(const uint8_t *src8, ptrdiff_t src_stride,
      int k, sum = 0;
      for (k = 0; k < SUBPEL_TAPS; ++k)
        sum += src_y[k * src_stride] * y_filter[k];
-      dst[y * dst_stride] = ROUND_POWER_OF_TWO(dst[y * dst_stride] +
+      dst[y * dst_stride] = ROUND_POWER_OF_TWO(
-          clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd), 1);
+          dst[y * dst_stride] +
              clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd),
          1);
      y_q4 += y_step_q4;
    }
    ++src;
@@ -446,11 +435,9 @@ static void highbd_convolve_avg_vert(const uint8_t *src8, ptrdiff_t src_stride,
 static void highbd_convolve(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride,
-                            const InterpKernel *const x_filters,
+                            const InterpKernel *const x_filters, int x0_q4,
-                            int x0_q4, int x_step_q4,
+                            int x_step_q4, const InterpKernel *const y_filters,
-                            const InterpKernel *const y_filters,
+                            int y0_q4, int y_step_q4, int w, int h, int bd) {
                            int y0_q4, int y_step_q4,
                            int w, int h, int bd) {
  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
  // 2d filtering proceeds in 2 steps:
  //   (1) Interpolate horizontally into an intermediate buffer, temp.
@@ -465,7 +452,7 @@ static void highbd_convolve(const uint8_t *src, ptrdiff_t src_stride,
  // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
  uint16_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
  int intermediate_height =
-          (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
+      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
  assert(w <= MAX_SB_SIZE);
  assert(h <= MAX_SB_SIZE);
@@ -473,31 +460,28 @@ static void highbd_convolve(const uint8_t *src, ptrdiff_t src_stride,
  assert(x_step_q4 <= 32);
  highbd_convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride,
-                        CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE,
+                        CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE, x_filters, x0_q4,
-                        x_filters, x0_q4, x_step_q4, w,
+                        x_step_q4, w, intermediate_height, bd);
                        intermediate_height, bd);
  highbd_convolve_vert(
-    CONVERT_TO_BYTEPTR(temp) + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE,
+      CONVERT_TO_BYTEPTR(temp) + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
-    dst, dst_stride,
+      MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4, y_step_q4, w, h, bd);
    y_filters, y0_q4, y_step_q4, w, h, bd);
 }
-
+void aom_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
 void vpx_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
                                  uint8_t *dst, ptrdiff_t dst_stride,
                                  const int16_t *filter_x, int x_step_q4,
-                                  const int16_t *filter_y, int y_step_q4,
+                                  const int16_t *filter_y, int y_step_q4, int w,
-                                  int w, int h, int bd) {
+                                  int h, int bd) {
  const InterpKernel *const filters_x = get_filter_base(filter_x);
  const int x0_q4 = get_filter_offset(filter_x, filters_x);
  (void)filter_y;
  (void)y_step_q4;
-  highbd_convolve_horiz(src, src_stride, dst, dst_stride, filters_x,
+  highbd_convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4,
-                        x0_q4, x_step_q4, w, h, bd);
+                        x_step_q4, w, h, bd);
 }
-void vpx_highbd_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+void aom_highbd_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
                                      uint8_t *dst, ptrdiff_t dst_stride,
                                      const int16_t *filter_x, int x_step_q4,
                                      const int16_t *filter_y, int y_step_q4,
@@ -507,25 +491,25 @@ void vpx_highbd_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
  (void)filter_y;
  (void)y_step_q4;
-  highbd_convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x,
+  highbd_convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4,
-                            x0_q4, x_step_q4, w, h, bd);
+                            x_step_q4, w, h, bd);
 }
-void vpx_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+void aom_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
                                 uint8_t *dst, ptrdiff_t dst_stride,
                                 const int16_t *filter_x, int x_step_q4,
-                                 const int16_t *filter_y, int y_step_q4,
+                                 const int16_t *filter_y, int y_step_q4, int w,
-                                 int w, int h, int bd) {
+                                 int h, int bd) {
  const InterpKernel *const filters_y = get_filter_base(filter_y);
  const int y0_q4 = get_filter_offset(filter_y, filters_y);
  (void)filter_x;
  (void)x_step_q4;
-  highbd_convolve_vert(src, src_stride, dst, dst_stride, filters_y,
+  highbd_convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4,
-                       y0_q4, y_step_q4, w, h, bd);
+                       y_step_q4, w, h, bd);
 }
-void vpx_highbd_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+void aom_highbd_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
                                     uint8_t *dst, ptrdiff_t dst_stride,
                                     const int16_t *filter_x, int x_step_q4,
                                     const int16_t *filter_y, int y_step_q4,
@@ -535,45 +519,42 @@ void vpx_highbd_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
  (void)filter_x;
  (void)x_step_q4;
-  highbd_convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y,
+  highbd_convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4,
-                           y0_q4, y_step_q4, w, h, bd);
+                           y_step_q4, w, h, bd);
 }
-void vpx_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride,
+void aom_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride,
                            const int16_t *filter_x, int x_step_q4,
-                            const int16_t *filter_y, int y_step_q4,
+                            const int16_t *filter_y, int y_step_q4, int w,
-                            int w, int h, int bd) {
+                            int h, int bd) {
  const InterpKernel *const filters_x = get_filter_base(filter_x);
  const int x0_q4 = get_filter_offset(filter_x, filters_x);
  const InterpKernel *const filters_y = get_filter_base(filter_y);
  const int y0_q4 = get_filter_offset(filter_y, filters_y);
-  highbd_convolve(src, src_stride, dst, dst_stride,
+  highbd_convolve(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4,
                  filters_x, x0_q4, x_step_q4,
                  filters_y, y0_q4, y_step_q4, w, h, bd);
 }
-void vpx_highbd_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
+void aom_highbd_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
                                uint8_t *dst, ptrdiff_t dst_stride,
                                const int16_t *filter_x, int x_step_q4,
-                                const int16_t *filter_y, int y_step_q4,
+                                const int16_t *filter_y, int y_step_q4, int w,
-                                int w, int h, int bd) {
+                                int h, int bd) {
  // Fixed size intermediate buffer places limits on parameters.
  DECLARE_ALIGNED(16, uint16_t, temp[MAX_SB_SIZE * MAX_SB_SIZE]);
  assert(w <= MAX_SB_SIZE);
  assert(h <= MAX_SB_SIZE);
-  vpx_highbd_convolve8_c(src, src_stride,
+  aom_highbd_convolve8_c(src, src_stride, CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE,
                         CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE,
                         filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd);
-  vpx_highbd_convolve_avg_c(CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE,
+  aom_highbd_convolve_avg_c(CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE, dst,
-                            dst, dst_stride,
+                            dst_stride, NULL, 0, NULL, 0, w, h, bd);
                            NULL, 0, NULL, 0, w, h, bd);
 }
-void vpx_highbd_convolve_copy_c(const uint8_t *src8, ptrdiff_t src_stride,
+void aom_highbd_convolve_copy_c(const uint8_t *src8, ptrdiff_t src_stride,
                                uint8_t *dst8, ptrdiff_t dst_stride,
                                const int16_t *filter_x, int filter_x_stride,
                                const int16_t *filter_y, int filter_y_stride,
@@ -594,7 +575,7 @@ void vpx_highbd_convolve_copy_c(const uint8_t *src8, ptrdiff_t src_stride,
  }
 }
-void vpx_highbd_convolve_avg_c(const uint8_t *src8, ptrdiff_t src_stride,
+void aom_highbd_convolve_avg_c(const uint8_t *src8, ptrdiff_t src_stride,
                               uint8_t *dst8, ptrdiff_t dst_stride,
                               const int16_t *filter_x, int filter_x_stride,
                               const int16_t *filter_y, int filter_y_stride,
--- a/aom_dsp/aom_convolve.h
+++ b/aom_dsp/aom_convolve.h
@@ -1,17 +1,18 @@
 /*
- *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
- *  Use of this source code is governed by a BSD-style license
+ * This source code is subject to the terms of the BSD 2 Clause License and
- *  that can be found in the LICENSE file in the root of the source
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- *  tree. An additional intellectual property rights grant can be found
+ * was not distributed with this source code in the LICENSE file, you can
- *  in the file PATENTS.  All contributing project authors may
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- *  be found in the AUTHORS file in the root of the source tree.
+ * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
-#ifndef VPX_DSP_VPX_CONVOLVE_H_
+#ifndef AOM_DSP_AOM_CONVOLVE_H_
-#define VPX_DSP_VPX_CONVOLVE_H_
+#define AOM_DSP_AOM_CONVOLVE_H_
-#include "./vpx_config.h"
+#include "./aom_config.h"
-#include "vpx/vpx_integer.h"
+#include "aom/aom_integer.h"
 #ifdef __cplusplus
 extern "C" {
@@ -29,19 +30,19 @@ extern "C" {
 // --Must round-up because block may be located at sub-pixel position.
 // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
 // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
-#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+#if CONFIG_AV1 && CONFIG_EXT_PARTITION
-# define MAX_EXT_SIZE 263
+#define MAX_EXT_SIZE 263
 #else
-# define MAX_EXT_SIZE 135
+#define MAX_EXT_SIZE 135
-#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
+#endif  // CONFIG_AV1 && CONFIG_EXT_PARTITION
 typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
                              const int16_t *filter_x, int x_step_q4,
-                              const int16_t *filter_y, int y_step_q4,
+                              const int16_t *filter_y, int y_step_q4, int w,
-                              int w, int h);
+                              int h);
-#if CONFIG_VP9_HIGHBITDEPTH
+#if CONFIG_AOM_HIGHBITDEPTH
 typedef void (*highbd_convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride,
                                     uint8_t *dst, ptrdiff_t dst_stride,
                                     const int16_t *filter_x, int x_step_q4,
@@ -53,4 +54,4 @@ typedef void (*highbd_convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride,
 }  // extern "C"
 #endif
-#endif  // VPX_DSP_VPX_CONVOLVE_H_
+#endif  // AOM_DSP_AOM_CONVOLVE_H_
--- a/aom_dsp/aom_dsp.mk
+++ b/aom_dsp/aom_dsp.mk
@@ -1,15 +1,17 @@
 ##
-## Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+## Copyright (c) 2016, Alliance for Open Media. All rights reserved
 ##
-##  Use of this source code is governed by a BSD-style license
+## This source code is subject to the terms of the BSD 2 Clause License and
-##  that can be found in the LICENSE file in the root of the source
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-##  tree. An additional intellectual property rights grant can be found
+## was not distributed with this source code in the LICENSE file, you can
-##  in the file PATENTS.  All contributing project authors may
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
-##  be found in the AUTHORS file in the root of the source tree.
+## Media Patent License 1.0 was not distributed with this source code in the
 ## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 ##
-DSP_SRCS-yes += vpx_dsp.mk
+
-DSP_SRCS-yes += vpx_dsp_common.h
+DSP_SRCS-yes += aom_dsp.mk
 DSP_SRCS-yes += aom_dsp_common.h
 DSP_SRCS-$(HAVE_MSA)    += mips/macros_msa.h
@@ -18,14 +20,20 @@ DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64)   += x86/synonyms.h
 # bit reader
 DSP_SRCS-yes += prob.h
 DSP_SRCS-yes += prob.c
 DSP_SRCS-$(CONFIG_ANS) += ans.h
 DSP_SRCS-$(CONFIG_ANS) += ans.c
 ifeq ($(CONFIG_ENCODERS),yes)
 DSP_SRCS-$(CONFIG_ANS) += answriter.h
 DSP_SRCS-yes += bitwriter.h
-DSP_SRCS-yes += bitwriter.c
+DSP_SRCS-yes += dkboolwriter.h
 DSP_SRCS-yes += dkboolwriter.c
 DSP_SRCS-yes += bitwriter_buffer.c
 DSP_SRCS-yes += bitwriter_buffer.h
 DSP_SRCS-yes += psnr.c
 DSP_SRCS-yes += psnr.h
 DSP_SRCS-$(CONFIG_ANS) += buf_ans.h
 DSP_SRCS-$(CONFIG_ANS) += buf_ans.c
 DSP_SRCS-$(CONFIG_INTERNAL_STATS) += ssim.c
 DSP_SRCS-$(CONFIG_INTERNAL_STATS) += ssim.h
 DSP_SRCS-$(CONFIG_INTERNAL_STATS) += psnrhvs.c
@@ -33,8 +41,10 @@ DSP_SRCS-$(CONFIG_INTERNAL_STATS) += fastssim.c
 endif
 ifeq ($(CONFIG_DECODERS),yes)
 DSP_SRCS-$(CONFIG_ANS) += ansreader.h
 DSP_SRCS-yes += bitreader.h
-DSP_SRCS-yes += bitreader.c
+DSP_SRCS-yes += dkboolreader.h
 DSP_SRCS-yes += dkboolreader.c
 DSP_SRCS-yes += bitreader_buffer.c
 DSP_SRCS-yes += bitreader_buffer.h
 endif
@@ -42,15 +52,28 @@ endif
 # intra predictions
 DSP_SRCS-yes += intrapred.c
 ifeq ($(CONFIG_DAALA_EC),yes)
 DSP_SRCS-yes += entenc.c
 DSP_SRCS-yes += entenc.h
 DSP_SRCS-yes += entdec.c
 DSP_SRCS-yes += entdec.h
 DSP_SRCS-yes += entcode.c
 DSP_SRCS-yes += entcode.h
 DSP_SRCS-yes += daalaboolreader.c
 DSP_SRCS-yes += daalaboolreader.h
 DSP_SRCS-yes += daalaboolwriter.c
 DSP_SRCS-yes += daalaboolwriter.h
 endif
 DSP_SRCS-$(HAVE_SSE) += x86/intrapred_sse2.asm
 DSP_SRCS-$(HAVE_SSE2) += x86/intrapred_sse2.asm
 DSP_SRCS-$(HAVE_SSSE3) += x86/intrapred_ssse3.asm
-DSP_SRCS-$(HAVE_SSSE3) += x86/vpx_subpixel_8t_ssse3.asm
+DSP_SRCS-$(HAVE_SSSE3) += x86/aom_subpixel_8t_ssse3.asm
-ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+ifeq ($(CONFIG_AOM_HIGHBITDEPTH),yes)
 DSP_SRCS-$(HAVE_SSE)  += x86/highbd_intrapred_sse2.asm
 DSP_SRCS-$(HAVE_SSE2) += x86/highbd_intrapred_sse2.asm
-endif  # CONFIG_VP9_HIGHBITDEPTH
+endif  # CONFIG_AOM_HIGHBITDEPTH
 DSP_SRCS-$(HAVE_NEON_ASM) += arm/intrapred_neon_asm$(ASM)
 DSP_SRCS-$(HAVE_NEON) += arm/intrapred_neon.c
@@ -63,8 +86,6 @@ DSP_SRCS-$(HAVE_DSPR2)  += mips/common_dspr2.h
 DSP_SRCS-$(HAVE_DSPR2)  += mips/common_dspr2.c
 # inter predictions
 ifeq ($(CONFIG_VP10),yes)
 DSP_SRCS-yes            += blend.h
 DSP_SRCS-yes            += blend_a64_mask.c
 DSP_SRCS-yes            += blend_a64_hmask.c
@@ -73,54 +94,52 @@ DSP_SRCS-$(HAVE_SSE4_1) += x86/blend_sse4.h
 DSP_SRCS-$(HAVE_SSE4_1) += x86/blend_a64_mask_sse4.c
 DSP_SRCS-$(HAVE_SSE4_1) += x86/blend_a64_hmask_sse4.c
 DSP_SRCS-$(HAVE_SSE4_1) += x86/blend_a64_vmask_sse4.c
 endif  #CONFIG_VP10
 # interpolation filters
-DSP_SRCS-yes += vpx_convolve.c
+DSP_SRCS-yes += aom_convolve.c
-DSP_SRCS-yes += vpx_convolve.h
+DSP_SRCS-yes += aom_convolve.h
-DSP_SRCS-yes += vpx_filter.h
+DSP_SRCS-yes += aom_filter.h
 DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64) += x86/convolve.h
-DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64) += x86/vpx_asm_stubs.c
+DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64) += x86/aom_asm_stubs.c
-DSP_SRCS-$(HAVE_SSE2)  += x86/vpx_subpixel_8t_sse2.asm
+DSP_SRCS-$(HAVE_SSE2)  += x86/aom_subpixel_8t_sse2.asm
-DSP_SRCS-$(HAVE_SSE2)  += x86/vpx_subpixel_bilinear_sse2.asm
+DSP_SRCS-$(HAVE_SSE2)  += x86/aom_subpixel_bilinear_sse2.asm
-DSP_SRCS-$(HAVE_SSSE3) += x86/vpx_subpixel_8t_ssse3.asm
+DSP_SRCS-$(HAVE_SSSE3) += x86/aom_subpixel_8t_ssse3.asm
-DSP_SRCS-$(HAVE_SSSE3) += x86/vpx_subpixel_bilinear_ssse3.asm
+DSP_SRCS-$(HAVE_SSSE3) += x86/aom_subpixel_bilinear_ssse3.asm
-DSP_SRCS-$(HAVE_AVX2)  += x86/vpx_subpixel_8t_intrin_avx2.c
+DSP_SRCS-$(HAVE_AVX2)  += x86/aom_subpixel_8t_intrin_avx2.c
-DSP_SRCS-$(HAVE_SSSE3) += x86/vpx_subpixel_8t_intrin_ssse3.c
+DSP_SRCS-$(HAVE_SSSE3) += x86/aom_subpixel_8t_intrin_ssse3.c
-ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+ifeq ($(CONFIG_AOM_HIGHBITDEPTH),yes)
-DSP_SRCS-$(HAVE_SSE2)  += x86/vpx_high_subpixel_8t_sse2.asm
+DSP_SRCS-$(HAVE_SSE2)  += x86/aom_high_subpixel_8t_sse2.asm
-DSP_SRCS-$(HAVE_SSE2)  += x86/vpx_high_subpixel_bilinear_sse2.asm
+DSP_SRCS-$(HAVE_SSE2)  += x86/aom_high_subpixel_bilinear_sse2.asm
 endif
-
+DSP_SRCS-$(HAVE_SSE2)  += x86/aom_convolve_copy_sse2.asm
 DSP_SRCS-$(HAVE_SSE2)  += x86/vpx_convolve_copy_sse2.asm
 ifeq ($(HAVE_NEON_ASM),yes)
-DSP_SRCS-yes += arm/vpx_convolve_copy_neon_asm$(ASM)
+DSP_SRCS-yes += arm/aom_convolve_copy_neon_asm$(ASM)
-DSP_SRCS-yes += arm/vpx_convolve8_avg_neon_asm$(ASM)
+DSP_SRCS-yes += arm/aom_convolve8_avg_neon_asm$(ASM)
-DSP_SRCS-yes += arm/vpx_convolve8_neon_asm$(ASM)
+DSP_SRCS-yes += arm/aom_convolve8_neon_asm$(ASM)
-DSP_SRCS-yes += arm/vpx_convolve_avg_neon_asm$(ASM)
+DSP_SRCS-yes += arm/aom_convolve_avg_neon_asm$(ASM)
-DSP_SRCS-yes += arm/vpx_convolve_neon.c
+DSP_SRCS-yes += arm/aom_convolve_neon.c
 else
 ifeq ($(HAVE_NEON),yes)
-DSP_SRCS-yes += arm/vpx_convolve_copy_neon.c
+DSP_SRCS-yes += arm/aom_convolve_copy_neon.c
-DSP_SRCS-yes += arm/vpx_convolve8_avg_neon.c
+DSP_SRCS-yes += arm/aom_convolve8_avg_neon.c
-DSP_SRCS-yes += arm/vpx_convolve8_neon.c
+DSP_SRCS-yes += arm/aom_convolve8_neon.c
-DSP_SRCS-yes += arm/vpx_convolve_avg_neon.c
+DSP_SRCS-yes += arm/aom_convolve_avg_neon.c
-DSP_SRCS-yes += arm/vpx_convolve_neon.c
+DSP_SRCS-yes += arm/aom_convolve_neon.c
 endif  # HAVE_NEON
 endif  # HAVE_NEON_ASM
 # common (msa)
-DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve8_avg_horiz_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/aom_convolve8_avg_horiz_msa.c
-DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve8_avg_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/aom_convolve8_avg_msa.c
-DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve8_avg_vert_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/aom_convolve8_avg_vert_msa.c
-DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve8_horiz_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/aom_convolve8_horiz_msa.c
-DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve8_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/aom_convolve8_msa.c
-DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve8_vert_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/aom_convolve8_vert_msa.c
-DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve_avg_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/aom_convolve_avg_msa.c
-DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve_copy_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/aom_convolve_copy_msa.c
-DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve_msa.h
+DSP_SRCS-$(HAVE_MSA) += mips/aom_convolve_msa.h
 # common (dspr2)
 DSP_SRCS-$(HAVE_DSPR2)  += mips/convolve_common_dspr2.h
@@ -167,15 +186,37 @@ DSP_SRCS-$(HAVE_DSPR2)  += mips/loopfilter_mb_dspr2.c
 DSP_SRCS-$(HAVE_DSPR2)  += mips/loopfilter_mb_horiz_dspr2.c
 DSP_SRCS-$(HAVE_DSPR2)  += mips/loopfilter_mb_vert_dspr2.c
-ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+ifeq ($(CONFIG_AOM_HIGHBITDEPTH),yes)
 DSP_SRCS-$(HAVE_SSE2)   += x86/highbd_loopfilter_sse2.c
-endif  # CONFIG_VP9_HIGHBITDEPTH
+endif  # CONFIG_AOM_HIGHBITDEPTH
 DSP_SRCS-yes            += txfm_common.h
 DSP_SRCS-yes            += x86/txfm_common_intrin.h
 DSP_SRCS-$(HAVE_SSE2)   += x86/txfm_common_sse2.h
 DSP_SRCS-$(HAVE_MSA)    += mips/txfm_macros_msa.h
 # forward transform
-ifeq ($(CONFIG_VP10),yes)
+ifeq ($(CONFIG_AV1),yes)
 DSP_SRCS-yes            += fwd_txfm.c
 DSP_SRCS-yes            += fwd_txfm.h
 DSP_SRCS-$(HAVE_SSE2)   += x86/fwd_txfm_sse2.h
 DSP_SRCS-$(HAVE_SSE2)   += x86/fwd_txfm_sse2.c
 DSP_SRCS-$(HAVE_SSE2)   += x86/fwd_dct32_8cols_sse2.c
 DSP_SRCS-$(HAVE_SSE2)   += x86/fwd_txfm_impl_sse2.h
 DSP_SRCS-$(HAVE_SSE2)   += x86/fwd_dct32x32_impl_sse2.h
 ifeq ($(ARCH_X86_64),yes)
 DSP_SRCS-$(HAVE_SSSE3)  += x86/fwd_txfm_ssse3_x86_64.asm
 endif
 DSP_SRCS-$(HAVE_AVX2)   += x86/fwd_txfm_avx2.h
 DSP_SRCS-$(HAVE_AVX2)   += x86/fwd_txfm_avx2.c
 DSP_SRCS-$(HAVE_AVX2)   += x86/txfm_common_avx2.h
 DSP_SRCS-$(HAVE_AVX2)   += x86/fwd_dct32x32_impl_avx2.h
 DSP_SRCS-$(HAVE_NEON)   += arm/fwd_txfm_neon.c
 DSP_SRCS-$(HAVE_MSA)    += mips/fwd_txfm_msa.h
 DSP_SRCS-$(HAVE_MSA)    += mips/fwd_txfm_msa.c
 DSP_SRCS-$(HAVE_MSA)    += mips/fwd_dct32x32_msa.c
 endif  # CONFIG_AV1
 ifeq ($(CONFIG_PVQ),yes)
 DSP_SRCS-yes            += fwd_txfm.c
 DSP_SRCS-yes            += fwd_txfm.h
 DSP_SRCS-$(HAVE_SSE2)   += x86/fwd_txfm_sse2.h
@@ -191,10 +232,10 @@ DSP_SRCS-$(HAVE_NEON)   += arm/fwd_txfm_neon.c
 DSP_SRCS-$(HAVE_MSA)    += mips/fwd_txfm_msa.h
 DSP_SRCS-$(HAVE_MSA)    += mips/fwd_txfm_msa.c
 DSP_SRCS-$(HAVE_MSA)    += mips/fwd_dct32x32_msa.c
-endif  # CONFIG_VP10_ENCODER
+endif  # CONFIG_PVQ
 # inverse transform
-ifeq ($(CONFIG_VP10), yes)
+ifeq ($(CONFIG_AV1), yes)
 DSP_SRCS-yes            += inv_txfm.h
 DSP_SRCS-yes            += inv_txfm.c
 DSP_SRCS-$(HAVE_SSE2)   += x86/inv_txfm_sse2.h
@@ -234,23 +275,23 @@ DSP_SRCS-$(HAVE_MSA)   += mips/idct8x8_msa.c
 DSP_SRCS-$(HAVE_MSA)   += mips/idct16x16_msa.c
 DSP_SRCS-$(HAVE_MSA)   += mips/idct32x32_msa.c
-ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+ifneq ($(CONFIG_AOM_HIGHBITDEPTH),yes)
 DSP_SRCS-$(HAVE_DSPR2) += mips/inv_txfm_dspr2.h
 DSP_SRCS-$(HAVE_DSPR2) += mips/itrans4_dspr2.c
 DSP_SRCS-$(HAVE_DSPR2) += mips/itrans8_dspr2.c
 DSP_SRCS-$(HAVE_DSPR2) += mips/itrans16_dspr2.c
 DSP_SRCS-$(HAVE_DSPR2) += mips/itrans32_dspr2.c
 DSP_SRCS-$(HAVE_DSPR2) += mips/itrans32_cols_dspr2.c
-endif  # CONFIG_VP9_HIGHBITDEPTH
+endif  # CONFIG_AOM_HIGHBITDEPTH
-endif  # CONFIG_VP10
+endif  # CONFIG_AV1
 # quantization
-ifneq ($(filter yes,$(CONFIG_VP10_ENCODER)),)
+ifneq ($(filter yes,$(CONFIG_AV1_ENCODER)),)
 DSP_SRCS-yes            += quantize.c
 DSP_SRCS-yes            += quantize.h
 DSP_SRCS-$(HAVE_SSE2)   += x86/quantize_sse2.c
-ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+ifeq ($(CONFIG_AOM_HIGHBITDEPTH),yes)
 DSP_SRCS-$(HAVE_SSE2)   += x86/highbd_quantize_intrin_sse2.c
 endif
 ifeq ($(ARCH_X86_64),yes)
@@ -269,17 +310,17 @@ DSP_SRCS-$(HAVE_SSSE3) += x86/avg_ssse3_x86_64.asm
 endif
 # high bit depth subtract
-ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+ifeq ($(CONFIG_AOM_HIGHBITDEPTH),yes)
 DSP_SRCS-$(HAVE_SSE2)  += x86/highbd_subtract_sse2.c
 endif
-endif  # CONFIG_VP10_ENCODER
+endif  # CONFIG_AV1_ENCODER
-ifeq ($(CONFIG_VP10_ENCODER),yes)
+ifeq ($(CONFIG_AV1_ENCODER),yes)
 DSP_SRCS-yes            += sum_squares.c
 DSP_SRCS-$(HAVE_SSE2)   += x86/sum_squares_sse2.c
-endif # CONFIG_VP10_ENCODER
+endif # CONFIG_AV1_ENCODER
 ifeq ($(CONFIG_ENCODERS),yes)
 DSP_SRCS-yes            += sad.c
@@ -299,16 +340,16 @@ DSP_SRCS-$(HAVE_SSE4_1) += x86/sad_sse4.asm
 DSP_SRCS-$(HAVE_AVX2)   += x86/sad4d_avx2.c
 DSP_SRCS-$(HAVE_AVX2)   += x86/sad_avx2.c
-ifeq ($(CONFIG_VP10_ENCODER),yes)
+ifeq ($(CONFIG_AV1_ENCODER),yes)
 ifeq ($(CONFIG_EXT_INTER),yes)
 DSP_SRCS-$(HAVE_SSSE3)  += x86/masked_sad_intrin_ssse3.c
 DSP_SRCS-$(HAVE_SSSE3)  += x86/masked_variance_intrin_ssse3.c
 endif  #CONFIG_EXT_INTER
-ifeq ($(CONFIG_OBMC),yes)
+ifeq ($(CONFIG_MOTION_VAR),yes)
 DSP_SRCS-$(HAVE_SSE4_1) += x86/obmc_sad_sse4.c
 DSP_SRCS-$(HAVE_SSE4_1) += x86/obmc_variance_sse4.c
-endif  #CONFIG_OBMC
+endif  #CONFIG_MOTION_VAR
-endif  #CONFIG_VP10_ENCODER
+endif  #CONFIG_AV1_ENCODER
 DSP_SRCS-$(HAVE_SSE)    += x86/sad4d_sse2.asm
 DSP_SRCS-$(HAVE_SSE)    += x86/sad_sse2.asm
@@ -316,10 +357,10 @@ DSP_SRCS-$(HAVE_SSE2)   += x86/sad4d_sse2.asm
 DSP_SRCS-$(HAVE_SSE2)   += x86/sad_sse2.asm
 DSP_SRCS-$(HAVE_SSE2)   += x86/subtract_sse2.asm
-ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+ifeq ($(CONFIG_AOM_HIGHBITDEPTH),yes)
 DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad4d_sse2.asm
 DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad_sse2.asm
-endif  # CONFIG_VP9_HIGHBITDEPTH
+endif  # CONFIG_AOM_HIGHBITDEPTH
 endif  # CONFIG_ENCODERS
@@ -353,17 +394,33 @@ endif  # ARCH_X86_64
 DSP_SRCS-$(HAVE_SSE)    += x86/subpel_variance_sse2.asm
 DSP_SRCS-$(HAVE_SSE2)   += x86/subpel_variance_sse2.asm  # Contains SSE2 and SSSE3
-ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+ifeq ($(CONFIG_AOM_HIGHBITDEPTH),yes)
 DSP_SRCS-$(HAVE_SSE2)   += x86/highbd_variance_sse2.c
 DSP_SRCS-$(HAVE_SSE4_1) += x86/highbd_variance_sse4.c
 DSP_SRCS-$(HAVE_SSE2)   += x86/highbd_variance_impl_sse2.asm
 DSP_SRCS-$(HAVE_SSE2)   += x86/highbd_subpel_variance_impl_sse2.asm
-endif  # CONFIG_VP9_HIGHBITDEPTH
+endif  # CONFIG_AOM_HIGHBITDEPTH
 endif  # CONFIG_ENCODERS
 DSP_SRCS-no += $(DSP_SRCS_REMOVE-yes)
-DSP_SRCS-yes += vpx_dsp_rtcd.c
+DSP_SRCS-yes += aom_dsp_rtcd.c
-DSP_SRCS-yes += vpx_dsp_rtcd_defs.pl
+DSP_SRCS-yes += aom_dsp_rtcd_defs.pl
-$(eval $(call rtcd_h_template,vpx_dsp_rtcd,vpx_dsp/vpx_dsp_rtcd_defs.pl))
+DSP_SRCS-yes += aom_simd.c
 DSP_SRCS-yes += aom_simd.h
 DSP_SRCS-yes += aom_simd_inline.h
 DSP_SRCS-yes += simd/v64_intrinsics.h
 DSP_SRCS-yes += simd/v64_intrinsics_c.h
 DSP_SRCS-yes += simd/v128_intrinsics.h
 DSP_SRCS-yes += simd/v128_intrinsics_c.h
 DSP_SRCS-yes += simd/v256_intrinsics.h
 DSP_SRCS-yes += simd/v256_intrinsics_c.h
 DSP_SRCS-$(HAVE_SSE2) += simd/v64_intrinsics_x86.h
 DSP_SRCS-$(HAVE_SSE2) += simd/v128_intrinsics_x86.h
 DSP_SRCS-$(HAVE_SSE2) += simd/v256_intrinsics_x86.h
 DSP_SRCS-$(HAVE_NEON) += simd/v64_intrinsics_arm.h
 DSP_SRCS-$(HAVE_NEON) += simd/v128_intrinsics_arm.h
 DSP_SRCS-$(HAVE_NEON) += simd/v256_intrinsics_arm.h
 $(eval $(call rtcd_h_template,aom_dsp_rtcd,aom_dsp/aom_dsp_rtcd_defs.pl))
--- a/aom_dsp/aom_dsp_common.h
+++ b/aom_dsp/aom_dsp_common.h
@@ -0,0 +1,102 @@
 /*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
 #ifndef AOM_DSP_AOM_DSP_COMMON_H_
 #define AOM_DSP_AOM_DSP_COMMON_H_
 #include "./aom_config.h"
 #include "aom/aom_integer.h"
 #include "aom_ports/mem.h"
 #ifdef __cplusplus
 extern "C" {
 #endif
 #ifndef MAX_SB_SIZE
 #if CONFIG_AV1 && CONFIG_EXT_PARTITION
 #define MAX_SB_SIZE 128
 #else
 #define MAX_SB_SIZE 64
 #endif  // CONFIG_AV1 && CONFIG_EXT_PARTITION
 #endif  // ndef MAX_SB_SIZE
 // Minimum / maximum of two values. The argument that is selected is
 // evaluated twice (once in the comparison, once as the result), so do not
 // pass expressions with side effects.
 #define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
 #define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
 #define IMPLIES(a, b) (!(a) || (b))  //  Logical 'a implies b' (or 'a -> b')
 // Non-zero when x has at most one bit set. Note that x == 0 also satisfies
 // the test, so callers that must reject zero need a separate check.
 #define IS_POWER_OF_TWO(x) (((x) & ((x)-1)) == 0)
 // These can be used to give a hint about branch outcomes.
 // This can have an effect, even if your target processor has a
 // good branch predictor, as these hints can affect basic block
 // ordering by the compiler.
 #ifdef __GNUC__
 #define LIKELY(v) __builtin_expect(v, 1)
 #define UNLIKELY(v) __builtin_expect(v, 0)
 #else
 #define LIKELY(v) (v)
 #define UNLIKELY(v) (v)
 #endif
 // Exchanges the values of the lvalues a and b, both of type `type`.
 // Each of a and b is expanded twice, so neither may have side effects.
 // The temporary uses a macro-specific name: with a generic name such as `c`,
 // swapping a caller variable of that same name would be shadowed inside the
 // do-block and the swap would silently do nothing.
 #define AOM_SWAP(type, a, b)  \
  do {                        \
    type aom_swap_tmp_ = (b); \
    (b) = (a);                \
    (a) = aom_swap_tmp_;      \
  } while (0)
 #if CONFIG_AOM_QM
 typedef uint16_t qm_val_t;
 #define AOM_QM_BITS 6
 #endif
 #if CONFIG_AOM_HIGHBITDEPTH
 // Note:
 // tran_low_t  is the datatype used for final transform coefficients.
 // tran_high_t is the datatype used for intermediate transform stages.
 typedef int64_t tran_high_t;
 typedef int32_t tran_low_t;
 #else
 // Note:
 // tran_low_t  is the datatype used for final transform coefficients.
 // tran_high_t is the datatype used for intermediate transform stages.
 typedef int32_t tran_high_t;
 typedef int16_t tran_low_t;
 #endif  // CONFIG_AOM_HIGHBITDEPTH
 // Saturates val to the 8-bit pixel range [0, 255].
 static INLINE uint8_t clip_pixel(int val) {
  if (val < 0) return 0;
  if (val > 255) return 255;
  return (uint8_t)val;
 }
 // Limits value to the closed interval [low, high]. The lower bound is tested
 // first, so low is returned whenever value < low.
 static INLINE int clamp(int value, int low, int high) {
  if (value < low) return low;
  if (value > high) return high;
  return value;
 }
 // Floating-point counterpart of clamp(): limits value to [low, high], testing
 // the lower bound first. A value that compares neither below low nor above
 // high is returned unchanged.
 static INLINE double fclamp(double value, double low, double high) {
  if (value < low) return low;
  if (value > high) return high;
  return value;
 }
 #if CONFIG_AOM_HIGHBITDEPTH
 // Saturates val to the pixel range implied by bit depth bd:
 // [0, 1023] for bd == 10, [0, 4095] for bd == 12, and [0, 255] for bd == 8
 // or any other value of bd.
 static INLINE uint16_t clip_pixel_highbd(int val, int bd) {
  int max_pixel = 255;  // bd == 8 and every unrecognized depth
  if (bd == 10) {
    max_pixel = 1023;
  } else if (bd == 12) {
    max_pixel = 4095;
  }
  return (uint16_t)clamp(val, 0, max_pixel);
 }
 #endif  // CONFIG_AOM_HIGHBITDEPTH
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 #endif  // AOM_DSP_AOM_DSP_COMMON_H_
--- a/aom_dsp/aom_dsp_rtcd.c
+++ b/aom_dsp/aom_dsp_rtcd.c
@@ -0,0 +1,16 @@
 /*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
 #include "./aom_config.h"
 #define RTCD_C
 #include "./aom_dsp_rtcd.h"
 #include "aom_ports/aom_once.h"
 // Hands setup_rtcd_internal to once() (declared in aom_ports/aom_once.h).
 // NOTE(review): once() presumably guarantees the setup routine runs a single
 // time even if this function is called repeatedly -- confirm in aom_once.h.
 // Declared with (void) so the definition is a proper prototype taking no
 // arguments rather than an unspecified parameter list.
 void aom_dsp_rtcd(void) { once(setup_rtcd_internal); }
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
--- a/aom_dsp/aom_filter.h
+++ b/aom_dsp/aom_filter.h
@@ -0,0 +1,43 @@
 /*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
 #ifndef AOM_DSP_AOM_FILTER_H_
 #define AOM_DSP_AOM_FILTER_H_
 #include "aom/aom_integer.h"
 #ifdef __cplusplus
 extern "C" {
 #endif
 #define FILTER_BITS 7
 #define SUBPEL_BITS 4
 #define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
 #define SUBPEL_SHIFTS (1 << SUBPEL_BITS)
 #define SUBPEL_TAPS 8
 typedef int16_t InterpKernel[SUBPEL_TAPS];
 #define BIL_SUBPEL_BITS 3
 #define BIL_SUBPEL_SHIFTS (1 << BIL_SUBPEL_BITS)
 // 2 tap bilinear filters: one { tap0, tap1 } pair for each of the
 // BIL_SUBPEL_SHIFTS (8) positions. tap1 grows in steps of 16 from one row to
 // the next and the two taps of every row sum to 128.
 static const uint8_t bilinear_filters_2t[BIL_SUBPEL_SHIFTS][2] = {
  { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
  { 64, 64 }, { 48, 80 },  { 32, 96 }, { 16, 112 },
 };
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 #endif  // AOM_DSP_AOM_FILTER_H_
--- a/aom_dsp/aom_simd.c
+++ b/aom_dsp/aom_simd.c
@@ -0,0 +1,13 @@
 /*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
 // Set to 1 to add some sanity checks in the fallback C code
 const int simd_check = 1;
--- a/aom_dsp/aom_simd.h
+++ b/aom_dsp/aom_simd.h
@@ -0,0 +1,32 @@
 /*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
 #ifndef AOM_DSP_AOM_AOM_SIMD_H_
 #define AOM_DSP_AOM_AOM_SIMD_H_
 #include <stdint.h>
 #if defined(_WIN32)
 #include <intrin.h>
 #endif
 #include "./aom_config.h"
 #include "./aom_simd_inline.h"
 #if HAVE_NEON
 #include "simd/v256_intrinsics_arm.h"
 #elif HAVE_SSE2
 #include "simd/v256_intrinsics_x86.h"
 #else
 #include "simd/v256_intrinsics.h"
 #endif
 #endif  // AOM_DSP_AOM_AOM_SIMD_H_
--- a/aom_dsp/aom_simd_inline.h
+++ b/aom_dsp/aom_simd_inline.h
@@ -0,0 +1,21 @@
 /*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
 #ifndef AOM_DSP_AOM_SIMD_INLINE_H_
 #define AOM_DSP_AOM_SIMD_INLINE_H_
 #include "aom/aom_integer.h"
 #ifndef SIMD_INLINE
 #define SIMD_INLINE static AOM_FORCE_INLINE
 #endif
 #endif  // AOM_DSP_AOM_SIMD_INLINE_H_
--- a/aom_dsp/arm/aom_convolve8_avg_neon.c
+++ b/aom_dsp/arm/aom_convolve8_avg_neon.c
@@ -1,31 +1,27 @@
 /*
- *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
- *  Use of this source code is governed by a BSD-style license
+ * This source code is subject to the terms of the BSD 2 Clause License and
- *  that can be found in the LICENSE file in the root of the source
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- *  tree. An additional intellectual property rights grant can be found
+ * was not distributed with this source code in the LICENSE file, you can
- *  in the file PATENTS.  All contributing project authors may
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- *  be found in the AUTHORS file in the root of the source tree.
+ * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
 #include <arm_neon.h>
 #include <assert.h>
-#include "./vpx_config.h"
+#include "./aom_config.h"
-#include "./vpx_dsp_rtcd.h"
+#include "./aom_dsp_rtcd.h"
-#include "vpx/vpx_integer.h"
+#include "aom/aom_integer.h"
-#include "vpx_ports/mem.h"
+#include "aom_ports/mem.h"
-static INLINE int32x4_t MULTIPLY_BY_Q0(
+static INLINE int32x4_t MULTIPLY_BY_Q0(int16x4_t dsrc0, int16x4_t dsrc1,
-    int16x4_t dsrc0,
+                                       int16x4_t dsrc2, int16x4_t dsrc3,
-    int16x4_t dsrc1,
+                                       int16x4_t dsrc4, int16x4_t dsrc5,
-    int16x4_t dsrc2,
+                                       int16x4_t dsrc6, int16x4_t dsrc7,
-    int16x4_t dsrc3,
+                                       int16x8_t q0s16) {
    int16x4_t dsrc4,
    int16x4_t dsrc5,
    int16x4_t dsrc6,
    int16x4_t dsrc7,
    int16x8_t q0s16) {
  int32x4_t qdst;
  int16x4_t d0s16, d1s16;
@@ -43,17 +39,12 @@ static INLINE int32x4_t MULTIPLY_BY_Q0(
  return qdst;
 }
-void vpx_convolve8_avg_horiz_neon(
+void aom_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
-    const uint8_t *src,
+                                  uint8_t *dst, ptrdiff_t dst_stride,
-    ptrdiff_t src_stride,
+                                  const int16_t *filter_x, int x_step_q4,
-    uint8_t *dst,
+                                  const int16_t *filter_y,  // unused
-    ptrdiff_t dst_stride,
+                                  int y_step_q4,            // unused
-    const int16_t *filter_x,
+                                  int w, int h) {
    int x_step_q4,
    const int16_t *filter_y,  // unused
    int y_step_q4,            // unused
    int w,
    int h) {
  int width;
  const uint8_t *s;
  uint8_t *d;
@@ -74,9 +65,13 @@ void vpx_convolve8_avg_horiz_neon(
  assert(x_step_q4 == 16);
  (void)x_step_q4;
  (void)y_step_q4;
  (void)filter_y;
  q0s16 = vld1q_s16(filter_x);
-  src -= 3;  // adjust for taps
+  src -= 3;                // adjust for taps
  for (; h > 0; h -= 4) {  // loop_horiz_v
    s = src;
    d24u8 = vld1_u8(s);
@@ -90,8 +85,8 @@ void vpx_convolve8_avg_horiz_neon(
    q12u8 = vcombine_u8(d24u8, d25u8);
    q13u8 = vcombine_u8(d26u8, d27u8);
-    q0x2u16 = vtrnq_u16(vreinterpretq_u16_u8(q12u8),
+    q0x2u16 =
-                        vreinterpretq_u16_u8(q13u8));
+        vtrnq_u16(vreinterpretq_u16_u8(q12u8), vreinterpretq_u16_u8(q13u8));
    d24u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[0]));
    d25u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[0]));
    d26u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[1]));
@@ -116,10 +111,8 @@ void vpx_convolve8_avg_horiz_neon(
    q9u16 = vcombine_u16(d17u16, d19u16);
    d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
-    d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));  // vmov 23 21
+    d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));         // vmov 23 21
-    for (width = w;
+    for (width = w; width > 0; width -= 4, src += 4, dst += 4) {  // loop_horiz
         width > 0;
         width -= 4, src += 4, dst += 4) {  // loop_horiz
      s = src;
      d28u32 = vld1_dup_u32((const uint32_t *)s);
      s += src_stride;
@@ -131,10 +124,10 @@ void vpx_convolve8_avg_horiz_neon(
      __builtin_prefetch(src + 64);
-      d0x2u16 = vtrn_u16(vreinterpret_u16_u32(d28u32),
+      d0x2u16 =
-                         vreinterpret_u16_u32(d31u32));
+          vtrn_u16(vreinterpret_u16_u32(d28u32), vreinterpret_u16_u32(d31u32));
-      d1x2u16 = vtrn_u16(vreinterpret_u16_u32(d29u32),
+      d1x2u16 =
-                         vreinterpret_u16_u32(d30u32));
+          vtrn_u16(vreinterpret_u16_u32(d29u32), vreinterpret_u16_u32(d30u32));
      d0x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[0]),   // d28
                       vreinterpret_u8_u16(d1x2u16.val[0]));  // d29
      d1x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[1]),   // d31
@@ -144,8 +137,8 @@ void vpx_convolve8_avg_horiz_neon(
      q14u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]);
      q15u8 = vcombine_u8(d1x2u8.val[1], d1x2u8.val[0]);
-      q0x2u32 = vtrnq_u32(vreinterpretq_u32_u8(q14u8),
+      q0x2u32 =
-                          vreinterpretq_u32_u8(q15u8));
+          vtrnq_u32(vreinterpretq_u32_u8(q14u8), vreinterpretq_u32_u8(q15u8));
      d28u8 = vreinterpret_u8_u32(vget_low_u32(q0x2u32.val[0]));
      d29u8 = vreinterpret_u8_u32(vget_high_u32(q0x2u32.val[0]));
@@ -173,14 +166,14 @@ void vpx_convolve8_avg_horiz_neon(
      d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
      d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
-      q1s32  = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16,
+      q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16, d18s16, d19s16,
-                              d18s16, d19s16, d23s16, d24s16, q0s16);
+                             d23s16, d24s16, q0s16);
-      q2s32  = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16,
+      q2s32 = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16, d19s16, d23s16,
-                              d19s16, d23s16, d24s16, d26s16, q0s16);
+                             d24s16, d26s16, q0s16);
-      q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16,
+      q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16, d23s16, d24s16,
-                              d23s16, d24s16, d26s16, d27s16, q0s16);
+                              d26s16, d27s16, q0s16);
-      q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16,
+      q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16, d24s16, d26s16,
-                              d24s16, d26s16, d27s16, d25s16, q0s16);
+                              d27s16, d25s16, q0s16);
      __builtin_prefetch(src + 64 + src_stride * 3);
@@ -195,8 +188,7 @@ void vpx_convolve8_avg_horiz_neon(
      d2u8 = vqmovn_u16(q1u16);
      d3u8 = vqmovn_u16(q2u16);
-      d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8),
+      d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8), vreinterpret_u16_u8(d3u8));
                         vreinterpret_u16_u8(d3u8));
      d0x2u32 = vtrn_u32(vreinterpret_u32_u16(d0x2u16.val[0]),
                         vreinterpret_u32_u16(d0x2u16.val[1]));
      d0x2u8 = vtrn_u8(vreinterpret_u8_u32(d0x2u32.val[0]),
@@ -231,17 +223,12 @@ void vpx_convolve8_avg_horiz_neon(
  return;
 }
-void vpx_convolve8_avg_vert_neon(
+void aom_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
-    const uint8_t *src,
+                                 uint8_t *dst, ptrdiff_t dst_stride,
-    ptrdiff_t src_stride,
+                                 const int16_t *filter_x,  // unused
-    uint8_t *dst,
+                                 int x_step_q4,            // unused
-    ptrdiff_t dst_stride,
+                                 const int16_t *filter_y, int y_step_q4, int w,
-    const int16_t *filter_x,  // unused
+                                 int h) {
    int x_step_q4,            // unused
    const int16_t *filter_y,
    int y_step_q4,
    int w,
    int h) {
  int height;
  const uint8_t *s;
  uint8_t *d;
@@ -258,6 +245,10 @@ void vpx_convolve8_avg_vert_neon(
  assert(y_step_q4 == 16);
  (void)x_step_q4;
  (void)y_step_q4;
  (void)filter_x;
  src -= src_stride * 3;
  q0s16 = vld1q_s16(filter_y);
  for (; w > 0; w -= 4, src += 4, dst += 4) {  // loop_vert_h
@@ -277,8 +268,8 @@ void vpx_convolve8_avg_vert_neon(
    d22u32 = vld1_lane_u32((const uint32_t *)s, d22u32, 0);
    s += src_stride;
-    q8u16  = vmovl_u8(vreinterpret_u8_u32(d16u32));
+    q8u16 = vmovl_u8(vreinterpret_u8_u32(d16u32));
-    q9u16  = vmovl_u8(vreinterpret_u8_u32(d18u32));
+    q9u16 = vmovl_u8(vreinterpret_u8_u32(d18u32));
    q10u16 = vmovl_u8(vreinterpret_u8_u32(d20u32));
    q11u16 = vmovl_u8(vreinterpret_u8_u32(d22u32));
@@ -319,20 +310,20 @@ void vpx_convolve8_avg_vert_neon(
      __builtin_prefetch(s);
      __builtin_prefetch(s + src_stride);
-      q1s32  = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16,
+      q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16, d20s16, d21s16,
-                              d20s16, d21s16, d22s16, d24s16, q0s16);
+                             d22s16, d24s16, q0s16);
      __builtin_prefetch(s + src_stride * 2);
      __builtin_prefetch(s + src_stride * 3);
-      q2s32  = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16,
+      q2s32 = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16, d21s16, d22s16,
-                              d21s16, d22s16, d24s16, d26s16, q0s16);
+                             d24s16, d26s16, q0s16);
      __builtin_prefetch(d);
      __builtin_prefetch(d + dst_stride);
-      q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16,
+      q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16, d22s16, d24s16,
-                              d22s16, d24s16, d26s16, d27s16, q0s16);
+                              d26s16, d27s16, q0s16);
      __builtin_prefetch(d + dst_stride * 2);
      __builtin_prefetch(d + dst_stride * 3);
-      q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16,
+      q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16, d24s16, d26s16,
-                              d24s16, d26s16, d27s16, d25s16, q0s16);
+                              d27s16, d25s16, q0s16);
      d2u16 = vqrshrun_n_s32(q1s32, 7);
      d3u16 = vqrshrun_n_s32(q2s32, 7);
--- a/aom_dsp/arm/aom_convolve8_avg_neon_asm.asm
+++ b/aom_dsp/arm/aom_convolve8_avg_neon_asm.asm
@@ -1,11 +1,14 @@
 ;
-;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
 ;
-;  Use of this source code is governed by a BSD-style license
+; This source code is subject to the terms of the BSD 2 Clause License and
-;  that can be found in the LICENSE file in the root of the source
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-;  tree. An additional intellectual property rights grant can be found
+; was not distributed with this source code in the LICENSE file, you can
-;  in the file PATENTS.  All contributing project authors may
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-;  be found in the AUTHORS file in the root of the source tree.
+; Media Patent License 1.0 was not distributed with this source code in the
 ; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 ;
 ;
@@ -14,11 +17,11 @@
    ; w%4 == 0
    ; h%4 == 0
    ; taps == 8
-    ; VP9_FILTER_WEIGHT == 128
+    ; AV1_FILTER_WEIGHT == 128
-    ; VP9_FILTER_SHIFT == 7
+    ; AV1_FILTER_SHIFT == 7
-    EXPORT  |vpx_convolve8_avg_horiz_neon|
+    EXPORT  |aom_convolve8_avg_horiz_neon|
-    EXPORT  |vpx_convolve8_avg_vert_neon|
+    EXPORT  |aom_convolve8_avg_vert_neon|
    ARM
    REQUIRE8
    PRESERVE8
@@ -49,7 +52,7 @@
 ; sp[]int w
 ; sp[]int h
-|vpx_convolve8_avg_horiz_neon| PROC
+|aom_convolve8_avg_horiz_neon| PROC
    push            {r4-r10, lr}
    sub             r0, r0, #3              ; adjust for taps
@@ -72,7 +75,7 @@
    mov             r10, r6                 ; w loop counter
-vpx_convolve8_avg_loop_horiz_v
+aom_convolve8_avg_loop_horiz_v
    vld1.8          {d24}, [r0], r1
    vld1.8          {d25}, [r0], r1
    vld1.8          {d26}, [r0], r1
@@ -95,7 +98,7 @@ vpx_convolve8_avg_loop_horiz_v
    add             r0, r0, #3
-vpx_convolve8_avg_loop_horiz
+aom_convolve8_avg_loop_horiz
    add             r5, r0, #64
    vld1.32         {d28[]}, [r0], r1
@@ -164,20 +167,20 @@ vpx_convolve8_avg_loop_horiz
    vmov            q9,  q13
    subs            r6, r6, #4              ; w -= 4
-    bgt             vpx_convolve8_avg_loop_horiz
+    bgt             aom_convolve8_avg_loop_horiz
    ; outer loop
    mov             r6, r10                 ; restore w counter
    add             r0, r0, r9              ; src += src_stride * 4 - w
    add             r2, r2, r12             ; dst += dst_stride * 4 - w
    subs            r7, r7, #4              ; h -= 4
-    bgt vpx_convolve8_avg_loop_horiz_v
+    bgt aom_convolve8_avg_loop_horiz_v
    pop             {r4-r10, pc}
    ENDP
-|vpx_convolve8_avg_vert_neon| PROC
+|aom_convolve8_avg_vert_neon| PROC
    push            {r4-r8, lr}
    ; adjust for taps
@@ -193,7 +196,7 @@ vpx_convolve8_avg_loop_horiz
    lsl             r1, r1, #1
    lsl             r3, r3, #1
-vpx_convolve8_avg_loop_vert_h
+aom_convolve8_avg_loop_vert_h
    mov             r4, r0
    add             r7, r0, r1, asr #1
    mov             r5, r2
@@ -213,7 +216,7 @@ vpx_convolve8_avg_loop_vert_h
    vmovl.u8        q10, d20
    vmovl.u8        q11, d22
-vpx_convolve8_avg_loop_vert
+aom_convolve8_avg_loop_vert
    ; always process a 4x4 block at a time
    vld1.u32        {d24[0]}, [r7], r1
    vld1.u32        {d26[0]}, [r4], r1
@@ -278,13 +281,13 @@ vpx_convolve8_avg_loop_vert
    vmov            d22, d25
    subs            r12, r12, #4            ; h -= 4
-    bgt             vpx_convolve8_avg_loop_vert
+    bgt             aom_convolve8_avg_loop_vert
    ; outer loop
    add             r0, r0, #4
    add             r2, r2, #4
    subs            r6, r6, #4              ; w -= 4
-    bgt             vpx_convolve8_avg_loop_vert_h
+    bgt             aom_convolve8_avg_loop_vert_h
    pop             {r4-r8, pc}
--- a/aom_dsp/arm/aom_convolve8_neon.c
+++ b/aom_dsp/arm/aom_convolve8_neon.c
@@ -1,31 +1,27 @@
 /*
- *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
- *  Use of this source code is governed by a BSD-style license
+ * This source code is subject to the terms of the BSD 2 Clause License and
- *  that can be found in the LICENSE file in the root of the source
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- *  tree. An additional intellectual property rights grant can be found
+ * was not distributed with this source code in the LICENSE file, you can
- *  in the file PATENTS.  All contributing project authors may
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- *  be found in the AUTHORS file in the root of the source tree.
+ * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
 #include <arm_neon.h>
 #include <assert.h>
-#include "./vpx_config.h"
+#include "./aom_config.h"
-#include "./vpx_dsp_rtcd.h"
+#include "./aom_dsp_rtcd.h"
-#include "vpx/vpx_integer.h"
+#include "aom/aom_integer.h"
-#include "vpx_ports/mem.h"
+#include "aom_ports/mem.h"
-static INLINE int32x4_t MULTIPLY_BY_Q0(
+static INLINE int32x4_t MULTIPLY_BY_Q0(int16x4_t dsrc0, int16x4_t dsrc1,
-    int16x4_t dsrc0,
+                                       int16x4_t dsrc2, int16x4_t dsrc3,
-    int16x4_t dsrc1,
+                                       int16x4_t dsrc4, int16x4_t dsrc5,
-    int16x4_t dsrc2,
+                                       int16x4_t dsrc6, int16x4_t dsrc7,
-    int16x4_t dsrc3,
+                                       int16x8_t q0s16) {
    int16x4_t dsrc4,
    int16x4_t dsrc5,
    int16x4_t dsrc6,
    int16x4_t dsrc7,
    int16x8_t q0s16) {
  int32x4_t qdst;
  int16x4_t d0s16, d1s16;
@@ -43,17 +39,12 @@ static INLINE int32x4_t MULTIPLY_BY_Q0(
  return qdst;
 }
-void vpx_convolve8_horiz_neon(
+void aom_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
-    const uint8_t *src,
+                              uint8_t *dst, ptrdiff_t dst_stride,
-    ptrdiff_t src_stride,
+                              const int16_t *filter_x, int x_step_q4,
-    uint8_t *dst,
+                              const int16_t *filter_y,  // unused
-    ptrdiff_t dst_stride,
+                              int y_step_q4,            // unused
-    const int16_t *filter_x,
+                              int w, int h) {
    int x_step_q4,
    const int16_t *filter_y,  // unused
    int y_step_q4,            // unused
    int w,
    int h) {
  int width;
  const uint8_t *s, *psrc;
  uint8_t *d, *pdst;
@@ -74,12 +65,15 @@ void vpx_convolve8_horiz_neon(
  assert(x_step_q4 == 16);
  (void)x_step_q4;
  (void)y_step_q4;
  (void)filter_y;
  q0s16 = vld1q_s16(filter_x);
  src -= 3;  // adjust for taps
-  for (; h > 0; h -= 4,
+  for (; h > 0; h -= 4, src += src_stride * 4,
-    src += src_stride * 4,
+                dst += dst_stride * 4) {  // loop_horiz_v
    dst += dst_stride * 4) {  // loop_horiz_v
    s = src;
    d24u8 = vld1_u8(s);
    s += src_stride;
@@ -92,8 +86,8 @@ void vpx_convolve8_horiz_neon(
    q12u8 = vcombine_u8(d24u8, d25u8);
    q13u8 = vcombine_u8(d26u8, d27u8);
-    q0x2u16 = vtrnq_u16(vreinterpretq_u16_u8(q12u8),
+    q0x2u16 =
-                        vreinterpretq_u16_u8(q13u8));
+        vtrnq_u16(vreinterpretq_u16_u8(q12u8), vreinterpretq_u16_u8(q13u8));
    d24u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[0]));
    d25u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[0]));
    d26u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[1]));
@@ -105,8 +99,8 @@ void vpx_convolve8_horiz_neon(
    __builtin_prefetch(src + src_stride * 5);
    __builtin_prefetch(src + src_stride * 6);
-    q8u16  = vmovl_u8(d0x2u8.val[0]);
+    q8u16 = vmovl_u8(d0x2u8.val[0]);
-    q9u16  = vmovl_u8(d0x2u8.val[1]);
+    q9u16 = vmovl_u8(d0x2u8.val[1]);
    q10u16 = vmovl_u8(d1x2u8.val[0]);
    q11u16 = vmovl_u8(d1x2u8.val[1]);
@@ -119,8 +113,7 @@ void vpx_convolve8_horiz_neon(
    d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
    d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));  // vmov 23 21
-    for (width = w, psrc = src + 7, pdst = dst;
+    for (width = w, psrc = src + 7, pdst = dst; width > 0;
         width > 0;
         width -= 4, psrc += 4, pdst += 4) {  // loop_horiz
      s = psrc;
      d28u32 = vld1_dup_u32((const uint32_t *)s);
@@ -133,10 +126,10 @@ void vpx_convolve8_horiz_neon(
      __builtin_prefetch(psrc + 64);
-      d0x2u16 = vtrn_u16(vreinterpret_u16_u32(d28u32),
+      d0x2u16 =
-                         vreinterpret_u16_u32(d31u32));
+          vtrn_u16(vreinterpret_u16_u32(d28u32), vreinterpret_u16_u32(d31u32));
-      d1x2u16 = vtrn_u16(vreinterpret_u16_u32(d29u32),
+      d1x2u16 =
-                         vreinterpret_u16_u32(d30u32));
+          vtrn_u16(vreinterpret_u16_u32(d29u32), vreinterpret_u16_u32(d30u32));
      d0x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[0]),   // d28
                       vreinterpret_u8_u16(d1x2u16.val[0]));  // d29
      d1x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[1]),   // d31
@@ -146,8 +139,8 @@ void vpx_convolve8_horiz_neon(
      q14u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]);
      q15u8 = vcombine_u8(d1x2u8.val[1], d1x2u8.val[0]);
-      q0x2u32 = vtrnq_u32(vreinterpretq_u32_u8(q14u8),
+      q0x2u32 =
-                          vreinterpretq_u32_u8(q15u8));
+          vtrnq_u32(vreinterpretq_u32_u8(q14u8), vreinterpretq_u32_u8(q15u8));
      d28u8 = vreinterpret_u8_u32(vget_low_u32(q0x2u32.val[0]));
      d29u8 = vreinterpret_u8_u32(vget_high_u32(q0x2u32.val[0]));
@@ -166,14 +159,14 @@ void vpx_convolve8_horiz_neon(
      d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
      d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
-      q1s32  = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16,
+      q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16, d18s16, d19s16,
-                              d18s16, d19s16, d23s16, d24s16, q0s16);
+                             d23s16, d24s16, q0s16);
-      q2s32  = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16,
+      q2s32 = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16, d19s16, d23s16,
-                              d19s16, d23s16, d24s16, d26s16, q0s16);
+                             d24s16, d26s16, q0s16);
-      q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16,
+      q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16, d23s16, d24s16,
-                              d23s16, d24s16, d26s16, d27s16, q0s16);
+                              d26s16, d27s16, q0s16);
-      q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16,
+      q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16, d24s16, d26s16,
-                              d24s16, d26s16, d27s16, d25s16, q0s16);
+                              d27s16, d25s16, q0s16);
      __builtin_prefetch(psrc + 60 + src_stride * 3);
@@ -188,8 +181,7 @@ void vpx_convolve8_horiz_neon(
      d2u8 = vqmovn_u16(q1u16);
      d3u8 = vqmovn_u16(q2u16);
-      d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8),
+      d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8), vreinterpret_u16_u8(d3u8));
                         vreinterpret_u16_u8(d3u8));
      d0x2u32 = vtrn_u32(vreinterpret_u32_u16(d0x2u16.val[0]),
                         vreinterpret_u32_u16(d0x2u16.val[1]));
      d0x2u8 = vtrn_u8(vreinterpret_u8_u32(d0x2u32.val[0]),
@@ -217,17 +209,12 @@ void vpx_convolve8_horiz_neon(
  return;
 }
-void vpx_convolve8_vert_neon(
+void aom_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
-    const uint8_t *src,
+                             uint8_t *dst, ptrdiff_t dst_stride,
-    ptrdiff_t src_stride,
+                             const int16_t *filter_x,  // unused
-    uint8_t *dst,
+                             int x_step_q4,            // unused
-    ptrdiff_t dst_stride,
+                             const int16_t *filter_y, int y_step_q4, int w,
-    const int16_t *filter_x,  // unused
+                             int h) {
    int x_step_q4,            // unused
    const int16_t *filter_y,
    int y_step_q4,
    int w,
    int h) {
  int height;
  const uint8_t *s;
  uint8_t *d;
@@ -242,6 +229,10 @@ void vpx_convolve8_vert_neon(
  assert(y_step_q4 == 16);
  (void)x_step_q4;
  (void)y_step_q4;
  (void)filter_x;
  src -= src_stride * 3;
  q0s16 = vld1q_s16(filter_y);
  for (; w > 0; w -= 4, src += 4, dst += 4) {  // loop_vert_h
@@ -261,8 +252,8 @@ void vpx_convolve8_vert_neon(
    d22u32 = vld1_lane_u32((const uint32_t *)s, d22u32, 0);
    s += src_stride;
-    q8u16  = vmovl_u8(vreinterpret_u8_u32(d16u32));
+    q8u16 = vmovl_u8(vreinterpret_u8_u32(d16u32));
-    q9u16  = vmovl_u8(vreinterpret_u8_u32(d18u32));
+    q9u16 = vmovl_u8(vreinterpret_u8_u32(d18u32));
    q10u16 = vmovl_u8(vreinterpret_u8_u32(d20u32));
    q11u16 = vmovl_u8(vreinterpret_u8_u32(d22u32));
@@ -294,20 +285,20 @@ void vpx_convolve8_vert_neon(
      __builtin_prefetch(d);
      __builtin_prefetch(d + dst_stride);
-      q1s32  = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16,
+      q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16, d20s16, d21s16,
-                              d20s16, d21s16, d22s16, d24s16, q0s16);
+                             d22s16, d24s16, q0s16);
      __builtin_prefetch(d + dst_stride * 2);
      __builtin_prefetch(d + dst_stride * 3);
-      q2s32  = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16,
+      q2s32 = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16, d21s16, d22s16,
-                              d21s16, d22s16, d24s16, d26s16, q0s16);
+                             d24s16, d26s16, q0s16);
      __builtin_prefetch(s);
      __builtin_prefetch(s + src_stride);
-      q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16,
+      q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16, d22s16, d24s16,
-                              d22s16, d24s16, d26s16, d27s16, q0s16);
+                              d26s16, d27s16, q0s16);
      __builtin_prefetch(s + src_stride * 2);
      __builtin_prefetch(s + src_stride * 3);
-      q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16,
+      q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16, d24s16, d26s16,
-                              d24s16, d26s16, d27s16, d25s16, q0s16);
+                              d27s16, d25s16, q0s16);
      d2u16 = vqrshrun_n_s32(q1s32, 7);
      d3u16 = vqrshrun_n_s32(q2s32, 7);
--- a/aom_dsp/arm/aom_convolve8_neon_asm.asm
+++ b/aom_dsp/arm/aom_convolve8_neon_asm.asm
@@ -1,11 +1,14 @@
 ;
-;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
 ;
-;  Use of this source code is governed by a BSD-style license
+; This source code is subject to the terms of the BSD 2 Clause License and
-;  that can be found in the LICENSE file in the root of the source
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-;  tree. An additional intellectual property rights grant can be found
+; was not distributed with this source code in the LICENSE file, you can
-;  in the file PATENTS.  All contributing project authors may
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-;  be found in the AUTHORS file in the root of the source tree.
+; Media Patent License 1.0 was not distributed with this source code in the
 ; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 ;
 ;
@@ -14,11 +17,11 @@
    ; w%4 == 0
    ; h%4 == 0
    ; taps == 8
-    ; VP9_FILTER_WEIGHT == 128
+    ; AV1_FILTER_WEIGHT == 128
-    ; VP9_FILTER_SHIFT == 7
+    ; AV1_FILTER_SHIFT == 7
-    EXPORT  |vpx_convolve8_horiz_neon|
+    EXPORT  |aom_convolve8_horiz_neon|
-    EXPORT  |vpx_convolve8_vert_neon|
+    EXPORT  |aom_convolve8_vert_neon|
    ARM
    REQUIRE8
    PRESERVE8
@@ -49,7 +52,7 @@
 ; sp[]int w
 ; sp[]int h
-|vpx_convolve8_horiz_neon| PROC
+|aom_convolve8_horiz_neon| PROC
    push            {r4-r10, lr}
    sub             r0, r0, #3              ; adjust for taps
@@ -72,7 +75,7 @@
    mov             r10, r6                 ; w loop counter
-vpx_convolve8_loop_horiz_v
+aom_convolve8_loop_horiz_v
    vld1.8          {d24}, [r0], r1
    vld1.8          {d25}, [r0], r1
    vld1.8          {d26}, [r0], r1
@@ -95,7 +98,7 @@ vpx_convolve8_loop_horiz_v
    add             r0, r0, #3
-vpx_convolve8_loop_horiz
+aom_convolve8_loop_horiz
    add             r5, r0, #64
    vld1.32         {d28[]}, [r0], r1
@@ -153,20 +156,20 @@ vpx_convolve8_loop_horiz
    vmov            q9,  q13
    subs            r6, r6, #4              ; w -= 4
-    bgt             vpx_convolve8_loop_horiz
+    bgt             aom_convolve8_loop_horiz
    ; outer loop
    mov             r6, r10                 ; restore w counter
    add             r0, r0, r9              ; src += src_stride * 4 - w
    add             r2, r2, r12             ; dst += dst_stride * 4 - w
    subs            r7, r7, #4              ; h -= 4
-    bgt vpx_convolve8_loop_horiz_v
+    bgt aom_convolve8_loop_horiz_v
    pop             {r4-r10, pc}
    ENDP
-|vpx_convolve8_vert_neon| PROC
+|aom_convolve8_vert_neon| PROC
    push            {r4-r8, lr}
    ; adjust for taps
@@ -182,7 +185,7 @@ vpx_convolve8_loop_horiz
    lsl             r1, r1, #1
    lsl             r3, r3, #1
-vpx_convolve8_loop_vert_h
+aom_convolve8_loop_vert_h
    mov             r4, r0
    add             r7, r0, r1, asr #1
    mov             r5, r2
@@ -202,7 +205,7 @@ vpx_convolve8_loop_vert_h
    vmovl.u8        q10, d20
    vmovl.u8        q11, d22
-vpx_convolve8_loop_vert
+aom_convolve8_loop_vert
    ; always process a 4x4 block at a time
    vld1.u32        {d24[0]}, [r7], r1
    vld1.u32        {d26[0]}, [r4], r1
@@ -256,13 +259,13 @@ vpx_convolve8_loop_vert
    vmov            d22, d25
    subs            r12, r12, #4            ; h -= 4
-    bgt             vpx_convolve8_loop_vert
+    bgt             aom_convolve8_loop_vert
    ; outer loop
    add             r0, r0, #4
    add             r2, r2, #4
    subs            r6, r6, #4              ; w -= 4
-    bgt             vpx_convolve8_loop_vert_h
+    bgt             aom_convolve8_loop_vert_h
    pop             {r4-r8, pc}
--- a/aom_dsp/arm/aom_convolve_avg_neon.c
+++ b/aom_dsp/arm/aom_convolve_avg_neon.c
@@ -1,46 +1,45 @@
 /*
- *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
- *  Use of this source code is governed by a BSD-style license
+ * This source code is subject to the terms of the BSD 2 Clause License and
- *  that can be found in the LICENSE file in the root of the source
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- *  tree. An additional intellectual property rights grant can be found
+ * was not distributed with this source code in the LICENSE file, you can
- *  in the file PATENTS.  All contributing project authors may
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- *  be found in the AUTHORS file in the root of the source tree.
+ * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
 #include <arm_neon.h>
-#include "./vpx_dsp_rtcd.h"
+#include "./aom_dsp_rtcd.h"
-#include "vpx/vpx_integer.h"
+#include "aom/aom_integer.h"
-void vpx_convolve_avg_neon(
+void aom_convolve_avg_neon(const uint8_t *src,    // r0
-    const uint8_t *src,    // r0
+                           ptrdiff_t src_stride,  // r1
-    ptrdiff_t src_stride,  // r1
+                           uint8_t *dst,          // r2
-    uint8_t *dst,          // r2
+                           ptrdiff_t dst_stride,  // r3
-    ptrdiff_t dst_stride,  // r3
+                           const int16_t *filter_x, int filter_x_stride,
-    const int16_t *filter_x,
+                           const int16_t *filter_y, int filter_y_stride, int w,
-    int filter_x_stride,
+                           int h) {
    const int16_t *filter_y,
    int filter_y_stride,
    int w,
    int h) {
  uint8_t *d;
  uint8x8_t d0u8, d1u8, d2u8, d3u8;
  uint32x2_t d0u32, d2u32;
  uint8x16_t q0u8, q1u8, q2u8, q3u8, q8u8, q9u8, q10u8, q11u8;
-  (void)filter_x;  (void)filter_x_stride;
+  (void)filter_x;
-  (void)filter_y;  (void)filter_y_stride;
+  (void)filter_x_stride;
  (void)filter_y;
  (void)filter_y_stride;
  d = dst;
  if (w > 32) {  // avg64
    for (; h > 0; h -= 1) {
-      q0u8  = vld1q_u8(src);
+      q0u8 = vld1q_u8(src);
-      q1u8  = vld1q_u8(src + 16);
+      q1u8 = vld1q_u8(src + 16);
-      q2u8  = vld1q_u8(src + 32);
+      q2u8 = vld1q_u8(src + 32);
-      q3u8  = vld1q_u8(src + 48);
+      q3u8 = vld1q_u8(src + 48);
      src += src_stride;
-      q8u8  = vld1q_u8(d);
+      q8u8 = vld1q_u8(d);
-      q9u8  = vld1q_u8(d + 16);
+      q9u8 = vld1q_u8(d + 16);
      q10u8 = vld1q_u8(d + 32);
      q11u8 = vld1q_u8(d + 48);
      d += dst_stride;
@@ -133,8 +132,7 @@ void vpx_convolve_avg_neon(
      d2u32 = vld1_lane_u32((const uint32_t *)d, d2u32, 1);
      d += dst_stride;
-      d0u8 = vrhadd_u8(vreinterpret_u8_u32(d0u32),
+      d0u8 = vrhadd_u8(vreinterpret_u8_u32(d0u32), vreinterpret_u8_u32(d2u32));
                       vreinterpret_u8_u32(d2u32));
      d0u32 = vreinterpret_u32_u8(d0u8);
      vst1_lane_u32((uint32_t *)dst, d0u32, 0);
--- a/aom_dsp/arm/aom_convolve_avg_neon_asm.asm
+++ b/aom_dsp/arm/aom_convolve_avg_neon_asm.asm
@@ -1,21 +1,24 @@
 ;
-;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
 ;
-;  Use of this source code is governed by a BSD-style license
+; This source code is subject to the terms of the BSD 2 Clause License and
-;  that can be found in the LICENSE file in the root of the source
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-;  tree. An additional intellectual property rights grant can be found
+; was not distributed with this source code in the LICENSE file, you can
-;  in the file PATENTS.  All contributing project authors may
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-;  be found in the AUTHORS file in the root of the source tree.
+; Media Patent License 1.0 was not distributed with this source code in the
 ; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 ;
-    EXPORT  |vpx_convolve_avg_neon|
+;
    EXPORT  |aom_convolve_avg_neon|
    ARM
    REQUIRE8
    PRESERVE8
    AREA ||.text||, CODE, READONLY, ALIGN=2
-|vpx_convolve_avg_neon| PROC
+|aom_convolve_avg_neon| PROC
    push                {r4-r6, lr}
    ldrd                r4, r5, [sp, #32]
    mov                 r6, r2
--- a/aom_dsp/arm/aom_convolve_copy_neon.c
+++ b/aom_dsp/arm/aom_convolve_copy_neon.c
@@ -1,33 +1,32 @@
 /*
- *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
- *  Use of this source code is governed by a BSD-style license
+ * This source code is subject to the terms of the BSD 2 Clause License and
- *  that can be found in the LICENSE file in the root of the source
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- *  tree. An additional intellectual property rights grant can be found
+ * was not distributed with this source code in the LICENSE file, you can
- *  in the file PATENTS.  All contributing project authors may
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- *  be found in the AUTHORS file in the root of the source tree.
+ * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
 #include <arm_neon.h>
-#include "./vpx_dsp_rtcd.h"
+#include "./aom_dsp_rtcd.h"
-#include "vpx/vpx_integer.h"
+#include "aom/aom_integer.h"
-void vpx_convolve_copy_neon(
+void aom_convolve_copy_neon(const uint8_t *src,    // r0
-    const uint8_t *src,    // r0
+                            ptrdiff_t src_stride,  // r1
-    ptrdiff_t src_stride,  // r1
+                            uint8_t *dst,          // r2
-    uint8_t *dst,          // r2
+                            ptrdiff_t dst_stride,  // r3
-    ptrdiff_t dst_stride,  // r3
+                            const int16_t *filter_x, int filter_x_stride,
-    const int16_t *filter_x,
+                            const int16_t *filter_y, int filter_y_stride, int w,
-    int filter_x_stride,
+                            int h) {
    const int16_t *filter_y,
    int filter_y_stride,
    int w,
    int h) {
  uint8x8_t d0u8, d2u8;
  uint8x16_t q0u8, q1u8, q2u8, q3u8;
-  (void)filter_x;  (void)filter_x_stride;
+  (void)filter_x;
-  (void)filter_y;  (void)filter_y_stride;
+  (void)filter_x_stride;
  (void)filter_y;
  (void)filter_y_stride;
  if (w > 32) {  // copy64
    for (; h > 0; h--) {
--- a/aom_dsp/arm/aom_convolve_copy_neon_asm.asm
+++ b/aom_dsp/arm/aom_convolve_copy_neon_asm.asm
@@ -1,21 +1,24 @@
 ;
-;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
 ;
-;  Use of this source code is governed by a BSD-style license
+; This source code is subject to the terms of the BSD 2 Clause License and
-;  that can be found in the LICENSE file in the root of the source
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-;  tree. An additional intellectual property rights grant can be found
+; was not distributed with this source code in the LICENSE file, you can
-;  in the file PATENTS.  All contributing project authors may
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-;  be found in the AUTHORS file in the root of the source tree.
+; Media Patent License 1.0 was not distributed with this source code in the
 ; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 ;
-    EXPORT  |vpx_convolve_copy_neon|
+;
    EXPORT  |aom_convolve_copy_neon|
    ARM
    REQUIRE8
    PRESERVE8
    AREA ||.text||, CODE, READONLY, ALIGN=2
-|vpx_convolve_copy_neon| PROC
+|aom_convolve_copy_neon| PROC
    push                {r4-r5, lr}
    ldrd                r4, r5, [sp, #28]
--- a/aom_dsp/arm/aom_convolve_neon.c
+++ b/aom_dsp/arm/aom_convolve_neon.c
@@ -0,0 +1,66 @@
 /*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
 #include <assert.h>
 #include "./aom_dsp_rtcd.h"
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_ports/mem.h"
 /*
 * 2-D 8-tap convolution built from two 1-D NEON passes: a horizontal pass
 * into a stack buffer (row stride 64), followed by a vertical pass from that
 * buffer into dst. Only unscaled filtering is handled: both step arguments
 * are asserted to be 16. The full argument list is forwarded to each pass.
 * NOTE(review): w <= 64 and h <= 64 are assumed by the buffer size but are
 * not checked here -- confirm that callers guarantee this.
 */
 void aom_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                        ptrdiff_t dst_stride, const int16_t *filter_x,
                        int x_step_q4, const int16_t *filter_y, int y_step_q4,
                        int w, int h) {
  /* Given our constraints: w <= 64, h <= 64, taps == 8 we can reduce the
   * maximum buffer size to 64 * (64 + 7) bytes: 64 columns by h + 7 rows
   * (+ 1 row to make the row count divisible by 4), i.e. 64 * 72.
   */
  DECLARE_ALIGNED(8, uint8_t, temp[64 * 72]);
  // Account for the vertical phase needing 3 lines prior and 4 lines post
  int intermediate_height = h + 7;
  assert(y_step_q4 == 16);
  assert(x_step_q4 == 16);
  /* Filter starting 3 lines back. The neon implementation will ignore the
   * given height and filter a multiple of 4 lines. Since this goes in to
   * the temp buffer which has lots of extra room and is subsequently discarded
   * this is safe if somewhat less than ideal.
   */
  aom_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, 64, filter_x,
                           x_step_q4, filter_y, y_step_q4, w,
                           intermediate_height);
  /* Step into the temp buffer 3 lines to get the actual frame data */
  aom_convolve8_vert_neon(temp + 64 * 3, 64, dst, dst_stride, filter_x,
                          x_step_q4, filter_y, y_step_q4, w, h);
 }
 /*
 * Averaging variant of the 2-D 8-tap convolution: a horizontal NEON pass into
 * a stack buffer (row stride 64), followed by aom_convolve8_avg_vert_neon
 * from that buffer into dst. Both step arguments are asserted to be 16.
 * NOTE(review): w <= 64 and h <= 64 are assumed by the buffer size but are
 * not checked here -- confirm that callers guarantee this.
 */
 void aom_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride,
                            const int16_t *filter_x, int x_step_q4,
                            const int16_t *filter_y, int y_step_q4, int w,
                            int h) {
  // Intermediate buffer for the horizontal pass output (64 bytes per row).
  DECLARE_ALIGNED(8, uint8_t, temp[64 * 72]);
  // h output rows plus the 7 extra rows requested from the horizontal pass,
  // which starts 3 rows above src.
  int intermediate_height = h + 7;
  assert(y_step_q4 == 16);
  assert(x_step_q4 == 16);
  /* This implementation has the same issues as aom_convolve8_neon: the
   * horizontal pass may filter more rows than requested into the temp buffer.
   * In addition, we only want to average the values after both passes, so
   * the averaging is done only by the second (vertical) call.
   */
  aom_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, 64, filter_x,
                           x_step_q4, filter_y, y_step_q4, w,
                           intermediate_height);
  // Skip the 3 leading rows of temp so the vertical pass starts on frame data.
  aom_convolve8_avg_vert_neon(temp + 64 * 3, 64, dst, dst_stride, filter_x,
                              x_step_q4, filter_y, y_step_q4, w, h);
 }
--- a/aom_dsp/arm/avg_neon.c
+++ b/aom_dsp/arm/avg_neon.c
@@ -1,20 +1,21 @@
 /*
- *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
- *  Use of this source code is governed by a BSD-style license
+ * This source code is subject to the terms of the BSD 2 Clause License and
- *  that can be found in the LICENSE file in the root of the source
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- *  tree. An additional intellectual property rights grant can be found
+ * was not distributed with this source code in the LICENSE file, you can
- *  in the file PATENTS.  All contributing project authors may
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- *  be found in the AUTHORS file in the root of the source tree.
+ * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
 #include <arm_neon.h>
 #include <assert.h>
-#include "./vpx_dsp_rtcd.h"
+#include "./aom_dsp_rtcd.h"
-#include "./vpx_config.h"
+#include "./aom_config.h"
-#include "vpx/vpx_integer.h"
+#include "aom/aom_integer.h"
 static INLINE unsigned int horizontal_add_u16x8(const uint16x8_t v_16x8) {
  const uint32x4_t a = vpaddlq_u16(v_16x8);
@@ -24,7 +25,7 @@ static INLINE unsigned int horizontal_add_u16x8(const uint16x8_t v_16x8) {
  return vget_lane_u32(c, 0);
 }
-unsigned int vpx_avg_4x4_neon(const uint8_t *s, int p) {
+unsigned int aom_avg_4x4_neon(const uint8_t *s, int p) {
  uint16x8_t v_sum;
  uint32x2_t v_s0 = vdup_n_u32(0);
  uint32x2_t v_s1 = vdup_n_u32(0);
@@ -36,7 +37,7 @@ unsigned int vpx_avg_4x4_neon(const uint8_t *s, int p) {
  return (horizontal_add_u16x8(v_sum) + 8) >> 4;
 }
-unsigned int vpx_avg_8x8_neon(const uint8_t *s, int p) {
+unsigned int aom_avg_8x8_neon(const uint8_t *s, int p) {
  uint8x8_t v_s0 = vld1_u8(s);
  const uint8x8_t v_s1 = vld1_u8(s + p);
  uint16x8_t v_sum = vaddl_u8(v_s0, v_s1);
@@ -64,7 +65,7 @@ unsigned int vpx_avg_8x8_neon(const uint8_t *s, int p) {
 // coeff: 16 bits, dynamic range [-32640, 32640].
 // length: value range {16, 64, 256, 1024}.
-int vpx_satd_neon(const int16_t *coeff, int length) {
+int aom_satd_neon(const int16_t *coeff, int length) {
  const int16x4_t zero = vdup_n_s16(0);
  int32x4_t accum = vdupq_n_s32(0);
@@ -89,7 +90,7 @@ int vpx_satd_neon(const int16_t *coeff, int length) {
  }
 }
-void vpx_int_pro_row_neon(int16_t hbuf[16], uint8_t const *ref,
+void aom_int_pro_row_neon(int16_t hbuf[16], uint8_t const *ref,
                          const int ref_stride, const int height) {
  int i;
  uint16x8_t vec_sum_lo = vdupq_n_u16(0);
@@ -142,7 +143,7 @@ void vpx_int_pro_row_neon(int16_t hbuf[16], uint8_t const *ref,
  vst1q_s16(hbuf, vreinterpretq_s16_u16(vec_sum_hi));
 }
-int16_t vpx_int_pro_col_neon(uint8_t const *ref, const int width) {
+int16_t aom_int_pro_col_neon(uint8_t const *ref, const int width) {
  int i;
  uint16x8_t vec_sum = vdupq_n_u16(0);
@@ -158,7 +159,7 @@ int16_t vpx_int_pro_col_neon(uint8_t const *ref, const int width) {
 // ref, src = [0, 510] - max diff = 16-bits
 // bwl = {2, 3, 4}, width = {16, 32, 64}
-int vpx_vector_var_neon(int16_t const *ref, int16_t const *src, const int bwl) {
+int aom_vector_var_neon(int16_t const *ref, int16_t const *src, const int bwl) {
  int width = 4 << bwl;
  int32x4_t sse = vdupq_n_s32(0);
  int16x8_t total = vdupq_n_s16(0);
@@ -198,27 +199,24 @@ int vpx_vector_var_neon(int16_t const *ref, int16_t const *src, const int bwl) {
  }
 }
-void vpx_minmax_8x8_neon(const uint8_t *a, int a_stride,
+void aom_minmax_8x8_neon(const uint8_t *a, int a_stride, const uint8_t *b,
-                         const uint8_t *b, int b_stride,
+                         int b_stride, int *min, int *max) {
                         int *min, int *max) {
  // Load and concatenate.
-  const uint8x16_t a01 = vcombine_u8(vld1_u8(a),
+  const uint8x16_t a01 = vcombine_u8(vld1_u8(a), vld1_u8(a + a_stride));
-                                     vld1_u8(a + a_stride));
+  const uint8x16_t a23 =
-  const uint8x16_t a23 = vcombine_u8(vld1_u8(a + 2 * a_stride),
+      vcombine_u8(vld1_u8(a + 2 * a_stride), vld1_u8(a + 3 * a_stride));
-                                     vld1_u8(a + 3 * a_stride));
+  const uint8x16_t a45 =
-  const uint8x16_t a45 = vcombine_u8(vld1_u8(a + 4 * a_stride),
+      vcombine_u8(vld1_u8(a + 4 * a_stride), vld1_u8(a + 5 * a_stride));
-                                     vld1_u8(a + 5 * a_stride));
+  const uint8x16_t a67 =
-  const uint8x16_t a67 = vcombine_u8(vld1_u8(a + 6 * a_stride),
+      vcombine_u8(vld1_u8(a + 6 * a_stride), vld1_u8(a + 7 * a_stride));
                                     vld1_u8(a + 7 * a_stride));
-  const uint8x16_t b01 = vcombine_u8(vld1_u8(b),
+  const uint8x16_t b01 = vcombine_u8(vld1_u8(b), vld1_u8(b + b_stride));
-                                     vld1_u8(b + b_stride));
+  const uint8x16_t b23 =
-  const uint8x16_t b23 = vcombine_u8(vld1_u8(b + 2 * b_stride),
+      vcombine_u8(vld1_u8(b + 2 * b_stride), vld1_u8(b + 3 * b_stride));
-                                     vld1_u8(b + 3 * b_stride));
+  const uint8x16_t b45 =
-  const uint8x16_t b45 = vcombine_u8(vld1_u8(b + 4 * b_stride),
+      vcombine_u8(vld1_u8(b + 4 * b_stride), vld1_u8(b + 5 * b_stride));
-                                     vld1_u8(b + 5 * b_stride));
+  const uint8x16_t b67 =
-  const uint8x16_t b67 = vcombine_u8(vld1_u8(b + 6 * b_stride),
+      vcombine_u8(vld1_u8(b + 6 * b_stride), vld1_u8(b + 7 * b_stride));
                                     vld1_u8(b + 7 * b_stride));
  // Absolute difference.
  const uint8x16_t ab01_diff = vabdq_u8(a01, b01);
--- a/aom_dsp/arm/bilinear_filter_media.asm
+++ b/aom_dsp/arm/bilinear_filter_media.asm
@@ -1,16 +1,19 @@
 ;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
 ;
-;  Use of this source code is governed by a BSD-style license
+; This source code is subject to the terms of the BSD 2 Clause License and
-;  that can be found in the LICENSE file in the root of the source
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-;  tree. An additional intellectual property rights grant can be found
+; was not distributed with this source code in the LICENSE file, you can
-;  in the file PATENTS.  All contributing project authors may
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-;  be found in the AUTHORS file in the root of the source tree.
+; Media Patent License 1.0 was not distributed with this source code in the
 ; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 ;
 ;
-    EXPORT  |vpx_filter_block2d_bil_first_pass_media|
+    EXPORT  |aom_filter_block2d_bil_first_pass_media|
-    EXPORT  |vpx_filter_block2d_bil_second_pass_media|
+    EXPORT  |aom_filter_block2d_bil_second_pass_media|
    AREA    |.text|, CODE, READONLY  ; name this block of code
@@ -20,13 +23,13 @@
 ; r2    unsigned int    src_pitch,
 ; r3    unsigned int    height,
 ; stack unsigned int    width,
-; stack const short    *vpx_filter
+; stack const short    *aom_filter
 ;-------------------------------------
 ; The output is stored transposed in the output array to make second-pass filtering easier.
-|vpx_filter_block2d_bil_first_pass_media| PROC
+|aom_filter_block2d_bil_first_pass_media| PROC
    stmdb   sp!, {r4 - r11, lr}
-    ldr     r11, [sp, #40]                  ; vpx_filter address
+    ldr     r11, [sp, #40]                  ; aom_filter address
    ldr     r4, [sp, #36]                   ; width
    mov     r12, r3                         ; outer-loop counter
@@ -134,7 +137,7 @@
    ldmia   sp!, {r4 - r11, pc}
-    ENDP  ; |vpx_filter_block2d_bil_first_pass_media|
+    ENDP  ; |aom_filter_block2d_bil_first_pass_media|
 ;---------------------------------
@@ -143,12 +146,12 @@
 ; r2    int             dst_pitch,
 ; r3    unsigned int    height,
 ; stack unsigned int    width,
-; stack const short    *vpx_filter
+; stack const short    *aom_filter
 ;---------------------------------
-|vpx_filter_block2d_bil_second_pass_media| PROC
+|aom_filter_block2d_bil_second_pass_media| PROC
    stmdb   sp!, {r4 - r11, lr}
-    ldr     r11, [sp, #40]                  ; vpx_filter address
+    ldr     r11, [sp, #40]                  ; aom_filter address
    ldr     r4, [sp, #36]                   ; width
    ldr     r5, [r11]                       ; load up filter coefficients
@@ -232,6 +235,6 @@
    bne     bil_height_loop_null_2nd
    ldmia   sp!, {r4 - r11, pc}
-    ENDP  ; |vpx_filter_block2d_second_pass_media|
+    ENDP  ; |aom_filter_block2d_bil_second_pass_media|
    END
--- a/aom_dsp/arm/fwd_txfm_neon.c
+++ b/aom_dsp/arm/fwd_txfm_neon.c
@@ -1,19 +1,20 @@
 /*
- *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
- *  Use of this source code is governed by a BSD-style license
+ * This source code is subject to the terms of the BSD 2 Clause License and
- *  that can be found in the LICENSE file in the root of the source
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- *  tree. An additional intellectual property rights grant can be found
+ * was not distributed with this source code in the LICENSE file, you can
- *  in the file PATENTS.  All contributing project authors may
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- *  be found in the AUTHORS file in the root of the source tree.
+ * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
 #include <arm_neon.h>
-#include "./vpx_config.h"
+#include "./aom_config.h"
-#include "vpx_dsp/txfm_common.h"
+#include "aom_dsp/txfm_common.h"
-void vpx_fdct8x8_neon(const int16_t *input, int16_t *final_output, int stride) {
+void aom_fdct8x8_neon(const int16_t *input, int16_t *final_output, int stride) {
  int i;
  // stage 1
  int16x8_t input_0 = vshlq_n_s16(vld1q_s16(&input[0 * stride]), 2);
@@ -52,10 +53,10 @@ void vpx_fdct8x8_neon(const int16_t *input, int16_t *final_output, int stride) {
    v_t2_hi = vmlal_n_s16(v_t2_hi, vget_high_s16(v_x3), (int16_t)cospi_8_64);
    v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x2), (int16_t)cospi_8_64);
    v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x2), (int16_t)cospi_8_64);
-    v_t0_lo = vmulq_n_s32(v_t0_lo, cospi_16_64);
+    v_t0_lo = vmulq_n_s32(v_t0_lo, (int32_t)cospi_16_64);
-    v_t0_hi = vmulq_n_s32(v_t0_hi, cospi_16_64);
+    v_t0_hi = vmulq_n_s32(v_t0_hi, (int32_t)cospi_16_64);
-    v_t1_lo = vmulq_n_s32(v_t1_lo, cospi_16_64);
+    v_t1_lo = vmulq_n_s32(v_t1_lo, (int32_t)cospi_16_64);
-    v_t1_hi = vmulq_n_s32(v_t1_hi, cospi_16_64);
+    v_t1_hi = vmulq_n_s32(v_t1_hi, (int32_t)cospi_16_64);
    {
      const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
      const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
@@ -131,14 +132,14 @@ void vpx_fdct8x8_neon(const int16_t *input, int16_t *final_output, int stride) {
      // 14 15 16 17 54 55 56 57
      // 24 25 26 27 64 65 66 67
      // 34 35 36 37 74 75 76 77
-      const int32x4x2_t r02_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_0),
+      const int32x4x2_t r02_s32 =
-                                            vreinterpretq_s32_s16(out_2));
+          vtrnq_s32(vreinterpretq_s32_s16(out_0), vreinterpretq_s32_s16(out_2));
-      const int32x4x2_t r13_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_1),
+      const int32x4x2_t r13_s32 =
-                                            vreinterpretq_s32_s16(out_3));
+          vtrnq_s32(vreinterpretq_s32_s16(out_1), vreinterpretq_s32_s16(out_3));
-      const int32x4x2_t r46_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_4),
+      const int32x4x2_t r46_s32 =
-                                            vreinterpretq_s32_s16(out_6));
+          vtrnq_s32(vreinterpretq_s32_s16(out_4), vreinterpretq_s32_s16(out_6));
-      const int32x4x2_t r57_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_5),
+      const int32x4x2_t r57_s32 =
-                                            vreinterpretq_s32_s16(out_7));
+          vtrnq_s32(vreinterpretq_s32_s16(out_5), vreinterpretq_s32_s16(out_7));
      const int16x8x2_t r01_s16 =
          vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[0]),
                    vreinterpretq_s16_s32(r13_s32.val[0]));
@@ -170,7 +171,7 @@ void vpx_fdct8x8_neon(const int16_t *input, int16_t *final_output, int stride) {
    }
  }  // for
  {
-    // from vpx_dct_sse2.c
+    // from aom_dct_sse2.c
    // Post-condition (division by two)
    //    division of two 16 bits signed numbers using shifts
    //    n / 2 = (n - (n >> 15)) >> 1
@@ -202,7 +203,7 @@ void vpx_fdct8x8_neon(const int16_t *input, int16_t *final_output, int stride) {
  }
 }
-void vpx_fdct8x8_1_neon(const int16_t *input, int16_t *output, int stride) {
+void aom_fdct8x8_1_neon(const int16_t *input, int16_t *output, int stride) {
  int r;
  int16x8_t sum = vld1q_s16(&input[0]);
  for (r = 1; r < 8; ++r) {
--- a/aom_dsp/arm/hadamard_neon.c
+++ b/aom_dsp/arm/hadamard_neon.c
@@ -10,11 +10,10 @@
 #include <arm_neon.h>
-#include "./vpx_dsp_rtcd.h"
+#include "./aom_dsp_rtcd.h"
-static void hadamard8x8_one_pass(int16x8_t *a0, int16x8_t *a1,
+static void hadamard8x8_one_pass(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2,
-                                 int16x8_t *a2, int16x8_t *a3,
+                                 int16x8_t *a3, int16x8_t *a4, int16x8_t *a5,
                                 int16x8_t *a4, int16x8_t *a5,
                                 int16x8_t *a6, int16x8_t *a7) {
  const int16x8_t b0 = vaddq_s16(*a0, *a1);
  const int16x8_t b1 = vsubq_s16(*a0, *a1);
@@ -47,9 +46,8 @@ static void hadamard8x8_one_pass(int16x8_t *a0, int16x8_t *a1,
 // TODO(johannkoenig): Make a transpose library and dedup with idct. Consider
 // reversing transpose order which may make it easier for the compiler to
 // reconcile the vtrn.64 moves.
-static void transpose8x8(int16x8_t *a0, int16x8_t *a1,
+static void transpose8x8(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2,
-                         int16x8_t *a2, int16x8_t *a3,
+                         int16x8_t *a3, int16x8_t *a4, int16x8_t *a5,
                         int16x8_t *a4, int16x8_t *a5,
                         int16x8_t *a6, int16x8_t *a7) {
  // Swap 64 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
@@ -91,14 +89,14 @@ static void transpose8x8(int16x8_t *a0, int16x8_t *a1,
  // a1657_hi:
  // 12 13 28 29 44 45 60 61
  // 14 15 30 31 46 47 62 63
-  const int32x4x2_t a0246_lo = vtrnq_s32(vreinterpretq_s32_s16(a04_lo),
+  const int32x4x2_t a0246_lo =
-                                         vreinterpretq_s32_s16(a26_lo));
+      vtrnq_s32(vreinterpretq_s32_s16(a04_lo), vreinterpretq_s32_s16(a26_lo));
-  const int32x4x2_t a1357_lo = vtrnq_s32(vreinterpretq_s32_s16(a15_lo),
+  const int32x4x2_t a1357_lo =
-                                         vreinterpretq_s32_s16(a37_lo));
+      vtrnq_s32(vreinterpretq_s32_s16(a15_lo), vreinterpretq_s32_s16(a37_lo));
-  const int32x4x2_t a0246_hi = vtrnq_s32(vreinterpretq_s32_s16(a04_hi),
+  const int32x4x2_t a0246_hi =
-                                         vreinterpretq_s32_s16(a26_hi));
+      vtrnq_s32(vreinterpretq_s32_s16(a04_hi), vreinterpretq_s32_s16(a26_hi));
-  const int32x4x2_t a1357_hi = vtrnq_s32(vreinterpretq_s32_s16(a15_hi),
+  const int32x4x2_t a1357_hi =
-                                         vreinterpretq_s32_s16(a37_hi));
+      vtrnq_s32(vreinterpretq_s32_s16(a15_hi), vreinterpretq_s32_s16(a37_hi));
  // Swap 16 bit elements resulting in:
  // b0:
@@ -132,7 +130,7 @@ static void transpose8x8(int16x8_t *a0, int16x8_t *a1,
  *a7 = b3.val[1];
 }
-void vpx_hadamard_8x8_neon(const int16_t *src_diff, int src_stride,
+void aom_hadamard_8x8_neon(const int16_t *src_diff, int src_stride,
                           int16_t *coeff) {
  int16x8_t a0 = vld1q_s16(src_diff);
  int16x8_t a1 = vld1q_s16(src_diff + src_stride);
@@ -161,19 +159,19 @@ void vpx_hadamard_8x8_neon(const int16_t *src_diff, int src_stride,
  vst1q_s16(coeff + 56, a7);
 }
-void vpx_hadamard_16x16_neon(const int16_t *src_diff, int src_stride,
+void aom_hadamard_16x16_neon(const int16_t *src_diff, int src_stride,
                             int16_t *coeff) {
  int i;
  /* Rearrange 16x16 to 8x32 and remove stride.
   * Top left first. */
-  vpx_hadamard_8x8_neon(src_diff + 0 + 0 * src_stride, src_stride, coeff + 0);
+  aom_hadamard_8x8_neon(src_diff + 0 + 0 * src_stride, src_stride, coeff + 0);
  /* Top right. */
-  vpx_hadamard_8x8_neon(src_diff + 8 + 0 * src_stride, src_stride, coeff + 64);
+  aom_hadamard_8x8_neon(src_diff + 8 + 0 * src_stride, src_stride, coeff + 64);
  /* Bottom left. */
-  vpx_hadamard_8x8_neon(src_diff + 0 + 8 * src_stride, src_stride, coeff + 128);
+  aom_hadamard_8x8_neon(src_diff + 0 + 8 * src_stride, src_stride, coeff + 128);
  /* Bottom right. */
-  vpx_hadamard_8x8_neon(src_diff + 8 + 8 * src_stride, src_stride, coeff + 192);
+  aom_hadamard_8x8_neon(src_diff + 8 + 8 * src_stride, src_stride, coeff + 192);
  for (i = 0; i < 64; i += 8) {
    const int16x8_t a0 = vld1q_s16(coeff + 0);
--- a/aom_dsp/arm/idct16x16_1_add_neon.asm
+++ b/aom_dsp/arm/idct16x16_1_add_neon.asm
@@ -1,28 +1,31 @@
 ;
-;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
 ;
-;  Use of this source code is governed by a BSD-style license and patent
+; This source code is subject to the terms of the BSD 2 Clause License and
-;  grant that can be found in the LICENSE file in the root of the source
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-;  tree. All contributing project authors may be found in the AUTHORS
+; was not distributed with this source code in the LICENSE file, you can
-;  file in the root of the source tree.
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
 ; Media Patent License 1.0 was not distributed with this source code in the
 ; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 ;
-    EXPORT  |vpx_idct16x16_1_add_neon|
+
    EXPORT  |aom_idct16x16_1_add_neon|
    ARM
    REQUIRE8
    PRESERVE8
    AREA ||.text||, CODE, READONLY, ALIGN=2
-;void vpx_idct16x16_1_add_neon(int16_t *input, uint8_t *dest,
+;void aom_idct16x16_1_add_neon(int16_t *input, uint8_t *dest,
 ;                                    int dest_stride)
 ;
 ; r0  int16_t input
 ; r1  uint8_t *dest
 ; r2  int dest_stride)
-|vpx_idct16x16_1_add_neon| PROC
+|aom_idct16x16_1_add_neon| PROC
    ldrsh            r0, [r0]
    ; generate cospi_16_64 = 11585
@@ -193,6 +196,6 @@
    vst1.64          {d31}, [r12], r2
    bx               lr
-    ENDP             ; |vpx_idct16x16_1_add_neon|
+    ENDP             ; |aom_idct16x16_1_add_neon|
    END
--- a/aom_dsp/arm/idct16x16_1_add_neon.c
+++ b/aom_dsp/arm/idct16x16_1_add_neon.c
@@ -0,0 +1,59 @@
 /*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
 #include <arm_neon.h>
 #include "aom_dsp/inv_txfm.h"
 #include "aom_ports/mem.h"
 /*
  * 16x16 inverse transform + reconstruction for a block where only input[0]
  * is used: a single offset is derived from input[0] and added, with
  * saturation to [0, 255], to each of the 16x16 pixels at dest.
  *
  * input        coefficient block; only input[0] is read.
  * dest         top-left pixel of the 16x16 destination, updated in place.
  * dest_stride  distance in bytes between consecutive destination rows.
  *
  * dest is accessed with byte-wise vld1_u8/vst1_u8 rather than through a
  * uint64_t * cast: the uint8_t * interface makes no 8-byte alignment promise,
  * and a uint64_t access lets the compiler assume (and hint) such alignment.
  */
 void aom_idct16x16_1_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
  int row;
  int16_t a1;
  uint16x8_t offset_u16;
  // Scale input[0] by cospi_16_64 twice, rounding after each multiply, then
  // apply the final rounding shift by 6 bits.
  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
  out = dct_const_round_shift(out * cospi_16_64);
  a1 = ROUND_POWER_OF_TWO(out, 6);
  // Broadcast the (possibly negative) offset. It is kept as raw 16-bit lanes
  // so the widening add below wraps modulo 2^16 and can be read back as
  // signed before the saturating narrow.
  offset_u16 = vreinterpretq_u16_s16(vdupq_n_s16(a1));
  // Two rows per iteration; both rows are loaded before either is stored,
  // matching the access order of the original implementation.
  for (row = 0; row < 16; row += 2) {
    uint8_t *const top = dest;
    uint8_t *const bottom = dest + dest_stride;
    const uint8x8_t top_lo = vld1_u8(top);
    const uint8x8_t top_hi = vld1_u8(top + 8);
    const uint8x8_t bottom_lo = vld1_u8(bottom);
    const uint8x8_t bottom_hi = vld1_u8(bottom + 8);
    const uint16x8_t sum_top_lo = vaddw_u8(offset_u16, top_lo);
    const uint16x8_t sum_top_hi = vaddw_u8(offset_u16, top_hi);
    const uint16x8_t sum_bottom_lo = vaddw_u8(offset_u16, bottom_lo);
    const uint16x8_t sum_bottom_hi = vaddw_u8(offset_u16, bottom_hi);
    // vqmovun_s16 clamps each signed 16-bit sum to the unsigned 8-bit range.
    vst1_u8(top, vqmovun_s16(vreinterpretq_s16_u16(sum_top_lo)));
    vst1_u8(top + 8, vqmovun_s16(vreinterpretq_s16_u16(sum_top_hi)));
    vst1_u8(bottom, vqmovun_s16(vreinterpretq_s16_u16(sum_bottom_lo)));
    vst1_u8(bottom + 8, vqmovun_s16(vreinterpretq_s16_u16(sum_bottom_hi)));
    dest = bottom + dest_stride;
  }
 }
--- a/aom_dsp/arm/idct16x16_add_neon.asm
+++ b/aom_dsp/arm/idct16x16_add_neon.asm
@@ -1,17 +1,20 @@
 ;
-;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
 ;
-;  Use of this source code is governed by a BSD-style license
+; This source code is subject to the terms of the BSD 2 Clause License and
-;  that can be found in the LICENSE file in the root of the source
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-;  tree. An additional intellectual property rights grant can be found
+; was not distributed with this source code in the LICENSE file, you can
-;  in the file PATENTS.  All contributing project authors may
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-;  be found in the AUTHORS file in the root of the source tree.
+; Media Patent License 1.0 was not distributed with this source code in the
 ; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 ;
-    EXPORT  |vpx_idct16x16_256_add_neon_pass1|
+;
-    EXPORT  |vpx_idct16x16_256_add_neon_pass2|
+
-    EXPORT  |vpx_idct16x16_10_add_neon_pass1|
+    EXPORT  |aom_idct16x16_256_add_neon_pass1|
-    EXPORT  |vpx_idct16x16_10_add_neon_pass2|
+    EXPORT  |aom_idct16x16_256_add_neon_pass2|
    EXPORT  |aom_idct16x16_10_add_neon_pass1|
    EXPORT  |aom_idct16x16_10_add_neon_pass2|
    ARM
    REQUIRE8
    PRESERVE8
@@ -36,7 +39,7 @@
    MEND
    AREA    Block, CODE, READONLY ; name this block of code
-;void |vpx_idct16x16_256_add_neon_pass1|(int16_t *input,
+;void |aom_idct16x16_256_add_neon_pass1|(int16_t *input,
 ;                                          int16_t *output, int output_stride)
 ;
 ; r0  int16_t input
@@ -46,7 +49,7 @@
 ; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output
 ; will be stored back into q8-q15 registers. This function will touch q0-q7
 ; registers and use them as buffer during calculation.
-|vpx_idct16x16_256_add_neon_pass1| PROC
+|aom_idct16x16_256_add_neon_pass1| PROC
    ; TODO(hkuang): Find a better way to load the elements.
    ; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15
@@ -273,9 +276,9 @@
    vst1.64         {d31}, [r1], r2
    bx              lr
-    ENDP  ; |vpx_idct16x16_256_add_neon_pass1|
+    ENDP  ; |aom_idct16x16_256_add_neon_pass1|
-;void vpx_idct16x16_256_add_neon_pass2(int16_t *src,
+;void aom_idct16x16_256_add_neon_pass2(int16_t *src,
 ;                                        int16_t *output,
 ;                                        int16_t *pass1Output,
 ;                                        int16_t skip_adding,
@@ -292,7 +295,7 @@
 ; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output
 ; will be stored back into q8-q15 registers. This function will touch q0-q7
 ; registers and use them as buffer during calculation.
-|vpx_idct16x16_256_add_neon_pass2| PROC
+|aom_idct16x16_256_add_neon_pass2| PROC
    push            {r3-r9}
    ; TODO(hkuang): Find a better way to load the elements.
@@ -784,9 +787,9 @@ skip_adding_dest
 end_idct16x16_pass2
    pop             {r3-r9}
    bx              lr
-    ENDP  ; |vpx_idct16x16_256_add_neon_pass2|
+    ENDP  ; |aom_idct16x16_256_add_neon_pass2|
-;void |vpx_idct16x16_10_add_neon_pass1|(int16_t *input,
+;void |aom_idct16x16_10_add_neon_pass1|(int16_t *input,
 ;                                             int16_t *output, int output_stride)
 ;
 ; r0  int16_t input
@@ -796,7 +799,7 @@ end_idct16x16_pass2
 ; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output
 ; will be stored back into q8-q15 registers. This function will touch q0-q7
 ; registers and use them as buffer during calculation.
-|vpx_idct16x16_10_add_neon_pass1| PROC
+|aom_idct16x16_10_add_neon_pass1| PROC
    ; TODO(hkuang): Find a better way to load the elements.
    ; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15
@@ -905,9 +908,9 @@ end_idct16x16_pass2
    vst1.64         {d31}, [r1], r2
    bx              lr
-    ENDP  ; |vpx_idct16x16_10_add_neon_pass1|
+    ENDP  ; |aom_idct16x16_10_add_neon_pass1|
-;void vpx_idct16x16_10_add_neon_pass2(int16_t *src,
+;void aom_idct16x16_10_add_neon_pass2(int16_t *src,
 ;                                           int16_t *output,
 ;                                           int16_t *pass1Output,
 ;                                           int16_t skip_adding,
@@ -924,7 +927,7 @@ end_idct16x16_pass2
 ; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output
 ; will be stored back into q8-q15 registers. This function will touch q0-q7
 ; registers and use them as buffer during calculation.
-|vpx_idct16x16_10_add_neon_pass2| PROC
+|aom_idct16x16_10_add_neon_pass2| PROC
    push            {r3-r9}
    ; TODO(hkuang): Find a better way to load the elements.
@@ -1175,5 +1178,5 @@ end_idct16x16_pass2
 end_idct10_16x16_pass2
    pop             {r3-r9}
    bx              lr
-    ENDP  ; |vpx_idct16x16_10_add_neon_pass2|
+    ENDP  ; |aom_idct16x16_10_add_neon_pass2|
    END
--- a/aom_dsp/arm/idct16x16_add_neon.c
+++ b/aom_dsp/arm/idct16x16_add_neon.c
--- a/aom_dsp/arm/idct16x16_neon.c
+++ b/aom_dsp/arm/idct16x16_neon.c
@@ -0,0 +1,152 @@
 /*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
 #include "aom_dsp/aom_dsp_common.h"
 /*
  * Prototypes for the two-pass 16x16 inverse-transform helpers called by the
  * wrappers below. No definitions are provided in this file.
  * pass1 takes (input, output, output_stride); pass2 additionally receives the
  * pass1 result, a skip_adding flag and the destination pointer/stride.
  */
 void aom_idct16x16_256_add_neon_pass1(const int16_t *input, int16_t *output,
                                      int output_stride);
 void aom_idct16x16_256_add_neon_pass2(const int16_t *src, int16_t *output,
                                      int16_t *pass1Output, int16_t skip_adding,
                                      uint8_t *dest, int dest_stride);
 /* Same pair of helpers for the "_10_" variant of the transform. */
 void aom_idct16x16_10_add_neon_pass1(const int16_t *input, int16_t *output,
                                     int output_stride);
 void aom_idct16x16_10_add_neon_pass2(const int16_t *src, int16_t *output,
                                     int16_t *pass1Output, int16_t skip_adding,
                                     uint8_t *dest, int dest_stride);
 #if HAVE_NEON_ASM
 /* For ARM NEON, d8-d15 are callee-saved registers, and need to be saved. */
 /* Both helpers take a caller-provided int64_t buffer; the wrappers below pass
  * an 8-element array (one slot per register d8-d15). */
 extern void aom_push_neon(int64_t *store);
 extern void aom_pop_neon(int64_t *store);
 #endif  // HAVE_NEON_ASM
 // 16x16 inverse DCT over all 256 coefficients, added into dest.
 // The work is split into a row stage and a column stage, each processed as
 // two 8-line halves. Every half runs pass1 (even-indexed elements, stage 6
 // result) followed by pass2 (odd-indexed elements, combined with the pass1
 // result in stage 7).
 void aom_idct16x16_256_add_neon(const int16_t *input, uint8_t *dest,
                                 int dest_stride) {
 #if HAVE_NEON_ASM
  int64_t saved_regs[8];
 #endif
  int16_t even_stage[16 * 16] = { 0 };
  int16_t row_result[16 * 16] = { 0 };
  int half;
 #if HAVE_NEON_ASM
  // d8-d15 are callee-saved; stash them before the passes run.
  aom_push_neon(saved_regs);
 #endif
  // Row stage: upper 8 rows (half == 0), then lower 8 rows (half == 1).
  // skip_adding is 0, so pass2 writes its result into row_result.
  for (half = 0; half < 2; ++half) {
    const int16_t *const src = input + half * 8 * 16;
    aom_idct16x16_256_add_neon_pass1(src, even_stage, 8);
    aom_idct16x16_256_add_neon_pass2(src + 1, row_result + half * 8,
                                     even_stage, 0, dest, dest_stride);
  }
  // Column stage: left 8 columns (half == 0), then right 8 columns
  // (half == 1). skip_adding is 1, so the result is added to the destination.
  for (half = 0; half < 2; ++half) {
    const int16_t *const src = row_result + half * 8 * 16;
    aom_idct16x16_256_add_neon_pass1(src, even_stage, 8);
    aom_idct16x16_256_add_neon_pass2(src + 1, row_result + half * 8,
                                     even_stage, 1, dest + half * 8,
                                     dest_stride);
  }
 #if HAVE_NEON_ASM
  // Put d8-d15 back.
  aom_pop_neon(saved_regs);
 #endif
 }
 // 16x16 inverse DCT for the "_10_" case, added into dest.
 // Only the upper 8 rows go through the row stage (using the _10_ helpers);
 // the lower 8 rows are skipped because they are all zero. The column stage
 // then runs the full _256_ helpers on both 8-column halves.
 void aom_idct16x16_10_add_neon(const int16_t *input, uint8_t *dest,
                                int dest_stride) {
 #if HAVE_NEON_ASM
  int64_t saved_regs[8];
 #endif
  int16_t even_stage[16 * 16] = { 0 };
  int16_t row_result[16 * 16] = { 0 };
  int half;
 #if HAVE_NEON_ASM
  // d8-d15 are callee-saved; stash them before the passes run.
  aom_push_neon(saved_regs);
 #endif
  // Row stage, upper 8 rows only: pass1 handles the even-indexed elements
  // (stage 6 result), pass2 the odd-indexed ones and the stage 7 combine.
  // skip_adding is 0, so the result lands in row_result.
  aom_idct16x16_10_add_neon_pass1(input, even_stage, 8);
  aom_idct16x16_10_add_neon_pass2(input + 1, row_result, even_stage, 0, dest,
                                  dest_stride);
  // Column stage: left 8 columns (half == 0), then right 8 columns
  // (half == 1). skip_adding is 1, so the result is added to the destination.
  for (half = 0; half < 2; ++half) {
    const int16_t *const src = row_result + half * 8 * 16;
    aom_idct16x16_256_add_neon_pass1(src, even_stage, 8);
    aom_idct16x16_256_add_neon_pass2(src + 1, row_result + half * 8,
                                     even_stage, 1, dest + half * 8,
                                     dest_stride);
  }
 #if HAVE_NEON_ASM
  // Put d8-d15 back.
  aom_pop_neon(saved_regs);
 #endif
 }
--- a/aom_dsp/arm/idct32x32_1_add_neon.asm
+++ b/aom_dsp/arm/idct32x32_1_add_neon.asm
@@ -1,13 +1,16 @@
 ;
-;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
 ;
-;  Use of this source code is governed by a BSD-style license and patent
+; This source code is subject to the terms of the BSD 2 Clause License and
-;  grant that can be found in the LICENSE file in the root of the source
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-;  tree. All contributing project authors may be found in the AUTHORS
+; was not distributed with this source code in the LICENSE file, you can
-;  file in the root of the source tree.
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
 ; Media Patent License 1.0 was not distributed with this source code in the
 ; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 ;
-    EXPORT  |vpx_idct32x32_1_add_neon|
+
    EXPORT  |aom_idct32x32_1_add_neon|
    ARM
    REQUIRE8
    PRESERVE8
@@ -64,14 +67,14 @@
    vst1.8           {q15},[$dst], $stride
    MEND
-;void vpx_idct32x32_1_add_neon(int16_t *input, uint8_t *dest,
+;void aom_idct32x32_1_add_neon(int16_t *input, uint8_t *dest,
 ;                              int dest_stride)
 ;
 ; r0  int16_t input
 ; r1  uint8_t *dest
 ; r2  int dest_stride
-|vpx_idct32x32_1_add_neon| PROC
+|aom_idct32x32_1_add_neon| PROC
    push             {lr}
    pld              [r1]
    add              r3, r1, #16               ; r3 dest + 16 for second loop
@@ -140,5 +143,5 @@ diff_positive_32_32_loop
    bne              diff_positive_32_32_loop
    pop              {pc}
-    ENDP             ; |vpx_idct32x32_1_add_neon|
+    ENDP             ; |aom_idct32x32_1_add_neon|
    END
--- a/aom_dsp/arm/idct32x32_1_add_neon.c
+++ b/aom_dsp/arm/idct32x32_1_add_neon.c
@@ -0,0 +1,141 @@
 /*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
 #include <arm_neon.h>
 #include "./aom_config.h"
 #include "aom_dsp/inv_txfm.h"
 #include "aom_ports/mem.h"
 // Loads eight consecutive 16-byte rows, d_stride bytes apart and starting at
 // d, into *q8u8 (row 0) through *q15u8 (row 7).
 static INLINE void LD_16x8(uint8_t *d, int d_stride, uint8x16_t *q8u8,
                            uint8x16_t *q9u8, uint8x16_t *q10u8,
                            uint8x16_t *q11u8, uint8x16_t *q12u8,
                            uint8x16_t *q13u8, uint8x16_t *q14u8,
                            uint8x16_t *q15u8) {
  *q8u8 = vld1q_u8(d + 0 * d_stride);
  *q9u8 = vld1q_u8(d + 1 * d_stride);
  *q10u8 = vld1q_u8(d + 2 * d_stride);
  *q11u8 = vld1q_u8(d + 3 * d_stride);
  *q12u8 = vld1q_u8(d + 4 * d_stride);
  *q13u8 = vld1q_u8(d + 5 * d_stride);
  *q14u8 = vld1q_u8(d + 6 * d_stride);
  *q15u8 = vld1q_u8(d + 7 * d_stride);
 }
 // Adds qdiffu8 to each of the eight row vectors in place, using an unsigned
 // saturating byte add.
 static INLINE void ADD_DIFF_16x8(uint8x16_t qdiffu8, uint8x16_t *q8u8,
                                  uint8x16_t *q9u8, uint8x16_t *q10u8,
                                  uint8x16_t *q11u8, uint8x16_t *q12u8,
                                  uint8x16_t *q13u8, uint8x16_t *q14u8,
                                  uint8x16_t *q15u8) {
  uint8x16_t *const rows[8] = { q8u8,  q9u8,  q10u8, q11u8,
                                q12u8, q13u8, q14u8, q15u8 };
  int r;
  for (r = 0; r < 8; ++r) {
    *rows[r] = vqaddq_u8(*rows[r], qdiffu8);
  }
 }
 // Subtracts qdiffu8 from each of the eight row vectors in place, using an
 // unsigned saturating byte subtract.
 static INLINE void SUB_DIFF_16x8(uint8x16_t qdiffu8, uint8x16_t *q8u8,
                                  uint8x16_t *q9u8, uint8x16_t *q10u8,
                                  uint8x16_t *q11u8, uint8x16_t *q12u8,
                                  uint8x16_t *q13u8, uint8x16_t *q14u8,
                                  uint8x16_t *q15u8) {
  uint8x16_t *const rows[8] = { q8u8,  q9u8,  q10u8, q11u8,
                                q12u8, q13u8, q14u8, q15u8 };
  int r;
  for (r = 0; r < 8; ++r) {
    *rows[r] = vqsubq_u8(*rows[r], qdiffu8);
  }
 }
 // Stores *q8u8 (row 0) through *q15u8 (row 7) as eight consecutive 16-byte
 // rows, d_stride bytes apart and starting at d.
 static INLINE void ST_16x8(uint8_t *d, int d_stride, uint8x16_t *q8u8,
                            uint8x16_t *q9u8, uint8x16_t *q10u8,
                            uint8x16_t *q11u8, uint8x16_t *q12u8,
                            uint8x16_t *q13u8, uint8x16_t *q14u8,
                            uint8x16_t *q15u8) {
  vst1q_u8(d + 0 * d_stride, *q8u8);
  vst1q_u8(d + 1 * d_stride, *q9u8);
  vst1q_u8(d + 2 * d_stride, *q10u8);
  vst1q_u8(d + 3 * d_stride, *q11u8);
  vst1q_u8(d + 4 * d_stride, *q12u8);
  vst1q_u8(d + 5 * d_stride, *q13u8);
  vst1q_u8(d + 6 * d_stride, *q14u8);
  vst1q_u8(d + 7 * d_stride, *q15u8);
 }
 // 32x32 inverse transform + reconstruction that reads only input[0]: one
 // offset is derived from it and applied to every pixel of the 32x32
 // destination. The offset's magnitude is clamped to 255 and then added
 // (offset >= 0) or subtracted (offset < 0) with unsigned saturation.
 void aom_idct32x32_1_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
  uint8x16_t r0, r1, r2, r3, r4, r5, r6, r7;
  uint8x16_t magnitude;
  int column_half, band, subtract;
  const int band_step = dest_stride * 8;
  int16_t a1;
  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
  out = dct_const_round_shift(out * cospi_16_64);
  a1 = ROUND_POWER_OF_TWO(out, 6);
  // Remember the sign, then reduce the offset to a byte-sized magnitude.
  subtract = a1 < 0;
  if (subtract) a1 = -a1;
  a1 = a1 < 0 ? 0 : a1 > 255 ? 255 : a1;
  magnitude = vdupq_n_u8(a1);
  // The block is walked as two 16-pixel-wide column halves, each made of four
  // bands of eight rows.
  for (column_half = 0; column_half < 2; ++column_half) {
    uint8_t *d = dest + 16 * column_half;
    for (band = 0; band < 4; ++band) {
      LD_16x8(d, dest_stride, &r0, &r1, &r2, &r3, &r4, &r5, &r6, &r7);
      if (subtract) {
        SUB_DIFF_16x8(magnitude, &r0, &r1, &r2, &r3, &r4, &r5, &r6, &r7);
      } else {
        ADD_DIFF_16x8(magnitude, &r0, &r1, &r2, &r3, &r4, &r5, &r6, &r7);
      }
      ST_16x8(d, dest_stride, &r0, &r1, &r2, &r3, &r4, &r5, &r6, &r7);
      d += band_step;
    }
  }
 }
--- a/aom_dsp/arm/idct32x32_add_neon.asm
+++ b/aom_dsp/arm/idct32x32_add_neon.asm
@@ -1,11 +1,14 @@
 ;
-;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
 ;
-;  Use of this source code is governed by a BSD-style license
+; This source code is subject to the terms of the BSD 2 Clause License and
-;  that can be found in the LICENSE file in the root of the source
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-;  tree. An additional intellectual property rights grant can be found
+; was not distributed with this source code in the LICENSE file, you can
-;  in the file PATENTS.  All contributing project authors may
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-;  be found in the AUTHORS file in the root of the source tree.
+; Media Patent License 1.0 was not distributed with this source code in the
 ; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 ;
 ;
 ;TODO(cd): adjust these constant to be able to use vqdmulh for faster
@@ -43,7 +46,7 @@ cospi_30_64 EQU  1606
 cospi_31_64 EQU   804
-    EXPORT  |vpx_idct32x32_1024_add_neon|
+    EXPORT  |aom_idct32x32_1024_add_neon|
    ARM
    REQUIRE8
    PRESERVE8
@@ -288,7 +291,7 @@ cospi_31_64 EQU   804
    MEND
    ; --------------------------------------------------------------------------
-;void vpx_idct32x32_1024_add_neon(int16_t *input, uint8_t *dest, int dest_stride);
+;void aom_idct32x32_1024_add_neon(int16_t *input, uint8_t *dest, int dest_stride);
 ;
 ;   r0  int16_t *input,
 ;   r1  uint8_t *dest,
@@ -303,7 +306,7 @@ cospi_31_64 EQU   804
 ;   r9  dest + 15 * dest_stride, descending (14, 13, 12, ...)
 ;   r10 dest + 16 * dest_stride, ascending  (17, 18, 19, ...)
-|vpx_idct32x32_1024_add_neon| PROC
+|aom_idct32x32_1024_add_neon| PROC
    ; This function does one pass of idct32x32 transform.
    ;
    ; This is done by transposing the input and then doing a 1d transform on
@@ -1295,5 +1298,5 @@ idct32_bands_end_2nd_pass
    vpop {d8-d15}
    pop  {r4-r11}
    bx              lr
-    ENDP  ; |vpx_idct32x32_1024_add_neon|
+    ENDP  ; |aom_idct32x32_1024_add_neon|
    END
--- a/aom_dsp/arm/idct32x32_add_neon.c
+++ b/aom_dsp/arm/idct32x32_add_neon.c
@@ -0,0 +1,686 @@
 /*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
 #include <arm_neon.h>
 #include "./aom_config.h"
 #include "aom_dsp/txfm_common.h"
 // Loads rows `first` and `second` (8 coefficients each) of the caller's
 // `trans_buf` into the caller's `q14s16` and `q13s16`, the two vectors that
 // DO_BUTTERFLY_STD consumes. `prev` is not used by the expansion.
 // Arguments are parenthesized so expressions such as `a + b` index correctly,
 // and the body is a single do/while statement. The trailing ';' is kept on
 // purpose because existing call sites invoke the macro without one.
 #define LOAD_FROM_TRANSPOSED(prev, first, second)  \
  do {                                             \
    q14s16 = vld1q_s16(trans_buf + (first) * 8);   \
    q13s16 = vld1q_s16(trans_buf + (second) * 8);  \
  } while (0);
 // Loads rows `first` and `second` of the caller's `out` buffer (row pitch of
 // 32 int16_t values, 8 values per load) into `qA` and `qB`. `prev` is not
 // used by the expansion.
 // Arguments are parenthesized so expressions such as `a + b` index correctly,
 // and the body is a single do/while statement. The trailing ';' is kept on
 // purpose because existing call sites invoke the macro without one.
 #define LOAD_FROM_OUTPUT(prev, first, second, qA, qB) \
  do {                                                \
    qA = vld1q_s16(out + (first) * 32);               \
    qB = vld1q_s16(out + (second) * 32);              \
  } while (0);
 // Stores `qA` and `qB` to rows `first` and `second` of the caller's `out`
 // buffer (row pitch of 32 int16_t values, 8 values per store). `prev` is not
 // used by the expansion.
 // Arguments are parenthesized so expressions such as `a + b` index correctly,
 // and the body is a single do/while statement. The trailing ';' is kept on
 // purpose because existing call sites invoke the macro without one.
 #define STORE_IN_OUTPUT(prev, first, second, qA, qB) \
  do {                                               \
    vst1q_s16(out + (first) * 32, (qA));             \
    vst1q_s16(out + (second) * 32, (qB));            \
  } while (0);
 // Convenience wrapper: forwards the caller's `stride` and q6s16..q9s16.
 #define STORE_COMBINE_CENTER_RESULTS(r10, r9) \
  __STORE_COMBINE_CENTER_RESULTS(r10, r9, stride, q6s16, q7s16, q8s16, q9s16);
 // Adds four residual vectors to four 8-pixel destination rows and writes the
 // saturated 8-bit result back.
 //   q6s16 -> row at p1            q7s16 -> row at p1 + stride
 //   q9s16 -> row at p2            q8s16 -> row at p2 - stride
 // Each residual is rounded and shifted right by 6 before the addition.
 // The rows are accessed with the u8 load/store intrinsics so that the
 // uint8_t pointers are never cast to int16_t *.
 static INLINE void __STORE_COMBINE_CENTER_RESULTS(uint8_t *p1, uint8_t *p2,
                                                   int stride, int16x8_t q6s16,
                                                   int16x8_t q7s16,
                                                   int16x8_t q8s16,
                                                   int16x8_t q9s16) {
  uint8x8_t d8u8, d9u8, d10u8, d11u8;
  // Fetch the destination pixels, walking p1 forward and p2 backward.
  d8u8 = vld1_u8(p1);
  p1 += stride;
  d11u8 = vld1_u8(p2);
  p2 -= stride;
  d9u8 = vld1_u8(p1);
  d10u8 = vld1_u8(p2);
  // Final rounding shift of the residuals.
  q7s16 = vrshrq_n_s16(q7s16, 6);
  q8s16 = vrshrq_n_s16(q8s16, 6);
  q9s16 = vrshrq_n_s16(q9s16, 6);
  q6s16 = vrshrq_n_s16(q6s16, 6);
  // Widening add of the pixels; the 16-bit lanes are reinterpreted as
  // unsigned only to match the vaddw_u8 signature.
  q7s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q7s16), d9u8));
  q8s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q8s16), d10u8));
  q9s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q9s16), d11u8));
  q6s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q6s16), d8u8));
  // Saturate to [0, 255] and store, undoing the pointer walk from above.
  vst1_u8(p1, vqmovun_s16(q7s16));
  p1 -= stride;
  vst1_u8(p2, vqmovun_s16(q8s16));
  p2 += stride;
  vst1_u8(p1, vqmovun_s16(q6s16));
  vst1_u8(p2, vqmovun_s16(q9s16));
 }
 // Convenience wrapper: forwards the caller's `stride` and q4s16..q7s16.
 #define STORE_COMBINE_EXTREME_RESULTS(r7, r6) \
  __STORE_COMBINE_EXTREME_RESULTS(r7, r6, stride, q4s16, q5s16, q6s16, q7s16);
 // Adds four residual vectors to four 8-pixel destination rows and writes the
 // saturated 8-bit result back.
 //   q4s16 -> row at p1            q5s16 -> row at p1 + stride
 //   q7s16 -> row at p2            q6s16 -> row at p2 - stride
 // Each residual is rounded and shifted right by 6 before the addition.
 // The rows are accessed with the u8 load/store intrinsics so that the
 // uint8_t pointers are never cast to int16_t *.
 static INLINE void __STORE_COMBINE_EXTREME_RESULTS(uint8_t *p1, uint8_t *p2,
                                                    int stride, int16x8_t q4s16,
                                                    int16x8_t q5s16,
                                                    int16x8_t q6s16,
                                                    int16x8_t q7s16) {
  uint8x8_t d4u8, d5u8, d6u8, d7u8;
  // Fetch the destination pixels, walking p1 forward and p2 backward.
  d4u8 = vld1_u8(p1);
  p1 += stride;
  d7u8 = vld1_u8(p2);
  p2 -= stride;
  d5u8 = vld1_u8(p1);
  d6u8 = vld1_u8(p2);
  // Final rounding shift of the residuals.
  q5s16 = vrshrq_n_s16(q5s16, 6);
  q6s16 = vrshrq_n_s16(q6s16, 6);
  q7s16 = vrshrq_n_s16(q7s16, 6);
  q4s16 = vrshrq_n_s16(q4s16, 6);
  // Widening add of the pixels; the 16-bit lanes are reinterpreted as
  // unsigned only to match the vaddw_u8 signature.
  q5s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q5s16), d5u8));
  q6s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q6s16), d6u8));
  q7s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q7s16), d7u8));
  q4s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q4s16), d4u8));
  // Saturate to [0, 255] and store, undoing the pointer walk from above.
  vst1_u8(p1, vqmovun_s16(q5s16));
  p1 -= stride;
  vst1_u8(p2, vqmovun_s16(q6s16));
  p2 += stride;
  vst1_u8(p2, vqmovun_s16(q7s16));
  vst1_u8(p1, vqmovun_s16(q4s16));
 }
 // Butterfly on the caller's q14s16 / q13s16 (the vectors filled by
 // LOAD_FROM_TRANSPOSED or by the preceding add/sub stage).
 #define DO_BUTTERFLY_STD(const_1, const_2, qA, qB) \
  DO_BUTTERFLY(q14s16, q13s16, const_1, const_2, qA, qB);
 // Rotation butterfly on two vectors of eight 16-bit values:
 //   *qAs16 = (q14s16 * first_const  - q13s16 * second_const) >> 14
 //   *qBs16 = (q14s16 * second_const + q13s16 * first_const)  >> 14
 // Products are formed in 32 bits; the shift rounds and the narrowing back to
 // 16 bits saturates. Inputs are taken by value, so an output may name the
 // same caller variable as an input.
 static INLINE void DO_BUTTERFLY(int16x8_t q14s16, int16x8_t q13s16,
                                 int16_t first_const, int16_t second_const,
                                 int16x8_t *qAs16, int16x8_t *qBs16) {
  const int16x4_t c1 = vdup_n_s16(first_const);
  const int16x4_t c2 = vdup_n_s16(second_const);
  const int16x4_t x_lo = vget_low_s16(q14s16);
  const int16x4_t x_hi = vget_high_s16(q14s16);
  const int16x4_t y_lo = vget_low_s16(q13s16);
  const int16x4_t y_hi = vget_high_s16(q13s16);
  // x * c1 - y * c2, low and high halves.
  const int32x4_t diff_lo =
      vsubq_s32(vmull_s16(x_lo, c1), vmull_s16(y_lo, c2));
  const int32x4_t diff_hi =
      vsubq_s32(vmull_s16(x_hi, c1), vmull_s16(y_hi, c2));
  // x * c2 + y * c1, low and high halves.
  const int32x4_t sum_lo =
      vaddq_s32(vmull_s16(x_lo, c2), vmull_s16(y_lo, c1));
  const int32x4_t sum_hi =
      vaddq_s32(vmull_s16(x_hi, c2), vmull_s16(y_hi, c1));
  *qAs16 =
      vcombine_s16(vqrshrn_n_s32(diff_lo, 14), vqrshrn_n_s32(diff_hi, 14));
  *qBs16 = vcombine_s16(vqrshrn_n_s32(sum_lo, 14), vqrshrn_n_s32(sum_hi, 14));
 }
 // Transposes an 8-row by 32-column strip of `input` (row pitch 32) into
 // `t_buf`. The strip is handled as four 8x8 tiles; for each tile the eight
 // columns are written to t_buf as eight consecutive 8-element vectors, so
 // t_buf ends up holding 32 vectors, one per input column.
 static INLINE void idct32_transpose_pair(int16_t *input, int16_t *t_buf) {
  const int row_pitch = 32;
  int tile;
  for (tile = 0; tile < 4; ++tile) {
    const int16_t *src = input + tile * 8;
    int16x8_t row[8];
    int16x8_t lo[4], hi[4];
    int32x4x2_t lo_even, lo_odd, hi_even, hi_odd;
    int16x8x2_t cols[4];
    int k;
    for (k = 0; k < 8; ++k) row[k] = vld1q_s16(src + k * row_pitch);
    // Pair row k with row k + 4: lo[] holds columns 0-3, hi[] columns 4-7.
    for (k = 0; k < 4; ++k) {
      lo[k] = vcombine_s16(vget_low_s16(row[k]), vget_low_s16(row[k + 4]));
      hi[k] = vcombine_s16(vget_high_s16(row[k]), vget_high_s16(row[k + 4]));
    }
    // Interleave 32-bit pairs of rows (0,2) and (1,3) of each half.
    lo_even =
        vtrnq_s32(vreinterpretq_s32_s16(lo[0]), vreinterpretq_s32_s16(lo[2]));
    lo_odd =
        vtrnq_s32(vreinterpretq_s32_s16(lo[1]), vreinterpretq_s32_s16(lo[3]));
    hi_even =
        vtrnq_s32(vreinterpretq_s32_s16(hi[0]), vreinterpretq_s32_s16(hi[2]));
    hi_odd =
        vtrnq_s32(vreinterpretq_s32_s16(hi[1]), vreinterpretq_s32_s16(hi[3]));
    // Interleave 16-bit lanes; cols[n] holds columns 2n and 2n + 1.
    cols[0] = vtrnq_s16(vreinterpretq_s16_s32(lo_even.val[0]),
                        vreinterpretq_s16_s32(lo_odd.val[0]));
    cols[1] = vtrnq_s16(vreinterpretq_s16_s32(lo_even.val[1]),
                        vreinterpretq_s16_s32(lo_odd.val[1]));
    cols[2] = vtrnq_s16(vreinterpretq_s16_s32(hi_even.val[0]),
                        vreinterpretq_s16_s32(hi_odd.val[0]));
    cols[3] = vtrnq_s16(vreinterpretq_s16_s32(hi_even.val[1]),
                        vreinterpretq_s16_s32(hi_odd.val[1]));
    for (k = 0; k < 4; ++k) {
      vst1q_s16(t_buf, cols[k].val[0]);
      t_buf += 8;
      vst1q_s16(t_buf, cols[k].val[1]);
      t_buf += 8;
    }
  }
 }
 // Last add/sub stage of the first pass for one band.
 // On entry the register arguments hold partial results and `out` already
 // holds the intermediate rows 8-13 and 18-31 written earlier by the caller
 // (row r lives at out + r * 32). Each group below loads two stored rows,
 // combines them with register values by add/sub, and stores the finished
 // rows back, until all 32 rows of `out` are final.
 //   q6s16..q9s16   : finished rows 16, 17, 14, 15 (stored as-is)
 //   q2s16, q3s16   : partial sums that pair with stored rows 31 and 30
 //   q10s16..q15s16 : partial sums that pair with stored rows 13..8
 static INLINE void idct32_bands_end_1st_pass(int16_t *out, int16x8_t q2s16,
                                              int16x8_t q3s16, int16x8_t q6s16,
                                              int16x8_t q7s16, int16x8_t q8s16,
                                              int16x8_t q9s16, int16x8_t q10s16,
                                              int16x8_t q11s16, int16x8_t q12s16,
                                              int16x8_t q13s16, int16x8_t q14s16,
                                              int16x8_t q15s16) {
  int16x8_t q0s16, q1s16, q4s16, q5s16;
  // rows 14-17 arrive finished
  STORE_IN_OUTPUT(17, 16, 17, q6s16, q7s16);
  STORE_IN_OUTPUT(17, 14, 15, q8s16, q9s16);
  // rows 0, 1, 30, 31
  LOAD_FROM_OUTPUT(15, 30, 31, q0s16, q1s16);
  q4s16 = vaddq_s16(q2s16, q1s16);
  q5s16 = vaddq_s16(q3s16, q0s16);
  q6s16 = vsubq_s16(q3s16, q0s16);
  q7s16 = vsubq_s16(q2s16, q1s16);
  STORE_IN_OUTPUT(31, 30, 31, q6s16, q7s16);
  STORE_IN_OUTPUT(31, 0, 1, q4s16, q5s16);
  // rows 12, 13, 18, 19 (q2/q3 keep the sums needed for rows 2, 3, 28, 29)
  LOAD_FROM_OUTPUT(1, 12, 13, q0s16, q1s16);
  q2s16 = vaddq_s16(q10s16, q1s16);
  q3s16 = vaddq_s16(q11s16, q0s16);
  q4s16 = vsubq_s16(q11s16, q0s16);
  q5s16 = vsubq_s16(q10s16, q1s16);
  LOAD_FROM_OUTPUT(13, 18, 19, q0s16, q1s16);
  q8s16 = vaddq_s16(q4s16, q1s16);
  q9s16 = vaddq_s16(q5s16, q0s16);
  q6s16 = vsubq_s16(q5s16, q0s16);
  q7s16 = vsubq_s16(q4s16, q1s16);
  STORE_IN_OUTPUT(19, 18, 19, q6s16, q7s16);
  STORE_IN_OUTPUT(19, 12, 13, q8s16, q9s16);
  // rows 2, 3, 28, 29
  LOAD_FROM_OUTPUT(13, 28, 29, q0s16, q1s16);
  q4s16 = vaddq_s16(q2s16, q1s16);
  q5s16 = vaddq_s16(q3s16, q0s16);
  q6s16 = vsubq_s16(q3s16, q0s16);
  q7s16 = vsubq_s16(q2s16, q1s16);
  STORE_IN_OUTPUT(29, 28, 29, q6s16, q7s16);
  STORE_IN_OUTPUT(29, 2, 3, q4s16, q5s16);
  // rows 10, 11, 20, 21 (q2/q3 keep the sums needed for rows 4, 5, 26, 27)
  LOAD_FROM_OUTPUT(3, 10, 11, q0s16, q1s16);
  q2s16 = vaddq_s16(q12s16, q1s16);
  q3s16 = vaddq_s16(q13s16, q0s16);
  q4s16 = vsubq_s16(q13s16, q0s16);
  q5s16 = vsubq_s16(q12s16, q1s16);
  LOAD_FROM_OUTPUT(11, 20, 21, q0s16, q1s16);
  q8s16 = vaddq_s16(q4s16, q1s16);
  q9s16 = vaddq_s16(q5s16, q0s16);
  q6s16 = vsubq_s16(q5s16, q0s16);
  q7s16 = vsubq_s16(q4s16, q1s16);
  STORE_IN_OUTPUT(21, 20, 21, q6s16, q7s16);
  STORE_IN_OUTPUT(21, 10, 11, q8s16, q9s16);
  // rows 4, 5, 26, 27
  LOAD_FROM_OUTPUT(11, 26, 27, q0s16, q1s16);
  q4s16 = vaddq_s16(q2s16, q1s16);
  q5s16 = vaddq_s16(q3s16, q0s16);
  q6s16 = vsubq_s16(q3s16, q0s16);
  q7s16 = vsubq_s16(q2s16, q1s16);
  STORE_IN_OUTPUT(27, 26, 27, q6s16, q7s16);
  STORE_IN_OUTPUT(27, 4, 5, q4s16, q5s16);
  // rows 8, 9, 22, 23 (q2/q3 keep the sums needed for rows 6, 7, 24, 25)
  LOAD_FROM_OUTPUT(5, 8, 9, q0s16, q1s16);
  q2s16 = vaddq_s16(q14s16, q1s16);
  q3s16 = vaddq_s16(q15s16, q0s16);
  q4s16 = vsubq_s16(q15s16, q0s16);
  q5s16 = vsubq_s16(q14s16, q1s16);
  LOAD_FROM_OUTPUT(9, 22, 23, q0s16, q1s16);
  q8s16 = vaddq_s16(q4s16, q1s16);
  q9s16 = vaddq_s16(q5s16, q0s16);
  q6s16 = vsubq_s16(q5s16, q0s16);
  q7s16 = vsubq_s16(q4s16, q1s16);
  STORE_IN_OUTPUT(23, 22, 23, q6s16, q7s16);
  STORE_IN_OUTPUT(23, 8, 9, q8s16, q9s16);
  // rows 6, 7, 24, 25
  LOAD_FROM_OUTPUT(9, 24, 25, q0s16, q1s16);
  q4s16 = vaddq_s16(q2s16, q1s16);
  q5s16 = vaddq_s16(q3s16, q0s16);
  q6s16 = vsubq_s16(q3s16, q0s16);
  q7s16 = vsubq_s16(q2s16, q1s16);
  STORE_IN_OUTPUT(25, 24, 25, q6s16, q7s16);
  STORE_IN_OUTPUT(25, 6, 7, q4s16, q5s16);
  return;
 }
 // Last add/sub stage of the second pass for one band of 8 pixel columns.
 // Performs the same add/sub sequence as idct32_bands_end_1st_pass, but
 // instead of storing the finished rows back to `out` it adds them to the
 // destination picture via STORE_COMBINE_CENTER_RESULTS /
 // STORE_COMBINE_EXTREME_RESULTS. `out` is only read here: it supplies the
 // intermediate rows stored earlier by the caller (row r at out + r * 32).
 //   dest   : top-left pixel of this 8-column band
 //   stride : distance in bytes between destination rows
 static INLINE void idct32_bands_end_2nd_pass(
    int16_t *out, uint8_t *dest, int stride, int16x8_t q2s16, int16x8_t q3s16,
    int16x8_t q6s16, int16x8_t q7s16, int16x8_t q8s16, int16x8_t q9s16,
    int16x8_t q10s16, int16x8_t q11s16, int16x8_t q12s16, int16x8_t q13s16,
    int16x8_t q14s16, int16x8_t q15s16) {
  // Four row cursors: r7 walks down from row 0, r6 walks up from row 31
  // (the "extreme" rows); r10 walks down from row 16, r9 walks up from row 15
  // (the "center" rows). Each cursor moves two rows per step.
  uint8_t *r6 = dest + 31 * stride;
  uint8_t *r7 = dest /* +  0 * stride*/;
  uint8_t *r9 = dest + 15 * stride;
  uint8_t *r10 = dest + 16 * stride;
  int str2 = stride << 1;
  int16x8_t q0s16, q1s16, q4s16, q5s16;
  // rows 14-17 arrive finished in q6s16..q9s16
  STORE_COMBINE_CENTER_RESULTS(r10, r9);
  r10 += str2;
  r9 -= str2;
  // rows 0, 1, 30, 31
  LOAD_FROM_OUTPUT(17, 30, 31, q0s16, q1s16)
  q4s16 = vaddq_s16(q2s16, q1s16);
  q5s16 = vaddq_s16(q3s16, q0s16);
  q6s16 = vsubq_s16(q3s16, q0s16);
  q7s16 = vsubq_s16(q2s16, q1s16);
  STORE_COMBINE_EXTREME_RESULTS(r7, r6);
  r7 += str2;
  r6 -= str2;
  // rows 12, 13, 18, 19
  LOAD_FROM_OUTPUT(31, 12, 13, q0s16, q1s16)
  q2s16 = vaddq_s16(q10s16, q1s16);
  q3s16 = vaddq_s16(q11s16, q0s16);
  q4s16 = vsubq_s16(q11s16, q0s16);
  q5s16 = vsubq_s16(q10s16, q1s16);
  LOAD_FROM_OUTPUT(13, 18, 19, q0s16, q1s16)
  q8s16 = vaddq_s16(q4s16, q1s16);
  q9s16 = vaddq_s16(q5s16, q0s16);
  q6s16 = vsubq_s16(q5s16, q0s16);
  q7s16 = vsubq_s16(q4s16, q1s16);
  STORE_COMBINE_CENTER_RESULTS(r10, r9);
  r10 += str2;
  r9 -= str2;
  // rows 2, 3, 28, 29
  LOAD_FROM_OUTPUT(19, 28, 29, q0s16, q1s16)
  q4s16 = vaddq_s16(q2s16, q1s16);
  q5s16 = vaddq_s16(q3s16, q0s16);
  q6s16 = vsubq_s16(q3s16, q0s16);
  q7s16 = vsubq_s16(q2s16, q1s16);
  STORE_COMBINE_EXTREME_RESULTS(r7, r6);
  r7 += str2;
  r6 -= str2;
  // rows 10, 11, 20, 21
  LOAD_FROM_OUTPUT(29, 10, 11, q0s16, q1s16)
  q2s16 = vaddq_s16(q12s16, q1s16);
  q3s16 = vaddq_s16(q13s16, q0s16);
  q4s16 = vsubq_s16(q13s16, q0s16);
  q5s16 = vsubq_s16(q12s16, q1s16);
  LOAD_FROM_OUTPUT(11, 20, 21, q0s16, q1s16)
  q8s16 = vaddq_s16(q4s16, q1s16);
  q9s16 = vaddq_s16(q5s16, q0s16);
  q6s16 = vsubq_s16(q5s16, q0s16);
  q7s16 = vsubq_s16(q4s16, q1s16);
  STORE_COMBINE_CENTER_RESULTS(r10, r9);
  r10 += str2;
  r9 -= str2;
  // rows 4, 5, 26, 27
  LOAD_FROM_OUTPUT(21, 26, 27, q0s16, q1s16)
  q4s16 = vaddq_s16(q2s16, q1s16);
  q5s16 = vaddq_s16(q3s16, q0s16);
  q6s16 = vsubq_s16(q3s16, q0s16);
  q7s16 = vsubq_s16(q2s16, q1s16);
  STORE_COMBINE_EXTREME_RESULTS(r7, r6);
  r7 += str2;
  r6 -= str2;
  // rows 8, 9, 22, 23
  LOAD_FROM_OUTPUT(27, 8, 9, q0s16, q1s16)
  q2s16 = vaddq_s16(q14s16, q1s16);
  q3s16 = vaddq_s16(q15s16, q0s16);
  q4s16 = vsubq_s16(q15s16, q0s16);
  q5s16 = vsubq_s16(q14s16, q1s16);
  LOAD_FROM_OUTPUT(9, 22, 23, q0s16, q1s16)
  q8s16 = vaddq_s16(q4s16, q1s16);
  q9s16 = vaddq_s16(q5s16, q0s16);
  q6s16 = vsubq_s16(q5s16, q0s16);
  q7s16 = vsubq_s16(q4s16, q1s16);
  STORE_COMBINE_CENTER_RESULTS(r10, r9);
  // rows 6, 7, 24, 25 (last group, cursors are not advanced again)
  LOAD_FROM_OUTPUT(23, 24, 25, q0s16, q1s16)
  q4s16 = vaddq_s16(q2s16, q1s16);
  q5s16 = vaddq_s16(q3s16, q0s16);
  q6s16 = vsubq_s16(q3s16, q0s16);
  q7s16 = vsubq_s16(q2s16, q1s16);
  STORE_COMBINE_EXTREME_RESULTS(r7, r6);
  return;
 }
 // Full 32x32 inverse DCT whose result is added to the destination picture.
 //   input  : 32x32 coefficients, read as rows of 32 int16_t values
 //   dest   : top-left destination pixel; each result is rounded, added to
 //            the existing pixel and saturated to 8 bits
 //   stride : distance in bytes between destination rows
 // The transform runs as two passes over the block. Each pass handles four
 // bands of 8 rows: a band is transposed into trans_buf, pushed through the
 // 1-D butterfly network (blocks A-D below, using `out` as scratch for
 // intermediate rows), and finished by idct32_bands_end_1st_pass (results to
 // pass1) or idct32_bands_end_2nd_pass (results added to dest).
 // q13s16 / q14s16 are the implicit inputs of every DO_BUTTERFLY_STD call,
 // so the order of the statements that assign them matters.
 void aom_idct32x32_1024_add_neon(int16_t *input, uint8_t *dest, int stride) {
  int i, idct32_pass_loop;
  int16_t trans_buf[32 * 8];
  int16_t pass1[32 * 32];
  int16_t pass2[32 * 32];
  int16_t *out;
  int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
  int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
  for (idct32_pass_loop = 0, out = pass1; idct32_pass_loop < 2;
       idct32_pass_loop++,
      input = pass1,  // the input of pass2 is the result of pass1
       out = pass2) {
    // Each band consumes 8 input rows and produces 8 output columns.
    for (i = 0; i < 4; i++, input += 32 * 8, out += 8) {  // idct32_bands_loop
      idct32_transpose_pair(input, trans_buf);
      // -----------------------------------------
      // BLOCK A: 16-19,28-31
      // -----------------------------------------
      // generate 16,17,30,31
      // part of stage 1
      LOAD_FROM_TRANSPOSED(0, 1, 31)
      DO_BUTTERFLY_STD(cospi_31_64, cospi_1_64, &q0s16, &q2s16)
      LOAD_FROM_TRANSPOSED(31, 17, 15)
      DO_BUTTERFLY_STD(cospi_15_64, cospi_17_64, &q1s16, &q3s16)
      // part of stage 2
      q4s16 = vaddq_s16(q0s16, q1s16);
      q13s16 = vsubq_s16(q0s16, q1s16);
      q6s16 = vaddq_s16(q2s16, q3s16);
      q14s16 = vsubq_s16(q2s16, q3s16);
      // part of stage 3
      DO_BUTTERFLY_STD(cospi_28_64, cospi_4_64, &q5s16, &q7s16)
      // generate 18,19,28,29
      // part of stage 1
      LOAD_FROM_TRANSPOSED(15, 9, 23)
      DO_BUTTERFLY_STD(cospi_23_64, cospi_9_64, &q0s16, &q2s16)
      LOAD_FROM_TRANSPOSED(23, 25, 7)
      DO_BUTTERFLY_STD(cospi_7_64, cospi_25_64, &q1s16, &q3s16)
      // part of stage 2
      q13s16 = vsubq_s16(q3s16, q2s16);
      q3s16 = vaddq_s16(q3s16, q2s16);
      q14s16 = vsubq_s16(q1s16, q0s16);
      q2s16 = vaddq_s16(q1s16, q0s16);
      // part of stage 3
      DO_BUTTERFLY_STD(-cospi_4_64, -cospi_28_64, &q1s16, &q0s16)
      // part of stage 4
      q8s16 = vaddq_s16(q4s16, q2s16);
      q9s16 = vaddq_s16(q5s16, q0s16);
      q10s16 = vaddq_s16(q7s16, q1s16);
      q15s16 = vaddq_s16(q6s16, q3s16);
      q13s16 = vsubq_s16(q5s16, q0s16);
      q14s16 = vsubq_s16(q7s16, q1s16);
      STORE_IN_OUTPUT(0, 16, 31, q8s16, q15s16)
      STORE_IN_OUTPUT(31, 17, 30, q9s16, q10s16)
      // part of stage 5
      DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q0s16, &q1s16)
      STORE_IN_OUTPUT(30, 29, 18, q1s16, q0s16)
      // part of stage 4
      q13s16 = vsubq_s16(q4s16, q2s16);
      q14s16 = vsubq_s16(q6s16, q3s16);
      // part of stage 5
      DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q4s16, &q6s16)
      STORE_IN_OUTPUT(18, 19, 28, q4s16, q6s16)
      // -----------------------------------------
      // BLOCK B: 20-23,24-27
      // -----------------------------------------
      // generate 20,21,26,27
      // part of stage 1
      LOAD_FROM_TRANSPOSED(7, 5, 27)
      DO_BUTTERFLY_STD(cospi_27_64, cospi_5_64, &q0s16, &q2s16)
      LOAD_FROM_TRANSPOSED(27, 21, 11)
      DO_BUTTERFLY_STD(cospi_11_64, cospi_21_64, &q1s16, &q3s16)
      // part of stage 2
      q13s16 = vsubq_s16(q0s16, q1s16);
      q0s16 = vaddq_s16(q0s16, q1s16);
      q14s16 = vsubq_s16(q2s16, q3s16);
      q2s16 = vaddq_s16(q2s16, q3s16);
      // part of stage 3
      DO_BUTTERFLY_STD(cospi_12_64, cospi_20_64, &q1s16, &q3s16)
      // generate 22,23,24,25
      // part of stage 1
      LOAD_FROM_TRANSPOSED(11, 13, 19)
      DO_BUTTERFLY_STD(cospi_19_64, cospi_13_64, &q5s16, &q7s16)
      LOAD_FROM_TRANSPOSED(19, 29, 3)
      DO_BUTTERFLY_STD(cospi_3_64, cospi_29_64, &q4s16, &q6s16)
      // part of stage 2
      q14s16 = vsubq_s16(q4s16, q5s16);
      q5s16 = vaddq_s16(q4s16, q5s16);
      q13s16 = vsubq_s16(q6s16, q7s16);
      q6s16 = vaddq_s16(q6s16, q7s16);
      // part of stage 3
      DO_BUTTERFLY_STD(-cospi_20_64, -cospi_12_64, &q4s16, &q7s16)
      // part of stage 4
      q10s16 = vaddq_s16(q7s16, q1s16);
      q11s16 = vaddq_s16(q5s16, q0s16);
      q12s16 = vaddq_s16(q6s16, q2s16);
      q15s16 = vaddq_s16(q4s16, q3s16);
      // part of stage 6
      LOAD_FROM_OUTPUT(28, 16, 17, q14s16, q13s16)
      q8s16 = vaddq_s16(q14s16, q11s16);
      q9s16 = vaddq_s16(q13s16, q10s16);
      q13s16 = vsubq_s16(q13s16, q10s16);
      q11s16 = vsubq_s16(q14s16, q11s16);
      STORE_IN_OUTPUT(17, 17, 16, q9s16, q8s16)
      LOAD_FROM_OUTPUT(16, 30, 31, q14s16, q9s16)
      q8s16 = vsubq_s16(q9s16, q12s16);
      q10s16 = vaddq_s16(q14s16, q15s16);
      q14s16 = vsubq_s16(q14s16, q15s16);
      q12s16 = vaddq_s16(q9s16, q12s16);
      STORE_IN_OUTPUT(31, 30, 31, q10s16, q12s16)
      // part of stage 7
      DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16)
      STORE_IN_OUTPUT(31, 25, 22, q14s16, q13s16)
      q13s16 = q11s16;
      q14s16 = q8s16;
      DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16)
      STORE_IN_OUTPUT(22, 24, 23, q14s16, q13s16)
      // part of stage 4
      q14s16 = vsubq_s16(q5s16, q0s16);
      q13s16 = vsubq_s16(q6s16, q2s16);
      DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q5s16, &q6s16);
      q14s16 = vsubq_s16(q7s16, q1s16);
      q13s16 = vsubq_s16(q4s16, q3s16);
      DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q0s16, &q1s16);
      // part of stage 6
      LOAD_FROM_OUTPUT(23, 18, 19, q14s16, q13s16)
      q8s16 = vaddq_s16(q14s16, q1s16);
      q9s16 = vaddq_s16(q13s16, q6s16);
      q13s16 = vsubq_s16(q13s16, q6s16);
      q1s16 = vsubq_s16(q14s16, q1s16);
      STORE_IN_OUTPUT(19, 18, 19, q8s16, q9s16)
      LOAD_FROM_OUTPUT(19, 28, 29, q8s16, q9s16)
      q14s16 = vsubq_s16(q8s16, q5s16);
      q10s16 = vaddq_s16(q8s16, q5s16);
      q11s16 = vaddq_s16(q9s16, q0s16);
      q0s16 = vsubq_s16(q9s16, q0s16);
      STORE_IN_OUTPUT(29, 28, 29, q10s16, q11s16)
      // part of stage 7
      DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16)
      STORE_IN_OUTPUT(29, 20, 27, q13s16, q14s16)
      // explicit inputs here: q0s16/q1s16 instead of the usual q14s16/q13s16
      DO_BUTTERFLY(q0s16, q1s16, cospi_16_64, cospi_16_64, &q1s16, &q0s16);
      STORE_IN_OUTPUT(27, 21, 26, q1s16, q0s16)
      // -----------------------------------------
      // BLOCK C: 8-11,12-15
      // -----------------------------------------
      // generate 8,9,14,15
      // part of stage 2
      LOAD_FROM_TRANSPOSED(3, 2, 30)
      DO_BUTTERFLY_STD(cospi_30_64, cospi_2_64, &q0s16, &q2s16)
      LOAD_FROM_TRANSPOSED(30, 18, 14)
      DO_BUTTERFLY_STD(cospi_14_64, cospi_18_64, &q1s16, &q3s16)
      // part of stage 3
      q13s16 = vsubq_s16(q0s16, q1s16);
      q0s16 = vaddq_s16(q0s16, q1s16);
      q14s16 = vsubq_s16(q2s16, q3s16);
      q2s16 = vaddq_s16(q2s16, q3s16);
      // part of stage 4
      DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q1s16, &q3s16)
      // generate 10,11,12,13
      // part of stage 2
      LOAD_FROM_TRANSPOSED(14, 10, 22)
      DO_BUTTERFLY_STD(cospi_22_64, cospi_10_64, &q5s16, &q7s16)
      LOAD_FROM_TRANSPOSED(22, 26, 6)
      DO_BUTTERFLY_STD(cospi_6_64, cospi_26_64, &q4s16, &q6s16)
      // part of stage 3
      q14s16 = vsubq_s16(q4s16, q5s16);
      q5s16 = vaddq_s16(q4s16, q5s16);
      q13s16 = vsubq_s16(q6s16, q7s16);
      q6s16 = vaddq_s16(q6s16, q7s16);
      // part of stage 4
      DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q4s16, &q7s16)
      // part of stage 5
      q8s16 = vaddq_s16(q0s16, q5s16);
      q9s16 = vaddq_s16(q1s16, q7s16);
      q13s16 = vsubq_s16(q1s16, q7s16);
      q14s16 = vsubq_s16(q3s16, q4s16);
      q10s16 = vaddq_s16(q3s16, q4s16);
      q15s16 = vaddq_s16(q2s16, q6s16);
      STORE_IN_OUTPUT(26, 8, 15, q8s16, q15s16)
      STORE_IN_OUTPUT(15, 9, 14, q9s16, q10s16)
      // part of stage 6
      DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16)
      STORE_IN_OUTPUT(14, 13, 10, q3s16, q1s16)
      q13s16 = vsubq_s16(q0s16, q5s16);
      q14s16 = vsubq_s16(q2s16, q6s16);
      DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16)
      STORE_IN_OUTPUT(10, 11, 12, q1s16, q3s16)
      // -----------------------------------------
      // BLOCK D: 0-3,4-7
      // -----------------------------------------
      // generate 4,5,6,7
      // part of stage 3
      LOAD_FROM_TRANSPOSED(6, 4, 28)
      DO_BUTTERFLY_STD(cospi_28_64, cospi_4_64, &q0s16, &q2s16)
      LOAD_FROM_TRANSPOSED(28, 20, 12)
      DO_BUTTERFLY_STD(cospi_12_64, cospi_20_64, &q1s16, &q3s16)
      // part of stage 4
      q13s16 = vsubq_s16(q0s16, q1s16);
      q0s16 = vaddq_s16(q0s16, q1s16);
      q14s16 = vsubq_s16(q2s16, q3s16);
      q2s16 = vaddq_s16(q2s16, q3s16);
      // part of stage 5
      DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16)
      // generate 0,1,2,3
      // part of stage 4
      LOAD_FROM_TRANSPOSED(12, 0, 16)
      DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q5s16, &q7s16)
      LOAD_FROM_TRANSPOSED(16, 8, 24)
      DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q14s16, &q6s16)
      // part of stage 5
      q4s16 = vaddq_s16(q7s16, q6s16);
      q7s16 = vsubq_s16(q7s16, q6s16);
      q6s16 = vsubq_s16(q5s16, q14s16);
      q5s16 = vaddq_s16(q5s16, q14s16);
      // part of stage 6
      q8s16 = vaddq_s16(q4s16, q2s16);
      q9s16 = vaddq_s16(q5s16, q3s16);
      q10s16 = vaddq_s16(q6s16, q1s16);
      q11s16 = vaddq_s16(q7s16, q0s16);
      q12s16 = vsubq_s16(q7s16, q0s16);
      q13s16 = vsubq_s16(q6s16, q1s16);
      q14s16 = vsubq_s16(q5s16, q3s16);
      q15s16 = vsubq_s16(q4s16, q2s16);
      // part of stage 7
      LOAD_FROM_OUTPUT(12, 14, 15, q0s16, q1s16)
      q2s16 = vaddq_s16(q8s16, q1s16);
      q3s16 = vaddq_s16(q9s16, q0s16);
      q4s16 = vsubq_s16(q9s16, q0s16);
      q5s16 = vsubq_s16(q8s16, q1s16);
      LOAD_FROM_OUTPUT(15, 16, 17, q0s16, q1s16)
      q8s16 = vaddq_s16(q4s16, q1s16);
      q9s16 = vaddq_s16(q5s16, q0s16);
      q6s16 = vsubq_s16(q5s16, q0s16);
      q7s16 = vsubq_s16(q4s16, q1s16);
      // Finish the band: pass 0 writes every row to `out`, pass 1 adds the
      // rows to the picture and then moves on to the next 8 pixel columns.
      if (idct32_pass_loop == 0) {
        idct32_bands_end_1st_pass(out, q2s16, q3s16, q6s16, q7s16, q8s16, q9s16,
                                  q10s16, q11s16, q12s16, q13s16, q14s16,
                                  q15s16);
      } else {
        idct32_bands_end_2nd_pass(out, dest, stride, q2s16, q3s16, q6s16, q7s16,
                                  q8s16, q9s16, q10s16, q11s16, q12s16, q13s16,
                                  q14s16, q15s16);
        dest += 8;
      }
    }
  }
  return;
 }
--- a/aom_dsp/arm/idct4x4_1_add_neon.asm
+++ b/aom_dsp/arm/idct4x4_1_add_neon.asm
@@ -1,28 +1,31 @@
 ;
-;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
 ;
-;  Use of this source code is governed by a BSD-style license and patent
+; This source code is subject to the terms of the BSD 2 Clause License and
-;  grant that can be found in the LICENSE file in the root of the source
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-;  tree. All contributing project authors may be found in the AUTHORS
+; was not distributed with this source code in the LICENSE file, you can
-;  file in the root of the source tree.
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
 ; Media Patent License 1.0 was not distributed with this source code in the
 ; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 ;
-    EXPORT  |vpx_idct4x4_1_add_neon|
+
    EXPORT  |aom_idct4x4_1_add_neon|
    ARM
    REQUIRE8
    PRESERVE8
    AREA ||.text||, CODE, READONLY, ALIGN=2
-;void vpx_idct4x4_1_add_neon(int16_t *input, uint8_t *dest,
+;void aom_idct4x4_1_add_neon(int16_t *input, uint8_t *dest,
 ;                                  int dest_stride)
 ;
 ; r0  int16_t *input
 ; r1  uint8_t *dest
 ; r2  int dest_stride
-|vpx_idct4x4_1_add_neon| PROC
+|aom_idct4x4_1_add_neon| PROC
    ldrsh            r0, [r0]
    ; generate cospi_16_64 = 11585
@@ -63,6 +66,6 @@
    vst1.32          {d7[1]}, [r12]
    bx               lr
-    ENDP             ; |vpx_idct4x4_1_add_neon|
+    ENDP             ; |aom_idct4x4_1_add_neon|
    END
--- a/aom_dsp/arm/idct4x4_1_add_neon.c
+++ b/aom_dsp/arm/idct4x4_1_add_neon.c
@@ -0,0 +1,47 @@
 /*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
 #include <arm_neon.h>
 #include "aom_dsp/inv_txfm.h"
 #include "aom_ports/mem.h"
 // DC-only 4x4 inverse transform: derives one value from input[0] and adds
 // it, with saturation to 8 bits, to every pixel of the 4x4 block at `dest`
 // (rows are `dest_stride` bytes apart).
 void aom_idct4x4_1_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
  uint8_t *load_ptr = dest;
  uint8_t *store_ptr = dest;
  uint32x2_t rows_u32 = vdup_n_u32(0);
  uint16x8_t sum_u16;
  uint8x8_t clipped_u8;
  int16x8_t dc_s16;
  int16_t dc, rounded_dc;
  int pair;
  // The DC term is scaled by cospi_16_64 twice, then rounded down by 4 bits.
  dc = dct_const_round_shift(input[0] * cospi_16_64);
  dc = dct_const_round_shift(dc * cospi_16_64);
  rounded_dc = ROUND_POWER_OF_TWO(dc, 4);
  dc_s16 = vdupq_n_s16(rounded_dc);
  // dc_only_idct_add: two rows (2 x 4 pixels) per iteration.
  for (pair = 0; pair < 2; ++pair) {
    rows_u32 = vld1_lane_u32((const uint32_t *)load_ptr, rows_u32, 0);
    load_ptr += dest_stride;
    rows_u32 = vld1_lane_u32((const uint32_t *)load_ptr, rows_u32, 1);
    load_ptr += dest_stride;
    sum_u16 =
        vaddw_u8(vreinterpretq_u16_s16(dc_s16), vreinterpret_u8_u32(rows_u32));
    clipped_u8 = vqmovun_s16(vreinterpretq_s16_u16(sum_u16));
    vst1_lane_u32((uint32_t *)store_ptr, vreinterpret_u32_u8(clipped_u8), 0);
    store_ptr += dest_stride;
    vst1_lane_u32((uint32_t *)store_ptr, vreinterpret_u32_u8(clipped_u8), 1);
    store_ptr += dest_stride;
  }
 }
--- a/aom_dsp/arm/idct4x4_add_neon.asm
+++ b/aom_dsp/arm/idct4x4_add_neon.asm
@@ -1,14 +1,17 @@
 ;
-;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
 ;
-;  Use of this source code is governed by a BSD-style license
+; This source code is subject to the terms of the BSD 2 Clause License and
-;  that can be found in the LICENSE file in the root of the source
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-;  tree. An additional intellectual property rights grant can be found
+; was not distributed with this source code in the LICENSE file, you can
-;  in the file PATENTS.  All contributing project authors may
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-;  be found in the AUTHORS file in the root of the source tree.
+; Media Patent License 1.0 was not distributed with this source code in the
 ; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 ;
-    EXPORT  |vpx_idct4x4_16_add_neon|
+;
    EXPORT  |aom_idct4x4_16_add_neon|
    ARM
    REQUIRE8
    PRESERVE8
@@ -16,13 +19,13 @@
    AREA ||.text||, CODE, READONLY, ALIGN=2
    AREA     Block, CODE, READONLY ; name this block of code
-;void vpx_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
+;void aom_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
 ;
 ; r0  int16_t *input
 ; r1  uint8_t *dest
 ; r2  int dest_stride
-|vpx_idct4x4_16_add_neon| PROC
+|aom_idct4x4_16_add_neon| PROC
    ; The 2D transform is done with two passes which are actually pretty
    ; similar. We first transform the rows. This is done by transposing
@@ -185,6 +188,6 @@
    vst1.32 {d26[1]}, [r1], r2
    vst1.32 {d26[0]}, [r1]  ; no post-increment
    bx              lr
-    ENDP  ; |vpx_idct4x4_16_add_neon|
+    ENDP  ; |aom_idct4x4_16_add_neon|
    END
--- a/aom_dsp/arm/idct4x4_add_neon.c
+++ b/aom_dsp/arm/idct4x4_add_neon.c
@@ -0,0 +1,146 @@
 /*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
 #include <arm_neon.h>
 #include "aom_dsp/txfm_common.h"
 // Applies a full 4x4 inverse transform to the 16 coefficients at `input` and
 // adds the result to the 4x4 block of pixels at `dest`.
 //
 // input:       16 consecutive int16_t coefficients (read as two 8-lane loads).
 // dest:        top-left pixel of the block; each row is read and written
 //              through 32-bit lane loads/stores.
 // dest_stride: distance in bytes between consecutive rows of `dest`.
 //
 // The work is done as two passes of the same 1-D butterfly with a 4x4
 // transpose in front of each pass. The sums are saturated to the uint8_t range
 // before being stored.
 void aom_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
  uint8x8_t d26u8, d27u8;
  uint32x2_t d26u32, d27u32;
  uint16x8_t q8u16, q9u16;
  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16;
  int16x4_t d22s16, d23s16, d24s16, d26s16, d27s16, d28s16, d29s16;
  int16x8_t q8s16, q9s16, q13s16, q14s16;
  int32x4_t q1s32, q13s32, q14s32, q15s32;
  int16x4x2_t d0x2s16, d1x2s16;
  int32x4x2_t q0x2s32;
  uint8_t *d;
  // Zero both pixel vectors so the single-lane loads further down start from
  // a defined value.
  d26u32 = d27u32 = vdup_n_u32(0);
  // Load all 16 coefficients: input[0..7] and input[8..15].
  q8s16 = vld1q_s16(input);
  q9s16 = vld1q_s16(input + 8);
  // 4x4 transpose, step 1: interleave 16-bit lanes of adjacent 4-lane rows.
  d16s16 = vget_low_s16(q8s16);
  d17s16 = vget_high_s16(q8s16);
  d18s16 = vget_low_s16(q9s16);
  d19s16 = vget_high_s16(q9s16);
  d0x2s16 = vtrn_s16(d16s16, d17s16);
  d1x2s16 = vtrn_s16(d18s16, d19s16);
  q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]);
  q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]);
  // Multiplier constants, reused unchanged by the second pass.
  d20s16 = vdup_n_s16((int16_t)cospi_8_64);
  d21s16 = vdup_n_s16((int16_t)cospi_16_64);
  // 4x4 transpose, step 2: interleave 32-bit lanes. Afterwards d16..d19 hold
  // the four transposed rows.
  q0x2s32 =
      vtrnq_s32(vreinterpretq_s32_s16(q8s16), vreinterpretq_s32_s16(q9s16));
  d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
  d17s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
  d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
  d19s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
  d22s16 = vdup_n_s16((int16_t)cospi_24_64);
  // stage 1
  // Even part: (d16 +/- d18) * cospi_16_64.
  // Odd part:  d17 * cospi_24_64 - d19 * cospi_8_64 and
  //            d17 * cospi_8_64 + d19 * cospi_24_64.
  d23s16 = vadd_s16(d16s16, d18s16);
  d24s16 = vsub_s16(d16s16, d18s16);
  q15s32 = vmull_s16(d17s16, d22s16);
  q1s32 = vmull_s16(d17s16, d20s16);
  q13s32 = vmull_s16(d23s16, d21s16);
  q14s32 = vmull_s16(d24s16, d21s16);
  q15s32 = vmlsl_s16(q15s32, d19s16, d20s16);
  q1s32 = vmlal_s16(q1s32, d19s16, d22s16);
  // Narrow the 32-bit products back to 16 bits with a rounding, saturating
  // right shift by 14.
  d26s16 = vqrshrn_n_s32(q13s32, 14);
  d27s16 = vqrshrn_n_s32(q14s32, 14);
  d29s16 = vqrshrn_n_s32(q15s32, 14);
  d28s16 = vqrshrn_n_s32(q1s32, 14);
  q13s16 = vcombine_s16(d26s16, d27s16);
  q14s16 = vcombine_s16(d28s16, d29s16);
  // stage 2
  // q8 = even + odd, q9 = even - odd.
  q8s16 = vaddq_s16(q13s16, q14s16);
  q9s16 = vsubq_s16(q13s16, q14s16);
  // The halves of q9 come out in reverse order, so they are swapped while
  // splitting the vectors for the second transpose.
  d16s16 = vget_low_s16(q8s16);
  d17s16 = vget_high_s16(q8s16);
  d18s16 = vget_high_s16(q9s16);  // vswp d18 d19
  d19s16 = vget_low_s16(q9s16);
  // Second 4x4 transpose, same two steps as above.
  d0x2s16 = vtrn_s16(d16s16, d17s16);
  d1x2s16 = vtrn_s16(d18s16, d19s16);
  q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]);
  q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]);
  q0x2s32 =
      vtrnq_s32(vreinterpretq_s32_s16(q8s16), vreinterpretq_s32_s16(q9s16));
  d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
  d17s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
  d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
  d19s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
  // do the transform on columns
  // stage 1
  // Same butterfly as the first pass, using the constants still held in
  // d20..d22.
  d23s16 = vadd_s16(d16s16, d18s16);
  d24s16 = vsub_s16(d16s16, d18s16);
  q15s32 = vmull_s16(d17s16, d22s16);
  q1s32 = vmull_s16(d17s16, d20s16);
  q13s32 = vmull_s16(d23s16, d21s16);
  q14s32 = vmull_s16(d24s16, d21s16);
  q15s32 = vmlsl_s16(q15s32, d19s16, d20s16);
  q1s32 = vmlal_s16(q1s32, d19s16, d22s16);
  d26s16 = vqrshrn_n_s32(q13s32, 14);
  d27s16 = vqrshrn_n_s32(q14s32, 14);
  d29s16 = vqrshrn_n_s32(q15s32, 14);
  d28s16 = vqrshrn_n_s32(q1s32, 14);
  q13s16 = vcombine_s16(d26s16, d27s16);
  q14s16 = vcombine_s16(d28s16, d29s16);
  // stage 2
  // No half swap this time: q9 keeps its reversed halves, and the lane
  // numbers used for the pixel loads and stores below account for that.
  q8s16 = vaddq_s16(q13s16, q14s16);
  q9s16 = vsubq_s16(q13s16, q14s16);
  // Final rounding right shift by 4.
  q8s16 = vrshrq_n_s16(q8s16, 4);
  q9s16 = vrshrq_n_s16(q9s16, 4);
  // Gather the four destination rows, 4 bytes each. The first two rows go to
  // lanes 0 and 1 of d26; the last two go to lanes 1 and 0 of d27 (reversed)
  // to line up with the half order of q9.
  d = dest;
  d26u32 = vld1_lane_u32((const uint32_t *)d, d26u32, 0);
  d += dest_stride;
  d26u32 = vld1_lane_u32((const uint32_t *)d, d26u32, 1);
  d += dest_stride;
  d27u32 = vld1_lane_u32((const uint32_t *)d, d27u32, 1);
  d += dest_stride;
  d27u32 = vld1_lane_u32((const uint32_t *)d, d27u32, 0);
  // Widen the pixels to 16 bits and add them to the residual.
  q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u32(d26u32));
  q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u32(d27u32));
  // Treat the sums as signed and saturate them to unsigned 8 bits.
  d26u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
  d27u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
  // Scatter the rows back using the same lane order as the loads.
  d = dest;
  vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d26u8), 0);
  d += dest_stride;
  vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d26u8), 1);
  d += dest_stride;
  vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d27u8), 1);
  d += dest_stride;
  vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d27u8), 0);
  return;
 }
--- a/aom_dsp/arm/idct8x8_1_add_neon.asm
+++ b/aom_dsp/arm/idct8x8_1_add_neon.asm
@@ -1,28 +1,31 @@
 ;
-;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
 ;
-;  Use of this source code is governed by a BSD-style license and patent
+; This source code is subject to the terms of the BSD 2 Clause License and
-;  grant that can be found in the LICENSE file in the root of the source
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-;  tree. All contributing project authors may be found in the AUTHORS
+; was not distributed with this source code in the LICENSE file, you can
-;  file in the root of the source tree.
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
 ; Media Patent License 1.0 was not distributed with this source code in the
 ; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 ;
-    EXPORT  |vpx_idct8x8_1_add_neon|
+
    EXPORT  |aom_idct8x8_1_add_neon|
    ARM
    REQUIRE8
    PRESERVE8
    AREA ||.text||, CODE, READONLY, ALIGN=2
-;void vpx_idct8x8_1_add_neon(int16_t *input, uint8_t *dest,
+;void aom_idct8x8_1_add_neon(int16_t *input, uint8_t *dest,
 ;                                  int dest_stride)
 ;
 ; r0  int16_t input
 ; r1  uint8_t *dest
 ; r2  int dest_stride)
-|vpx_idct8x8_1_add_neon| PROC
+|aom_idct8x8_1_add_neon| PROC
    ldrsh            r0, [r0]
    ; generate cospi_16_64 = 11585
@@ -83,6 +86,6 @@
    vst1.64          {d31}, [r12], r2
    bx               lr
-    ENDP             ; |vpx_idct8x8_1_add_neon|
+    ENDP             ; |aom_idct8x8_1_add_neon|
    END
--- a/aom_dsp/arm/idct8x8_1_add_neon.c
+++ b/aom_dsp/arm/idct8x8_1_add_neon.c
@@ -0,0 +1,62 @@
 /*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
 #include <arm_neon.h>
 #include "aom_dsp/inv_txfm.h"
 #include "aom_ports/mem.h"
 // DC-only 8x8 inverse transform: derives a single value from input[0] and
 // adds it to every pixel of the 8x8 block at `dest`.
 //
 // input:       coefficient buffer; only input[0] is read.
 // dest:        top-left pixel of the block; 8 bytes are read and written per
 //              row, for 8 rows.
 // dest_stride: distance in bytes between consecutive rows of `dest`.
 //
 // Pixels are loaded and stored with the byte-wise vld1_u8/vst1_u8 intrinsics,
 // so `dest` is never accessed through a uint64_t pointer and no 8-byte
 // alignment is implied for it.
 void aom_idct8x8_1_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
  uint8x8_t d2u8, d3u8, d30u8, d31u8;
  uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16;
  uint8_t *d1, *d2;
  int i;
  int16_t a1;
  // The DC coefficient is scaled by cospi_16_64 twice (presumably once per
  // transform dimension -- confirm against the C reference), then rounded
  // with ROUND_POWER_OF_TWO(out, 5).
  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
  out = dct_const_round_shift(out * cospi_16_64);
  a1 = ROUND_POWER_OF_TWO(out, 5);
  // Broadcast the offset to all eight 16-bit lanes. It is reinterpreted as
  // unsigned only so that it can be fed to vaddw_u8.
  q0u16 = vreinterpretq_u16_s16(vdupq_n_s16(a1));
  // d1 walks the rows being read, d2 the rows being written.
  d1 = d2 = dest;
  // Two iterations of four rows each cover the 8 rows of the block.
  for (i = 0; i < 2; i++) {
    d2u8 = vld1_u8(d1);
    d1 += dest_stride;
    d3u8 = vld1_u8(d1);
    d1 += dest_stride;
    d30u8 = vld1_u8(d1);
    d1 += dest_stride;
    d31u8 = vld1_u8(d1);
    d1 += dest_stride;
    // Widen the pixels to 16 bits and add the offset.
    q9u16 = vaddw_u8(q0u16, d2u8);
    q10u16 = vaddw_u8(q0u16, d3u8);
    q11u16 = vaddw_u8(q0u16, d30u8);
    q12u16 = vaddw_u8(q0u16, d31u8);
    // Treat the sums as signed and saturate them to unsigned 8 bits.
    d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
    d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
    d30u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
    d31u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
    vst1_u8(d2, d2u8);
    d2 += dest_stride;
    vst1_u8(d2, d3u8);
    d2 += dest_stride;
    vst1_u8(d2, d30u8);
    d2 += dest_stride;
    vst1_u8(d2, d31u8);
    d2 += dest_stride;
  }
 }
--- a/aom_dsp/arm/idct8x8_add_neon.asm
+++ b/aom_dsp/arm/idct8x8_add_neon.asm
@@ -1,15 +1,18 @@
 ;
-;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
 ;
-;  Use of this source code is governed by a BSD-style license
+; This source code is subject to the terms of the BSD 2 Clause License and
-;  that can be found in the LICENSE file in the root of the source
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-;  tree. An additional intellectual property rights grant can be found
+; was not distributed with this source code in the LICENSE file, you can
-;  in the file PATENTS.  All contributing project authors may
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-;  be found in the AUTHORS file in the root of the source tree.
+; Media Patent License 1.0 was not distributed with this source code in the
 ; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 ;
-    EXPORT  |vpx_idct8x8_64_add_neon|
+;
-    EXPORT  |vpx_idct8x8_12_add_neon|
+
    EXPORT  |aom_idct8x8_64_add_neon|
    EXPORT  |aom_idct8x8_12_add_neon|
    ARM
    REQUIRE8
    PRESERVE8
@@ -198,13 +201,13 @@
    MEND
    AREA    Block, CODE, READONLY ; name this block of code
-;void vpx_idct8x8_64_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
+;void aom_idct8x8_64_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
 ;
 ; r0  int16_t input
 ; r1  uint8_t *dest
 ; r2  int dest_stride)
-|vpx_idct8x8_64_add_neon| PROC
+|aom_idct8x8_64_add_neon| PROC
    push            {r4-r9}
    vpush           {d8-d15}
    vld1.s16        {q8,q9}, [r0]!
@@ -308,15 +311,15 @@
    vpop            {d8-d15}
    pop             {r4-r9}
    bx              lr
-    ENDP  ; |vpx_idct8x8_64_add_neon|
+    ENDP  ; |aom_idct8x8_64_add_neon|
-;void vpx_idct8x8_12_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
+;void aom_idct8x8_12_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
 ;
 ; r0  int16_t input
 ; r1  uint8_t *dest
 ; r2  int dest_stride)
-|vpx_idct8x8_12_add_neon| PROC
+|aom_idct8x8_12_add_neon| PROC
    push            {r4-r9}
    vpush           {d8-d15}
    vld1.s16        {q8,q9}, [r0]!
@@ -514,6 +517,6 @@
    vpop            {d8-d15}
    pop             {r4-r9}
    bx              lr
-    ENDP  ; |vpx_idct8x8_12_add_neon|
+    ENDP  ; |aom_idct8x8_12_add_neon|
    END
--- a/aom_dsp/arm/idct8x8_add_neon.c
+++ b/aom_dsp/arm/idct8x8_add_neon.c
@@ -0,0 +1,509 @@
 /*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
 #include <arm_neon.h>
 #include "./aom_config.h"
 #include "aom_dsp/txfm_common.h"
 // Transposes, in place, the 8x8 matrix of int16_t values whose rows are held
 // in the eight vectors *q8s16 .. *q15s16 (row 0 .. row 7).
 //
 // Three steps: regroup the 64-bit halves, interleave 32-bit lanes, then
 // interleave 16-bit lanes.
 static INLINE void TRANSPOSE8X8(int16x8_t *q8s16, int16x8_t *q9s16,
                                int16x8_t *q10s16, int16x8_t *q11s16,
                                int16x8_t *q12s16, int16x8_t *q13s16,
                                int16x8_t *q14s16, int16x8_t *q15s16) {
  // Step 1: pair the low halves of rows i and i + 4, and likewise the high
  // halves (the "vswp" of the assembly version).
  const int16x8_t lo_0_4 =
      vcombine_s16(vget_low_s16(*q8s16), vget_low_s16(*q12s16));
  const int16x8_t lo_1_5 =
      vcombine_s16(vget_low_s16(*q9s16), vget_low_s16(*q13s16));
  const int16x8_t lo_2_6 =
      vcombine_s16(vget_low_s16(*q10s16), vget_low_s16(*q14s16));
  const int16x8_t lo_3_7 =
      vcombine_s16(vget_low_s16(*q11s16), vget_low_s16(*q15s16));
  const int16x8_t hi_0_4 =
      vcombine_s16(vget_high_s16(*q8s16), vget_high_s16(*q12s16));
  const int16x8_t hi_1_5 =
      vcombine_s16(vget_high_s16(*q9s16), vget_high_s16(*q13s16));
  const int16x8_t hi_2_6 =
      vcombine_s16(vget_high_s16(*q10s16), vget_high_s16(*q14s16));
  const int16x8_t hi_3_7 =
      vcombine_s16(vget_high_s16(*q11s16), vget_high_s16(*q15s16));
  // Step 2: interleave 32-bit lanes of vectors two rows apart.
  const int32x4x2_t w_even_lo =
      vtrnq_s32(vreinterpretq_s32_s16(lo_0_4), vreinterpretq_s32_s16(lo_2_6));
  const int32x4x2_t w_odd_lo =
      vtrnq_s32(vreinterpretq_s32_s16(lo_1_5), vreinterpretq_s32_s16(lo_3_7));
  const int32x4x2_t w_even_hi =
      vtrnq_s32(vreinterpretq_s32_s16(hi_0_4), vreinterpretq_s32_s16(hi_2_6));
  const int32x4x2_t w_odd_hi =
      vtrnq_s32(vreinterpretq_s32_s16(hi_1_5), vreinterpretq_s32_s16(hi_3_7));
  // Step 3: interleave 16-bit lanes; each result pair holds two adjacent
  // columns of the original matrix.
  const int16x8x2_t cols_0_1 =
      vtrnq_s16(vreinterpretq_s16_s32(w_even_lo.val[0]),
                vreinterpretq_s16_s32(w_odd_lo.val[0]));
  const int16x8x2_t cols_2_3 =
      vtrnq_s16(vreinterpretq_s16_s32(w_even_lo.val[1]),
                vreinterpretq_s16_s32(w_odd_lo.val[1]));
  const int16x8x2_t cols_4_5 =
      vtrnq_s16(vreinterpretq_s16_s32(w_even_hi.val[0]),
                vreinterpretq_s16_s32(w_odd_hi.val[0]));
  const int16x8x2_t cols_6_7 =
      vtrnq_s16(vreinterpretq_s16_s32(w_even_hi.val[1]),
                vreinterpretq_s16_s32(w_odd_hi.val[1]));
  // Write the columns back as the new rows.
  *q8s16 = cols_0_1.val[0];
  *q9s16 = cols_0_1.val[1];
  *q10s16 = cols_2_3.val[0];
  *q11s16 = cols_2_3.val[1];
  *q12s16 = cols_4_5.val[0];
  *q13s16 = cols_4_5.val[1];
  *q14s16 = cols_6_7.val[0];
  *q15s16 = cols_6_7.val[1];
 }
 // One 1-D pass of the 8-point inverse transform, applied lane-wise to the
 // eight vectors *q8s16 .. *q15s16 and written back in place.
 //
 // Each 32-bit product is narrowed to 16 bits with a rounding, saturating
 // right shift by 14. Several parameters are overwritten with intermediate
 // values part-way through, so the statement order below matters.
 static INLINE void IDCT8x8_1D(int16x8_t *q8s16, int16x8_t *q9s16,
                              int16x8_t *q10s16, int16x8_t *q11s16,
                              int16x8_t *q12s16, int16x8_t *q13s16,
                              int16x8_t *q14s16, int16x8_t *q15s16) {
  int16x4_t d0s16, d1s16, d2s16, d3s16;
  int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
  int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
  int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
  int32x4_t q2s32, q3s32, q5s32, q6s32, q8s32, q9s32;
  int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32;
  // Constants for the four odd-indexed inputs (q9, q11, q13, q15).
  d0s16 = vdup_n_s16((int16_t)cospi_28_64);
  d1s16 = vdup_n_s16((int16_t)cospi_4_64);
  d2s16 = vdup_n_s16((int16_t)cospi_12_64);
  d3s16 = vdup_n_s16((int16_t)cospi_20_64);
  // Split every input into its low and high 4-lane halves; the widening
  // multiplies below work on 4 lanes at a time.
  d16s16 = vget_low_s16(*q8s16);
  d17s16 = vget_high_s16(*q8s16);
  d18s16 = vget_low_s16(*q9s16);
  d19s16 = vget_high_s16(*q9s16);
  d20s16 = vget_low_s16(*q10s16);
  d21s16 = vget_high_s16(*q10s16);
  d22s16 = vget_low_s16(*q11s16);
  d23s16 = vget_high_s16(*q11s16);
  d24s16 = vget_low_s16(*q12s16);
  d25s16 = vget_high_s16(*q12s16);
  d26s16 = vget_low_s16(*q13s16);
  d27s16 = vget_high_s16(*q13s16);
  d28s16 = vget_low_s16(*q14s16);
  d29s16 = vget_high_s16(*q14s16);
  d30s16 = vget_low_s16(*q15s16);
  d31s16 = vget_high_s16(*q15s16);
  // q4 = q9 * cospi_28_64 - q15 * cospi_4_64
  // q5 = q13 * cospi_12_64 - q11 * cospi_20_64
  q2s32 = vmull_s16(d18s16, d0s16);
  q3s32 = vmull_s16(d19s16, d0s16);
  q5s32 = vmull_s16(d26s16, d2s16);
  q6s32 = vmull_s16(d27s16, d2s16);
  q2s32 = vmlsl_s16(q2s32, d30s16, d1s16);
  q3s32 = vmlsl_s16(q3s32, d31s16, d1s16);
  q5s32 = vmlsl_s16(q5s32, d22s16, d3s16);
  q6s32 = vmlsl_s16(q6s32, d23s16, d3s16);
  d8s16 = vqrshrn_n_s32(q2s32, 14);
  d9s16 = vqrshrn_n_s32(q3s32, 14);
  d10s16 = vqrshrn_n_s32(q5s32, 14);
  d11s16 = vqrshrn_n_s32(q6s32, 14);
  q4s16 = vcombine_s16(d8s16, d9s16);
  q5s16 = vcombine_s16(d10s16, d11s16);
  // q7 = q9 * cospi_4_64 + q15 * cospi_28_64
  // q6 = q13 * cospi_20_64 + q11 * cospi_12_64
  q2s32 = vmull_s16(d18s16, d1s16);
  q3s32 = vmull_s16(d19s16, d1s16);
  q9s32 = vmull_s16(d26s16, d3s16);
  q13s32 = vmull_s16(d27s16, d3s16);
  q2s32 = vmlal_s16(q2s32, d30s16, d0s16);
  q3s32 = vmlal_s16(q3s32, d31s16, d0s16);
  q9s32 = vmlal_s16(q9s32, d22s16, d2s16);
  q13s32 = vmlal_s16(q13s32, d23s16, d2s16);
  d14s16 = vqrshrn_n_s32(q2s32, 14);
  d15s16 = vqrshrn_n_s32(q3s32, 14);
  d12s16 = vqrshrn_n_s32(q9s32, 14);
  d13s16 = vqrshrn_n_s32(q13s32, 14);
  q6s16 = vcombine_s16(d12s16, d13s16);
  q7s16 = vcombine_s16(d14s16, d15s16);
  // Even inputs q8 and q12:
  // *q9  = (q8 + q12) * cospi_16_64
  // *q11 = (q8 - q12) * cospi_16_64
  d0s16 = vdup_n_s16((int16_t)cospi_16_64);
  q2s32 = vmull_s16(d16s16, d0s16);
  q3s32 = vmull_s16(d17s16, d0s16);
  q13s32 = vmull_s16(d16s16, d0s16);
  q15s32 = vmull_s16(d17s16, d0s16);
  q2s32 = vmlal_s16(q2s32, d24s16, d0s16);
  q3s32 = vmlal_s16(q3s32, d25s16, d0s16);
  q13s32 = vmlsl_s16(q13s32, d24s16, d0s16);
  q15s32 = vmlsl_s16(q15s32, d25s16, d0s16);
  // d0/d1 are reloaded with the constants for the q10/q14 rotation before the
  // pending products are narrowed.
  d0s16 = vdup_n_s16((int16_t)cospi_24_64);
  d1s16 = vdup_n_s16((int16_t)cospi_8_64);
  d18s16 = vqrshrn_n_s32(q2s32, 14);
  d19s16 = vqrshrn_n_s32(q3s32, 14);
  d22s16 = vqrshrn_n_s32(q13s32, 14);
  d23s16 = vqrshrn_n_s32(q15s32, 14);
  *q9s16 = vcombine_s16(d18s16, d19s16);
  *q11s16 = vcombine_s16(d22s16, d23s16);
  // Even inputs q10 and q14:
  // *q13 = q10 * cospi_24_64 - q14 * cospi_8_64
  // *q15 = q10 * cospi_8_64 + q14 * cospi_24_64
  q2s32 = vmull_s16(d20s16, d0s16);
  q3s32 = vmull_s16(d21s16, d0s16);
  q8s32 = vmull_s16(d20s16, d1s16);
  q12s32 = vmull_s16(d21s16, d1s16);
  q2s32 = vmlsl_s16(q2s32, d28s16, d1s16);
  q3s32 = vmlsl_s16(q3s32, d29s16, d1s16);
  q8s32 = vmlal_s16(q8s32, d28s16, d0s16);
  q12s32 = vmlal_s16(q12s32, d29s16, d0s16);
  d26s16 = vqrshrn_n_s32(q2s32, 14);
  d27s16 = vqrshrn_n_s32(q3s32, 14);
  d30s16 = vqrshrn_n_s32(q8s32, 14);
  d31s16 = vqrshrn_n_s32(q12s32, 14);
  *q13s16 = vcombine_s16(d26s16, d27s16);
  *q15s16 = vcombine_s16(d30s16, d31s16);
  // Butterflies on the even half; q0..q3 are the four even-half terms used
  // by the output stage.
  q0s16 = vaddq_s16(*q9s16, *q15s16);
  q1s16 = vaddq_s16(*q11s16, *q13s16);
  q2s16 = vsubq_s16(*q11s16, *q13s16);
  q3s16 = vsubq_s16(*q9s16, *q15s16);
  // Butterflies on the odd half. *q13 and *q14 are reused as scratch for the
  // two differences; q4 and q7 keep the sums.
  *q13s16 = vsubq_s16(q4s16, q5s16);
  q4s16 = vaddq_s16(q4s16, q5s16);
  *q14s16 = vsubq_s16(q7s16, q6s16);
  q7s16 = vaddq_s16(q7s16, q6s16);
  d26s16 = vget_low_s16(*q13s16);
  d27s16 = vget_high_s16(*q13s16);
  d28s16 = vget_low_s16(*q14s16);
  d29s16 = vget_high_s16(*q14s16);
  // q5 = (*q14 - *q13) * cospi_16_64
  // q6 = (*q14 + *q13) * cospi_16_64
  d16s16 = vdup_n_s16((int16_t)cospi_16_64);
  q9s32 = vmull_s16(d28s16, d16s16);
  q10s32 = vmull_s16(d29s16, d16s16);
  q11s32 = vmull_s16(d28s16, d16s16);
  q12s32 = vmull_s16(d29s16, d16s16);
  q9s32 = vmlsl_s16(q9s32, d26s16, d16s16);
  q10s32 = vmlsl_s16(q10s32, d27s16, d16s16);
  q11s32 = vmlal_s16(q11s32, d26s16, d16s16);
  q12s32 = vmlal_s16(q12s32, d27s16, d16s16);
  d10s16 = vqrshrn_n_s32(q9s32, 14);
  d11s16 = vqrshrn_n_s32(q10s32, 14);
  d12s16 = vqrshrn_n_s32(q11s32, 14);
  d13s16 = vqrshrn_n_s32(q12s32, 14);
  q5s16 = vcombine_s16(d10s16, d11s16);
  q6s16 = vcombine_s16(d12s16, d13s16);
  // Output stage: outputs k and 7 - k are the sum and the difference of the
  // same even-half / odd-half pair.
  *q8s16 = vaddq_s16(q0s16, q7s16);
  *q9s16 = vaddq_s16(q1s16, q6s16);
  *q10s16 = vaddq_s16(q2s16, q5s16);
  *q11s16 = vaddq_s16(q3s16, q4s16);
  *q12s16 = vsubq_s16(q3s16, q4s16);
  *q13s16 = vsubq_s16(q2s16, q5s16);
  *q14s16 = vsubq_s16(q1s16, q6s16);
  *q15s16 = vsubq_s16(q0s16, q7s16);
  return;
 }
 // Full 8x8 inverse transform of the 64 coefficients at `input`; the result
 // is added to the 8x8 block of pixels at `dest`.
 //
 // input:       64 consecutive int16_t coefficients.
 // dest:        top-left pixel of the block; 8 bytes are read and written per
 //              row, for 8 rows.
 // dest_stride: distance in bytes between consecutive rows of `dest`.
 //
 // Pixels are loaded and stored with the byte-wise vld1_u8/vst1_u8 intrinsics,
 // so `dest` is never accessed through a uint64_t pointer and no 8-byte
 // alignment is implied for it. Sums are saturated to the uint8_t range.
 void aom_idct8x8_64_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
  uint8_t *d1, *d2;
  uint8x8_t d0u8, d1u8, d2u8, d3u8;
  int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
  uint16x8_t q8u16, q9u16, q10u16, q11u16;
  // Load the coefficients, eight per vector.
  q8s16 = vld1q_s16(input);
  q9s16 = vld1q_s16(input + 8);
  q10s16 = vld1q_s16(input + 16);
  q11s16 = vld1q_s16(input + 24);
  q12s16 = vld1q_s16(input + 32);
  q13s16 = vld1q_s16(input + 40);
  q14s16 = vld1q_s16(input + 48);
  q15s16 = vld1q_s16(input + 56);
  // 2-D transform: two rounds of "transpose, then 1-D transform".
  TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
               &q15s16);
  IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
             &q15s16);
  TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
               &q15s16);
  IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
             &q15s16);
  // Final rounding right shift by 5.
  q8s16 = vrshrq_n_s16(q8s16, 5);
  q9s16 = vrshrq_n_s16(q9s16, 5);
  q10s16 = vrshrq_n_s16(q10s16, 5);
  q11s16 = vrshrq_n_s16(q11s16, 5);
  q12s16 = vrshrq_n_s16(q12s16, 5);
  q13s16 = vrshrq_n_s16(q13s16, 5);
  q14s16 = vrshrq_n_s16(q14s16, 5);
  q15s16 = vrshrq_n_s16(q15s16, 5);
  // d1 walks the rows being read, d2 the rows being written.
  d1 = d2 = dest;
  // Rows 0-3: load, widen and add the residual, saturate, store.
  d0u8 = vld1_u8(d1);
  d1 += dest_stride;
  d1u8 = vld1_u8(d1);
  d1 += dest_stride;
  d2u8 = vld1_u8(d1);
  d1 += dest_stride;
  d3u8 = vld1_u8(d1);
  d1 += dest_stride;
  q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), d0u8);
  q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), d1u8);
  q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), d2u8);
  q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), d3u8);
  d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
  d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
  d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
  d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
  vst1_u8(d2, d0u8);
  d2 += dest_stride;
  vst1_u8(d2, d1u8);
  d2 += dest_stride;
  vst1_u8(d2, d2u8);
  d2 += dest_stride;
  vst1_u8(d2, d3u8);
  d2 += dest_stride;
  // Rows 4-7: same sequence using the residual in q12..q15. The pointers are
  // not advanced past the last row.
  d0u8 = vld1_u8(d1);
  d1 += dest_stride;
  d1u8 = vld1_u8(d1);
  d1 += dest_stride;
  d2u8 = vld1_u8(d1);
  d1 += dest_stride;
  d3u8 = vld1_u8(d1);
  q8u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16), d0u8);
  q9u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16), d1u8);
  q10u16 = vaddw_u8(vreinterpretq_u16_s16(q14s16), d2u8);
  q11u16 = vaddw_u8(vreinterpretq_u16_s16(q15s16), d3u8);
  d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
  d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
  d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
  d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
  vst1_u8(d2, d0u8);
  d2 += dest_stride;
  vst1_u8(d2, d1u8);
  d2 += dest_stride;
  vst1_u8(d2, d2u8);
  d2 += dest_stride;
  vst1_u8(d2, d3u8);
 }
 // Reduced 8x8 inverse transform; the result is added to the 8x8 block of
 // pixels at `dest`.
 //
 // input:       64 consecutive int16_t coefficients (all are loaded).
 // dest:        top-left pixel of the block; 8 bytes are read and written per
 //              row, for 8 rows.
 // dest_stride: distance in bytes between consecutive rows of `dest`.
 //
 // After the first transpose, the first pass reads only q8..q11 and treats
 // the other four vectors as zero. Presumably callers use this variant only
 // when the non-zero coefficients are confined to the top-left corner (the
 // "12" in the name) -- confirm against the dispatch code. The second pass is
 // the regular IDCT8x8_1D.
 //
 // Pixels are loaded and stored with the byte-wise vld1_u8/vst1_u8 intrinsics,
 // so `dest` is never accessed through a uint64_t pointer and no 8-byte
 // alignment is implied for it. Sums are saturated to the uint8_t range.
 void aom_idct8x8_12_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
  uint8_t *d1, *d2;
  uint8x8_t d0u8, d1u8, d2u8, d3u8;
  int16x4_t d10s16, d11s16, d12s16, d13s16, d16s16;
  int16x4_t d26s16, d27s16, d28s16, d29s16;
  int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
  int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
  uint16x8_t q8u16, q9u16, q10u16, q11u16;
  int32x4_t q9s32, q10s32, q11s32, q12s32;
  // Load the coefficients, eight per vector.
  q8s16 = vld1q_s16(input);
  q9s16 = vld1q_s16(input + 8);
  q10s16 = vld1q_s16(input + 16);
  q11s16 = vld1q_s16(input + 24);
  q12s16 = vld1q_s16(input + 32);
  q13s16 = vld1q_s16(input + 40);
  q14s16 = vld1q_s16(input + 48);
  q15s16 = vld1q_s16(input + 56);
  TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
               &q15s16);
  // First transform rows
  // Each term here has a single non-zero input, so one multiply suffices.
  // The constants are doubled because vqrdmulhq_s16 is a doubling multiply
  // that keeps the rounded high half: with 2 * c it yields the product
  // rounded and shifted right by 14, like vqrshrn_n_s32(x, 14) elsewhere.
  // stage 1
  q0s16 = vdupq_n_s16((int16_t)cospi_28_64 * 2);
  q1s16 = vdupq_n_s16((int16_t)cospi_4_64 * 2);
  q4s16 = vqrdmulhq_s16(q9s16, q0s16);
  q0s16 = vdupq_n_s16(-(int16_t)cospi_20_64 * 2);
  q7s16 = vqrdmulhq_s16(q9s16, q1s16);
  q1s16 = vdupq_n_s16((int16_t)cospi_12_64 * 2);
  q5s16 = vqrdmulhq_s16(q11s16, q0s16);
  q0s16 = vdupq_n_s16((int16_t)cospi_16_64 * 2);
  q6s16 = vqrdmulhq_s16(q11s16, q1s16);
  // stage 2 & stage 3 - even half
  q1s16 = vdupq_n_s16((int16_t)cospi_24_64 * 2);
  q9s16 = vqrdmulhq_s16(q8s16, q0s16);
  q0s16 = vdupq_n_s16((int16_t)cospi_8_64 * 2);
  q13s16 = vqrdmulhq_s16(q10s16, q1s16);
  q15s16 = vqrdmulhq_s16(q10s16, q0s16);
  // stage 3 -odd half
  q0s16 = vaddq_s16(q9s16, q15s16);
  q1s16 = vaddq_s16(q9s16, q13s16);
  q2s16 = vsubq_s16(q9s16, q13s16);
  q3s16 = vsubq_s16(q9s16, q15s16);
  // stage 2 - odd half
  q13s16 = vsubq_s16(q4s16, q5s16);
  q4s16 = vaddq_s16(q4s16, q5s16);
  q14s16 = vsubq_s16(q7s16, q6s16);
  q7s16 = vaddq_s16(q7s16, q6s16);
  d26s16 = vget_low_s16(q13s16);
  d27s16 = vget_high_s16(q13s16);
  d28s16 = vget_low_s16(q14s16);
  d29s16 = vget_high_s16(q14s16);
  // q5 = (q14 - q13) * cospi_16_64, q6 = (q14 + q13) * cospi_16_64, each
  // narrowed with a rounding, saturating right shift by 14.
  d16s16 = vdup_n_s16((int16_t)cospi_16_64);
  q9s32 = vmull_s16(d28s16, d16s16);
  q10s32 = vmull_s16(d29s16, d16s16);
  q11s32 = vmull_s16(d28s16, d16s16);
  q12s32 = vmull_s16(d29s16, d16s16);
  q9s32 = vmlsl_s16(q9s32, d26s16, d16s16);
  q10s32 = vmlsl_s16(q10s32, d27s16, d16s16);
  q11s32 = vmlal_s16(q11s32, d26s16, d16s16);
  q12s32 = vmlal_s16(q12s32, d27s16, d16s16);
  d10s16 = vqrshrn_n_s32(q9s32, 14);
  d11s16 = vqrshrn_n_s32(q10s32, 14);
  d12s16 = vqrshrn_n_s32(q11s32, 14);
  d13s16 = vqrshrn_n_s32(q12s32, 14);
  q5s16 = vcombine_s16(d10s16, d11s16);
  q6s16 = vcombine_s16(d12s16, d13s16);
  // stage 4
  q8s16 = vaddq_s16(q0s16, q7s16);
  q9s16 = vaddq_s16(q1s16, q6s16);
  q10s16 = vaddq_s16(q2s16, q5s16);
  q11s16 = vaddq_s16(q3s16, q4s16);
  q12s16 = vsubq_s16(q3s16, q4s16);
  q13s16 = vsubq_s16(q2s16, q5s16);
  q14s16 = vsubq_s16(q1s16, q6s16);
  q15s16 = vsubq_s16(q0s16, q7s16);
  // Second pass: transpose, then the general 1-D transform.
  TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
               &q15s16);
  IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
             &q15s16);
  // Final rounding right shift by 5.
  q8s16 = vrshrq_n_s16(q8s16, 5);
  q9s16 = vrshrq_n_s16(q9s16, 5);
  q10s16 = vrshrq_n_s16(q10s16, 5);
  q11s16 = vrshrq_n_s16(q11s16, 5);
  q12s16 = vrshrq_n_s16(q12s16, 5);
  q13s16 = vrshrq_n_s16(q13s16, 5);
  q14s16 = vrshrq_n_s16(q14s16, 5);
  q15s16 = vrshrq_n_s16(q15s16, 5);
  // d1 walks the rows being read, d2 the rows being written.
  d1 = d2 = dest;
  // Rows 0-3: load, widen and add the residual, saturate, store.
  d0u8 = vld1_u8(d1);
  d1 += dest_stride;
  d1u8 = vld1_u8(d1);
  d1 += dest_stride;
  d2u8 = vld1_u8(d1);
  d1 += dest_stride;
  d3u8 = vld1_u8(d1);
  d1 += dest_stride;
  q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), d0u8);
  q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), d1u8);
  q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), d2u8);
  q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), d3u8);
  d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
  d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
  d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
  d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
  vst1_u8(d2, d0u8);
  d2 += dest_stride;
  vst1_u8(d2, d1u8);
  d2 += dest_stride;
  vst1_u8(d2, d2u8);
  d2 += dest_stride;
  vst1_u8(d2, d3u8);
  d2 += dest_stride;
  // Rows 4-7: same sequence using the residual in q12..q15. The pointers are
  // not advanced past the last row.
  d0u8 = vld1_u8(d1);
  d1 += dest_stride;
  d1u8 = vld1_u8(d1);
  d1 += dest_stride;
  d2u8 = vld1_u8(d1);
  d1 += dest_stride;
  d3u8 = vld1_u8(d1);
  q8u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16), d0u8);
  q9u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16), d1u8);
  q10u16 = vaddw_u8(vreinterpretq_u16_s16(q14s16), d2u8);
  q11u16 = vaddw_u8(vreinterpretq_u16_s16(q15s16), d3u8);
  d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
  d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
  d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
  d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
  vst1_u8(d2, d0u8);
  d2 += dest_stride;
  vst1_u8(d2, d1u8);
  d2 += dest_stride;
  vst1_u8(d2, d2u8);
  d2 += dest_stride;
  vst1_u8(d2, d3u8);
 }
--- a/aom_dsp/arm/intrapred_neon.c
+++ b/aom_dsp/arm/intrapred_neon.c
@@ -1,26 +1,26 @@
 /*
- *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
- *  Use of this source code is governed by a BSD-style license
+ * This source code is subject to the terms of the BSD 2 Clause License and
- *  that can be found in the LICENSE file in the root of the source
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- *  tree. An additional intellectual property rights grant can be found
+ * was not distributed with this source code in the LICENSE file, you can
- *  in the file PATENTS.  All contributing project authors may
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- *  be found in the AUTHORS file in the root of the source tree.
+ * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
 #include <arm_neon.h>
-#include "./vpx_config.h"
+#include "./aom_config.h"
-#include "./vpx_dsp_rtcd.h"
+#include "./aom_dsp_rtcd.h"
-#include "vpx/vpx_integer.h"
+#include "aom/aom_integer.h"
 //------------------------------------------------------------------------------
 // DC 4x4
 // 'do_above' and 'do_left' facilitate branch removal when inlined.
-static INLINE void dc_4x4(uint8_t *dst, ptrdiff_t stride,
+static INLINE void dc_4x4(uint8_t *dst, ptrdiff_t stride, const uint8_t *above,
-                          const uint8_t *above, const uint8_t *left,
+                          const uint8_t *left, int do_above, int do_left) {
                          int do_above, int do_left) {
  uint16x8_t sum_top;
  uint16x8_t sum_left;
  uint8x8_t dc0;
@@ -33,7 +33,7 @@ static INLINE void dc_4x4(uint8_t *dst, ptrdiff_t stride,
  }
  if (do_left) {
-    const uint8x8_t L = vld1_u8(left);  // left border
+    const uint8x8_t L = vld1_u8(left);   // left border
    const uint16x4_t p0 = vpaddl_u8(L);  // cascading summation of the left
    const uint16x4_t p1 = vpadd_u16(p0, p0);
    sum_left = vcombine_u16(p1, p1);
@@ -54,29 +54,29 @@ static INLINE void dc_4x4(uint8_t *dst, ptrdiff_t stride,
    const uint8x8_t dc = vdup_lane_u8(dc0, 0);
    int i;
    for (i = 0; i < 4; ++i) {
-      vst1_lane_u32((uint32_t*)(dst + i * stride), vreinterpret_u32_u8(dc), 0);
+      vst1_lane_u32((uint32_t *)(dst + i * stride), vreinterpret_u32_u8(dc), 0);
    }
  }
 }
-void vpx_dc_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+void aom_dc_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  dc_4x4(dst, stride, above, left, 1, 1);
 }
-void vpx_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+void aom_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)above;
  dc_4x4(dst, stride, NULL, left, 0, 1);
 }
-void vpx_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+void aom_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  (void)left;
  dc_4x4(dst, stride, above, NULL, 1, 0);
 }
-void vpx_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+void aom_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  (void)above;
  (void)left;
@@ -87,9 +87,8 @@ void vpx_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
 // DC 8x8
 // 'do_above' and 'do_left' facilitate branch removal when inlined.
-static INLINE void dc_8x8(uint8_t *dst, ptrdiff_t stride,
+static INLINE void dc_8x8(uint8_t *dst, ptrdiff_t stride, const uint8_t *above,
-                          const uint8_t *above, const uint8_t *left,
+                          const uint8_t *left, int do_above, int do_left) {
                          int do_above, int do_left) {
  uint16x8_t sum_top;
  uint16x8_t sum_left;
  uint8x8_t dc0;
@@ -103,7 +102,7 @@ static INLINE void dc_8x8(uint8_t *dst, ptrdiff_t stride,
  }
  if (do_left) {
-    const uint8x8_t L = vld1_u8(left);  // left border
+    const uint8x8_t L = vld1_u8(left);   // left border
    const uint16x4_t p0 = vpaddl_u8(L);  // cascading summation of the left
    const uint16x4_t p1 = vpadd_u16(p0, p0);
    const uint16x4_t p2 = vpadd_u16(p1, p1);
@@ -125,29 +124,29 @@ static INLINE void dc_8x8(uint8_t *dst, ptrdiff_t stride,
    const uint8x8_t dc = vdup_lane_u8(dc0, 0);
    int i;
    for (i = 0; i < 8; ++i) {
-      vst1_u32((uint32_t*)(dst + i * stride), vreinterpret_u32_u8(dc));
+      vst1_u32((uint32_t *)(dst + i * stride), vreinterpret_u32_u8(dc));
    }
  }
 }
-void vpx_dc_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+void aom_dc_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  dc_8x8(dst, stride, above, left, 1, 1);
 }
-void vpx_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+void aom_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)above;
  dc_8x8(dst, stride, NULL, left, 0, 1);
 }
-void vpx_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+void aom_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  (void)left;
  dc_8x8(dst, stride, above, NULL, 1, 0);
 }
-void vpx_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+void aom_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  (void)above;
  (void)left;
@@ -167,7 +166,7 @@ static INLINE void dc_16x16(uint8_t *dst, ptrdiff_t stride,
  if (do_above) {
    const uint8x16_t A = vld1q_u8(above);  // top row
-    const uint16x8_t p0 = vpaddlq_u8(A);  // cascading summation of the top
+    const uint16x8_t p0 = vpaddlq_u8(A);   // cascading summation of the top
    const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
    const uint16x4_t p2 = vpadd_u16(p1, p1);
    const uint16x4_t p3 = vpadd_u16(p2, p2);
@@ -203,26 +202,26 @@ static INLINE void dc_16x16(uint8_t *dst, ptrdiff_t stride,
  }
 }
-void vpx_dc_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+void aom_dc_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  dc_16x16(dst, stride, above, left, 1, 1);
 }
-void vpx_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+void aom_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  (void)above;
  dc_16x16(dst, stride, NULL, left, 0, 1);
 }
-void vpx_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+void aom_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)left;
  dc_16x16(dst, stride, above, NULL, 1, 0);
 }
-void vpx_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+void aom_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;
@@ -286,26 +285,26 @@ static INLINE void dc_32x32(uint8_t *dst, ptrdiff_t stride,
  }
 }
-void vpx_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+void aom_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  dc_32x32(dst, stride, above, left, 1, 1);
 }
-void vpx_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+void aom_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  (void)above;
  dc_32x32(dst, stride, NULL, left, 0, 1);
 }
-void vpx_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+void aom_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)left;
  dc_32x32(dst, stride, above, NULL, 1, 0);
 }
-void vpx_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+void aom_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;
@@ -315,7 +314,7 @@ void vpx_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
 // -----------------------------------------------------------------------------
-void vpx_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+void aom_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const uint64x1_t A0 = vreinterpret_u64_u8(vld1_u8(above));  // top row
  const uint64x1_t A1 = vshr_n_u64(A0, 8);
@@ -338,7 +337,7 @@ void vpx_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
  dst[3 * stride + 3] = above[7];
 }
-void vpx_d45_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+void aom_d45_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  static const uint8_t shuffle1[8] = { 1, 2, 3, 4, 5, 6, 7, 7 };
  static const uint8_t shuffle2[8] = { 2, 3, 4, 5, 6, 7, 7, 7 };
@@ -358,7 +357,7 @@ void vpx_d45_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
  vst1_u8(dst + i * stride, row);
 }
-void vpx_d45_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+void aom_d45_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                  const uint8_t *above, const uint8_t *left) {
  const uint8x16_t A0 = vld1q_u8(above);  // top row
  const uint8x16_t above_right = vld1q_dup_u8(above + 15);
@@ -377,7 +376,7 @@ void vpx_d45_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
 // -----------------------------------------------------------------------------
-void vpx_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+void aom_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  const uint8x8_t XABCD_u8 = vld1_u8(above - 1);
  const uint64x1_t XABCD = vreinterpret_u64_u8(XABCD_u8);
@@ -407,7 +406,7 @@ void vpx_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
 #if !HAVE_NEON_ASM
-void vpx_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+void aom_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  int i;
  uint32x2_t d0u32 = vdup_n_u32(0);
@@ -418,29 +417,27 @@ void vpx_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
    vst1_lane_u32((uint32_t *)dst, d0u32, 0);
 }
-void vpx_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+void aom_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  int i;
  uint8x8_t d0u8 = vdup_n_u8(0);
  (void)left;
  d0u8 = vld1_u8(above);
-  for (i = 0; i < 8; i++, dst += stride)
+  for (i = 0; i < 8; i++, dst += stride) vst1_u8(dst, d0u8);
    vst1_u8(dst, d0u8);
 }
-void vpx_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+void aom_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  int i;
  uint8x16_t q0u8 = vdupq_n_u8(0);
  (void)left;
  q0u8 = vld1q_u8(above);
-  for (i = 0; i < 16; i++, dst += stride)
+  for (i = 0; i < 16; i++, dst += stride) vst1q_u8(dst, q0u8);
    vst1q_u8(dst, q0u8);
 }
-void vpx_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+void aom_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  int i;
  uint8x16_t q0u8 = vdupq_n_u8(0);
@@ -455,7 +452,7 @@ void vpx_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
  }
 }
-void vpx_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+void aom_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  uint8x8_t d0u8 = vdup_n_u8(0);
  uint32x2_t d1u32 = vdup_n_u32(0);
@@ -476,7 +473,7 @@ void vpx_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
  vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
 }
-void vpx_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+void aom_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  uint8x8_t d0u8 = vdup_n_u8(0);
  uint64x1_t d1u64 = vdup_n_u64(0);
@@ -509,7 +506,7 @@ void vpx_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
  vst1_u8(dst, d0u8);
 }
-void vpx_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+void aom_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  int j;
  uint8x8_t d2u8 = vdup_n_u8(0);
@@ -547,7 +544,7 @@ void vpx_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
  }
 }
-void vpx_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+void aom_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  int j, k;
  uint8x8_t d2u8 = vdup_n_u8(0);
@@ -595,7 +592,7 @@ void vpx_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
  }
 }
-void vpx_tm_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+void aom_tm_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  int i;
  uint16x8_t q1u16, q3u16;
@@ -608,14 +605,14 @@ void vpx_tm_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
  q3u16 = vsubl_u8(vreinterpret_u8_u32(d2u32), d0u8);
  for (i = 0; i < 4; i++, dst += stride) {
    q1u16 = vdupq_n_u16((uint16_t)left[i]);
-    q1s16 = vaddq_s16(vreinterpretq_s16_u16(q1u16),
+    q1s16 =
-                      vreinterpretq_s16_u16(q3u16));
+        vaddq_s16(vreinterpretq_s16_u16(q1u16), vreinterpretq_s16_u16(q3u16));
    d0u8 = vqmovun_s16(q1s16);
    vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
  }
 }
-void vpx_tm_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+void aom_tm_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  int j;
  uint16x8_t q0u16, q3u16, q10u16;
@@ -631,33 +628,33 @@ void vpx_tm_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
  d20u16 = vget_low_u16(q10u16);
  for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) {
    q0u16 = vdupq_lane_u16(d20u16, 0);
-    q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
+    q0s16 =
-                      vreinterpretq_s16_u16(q0u16));
+        vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16));
    d0u8 = vqmovun_s16(q0s16);
    vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
    dst += stride;
    q0u16 = vdupq_lane_u16(d20u16, 1);
-    q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
+    q0s16 =
-                      vreinterpretq_s16_u16(q0u16));
+        vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16));
    d0u8 = vqmovun_s16(q0s16);
    vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
    dst += stride;
    q0u16 = vdupq_lane_u16(d20u16, 2);
-    q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
+    q0s16 =
-                      vreinterpretq_s16_u16(q0u16));
+        vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16));
    d0u8 = vqmovun_s16(q0s16);
    vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
    dst += stride;
    q0u16 = vdupq_lane_u16(d20u16, 3);
-    q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
+    q0s16 =
-                      vreinterpretq_s16_u16(q0u16));
+        vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16));
    d0u8 = vqmovun_s16(q0s16);
    vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
    dst += stride;
  }
 }
-void vpx_tm_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+void aom_tm_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  int j, k;
  uint16x8_t q0u16, q2u16, q3u16, q8u16, q10u16;
@@ -677,14 +674,14 @@ void vpx_tm_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
    for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) {
      q0u16 = vdupq_lane_u16(d20u16, 0);
      q8u16 = vdupq_lane_u16(d20u16, 1);
-      q1s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+      q1s16 =
-                        vreinterpretq_s16_u16(q2u16));
+          vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q2u16));
-      q0s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+      q0s16 =
-                        vreinterpretq_s16_u16(q3u16));
+          vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q3u16));
-      q11s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
+      q11s16 =
-                         vreinterpretq_s16_u16(q2u16));
+          vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q2u16));
-      q8s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
+      q8s16 =
-                        vreinterpretq_s16_u16(q3u16));
+          vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q3u16));
      d2u8 = vqmovun_s16(q1s16);
      d3u8 = vqmovun_s16(q0s16);
      d22u8 = vqmovun_s16(q11s16);
@@ -698,14 +695,14 @@ void vpx_tm_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
      q0u16 = vdupq_lane_u16(d20u16, 2);
      q8u16 = vdupq_lane_u16(d20u16, 3);
-      q1s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+      q1s16 =
-                        vreinterpretq_s16_u16(q2u16));
+          vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q2u16));
-      q0s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+      q0s16 =
-                        vreinterpretq_s16_u16(q3u16));
+          vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q3u16));
-      q11s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
+      q11s16 =
-                         vreinterpretq_s16_u16(q2u16));
+          vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q2u16));
-      q8s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
+      q8s16 =
-                        vreinterpretq_s16_u16(q3u16));
+          vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q3u16));
      d2u8 = vqmovun_s16(q1s16);
      d3u8 = vqmovun_s16(q0s16);
      d22u8 = vqmovun_s16(q11s16);
@@ -720,7 +717,7 @@ void vpx_tm_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
  }
 }
-void vpx_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+void aom_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  int j, k;
  uint16x8_t q0u16, q3u16, q8u16, q9u16, q10u16, q11u16;
@@ -742,10 +739,10 @@ void vpx_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
    d6u16 = vget_low_u16(q3u16);
    for (j = 0; j < 2; j++, d6u16 = vget_high_u16(q3u16)) {
      q0u16 = vdupq_lane_u16(d6u16, 0);
-      q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+      q12s16 =
-                         vreinterpretq_s16_u16(q8u16));
+          vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16));
-      q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+      q13s16 =
-                         vreinterpretq_s16_u16(q9u16));
+          vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16));
      q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q10u16));
      q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
@@ -761,10 +758,10 @@ void vpx_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
      dst += stride;
      q0u16 = vdupq_lane_u16(d6u16, 1);
-      q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+      q12s16 =
-                         vreinterpretq_s16_u16(q8u16));
+          vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16));
-      q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+      q13s16 =
-                         vreinterpretq_s16_u16(q9u16));
+          vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16));
      q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q10u16));
      q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
@@ -780,10 +777,10 @@ void vpx_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
      dst += stride;
      q0u16 = vdupq_lane_u16(d6u16, 2);
-      q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+      q12s16 =
-                         vreinterpretq_s16_u16(q8u16));
+          vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16));
-      q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+      q13s16 =
-                         vreinterpretq_s16_u16(q9u16));
+          vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16));
      q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q10u16));
      q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
@@ -799,10 +796,10 @@ void vpx_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
      dst += stride;
      q0u16 = vdupq_lane_u16(d6u16, 3);
-      q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+      q12s16 =
-                         vreinterpretq_s16_u16(q8u16));
+          vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16));
-      q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+      q13s16 =
-                         vreinterpretq_s16_u16(q9u16));
+          vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16));
      q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q10u16));
      q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
--- a/aom_dsp/arm/intrapred_neon_asm.asm
+++ b/aom_dsp/arm/intrapred_neon_asm.asm
@@ -1,32 +1,35 @@
 ;
-;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
 ;
-;  Use of this source code is governed by a BSD-style license
+; This source code is subject to the terms of the BSD 2 Clause License and
-;  that can be found in the LICENSE file in the root of the source
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-;  tree. An additional intellectual property rights grant can be found
+; was not distributed with this source code in the LICENSE file, you can
-;  in the file PATENTS.  All contributing project authors may
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-;  be found in the AUTHORS file in the root of the source tree.
+; Media Patent License 1.0 was not distributed with this source code in the
 ; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 ;
-    EXPORT  |vpx_v_predictor_4x4_neon|
+;
-    EXPORT  |vpx_v_predictor_8x8_neon|
+
-    EXPORT  |vpx_v_predictor_16x16_neon|
+    EXPORT  |aom_v_predictor_4x4_neon|
-    EXPORT  |vpx_v_predictor_32x32_neon|
+    EXPORT  |aom_v_predictor_8x8_neon|
-    EXPORT  |vpx_h_predictor_4x4_neon|
+    EXPORT  |aom_v_predictor_16x16_neon|
-    EXPORT  |vpx_h_predictor_8x8_neon|
+    EXPORT  |aom_v_predictor_32x32_neon|
-    EXPORT  |vpx_h_predictor_16x16_neon|
+    EXPORT  |aom_h_predictor_4x4_neon|
-    EXPORT  |vpx_h_predictor_32x32_neon|
+    EXPORT  |aom_h_predictor_8x8_neon|
-    EXPORT  |vpx_tm_predictor_4x4_neon|
+    EXPORT  |aom_h_predictor_16x16_neon|
-    EXPORT  |vpx_tm_predictor_8x8_neon|
+    EXPORT  |aom_h_predictor_32x32_neon|
-    EXPORT  |vpx_tm_predictor_16x16_neon|
+    EXPORT  |aom_tm_predictor_4x4_neon|
-    EXPORT  |vpx_tm_predictor_32x32_neon|
+    EXPORT  |aom_tm_predictor_8x8_neon|
    EXPORT  |aom_tm_predictor_16x16_neon|
    EXPORT  |aom_tm_predictor_32x32_neon|
    ARM
    REQUIRE8
    PRESERVE8
    AREA ||.text||, CODE, READONLY, ALIGN=2
-;void vpx_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride,
+;void aom_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride,
 ;                              const uint8_t *above,
 ;                              const uint8_t *left)
 ; r0  uint8_t *dst
@@ -34,16 +37,16 @@
 ; r2  const uint8_t *above
 ; r3  const uint8_t *left
-|vpx_v_predictor_4x4_neon| PROC
+|aom_v_predictor_4x4_neon| PROC
    vld1.32             {d0[0]}, [r2]
    vst1.32             {d0[0]}, [r0], r1
    vst1.32             {d0[0]}, [r0], r1
    vst1.32             {d0[0]}, [r0], r1
    vst1.32             {d0[0]}, [r0], r1
    bx                  lr
-    ENDP                ; |vpx_v_predictor_4x4_neon|
+    ENDP                ; |aom_v_predictor_4x4_neon|
-;void vpx_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride,
+;void aom_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride,
 ;                              const uint8_t *above,
 ;                              const uint8_t *left)
 ; r0  uint8_t *dst
@@ -51,7 +54,7 @@
 ; r2  const uint8_t *above
 ; r3  const uint8_t *left
-|vpx_v_predictor_8x8_neon| PROC
+|aom_v_predictor_8x8_neon| PROC
    vld1.8              {d0}, [r2]
    vst1.8              {d0}, [r0], r1
    vst1.8              {d0}, [r0], r1
@@ -62,9 +65,9 @@
    vst1.8              {d0}, [r0], r1
    vst1.8              {d0}, [r0], r1
    bx                  lr
-    ENDP                ; |vpx_v_predictor_8x8_neon|
+    ENDP                ; |aom_v_predictor_8x8_neon|
-;void vpx_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride,
+;void aom_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride,
 ;                                const uint8_t *above,
 ;                                const uint8_t *left)
 ; r0  uint8_t *dst
@@ -72,7 +75,7 @@
 ; r2  const uint8_t *above
 ; r3  const uint8_t *left
-|vpx_v_predictor_16x16_neon| PROC
+|aom_v_predictor_16x16_neon| PROC
    vld1.8              {q0}, [r2]
    vst1.8              {q0}, [r0], r1
    vst1.8              {q0}, [r0], r1
@@ -91,9 +94,9 @@
    vst1.8              {q0}, [r0], r1
    vst1.8              {q0}, [r0], r1
    bx                  lr
-    ENDP                ; |vpx_v_predictor_16x16_neon|
+    ENDP                ; |aom_v_predictor_16x16_neon|
-;void vpx_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride,
+;void aom_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride,
 ;                                const uint8_t *above,
 ;                                const uint8_t *left)
 ; r0  uint8_t *dst
@@ -101,7 +104,7 @@
 ; r2  const uint8_t *above
 ; r3  const uint8_t *left
-|vpx_v_predictor_32x32_neon| PROC
+|aom_v_predictor_32x32_neon| PROC
    vld1.8              {q0, q1}, [r2]
    mov                 r2, #2
 loop_v
@@ -124,9 +127,9 @@ loop_v
    subs                r2, r2, #1
    bgt                 loop_v
    bx                  lr
-    ENDP                ; |vpx_v_predictor_32x32_neon|
+    ENDP                ; |aom_v_predictor_32x32_neon|
-;void vpx_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride,
+;void aom_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride,
 ;                              const uint8_t *above,
 ;                              const uint8_t *left)
 ; r0  uint8_t *dst
@@ -134,7 +137,7 @@ loop_v
 ; r2  const uint8_t *above
 ; r3  const uint8_t *left
-|vpx_h_predictor_4x4_neon| PROC
+|aom_h_predictor_4x4_neon| PROC
    vld1.32             {d1[0]}, [r3]
    vdup.8              d0, d1[0]
    vst1.32             {d0[0]}, [r0], r1
@@ -145,9 +148,9 @@ loop_v
    vdup.8              d0, d1[3]
    vst1.32             {d0[0]}, [r0], r1
    bx                  lr
-    ENDP                ; |vpx_h_predictor_4x4_neon|
+    ENDP                ; |aom_h_predictor_4x4_neon|
-;void vpx_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride,
+;void aom_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride,
 ;                              const uint8_t *above,
 ;                              const uint8_t *left)
 ; r0  uint8_t *dst
@@ -155,7 +158,7 @@ loop_v
 ; r2  const uint8_t *above
 ; r3  const uint8_t *left
-|vpx_h_predictor_8x8_neon| PROC
+|aom_h_predictor_8x8_neon| PROC
    vld1.64             {d1}, [r3]
    vdup.8              d0, d1[0]
    vst1.64             {d0}, [r0], r1
@@ -174,9 +177,9 @@ loop_v
    vdup.8              d0, d1[7]
    vst1.64             {d0}, [r0], r1
    bx                  lr
-    ENDP                ; |vpx_h_predictor_8x8_neon|
+    ENDP                ; |aom_h_predictor_8x8_neon|
-;void vpx_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride,
+;void aom_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride,
 ;                                const uint8_t *above,
 ;                                const uint8_t *left)
 ; r0  uint8_t *dst
@@ -184,7 +187,7 @@ loop_v
 ; r2  const uint8_t *above
 ; r3  const uint8_t *left
-|vpx_h_predictor_16x16_neon| PROC
+|aom_h_predictor_16x16_neon| PROC
    vld1.8              {q1}, [r3]
    vdup.8              q0, d2[0]
    vst1.8              {q0}, [r0], r1
@@ -219,9 +222,9 @@ loop_v
    vdup.8              q0, d3[7]
    vst1.8              {q0}, [r0], r1
    bx                  lr
-    ENDP                ; |vpx_h_predictor_16x16_neon|
+    ENDP                ; |aom_h_predictor_16x16_neon|
-;void vpx_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride,
+;void aom_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride,
 ;                                const uint8_t *above,
 ;                                const uint8_t *left)
 ; r0  uint8_t *dst
@@ -229,7 +232,7 @@ loop_v
 ; r2  const uint8_t *above
 ; r3  const uint8_t *left
-|vpx_h_predictor_32x32_neon| PROC
+|aom_h_predictor_32x32_neon| PROC
    sub                 r1, r1, #16
    mov                 r2, #2
 loop_h
@@ -285,9 +288,9 @@ loop_h
    subs                r2, r2, #1
    bgt                 loop_h
    bx                  lr
-    ENDP                ; |vpx_h_predictor_32x32_neon|
+    ENDP                ; |aom_h_predictor_32x32_neon|
-;void vpx_tm_predictor_4x4_neon (uint8_t *dst, ptrdiff_t y_stride,
+;void aom_tm_predictor_4x4_neon (uint8_t *dst, ptrdiff_t y_stride,
 ;                                const uint8_t *above,
 ;                                const uint8_t *left)
 ; r0  uint8_t *dst
@@ -295,7 +298,7 @@ loop_h
 ; r2  const uint8_t *above
 ; r3  const uint8_t *left
-|vpx_tm_predictor_4x4_neon| PROC
+|aom_tm_predictor_4x4_neon| PROC
    ; Load ytop_left = above[-1];
    sub                 r12, r2, #1
    vld1.u8             {d0[]}, [r12]
@@ -331,9 +334,9 @@ loop_h
    vst1.32             {d0[0]}, [r0], r1
    vst1.32             {d1[0]}, [r0], r1
    bx                  lr
-    ENDP                ; |vpx_tm_predictor_4x4_neon|
+    ENDP                ; |aom_tm_predictor_4x4_neon|
-;void vpx_tm_predictor_8x8_neon (uint8_t *dst, ptrdiff_t y_stride,
+;void aom_tm_predictor_8x8_neon (uint8_t *dst, ptrdiff_t y_stride,
 ;                                const uint8_t *above,
 ;                                const uint8_t *left)
 ; r0  uint8_t *dst
@@ -341,7 +344,7 @@ loop_h
 ; r2  const uint8_t *above
 ; r3  const uint8_t *left
-|vpx_tm_predictor_8x8_neon| PROC
+|aom_tm_predictor_8x8_neon| PROC
    ; Load ytop_left = above[-1];
    sub                 r12, r2, #1
    vld1.8              {d0[]}, [r12]
@@ -403,9 +406,9 @@ loop_h
    vst1.64             {d3}, [r0], r1
    bx                  lr
-    ENDP                ; |vpx_tm_predictor_8x8_neon|
+    ENDP                ; |aom_tm_predictor_8x8_neon|
-;void vpx_tm_predictor_16x16_neon (uint8_t *dst, ptrdiff_t y_stride,
+;void aom_tm_predictor_16x16_neon (uint8_t *dst, ptrdiff_t y_stride,
 ;                                const uint8_t *above,
 ;                                const uint8_t *left)
 ; r0  uint8_t *dst
@@ -413,7 +416,7 @@ loop_h
 ; r2  const uint8_t *above
 ; r3  const uint8_t *left
-|vpx_tm_predictor_16x16_neon| PROC
+|aom_tm_predictor_16x16_neon| PROC
    ; Load ytop_left = above[-1];
    sub                 r12, r2, #1
    vld1.8              {d0[]}, [r12]
@@ -496,9 +499,9 @@ loop_16x16_neon
    bgt                 loop_16x16_neon
    bx                  lr
-    ENDP                ; |vpx_tm_predictor_16x16_neon|
+    ENDP                ; |aom_tm_predictor_16x16_neon|
-;void vpx_tm_predictor_32x32_neon (uint8_t *dst, ptrdiff_t y_stride,
+;void aom_tm_predictor_32x32_neon (uint8_t *dst, ptrdiff_t y_stride,
 ;                                  const uint8_t *above,
 ;                                  const uint8_t *left)
 ; r0  uint8_t *dst
@@ -506,7 +509,7 @@ loop_16x16_neon
 ; r2  const uint8_t *above
 ; r3  const uint8_t *left
-|vpx_tm_predictor_32x32_neon| PROC
+|aom_tm_predictor_32x32_neon| PROC
    ; Load ytop_left = above[-1];
    sub                 r12, r2, #1
    vld1.8              {d0[]}, [r12]
@@ -625,6 +628,6 @@ loop_32x32_neon
    bgt                 loop_32x32_neon
    bx                  lr
-    ENDP                ; |vpx_tm_predictor_32x32_neon|
+    ENDP                ; |aom_tm_predictor_32x32_neon|
    END
--- a/aom_dsp/arm/loopfilter_16_neon.asm
+++ b/aom_dsp/arm/loopfilter_16_neon.asm
@@ -1,19 +1,22 @@
 ;
-;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
 ;
-;  Use of this source code is governed by a BSD-style license
+; This source code is subject to the terms of the BSD 2 Clause License and
-;  that can be found in the LICENSE file in the root of the source
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-;  tree. An additional intellectual property rights grant can be found
+; was not distributed with this source code in the LICENSE file, you can
-;  in the file PATENTS.  All contributing project authors may
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-;  be found in the AUTHORS file in the root of the source tree.
+; Media Patent License 1.0 was not distributed with this source code in the
 ; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 ;
-    EXPORT  |vpx_lpf_horizontal_4_dual_neon|
+;
    EXPORT  |aom_lpf_horizontal_4_dual_neon|
    ARM
    AREA ||.text||, CODE, READONLY, ALIGN=2
-;void vpx_lpf_horizontal_4_dual_neon(uint8_t *s, int p,
+;void aom_lpf_horizontal_4_dual_neon(uint8_t *s, int p,
 ;                                    const uint8_t *blimit0,
 ;                                    const uint8_t *limit0,
 ;                                    const uint8_t *thresh0,
@@ -29,7 +32,7 @@
 ; sp+8  const uint8_t *limit1,
 ; sp+12 const uint8_t *thresh1,
-|vpx_lpf_horizontal_4_dual_neon| PROC
+|aom_lpf_horizontal_4_dual_neon| PROC
    push        {lr}
    ldr         r12, [sp, #4]              ; load thresh0
@@ -66,7 +69,7 @@
    sub         r2, r2, r1, lsl #1
    sub         r3, r3, r1, lsl #1
-    bl          vpx_loop_filter_neon_16
+    bl          aom_loop_filter_neon_16
    vst1.u8     {q5}, [r2@64], r1          ; store op1
    vst1.u8     {q6}, [r3@64], r1          ; store op0
@@ -76,9 +79,9 @@
    vpop        {d8-d15}                   ; restore neon registers
    pop         {pc}
-    ENDP        ; |vpx_lpf_horizontal_4_dual_neon|
+    ENDP        ; |aom_lpf_horizontal_4_dual_neon|
-; void vpx_loop_filter_neon_16();
+; void aom_loop_filter_neon_16();
 ; This is a helper function for the loopfilters. The individual functions do the
 ; necessary load, transpose (if necessary) and store. This function uses
 ; registers d8-d15, so the calling function must save those registers.
@@ -101,7 +104,7 @@
 ; q6    op0
 ; q7    oq0
 ; q8    oq1
-|vpx_loop_filter_neon_16| PROC
+|aom_loop_filter_neon_16| PROC
    ; filter_mask
    vabd.u8     q11, q3, q4                 ; m1 = abs(p3 - p2)
@@ -194,6 +197,6 @@
    veor        q8, q12, q10                ; *oq1 = u^0x80
    bx          lr
-    ENDP        ; |vpx_loop_filter_neon_16|
+    ENDP        ; |aom_loop_filter_neon_16|
    END
--- a/aom_dsp/arm/loopfilter_16_neon.c
+++ b/aom_dsp/arm/loopfilter_16_neon.c
@@ -0,0 +1,174 @@
 /*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
 #include <arm_neon.h>
 #include "./aom_dsp_rtcd.h"
 #include "./aom_config.h"
 #include "aom/aom_integer.h"
 // 4-tap loop filter kernel that processes 16 pixels (one per byte lane) at a
 // time. q3..q10 hold the eight taps p3, p2, p1, p0, q0, q1, q2, q3 across the
 // edge and qblimit/qlimit/qthresh hold the per-lane thresholds. The filtered
 // p1, p0, q0 and q1 are written through q5r, q6r, q7r and q8r. In lanes that
 // fail the filter mask the filter value is ANDed to zero, so those lanes come
 // back unchanged. The by-value parameters q3, q4, q9 and q10 are reused as
 // scratch once their tap value is no longer needed.
 static INLINE void loop_filter_neon_16(uint8x16_t qblimit,  // blimit
                                       uint8x16_t qlimit,   // limit
                                       uint8x16_t qthresh,  // thresh
                                       uint8x16_t q3,       // p3
                                       uint8x16_t q4,       // p2
                                       uint8x16_t q5,       // p1
                                       uint8x16_t q6,       // p0
                                       uint8x16_t q7,       // q0
                                       uint8x16_t q8,       // q1
                                       uint8x16_t q9,       // q2
                                       uint8x16_t q10,      // q3
                                       uint8x16_t *q5r,     // p1
                                       uint8x16_t *q6r,     // p0
                                       uint8x16_t *q7r,     // q0
                                       uint8x16_t *q8r) {   // q1
  uint8x16_t q1u8, q2u8, q11u8, q12u8, q13u8, q14u8, q15u8;
  int16x8_t q2s16, q11s16;
  uint16x8_t q4u16;
  int8x16_t q0s8, q1s8, q2s8, q11s8, q12s8, q13s8;
  int8x8_t d2s8, d3s8;
  // Absolute differences between neighboring taps:
  // |p3 - p2|, |p2 - p1|, |p1 - p0|, |q1 - q0|, |q2 - q1|, |q3 - q2|.
  q11u8 = vabdq_u8(q3, q4);
  q12u8 = vabdq_u8(q4, q5);
  q13u8 = vabdq_u8(q5, q6);
  q14u8 = vabdq_u8(q8, q7);
  q3 = vabdq_u8(q9, q8);
  q4 = vabdq_u8(q10, q9);
  // Reduce the six differences to their per-lane maximum (finished below).
  q11u8 = vmaxq_u8(q11u8, q12u8);
  q12u8 = vmaxq_u8(q13u8, q14u8);
  q3 = vmaxq_u8(q3, q4);
  q15u8 = vmaxq_u8(q11u8, q12u8);
  q9 = vabdq_u8(q6, q7);  // |p0 - q0|
  // High edge variance tests: |p1 - p0| > thresh and |q1 - q0| > thresh.
  q13u8 = vcgtq_u8(q13u8, qthresh);
  q14u8 = vcgtq_u8(q14u8, qthresh);
  q15u8 = vmaxq_u8(q15u8, q3);
  q2u8 = vabdq_u8(q5, q8);  // |p1 - q1|
  q9 = vqaddq_u8(q9, q9);   // 2 * |p0 - q0|, saturating
  // All-ones in lanes where every neighbor difference is <= limit.
  q15u8 = vcgeq_u8(qlimit, q15u8);
  // Filter computation.
  // Convert p1, p0, q0, q1 to signed by flipping the top bit.
  q10 = vdupq_n_u8(0x80);
  q8 = veorq_u8(q8, q10);
  q7 = veorq_u8(q7, q10);
  q6 = veorq_u8(q6, q10);
  q5 = veorq_u8(q5, q10);
  // 2 * |p0 - q0| + |p1 - q1| / 2, saturating.
  q2u8 = vshrq_n_u8(q2u8, 1);
  q9 = vqaddq_u8(q9, q2u8);
  // Widened (q0 - p0) for the low and high eight lanes.
  q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)),
                   vget_low_s8(vreinterpretq_s8_u8(q6)));
  q11s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)),
                    vget_high_s8(vreinterpretq_s8_u8(q6)));
  // All-ones in lanes where the edge difference above is <= blimit.
  q9 = vcgeq_u8(qblimit, q9);
  // Saturating (p1 - q1).
  q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5), vreinterpretq_s8_u8(q8));
  q14u8 = vorrq_u8(q13u8, q14u8);  // combined high-edge-variance mask
  // 3 * (q0 - p0) in 16 bits.
  q4u16 = vdupq_n_u16(3);
  q2s16 = vmulq_s16(q2s16, vreinterpretq_s16_u16(q4u16));
  q11s16 = vmulq_s16(q11s16, vreinterpretq_s16_u16(q4u16));
  // (p1 - q1) contributes only in high-variance lanes.
  q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q14u8);
  q15u8 = vandq_u8(q15u8, q9);  // final filter mask: limit AND blimit tests
  q1s8 = vreinterpretq_s8_u8(q1u8);
  q2s16 = vaddw_s8(q2s16, vget_low_s8(q1s8));
  q11s16 = vaddw_s8(q11s16, vget_high_s8(q1s8));
  q4 = vdupq_n_u8(3);
  q9 = vdupq_n_u8(4);
  // filter = clamp(filter + 3 * (qs0 - ps0)), then zeroed outside the mask.
  d2s8 = vqmovn_s16(q2s16);
  d3s8 = vqmovn_s16(q11s16);
  q1s8 = vcombine_s8(d2s8, d3s8);
  q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q15u8);
  q1s8 = vreinterpretq_s8_u8(q1u8);
  // q2s8 = (filter + 3) >> 3 is added to p0;
  // q1s8 = (filter + 4) >> 3 is subtracted from q0.
  q2s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q4));
  q1s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q9));
  q2s8 = vshrq_n_s8(q2s8, 3);
  q1s8 = vshrq_n_s8(q1s8, 3);
  q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q2s8);
  q0s8 = vqsubq_s8(vreinterpretq_s8_u8(q7), q1s8);
  // Outer-tap adjustment: rounding halve, then clear it in high-variance
  // lanes so p1/q1 are only modified where the variance is low.
  q1s8 = vrshrq_n_s8(q1s8, 1);
  q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8));
  q13s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q1s8);
  q12s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q1s8);
  // Flip the top bit again to return to unsigned pixel values.
  *q8r = veorq_u8(vreinterpretq_u8_s8(q12s8), q10);
  *q7r = veorq_u8(vreinterpretq_u8_s8(q0s8), q10);
  *q6r = veorq_u8(vreinterpretq_u8_s8(q11s8), q10);
  *q5r = veorq_u8(vreinterpretq_u8_s8(q13s8), q10);
  return;
 }
 void aom_lpf_horizontal_4_dual_neon(
    uint8_t *s, int p /* pitch */, const uint8_t *blimit0,
    const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1,
    const uint8_t *limit1, const uint8_t *thresh1) {
  uint8x8_t dblimit0, dlimit0, dthresh0, dblimit1, dlimit1, dthresh1;
  uint8x16_t qblimit, qlimit, qthresh;
  uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8, q10u8;
  dblimit0 = vld1_u8(blimit0);
  dlimit0 = vld1_u8(limit0);
  dthresh0 = vld1_u8(thresh0);
  dblimit1 = vld1_u8(blimit1);
  dlimit1 = vld1_u8(limit1);
  dthresh1 = vld1_u8(thresh1);
  qblimit = vcombine_u8(dblimit0, dblimit1);
  qlimit = vcombine_u8(dlimit0, dlimit1);
  qthresh = vcombine_u8(dthresh0, dthresh1);
  s -= (p << 2);
  q3u8 = vld1q_u8(s);
  s += p;
  q4u8 = vld1q_u8(s);
  s += p;
  q5u8 = vld1q_u8(s);
  s += p;
  q6u8 = vld1q_u8(s);
  s += p;
  q7u8 = vld1q_u8(s);
  s += p;
  q8u8 = vld1q_u8(s);
  s += p;
  q9u8 = vld1q_u8(s);
  s += p;
  q10u8 = vld1q_u8(s);
  loop_filter_neon_16(qblimit, qlimit, qthresh, q3u8, q4u8, q5u8, q6u8, q7u8,
                      q8u8, q9u8, q10u8, &q5u8, &q6u8, &q7u8, &q8u8);
  s -= (p * 5);
  vst1q_u8(s, q5u8);
  s += p;
  vst1q_u8(s, q6u8);
  s += p;
  vst1q_u8(s, q7u8);
  s += p;
  vst1q_u8(s, q8u8);
  return;
 }
--- a/aom_dsp/arm/loopfilter_4_neon.asm
+++ b/aom_dsp/arm/loopfilter_4_neon.asm
@@ -1,23 +1,26 @@
 ;
-;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
 ;
-;  Use of this source code is governed by a BSD-style license
+; This source code is subject to the terms of the BSD 2 Clause License and
-;  that can be found in the LICENSE file in the root of the source
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-;  tree. An additional intellectual property rights grant can be found
+; was not distributed with this source code in the LICENSE file, you can
-;  in the file PATENTS.  All contributing project authors may
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-;  be found in the AUTHORS file in the root of the source tree.
+; Media Patent License 1.0 was not distributed with this source code in the
 ; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 ;
-    EXPORT  |vpx_lpf_horizontal_4_neon|
+;
-    EXPORT  |vpx_lpf_vertical_4_neon|
+
    EXPORT  |aom_lpf_horizontal_4_neon|
    EXPORT  |aom_lpf_vertical_4_neon|
    ARM
    AREA ||.text||, CODE, READONLY, ALIGN=2
-; Currently vpx only works on iterations 8 at a time. The vp8 loop filter
+; Currently aom only works on iterations 8 at a time. The aom loop filter
 ; works on 16 iterations at a time.
 ;
-; void vpx_lpf_horizontal_4_neon(uint8_t *s,
+; void aom_lpf_horizontal_4_neon(uint8_t *s,
 ;                                int p /* pitch */,
 ;                                const uint8_t *blimit,
 ;                                const uint8_t *limit,
@@ -28,7 +31,7 @@
 ; r2    const uint8_t *blimit,
 ; r3    const uint8_t *limit,
 ; sp    const uint8_t *thresh,
-|vpx_lpf_horizontal_4_neon| PROC
+|aom_lpf_horizontal_4_neon| PROC
    push        {lr}
    vld1.8      {d0[]}, [r2]               ; duplicate *blimit
@@ -53,7 +56,7 @@
    sub         r2, r2, r1, lsl #1
    sub         r3, r3, r1, lsl #1
-    bl          vpx_loop_filter_neon
+    bl          aom_loop_filter_neon
    vst1.u8     {d4}, [r2@64], r1          ; store op1
    vst1.u8     {d5}, [r3@64], r1          ; store op0
@@ -61,12 +64,12 @@
    vst1.u8     {d7}, [r3@64], r1          ; store oq1
    pop         {pc}
-    ENDP        ; |vpx_lpf_horizontal_4_neon|
+    ENDP        ; |aom_lpf_horizontal_4_neon|
-; Currently vpx only works on iterations 8 at a time. The vp8 loop filter
+; Currently aom only works on iterations 8 at a time. The aom loop filter
 ; works on 16 iterations at a time.
 ;
-; void vpx_lpf_vertical_4_neon(uint8_t *s,
+; void aom_lpf_vertical_4_neon(uint8_t *s,
 ;                              int p /* pitch */,
 ;                              const uint8_t *blimit,
 ;                              const uint8_t *limit,
@@ -77,7 +80,7 @@
 ; r2    const uint8_t *blimit,
 ; r3    const uint8_t *limit,
 ; sp    const uint8_t *thresh,
-|vpx_lpf_vertical_4_neon| PROC
+|aom_lpf_vertical_4_neon| PROC
    push        {lr}
    vld1.8      {d0[]}, [r2]              ; duplicate *blimit
@@ -113,7 +116,7 @@
    vtrn.8      d7, d16
    vtrn.8      d17, d18
-    bl          vpx_loop_filter_neon
+    bl          aom_loop_filter_neon
    sub         r0, r0, #2
@@ -128,9 +131,9 @@
    vst4.8      {d4[7], d5[7], d6[7], d7[7]}, [r0]
    pop         {pc}
-    ENDP        ; |vpx_lpf_vertical_4_neon|
+    ENDP        ; |aom_lpf_vertical_4_neon|
-; void vpx_loop_filter_neon();
+; void aom_loop_filter_neon();
 ; This is a helper function for the loopfilters. The individual functions do the
 ; necessary load, transpose (if necessary) and store. The function does not use
 ; registers d8-d15.
@@ -154,7 +157,7 @@
 ; d5    op0
 ; d6    oq0
 ; d7    oq1
-|vpx_loop_filter_neon| PROC
+|aom_loop_filter_neon| PROC
    ; filter_mask
    vabd.u8     d19, d3, d4                 ; m1 = abs(p3 - p2)
    vabd.u8     d20, d4, d5                 ; m2 = abs(p2 - p1)
@@ -244,6 +247,6 @@
    veor        d7, d20, d18                ; *oq1 = u^0x80
    bx          lr
-    ENDP        ; |vpx_loop_filter_neon|
+    ENDP        ; |aom_loop_filter_neon|
    END
--- a/aom_dsp/arm/loopfilter_4_neon.c
+++ b/aom_dsp/arm/loopfilter_4_neon.c
@@ -0,0 +1,250 @@
 /*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
 #include <arm_neon.h>
 #include "./aom_dsp_rtcd.h"
 // 4-tap loop filter kernel that processes 8 pixels (one per byte lane) at a
 // time. d3u8..d18u8 hold the eight taps p3, p2, p1, p0, q0, q1, q2, q3 across
 // the edge; dblimit/dlimit/dthresh hold per-lane thresholds. The filtered p1,
 // p0, q0 and q1 are written through d4ru8, d5ru8, d6ru8 and d7ru8. The filter
 // value is ANDed with the filter mask, so lanes that fail the mask come back
 // unchanged. Several by-value parameters (d3u8, d4u8, d17u8, d18u8) are
 // reused as scratch once their tap value is no longer needed.
 static INLINE void loop_filter_neon(uint8x8_t dblimit,   // blimit
                                    uint8x8_t dlimit,    // limit
                                    uint8x8_t dthresh,   // thresh
                                    uint8x8_t d3u8,      // p3
                                    uint8x8_t d4u8,      // p2
                                    uint8x8_t d5u8,      // p1
                                    uint8x8_t d6u8,      // p0
                                    uint8x8_t d7u8,      // q0
                                    uint8x8_t d16u8,     // q1
                                    uint8x8_t d17u8,     // q2
                                    uint8x8_t d18u8,     // q3
                                    uint8x8_t *d4ru8,    // p1
                                    uint8x8_t *d5ru8,    // p0
                                    uint8x8_t *d6ru8,    // q0
                                    uint8x8_t *d7ru8) {  // q1
  uint8x8_t d19u8, d20u8, d21u8, d22u8, d23u8, d27u8, d28u8;
  int16x8_t q12s16;
  int8x8_t d19s8, d20s8, d21s8, d26s8, d27s8, d28s8;
  // Absolute differences between neighboring taps:
  // |p3 - p2|, |p2 - p1|, |p1 - p0|, |q1 - q0|, |q2 - q1|, |q3 - q2|.
  d19u8 = vabd_u8(d3u8, d4u8);
  d20u8 = vabd_u8(d4u8, d5u8);
  d21u8 = vabd_u8(d5u8, d6u8);
  d22u8 = vabd_u8(d16u8, d7u8);
  d3u8 = vabd_u8(d17u8, d16u8);
  d4u8 = vabd_u8(d18u8, d17u8);
  // Reduce the six differences to their per-lane maximum (finished below).
  d19u8 = vmax_u8(d19u8, d20u8);
  d20u8 = vmax_u8(d21u8, d22u8);
  d3u8 = vmax_u8(d3u8, d4u8);
  d23u8 = vmax_u8(d19u8, d20u8);
  d17u8 = vabd_u8(d6u8, d7u8);  // |p0 - q0|
  // High edge variance tests: |p1 - p0| > thresh and |q1 - q0| > thresh.
  d21u8 = vcgt_u8(d21u8, dthresh);
  d22u8 = vcgt_u8(d22u8, dthresh);
  d23u8 = vmax_u8(d23u8, d3u8);
  d28u8 = vabd_u8(d5u8, d16u8);    // |p1 - q1|
  d17u8 = vqadd_u8(d17u8, d17u8);  // 2 * |p0 - q0|, saturating
  // All-ones in lanes where every neighbor difference is <= limit.
  d23u8 = vcge_u8(dlimit, d23u8);
  // Convert p1, p0, q0, q1 to signed by flipping the top bit.
  d18u8 = vdup_n_u8(0x80);
  d5u8 = veor_u8(d5u8, d18u8);
  d6u8 = veor_u8(d6u8, d18u8);
  d7u8 = veor_u8(d7u8, d18u8);
  d16u8 = veor_u8(d16u8, d18u8);
  // 2 * |p0 - q0| + |p1 - q1| / 2, saturating.
  d28u8 = vshr_n_u8(d28u8, 1);
  d17u8 = vqadd_u8(d17u8, d28u8);
  d19u8 = vdup_n_u8(3);
  // (q0 - p0) with a wrapping 8-bit subtract.
  // NOTE(review): presumably safe because lanes where |q0 - p0| is large fail
  // the blimit test and are masked to zero below -- confirm against the range
  // of blimit values callers pass.
  d28s8 = vsub_s8(vreinterpret_s8_u8(d7u8), vreinterpret_s8_u8(d6u8));
  // All-ones in lanes where the edge difference above is <= blimit.
  d17u8 = vcge_u8(dblimit, d17u8);
  // Saturating (p1 - q1).
  d27s8 = vqsub_s8(vreinterpret_s8_u8(d5u8), vreinterpret_s8_u8(d16u8));
  d22u8 = vorr_u8(d21u8, d22u8);  // combined high-edge-variance mask
  // 3 * (q0 - p0), widened to 16 bits.
  q12s16 = vmull_s8(d28s8, vreinterpret_s8_u8(d19u8));
  // (p1 - q1) contributes only in high-variance lanes.
  d27u8 = vand_u8(vreinterpret_u8_s8(d27s8), d22u8);
  d23u8 = vand_u8(d23u8, d17u8);  // final filter mask: limit AND blimit tests
  q12s16 = vaddw_s8(q12s16, vreinterpret_s8_u8(d27u8));
  d17u8 = vdup_n_u8(4);
  // Saturate the filter value back to 8 bits and zero it outside the mask.
  d27s8 = vqmovn_s16(q12s16);
  d27u8 = vand_u8(vreinterpret_u8_s8(d27s8), d23u8);
  d27s8 = vreinterpret_s8_u8(d27u8);
  // d28s8 = (filter + 3) >> 3 is added to p0;
  // d27s8 = (filter + 4) >> 3 is subtracted from q0.
  d28s8 = vqadd_s8(d27s8, vreinterpret_s8_u8(d19u8));
  d27s8 = vqadd_s8(d27s8, vreinterpret_s8_u8(d17u8));
  d28s8 = vshr_n_s8(d28s8, 3);
  d27s8 = vshr_n_s8(d27s8, 3);
  d19s8 = vqadd_s8(vreinterpret_s8_u8(d6u8), d28s8);
  d26s8 = vqsub_s8(vreinterpret_s8_u8(d7u8), d27s8);
  // Outer-tap adjustment: rounding halve, then clear it in high-variance
  // lanes so p1/q1 are only modified where the variance is low.
  d27s8 = vrshr_n_s8(d27s8, 1);
  d27s8 = vbic_s8(d27s8, vreinterpret_s8_u8(d22u8));
  d21s8 = vqadd_s8(vreinterpret_s8_u8(d5u8), d27s8);
  d20s8 = vqsub_s8(vreinterpret_s8_u8(d16u8), d27s8);
  // Flip the top bit again to return to unsigned pixel values.
  *d4ru8 = veor_u8(vreinterpret_u8_s8(d21s8), d18u8);
  *d5ru8 = veor_u8(vreinterpret_u8_s8(d19s8), d18u8);
  *d6ru8 = veor_u8(vreinterpret_u8_s8(d26s8), d18u8);
  *d7ru8 = veor_u8(vreinterpret_u8_s8(d20s8), d18u8);
  return;
 }
 void aom_lpf_horizontal_4_neon(uint8_t *src, int pitch, const uint8_t *blimit,
                               const uint8_t *limit, const uint8_t *thresh) {
  int i;
  uint8_t *s, *psrc;
  uint8x8_t dblimit, dlimit, dthresh;
  uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8;
  dblimit = vld1_u8(blimit);
  dlimit = vld1_u8(limit);
  dthresh = vld1_u8(thresh);
  psrc = src - (pitch << 2);
  for (i = 0; i < 1; i++) {
    s = psrc + i * 8;
    d3u8 = vld1_u8(s);
    s += pitch;
    d4u8 = vld1_u8(s);
    s += pitch;
    d5u8 = vld1_u8(s);
    s += pitch;
    d6u8 = vld1_u8(s);
    s += pitch;
    d7u8 = vld1_u8(s);
    s += pitch;
    d16u8 = vld1_u8(s);
    s += pitch;
    d17u8 = vld1_u8(s);
    s += pitch;
    d18u8 = vld1_u8(s);
    loop_filter_neon(dblimit, dlimit, dthresh, d3u8, d4u8, d5u8, d6u8, d7u8,
                     d16u8, d17u8, d18u8, &d4u8, &d5u8, &d6u8, &d7u8);
    s -= (pitch * 5);
    vst1_u8(s, d4u8);
    s += pitch;
    vst1_u8(s, d5u8);
    s += pitch;
    vst1_u8(s, d6u8);
    s += pitch;
    vst1_u8(s, d7u8);
  }
  return;
 }
 void aom_lpf_vertical_4_neon(uint8_t *src, int pitch, const uint8_t *blimit,
                             const uint8_t *limit, const uint8_t *thresh) {
  int i, pitch8;
  uint8_t *s;
  uint8x8_t dblimit, dlimit, dthresh;
  uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8;
  uint32x2x2_t d2tmp0, d2tmp1, d2tmp2, d2tmp3;
  uint16x4x2_t d2tmp4, d2tmp5, d2tmp6, d2tmp7;
  uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11;
  uint8x8x4_t d4Result;
  dblimit = vld1_u8(blimit);
  dlimit = vld1_u8(limit);
  dthresh = vld1_u8(thresh);
  pitch8 = pitch * 8;
  for (i = 0; i < 1; i++, src += pitch8) {
    s = src - (i + 1) * 4;
    d3u8 = vld1_u8(s);
    s += pitch;
    d4u8 = vld1_u8(s);
    s += pitch;
    d5u8 = vld1_u8(s);
    s += pitch;
    d6u8 = vld1_u8(s);
    s += pitch;
    d7u8 = vld1_u8(s);
    s += pitch;
    d16u8 = vld1_u8(s);
    s += pitch;
    d17u8 = vld1_u8(s);
    s += pitch;
    d18u8 = vld1_u8(s);
    d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8), vreinterpret_u32_u8(d7u8));
    d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8), vreinterpret_u32_u8(d16u8));
    d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8), vreinterpret_u32_u8(d17u8));
    d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8), vreinterpret_u32_u8(d18u8));
    d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]),
                      vreinterpret_u16_u32(d2tmp2.val[0]));
    d2tmp5 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[0]),
                      vreinterpret_u16_u32(d2tmp3.val[0]));
    d2tmp6 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[1]),
                      vreinterpret_u16_u32(d2tmp2.val[1]));
    d2tmp7 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[1]),
                      vreinterpret_u16_u32(d2tmp3.val[1]));
    d2tmp8 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[0]),
                     vreinterpret_u8_u16(d2tmp5.val[0]));
    d2tmp9 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[1]),
                     vreinterpret_u8_u16(d2tmp5.val[1]));
    d2tmp10 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[0]),
                      vreinterpret_u8_u16(d2tmp7.val[0]));
    d2tmp11 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[1]),
                      vreinterpret_u8_u16(d2tmp7.val[1]));
    d3u8 = d2tmp8.val[0];
    d4u8 = d2tmp8.val[1];
    d5u8 = d2tmp9.val[0];
    d6u8 = d2tmp9.val[1];
    d7u8 = d2tmp10.val[0];
    d16u8 = d2tmp10.val[1];
    d17u8 = d2tmp11.val[0];
    d18u8 = d2tmp11.val[1];
    loop_filter_neon(dblimit, dlimit, dthresh, d3u8, d4u8, d5u8, d6u8, d7u8,
                     d16u8, d17u8, d18u8, &d4u8, &d5u8, &d6u8, &d7u8);
    d4Result.val[0] = d4u8;
    d4Result.val[1] = d5u8;
    d4Result.val[2] = d6u8;
    d4Result.val[3] = d7u8;
    src -= 2;
    vst4_lane_u8(src, d4Result, 0);
    src += pitch;
    vst4_lane_u8(src, d4Result, 1);
    src += pitch;
    vst4_lane_u8(src, d4Result, 2);
    src += pitch;
    vst4_lane_u8(src, d4Result, 3);
    src += pitch;
    vst4_lane_u8(src, d4Result, 4);
    src += pitch;
    vst4_lane_u8(src, d4Result, 5);
    src += pitch;
    vst4_lane_u8(src, d4Result, 6);
    src += pitch;
    vst4_lane_u8(src, d4Result, 7);
  }
  return;
 }
--- a/aom_dsp/arm/loopfilter_8_neon.asm
+++ b/aom_dsp/arm/loopfilter_8_neon.asm
@@ -1,23 +1,26 @@
 ;
-;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
 ;
-;  Use of this source code is governed by a BSD-style license
+; This source code is subject to the terms of the BSD 2 Clause License and
-;  that can be found in the LICENSE file in the root of the source
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-;  tree. An additional intellectual property rights grant can be found
+; was not distributed with this source code in the LICENSE file, you can
-;  in the file PATENTS.  All contributing project authors may
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-;  be found in the AUTHORS file in the root of the source tree.
+; Media Patent License 1.0 was not distributed with this source code in the
 ; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 ;
-    EXPORT  |vpx_lpf_horizontal_8_neon|
+;
-    EXPORT  |vpx_lpf_vertical_8_neon|
+
    EXPORT  |aom_lpf_horizontal_8_neon|
    EXPORT  |aom_lpf_vertical_8_neon|
    ARM
    AREA ||.text||, CODE, READONLY, ALIGN=2
-; Currently vpx only works on iterations 8 at a time. The vp8 loop filter
+; Currently aom only works on iterations 8 at a time. The aom loop filter
 ; works on 16 iterations at a time.
 ;
-; void vpx_lpf_horizontal_8_neon(uint8_t *s, int p,
+; void aom_lpf_horizontal_8_neon(uint8_t *s, int p,
 ;                                const uint8_t *blimit,
 ;                                const uint8_t *limit,
 ;                                const uint8_t *thresh)
@@ -26,7 +29,7 @@
 ; r2    const uint8_t *blimit,
 ; r3    const uint8_t *limit,
 ; sp    const uint8_t *thresh,
-|vpx_lpf_horizontal_8_neon| PROC
+|aom_lpf_horizontal_8_neon| PROC
    push        {r4-r5, lr}
    vld1.8      {d0[]}, [r2]               ; duplicate *blimit
@@ -51,7 +54,7 @@
    sub         r3, r3, r1, lsl #1
    sub         r2, r2, r1, lsl #2
-    bl          vpx_mbloop_filter_neon
+    bl          aom_mbloop_filter_neon
    vst1.u8     {d0}, [r2@64], r1          ; store op2
    vst1.u8     {d1}, [r3@64], r1          ; store op1
@@ -62,9 +65,9 @@
    pop         {r4-r5, pc}
-    ENDP        ; |vpx_lpf_horizontal_8_neon|
+    ENDP        ; |aom_lpf_horizontal_8_neon|
-; void vpx_lpf_vertical_8_neon(uint8_t *s,
+; void aom_lpf_vertical_8_neon(uint8_t *s,
 ;                              int pitch,
 ;                              const uint8_t *blimit,
 ;                              const uint8_t *limit,
@@ -75,7 +78,7 @@
 ; r2    const uint8_t *blimit,
 ; r3    const uint8_t *limit,
 ; sp    const uint8_t *thresh,
-|vpx_lpf_vertical_8_neon| PROC
+|aom_lpf_vertical_8_neon| PROC
    push        {r4-r5, lr}
    vld1.8      {d0[]}, [r2]              ; duplicate *blimit
@@ -114,7 +117,7 @@
    sub         r2, r0, #3
    add         r3, r0, #1
-    bl          vpx_mbloop_filter_neon
+    bl          aom_mbloop_filter_neon
    ;store op2, op1, op0, oq0
    vst4.8      {d0[0], d1[0], d2[0], d3[0]}, [r2], r1
@@ -137,9 +140,9 @@
    vst2.8      {d4[7], d5[7]}, [r3]
    pop         {r4-r5, pc}
-    ENDP        ; |vpx_lpf_vertical_8_neon|
+    ENDP        ; |aom_lpf_vertical_8_neon|
-; void vpx_mbloop_filter_neon();
+; void aom_mbloop_filter_neon();
 ; This is a helper function for the loopfilters. The individual functions do the
 ; necessary load, transpose (if necessary) and store. The function does not use
 ; registers d8-d15.
@@ -165,7 +168,7 @@
 ; d3    oq0
 ; d4    oq1
 ; d5    oq2
-|vpx_mbloop_filter_neon| PROC
+|aom_mbloop_filter_neon| PROC
    ; filter_mask
    vabd.u8     d19, d3, d4                ; m1 = abs(p3 - p2)
    vabd.u8     d20, d4, d5                ; m2 = abs(p2 - p1)
@@ -420,6 +423,6 @@ filter_branch_only
    bx          lr
-    ENDP        ; |vpx_mbloop_filter_neon|
+    ENDP        ; |aom_mbloop_filter_neon|
    END
--- a/aom_dsp/arm/loopfilter_8_neon.c
+++ b/aom_dsp/arm/loopfilter_8_neon.c
@@ -0,0 +1,430 @@
 /*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
 #include <arm_neon.h>
 #include "./aom_dsp_rtcd.h"
 // 8-tap ("mb") loop filter kernel that processes 8 pixels (one per byte lane)
 // at a time. d3u8..d18u8 hold the eight taps p3, p2, p1, p0, q0, q1, q2, q3
 // across the edge; dblimit/dlimit/dthresh hold per-lane thresholds. Six
 // outputs (p2, p1, p0, q0, q1, q2) are written through d0ru8..d5ru8.
 //
 // Two per-lane masks are built: the filter mask (d19u8) and the flat mask
 // (d20u8, which is already ANDed with the filter mask). A 32-bit summary of
 // the flat mask then selects one of three paths:
 //   - every lane flat: only the smoothing (averaging) filter is evaluated;
 //   - no lane flat:    only the 4-tap filter is evaluated, p2/q2 pass through;
 //   - mixed:           both are evaluated and blended per lane with vbsl.
 static INLINE void mbloop_filter_neon(uint8x8_t dblimit,   // mblimit
                                      uint8x8_t dlimit,    // limit
                                      uint8x8_t dthresh,   // thresh
                                      uint8x8_t d3u8,      // p3
                                      uint8x8_t d4u8,      // p2
                                      uint8x8_t d5u8,      // p1
                                      uint8x8_t d6u8,      // p0
                                      uint8x8_t d7u8,      // q0
                                      uint8x8_t d16u8,     // q1
                                      uint8x8_t d17u8,     // q2
                                      uint8x8_t d18u8,     // q3
                                      uint8x8_t *d0ru8,    // p2
                                      uint8x8_t *d1ru8,    // p1
                                      uint8x8_t *d2ru8,    // p0
                                      uint8x8_t *d3ru8,    // q0
                                      uint8x8_t *d4ru8,    // q1
                                      uint8x8_t *d5ru8) {  // q2
  uint32_t flat;
  uint8x8_t d0u8, d1u8, d2u8, d19u8, d20u8, d21u8, d22u8, d23u8, d24u8;
  uint8x8_t d25u8, d26u8, d27u8, d28u8, d29u8, d30u8, d31u8;
  int16x8_t q15s16;
  uint16x8_t q10u16, q14u16;
  int8x8_t d21s8, d24s8, d25s8, d26s8, d28s8, d29s8, d30s8;
  // Absolute differences between neighboring taps:
  // |p3 - p2|, |p2 - p1|, |p1 - p0|, |q1 - q0|, |q2 - q1|, |q3 - q2|.
  d19u8 = vabd_u8(d3u8, d4u8);
  d20u8 = vabd_u8(d4u8, d5u8);
  d21u8 = vabd_u8(d5u8, d6u8);
  d22u8 = vabd_u8(d16u8, d7u8);
  d23u8 = vabd_u8(d17u8, d16u8);
  d24u8 = vabd_u8(d18u8, d17u8);
  d19u8 = vmax_u8(d19u8, d20u8);
  d20u8 = vmax_u8(d21u8, d22u8);  // max(|p1 - p0|, |q1 - q0|), reused for flat
  d25u8 = vabd_u8(d6u8, d4u8);    // |p0 - p2|
  d23u8 = vmax_u8(d23u8, d24u8);
  d26u8 = vabd_u8(d7u8, d17u8);  // |q0 - q2|
  d19u8 = vmax_u8(d19u8, d20u8);
  d24u8 = vabd_u8(d6u8, d7u8);   // |p0 - q0|
  d27u8 = vabd_u8(d3u8, d6u8);   // |p3 - p0|
  d28u8 = vabd_u8(d18u8, d7u8);  // |q3 - q0|
  d19u8 = vmax_u8(d19u8, d23u8);  // max of all six neighbor differences
  d23u8 = vabd_u8(d5u8, d16u8);   // |p1 - q1|
  d24u8 = vqadd_u8(d24u8, d24u8);  // 2 * |p0 - q0|, saturating
  // All-ones in lanes where every neighbor difference is <= limit.
  d19u8 = vcge_u8(dlimit, d19u8);
  d25u8 = vmax_u8(d25u8, d26u8);
  d26u8 = vmax_u8(d27u8, d28u8);
  d23u8 = vshr_n_u8(d23u8, 1);
  d25u8 = vmax_u8(d25u8, d26u8);
  d24u8 = vqadd_u8(d24u8, d23u8);  // 2 * |p0 - q0| + |p1 - q1| / 2
  // Largest distance of any of p3, p2, p1 from p0 and q1, q2, q3 from q0.
  d20u8 = vmax_u8(d20u8, d25u8);
  d23u8 = vdup_n_u8(1);
  d24u8 = vcge_u8(dblimit, d24u8);
  d21u8 = vcgt_u8(d21u8, dthresh);  // |p1 - p0| > thresh
  d20u8 = vcge_u8(d23u8, d20u8);    // flat: all of those distances are <= 1
  d19u8 = vand_u8(d19u8, d24u8);    // filter mask: limit AND blimit tests
  d23u8 = vcgt_u8(d22u8, dthresh);  // |q1 - q0| > thresh
  d20u8 = vand_u8(d20u8, d19u8);    // flat lanes must also pass the mask
  d22u8 = vdup_n_u8(0x80);
  d23u8 = vorr_u8(d21u8, d23u8);  // combined high-edge-variance mask
  // Pack the flat mask into a scalar: the narrowing shift keeps one nibble
  // from each mask byte, and lane 0 of the result covers all eight lanes of
  // d20u8. The d21u8 half of q10u16 lands in lane 1 and is not read.
  q10u16 = vcombine_u16(vreinterpret_u16_u8(d20u8), vreinterpret_u16_u8(d21u8));
  d30u8 = vshrn_n_u16(q10u16, 4);
  flat = vget_lane_u32(vreinterpret_u32_u8(d30u8), 0);
  if (flat == 0xffffffff) {  // Check for all 1's, power_branch_only
    // Every lane is flat: emit only the smoothing filter. A running sum is
    // slid across the taps and each output is the sum rounded and divided
    // by 8.
    d27u8 = vdup_n_u8(3);
    d21u8 = vdup_n_u8(2);
    // 3 * p3 + 2 * p2 + p1 + p0 + q0
    q14u16 = vaddl_u8(d6u8, d7u8);
    q14u16 = vmlal_u8(q14u16, d3u8, d27u8);
    q14u16 = vmlal_u8(q14u16, d4u8, d21u8);
    q14u16 = vaddw_u8(q14u16, d5u8);
    *d0ru8 = vqrshrn_n_u16(q14u16, 3);
    // 2 * p3 + p2 + 2 * p1 + p0 + q0 + q1
    q14u16 = vsubw_u8(q14u16, d3u8);
    q14u16 = vsubw_u8(q14u16, d4u8);
    q14u16 = vaddw_u8(q14u16, d5u8);
    q14u16 = vaddw_u8(q14u16, d16u8);
    *d1ru8 = vqrshrn_n_u16(q14u16, 3);
    // p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2
    q14u16 = vsubw_u8(q14u16, d3u8);
    q14u16 = vsubw_u8(q14u16, d5u8);
    q14u16 = vaddw_u8(q14u16, d6u8);
    q14u16 = vaddw_u8(q14u16, d17u8);
    *d2ru8 = vqrshrn_n_u16(q14u16, 3);
    // p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3
    q14u16 = vsubw_u8(q14u16, d3u8);
    q14u16 = vsubw_u8(q14u16, d6u8);
    q14u16 = vaddw_u8(q14u16, d7u8);
    q14u16 = vaddw_u8(q14u16, d18u8);
    *d3ru8 = vqrshrn_n_u16(q14u16, 3);
    // p1 + p0 + q0 + 2 * q1 + q2 + 2 * q3
    q14u16 = vsubw_u8(q14u16, d4u8);
    q14u16 = vsubw_u8(q14u16, d7u8);
    q14u16 = vaddw_u8(q14u16, d16u8);
    q14u16 = vaddw_u8(q14u16, d18u8);
    *d4ru8 = vqrshrn_n_u16(q14u16, 3);
    // p0 + q0 + q1 + 2 * q2 + 3 * q3
    q14u16 = vsubw_u8(q14u16, d5u8);
    q14u16 = vsubw_u8(q14u16, d16u8);
    q14u16 = vaddw_u8(q14u16, d17u8);
    q14u16 = vaddw_u8(q14u16, d18u8);
    *d5ru8 = vqrshrn_n_u16(q14u16, 3);
  } else {
    // At least one lane is not flat: evaluate the 4-tap filter.
    // Convert q0, p0, p1, q1 to signed by flipping the top bit.
    d21u8 = veor_u8(d7u8, d22u8);
    d24u8 = veor_u8(d6u8, d22u8);
    d25u8 = veor_u8(d5u8, d22u8);
    d26u8 = veor_u8(d16u8, d22u8);
    d27u8 = vdup_n_u8(3);
    // (q0 - p0) with a wrapping 8-bit subtract.
    // NOTE(review): presumably safe because lanes where |q0 - p0| is large
    // fail the blimit test and are masked to zero below -- confirm against
    // the range of blimit values callers pass.
    d28s8 = vsub_s8(vreinterpret_s8_u8(d21u8), vreinterpret_s8_u8(d24u8));
    // Saturating (p1 - q1), kept only in high-variance lanes.
    d29s8 = vqsub_s8(vreinterpret_s8_u8(d25u8), vreinterpret_s8_u8(d26u8));
    q15s16 = vmull_s8(d28s8, vreinterpret_s8_u8(d27u8));
    d29s8 = vand_s8(d29s8, vreinterpret_s8_u8(d23u8));
    q15s16 = vaddw_s8(q15s16, d29s8);
    d29u8 = vdup_n_u8(4);
    // Saturate the filter value to 8 bits and zero it outside the mask.
    d28s8 = vqmovn_s16(q15s16);
    d28s8 = vand_s8(d28s8, vreinterpret_s8_u8(d19u8));
    // d30s8 = (filter + 3) >> 3 is added to p0;
    // d29s8 = (filter + 4) >> 3 is subtracted from q0.
    d30s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d27u8));
    d29s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d29u8));
    d30s8 = vshr_n_s8(d30s8, 3);
    d29s8 = vshr_n_s8(d29s8, 3);
    d24s8 = vqadd_s8(vreinterpret_s8_u8(d24u8), d30s8);
    d21s8 = vqsub_s8(vreinterpret_s8_u8(d21u8), d29s8);
    // Outer-tap adjustment: rounding halve, cleared in high-variance lanes.
    d29s8 = vrshr_n_s8(d29s8, 1);
    d29s8 = vbic_s8(d29s8, vreinterpret_s8_u8(d23u8));
    d25s8 = vqadd_s8(vreinterpret_s8_u8(d25u8), d29s8);
    d26s8 = vqsub_s8(vreinterpret_s8_u8(d26u8), d29s8);
    if (flat == 0) {  // filter_branch_only
      // No lane is flat: p2 and q2 pass through untouched and the inner four
      // taps take the 4-tap result (converted back to unsigned).
      *d0ru8 = d4u8;
      *d1ru8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8);
      *d2ru8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8);
      *d3ru8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8);
      *d4ru8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8);
      *d5ru8 = d17u8;
      return;
    }
    // Mixed case: convert the 4-tap results back to unsigned, then compute
    // the smoothing filter with the same sliding sum as the all-flat path
    // and blend the two per lane using the flat mask d20u8.
    d21u8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8);
    d24u8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8);
    d25u8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8);
    d26u8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8);
    d23u8 = vdup_n_u8(2);
    q14u16 = vaddl_u8(d6u8, d7u8);
    q14u16 = vmlal_u8(q14u16, d3u8, d27u8);
    q14u16 = vmlal_u8(q14u16, d4u8, d23u8);
    // d0u8/d1u8/d2u8 hold the non-flat fallbacks (p2 and the 4-tap p1, p0).
    // The threshold vectors selected here for flat lanes never reach the
    // output: the later vbsl with the same mask replaces those lanes with
    // the smoothed value.
    d0u8 = vbsl_u8(d20u8, dblimit, d4u8);
    q14u16 = vaddw_u8(q14u16, d5u8);
    d1u8 = vbsl_u8(d20u8, dlimit, d25u8);
    d30u8 = vqrshrn_n_u16(q14u16, 3);  // smoothed p2
    q14u16 = vsubw_u8(q14u16, d3u8);
    q14u16 = vsubw_u8(q14u16, d4u8);
    q14u16 = vaddw_u8(q14u16, d5u8);
    q14u16 = vaddw_u8(q14u16, d16u8);
    d2u8 = vbsl_u8(d20u8, dthresh, d24u8);
    d31u8 = vqrshrn_n_u16(q14u16, 3);  // smoothed p1
    q14u16 = vsubw_u8(q14u16, d3u8);
    q14u16 = vsubw_u8(q14u16, d5u8);
    q14u16 = vaddw_u8(q14u16, d6u8);
    q14u16 = vaddw_u8(q14u16, d17u8);
    *d0ru8 = vbsl_u8(d20u8, d30u8, d0u8);
    d23u8 = vqrshrn_n_u16(q14u16, 3);  // smoothed p0
    q14u16 = vsubw_u8(q14u16, d3u8);
    q14u16 = vsubw_u8(q14u16, d6u8);
    q14u16 = vaddw_u8(q14u16, d7u8);
    *d1ru8 = vbsl_u8(d20u8, d31u8, d1u8);
    q14u16 = vaddw_u8(q14u16, d18u8);
    *d2ru8 = vbsl_u8(d20u8, d23u8, d2u8);
    d22u8 = vqrshrn_n_u16(q14u16, 3);  // smoothed q0
    q14u16 = vsubw_u8(q14u16, d4u8);
    q14u16 = vsubw_u8(q14u16, d7u8);
    q14u16 = vaddw_u8(q14u16, d16u8);
    // From here the input registers d3u8..d7u8 are recycled as scratch once
    // their tap value has been consumed by the running sum. As above, the
    // flat-lane selections of these first vbsl calls are overwritten by the
    // final blends; only the non-flat fallbacks (4-tap q0, q1 and raw q2)
    // survive.
    d3u8 = vbsl_u8(d20u8, d3u8, d21u8);
    q14u16 = vaddw_u8(q14u16, d18u8);
    d4u8 = vbsl_u8(d20u8, d4u8, d26u8);
    d6u8 = vqrshrn_n_u16(q14u16, 3);  // smoothed q1
    q14u16 = vsubw_u8(q14u16, d5u8);
    q14u16 = vsubw_u8(q14u16, d16u8);
    q14u16 = vaddw_u8(q14u16, d17u8);
    q14u16 = vaddw_u8(q14u16, d18u8);
    d5u8 = vbsl_u8(d20u8, d5u8, d17u8);
    d7u8 = vqrshrn_n_u16(q14u16, 3);  // smoothed q2
    *d3ru8 = vbsl_u8(d20u8, d22u8, d3u8);
    *d4ru8 = vbsl_u8(d20u8, d6u8, d4u8);
    *d5ru8 = vbsl_u8(d20u8, d7u8, d5u8);
  }
  return;
 }
 void aom_lpf_horizontal_8_neon(uint8_t *src, int pitch, const uint8_t *blimit,
                               const uint8_t *limit, const uint8_t *thresh) {
  int i;
  uint8_t *s, *psrc;
  uint8x8_t dblimit, dlimit, dthresh;
  uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
  uint8x8_t d16u8, d17u8, d18u8;
  dblimit = vld1_u8(blimit);
  dlimit = vld1_u8(limit);
  dthresh = vld1_u8(thresh);
  psrc = src - (pitch << 2);
  for (i = 0; i < 1; i++) {
    s = psrc + i * 8;
    d3u8 = vld1_u8(s);
    s += pitch;
    d4u8 = vld1_u8(s);
    s += pitch;
    d5u8 = vld1_u8(s);
    s += pitch;
    d6u8 = vld1_u8(s);
    s += pitch;
    d7u8 = vld1_u8(s);
    s += pitch;
    d16u8 = vld1_u8(s);
    s += pitch;
    d17u8 = vld1_u8(s);
    s += pitch;
    d18u8 = vld1_u8(s);
    mbloop_filter_neon(dblimit, dlimit, dthresh, d3u8, d4u8, d5u8, d6u8, d7u8,
                       d16u8, d17u8, d18u8, &d0u8, &d1u8, &d2u8, &d3u8, &d4u8,
                       &d5u8);
    s -= (pitch * 6);
    vst1_u8(s, d0u8);
    s += pitch;
    vst1_u8(s, d1u8);
    s += pitch;
    vst1_u8(s, d2u8);
    s += pitch;
    vst1_u8(s, d3u8);
    s += pitch;
    vst1_u8(s, d4u8);
    s += pitch;
    vst1_u8(s, d5u8);
  }
  return;
 }
 void aom_lpf_vertical_8_neon(uint8_t *src, int pitch, const uint8_t *blimit,
                             const uint8_t *limit, const uint8_t *thresh) {
  int i;
  uint8_t *s;
  uint8x8_t dblimit, dlimit, dthresh;
  uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
  uint8x8_t d16u8, d17u8, d18u8;
  uint32x2x2_t d2tmp0, d2tmp1, d2tmp2, d2tmp3;
  uint16x4x2_t d2tmp4, d2tmp5, d2tmp6, d2tmp7;
  uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11;
  uint8x8x4_t d4Result;
  uint8x8x2_t d2Result;
  dblimit = vld1_u8(blimit);
  dlimit = vld1_u8(limit);
  dthresh = vld1_u8(thresh);
  for (i = 0; i < 1; i++) {
    s = src + (i * (pitch << 3)) - 4;
    d3u8 = vld1_u8(s);
    s += pitch;
    d4u8 = vld1_u8(s);
    s += pitch;
    d5u8 = vld1_u8(s);
    s += pitch;
    d6u8 = vld1_u8(s);
    s += pitch;
    d7u8 = vld1_u8(s);
    s += pitch;
    d16u8 = vld1_u8(s);
    s += pitch;
    d17u8 = vld1_u8(s);
    s += pitch;
    d18u8 = vld1_u8(s);
    d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8), vreinterpret_u32_u8(d7u8));
    d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8), vreinterpret_u32_u8(d16u8));
    d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8), vreinterpret_u32_u8(d17u8));
    d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8), vreinterpret_u32_u8(d18u8));
    d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]),
                      vreinterpret_u16_u32(d2tmp2.val[0]));
    d2tmp5 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[0]),
                      vreinterpret_u16_u32(d2tmp3.val[0]));
    d2tmp6 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[1]),
                      vreinterpret_u16_u32(d2tmp2.val[1]));
    d2tmp7 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[1]),
                      vreinterpret_u16_u32(d2tmp3.val[1]));
    d2tmp8 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[0]),
                     vreinterpret_u8_u16(d2tmp5.val[0]));
    d2tmp9 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[1]),
                     vreinterpret_u8_u16(d2tmp5.val[1]));
    d2tmp10 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[0]),
                      vreinterpret_u8_u16(d2tmp7.val[0]));
    d2tmp11 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[1]),
                      vreinterpret_u8_u16(d2tmp7.val[1]));
    d3u8 = d2tmp8.val[0];
    d4u8 = d2tmp8.val[1];
    d5u8 = d2tmp9.val[0];
    d6u8 = d2tmp9.val[1];
    d7u8 = d2tmp10.val[0];
    d16u8 = d2tmp10.val[1];
    d17u8 = d2tmp11.val[0];
    d18u8 = d2tmp11.val[1];
    mbloop_filter_neon(dblimit, dlimit, dthresh, d3u8, d4u8, d5u8, d6u8, d7u8,
                       d16u8, d17u8, d18u8, &d0u8, &d1u8, &d2u8, &d3u8, &d4u8,
                       &d5u8);
    d4Result.val[0] = d0u8;
    d4Result.val[1] = d1u8;
    d4Result.val[2] = d2u8;
    d4Result.val[3] = d3u8;
    d2Result.val[0] = d4u8;
    d2Result.val[1] = d5u8;
    s = src - 3;
    vst4_lane_u8(s, d4Result, 0);
    s += pitch;
    vst4_lane_u8(s, d4Result, 1);
    s += pitch;
    vst4_lane_u8(s, d4Result, 2);
    s += pitch;
    vst4_lane_u8(s, d4Result, 3);
    s += pitch;
    vst4_lane_u8(s, d4Result, 4);
    s += pitch;
    vst4_lane_u8(s, d4Result, 5);
    s += pitch;
    vst4_lane_u8(s, d4Result, 6);
    s += pitch;
    vst4_lane_u8(s, d4Result, 7);
    s = src + 1;
    vst2_lane_u8(s, d2Result, 0);
    s += pitch;
    vst2_lane_u8(s, d2Result, 1);
    s += pitch;
    vst2_lane_u8(s, d2Result, 2);
    s += pitch;
    vst2_lane_u8(s, d2Result, 3);
    s += pitch;
    vst2_lane_u8(s, d2Result, 4);
    s += pitch;
    vst2_lane_u8(s, d2Result, 5);
    s += pitch;
    vst2_lane_u8(s, d2Result, 6);
    s += pitch;
    vst2_lane_u8(s, d2Result, 7);
  }
  return;
 }
--- a/aom_dsp/arm/loopfilter_mb_neon.asm
+++ b/aom_dsp/arm/loopfilter_mb_neon.asm
@@ -1,16 +1,19 @@
 ;
-;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
 ;
-;  Use of this source code is governed by a BSD-style license
+; This source code is subject to the terms of the BSD 2 Clause License and
-;  that can be found in the LICENSE file in the root of the source
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-;  tree. An additional intellectual property rights grant can be found
+; was not distributed with this source code in the LICENSE file, you can
-;  in the file PATENTS.  All contributing project authors may
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-;  be found in the AUTHORS file in the root of the source tree.
+; Media Patent License 1.0 was not distributed with this source code in the
 ; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 ;
-    EXPORT  |vpx_lpf_horizontal_edge_8_neon|
+;
-    EXPORT  |vpx_lpf_horizontal_edge_16_neon|
+
-    EXPORT  |vpx_lpf_vertical_16_neon|
+    EXPORT  |aom_lpf_horizontal_edge_8_neon|
    EXPORT  |aom_lpf_horizontal_edge_16_neon|
    EXPORT  |aom_lpf_vertical_16_neon|
    ARM
    AREA ||.text||, CODE, READONLY, ALIGN=2
@@ -55,7 +58,7 @@ h_count
    vld1.u8     {d14}, [r8@64], r1         ; q6
    vld1.u8     {d15}, [r8@64], r1         ; q7
-    bl          vpx_wide_mbfilter_neon
+    bl          aom_wide_mbfilter_neon
    tst         r7, #1
    beq         h_mbfilter
@@ -118,7 +121,7 @@ h_next
    ENDP        ; |mb_lpf_horizontal_edge|
-; void vpx_lpf_horizontal_edge_8_neon(uint8_t *s, int pitch,
+; void aom_lpf_horizontal_edge_8_neon(uint8_t *s, int pitch,
 ;                                     const uint8_t *blimit,
 ;                                     const uint8_t *limit,
 ;                                     const uint8_t *thresh)
@@ -127,12 +130,12 @@ h_next
 ; r2    const uint8_t *blimit,
 ; r3    const uint8_t *limit,
 ; sp    const uint8_t *thresh
-|vpx_lpf_horizontal_edge_8_neon| PROC
+|aom_lpf_horizontal_edge_8_neon| PROC
    mov r12, #1
    b mb_lpf_horizontal_edge
-    ENDP        ; |vpx_lpf_horizontal_edge_8_neon|
+    ENDP        ; |aom_lpf_horizontal_edge_8_neon|
-; void vpx_lpf_horizontal_edge_16_neon(uint8_t *s, int pitch,
+; void aom_lpf_horizontal_edge_16_neon(uint8_t *s, int pitch,
 ;                                      const uint8_t *blimit,
 ;                                      const uint8_t *limit,
 ;                                      const uint8_t *thresh)
@@ -141,12 +144,12 @@ h_next
 ; r2    const uint8_t *blimit,
 ; r3    const uint8_t *limit,
 ; sp    const uint8_t *thresh
-|vpx_lpf_horizontal_edge_16_neon| PROC
+|aom_lpf_horizontal_edge_16_neon| PROC
    mov r12, #2
    b mb_lpf_horizontal_edge
-    ENDP        ; |vpx_lpf_horizontal_edge_16_neon|
+    ENDP        ; |aom_lpf_horizontal_edge_16_neon|
-; void vpx_lpf_vertical_16_neon(uint8_t *s, int p,
+; void aom_lpf_vertical_16_neon(uint8_t *s, int p,
 ;                               const uint8_t *blimit,
 ;                               const uint8_t *limit,
 ;                               const uint8_t *thresh)
@@ -155,7 +158,7 @@ h_next
 ; r2    const uint8_t *blimit,
 ; r3    const uint8_t *limit,
 ; sp    const uint8_t *thresh,
-|vpx_lpf_vertical_16_neon| PROC
+|aom_lpf_vertical_16_neon| PROC
    push        {r4-r8, lr}
    vpush       {d8-d15}
    ldr         r4, [sp, #88]              ; load thresh
@@ -205,7 +208,7 @@ h_next
    vtrn.8      d12, d13
    vtrn.8      d14, d15
-    bl          vpx_wide_mbfilter_neon
+    bl          aom_wide_mbfilter_neon
    tst         r7, #1
    beq         v_mbfilter
@@ -308,9 +311,9 @@ v_end
    vpop        {d8-d15}
    pop         {r4-r8, pc}
-    ENDP        ; |vpx_lpf_vertical_16_neon|
+    ENDP        ; |aom_lpf_vertical_16_neon|
-; void vpx_wide_mbfilter_neon();
+; void aom_wide_mbfilter_neon();
 ; This is a helper function for the loopfilters. The individual functions do the
 ; necessary load, transpose (if necessary) and store.
 ;
@@ -334,7 +337,7 @@ v_end
 ; d13   q5
 ; d14   q6
 ; d15   q7
-|vpx_wide_mbfilter_neon| PROC
+|aom_wide_mbfilter_neon| PROC
    mov         r7, #0
    ; filter_mask
@@ -630,6 +633,6 @@ v_end
    vbif        d3, d14, d17               ; oq6 |= q6 & ~(f2 & f & m)
    bx          lr
-    ENDP        ; |vpx_wide_mbfilter_neon|
+    ENDP        ; |aom_wide_mbfilter_neon|
    END
--- a/aom_dsp/arm/loopfilter_neon.c
+++ b/aom_dsp/arm/loopfilter_neon.c
@@ -0,0 +1,49 @@
 /*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
 #include <arm_neon.h>
 #include "./aom_dsp_rtcd.h"
 #include "./aom_config.h"
 #include "aom/aom_integer.h"
 /*
  * Dual variant of aom_lpf_vertical_4_neon: the single-edge routine is run
  * twice over the same buffer.
  *
  *   s                        start of the first region to filter
  *   p                        stride handed unchanged to both calls
  *   blimit0/limit0/thresh0   thresholds for the call starting at s
  *   blimit1/limit1/thresh1   thresholds for the call starting at s + 8 * p
  *
  * NOTE(review): p is presumably the row pitch, which would make the second
  * call cover the eight rows below the first -- confirm against the caller.
  */
 void aom_lpf_vertical_4_dual_neon(uint8_t *s, int p, const uint8_t *blimit0,
                                   const uint8_t *limit0, const uint8_t *thresh0,
                                   const uint8_t *blimit1, const uint8_t *limit1,
                                   const uint8_t *thresh1) {
   uint8_t *const second_edge = s + 8 * p;

   aom_lpf_vertical_4_neon(s, p, blimit0, limit0, thresh0);
   aom_lpf_vertical_4_neon(second_edge, p, blimit1, limit1, thresh1);
 }
 #if HAVE_NEON_ASM
 /*
  * Dual variant of aom_lpf_horizontal_8_neon: the single-edge routine is run
  * on two side-by-side positions.
  *
  *   s                        start of the first region to filter
  *   p                        pitch, handed unchanged to both calls
  *   blimit0/limit0/thresh0   thresholds for the call starting at s
  *   blimit1/limit1/thresh1   thresholds for the call starting at s + 8
  */
 void aom_lpf_horizontal_8_dual_neon(uint8_t *s, int p, const uint8_t *blimit0,
                                     const uint8_t *limit0,
                                     const uint8_t *thresh0,
                                     const uint8_t *blimit1,
                                     const uint8_t *limit1,
                                     const uint8_t *thresh1) {
   uint8_t *const second_edge = s + 8;

   aom_lpf_horizontal_8_neon(s, p, blimit0, limit0, thresh0);
   aom_lpf_horizontal_8_neon(second_edge, p, blimit1, limit1, thresh1);
 }
 /*
  * Dual variant of aom_lpf_vertical_8_neon: the single-edge routine is run
  * twice over the same buffer.
  *
  *   s                        start of the first region to filter
  *   p                        stride handed unchanged to both calls
  *   blimit0/limit0/thresh0   thresholds for the call starting at s
  *   blimit1/limit1/thresh1   thresholds for the call starting at s + 8 * p
  *
  * NOTE(review): p is presumably the row pitch, which would make the second
  * call cover the eight rows below the first -- confirm against the caller.
  */
 void aom_lpf_vertical_8_dual_neon(uint8_t *s, int p, const uint8_t *blimit0,
                                   const uint8_t *limit0, const uint8_t *thresh0,
                                   const uint8_t *blimit1, const uint8_t *limit1,
                                   const uint8_t *thresh1) {
   uint8_t *const second_edge = s + 8 * p;

   aom_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0);
   aom_lpf_vertical_8_neon(second_edge, p, blimit1, limit1, thresh1);
 }
 /*
  * Dual variant of aom_lpf_vertical_16_neon: the single-edge routine is run
  * at s and again at s + 8 * p. Unlike the other dual wrappers in this file,
  * one (blimit, limit, thresh) triple is shared by both calls.
  *
  * NOTE(review): p is presumably the row pitch, which would make the second
  * call cover the eight rows below the first -- confirm against the caller.
  */
 void aom_lpf_vertical_16_dual_neon(uint8_t *s, int p, const uint8_t *blimit,
                                    const uint8_t *limit,
                                    const uint8_t *thresh) {
   uint8_t *const second_edge = s + 8 * p;

   aom_lpf_vertical_16_neon(s, p, blimit, limit, thresh);
   aom_lpf_vertical_16_neon(second_edge, p, blimit, limit, thresh);
 }
 #endif  // HAVE_NEON_ASM
--- a/aom_dsp/arm/sad4d_neon.c
+++ b/aom_dsp/arm/sad4d_neon.c
@@ -1,25 +1,26 @@
 /*
- *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
- *  Use of this source code is governed by a BSD-style license
+ * This source code is subject to the terms of the BSD 2 Clause License and
- *  that can be found in the LICENSE file in the root of the source
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- *  tree. An additional intellectual property rights grant can be found
+ * was not distributed with this source code in the LICENSE file, you can
- *  in the file PATENTS.  All contributing project authors may
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- *  be found in the AUTHORS file in the root of the source tree.
+ * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
 #include <arm_neon.h>
-#include "./vpx_config.h"
+#include "./aom_config.h"
-#include "./vpx_dsp_rtcd.h"
+#include "./aom_dsp_rtcd.h"
-#include "vpx/vpx_integer.h"
+#include "aom/aom_integer.h"
 static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo,
                                                    const uint16x8_t vec_hi) {
-  const uint32x4_t vec_l_lo = vaddl_u16(vget_low_u16(vec_lo),
+  const uint32x4_t vec_l_lo =
-                                        vget_high_u16(vec_lo));
+      vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo));
-  const uint32x4_t vec_l_hi = vaddl_u16(vget_low_u16(vec_hi),
+  const uint32x4_t vec_l_hi =
-                                        vget_high_u16(vec_hi));
+      vaddl_u16(vget_low_u16(vec_hi), vget_high_u16(vec_hi));
  const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi);
  const uint64x2_t b = vpaddlq_u32(a);
  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
@@ -33,8 +34,7 @@ static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo,
 static void sad_neon_64(const uint8x16_t vec_src_00,
                        const uint8x16_t vec_src_16,
                        const uint8x16_t vec_src_32,
-                        const uint8x16_t vec_src_48,
+                        const uint8x16_t vec_src_48, const uint8_t *ref,
                        const uint8_t *ref,
                        uint16x8_t *vec_sum_ref_lo,
                        uint16x8_t *vec_sum_ref_hi) {
  const uint8x16_t vec_ref_00 = vld1q_u8(ref);
@@ -63,8 +63,7 @@ static void sad_neon_64(const uint8x16_t vec_src_00,
 // Calculate the absolute difference of 32 bytes from vec_src_00, vec_src_16,
 // and ref. Accumulate partial sums in vec_sum_ref_lo and vec_sum_ref_hi.
 static void sad_neon_32(const uint8x16_t vec_src_00,
-                        const uint8x16_t vec_src_16,
+                        const uint8x16_t vec_src_16, const uint8_t *ref,
                        const uint8_t *ref,
                        uint16x8_t *vec_sum_ref_lo,
                        uint16x8_t *vec_sum_ref_hi) {
  const uint8x16_t vec_ref_00 = vld1q_u8(ref);
@@ -80,8 +79,8 @@ static void sad_neon_32(const uint8x16_t vec_src_00,
                             vget_high_u8(vec_ref_16));
 }
-void vpx_sad64x64x4d_neon(const uint8_t *src, int src_stride,
+void aom_sad64x64x4d_neon(const uint8_t *src, int src_stride,
-                          const uint8_t* const ref[4], int ref_stride,
+                          const uint8_t *const ref[4], int ref_stride,
                          uint32_t *res) {
  int i;
  uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0);
@@ -126,8 +125,8 @@ void vpx_sad64x64x4d_neon(const uint8_t *src, int src_stride,
  res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi);
 }
-void vpx_sad32x32x4d_neon(const uint8_t *src, int src_stride,
+void aom_sad32x32x4d_neon(const uint8_t *src, int src_stride,
-                          const uint8_t* const ref[4], int ref_stride,
+                          const uint8_t *const ref[4], int ref_stride,
                          uint32_t *res) {
  int i;
  uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0);
@@ -148,14 +147,14 @@ void vpx_sad32x32x4d_neon(const uint8_t *src, int src_stride,
    const uint8x16_t vec_src_00 = vld1q_u8(src);
    const uint8x16_t vec_src_16 = vld1q_u8(src + 16);
-    sad_neon_32(vec_src_00, vec_src_16, ref0,
+    sad_neon_32(vec_src_00, vec_src_16, ref0, &vec_sum_ref0_lo,
-                &vec_sum_ref0_lo, &vec_sum_ref0_hi);
+                &vec_sum_ref0_hi);
-    sad_neon_32(vec_src_00, vec_src_16, ref1,
+    sad_neon_32(vec_src_00, vec_src_16, ref1, &vec_sum_ref1_lo,
-                &vec_sum_ref1_lo, &vec_sum_ref1_hi);
+                &vec_sum_ref1_hi);
-    sad_neon_32(vec_src_00, vec_src_16, ref2,
+    sad_neon_32(vec_src_00, vec_src_16, ref2, &vec_sum_ref2_lo,
-                &vec_sum_ref2_lo, &vec_sum_ref2_hi);
+                &vec_sum_ref2_hi);
-    sad_neon_32(vec_src_00, vec_src_16, ref3,
+    sad_neon_32(vec_src_00, vec_src_16, ref3, &vec_sum_ref3_lo,
-                &vec_sum_ref3_lo, &vec_sum_ref3_hi);
+                &vec_sum_ref3_hi);
    src += src_stride;
    ref0 += ref_stride;
@@ -170,8 +169,8 @@ void vpx_sad32x32x4d_neon(const uint8_t *src, int src_stride,
  res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi);
 }
-void vpx_sad16x16x4d_neon(const uint8_t *src, int src_stride,
+void aom_sad16x16x4d_neon(const uint8_t *src, int src_stride,
-                          const uint8_t* const ref[4], int ref_stride,
+                          const uint8_t *const ref[4], int ref_stride,
                          uint32_t *res) {
  int i;
  uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0);
@@ -195,20 +194,20 @@ void vpx_sad16x16x4d_neon(const uint8_t *src, int src_stride,
    const uint8x16_t vec_ref2 = vld1q_u8(ref2);
    const uint8x16_t vec_ref3 = vld1q_u8(ref3);
-    vec_sum_ref0_lo = vabal_u8(vec_sum_ref0_lo, vget_low_u8(vec_src),
+    vec_sum_ref0_lo =
-                               vget_low_u8(vec_ref0));
+        vabal_u8(vec_sum_ref0_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref0));
    vec_sum_ref0_hi = vabal_u8(vec_sum_ref0_hi, vget_high_u8(vec_src),
                               vget_high_u8(vec_ref0));
-    vec_sum_ref1_lo = vabal_u8(vec_sum_ref1_lo, vget_low_u8(vec_src),
+    vec_sum_ref1_lo =
-                               vget_low_u8(vec_ref1));
+        vabal_u8(vec_sum_ref1_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref1));
    vec_sum_ref1_hi = vabal_u8(vec_sum_ref1_hi, vget_high_u8(vec_src),
                               vget_high_u8(vec_ref1));
-    vec_sum_ref2_lo = vabal_u8(vec_sum_ref2_lo, vget_low_u8(vec_src),
+    vec_sum_ref2_lo =
-                               vget_low_u8(vec_ref2));
+        vabal_u8(vec_sum_ref2_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref2));
    vec_sum_ref2_hi = vabal_u8(vec_sum_ref2_hi, vget_high_u8(vec_src),
                               vget_high_u8(vec_ref2));
-    vec_sum_ref3_lo = vabal_u8(vec_sum_ref3_lo, vget_low_u8(vec_src),
+    vec_sum_ref3_lo =
-                               vget_low_u8(vec_ref3));
+        vabal_u8(vec_sum_ref3_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref3));
    vec_sum_ref3_hi = vabal_u8(vec_sum_ref3_hi, vget_high_u8(vec_src),
                               vget_high_u8(vec_ref3));
--- a/aom_dsp/arm/sad_media.asm
+++ b/aom_dsp/arm/sad_media.asm
@@ -1,15 +1,18 @@
 ;
-;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
 ;
-;  Use of this source code is governed by a BSD-style license
+; This source code is subject to the terms of the BSD 2 Clause License and
-;  that can be found in the LICENSE file in the root of the source
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-;  tree. An additional intellectual property rights grant can be found
+; was not distributed with this source code in the LICENSE file, you can
-;  in the file PATENTS.  All contributing project authors may
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-;  be found in the AUTHORS file in the root of the source tree.
+; Media Patent License 1.0 was not distributed with this source code in the
 ; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 ;
 ;
-    EXPORT  |vpx_sad16x16_media|
+    EXPORT  |aom_sad16x16_media|
    ARM
    REQUIRE8
@@ -21,7 +24,7 @@
 ; r1    int  src_stride
 ; r2    const unsigned char *ref_ptr
 ; r3    int  ref_stride
-|vpx_sad16x16_media| PROC
+|aom_sad16x16_media| PROC
    stmfd   sp!, {r4-r12, lr}
    pld     [r0, r1, lsl #0]
--- a/aom_dsp/arm/sad_neon.c
+++ b/aom_dsp/arm/sad_neon.c
@@ -1,127 +1,119 @@
 /*
- *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
- *  Use of this source code is governed by a BSD-style license
+ * This source code is subject to the terms of the BSD 2 Clause License and
- *  that can be found in the LICENSE file in the root of the source
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- *  tree. An additional intellectual property rights grant can be found
+ * was not distributed with this source code in the LICENSE file, you can
- *  in the file PATENTS.  All contributing project authors may
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- *  be found in the AUTHORS file in the root of the source tree.
+ * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
 #include <arm_neon.h>
-#include "./vpx_config.h"
+#include "./aom_config.h"
-#include "vpx/vpx_integer.h"
+#include "aom/aom_integer.h"
-unsigned int vpx_sad8x16_neon(
+unsigned int aom_sad8x16_neon(unsigned char *src_ptr, int src_stride,
-        unsigned char *src_ptr,
+                              unsigned char *ref_ptr, int ref_stride) {
-        int src_stride,
+  uint8x8_t d0, d8;
-        unsigned char *ref_ptr,
+  uint16x8_t q12;
-        int ref_stride) {
+  uint32x4_t q1;
-    uint8x8_t d0, d8;
+  uint64x2_t q3;
-    uint16x8_t q12;
+  uint32x2_t d5;
-    uint32x4_t q1;
+  int i;
    uint64x2_t q3;
    uint32x2_t d5;
    int i;
  d0 = vld1_u8(src_ptr);
  src_ptr += src_stride;
  d8 = vld1_u8(ref_ptr);
  ref_ptr += ref_stride;
  q12 = vabdl_u8(d0, d8);
  for (i = 0; i < 15; i++) {
    d0 = vld1_u8(src_ptr);
    src_ptr += src_stride;
    d8 = vld1_u8(ref_ptr);
    ref_ptr += ref_stride;
-    q12 = vabdl_u8(d0, d8);
+    q12 = vabal_u8(q12, d0, d8);
  }
-    for (i = 0; i < 15; i++) {
+  q1 = vpaddlq_u16(q12);
-        d0 = vld1_u8(src_ptr);
+  q3 = vpaddlq_u32(q1);
-        src_ptr += src_stride;
+  d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
-        d8 = vld1_u8(ref_ptr);
+                vreinterpret_u32_u64(vget_high_u64(q3)));
        ref_ptr += ref_stride;
        q12 = vabal_u8(q12, d0, d8);
    }
-    q1 = vpaddlq_u16(q12);
+  return vget_lane_u32(d5, 0);
    q3 = vpaddlq_u32(q1);
    d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
                  vreinterpret_u32_u64(vget_high_u64(q3)));
    return vget_lane_u32(d5, 0);
 }
-unsigned int vpx_sad4x4_neon(
+unsigned int aom_sad4x4_neon(unsigned char *src_ptr, int src_stride,
-        unsigned char *src_ptr,
+                             unsigned char *ref_ptr, int ref_stride) {
-        int src_stride,
+  uint8x8_t d0, d8;
-        unsigned char *ref_ptr,
+  uint16x8_t q12;
-        int ref_stride) {
+  uint32x2_t d1;
-    uint8x8_t d0, d8;
+  uint64x1_t d3;
-    uint16x8_t q12;
+  int i;
    uint32x2_t d1;
    uint64x1_t d3;
    int i;
  d0 = vld1_u8(src_ptr);
  src_ptr += src_stride;
  d8 = vld1_u8(ref_ptr);
  ref_ptr += ref_stride;
  q12 = vabdl_u8(d0, d8);
  for (i = 0; i < 3; i++) {
    d0 = vld1_u8(src_ptr);
    src_ptr += src_stride;
    d8 = vld1_u8(ref_ptr);
    ref_ptr += ref_stride;
-    q12 = vabdl_u8(d0, d8);
+    q12 = vabal_u8(q12, d0, d8);
  }
-    for (i = 0; i < 3; i++) {
+  d1 = vpaddl_u16(vget_low_u16(q12));
-        d0 = vld1_u8(src_ptr);
+  d3 = vpaddl_u32(d1);
        src_ptr += src_stride;
        d8 = vld1_u8(ref_ptr);
        ref_ptr += ref_stride;
        q12 = vabal_u8(q12, d0, d8);
    }
-    d1 = vpaddl_u16(vget_low_u16(q12));
+  return vget_lane_u32(vreinterpret_u32_u64(d3), 0);
    d3 = vpaddl_u32(d1);
    return vget_lane_u32(vreinterpret_u32_u64(d3), 0);
 }
-unsigned int vpx_sad16x8_neon(
+unsigned int aom_sad16x8_neon(unsigned char *src_ptr, int src_stride,
-        unsigned char *src_ptr,
+                              unsigned char *ref_ptr, int ref_stride) {
-        int src_stride,
+  uint8x16_t q0, q4;
-        unsigned char *ref_ptr,
+  uint16x8_t q12, q13;
-        int ref_stride) {
+  uint32x4_t q1;
-    uint8x16_t q0, q4;
+  uint64x2_t q3;
-    uint16x8_t q12, q13;
+  uint32x2_t d5;
-    uint32x4_t q1;
+  int i;
    uint64x2_t q3;
    uint32x2_t d5;
    int i;
  q0 = vld1q_u8(src_ptr);
  src_ptr += src_stride;
  q4 = vld1q_u8(ref_ptr);
  ref_ptr += ref_stride;
  q12 = vabdl_u8(vget_low_u8(q0), vget_low_u8(q4));
  q13 = vabdl_u8(vget_high_u8(q0), vget_high_u8(q4));
  for (i = 0; i < 7; i++) {
    q0 = vld1q_u8(src_ptr);
    src_ptr += src_stride;
    q4 = vld1q_u8(ref_ptr);
    ref_ptr += ref_stride;
-    q12 = vabdl_u8(vget_low_u8(q0), vget_low_u8(q4));
+    q12 = vabal_u8(q12, vget_low_u8(q0), vget_low_u8(q4));
-    q13 = vabdl_u8(vget_high_u8(q0), vget_high_u8(q4));
+    q13 = vabal_u8(q13, vget_high_u8(q0), vget_high_u8(q4));
  }
-    for (i = 0; i < 7; i++) {
+  q12 = vaddq_u16(q12, q13);
-        q0 = vld1q_u8(src_ptr);
+  q1 = vpaddlq_u16(q12);
-        src_ptr += src_stride;
+  q3 = vpaddlq_u32(q1);
-        q4 = vld1q_u8(ref_ptr);
+  d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
-        ref_ptr += ref_stride;
+                vreinterpret_u32_u64(vget_high_u64(q3)));
        q12 = vabal_u8(q12, vget_low_u8(q0), vget_low_u8(q4));
        q13 = vabal_u8(q13, vget_high_u8(q0), vget_high_u8(q4));
    }
-    q12 = vaddq_u16(q12, q13);
+  return vget_lane_u32(d5, 0);
    q1 = vpaddlq_u16(q12);
    q3 = vpaddlq_u32(q1);
    d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
                  vreinterpret_u32_u64(vget_high_u64(q3)));
    return vget_lane_u32(d5, 0);
 }
 static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo,
                                                    const uint16x8_t vec_hi) {
-  const uint32x4_t vec_l_lo = vaddl_u16(vget_low_u16(vec_lo),
+  const uint32x4_t vec_l_lo =
-                                        vget_high_u16(vec_lo));
+      vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo));
-  const uint32x4_t vec_l_hi = vaddl_u16(vget_low_u16(vec_hi),
+  const uint32x4_t vec_l_hi =
-                                        vget_high_u16(vec_hi));
+      vaddl_u16(vget_low_u16(vec_hi), vget_high_u16(vec_hi));
  const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi);
  const uint64x2_t b = vpaddlq_u32(a);
  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
@@ -136,7 +128,7 @@ static INLINE unsigned int horizontal_add_16x8(const uint16x8_t vec_16x8) {
  return vget_lane_u32(c, 0);
 }
-unsigned int vpx_sad64x64_neon(const uint8_t *src, int src_stride,
+unsigned int aom_sad64x64_neon(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride) {
  int i;
  uint16x8_t vec_accum_lo = vdupq_n_u16(0);
@@ -172,7 +164,7 @@ unsigned int vpx_sad64x64_neon(const uint8_t *src, int src_stride,
  return horizontal_long_add_16x8(vec_accum_lo, vec_accum_hi);
 }
-unsigned int vpx_sad32x32_neon(const uint8_t *src, int src_stride,
+unsigned int aom_sad32x32_neon(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride) {
  int i;
  uint16x8_t vec_accum_lo = vdupq_n_u16(0);
@@ -197,7 +189,7 @@ unsigned int vpx_sad32x32_neon(const uint8_t *src, int src_stride,
  return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi));
 }
-unsigned int vpx_sad16x16_neon(const uint8_t *src, int src_stride,
+unsigned int aom_sad16x16_neon(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride) {
  int i;
  uint16x8_t vec_accum_lo = vdupq_n_u16(0);
@@ -208,15 +200,15 @@ unsigned int vpx_sad16x16_neon(const uint8_t *src, int src_stride,
    const uint8x16_t vec_ref = vld1q_u8(ref);
    src += src_stride;
    ref += ref_stride;
-    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src),
+    vec_accum_lo =
-                            vget_low_u8(vec_ref));
+        vabal_u8(vec_accum_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref));
-    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src),
+    vec_accum_hi =
-                            vget_high_u8(vec_ref));
+        vabal_u8(vec_accum_hi, vget_high_u8(vec_src), vget_high_u8(vec_ref));
  }
  return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi));
 }
-unsigned int vpx_sad8x8_neon(const uint8_t *src, int src_stride,
+unsigned int aom_sad8x8_neon(const uint8_t *src, int src_stride,
                             const uint8_t *ref, int ref_stride) {
  int i;
  uint16x8_t vec_accum = vdupq_n_u16(0);
--- a/aom_dsp/arm/save_reg_neon.asm
+++ b/aom_dsp/arm/save_reg_neon.asm
@@ -0,0 +1,39 @@
 ;
 ; Copyright (c) 2016, Alliance for Open Media. All rights reserved
 ;
 ; This source code is subject to the terms of the BSD 2 Clause License and
 ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 ; was not distributed with this source code in the LICENSE file, you can
 ; obtain it at www.aomedia.org/license/software. If the Alliance for Open
 ; Media Patent License 1.0 was not distributed with this source code in the
 ; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 ;
 ;
    EXPORT  |aom_push_neon|
    EXPORT  |aom_pop_neon|
    ARM
    REQUIRE8
    PRESERVE8
    AREA ||.text||, CODE, READONLY, ALIGN=2
 ; aom_push_neon
 ; Stores NEON registers d8-d15 to the memory addressed by r0.
 ; Each vst1 writes four D registers and uses post-increment writeback
 ; ("!"), so the second store continues directly after the first and r0 is
 ; left pointing past the data written.
 ; r0    destination buffer (must have room for all eight D registers)
 |aom_push_neon| PROC
    vst1.i64            {d8, d9, d10, d11}, [r0]!
    vst1.i64            {d12, d13, d14, d15}, [r0]!
    bx              lr
    ENDP
 ; aom_pop_neon
 ; Loads NEON registers d8-d15 from the memory addressed by r0, in the same
 ; register order and layout that aom_push_neon stores them.
 ; Each vld1 reads four D registers and uses post-increment writeback
 ; ("!"), so r0 is left pointing past the data read.
 ; r0    source buffer
 |aom_pop_neon| PROC
    vld1.i64            {d8, d9, d10, d11}, [r0]!
    vld1.i64            {d12, d13, d14, d15}, [r0]!
    bx              lr
    ENDP
    END
--- a/aom_dsp/arm/subpel_variance_media.c
+++ b/aom_dsp/arm/subpel_variance_media.c
@@ -0,0 +1,81 @@
 /*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
 #include "./aom_config.h"
 #include "./aom_dsp_rtcd.h"
 #include "aom/aom_integer.h"
 #include "aom_ports/mem.h"
 #if HAVE_MEDIA
 /*
  * Two-tap bilinear filter coefficients, one row per sub-pixel offset 0..7.
  * The taps in every row sum to 128; row 0 ({ 128, 0 }) puts all the weight
  * on the first tap. The sub-pixel variance functions in this file select a
  * row with xoffset (horizontal pass) and yoffset (vertical pass).
  */
 static const int16_t bilinear_filters_media[8][2] = { { 128, 0 }, { 112, 16 },
                                                       { 96, 32 }, { 80, 48 },
                                                       { 64, 64 }, { 48, 80 },
                                                       { 32, 96 }, { 16, 112 } };
 /*
  * Bilinear filter passes, declared here and defined outside this file.
  * The first pass reads 8-bit pixels and writes 16-bit intermediates; the
  * second pass reads those intermediates and writes 8-bit pixels. Callers in
  * this file request one more row from the first pass than from the second
  * (9 vs 8, 17 vs 16) and pass a two-entry row of bilinear_filters_media as
  * filter.
  */
 extern void aom_filter_block2d_bil_first_pass_media(
    const uint8_t *src_ptr, uint16_t *dst_ptr, uint32_t src_pitch,
    uint32_t height, uint32_t width, const int16_t *filter);
 extern void aom_filter_block2d_bil_second_pass_media(
    const uint16_t *src_ptr, uint8_t *dst_ptr, int32_t src_pitch,
    uint32_t height, uint32_t width, const int16_t *filter);
 /*
  * Sub-pixel variance for an 8x8 block.
  *
  * The source is run through the two bilinear passes (horizontal taps chosen
  * by xoffset, vertical taps by yoffset) into a local 8x8 buffer, and that
  * buffer is compared with dst_ptr by aom_variance8x8_media.
  *
  *   src_ptr, src_pixels_per_line   source pixels and their stride
  *   xoffset, yoffset               row indices into bilinear_filters_media;
  *                                  used as-is, with no range check here
  *   dst_ptr, dst_pixels_per_line   reference pixels and their stride
  *   sse                            forwarded to aom_variance8x8_media
  *
  * Returns whatever aom_variance8x8_media returns.
  */
 unsigned int aom_sub_pixel_variance8x8_media(
     const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset,
     const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse) {
   const int16_t *const horiz_taps = bilinear_filters_media[xoffset];
   const int16_t *const vert_taps = bilinear_filters_media[yoffset];
   uint16_t first_pass[10 * 8];
   uint8_t second_pass[8 * 8];

   /* 9 rows are produced so the vertical pass has a row below each output. */
   aom_filter_block2d_bil_first_pass_media(src_ptr, first_pass,
                                           src_pixels_per_line, 9, 8,
                                           horiz_taps);
   aom_filter_block2d_bil_second_pass_media(first_pass, second_pass, 8, 8, 8,
                                            vert_taps);

   return aom_variance8x8_media(second_pass, 8, dst_ptr, dst_pixels_per_line,
                                sse);
 }
 /*
  * Sub-pixel variance for a 16x16 block.
  *
  * The three half-pixel cases (an offset of 4 on one or both axes, 0 on the
  * other) are handed to the dedicated aom_variance_halfpixvar16x16_*_media
  * routines. Every other offset pair goes through the two bilinear passes
  * into a local buffer, which aom_variance16x16_media then compares with
  * dst_ptr.
  *
  *   src_ptr, src_pixels_per_line   source pixels and their stride
  *   xoffset, yoffset               sub-pixel offsets; in the general path
  *                                  they index bilinear_filters_media as-is,
  *                                  with no range check here
  *   dst_ptr, dst_pixels_per_line   reference pixels and their stride
  *   sse                            forwarded to the variance routine used
  *
  * Returns the value produced by whichever variance routine was called.
  */
 unsigned int aom_sub_pixel_variance16x16_media(
     const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset,
     const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse) {
   uint16_t first_pass[36 * 16];
   uint8_t second_pass[20 * 16];
   const int16_t *horiz_taps;
   const int16_t *vert_taps;

   /* Half-pixel shortcuts; the three conditions are mutually exclusive. */
   if (xoffset == 4 && yoffset == 0) {
     return aom_variance_halfpixvar16x16_h_media(
         src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse);
   }
   if (xoffset == 0 && yoffset == 4) {
     return aom_variance_halfpixvar16x16_v_media(
         src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse);
   }
   if (xoffset == 4 && yoffset == 4) {
     return aom_variance_halfpixvar16x16_hv_media(
         src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse);
   }

   /* General case: 17 filtered rows feed the 16-row vertical pass. */
   horiz_taps = bilinear_filters_media[xoffset];
   vert_taps = bilinear_filters_media[yoffset];
   aom_filter_block2d_bil_first_pass_media(src_ptr, first_pass,
                                           src_pixels_per_line, 17, 16,
                                           horiz_taps);
   aom_filter_block2d_bil_second_pass_media(first_pass, second_pass, 16, 16,
                                            16, vert_taps);

   return aom_variance16x16_media(second_pass, 16, dst_ptr,
                                  dst_pixels_per_line, sse);
 }
 #endif  // HAVE_MEDIA
--- a/aom_dsp/arm/subpel_variance_neon.c
+++ b/aom_dsp/arm/subpel_variance_neon.c
@@ -1,31 +1,26 @@
 /*
- *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
- *  Use of this source code is governed by a BSD-style license
+ * This source code is subject to the terms of the BSD 2 Clause License and
- *  that can be found in the LICENSE file in the root of the source
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- *  tree. An additional intellectual property rights grant can be found
+ * was not distributed with this source code in the LICENSE file, you can
- *  in the file PATENTS.  All contributing project authors may
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- *  be found in the AUTHORS file in the root of the source tree.
+ * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
 #include <arm_neon.h>
-#include "./vpx_dsp_rtcd.h"
+#include "./aom_dsp_rtcd.h"
-#include "./vpx_config.h"
+#include "./aom_config.h"
-#include "vpx_ports/mem.h"
+#include "aom_ports/mem.h"
-#include "vpx/vpx_integer.h"
+#include "aom/aom_integer.h"
-#include "vpx_dsp/variance.h"
+#include "aom_dsp/variance.h"
 static const uint8_t bilinear_filters[8][2] = {
-  { 128,   0, },
+  { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
-  { 112,  16, },
+  { 64, 64 }, { 48, 80 },  { 32, 96 }, { 16, 112 },
  {  96,  32, },
  {  80,  48, },
  {  64,  64, },
  {  48,  80, },
  {  32,  96, },
  {  16, 112, },
 };
 static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,
@@ -79,74 +74,61 @@ static void var_filter_block2d_bil_w16(const uint8_t *src_ptr,
  }
 }
-unsigned int vpx_sub_pixel_variance8x8_neon(const uint8_t *src,
+unsigned int aom_sub_pixel_variance8x8_neon(const uint8_t *src, int src_stride,
-                                            int src_stride,
+                                            int xoffset, int yoffset,
-                                            int xoffset,
+                                            const uint8_t *dst, int dst_stride,
                                            int yoffset,
                                            const uint8_t *dst,
                                            int dst_stride,
                                            unsigned int *sse) {
  DECLARE_ALIGNED(16, uint8_t, temp2[8 * 8]);
  DECLARE_ALIGNED(16, uint8_t, fdata3[9 * 8]);
-  var_filter_block2d_bil_w8(src, fdata3, src_stride, 1,
+  var_filter_block2d_bil_w8(src, fdata3, src_stride, 1, 9, 8,
                            9, 8,
                            bilinear_filters[xoffset]);
-  var_filter_block2d_bil_w8(fdata3, temp2, 8, 8, 8,
+  var_filter_block2d_bil_w8(fdata3, temp2, 8, 8, 8, 8,
-                            8, bilinear_filters[yoffset]);
+                            bilinear_filters[yoffset]);
-  return vpx_variance8x8_neon(temp2, 8, dst, dst_stride, sse);
+  return aom_variance8x8_neon(temp2, 8, dst, dst_stride, sse);
 }
-unsigned int vpx_sub_pixel_variance16x16_neon(const uint8_t *src,
+unsigned int aom_sub_pixel_variance16x16_neon(const uint8_t *src,
-                                              int src_stride,
+                                              int src_stride, int xoffset,
-                                              int xoffset,
+                                              int yoffset, const uint8_t *dst,
                                              int yoffset,
                                              const uint8_t *dst,
                                              int dst_stride,
                                              unsigned int *sse) {
  DECLARE_ALIGNED(16, uint8_t, temp2[16 * 16]);
  DECLARE_ALIGNED(16, uint8_t, fdata3[17 * 16]);
-  var_filter_block2d_bil_w16(src, fdata3, src_stride, 1,
+  var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 17, 16,
                             17, 16,
                             bilinear_filters[xoffset]);
-  var_filter_block2d_bil_w16(fdata3, temp2, 16, 16, 16,
+  var_filter_block2d_bil_w16(fdata3, temp2, 16, 16, 16, 16,
-                             16, bilinear_filters[yoffset]);
+                             bilinear_filters[yoffset]);
-  return vpx_variance16x16_neon(temp2, 16, dst, dst_stride, sse);
+  return aom_variance16x16_neon(temp2, 16, dst, dst_stride, sse);
 }
-unsigned int vpx_sub_pixel_variance32x32_neon(const uint8_t *src,
+unsigned int aom_sub_pixel_variance32x32_neon(const uint8_t *src,
-                                              int src_stride,
+                                              int src_stride, int xoffset,
-                                              int xoffset,
+                                              int yoffset, const uint8_t *dst,
                                              int yoffset,
                                              const uint8_t *dst,
                                              int dst_stride,
                                              unsigned int *sse) {
  DECLARE_ALIGNED(16, uint8_t, temp2[32 * 32]);
  DECLARE_ALIGNED(16, uint8_t, fdata3[33 * 32]);
-  var_filter_block2d_bil_w16(src, fdata3, src_stride, 1,
+  var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 33, 32,
                             33, 32,
                             bilinear_filters[xoffset]);
-  var_filter_block2d_bil_w16(fdata3, temp2, 32, 32, 32,
+  var_filter_block2d_bil_w16(fdata3, temp2, 32, 32, 32, 32,
-                             32, bilinear_filters[yoffset]);
+                             bilinear_filters[yoffset]);
-  return vpx_variance32x32_neon(temp2, 32, dst, dst_stride, sse);
+  return aom_variance32x32_neon(temp2, 32, dst, dst_stride, sse);
 }
-unsigned int vpx_sub_pixel_variance64x64_neon(const uint8_t *src,
+unsigned int aom_sub_pixel_variance64x64_neon(const uint8_t *src,
-                                              int src_stride,
+                                              int src_stride, int xoffset,
-                                              int xoffset,
+                                              int yoffset, const uint8_t *dst,
                                              int yoffset,
                                              const uint8_t *dst,
                                              int dst_stride,
                                              unsigned int *sse) {
  DECLARE_ALIGNED(16, uint8_t, temp2[64 * 64]);
  DECLARE_ALIGNED(16, uint8_t, fdata3[65 * 64]);
-  var_filter_block2d_bil_w16(src, fdata3, src_stride, 1,
+  var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 65, 64,
                             65, 64,
                             bilinear_filters[xoffset]);
-  var_filter_block2d_bil_w16(fdata3, temp2, 64, 64, 64,
+  var_filter_block2d_bil_w16(fdata3, temp2, 64, 64, 64, 64,
-                             64, bilinear_filters[yoffset]);
+                             bilinear_filters[yoffset]);
-  return vpx_variance64x64_neon(temp2, 64, dst, dst_stride, sse);
+  return aom_variance64x64_neon(temp2, 64, dst, dst_stride, sse);
 }
--- a/aom_dsp/arm/subtract_neon.c
+++ b/aom_dsp/arm/subtract_neon.c
@@ -0,0 +1,80 @@
 /*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
 #include <arm_neon.h>
 #include "./aom_config.h"
 #include "aom/aom_integer.h"
 // Writes diff = src - pred for a rows x cols block of 8-bit samples, storing
 // each difference as int16_t. Every buffer advances by its own stride after
 // each row.
 //
 // The column count selects how a row is processed:
 //   cols > 16 : 32 samples per step (cols is stepped in units of 32)
 //   cols > 8  : one 16-sample vector per row
 //   cols > 4  : one 8-sample vector per row
 //   otherwise : plain scalar loop
 void aom_subtract_block_neon(int rows, int cols, int16_t *diff,
                              ptrdiff_t diff_stride, const uint8_t *src,
                              ptrdiff_t src_stride, const uint8_t *pred,
                              ptrdiff_t pred_stride) {
   int row, col;
   for (row = 0; row < rows; ++row) {
     if (cols > 16) {
       for (col = 0; col < cols; col += 32) {
         const uint8x16_t s0 = vld1q_u8(src + col);
         const uint8x16_t s1 = vld1q_u8(src + col + 16);
         const uint8x16_t p0 = vld1q_u8(pred + col);
         const uint8x16_t p1 = vld1q_u8(pred + col + 16);
         // Widening subtract: the u16 result reinterpreted as s16 is the
         // signed difference.
         const uint16x8_t d0 = vsubl_u8(vget_low_u8(s0), vget_low_u8(p0));
         const uint16x8_t d1 = vsubl_u8(vget_high_u8(s0), vget_high_u8(p0));
         const uint16x8_t d2 = vsubl_u8(vget_low_u8(s1), vget_low_u8(p1));
         const uint16x8_t d3 = vsubl_u8(vget_high_u8(s1), vget_high_u8(p1));
         int16_t *const out = diff + col;
         vst1q_s16(out, vreinterpretq_s16_u16(d0));
         vst1q_s16(out + 8, vreinterpretq_s16_u16(d1));
         vst1q_s16(out + 16, vreinterpretq_s16_u16(d2));
         vst1q_s16(out + 24, vreinterpretq_s16_u16(d3));
       }
     } else if (cols > 8) {
       const uint8x16_t s = vld1q_u8(src);
       const uint8x16_t p = vld1q_u8(pred);
       const uint16x8_t d_lo = vsubl_u8(vget_low_u8(s), vget_low_u8(p));
       const uint16x8_t d_hi = vsubl_u8(vget_high_u8(s), vget_high_u8(p));
       vst1q_s16(diff, vreinterpretq_s16_u16(d_lo));
       vst1q_s16(diff + 8, vreinterpretq_s16_u16(d_hi));
     } else if (cols > 4) {
       const uint8x8_t s = vld1_u8(src);
       const uint8x8_t p = vld1_u8(pred);
       vst1q_s16(diff, vreinterpretq_s16_u16(vsubl_u8(s, p)));
     } else {
       for (col = 0; col < cols; ++col) diff[col] = src[col] - pred[col];
     }
     diff += diff_stride;
     pred += pred_stride;
     src += src_stride;
   }
 }
--- a/aom_dsp/arm/variance_halfpixvar16x16_h_media.asm
+++ b/aom_dsp/arm/variance_halfpixvar16x16_h_media.asm
@@ -1,15 +1,18 @@
 ;
-;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
 ;
-;  Use of this source code is governed by a BSD-style license
+; This source code is subject to the terms of the BSD 2 Clause License and
-;  that can be found in the LICENSE file in the root of the source
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-;  tree. An additional intellectual property rights grant can be found
+; was not distributed with this source code in the LICENSE file, you can
-;  in the file PATENTS.  All contributing project authors may
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-;  be found in the AUTHORS file in the root of the source tree.
+; Media Patent License 1.0 was not distributed with this source code in the
 ; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 ;
 ;
-    EXPORT  |vpx_variance_halfpixvar16x16_h_media|
+    EXPORT  |aom_variance_halfpixvar16x16_h_media|
    ARM
    REQUIRE8
@@ -22,7 +25,7 @@
 ; r2    unsigned char *ref_ptr
 ; r3    int  recon_stride
 ; stack unsigned int *sse
-|vpx_variance_halfpixvar16x16_h_media| PROC
+|aom_variance_halfpixvar16x16_h_media| PROC
    stmfd   sp!, {r4-r12, lr}
--- a/aom_dsp/arm/variance_halfpixvar16x16_hv_media.asm
+++ b/aom_dsp/arm/variance_halfpixvar16x16_hv_media.asm
@@ -1,15 +1,18 @@
 ;
-;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
 ;
-;  Use of this source code is governed by a BSD-style license
+; This source code is subject to the terms of the BSD 2 Clause License and
-;  that can be found in the LICENSE file in the root of the source
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-;  tree. An additional intellectual property rights grant can be found
+; was not distributed with this source code in the LICENSE file, you can
-;  in the file PATENTS.  All contributing project authors may
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-;  be found in the AUTHORS file in the root of the source tree.
+; Media Patent License 1.0 was not distributed with this source code in the
 ; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 ;
 ;
-    EXPORT  |vpx_variance_halfpixvar16x16_hv_media|
+    EXPORT  |aom_variance_halfpixvar16x16_hv_media|
    ARM
    REQUIRE8
@@ -22,7 +25,7 @@
 ; r2    unsigned char *ref_ptr
 ; r3    int  recon_stride
 ; stack unsigned int *sse
-|vpx_variance_halfpixvar16x16_hv_media| PROC
+|aom_variance_halfpixvar16x16_hv_media| PROC
    stmfd   sp!, {r4-r12, lr}
--- a/aom_dsp/arm/variance_halfpixvar16x16_v_media.asm
+++ b/aom_dsp/arm/variance_halfpixvar16x16_v_media.asm
@@ -1,15 +1,18 @@
 ;
-;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
 ;
-;  Use of this source code is governed by a BSD-style license
+; This source code is subject to the terms of the BSD 2 Clause License and
-;  that can be found in the LICENSE file in the root of the source
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-;  tree. An additional intellectual property rights grant can be found
+; was not distributed with this source code in the LICENSE file, you can
-;  in the file PATENTS.  All contributing project authors may
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-;  be found in the AUTHORS file in the root of the source tree.
+; Media Patent License 1.0 was not distributed with this source code in the
 ; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 ;
 ;
-    EXPORT  |vpx_variance_halfpixvar16x16_v_media|
+    EXPORT  |aom_variance_halfpixvar16x16_v_media|
    ARM
    REQUIRE8
@@ -22,7 +25,7 @@
 ; r2    unsigned char *ref_ptr
 ; r3    int  recon_stride
 ; stack unsigned int *sse
-|vpx_variance_halfpixvar16x16_v_media| PROC
+|aom_variance_halfpixvar16x16_v_media| PROC
    stmfd   sp!, {r4-r12, lr}
--- a/aom_dsp/arm/variance_media.asm
+++ b/aom_dsp/arm/variance_media.asm
@@ -1,17 +1,20 @@
 ;
-;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
 ;
-;  Use of this source code is governed by a BSD-style license
+; This source code is subject to the terms of the BSD 2 Clause License and
-;  that can be found in the LICENSE file in the root of the source
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-;  tree. An additional intellectual property rights grant can be found
+; was not distributed with this source code in the LICENSE file, you can
-;  in the file PATENTS.  All contributing project authors may
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-;  be found in the AUTHORS file in the root of the source tree.
+; Media Patent License 1.0 was not distributed with this source code in the
 ; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 ;
 ;
-    EXPORT  |vpx_variance16x16_media|
+    EXPORT  |aom_variance16x16_media|
-    EXPORT  |vpx_variance8x8_media|
+    EXPORT  |aom_variance8x8_media|
-    EXPORT  |vpx_mse16x16_media|
+    EXPORT  |aom_mse16x16_media|
    ARM
    REQUIRE8
@@ -24,7 +27,7 @@
 ; r2    unsigned char *ref_ptr
 ; r3    int  recon_stride
 ; stack unsigned int *sse
-|vpx_variance16x16_media| PROC
+|aom_variance16x16_media| PROC
    stmfd   sp!, {r4-r12, lr}
@@ -157,7 +160,7 @@ loop16x16
 ; r2    unsigned char *ref_ptr
 ; r3    int  recon_stride
 ; stack unsigned int *sse
-|vpx_variance8x8_media| PROC
+|aom_variance8x8_media| PROC
    push    {r4-r10, lr}
@@ -241,10 +244,10 @@ loop8x8
 ; r3    int  recon_stride
 ; stack unsigned int *sse
 ;
-;note: Based on vpx_variance16x16_media. In this function, sum is never used.
+;note: Based on aom_variance16x16_media. In this function, sum is never used.
 ;      So, we can remove this part of calculation.
-|vpx_mse16x16_media| PROC
+|aom_mse16x16_media| PROC
    push    {r4-r9, lr}
--- a/aom_dsp/arm/variance_neon.c
+++ b/aom_dsp/arm/variance_neon.c
@@ -0,0 +1,400 @@
 /*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
 #include <arm_neon.h>
 #include "./aom_dsp_rtcd.h"
 #include "./aom_config.h"
 #include "aom/aom_integer.h"
 #include "aom_ports/mem.h"
 // Returns the sum of the eight signed 16-bit lanes of v_16x8.
 static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) {
   // Two pairwise widening adds: 8 x s16 -> 4 x s32 -> 2 x s64.
   const int64x2_t pair_sums = vpaddlq_s32(vpaddlq_s16(v_16x8));
   // Add the two 64-bit partial sums through their 32-bit views and take
   // lane 0 of the result.
   const int32x2_t lo = vreinterpret_s32_s64(vget_low_s64(pair_sums));
   const int32x2_t hi = vreinterpret_s32_s64(vget_high_s64(pair_sums));
   return vget_lane_s32(vadd_s32(lo, hi), 0);
 }
 // Returns the sum of the four signed 32-bit lanes of v_32x4.
 static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) {
   // Pairwise widening add: 4 x s32 -> 2 x s64.
   const int64x2_t pair_sums = vpaddlq_s32(v_32x4);
   // Add the two 64-bit partial sums through their 32-bit views and take
   // lane 0 of the result.
   const int32x2_t lo = vreinterpret_s32_s64(vget_low_s64(pair_sums));
   const int32x2_t hi = vreinterpret_s32_s64(vget_high_s64(pair_sums));
   return vget_lane_s32(vadd_s32(lo, hi), 0);
 }
 // Accumulates, over a w x h block of 8-bit samples, the sum of the
 // differences a - b (written to *sum) and the sum of the squared differences
 // (written to *sse). Each row is consumed 8 samples at a time, so w is
 // stepped in units of 8.
 //
 // Overflow limit: v_sum keeps eight int16_t partial sums, and each lane
 // receives w * h / 8 differences in the range [-255, 255]. A lane stays
 // within int16_t only while 255 * (w * h / 8) <= 32767, i.e. w * h / 8 <= 128.
 // Callers must therefore keep w * h <= 1024 (for example 32x32 or 64x16);
 // larger blocks have to be split and their partial results combined.
 static void variance_neon_w8(const uint8_t *a, int a_stride, const uint8_t *b,
                              int b_stride, int w, int h, uint32_t *sse,
                              int *sum) {
   int i, j;
   int16x8_t v_sum = vdupq_n_s16(0);
   int32x4_t v_sse_lo = vdupq_n_s32(0);
   int32x4_t v_sse_hi = vdupq_n_s32(0);
   for (i = 0; i < h; ++i) {
     for (j = 0; j < w; j += 8) {
       const uint8x8_t v_a = vld1_u8(&a[j]);
       const uint8x8_t v_b = vld1_u8(&b[j]);
       // Widening subtract; viewed as s16 the lanes hold the signed
       // differences.
       const int16x8_t sv_diff = vreinterpretq_s16_u16(vsubl_u8(v_a, v_b));
       const int16x4_t diff_lo = vget_low_s16(sv_diff);
       const int16x4_t diff_hi = vget_high_s16(sv_diff);
       v_sum = vaddq_s16(v_sum, sv_diff);
       // Squares are accumulated in 32-bit lanes, low and high halves
       // separately.
       v_sse_lo = vmlal_s16(v_sse_lo, diff_lo, diff_lo);
       v_sse_hi = vmlal_s16(v_sse_hi, diff_hi, diff_hi);
     }
     a += a_stride;
     b += b_stride;
   }
   *sum = horizontal_add_s16x8(v_sum);
   *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi));
 }
 // Computes, for the 8x8 blocks at a and b, the sum of squared differences
 // (stored in *sse) and the sum of differences (stored in *sum) by forwarding
 // to variance_neon_w8 with w = h = 8.
 void aom_get8x8var_neon(const uint8_t *a, int a_stride, const uint8_t *b,
                         int b_stride, unsigned int *sse, int *sum) {
   variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, sum);
 }
 // Computes, for the 16x16 blocks at a and b, the sum of squared differences
 // (stored in *sse) and the sum of differences (stored in *sum) by forwarding
 // to variance_neon_w8 with w = h = 16.
 void aom_get16x16var_neon(const uint8_t *a, int a_stride, const uint8_t *b,
                           int b_stride, unsigned int *sse, int *sum) {
   variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, sum);
 }
 // Variance of an 8x8 block: stores the sum of squared differences in *sse
 // and returns sse - sum^2 / 64, where sum is the sum of the differences.
 unsigned int aom_variance8x8_neon(const uint8_t *a, int a_stride,
                                   const uint8_t *b, int b_stride,
                                   unsigned int *sse) {
   int diff_sum;
   int64_t sum_squared;
   variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, &diff_sum);
   sum_squared = (int64_t)diff_sum * diff_sum;
   // Shifting by 6 divides by the 8 * 8 = 64 samples in the block.
   return *sse - (sum_squared >> 6);
 }
 // Variance of a 16x16 block: stores the sum of squared differences in *sse
 // and returns sse - sum^2 / 256, where sum is the sum of the differences.
 unsigned int aom_variance16x16_neon(const uint8_t *a, int a_stride,
                                     const uint8_t *b, int b_stride,
                                     unsigned int *sse) {
   int diff_sum;
   int64_t sum_squared;
   variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, &diff_sum);
   sum_squared = (int64_t)diff_sum * diff_sum;
   // Shifting by 8 divides by the 16 * 16 = 256 samples in the block.
   return *sse - (sum_squared >> 8);
 }
 // Variance of a 32x32 block: stores the sum of squared differences in *sse
 // and returns sse - sum^2 / 1024, where sum is the sum of the differences.
 unsigned int aom_variance32x32_neon(const uint8_t *a, int a_stride,
                                     const uint8_t *b, int b_stride,
                                     unsigned int *sse) {
   int diff_sum;
   int64_t sum_squared;
   variance_neon_w8(a, a_stride, b, b_stride, 32, 32, sse, &diff_sum);
   sum_squared = (int64_t)diff_sum * diff_sum;
   // Shifting by 10 divides by the 32 * 32 = 1024 samples in the block.
   return *sse - (sum_squared >> 10);
 }
 // Variance of a 32-wide, 64-row block. The block is processed as two 32x32
 // halves whose partial sums are combined. Stores the total sum of squared
 // differences in *sse and returns sse - sum^2 / 2048.
 unsigned int aom_variance32x64_neon(const uint8_t *a, int a_stride,
                                     const uint8_t *b, int b_stride,
                                     unsigned int *sse) {
   int half;
   int total_sum = 0;
   uint32_t total_sse = 0;
   for (half = 0; half < 2; ++half) {
     int part_sum;
     uint32_t part_sse;
     variance_neon_w8(a + (half * 32 * a_stride), a_stride,
                      b + (half * 32 * b_stride), b_stride, 32, 32, &part_sse,
                      &part_sum);
     total_sse += part_sse;
     total_sum += part_sum;
   }
   *sse = total_sse;
   // Shifting by 11 divides by the 32 * 64 = 2048 samples in the block.
   return *sse - (((int64_t)total_sum * total_sum) >> 11);
 }
 // Variance of a 64-wide, 32-row block. The block is processed as two 64x16
 // strips whose partial sums are combined. Stores the total sum of squared
 // differences in *sse and returns sse - sum^2 / 2048.
 unsigned int aom_variance64x32_neon(const uint8_t *a, int a_stride,
                                     const uint8_t *b, int b_stride,
                                     unsigned int *sse) {
   int strip;
   int total_sum = 0;
   uint32_t total_sse = 0;
   for (strip = 0; strip < 2; ++strip) {
     int part_sum;
     uint32_t part_sse;
     variance_neon_w8(a + (strip * 16 * a_stride), a_stride,
                      b + (strip * 16 * b_stride), b_stride, 64, 16, &part_sse,
                      &part_sum);
     total_sse += part_sse;
     total_sum += part_sum;
   }
   *sse = total_sse;
   // Shifting by 11 divides by the 64 * 32 = 2048 samples in the block.
   return *sse - (((int64_t)total_sum * total_sum) >> 11);
 }
 // Variance of a 64x64 block. The block is processed as four 64x16 strips
 // whose partial sums are combined. Stores the total sum of squared
 // differences in *sse and returns sse - sum^2 / 4096.
 unsigned int aom_variance64x64_neon(const uint8_t *a, int a_stride,
                                     const uint8_t *b, int b_stride,
                                     unsigned int *sse) {
   int strip;
   int total_sum = 0;
   uint32_t total_sse = 0;
   for (strip = 0; strip < 4; ++strip) {
     int part_sum;
     uint32_t part_sse;
     variance_neon_w8(a + (16 * strip * a_stride), a_stride,
                      b + (16 * strip * b_stride), b_stride, 64, 16, &part_sse,
                      &part_sum);
     total_sse += part_sse;
     total_sum += part_sum;
   }
   *sse = total_sse;
   // Shifting by 12 divides by the 64 * 64 = 4096 samples in the block.
   return *sse - (((int64_t)total_sum * total_sum) >> 12);
 }
 // Variance of a 16-wide, 8-row block of 8-bit samples (src_ptr vs. ref_ptr).
 // Stores the sum of squared differences in *sse and returns
 // sse - ((sum * sum) >> 7), where sum is the sum of the differences and the
 // shift by 7 divides by the 16 * 8 = 128 samples in the block.
 unsigned int aom_variance16x8_neon(const unsigned char *src_ptr,
                                    int source_stride,
                                    const unsigned char *ref_ptr,
                                    int recon_stride, unsigned int *sse) {
   int i;
   int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
   uint32x2_t d0u32, d10u32;
   int64x1_t d0s64, d1s64;
   uint8x16_t q0u8, q1u8, q2u8, q3u8;
   uint16x8_t q11u16, q12u16, q13u16, q14u16;
   int32x4_t q8s32, q9s32, q10s32;
   int64x2_t q0s64, q1s64, q5s64;
   // q8s32 accumulates the differences; q9s32 and q10s32 accumulate the
   // squared differences.
   q8s32 = vdupq_n_s32(0);
   q9s32 = vdupq_n_s32(0);
   q10s32 = vdupq_n_s32(0);
   // Each iteration consumes two 16-sample rows from both buffers, so four
   // iterations cover the eight rows.
   for (i = 0; i < 4; i++) {
     q0u8 = vld1q_u8(src_ptr);
     src_ptr += source_stride;
     q1u8 = vld1q_u8(src_ptr);
     src_ptr += source_stride;
     __builtin_prefetch(src_ptr);
     q2u8 = vld1q_u8(ref_ptr);
     ref_ptr += recon_stride;
     q3u8 = vld1q_u8(ref_ptr);
     ref_ptr += recon_stride;
     __builtin_prefetch(ref_ptr);
     // Widening subtracts: q11/q12 hold the first row's differences (low and
     // high 8 samples), q13/q14 the second row's.
     q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
     q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
     q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
     q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));
     // For each vector of eight differences: add them pairwise into the sum
     // accumulator, and multiply-accumulate the squares of the low and high
     // halves into the two square accumulators.
     d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
     d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
     q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
     q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
     q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
     d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
     d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
     q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
     q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
     q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
     d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
     d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
     q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
     q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
     q10s32 = vmlal_s16(q10s32, d27s16, d27s16);
     d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
     d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
     q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
     q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
     q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
   }
   // Reduce the accumulators: d0s64 becomes the total sum of differences,
   // d1s64 the total sum of squared differences.
   q10s32 = vaddq_s32(q10s32, q9s32);
   q0s64 = vpaddlq_s32(q8s32);
   q1s64 = vpaddlq_s32(q10s32);
   d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
   d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
   // Square the sum using lane 0 of its 32-bit view.
   q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), vreinterpret_s32_s64(d0s64));
   // Publish the sum of squared differences to the caller.
   vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
   // variance = sse - (sum * sum >> 7), evaluated in 32-bit unsigned lanes.
   d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
   d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
   return vget_lane_u32(d0u32, 0);
 }
 // Variance of an 8-wide, 16-row block of 8-bit samples (src_ptr vs. ref_ptr).
 // Stores the sum of squared differences in *sse and returns
 // sse - ((sum * sum) >> 7), where sum is the sum of the differences and the
 // shift by 7 divides by the 8 * 16 = 128 samples in the block.
 unsigned int aom_variance8x16_neon(const unsigned char *src_ptr,
                                    int source_stride,
                                    const unsigned char *ref_ptr,
                                    int recon_stride, unsigned int *sse) {
   int i;
   uint8x8_t d0u8, d2u8, d4u8, d6u8;
   int16x4_t d22s16, d23s16, d24s16, d25s16;
   uint32x2_t d0u32, d10u32;
   int64x1_t d0s64, d1s64;
   uint16x8_t q11u16, q12u16;
   int32x4_t q8s32, q9s32, q10s32;
   int64x2_t q0s64, q1s64, q5s64;
   // q8s32 accumulates the differences; q9s32 and q10s32 accumulate the
   // squared differences.
   q8s32 = vdupq_n_s32(0);
   q9s32 = vdupq_n_s32(0);
   q10s32 = vdupq_n_s32(0);
   // Each iteration consumes two 8-sample rows from both buffers, so eight
   // iterations cover the sixteen rows.
   for (i = 0; i < 8; i++) {
     d0u8 = vld1_u8(src_ptr);
     src_ptr += source_stride;
     d2u8 = vld1_u8(src_ptr);
     src_ptr += source_stride;
     __builtin_prefetch(src_ptr);
     d4u8 = vld1_u8(ref_ptr);
     ref_ptr += recon_stride;
     d6u8 = vld1_u8(ref_ptr);
     ref_ptr += recon_stride;
     __builtin_prefetch(ref_ptr);
     // Widening subtracts: q11 holds the first row's differences, q12 the
     // second row's.
     q11u16 = vsubl_u8(d0u8, d4u8);
     q12u16 = vsubl_u8(d2u8, d6u8);
     // For each vector of eight differences: add them pairwise into the sum
     // accumulator, and multiply-accumulate the squares of the low and high
     // halves into the two square accumulators.
     d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
     d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
     q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
     q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
     q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
     d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
     d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
     q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
     q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
     q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
   }
   // Reduce the accumulators: d0s64 becomes the total sum of differences,
   // d1s64 the total sum of squared differences.
   q10s32 = vaddq_s32(q10s32, q9s32);
   q0s64 = vpaddlq_s32(q8s32);
   q1s64 = vpaddlq_s32(q10s32);
   d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
   d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
   // Square the sum using lane 0 of its 32-bit view.
   q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), vreinterpret_s32_s64(d0s64));
   // Publish the sum of squared differences to the caller.
   vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
   // variance = sse - (sum * sum >> 7), evaluated in 32-bit unsigned lanes.
   d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
   d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
   return vget_lane_u32(d0u32, 0);
 }
 // Sum of squared differences between two 16x16 blocks of 8-bit samples
 // (src_ptr vs. ref_ptr). The result is both stored in *sse and returned; no
 // sum-of-differences term is computed or subtracted.
 unsigned int aom_mse16x16_neon(const unsigned char *src_ptr, int source_stride,
                                const unsigned char *ref_ptr, int recon_stride,
                                unsigned int *sse) {
   int i;
   int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
   int64x1_t d0s64;
   uint8x16_t q0u8, q1u8, q2u8, q3u8;
   int32x4_t q7s32, q8s32, q9s32, q10s32;
   uint16x8_t q11u16, q12u16, q13u16, q14u16;
   int64x2_t q1s64;
   // Four independent accumulators for the squared differences.
   q7s32 = vdupq_n_s32(0);
   q8s32 = vdupq_n_s32(0);
   q9s32 = vdupq_n_s32(0);
   q10s32 = vdupq_n_s32(0);
   // Each iteration consumes two 16-sample rows from both buffers, so eight
   // iterations cover the sixteen rows.
   for (i = 0; i < 8; i++) {  // mse16x16_neon_loop
     q0u8 = vld1q_u8(src_ptr);
     src_ptr += source_stride;
     q1u8 = vld1q_u8(src_ptr);
     src_ptr += source_stride;
     q2u8 = vld1q_u8(ref_ptr);
     ref_ptr += recon_stride;
     q3u8 = vld1q_u8(ref_ptr);
     ref_ptr += recon_stride;
     // Widening subtracts: q11/q12 hold the first row's differences (low and
     // high 8 samples), q13/q14 the second row's.
     q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
     q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
     q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
     q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));
     // Multiply-accumulate the squares of each group of four differences.
     d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
     d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
     q7s32 = vmlal_s16(q7s32, d22s16, d22s16);
     q8s32 = vmlal_s16(q8s32, d23s16, d23s16);
     d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
     d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
     q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
     q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
     d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
     d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
     q7s32 = vmlal_s16(q7s32, d26s16, d26s16);
     q8s32 = vmlal_s16(q8s32, d27s16, d27s16);
     d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
     d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
     q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
     q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
   }
   // Fold the four accumulators into one vector, then reduce it to a single
   // total.
   q7s32 = vaddq_s32(q7s32, q8s32);
   q9s32 = vaddq_s32(q9s32, q10s32);
   q10s32 = vaddq_s32(q7s32, q9s32);
   q1s64 = vpaddlq_s32(q10s32);
   d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
   // Lane 0 of the 32-bit view of the total is written to *sse and returned.
   vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d0s64), 0);
   return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0);
 }
 // Returns the sum of squared differences between two 4x4 blocks of 8-bit
 // samples (src_ptr vs. ref_ptr).
 //
 // Note: every row is loaded as 8 bytes from each buffer, although only the
 // differences of the first 4 samples contribute to the result.
 unsigned int aom_get4x4sse_cs_neon(const unsigned char *src_ptr,
                                    int source_stride,
                                    const unsigned char *ref_ptr,
                                    int recon_stride) {
   int row;
   int32x4_t sq_acc = vdupq_n_s32(0);
   int64x2_t pair_sums;
   int64x1_t total;
   for (row = 0; row < 4; ++row) {
     const uint8x8_t src_row = vld1_u8(src_ptr);
     const uint8x8_t ref_row = vld1_u8(ref_ptr);
     // Widening subtract, keeping only the four low lanes as signed
     // differences.
     const int16x4_t diff =
         vget_low_s16(vreinterpretq_s16_u16(vsubl_u8(src_row, ref_row)));
     sq_acc = vmlal_s16(sq_acc, diff, diff);
     src_ptr += source_stride;
     ref_ptr += recon_stride;
   }
   // Reduce the four 32-bit partial sums to a single total.
   pair_sums = vpaddlq_s32(sq_acc);
   total = vadd_s64(vget_low_s64(pair_sums), vget_high_s64(pair_sums));
   return vget_lane_u32(vreinterpret_u32_s64(total), 0);
 }
--- a/aom_dsp/avg.c
+++ b/aom_dsp/avg.c
@@ -1,31 +1,34 @@
 /*
- *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
- *  Use of this source code is governed by a BSD-style license
+ * This source code is subject to the terms of the BSD 2 Clause License and
- *  that can be found in the LICENSE file in the root of the source
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- *  tree. An additional intellectual property rights grant can be found
+ * was not distributed with this source code in the LICENSE file, you can
- *  in the file PATENTS.  All contributing project authors may
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- *  be found in the AUTHORS file in the root of the source tree.
+ * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
 #include <stdlib.h>
-#include "./vpx_dsp_rtcd.h"
+#include "./aom_dsp_rtcd.h"
-#include "vpx_ports/mem.h"
+#include "aom_ports/mem.h"
-unsigned int vpx_avg_8x8_c(const uint8_t *src, int stride) {
+unsigned int aom_avg_8x8_c(const uint8_t *src, int stride) {
  int i, j;
  int sum = 0;
  for (i = 0; i < 8; ++i, src += stride)
-    for (j = 0; j < 8; sum += src[j], ++j) {}
+    for (j = 0; j < 8; sum += src[j], ++j) {
    }
  return ROUND_POWER_OF_TWO(sum, 6);
 }
-unsigned int vpx_avg_4x4_c(const uint8_t *src, int stride) {
+unsigned int aom_avg_4x4_c(const uint8_t *src, int stride) {
  int i, j;
  int sum = 0;
  for (i = 0; i < 4; ++i, src += stride)
-    for (j = 0; j < 4; sum += src[j], ++j) {}
+    for (j = 0; j < 4; sum += src[j], ++j) {
    }
  return ROUND_POWER_OF_TWO(sum, 4);
 }
@@ -64,7 +67,7 @@ static void hadamard_col8(const int16_t *src_diff, int src_stride,
 // The order of the output coeff of the hadamard is not important. For
 // optimization purposes the final transpose may be skipped.
-void vpx_hadamard_8x8_c(const int16_t *src_diff, int src_stride,
+void aom_hadamard_8x8_c(const int16_t *src_diff, int src_stride,
                        int16_t *coeff) {
  int idx;
  int16_t buffer[64];
@@ -80,21 +83,21 @@ void vpx_hadamard_8x8_c(const int16_t *src_diff, int src_stride,
  for (idx = 0; idx < 8; ++idx) {
    hadamard_col8(tmp_buf, 8, coeff);  // tmp_buf: 12 bit
                                       // dynamic range [-2040, 2040]
-    coeff += 8;  // coeff: 15 bit
+    coeff += 8;                        // coeff: 15 bit
-                 // dynamic range [-16320, 16320]
+                                       // dynamic range [-16320, 16320]
    ++tmp_buf;
  }
 }
 // In place 16x16 2D Hadamard transform
-void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride,
+void aom_hadamard_16x16_c(const int16_t *src_diff, int src_stride,
                          int16_t *coeff) {
  int idx;
  for (idx = 0; idx < 4; ++idx) {
    // src_diff: 9 bit, dynamic range [-255, 255]
-    const int16_t *src_ptr = src_diff + (idx >> 1) * 8 * src_stride
+    const int16_t *src_ptr =
-                                + (idx & 0x01) * 8;
+        src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8;
-    vpx_hadamard_8x8_c(src_ptr, src_stride, coeff + idx * 64);
+    aom_hadamard_8x8_c(src_ptr, src_stride, coeff + idx * 64);
  }
  // coeff: 15 bit, dynamic range [-16320, 16320]
@@ -109,8 +112,8 @@ void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride,
    int16_t b2 = (a2 + a3) >> 1;  // [-16320, 16320]
    int16_t b3 = (a2 - a3) >> 1;
-    coeff[0]   = b0 + b2;  // 16 bit, [-32640, 32640]
+    coeff[0] = b0 + b2;  // 16 bit, [-32640, 32640]
-    coeff[64]  = b1 + b3;
+    coeff[64] = b1 + b3;
    coeff[128] = b0 - b2;
    coeff[192] = b1 - b3;
@@ -120,11 +123,10 @@ void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride,
 // coeff: 16 bits, dynamic range [-32640, 32640].
 // length: value range {16, 64, 256, 1024}.
-int vpx_satd_c(const int16_t *coeff, int length) {
+int aom_satd_c(const int16_t *coeff, int length) {
  int i;
  int satd = 0;
-  for (i = 0; i < length; ++i)
+  for (i = 0; i < length; ++i) satd += abs(coeff[i]);
    satd += abs(coeff[i]);
  // satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024]
  return satd;
@@ -132,7 +134,7 @@ int vpx_satd_c(const int16_t *coeff, int length) {
 // Integer projection onto row vectors.
 // height: value range {16, 32, 64}.
-void vpx_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref,
+void aom_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref,
                       const int ref_stride, const int height) {
  int idx;
  const int norm_factor = height >> 1;
@@ -140,8 +142,7 @@ void vpx_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref,
    int i;
    hbuf[idx] = 0;
    // hbuf[idx]: 14 bit, dynamic range [0, 16320].
-    for (i = 0; i < height; ++i)
+    for (i = 0; i < height; ++i) hbuf[idx] += ref[i * ref_stride];
      hbuf[idx] += ref[i * ref_stride];
    // hbuf[idx]: 9 bit, dynamic range [0, 510].
    hbuf[idx] /= norm_factor;
    ++ref;
@@ -149,20 +150,18 @@ void vpx_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref,
 }
 // width: value range {16, 32, 64}.
-int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width) {
+int16_t aom_int_pro_col_c(const uint8_t *ref, const int width) {
  int idx;
  int16_t sum = 0;
  // sum: 14 bit, dynamic range [0, 16320]
-  for (idx = 0; idx < width; ++idx)
+  for (idx = 0; idx < width; ++idx) sum += ref[idx];
    sum += ref[idx];
  return sum;
 }
 // ref: [0 - 510]
 // src: [0 - 510]
 // bwl: {2, 3, 4}
-int vpx_vector_var_c(const int16_t *ref, const int16_t *src,
+int aom_vector_var_c(const int16_t *ref, const int16_t *src, const int bwl) {
                     const int bwl) {
  int i;
  int width = 4 << bwl;
  int sse = 0, mean = 0, var;
@@ -178,57 +177,56 @@ int vpx_vector_var_c(const int16_t *ref, const int16_t *src,
  return var;
 }
-void vpx_minmax_8x8_c(const uint8_t *src, int src_stride,
+void aom_minmax_8x8_c(const uint8_t *src, int src_stride, const uint8_t *ref,
-                      const uint8_t *ref, int ref_stride,
+                      int ref_stride, int *min, int *max) {
                      int *min, int *max) {
  int i, j;
  *min = 255;
  *max = 0;
  for (i = 0; i < 8; ++i, src += src_stride, ref += ref_stride) {
    for (j = 0; j < 8; ++j) {
-      int diff = abs(src[j]-ref[j]);
+      int diff = abs(src[j] - ref[j]);
      *min = diff < *min ? diff : *min;
      *max = diff > *max ? diff : *max;
    }
  }
 }
-#if CONFIG_VP9_HIGHBITDEPTH
+#if CONFIG_AOM_HIGHBITDEPTH
-unsigned int vpx_highbd_avg_8x8_c(const uint8_t *src, int stride) {
+unsigned int aom_highbd_avg_8x8_c(const uint8_t *src, int stride) {
  int i, j;
  int sum = 0;
-  const uint16_t* s = CONVERT_TO_SHORTPTR(src);
+  const uint16_t *s = CONVERT_TO_SHORTPTR(src);
  for (i = 0; i < 8; ++i, s += stride)
-    for (j = 0; j < 8; sum += s[j], ++j) {}
+    for (j = 0; j < 8; sum += s[j], ++j) {
    }
  return ROUND_POWER_OF_TWO(sum, 6);
 }
-unsigned int vpx_highbd_avg_4x4_c(const uint8_t *src, int stride) {
+unsigned int aom_highbd_avg_4x4_c(const uint8_t *src, int stride) {
  int i, j;
  int sum = 0;
-  const uint16_t* s = CONVERT_TO_SHORTPTR(src);
+  const uint16_t *s = CONVERT_TO_SHORTPTR(src);
-  for (i = 0; i < 4; ++i, s+=stride)
+  for (i = 0; i < 4; ++i, s += stride)
-    for (j = 0; j < 4; sum += s[j], ++j) {}
+    for (j = 0; j < 4; sum += s[j], ++j) {
    }
  return ROUND_POWER_OF_TWO(sum, 4);
 }
-void vpx_highbd_minmax_8x8_c(const uint8_t *s8, int p, const uint8_t *d8,
+void aom_highbd_minmax_8x8_c(const uint8_t *s8, int p, const uint8_t *d8,
                             int dp, int *min, int *max) {
  int i, j;
-  const uint16_t* s = CONVERT_TO_SHORTPTR(s8);
+  const uint16_t *s = CONVERT_TO_SHORTPTR(s8);
-  const uint16_t* d = CONVERT_TO_SHORTPTR(d8);
+  const uint16_t *d = CONVERT_TO_SHORTPTR(d8);
  *min = 255;
  *max = 0;
  for (i = 0; i < 8; ++i, s += p, d += dp) {
    for (j = 0; j < 8; ++j) {
-      int diff = abs(s[j]-d[j]);
+      int diff = abs(s[j] - d[j]);
      *min = diff < *min ? diff : *min;
      *max = diff > *max ? diff : *max;
    }
  }
 }
-#endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif  // CONFIG_AOM_HIGHBITDEPTH
--- a/aom_dsp/bitreader.h
+++ b/aom_dsp/bitreader.h
@@ -0,0 +1,240 @@
 /*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
 #ifndef AOM_DSP_BITREADER_H_
 #define AOM_DSP_BITREADER_H_
 #include <assert.h>
 #include <limits.h>
 #include "./aom_config.h"
 #if CONFIG_EC_ADAPT && !CONFIG_EC_MULTISYMBOL
 #error "CONFIG_EC_ADAPT is enabled without enabling CONFIG_EC_MULTISYMBOL."
 #endif
 #include "aom/aomdx.h"
 #include "aom/aom_integer.h"
 #if CONFIG_ANS
 #include "aom_dsp/ansreader.h"
 #elif CONFIG_DAALA_EC
 #include "aom_dsp/daalaboolreader.h"
 #else
 #include "aom_dsp/dkboolreader.h"
 #endif
 #include "aom_dsp/prob.h"
 #include "av1/common/odintrin.h"
 // Accounting plumbing. When CONFIG_ACCOUNTING is enabled every reader helper
 // takes one extra trailing `const char *acct_str` label; ACCT_STR_PARAM adds
 // it to a parameter list and ACCT_STR_ARG(s) adds it to an argument list.
 // When accounting is disabled both expand to nothing, so the label supplied
 // by the caller is discarded by the preprocessor.
 #if CONFIG_ACCOUNTING
 #include "av1/common/accounting.h"
 #define ACCT_STR_NAME acct_str
 #define ACCT_STR_PARAM , const char *ACCT_STR_NAME
 #define ACCT_STR_ARG(s) , s
 #else
 #define ACCT_STR_PARAM
 #define ACCT_STR_ARG(s)
 #endif
 // Caller-facing entry points: each always accepts a label as its last
 // argument and forwards to the underscore-suffixed inline function, passing
 // the label on only through ACCT_STR_ARG.
 #define aom_read(r, prob, ACCT_STR_NAME) \
  aom_read_(r, prob ACCT_STR_ARG(ACCT_STR_NAME))
 #define aom_read_bit(r, ACCT_STR_NAME) \
  aom_read_bit_(r ACCT_STR_ARG(ACCT_STR_NAME))
 #define aom_read_tree(r, tree, probs, ACCT_STR_NAME) \
  aom_read_tree_(r, tree, probs ACCT_STR_ARG(ACCT_STR_NAME))
 #define aom_read_literal(r, bits, ACCT_STR_NAME) \
  aom_read_literal_(r, bits ACCT_STR_ARG(ACCT_STR_NAME))
 #define aom_read_tree_bits(r, tree, probs, ACCT_STR_NAME) \
  aom_read_tree_bits_(r, tree, probs ACCT_STR_ARG(ACCT_STR_NAME))
 #define aom_read_symbol(r, cdf, nsymbs, ACCT_STR_NAME) \
  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
 #ifdef __cplusplus
 extern "C" {
 #endif
 // aom_reader names the decoder state struct of whichever entropy coder was
 // selected at build time: ANS first, then Daala, otherwise the dk reader.
 #if CONFIG_ANS
 typedef struct AnsDecoder aom_reader;
 #elif CONFIG_DAALA_EC
 typedef struct daala_reader aom_reader;
 #else
 typedef struct aom_dk_reader aom_reader;
 #endif
 // Initializes reader `r` over `size` bytes starting at `buffer` by calling
 // the init routine of the configured entropy decoder and returning its
 // result. `decrypt_cb` / `decrypt_state` are forwarded only in the default
 // (dk) build; the ANS and Daala builds ignore them. The ANS build also
 // asserts that `size` fits in an int.
 static INLINE int aom_reader_init(aom_reader *r, const uint8_t *buffer,
                                  size_t size, aom_decrypt_cb decrypt_cb,
                                  void *decrypt_state) {
 #if CONFIG_ANS
  (void)decrypt_cb;
  (void)decrypt_state;
  assert(size <= INT_MAX);
  return ans_read_init(r, buffer, size);
 #elif CONFIG_DAALA_EC
  (void)decrypt_cb;
  (void)decrypt_state;
  return aom_daala_reader_init(r, buffer, size);
 #else
  return aom_dk_reader_init(r, buffer, size, decrypt_cb, decrypt_state);
 #endif
 }
 // Returns the end-of-data pointer reported by the configured backend.
 // Not supported with ANS: that build asserts (and returns NULL when asserts
 // are compiled out); per the assert message, callers should use the raw
 // buffer size instead.
 static INLINE const uint8_t *aom_reader_find_end(aom_reader *r) {
 #if CONFIG_ANS
  (void)r;
  assert(0 && "Use the raw buffer size with ANS");
  return NULL;
 #elif CONFIG_DAALA_EC
  return aom_daala_reader_find_end(r);
 #else
  return aom_dk_reader_find_end(r);
 #endif
 }
 // Returns the error status reported by the configured entropy decoder for
 // reader `r` (the value is passed through unchanged from the backend).
 static INLINE int aom_reader_has_error(aom_reader *r) {
 #if CONFIG_ANS
  return ans_reader_has_error(r);
 #elif CONFIG_DAALA_EC
  return aom_daala_reader_has_error(r);
 #else
  return aom_dk_reader_has_error(r);
 #endif
 }
 // Returns the position in the bit reader in bits.
 // Unimplemented for ANS: that build asserts and, with asserts compiled out,
 // returns 0. Other builds return the backend's own tell value.
 static INLINE uint32_t aom_reader_tell(const aom_reader *r) {
 #if CONFIG_ANS
  (void)r;
  assert(0 && "aom_reader_tell() is unimplemented for ANS");
  return 0;
 #elif CONFIG_DAALA_EC
  return aom_daala_reader_tell(r);
 #else
  return aom_dk_reader_tell(r);
 #endif
 }
 // Returns the position in the bit reader in 1/8th bits.
 // Unimplemented for ANS: that build asserts and, with asserts compiled out,
 // returns 0. Other builds return the backend's own fractional tell value.
 static INLINE uint32_t aom_reader_tell_frac(const aom_reader *r) {
 #if CONFIG_ANS
  (void)r;
  assert(0 && "aom_reader_tell_frac() is unimplemented for ANS");
  return 0;
 #elif CONFIG_DAALA_EC
  return aom_daala_reader_tell_frac(r);
 #else
  return aom_dk_reader_tell_frac(r);
 #endif
 }
 #if CONFIG_ACCOUNTING
 // Charges everything consumed since the previous record to the label
 // `acct_str`: the difference between the current fractional position and
 // `last_tell_frac` is recorded, then `last_tell_frac` is advanced.
 // Does nothing when the reader has no accounting object attached.
 static INLINE void aom_process_accounting(const aom_reader *r ACCT_STR_PARAM) {
  uint32_t position;
  if (r->accounting == NULL) return;
  position = aom_reader_tell_frac(r);
  aom_accounting_record(r->accounting, ACCT_STR_NAME,
                        position - r->accounting->last_tell_frac);
  r->accounting->last_tell_frac = position;
 }
 #endif
 // Decodes one binary symbol with probability `prob` using the configured
 // entropy decoder and returns it. With CONFIG_ACCOUNTING, a non-NULL label
 // causes the consumed amount to be recorded; a NULL label skips recording
 // (used by the composite readers so they can record once themselves).
 static INLINE int aom_read_(aom_reader *r, int prob ACCT_STR_PARAM) {
  int ret;
 #if CONFIG_ANS
  ret = uabs_read(r, prob);
 #elif CONFIG_DAALA_EC
  ret = aom_daala_read(r, prob);
 #else
  ret = aom_dk_read(r, prob);
 #endif
 #if CONFIG_ACCOUNTING
  if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
 #endif
  return ret;
 }
 // Decodes one equiprobable bit. ANS has a dedicated call for this case;
 // every other backend goes through aom_read() with probability 128. The
 // inner read is given a NULL label so that accounting, when enabled, is
 // recorded only once here under the caller's label.
 static INLINE int aom_read_bit_(aom_reader *r ACCT_STR_PARAM) {
  int ret;
 #if CONFIG_ANS
  ret = uabs_read_bit(r);  // Non trivial optimization at half probability
 #else
  ret = aom_read(r, 128, NULL);  // aom_prob_half
 #endif
 #if CONFIG_ACCOUNTING
  if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
 #endif
  return ret;
 }
 // Decodes a `bits`-wide unsigned value, most significant bit first, from
 // equiprobable bits. Returns 0 when `bits` is not positive. The per-bit
 // reads carry a NULL label; accounting (if enabled) is recorded once for
 // the whole literal.
 static INLINE int aom_read_literal_(aom_reader *r, int bits ACCT_STR_PARAM) {
  int value = 0;
  int shift = bits - 1;
  while (shift >= 0) {
    value |= aom_read_bit(r, NULL) << shift;
    --shift;
  }
 #if CONFIG_ACCOUNTING
  if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
 #endif
  return value;
 }
 // Walks `tree` from index 0 one decoded bit at a time: at node n the bit is
 // read with probability probs[n >> 1] and selects tree[n] or tree[n + 1].
 // The walk stops at the first non-positive entry, whose negation is
 // returned as the decoded value.
 static INLINE int aom_read_tree_bits_(aom_reader *r, const aom_tree_index *tree,
                                      const aom_prob *probs ACCT_STR_PARAM) {
  aom_tree_index node = 0;
  do {
    node = tree[node + aom_read(r, probs[node >> 1], NULL)];
  } while (node > 0);
 #if CONFIG_ACCOUNTING
  if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
 #endif
  return -node;
 }
 // Decodes one tree-coded symbol. The Daala build uses the backend's own
 // tree reader; all other builds use the bit-by-bit aom_read_tree_bits()
 // walker with a NULL label so accounting is recorded only once, here.
 static INLINE int aom_read_tree_(aom_reader *r, const aom_tree_index *tree,
                                 const aom_prob *probs ACCT_STR_PARAM) {
  int ret;
 #if CONFIG_DAALA_EC
  ret = daala_read_tree_bits(r, tree, probs);
 #else
  ret = aom_read_tree_bits(r, tree, probs, NULL);
 #endif
 #if CONFIG_ACCOUNTING
  if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
 #endif
  return ret;
 }
 #if CONFIG_EC_MULTISYMBOL
 // Decodes one multi-symbol value against `cdf`. The rANS path ignores
 // `nsymbs`; the Daala path passes it through. Building with
 // CONFIG_EC_MULTISYMBOL but neither backend is a compile-time error.
 // With CONFIG_EC_ADAPT the cdf is updated with the decoded symbol before
 // returning.
 static INLINE int aom_read_symbol_(aom_reader *r, aom_cdf_prob *cdf,
                                   int nsymbs ACCT_STR_PARAM) {
  int ret;
 #if CONFIG_RANS
  (void)nsymbs;
  ret = rans_read(r, cdf);
 #elif CONFIG_DAALA_EC
  ret = daala_read_symbol(r, cdf, nsymbs);
 #else
 #error \
    "CONFIG_EC_MULTISYMBOL is selected without a valid backing entropy " \
  "coder. Enable daala_ec or ans for a valid configuration."
 #endif
 #if CONFIG_EC_ADAPT
  update_cdf(cdf, ret, nsymbs);
 #endif
 #if CONFIG_ACCOUNTING
  if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
 #endif
  return ret;
 }
 #endif  // CONFIG_EC_MULTISYMBOL
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 #endif  // AOM_DSP_BITREADER_H_
--- a/aom_dsp/bitreader_buffer.c
+++ b/aom_dsp/bitreader_buffer.c
@@ -0,0 +1,47 @@
 /*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
 #include "./aom_config.h"
 #include "./bitreader_buffer.h"
 // Returns the number of bytes touched so far: the bit offset rounded up to
 // a whole byte.
 size_t aom_rb_bytes_read(struct aom_read_bit_buffer *rb) {
  const size_t bits_consumed = rb->bit_offset;
  return (bits_consumed + 7) >> 3;
 }
 // Reads the next bit (most significant bit of each byte first) and advances
 // the bit offset by one.
 //
 // If the offset already lies at or past bit_buffer_end, nothing is consumed,
 // the error handler is invoked with error_handler_data, and 0 is returned.
 // The handler is optional: a buffer set up with a NULL handler simply yields
 // 0 bits once exhausted instead of calling through a null function pointer.
 int aom_rb_read_bit(struct aom_read_bit_buffer *rb) {
  const size_t off = rb->bit_offset;
  const size_t p = off >> 3;           // byte index
  const int q = 7 - (int)(off & 0x7);  // shift that brings the bit to bit 0
  if (rb->bit_buffer + p < rb->bit_buffer_end) {
    const int bit = (rb->bit_buffer[p] >> q) & 1;
    rb->bit_offset = off + 1;
    return bit;
  }
  if (rb->error_handler) rb->error_handler(rb->error_handler_data);
  return 0;
 }
 // Reads `bits` bits, most significant first, and returns them as an
 // unsigned value packed into an int. Returns 0 when `bits` is not positive.
 int aom_rb_read_literal(struct aom_read_bit_buffer *rb, int bits) {
  int result = 0;
  int shift = bits - 1;
  while (shift >= 0) {
    result |= aom_rb_read_bit(rb) << shift;
    --shift;
  }
  return result;
 }
 // Reads a sign-magnitude value: a `bits`-wide magnitude followed by one sign
 // bit, where a set sign bit negates the magnitude.
 int aom_rb_read_signed_literal(struct aom_read_bit_buffer *rb, int bits) {
  const int magnitude = aom_rb_read_literal(rb, bits);
  if (aom_rb_read_bit(rb)) return -magnitude;
  return magnitude;
 }
 // Reads a (bits + 1)-bit field and sign-extends it to int: the field is
 // shifted up so its first bit lands in the top bit of an unsigned, then
 // shifted back down as a signed value. The right shift of a negative int is
 // implementation-defined in C; this relies on it being arithmetic.
 int aom_rb_read_inv_signed_literal(struct aom_read_bit_buffer *rb, int bits) {
  const int pad = sizeof(unsigned) * 8 - bits - 1;
  unsigned field = (unsigned)aom_rb_read_literal(rb, bits + 1);
  field <<= pad;
  return ((int)field) >> pad;
 }
--- a/aom_dsp/bitreader_buffer.h
+++ b/aom_dsp/bitreader_buffer.h
@@ -0,0 +1,48 @@
 /*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
 #ifndef AOM_DSP_BITREADER_BUFFER_H_
 #define AOM_DSP_BITREADER_BUFFER_H_
 #include <limits.h>
 #include "aom/aom_integer.h"
 #ifdef __cplusplus
 extern "C" {
 #endif
 // Callback invoked by the bit reader when a read is attempted past the end
 // of the buffer; `data` is the buffer's error_handler_data.
 typedef void (*aom_rb_error_handler)(void *data);
 // State for reading individual bits, MSB first, from a byte buffer.
 struct aom_read_bit_buffer {
  const uint8_t *bit_buffer;      // first byte of the data
  const uint8_t *bit_buffer_end;  // one past the last readable byte
  size_t bit_offset;              // index of the next bit to read
  void *error_handler_data;       // opaque argument passed to error_handler
  aom_rb_error_handler error_handler;  // called on read past bit_buffer_end
 };
 // Number of bytes consumed so far (bit offset rounded up to a whole byte).
 size_t aom_rb_bytes_read(struct aom_read_bit_buffer *rb);
 // Reads one bit; on read past the end calls the error handler and returns 0.
 int aom_rb_read_bit(struct aom_read_bit_buffer *rb);
 // Reads `bits` bits, most significant first, as an unsigned value.
 int aom_rb_read_literal(struct aom_read_bit_buffer *rb, int bits);
 // Reads a `bits`-wide magnitude followed by a sign bit (1 = negative).
 int aom_rb_read_signed_literal(struct aom_read_bit_buffer *rb, int bits);
 // Reads a (bits + 1)-bit field and sign-extends it to int.
 int aom_rb_read_inv_signed_literal(struct aom_read_bit_buffer *rb, int bits);
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 #endif  // AOM_DSP_BITREADER_BUFFER_H_
--- a/aom_dsp/bitwriter.h
+++ b/aom_dsp/bitwriter.h
@@ -0,0 +1,179 @@
 /*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
 #ifndef AOM_DSP_BITWRITER_H_
 #define AOM_DSP_BITWRITER_H_
 #include <assert.h>
 #include "./aom_config.h"
 #if CONFIG_EC_ADAPT && !CONFIG_EC_MULTISYMBOL
 #error "CONFIG_EC_ADAPT is enabled without enabling CONFIG_EC_MULTISYMBOL"
 #endif
 #if CONFIG_ANS
 #include "aom_dsp/buf_ans.h"
 #elif CONFIG_DAALA_EC
 #include "aom_dsp/daalaboolwriter.h"
 #else
 #include "aom_dsp/dkboolwriter.h"
 #endif
 #include "aom_dsp/prob.h"
 #if CONFIG_RD_DEBUG
 #include "av1/encoder/cost.h"
 #endif
 #ifdef __cplusplus
 extern "C" {
 #endif
 // aom_writer names the encoder state struct of whichever entropy coder was
 // selected at build time: ANS first, then Daala, otherwise the dk writer.
 #if CONFIG_ANS
 typedef struct BufAnsCoder aom_writer;
 #elif CONFIG_DAALA_EC
 typedef struct daala_writer aom_writer;
 #else
 typedef struct aom_dk_writer aom_writer;
 #endif
 // Running cost total filled in by the *_record writers; it is only
 // accumulated when CONFIG_RD_DEBUG is enabled.
 typedef struct TOKEN_STATS { int64_t cost; } TOKEN_STATS;
 // Begins encoding into `buffer` with the configured entropy coder.
 // Not usable with ANS: that build only asserts, because (per the assert
 // message) buf_ans requires a more complicated startup procedure.
 static INLINE void aom_start_encode(aom_writer *bc, uint8_t *buffer) {
 #if CONFIG_ANS
  (void)bc;
  (void)buffer;
  assert(0 && "buf_ans requires a more complicated startup procedure");
 #elif CONFIG_DAALA_EC
  aom_daala_start_encode(bc, buffer);
 #else
  aom_dk_start_encode(bc, buffer);
 #endif
 }
 // Finishes encoding with the configured entropy coder.
 // Not usable with ANS: that build only asserts, because (per the assert
 // message) buf_ans requires a more complicated shutdown procedure.
 static INLINE void aom_stop_encode(aom_writer *bc) {
 #if CONFIG_ANS
  (void)bc;
  assert(0 && "buf_ans requires a more complicated shutdown procedure");
 #elif CONFIG_DAALA_EC
  aom_daala_stop_encode(bc);
 #else
  aom_dk_stop_encode(bc);
 #endif
 }
 // Encodes one binary symbol `bit` with the given `probability` by calling
 // the write routine of the configured entropy coder.
 static INLINE void aom_write(aom_writer *br, int bit, int probability) {
 #if CONFIG_ANS
  buf_uabs_write(br, bit, probability);
 #elif CONFIG_DAALA_EC
  aom_daala_write(br, bit, probability);
 #else
  aom_dk_write(br, bit, probability);
 #endif
 }
 // Same as aom_write(), and additionally adds the cost of coding `bit` at
 // `probability` to token_stats->cost when CONFIG_RD_DEBUG is enabled.
 // Without CONFIG_RD_DEBUG `token_stats` is unused.
 static INLINE void aom_write_record(aom_writer *br, int bit, int probability,
                                    TOKEN_STATS *token_stats) {
  aom_write(br, bit, probability);
 #if CONFIG_RD_DEBUG
  token_stats->cost += av1_cost_bit(probability, bit);
 #else
  (void)token_stats;
 #endif
 }
 // Encodes `bit` as an equiprobable binary symbol.
 static INLINE void aom_write_bit(aom_writer *w, int bit) {
  const int half_probability = 128;  // aom_prob_half
  aom_write(w, bit, half_probability);
 }
 // Encodes `bit` as an equiprobable binary symbol through the cost-recording
 // writer, passing `token_stats` along.
 static INLINE void aom_write_bit_record(aom_writer *w, int bit,
                                        TOKEN_STATS *token_stats) {
  const int half_probability = 128;  // aom_prob_half
  aom_write_record(w, bit, half_probability, token_stats);
 }
 // Encodes the low `bits` bits of `data`, most significant first, as
 // equiprobable bits. Writes nothing when `bits` is not positive.
 static INLINE void aom_write_literal(aom_writer *w, int data, int bits) {
  int shift = bits - 1;
  while (shift >= 0) {
    aom_write_bit(w, 1 & (data >> shift));
    --shift;
  }
 }
 // Encodes the `len`-bit path `bits` (most significant bit first) through
 // tree `tr`, starting at node `i`. Each bit is coded with probs[i >> 1] and
 // then selects the next node tr[i + bit]. The body always runs at least
 // once, so callers are expected to pass len >= 1.
 static INLINE void aom_write_tree_bits(aom_writer *w, const aom_tree_index *tr,
                                       const aom_prob *probs, int bits, int len,
                                       aom_tree_index i) {
  do {
    int branch;
    len -= 1;
    branch = (bits >> len) & 1;
    aom_write(w, branch, probs[i >> 1]);
    i = tr[i + branch];
  } while (len != 0);
 }
 // Cost-recording variant of the tree path writer: encodes the `len`-bit
 // path `bits` (most significant bit first) through tree `tr` from node `i`,
 // sending every bit through aom_write_record() with `token_stats`. The body
 // always runs at least once, so callers are expected to pass len >= 1.
 static INLINE void aom_write_tree_bits_record(aom_writer *w,
                                              const aom_tree_index *tr,
                                              const aom_prob *probs, int bits,
                                              int len, aom_tree_index i,
                                              TOKEN_STATS *token_stats) {
  do {
    int branch;
    len -= 1;
    branch = (bits >> len) & 1;
    aom_write_record(w, branch, probs[i >> 1], token_stats);
    i = tr[i + branch];
  } while (len != 0);
 }
 // Encodes a tree-coded symbol given as a `len`-bit path `bits` starting at
 // node `i`. The Daala build uses the backend's own tree writer; all other
 // builds use the bit-by-bit aom_write_tree_bits().
 static INLINE void aom_write_tree(aom_writer *w, const aom_tree_index *tree,
                                  const aom_prob *probs, int bits, int len,
                                  aom_tree_index i) {
 #if CONFIG_DAALA_EC
  daala_write_tree_bits(w, tree, probs, bits, len, i);
 #else
  aom_write_tree_bits(w, tree, probs, bits, len, i);
 #endif
 }
 // Cost-recording variant of aom_write_tree().
 // NOTE: in the Daala build `token_stats` is ignored, so no cost is
 // accumulated for tree symbols there; only the non-Daala path records cost
 // (via aom_write_tree_bits_record()).
 static INLINE void aom_write_tree_record(aom_writer *w,
                                         const aom_tree_index *tree,
                                         const aom_prob *probs, int bits,
                                         int len, aom_tree_index i,
                                         TOKEN_STATS *token_stats) {
 #if CONFIG_DAALA_EC
  (void)token_stats;
  daala_write_tree_bits(w, tree, probs, bits, len, i);
 #else
  aom_write_tree_bits_record(w, tree, probs, bits, len, i, token_stats);
 #endif
 }
 #if CONFIG_EC_MULTISYMBOL
 // Encodes symbol `symb` against the cumulative distribution `cdf`.
 // rANS path: builds a rans_sym whose cum_prob is cdf[symb - 1] (0 for the
 // first symbol) and whose prob is the step up to cdf[symb]; `nsymbs` is
 // unused there. Daala path: forwards to the backend with `nsymbs`.
 // Building with CONFIG_EC_MULTISYMBOL but neither backend is a compile-time
 // error. With CONFIG_EC_ADAPT the cdf is updated with `symb` afterwards.
 static INLINE void aom_write_symbol(aom_writer *w, int symb, aom_cdf_prob *cdf,
                                    int nsymbs) {
 #if CONFIG_RANS
  struct rans_sym s;
  (void)nsymbs;
  assert(cdf);
  s.cum_prob = symb > 0 ? cdf[symb - 1] : 0;
  s.prob = cdf[symb] - s.cum_prob;
  buf_rans_write(w, &s);
 #elif CONFIG_DAALA_EC
  daala_write_symbol(w, symb, cdf, nsymbs);
 #else
 #error \
    "CONFIG_EC_MULTISYMBOL is selected without a valid backing entropy " \
  "coder. Enable daala_ec or ans for a valid configuration."
 #endif
 #if CONFIG_EC_ADAPT
  update_cdf(cdf, symb, nsymbs);
 #endif
 }
 #endif  // CONFIG_EC_MULTISYMBOL
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 #endif  // AOM_DSP_BITWRITER_H_
--- a/aom_dsp/bitwriter_buffer.c
+++ b/aom_dsp/bitwriter_buffer.c
@@ -0,0 +1,43 @@
 /*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
 #include <limits.h>
 #include <stdlib.h>
 #include "./aom_config.h"
 #include "./bitwriter_buffer.h"
 // Returns the number of bytes that hold written bits: the bit offset
 // rounded up to a whole byte.
 size_t aom_wb_bytes_written(const struct aom_write_bit_buffer *wb) {
  const size_t whole_bytes = wb->bit_offset / CHAR_BIT;
  const int has_partial_byte = wb->bit_offset % CHAR_BIT > 0;
  return whole_bytes + has_partial_byte;
 }
 // Writes one bit at the current offset, filling each byte from its most
 // significant bit down, then advances the offset by one. The first bit
 // written into a byte overwrites the whole byte (clearing the bits below
 // it); later bits replace only their own position.
 void aom_wb_write_bit(struct aom_write_bit_buffer *wb, int bit) {
  const int pos = (int)wb->bit_offset;
  const int shift = CHAR_BIT - 1 - pos % CHAR_BIT;
  uint8_t *const dst = &wb->bit_buffer[pos / CHAR_BIT];
  if (shift != CHAR_BIT - 1) {
    *dst &= ~(1 << shift);
    *dst |= bit << shift;
  } else {
    *dst = bit << shift;
  }
  wb->bit_offset = pos + 1;
 }
 // Writes the low `bits` bits of `data`, most significant first. Writes
 // nothing when `bits` is not positive.
 void aom_wb_write_literal(struct aom_write_bit_buffer *wb, int data, int bits) {
  int shift = bits - 1;
  while (shift >= 0) {
    aom_wb_write_bit(wb, (data >> shift) & 1);
    --shift;
  }
 }
 // Writes `data` as a (bits + 1)-bit field, most significant bit first; the
 // extra leading bit carries the sign of the value's low-order bit pattern.
 void aom_wb_write_inv_signed_literal(struct aom_write_bit_buffer *wb, int data,
                                      int bits) {
  const int field_width = bits + 1;
  aom_wb_write_literal(wb, data, field_width);
 }
--- a/aom_dsp/bitwriter_buffer.h
+++ b/aom_dsp/bitwriter_buffer.h
@@ -0,0 +1,39 @@
 /*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
 #ifndef AOM_DSP_BITWRITER_BUFFER_H_
 #define AOM_DSP_BITWRITER_BUFFER_H_
 #include "aom/aom_integer.h"
 #ifdef __cplusplus
 extern "C" {
 #endif
 // State for writing individual bits, MSB first, into a byte buffer.
 struct aom_write_bit_buffer {
  uint8_t *bit_buffer;  // destination bytes
  size_t bit_offset;    // index of the next bit to write
 };
 // Number of bytes holding written bits (bit offset rounded up).
 size_t aom_wb_bytes_written(const struct aom_write_bit_buffer *wb);
 // Writes one bit at the current offset and advances by one bit.
 void aom_wb_write_bit(struct aom_write_bit_buffer *wb, int bit);
 // Writes the low `bits` bits of `data`, most significant first.
 void aom_wb_write_literal(struct aom_write_bit_buffer *wb, int data, int bits);
 // Writes `data` as a (bits + 1)-bit field, most significant first.
 void aom_wb_write_inv_signed_literal(struct aom_write_bit_buffer *wb, int data,
                                      int bits);
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 #endif  // AOM_DSP_BITWRITER_BUFFER_H_
--- a/Show More
+++ b/Show More