Compare commits


365 Commits

Author SHA1 Message Date
James Zern
8b4210940c vpx_dsp/get_prob: relocate den == 0 test
to get_binary_prob(). the only other caller mode_mv_merge_probs() does
its own test on 0.

BUG=chromium:639712

Change-Id: I1178688706baeca2883f7aadbc254abb219a44ce
(cherry picked from commit 93c823e24b)
2016-10-04 15:18:58 -07:00
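
A minimal sketch of where the test lands (mirroring the shape of prob.h; details are illustrative, not the exact tree):

  #include <stdint.h>

  typedef uint8_t vpx_prob;

  vpx_prob get_prob(unsigned int num, unsigned int den); /* asserts den != 0 */

  /* the den == 0 guard now lives in the one caller that can produce it */
  static inline vpx_prob get_binary_prob(unsigned int n0, unsigned int n1) {
    const unsigned int den = n0 + n1;
    if (den == 0) return 128u; /* no observations: neutral probability */
    return get_prob(n0, den);
  }
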
James Zern
82ea742237 vpx_dsp/get_prob: make clip_prob branchless
+ inline the function directly as there was only one consumer
(get_prob())

this is an attempt to reduce the number of branches to work around an AMD
bug. this change is mildly faster or neutral across x86-64, arm.

http://support.amd.com/TechDocs/44739_12h_Rev_Gd.pdf
665 Integer Divide Instruction May Cause Unpredictable Behavior

BUG=chromium:639712

Suggested-by: Pascal Massimino <pascal.massimino@gmail.com>
Change-Id: Ia91823aded79aab469dd68095d44300e8df04ed2
(cherry picked from commit 7481edb33f)
2016-10-04 15:18:58 -07:00
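
A sketch of the branchless clip described above (this matches the idea of the change; treat it as illustrative rather than the exact tree):

  #include <assert.h>
  #include <stdint.h>

  typedef uint8_t vpx_prob;

  static inline vpx_prob get_prob(unsigned int num, unsigned int den) {
    assert(den != 0);
    {
      const int p = (int)(((int64_t)num * 256 + (den >> 1)) / den);
      /* branchless (p > 255) ? 255 : (p < 1) ? 1 : p:
       * if p > 255, (255 - p) >> 23 is all ones (arithmetic shift) and
       * saturates the low byte; if p == 0, (p == 0) bumps the result to 1 */
      const vpx_prob clipped_prob = p | ((255 - p) >> 23) | (p == 0);
      return clipped_prob;
    }
  }
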
Marco Paniconi
f5bd76f5c1 Merge "Revert "vp8: Move loopfilter synchronization to end of encode_frame call."" 2016-08-22 15:46:57 +00:00
Marco Paniconi
de075a95e0 Revert "vp8: Move loopfilter synchronization to end of encode_frame call."
This reverts commit c2fe9acced.

This change breaks the Linux browser test in Chromium:
https://build.chromium.org/p/chromium.webrtc/builders/Linux%20Tester

Change-Id: I226782fad480c17a99ec6c785ad93cf4ab88f0ae
2016-08-22 15:46:20 +00:00
Yunqing Wang
37169c0bd4 Merge "Adjust speed features for 4k video encoding" 2016-08-19 23:11:05 +00:00
Yunqing Wang
fe488cceff Adjust speed features for 4k video encoding
Adjusted speed 2 features to speed up 4k video encoding.
BDBR results from borg test:
PSNR: +0.313%; SSIM: +0.268%.
Average speedup: 8.5%

Change-Id: I1e2695a01fb3f3817c1df4480e184c2aed8f2eba
2016-08-19 09:30:32 -07:00
James Zern
149d082377 vp9_pickmode: quiet float conversion warnings
Change-Id: I591e4f958955b3f2edb2f95a83c54cd83c8ef075
2016-08-19 01:28:01 -07:00
James Zern
8b4c31584e vp9_alloc_context_buffers: clear cm->mi* on failure
this fixes a crash in vp9_dec_setup_mi() via
vp9_init_context_buffers() should decoding continue and the decoder
resync on a smaller frame

BUG=b/30593752

Change-Id: I9ce8d94abe89bcd058697e8bd8599690e61bd380
2016-08-19 00:18:11 -07:00
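
A self-contained sketch of the failure-path pattern (names are stand-ins, not the literal patch):

  #include <stdlib.h>

  typedef struct {
    int *mip, *mi; /* stand-ins for the cm->mi* pointers */
  } MiBuffers;

  static int alloc_mi(MiBuffers *b, size_t mi_size) {
    free(b->mip);
    b->mip = (int *)calloc(mi_size, sizeof(*b->mip));
    if (b->mip == NULL) {
      b->mi = NULL; /* the fix: never leave a stale pointer behind */
      return 1;     /* caller aborts this frame */
    }
    b->mi = b->mip; /* a later resync now reads valid memory */
    return 0;
  }
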
Jacky Chen
52db2b1690 Merge "vp9 svc: SVC encoder speed up." 2016-08-18 21:21:29 +00:00
Johann Koenig
33dedd0628 Merge "Remove '-chromium' flag from ads2gas_apple.pl" 2016-08-18 19:54:55 +00:00
JackyChen
8be7e572a7 vp9 svc: SVC encoder speed up.
Bias towards base_mv and skip 1/4 pixel motion search when using base mv.
2~3% speed up for 2 spatial layers, 3~5% speed up for 3 spatial layers.
PSNR loss:
(2 layers) 0.07dB for gips_stationary, 0.04dB for gips_motion;
(3 layers) 0.07dB for gips_stationary, 0.06dB for gips_motion.

Change-Id: I773acbda080c301cabe8cd259f842bcc5b8bc999
2016-08-18 11:25:45 -07:00
Marco Paniconi
1c07abca18 Merge "vp9 non-rd pickmode: Add limit on newmv-last and golden bias." 2016-08-18 18:03:48 +00:00
Marco Paniconi
37a39ac138 Merge "vp8: Move loopfilter synchronization to end of encode_frame call." 2016-08-18 02:46:31 +00:00
Marco
7eb7d6b227 vp9 non-rd pickmode: Add limit on newmv-last and golden bias.
Add option, for newmv-last, to limit the rd-threshold update for early exit,
under a source variance condition.
This can improve visual quality in low texture moving areas,
like forehead/faces.

Also add a bias against golden to improve the speed/fps,
with little/negligible loss in quality.

Only affects CBR mode, non-svc, non-screen-content.

Change-Id: I3a5229eee860c71499a6fd464c450b167b07534d
2016-08-17 14:33:44 -07:00
Johann
1b982cc64f Remove '-chromium' flag from ads2gas_apple.pl
The flag was added because Apple clang and Chromium clang disagreed
for certain versions of instructions.

qsubaddx, qaddsubx, ldrneb and ldrneh were used in armv6 assembly
which was removed in d55724fae9

vqshrun was used in some neon assembly but superseded by
dcbfacbb98

.include was used for obj_int_extract/asm_offsets and removed in
6eec73a747

Change-Id: I32f4c9b536d0318482101c0b8e91e42b8f545f18
2016-08-17 14:05:16 -07:00
paulwilkins
af3b0de732 Add casting to fix warning.
Frame bits can safely be stored as an int but group bits
(kf or arf) use 64 bits.

Change-Id: I0800f2a28070f8749110a95721c116fc56987885
2016-08-17 11:18:07 +01:00
paulwilkins
ab7cd6d068 Add {} to try and keep Jenkins happy.
Change-Id: If1ca3cf83e058317c9751d7da6caa7cd75eb6845
2016-08-17 11:17:36 +01:00
Marco
c2fe9acced vp8: Move loopfilter synchronization to end of encode_frame call.
Change-Id: I5bdfea7f51df1f1fa5d9c1597e96988acce6c2f2
2016-08-16 11:22:23 -07:00
Linfeng Zhang
f9efbad392 NEON asm of vpx_lpf_{horizontal,vertical}_8_dual_neon()
Also expose the NEON intrinsics version.

BUG=webm:1261, webm:1266.

Change-Id: I8c4ae658467dcf66ebf7a75982b2ef712dbb4535
2016-08-16 08:50:57 -07:00
paulwilkins
5d881770e5 Change default recode rule for good speed 0 and best.
Changes the default recode rule for Speed 0 and best quality
from ALLOW_RECODE to ALLOW_RECODE_KFARFGF.

Tested on the NF, hdres, midres and lowres test sets, this setting,
when combined with patch I40cb559..., now performs "as well" in
metrics terms (in fact it came out a tiny amount better overall),
but encode time is 9.6% faster (measured as the average
from 27 mid-rate local encodes on clips in the derf/lowres set).

Change-Id: I8c781c0cdfa3a9929cd9406d15582fce47d6ae3b
2016-08-15 10:52:54 +01:00
paulwilkins
de3b769524 Change to recode rules.
Allow recodes for the first inter frame in each arf group
even when the recode rule is set to ALLOW_RECODE_KFARFGF.

Small gains of 0.05%.

Change-Id: I40cb559d36a2bf0ebf5cf758c3f92e452b480577
2016-08-15 10:52:02 +01:00
Paul Wilkins
fe4dd4f43f Merge "Modified ARF group allocation." 2016-08-15 09:42:30 +00:00
Yunqing Wang
fafec95702 Merge "Fix another motion vector out of range bug" 2016-08-12 23:52:14 +00:00
James Zern
dfcefe06fa Merge "variance_impl_avx2: restore table layout" 2016-08-12 23:02:27 +00:00
James Zern
bd7cfb46fb variance_impl_avx2: restore table layout
disable clang-format for bilinear_filters_avx2

restores the row layout prior to:
099bd7f vpx_dsp: apply clang-format
but keeps the justification used by clang-format

Change-Id: Icf1733a37edb807e74c26b23a93963c03bd08fd7
2016-08-12 11:52:53 -07:00
Linfeng Zhang
f09b5a3328 NEON intrinsics for 4 loopfilter functions
New NEON intrinsics functions:
vpx_lpf_horizontal_edge_8_neon()
vpx_lpf_horizontal_edge_16_neon()
vpx_lpf_vertical_16_neon()
vpx_lpf_vertical_16_dual_neon()

BUG=webm:1262, webm:1263, webm:1264, webm:1265.

Change-Id: I7a2aff2a358b22277429329adec606e08efbc8cb
2016-08-12 09:58:17 -07:00
Yunqing Wang
a413dbe594 Fix another motion vector out of range bug
This patch fixed a motion vector out of range bug:
vpxenc: ../libvpx/vp9/encoder/vp9_mcomp.c:69:
 mv_cost: Assertion `mv->col >= -((1 << (11 + 1 + 2)) - 1) &&
 mv->col < ((1 << (11 + 1 + 2)) - 1)' failed.

For blocks that returned without running the full-pixel search, the original
MV limits were not restored, which caused the failure. Moved the set
MV limit function down to fix the bug.

Change-Id: Id7d798fc7214e95c6e4846c588f0233fcf1a4223
2016-08-12 09:27:58 -07:00
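
A self-contained sketch of the save/restore pattern behind the fix (names follow vp9's style but are illustrative):

  typedef struct { int col_min, col_max, row_min, row_max; } MvLimits;

  static int full_pixel_search(MvLimits *limits, int skip_search) {
    const MvLimits saved = *limits; /* save before tightening */
    /* ... tighten *limits around the predicted MV ... */
    if (skip_search) {
      *limits = saved; /* previously missing on this early-return path */
      return 0;
    }
    /* ... run the motion search ... */
    *limits = saved;
    return 1;
  }
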
Marco
f1e12c1bf3 vp8: Fix denoiser setting in multi-res sample encoder.
Change-Id: I9222f3b252e5ed883659f1a14cd705944ee9da07
2016-08-10 16:22:08 -07:00
paulwilkins
656f4a88cf Modified ARF group allocation.
Small average gains in the range 0.05 - 0.1

Change-Id: I30e85c04be615cc84726427c5057388b20a6ff60
2016-08-10 14:22:01 -07:00
Aleksey Vasenev
343b6b09a1 Align thread entry point stack
_beginthreadex does not align the stack on a 16-byte boundary as expected
by gcc.

On x86 targets, the force_align_arg_pointer attribute may be applied to
individual function definitions, generating an alternate prologue and
epilogue that realigns the run-time stack if necessary. This supports
mixing legacy codes that run with a 4-byte aligned stack with modern
codes that keep a 16-byte stack for SSE compatibility.
https://gcc.gnu.org/onlinedocs/gcc/x86-Function-Attributes.html

Change-Id: Ie4e4ab32948c238fa87054d5664189972ca6708e
Signed-off-by: Aleksey Vasenev <margtu-fivt@ya.ru>
2016-08-10 11:57:34 -07:00
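
A sketch of how the attribute is applied (the real tree wraps this in its vpx_thread macros; names here are illustrative):

  #if defined(__GNUC__) && defined(__i386__)
  #define ALIGN_STACK __attribute__((force_align_arg_pointer))
  #else
  #define ALIGN_STACK
  #endif

  /* entry point handed to _beginthreadex(): the generated prologue
   * realigns the stack to 16 bytes, so SSE code inside is safe even
   * when the caller only guarantees 4-byte alignment */
  static ALIGN_STACK unsigned int thread_loop(void *arg) {
    (void)arg;
    return 0;
  }
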
James Zern
4916a87bfc Merge changes I1d3edbdb,I8b49fd05
* changes:
  tests: use scoped_ptr for local video source vars
  y4m_test: init members in the constructor
2016-08-10 00:05:58 +00:00
Alex Converse
941fe20336 Merge "Refactor mv limits." 2016-08-09 17:12:50 +00:00
James Zern
475e9d26e0 tests: use scoped_ptr for local video source vars
prevents leak warnings on ASSERT*() failures

Change-Id: I1d3edbdbb18dbbe3b17691971348a8121cf09afa
2016-08-08 14:43:14 -07:00
Yury Gitman
c37d012ada Merge "Add cpi parameter for forcing segmentation update" 2016-08-08 21:29:42 +00:00
James Zern
9e9722bc79 y4m_test: init members in the constructor
prevents use of an uninitialized value in the destructor should the
test fail before tmpfile_ is set.

Change-Id: I8b49fd05f0d05e055fdf653bd46983d30f466a68
2016-08-08 14:27:34 -07:00
Yury Gitman
7a730d5901 Add cpi parameter for forcing segmentation update
Change-Id: I1b0bcb1ffe7604117bfaa0b9989d0e25ff04d28c
2016-08-08 13:20:42 -07:00
James Zern
cfd92dab18 Merge changes from topic 'clang-tidy'
* changes:
  *_perf_test.cc: correct DoDecode signature
  test: apply clang-tidy google-readability-braces-around-statements
2016-08-08 20:12:42 +00:00
Alex Converse
6554333b59 Refactor mv limits.
Change-Id: Ifebdc9ef37850508eb4b8e572fd0f6026ab04987
2016-08-08 11:54:00 -07:00
Yunqing Wang
6a8d4631a8 Merge "Fix a motion vector out of range bug" 2016-08-08 17:59:50 +00:00
James Zern
2c17d54681 *_perf_test.cc: correct DoDecode signature
+ delete unused kMaxPsnr from decode_perf_test.cc

Change-Id: Id93347631e7870491069a8b7c5bb1f6b2828425f
2016-08-05 20:21:02 -07:00
clang-format
9c9d92ae3a test: apply clang-tidy google-readability-braces-around-statements
applied against a x86_64 configure with and without
--enable-vp9-highbitdepth

clang-tidy-3.7.1 \
  -checks='-*,google-readability-braces-around-statements' \
  -header-filter='.*' -fix
+ clang-format afterward

Change-Id: Ia2993ec64cf1eb3505d3bfb39068d9e44cfbce8d
2016-08-05 20:02:28 -07:00
Linfeng Zhang
2d1e63d0c5 Remove duplicates in Loop8Test6Param and Loop8Test9Param
Extract the duplicated data generation code in OperationCheck() of
Loop8Test6Param and Loop8Test9Param, and put in function InitInput().

Change-Id: Ied39ba4ee86b50501cc5d10ebf54f5333c4708f0
2016-08-05 19:51:01 -07:00
James Zern
c12f2f3187 Merge "remove tools/vpx-style.sh" 2016-08-06 01:23:13 +00:00
James Zern
19d2e73dea Merge changes Ice037acb,I806af11b,I344a7dd0,Ib7cb87fa
* changes:
  vp9: normalize vpx_enc_frame_flags_t usage
  args.c: add some explicit casts
  webmdec: quiet -Wshorten-64-to-32 warning
  test/decode_test_driver: rm unused deadline member
2016-08-06 01:20:52 +00:00
Linfeng Zhang
ba42ce64b7 Fix a bug in test/lpf_8_test.cc
This bug was introduced in 36608af524,
where buffer tmp_s is not fully initialized.

Change-Id: I125b966cf054a82bc63c72647cdd463f434eda17
2016-08-05 17:52:10 -07:00
Yunqing Wang
2fb826c4d5 Fix a motion vector out of range bug
This patch fixed a motion vector (MV) out-of-range bug, which was caused
by not restoring the original values of the MV min/max thresholds after
the sub8x8 full-pixel motion search. It occurred rarely and was only seen
while encoding a 4k clip for 200 frames.

BUG=webm:1271

Change-Id: Ibc4e0de80846f297431923cef8a0c80fe8dcc6a5
2016-08-05 15:23:05 -07:00
James Zern
7104833085 vp9: normalize vpx_enc_frame_flags_t usage
quiets -Wshorten-64-to-32 warnings

Change-Id: Ice037acb675d1d81bfedf2dfcfa91a8a29a19dfd
2016-08-04 23:37:49 -07:00
James Zern
d772d55704 args.c: add some explicit casts
values are range checked before returning; quiets -Wshorten-64-to-32
warnings

Change-Id: I806af11b2aaf6760c7ab234a2fe2fdf40e7bece7
2016-08-04 23:37:49 -07:00
James Zern
c79665d0ad webmdec: quiet -Wshorten-64-to-32 warning
track->GetNumber() will fit in an int in well-behaved files

Change-Id: I344a7dd05d04daf3df2d67358ea69f8014a03a5b
2016-08-04 23:37:49 -07:00
James Zern
1b1e40c0b2 test/decode_test_driver: rm unused deadline member
has the side-effect of removing some lint and -Wshorten-64-to-32
warnings

Change-Id: Ib7cb87fa65cd65534096921f243d15288e97256d
2016-08-04 23:36:53 -07:00
James Zern
958ae5af9c remove tools/vpx-style.sh
update ftfy.sh to use clang-format

Change-Id: I8ac740c5b3842beed2b8878fbe506f381f4c57e4
2016-08-04 20:17:09 -07:00
Johann Koenig
57f49db81f Merge changes I6ef79702,Id332c641,I354b5d22,I84438013
* changes:
  Use common transpose for vpx_idct32x32_1024_add_neon
  Use common transpose for vpx_idct8x8_[12|64]_add_neon
  Use common transpose for vp9_iht8x8_add_neon
  Use common transpose for vpx_idct16x16_[10|256]_add_neon
2016-08-04 22:30:47 +00:00
Johann Koenig
17720b60bb Merge "Remove armv6 target" 2016-08-04 22:21:13 +00:00
James Zern
7f7c888c14 Merge "correct break placement" 2016-08-04 22:19:30 +00:00
Johann
0325b95938 Use common transpose for vpx_idct32x32_1024_add_neon
Change-Id: I6ef7970206d588761ebe80005aecd35365ec50ff
2016-08-04 20:13:18 +00:00
Johann
f4e4ce7549 Use common transpose for vpx_idct8x8_[12|64]_add_neon
Change-Id: Id332c641f05336ef9a45e17493ff149fd0a168f0
2016-08-04 20:13:12 +00:00
Johann
7103b5307d Use common transpose for vp9_iht8x8_add_neon
Change-Id: I354b5d22130d76b0eceda0748db1f871f58fa372
2016-08-04 20:13:03 +00:00
Johann
8619203ddc Use common transpose for vpx_idct16x16_[10|256]_add_neon
Change-Id: I84438013f483e82084d33ba9a63c33273d35fcaa
2016-08-04 20:12:53 +00:00
Johann Koenig
b757d89ff9 Merge "Extract neon transpose for re-use" 2016-08-04 20:12:38 +00:00
James Zern
4db9bd324d Merge "vp9_ratectrl.c: apply clang-format" 2016-08-04 20:01:46 +00:00
James Zern
70a7885a65 correct break placement
these should be placed within {}s when present

Change-Id: Ia775fac5373603e77360398f19b07958fb43f476
2016-08-04 13:00:14 -07:00
Johann Koenig
caac87b05b Merge "Don't expand to Q register for 4x4 intrapred" 2016-08-04 19:55:50 +00:00
Johann
d55724fae9 Remove armv6 target
Change-Id: I1fa81cc9cabf362a185fc3a53f1e58de533a41e5
2016-08-04 12:55:06 -07:00
Johann Koenig
476e8fc855 Merge "Pad 'Left' when building under ASan" 2016-08-04 19:27:45 +00:00
Linfeng Zhang
36608af524 Merge "Update Loop8Test{6,9}Param to test filter8() in mb_lpf_vertical_edge_w()" 2016-08-04 19:21:22 +00:00
Johann
377cfa31f0 Extract neon transpose for re-use
Change-Id: I5e1c7f4c80d1c6f7fd582ac468c6eaaa3603a06c
2016-08-04 19:04:25 +00:00
James Zern
374f0ff4a0 Merge changes from topic 'clang-format'
* changes:
  README: add a note about clang-format
  README: update target list
  README: fix typo
2016-08-04 19:03:03 +00:00
clang-format
3a4002b94d vp9_ratectrl.c: apply clang-format
after:
ff0a87c vp9 1pass vbr: Adjustment to gf interval.

Change-Id: I1296e53e601bf0c2b562e3a34082ac45c294a5f1
2016-08-04 11:57:00 -07:00
Johann
df69c751a7 Don't expand to Q register for 4x4 intrapred
The code was expanding to Q registers so that vqrshn could be used, for
vector quad round shift and narrow. If 4 values are added together,
there is a shift by 2. If 8 values, a shift by 3. Since this accounts
for any possibility of overflow, we can skip the narrowing shift.

This allows keeping the values in D registers and casting the 16 bit
value to 8 bits.

Change-Id: I8d9cfa07176271f492c116ffa6a7b351af0b8751
2016-08-04 18:51:46 +00:00
Linfeng Zhang
bbf4c91f79 Update Loop8Test{6,9}Param to test filter8() in mb_lpf_vertical_edge_w()
One branch of filter8() in mb_lpf_vertical_edge_w() was not tested.

Change-Id: I194202d771d9acd6b4e5e600ee2bae89986b49f3
2016-08-04 11:33:14 -07:00
Marco Paniconi
9fdeeaf411 Merge "vp9 1pass vbr: Adjustment to gf interval." 2016-08-04 17:50:55 +00:00
Yaowu Xu
7a79fa1362 Fix msvc compiler warnings
MSVC 2013 complained about using a 32-bit shift where a 64-bit shift
should be used.

Change-Id: I7a2b165d1a92d3c0a91dd4511b27aba7709b5e55
2016-08-03 18:33:06 -07:00
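
An illustration of the warning class being fixed (MSVC C4334: the result of a 32-bit shift is implicitly converted to 64 bits):

  #include <stdint.h>

  static int64_t bit(int n) {
    /* int64_t b = 1 << n;     shifts in 32 bits, then widens: C4334 */
    return (int64_t)1 << n; /* widen first, then shift in 64 bits */
  }
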
James Zern
b51d127c82 Merge "Resolve -Wshorten-64-to-32 warnings in prob.h." 2016-08-04 00:38:08 +00:00
James Zern
15f29ef092 README: add a note about clang-format
Change-Id: I835401e3befffcbc68e7d2bdd2fd556a19948e91
2016-08-03 17:34:03 -07:00
James Zern
77f5c3d2e8 README: update target list
Change-Id: I80293720a5f12bc2449ceaadbb2ad0f924141552
2016-08-03 17:30:45 -07:00
James Zern
5ea8712b82 README: fix typo
Change-Id: I2c3ecc62b1fd1e600b3d70b623c8b11e1e8e4d13
2016-08-03 17:30:45 -07:00
James Zern
068281751c Merge "test: apply clang-format" 2016-08-04 00:27:59 +00:00
James Zern
a412c004e4 Merge "vp9/decoder,vp9/*.[hc]: apply clang-format" 2016-08-04 00:22:59 +00:00
Johann
a7a8e07a44 Pad 'Left' when building under ASan
The neon intrinsics are not able to load just the 4 values that are
used. In vpx_dsp/arm/intrapred_neon.c:dc_4x4 it loads 8 values for both
the 'above' and 'left' computations, but only uses the sum of the first
4 values.

BUG=webm:1268

Change-Id: I937113d7e3a21e25bebde3593de0446bf6b0115a
2016-08-03 16:38:51 -07:00
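
A sketch of the load pattern in question (illustrative, not the literal dc_4x4 code); the 8-byte NEON load is why a 4-entry 'Left' buffer trips ASan:

  #include <arm_neon.h>
  #include <stdint.h>

  static uint32_t sum_first_4(const uint8_t *p) { /* needs 8 readable bytes */
    const uint8x8_t v = vld1_u8(p);       /* always reads 8 bytes */
    const uint16x4_t s2 = vpaddl_u8(v);   /* 4 pairwise 16-bit sums */
    const uint32x2_t s4 = vpaddl_u16(s2); /* lane 0 = p[0]+p[1]+p[2]+p[3] */
    return vget_lane_u32(s4, 0);          /* only the first 4 values used */
  }
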
Marco
ff0a87ce38 vp9 1pass vbr: Adjustment to gf interval.
Increase the minimum distance.
Reduces the overshoot somewhat on some clips,
small gain in avgPSNR (~0.1%) on ytlive set.

Change-Id: Id5ddde20c2907dbdb536e79542eff775019c142b
2016-08-03 15:36:27 -07:00
clang-format
08131055e4 vp9/decoder,vp9/*.[hc]: apply clang-format
Change-Id: Ic38ea06c7b2fb3e8e94a4c0910e82672a1acaea7
2016-08-03 14:29:31 -07:00
Yaowu Xu
85e111b3ba Merge "vp9 svc: Fix a valgrind error." 2016-08-03 20:53:05 +00:00
clang-format
8ff40f8bec vp9/common: apply clang-format
Change-Id: Ie0f150fdcfcbf7c4db52d3a08bc8238ed1c72e3b
2016-08-02 18:27:07 -07:00
clang-format
e0cc52db3f vp9/encoder: apply clang-format
Change-Id: I45d9fb4013f50766b24363a86365e8063e8954c2
2016-08-02 16:47:11 -07:00
JackyChen
f7032713af vp9 svc: Fix a valgrind error.
This error was introduced by the patch:
8ce67d7 vp9 svc: Enable different speed setting for each spatial layer.
To use svc, svc_param should be cleared to 0 at the beginning.

Change-Id: I222f03ddae8a50e84b4690b78263abb742fae91e
2016-08-02 16:16:22 -07:00
Alex Converse
d089ac4dda Resolve -Wshorten-64-to-32 warnings in prob.h.
Change-Id: I1244ee908d81467f0fc8a8fce979fc8077a325b4
2016-08-02 15:40:23 -07:00
Alex Converse
3a04c9c9c4 Merge "Resolve -Wshorten-64-to-32 in variance." 2016-08-02 22:26:55 +00:00
Yaowu Xu
039f9e08f0 change HBD pixel value from uint8_t to uint16_t
This fixes a regression in 10/12 bit encoding results.

Change-Id: I438877352a41aae0a864a8d9979afe4aa2061d81
2016-08-02 11:01:39 -07:00
Yaowu Xu
dc5618f3bb Add pointer conversion for HBD buffers
This fixes a crash in HBD build.

Change-Id: I7f688f50227323e69bba65df0d56f4360f01771b
2016-08-01 15:56:43 -07:00
Alex Converse
004eebed31 Merge "Unfork 8-bit in HBD path in vp9_model_rd_from_var_lapndz callers." 2016-08-01 16:42:39 +00:00
Alex Converse
2c3807b89f Merge "Cache optimizations in optimize_b()." 2016-08-01 16:30:05 +00:00
Alex Converse
e446ffda45 Cache optimizations in optimize_b().
Move best index into the token state. Shrink it down to one byte. This
is more cache friendly (access are group together) and uses less total
memory.

Results in 4% fewer cycles in optimize_b().

Change-Id: I75db484fb3dc82f59928d54b659d79c80ee40452
2016-07-29 12:06:49 -07:00
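
A sketch of the layout change (field names are illustrative): the one-byte best index moves into the token state it describes, so both are fetched together instead of through a separate parallel array.

  #include <stdint.h>

  typedef struct {
    int64_t error;
    int rate;
    int16_t next;
    uint8_t token;
    uint8_t best_index; /* was a parallel array of ints */
  } TokenState;
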
Johann Koenig
d4ab234869 Merge "replace by VSTM/VLDM to reduce one of VST1/VLD1" 2016-07-29 14:25:10 +00:00
Min Chen
407c2e2974 replace by VSTM/VLDM to reduce one of VST1/VLD1
Change-Id: I596567570580babb1a52925541d1fd1045c352f5
2016-07-28 23:01:38 +00:00
JackyChen
6fbb4c3061 vp8: Switch skin model to mode 0 to save some cycle.
This change will speed up the vp8 encoder by 1.5% ~ 2% on Linux. Not
much speed change on Mac.

Change-Id: Id957f19ddd89805baa2af84c5027d52d9a48553f
2016-07-28 13:32:50 -07:00
Jacky Chen
462a7c9f0a Merge "vp9 svc: Enable different speed setting for each spatial layer." 2016-07-28 20:21:30 +00:00
Alex Converse
c0241664aa Resolve -Wshorten-64-to-32 in variance.
The subtrahend is small enough to fit into uint32_t.

Change-Id: Ic4d7128aaa665eaf6b25d562610ba8942c46137f
2016-07-28 10:16:31 -07:00
Alex Converse
4508eb3123 Merge "Fix 64 to 32 narrowing warning." 2016-07-28 16:36:46 +00:00
clang-format
956af1d478 vpx_dsp/x86/quantize_sse2.c: apply clang-format
post:
e429080 .clang-format: disable DerivePointerAlignment

Change-Id: I21a0546668edb2b09660e216d4875a1d2ad24d53
2016-07-27 21:41:18 -07:00
James Zern
6b374abc86 Merge "vp9 denoiser: Dereferencing pointer should be after null check." 2016-07-28 00:43:19 +00:00
Alex Converse
335cf67d8b Fix 64 to 32 narrowing warning.
- Solves potential integer overflow on 12-bit
- Fixes Visual Studio build

Change-Id: I26dd660451bbab23040e4123920d59e82585795c
2016-07-27 12:40:23 -07:00
James Zern
341919d038 Merge "vpx_scale: apply clang-format" 2016-07-27 01:59:21 +00:00
clang-format
33e40cb5db test: apply clang-format
Change-Id: I0d9ab85855eb723f653a7bb09b3d0d31dd6cfd2f
2016-07-27 01:58:52 +00:00
JackyChen
47cc64cdf8 vp9 denoiser: Dereferencing pointer should be after null check.
BUG=webm:1267

Change-Id: I899fc9e8d784c6eefcbe27945c619845adb7b6f0
2016-07-26 17:31:17 -07:00
James Zern
e4290800b2 .clang-format: disable DerivePointerAlignment
everything outside of third_party should follow 'PointerAlignment:
right' i.e., associate the '*' with the variable

+ add a note about the clang-format that generated this file

Change-Id: I13e3f4f5fb6e22a8fa7fc3d06879c995b7c41a39
2016-07-26 16:46:54 -07:00
clang-format
f4be884466 vpx_scale: apply clang-format
Change-Id: Ia07ba57756f75911d3d06318e1f9b1982e1ca8c5
2016-07-26 15:57:41 -07:00
James Zern
fbf256da41 Merge "vpx_ports: apply clang-format" 2016-07-26 22:54:31 +00:00
Alex Converse
34201e50c1 Unfork 8-bit in HBD path in vp9_model_rd_from_var_lapndz callers.
BUG=b/29583530

Change-Id: Ia88a75f9572e08f228559ab84b8a77efb5aff0af
2016-07-26 21:57:58 +00:00
James Zern
1a3c4f91f6 Merge "vpx_mem: apply clang-format" 2016-07-26 21:19:17 +00:00
James Zern
9f9a8d2aaa Merge "vpx_util: apply clang-format" 2016-07-26 21:18:25 +00:00
Alex Converse
1c85230344 Merge "Only consider visible 4x4s in pixel domain error." 2016-07-26 19:39:54 +00:00
James Zern
7987686397 Merge "register_state_check: simplify Check() methods" 2016-07-26 18:49:18 +00:00
clang-format
6565c17f24 vpx_util: apply clang-format
Change-Id: Ie7eab608e2906b9a2b3533db95292ebc430ad377
2016-07-25 22:33:21 -07:00
James Zern
f8c27d164c register_state_check: simplify Check() methods
- make Check() void as the EXPECT's are sufficient to document failure

cumulatively this has the effect of avoiding reporting incorrect Check()
failures due to earlier test failures.

Change-Id: I2cf775449f18c90c1506b8eadd7067adbc3ea046
2016-07-25 15:14:02 -07:00
jackychen
8ce67d714a vp9 svc: Enable different speed setting for each spatial layer.
This change only affects 1 pass cbr svc mode.

Change-Id: If0da87bb200f7e7762755340c40c8157cc7a16ca
2016-07-25 15:11:43 -07:00
Alex Converse
d6c5ef4557 Only consider visible 4x4s in pixel domain error.
BDRATE change
derf144: -0.327
lowres: -0.048
midres: -0.125
hdres: -0.238

Change-Id: I789aba9870b5c2952373a7dd4fc8ed45590c3c54
2016-07-25 21:44:06 +00:00
clang-format
580f14b68b vpx_ports: apply clang-format
Change-Id: Ice343335a40238fd21490bce0ce2972bdcb87055
2016-07-25 14:29:06 -07:00
clang-format
d7a3b781d3 vpx_mem: apply clang-format
Change-Id: I0440686fc03f1ee02bd0168c91e671a0a2d0056a
2016-07-25 14:17:59 -07:00
clang-format
099bd7f07e vpx_dsp: apply clang-format
Change-Id: I3ea3e77364879928bd916f2b0a7838073ade5975
2016-07-25 14:14:19 -07:00
James Zern
82070ae939 Merge "configure: test for -Wfloat-conversion" 2016-07-25 19:47:38 +00:00
Johann Koenig
5c0f5cdda8 Merge "Fix compilation error under Clang 4.0." 2016-07-25 19:44:05 +00:00
Ivan Krasin
91369fd9b7 Fix compilation error under Clang 4.0.
The LLVM trunk has reached 4.0, and now __clang_major__ is not enough
to distinguish between the old Xcode Clang and the new 'real' Clang.
Using __apple_build_version__ makes this distinction possible.

BUG=chromium:631144

Change-Id: I0b6e46fddfe4f409c7b7e558bda34872e60ee2d9
2016-07-25 19:18:49 +00:00
James Zern
889ed5b158 configure: test for -Wfloat-conversion
supported by clang, gcc-4.9+

Change-Id: I893766de7307fef9a8b68c0cfae137c9d3b0dbe8
2016-07-25 19:05:15 +00:00
James Zern
7aa0c748b3 Merge "vp9: fix frame-level threaded decode shutdown" 2016-07-25 19:00:37 +00:00
Alex Converse
511bf49b7e Merge "Minor skip segment simplification." 2016-07-25 17:50:43 +00:00
Scott LaVarnway
ad5fea03e6 Merge "VP9: get_pred_context_switchable_interp() -- encoder side" 2016-07-25 11:58:24 +00:00
James Zern
54b2071bf4 vp8/decodeframe: fix signed/unsigned comparison
quiets a visual studio warning

Change-Id: Ic7725616bc2cb837e6f79294d4fcff36b67af834
2016-07-23 11:41:52 -07:00
James Zern
f368f86df6 vp9: fix frame-level threaded decode shutdown
Shutdown all threads before reclaiming any memory. The frame-level
parallel decoder may access data from another worker.

BUG=webm:1259

Change-Id: I26856ebd1f77cc4a4545331baa19bbf3e01c4ea4
2016-07-23 10:59:15 -07:00
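
A sketch of the shutdown-order rule behind the fix (worker names are illustrative, not the actual vpx_thread interface):

  typedef struct Worker Worker;
  void worker_join(Worker *w); /* blocks until the thread exits */
  void worker_free(Worker *w); /* reclaims buffers the thread used */

  static void shutdown_all(Worker *workers, int n) {
    int i;
    for (i = 0; i < n; ++i) worker_join(&workers[i]); /* stop everyone first */
    for (i = 0; i < n; ++i) worker_free(&workers[i]); /* only then reclaim */
  }
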
clang-format
c42d54c3a3 vp8/postproc.c: disable clang-format for RGB_TO_YUV
Change-Id: Id2a936301ec1e3d5648b4f8adbf4e6625002589d
2016-07-23 10:55:44 -07:00
James Zern
18e53642b7 Merge "vpx/: apply clang-format" 2016-07-23 01:27:14 +00:00
James Zern
5cf8eda308 Merge changes I0089e884,Icb0ecb9e
* changes:
  vp8/postproc: fix implicit float conversion
  blockiness_test: fix implicit float conversion
2016-07-23 01:18:32 +00:00
James Zern
256a4af4d1 Merge "resize_test: fix implicit float->int conversion" 2016-07-23 01:17:49 +00:00
James Bankoski
7381e3330f Merge "vp8: fix threading issues" 2016-07-23 00:51:37 +00:00
Jim Bankoski
0fff2fb34c vp8: fix threading issues
1 - stops deallocating before threads are closed.
2 - limits threads to mb_rows when mb_rows < partitions

BUG=webm:851

Change-Id: I7ead53e80cc0f8c2e4c1c53506eff8431de2a37e
2016-07-23 00:50:55 +00:00
James Zern
b2542417cd vp8/postproc: fix implicit float conversion
float->int as reported by -Wfloat-conversion

Change-Id: I0089e8847b218c47526bcfbb0fffd9aad7c5adb3
2016-07-22 16:01:52 -07:00
Yury Gitman
3d3f51262c Add VPX_SWAP macro
Change-Id: I60e233eddef238ad918183392794084673f27d2d
2016-07-22 15:41:25 -07:00
James Zern
5e2791b54d blockiness_test: fix implicit float conversion
float->int as reported by -Wfloat-conversion

Change-Id: Icb0ecb9e2d54edb95813d9f2de34cb6c27b63cbd
2016-07-22 15:35:42 -07:00
Alex Converse
9a62ecbd35 Minor skip segment simplification.
Change-Id: I34863fce1abe94f9539e9a5a6149ae1efb6501bd
2016-07-22 15:31:18 -07:00
Marco Paniconi
53db633349 Merge "vp9 1pass-vbr: Adjust gf setting for nonzero-lag case." 2016-07-22 21:27:05 +00:00
James Zern
325bdddc38 resize_test: fix implicit float->int conversion
Change-Id: I1efc16fa158740a06da719a1ea90c6dd6a182bb4
2016-07-22 13:11:07 -07:00
Marco
c06a4b9df2 vp9 1pass-vbr: Adjust gf setting for nonzero-lag case.
Change-Id: I230c586c6d5ae56ee9a6d37b7d9452351bb4bd80
2016-07-22 11:48:09 -07:00
Paul Wilkins
830fa866a5 Merge "Sample points to reduce encode overhead." 2016-07-22 09:27:34 +00:00
Paul Wilkins
063e4a2914 Merge "Noise energy Experiment in first pass." 2016-07-22 09:27:19 +00:00
clang-format
e3e9fee419 vpx/: apply clang-format
Change-Id: I95922a64568bf289863c1564212b6be5beec36df
2016-07-21 20:49:07 -07:00
Yunqing Wang
4b073bc39a Add back header in threading.h
Added back the header needed in threading.h

Change-Id: I2ce66ad4fe58004997623f6c3f3b8dd11640aa98
2016-07-21 17:26:05 -07:00
Yunqing Wang
930773a1ed Merge "Revert "Amend and improve VP8 multithreading implementation"" 2016-07-21 21:32:55 +00:00
Yunqing Wang
87c6c5224d Revert "Amend and improve VP8 multithreading implementation"
Reverted the patch because of a possible performance issue.

Change-Id: I49944f827ccd38ed194c9f8d9cb9036fa9bf79e1
2016-07-21 12:28:25 -07:00
Scott LaVarnway
c969b2b02b VP9: get_pred_context_switchable_interp() -- encoder side
Change-Id: I7217c90d5cf38c51b76759a2dc4f10070f3a40ac
2016-07-21 11:47:51 -07:00
Alex Converse
18c7f46c12 MinArfFreqTest: Don't leak video on failure.
Change-Id: I250379f0ac8d4929c9032e7343290e2980fc2e77
2016-07-21 11:40:51 -07:00
Alex Converse
92e91bd3a1 Make test encoder test driver less likely to leak on failure.
Individual tests still need to be updated.

Change-Id: Ic433d0f742e13560b136f136b72b2a9973970d78
2016-07-21 11:39:47 -07:00
James Zern
16e069b8bb Merge changes from topic 'clang-tidy'
* changes:
  vp8/onyx_if.c: rework #if's to avoid dangling else's
  vp8/bitstream.c: rework #if to avoid dangling else
2016-07-21 03:05:20 +00:00
Johann
a6cc74b987 Merge remote-tracking branch 'origin/khakicampbell' 2016-07-20 18:49:04 -07:00
Johann
042572177b Release v1.6.0 Khaki Campbell Duck
Change-Id: I08da365dd889093f9919476a02ee96ae9615f140
2016-07-20 18:15:41 -07:00
jackychen
71f9cbcfc8 vp9: Fix the clang warning of unsigned int type.
Change-Id: I6308db16bd626fa5943925471e9171f567669350
2016-07-20 15:58:35 -07:00
Yaowu Xu
297b2a12d6 Fix encoder crashes for odd size input
(cherry picked from commit 98431cde07)

Change-Id: Id5c30c419282369cc8c3280d9a70b34a859a71d8
2016-07-20 15:02:13 -07:00
James Zern
b19f8b1607 vp8/onyx_if.c: rework #if's to avoid dangling else's
Change-Id: Ieda8958a3da1000424fcff91a1315d0049612202
2016-07-20 12:36:09 -07:00
James Zern
77a31eb3c5 vp8/bitstream.c: rework #if to avoid dangling else
Change-Id: I9178ae75876f3df3fa3271314db39830552b9549
2016-07-20 12:36:09 -07:00
James Zern
7bb35db872 Merge changes from topic 'clang-tidy'
* changes:
  vp8/{bitstream,rdopt},y4minput: correct break placement
  y4minput.c: correct empty loop formatting
  vp8: simplify a few #if's
  vp8: remove extra semicolons
2016-07-20 19:34:59 +00:00
Yaowu Xu
690fcd793b Change to call vp9_post_proc_frame()
This commit changes the call in vp9 encoder from vp9_deblock() to
vp9_post_proc_frame() to ensure the data structures used in the call
are properly allocated. This fixes an encoder crash when configured
with --enable-internal-stats.

Change-Id: I2393b336c0f566665336df4f1ba91c405eb56764
2016-07-20 11:01:49 -07:00
James Zern
fd85664ae6 vp8/{bitstream,rdopt},y4minput: correct break placement
these should be placed within {}s when present

Change-Id: If00e9766fa8cb039cc070467f353a468f99460fb
2016-07-19 20:51:25 -07:00
James Zern
1b048e966a y4minput.c: correct empty loop formatting
prefer {}s over ';'

Change-Id: I563fc82717e1deb4f42a40e03dca318c6adaa0c1
2016-07-19 20:46:39 -07:00
James Zern
b5164f55a0 vp8: simplify a few #if's
bitstream.c: asserts are disabled when CONFIG_DEBUG is unset
vp8_dx_iface.c: split |s into 2 statements across #if bounds

Change-Id: I307d1e969134db5c9c0edd7690589b6b29116cbd
2016-07-19 20:45:28 -07:00
James Zern
96797e43b4 vp8: remove extra semicolons
Change-Id: I84e1a293ee033865f82c244e8aaaadfb2fb27e63
2016-07-19 20:44:14 -07:00
clang-format
033dab9ca0 top-level: apply clang-format
Change-Id: Ibd5395bf8956a80f7c0df4d539c7a42c927a1fc7
2016-07-19 14:34:19 -07:00
James Zern
6e336f6e5f Merge "vp8: apply clang-tidy google-readability-braces-around-statements" 2016-07-19 21:24:30 +00:00
clang-tidy
7f3e07f1c8 vp8: apply clang-tidy google-readability-braces-around-statements
applied against an x86_64 configure

clang-tidy-3.7.1 \
  -checks='-*,google-readability-braces-around-statements' \
  -header-filter='.*' -fix
+ clang-format afterward

Change-Id: I6694edeaee89b58b8b3082187e6756561136b459
2016-07-19 12:38:03 -07:00
James Zern
5b55bcc564 Merge "examples: apply clang-format" 2016-07-19 19:21:08 +00:00
James Zern
e3f7991f99 Merge changes Ia6004c08,I1954f9d6
* changes:
  cosmetics: Add a few explanatory comments
  cosmetics: Correct grammar/spelling in comments
2016-07-19 19:12:23 +00:00
Yury Gitman
e4ac882007 cosmetics: Add a few explanatory comments
Change-Id: Ia6004c08e6f5fd269a1bbd4df51ce9b76345150d
2016-07-19 10:39:00 -07:00
Marco Paniconi
e90d4f0a03 Merge "vp9: Allow usage of lookahead for real-time, 1 pass vbr." 2016-07-19 17:17:16 +00:00
Johann Koenig
451211cb01 Merge "Change 'git cl upload' default to --no-squash" 2016-07-19 16:37:42 +00:00
James Zern
ea3d324f13 Merge changes I18982dbf,I15c8976c
* changes:
  build/make/Makefile: add a 'test_*' default target
  build/make/Makefile: remove default suffix rules
2016-07-19 06:09:36 +00:00
Pascal Massimino
7e4740156f Merge "take II: variance_test partial clean-up" 2016-07-19 03:52:55 +00:00
clang-format
ef45540927 examples: apply clang-format
Change-Id: Icc3bbb07c99a31a70030baec7e51b881902a7b5e
2016-07-18 19:04:56 -07:00
James Bankoski
c69cc4ce1f Merge "configure: turn on all unused warnings by default" 2016-07-19 00:57:46 +00:00
James Zern
25085a6ac2 build/make/Makefile: add a 'test_*' default target
allows 'make test_libvpx', etc. some reworking of the makefiles would be
needed to avoid hard coding targets here.

Change-Id: I18982dbf691e7d36ab8bcf5934bab9340687b061
2016-07-18 16:30:58 -07:00
James Zern
23d0f73838 build/make/Makefile: remove default suffix rules
Change-Id: I15c8976c6478bf75ec617398f49461b310ab7569
2016-07-18 16:30:40 -07:00
skal
7d72ebaa5c take II: variance_test partial clean-up
remove some (but not all yet!) tuple mis-use, and revamp the code a lot.
Factorize some common chores into MainTestClass.

Change-Id: Id37b7330eebe80d19b9d12a454f24ff9be6b1116
2016-07-18 16:18:26 -07:00
Marco
05fe0f20a6 vp9: Allow usage of lookahead for real-time, 1 pass vbr.
Allow usage of lookahead for VBR in real-time mode, for 1 pass vbr.

Current usage is for fast checking of future scene cuts/changes,
and adjusting rate control (gf interval and active_worst/target size).

Added unittests (datarate) for 1 pass vbr mode, with non-zero lag.

Added an experimental option to limit QP based on lookahead.

Overall positive gain in metrics on ytlive set:
avgPSNR/SSIM up on average ~1-3%; several clips up by 5, 7%.

Change-Id: I960d57dfc89de121c4824b9a9bf88d2814e74b56
2016-07-18 15:20:17 -07:00
Johann
1afbd88e81 Change 'git cl upload' default to --no-squash
Chromium changed the upstream default to --squash but this conflicts
with libvpx historical defaults.

Change-Id: I80f2f2b48e2ba08e02184b50e6d5f8f5e76fec24
2016-07-18 14:15:24 -07:00
Yury Gitman
bdfdd7d993 cosmetics: Correct grammar/spelling in comments
Change-Id: I1954f9d6e33abff9081fe7a5cf59d5497768e0df
2016-07-18 12:49:00 -07:00
Jim Bankoski
3e04114f3d prepend ++ instead of post in for loops.
Applied the following regex  :
search for: (for.*\(.*;.*;) ([a-zA-Z_]*)\+\+\)
replace with: \1 ++\2)

This misses some for loops,
e.g.: for (mb_col = 0; mb_col < oci->mb_cols; mb_col++, mi++)

Change-Id: Icf5f6fb93cced0992e0bb71d2241780f7fb1f0a8
2016-07-18 06:54:50 -07:00
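
An example of what the regex rewrites and why it misses some loops: it only matches a "name++)" that directly closes the loop header (a hypothetical demo, not code from the tree):

  static int demo(const int *mi, int n) {
    int i, s = 0;
    for (i = 0; i < n; ++i)       /* was "i++)": matched and rewritten */
      s += mi[i];
    for (i = 0; i < n; i++, mi++) /* "++" not followed by ")": missed */
      s += *mi;
    return s;
  }
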
James Zern
106a8a1536 Merge "Revert "variance_test partial clean-up"" 2016-07-16 22:01:19 +00:00
James Zern
3d791194f8 vpx_plane_add_noise_c: normalize int types
quiets signed/unsigned mismatch warning

Change-Id: Iaabd7dfff110ba26056258457541f5635d2e85e6
2016-07-16 11:56:55 -07:00
James Zern
090aa88b5a Revert "variance_test partial clean-up"
This reverts commit f993ed5c86.

build warnings under msvc, segfaults with high-bitdepth enabled.

Change-Id: I67502651107830bcadb6ef56d8f2709cccbfdf2b
2016-07-16 11:55:07 -07:00
Pascal Massimino
f44db1487d Merge "sad_test: add some const to methods" 2016-07-16 04:50:45 +00:00
Pascal Massimino
8d681b36c7 Merge "vp9_error_block_test: simplify fn wrapper generation" 2016-07-16 04:41:46 +00:00
Pascal Massimino
5319e83843 sad_test: add some const to methods
Change-Id: I6f2481509b0aa94338ed6185f80c4a6b65532280
2016-07-15 21:07:00 -07:00
Pascal Massimino
22c36dd464 Merge "remove tuple from 'sad_test.cc'" 2016-07-16 03:56:10 +00:00
James Zern
037a50ed36 vp9_error_block_test: simplify fn wrapper generation
Change-Id: I1f1d396b9456e52e863c4c75f23c3d17420668b4
2016-07-15 20:27:45 -07:00
skal
3fc29ae3ee remove tuple from 'sad_test.cc'
+ general clean-up

Change-Id: Ib9dca3d1a3b7f0c1bedef2a26c9ff5ae1c289e8a
2016-07-15 19:52:56 -07:00
clang-format
81a6739533 vp8: apply clang-format
Change-Id: I7605b6678014a5426ceb45c27b54885e0c4e06ed
2016-07-15 19:28:44 -07:00
James Zern
65daa41378 add .clang-format, based on Google style
derived from clang-format 3.7.1; same as used in libaom

Change-Id: I8ea915a41d1f2ea3b0d4e4dab9ebc808e9116f11
2016-07-15 19:26:24 -07:00
James Bankoski
ce6678fdc9 Merge "addnoise : clear out static size for generated noise" 2016-07-16 01:48:07 +00:00
Jim Bankoski
cb957c302a addnoise : clear out static size for generated noise
Change-Id: I5d4343f2da9cd4b01dd37be7a048d159fec109d1
2016-07-15 15:52:45 -07:00
Jim Bankoski
0dfede2e79 configure: turn on all unused warnings by default
Change-Id: I7f6cb446cd3ac57ac39835cf065d9501a66acd5b
2016-07-15 15:26:20 -07:00
Jim Bankoski
da1bda0fb2 vp9_postproc.c : unused variable if not vp9_highbitdepth.
Change-Id: Ib89b128f23767934c40b5add3fcf9dbd875e82f9
2016-07-15 15:04:57 -07:00
James Bankoski
302e425453 Merge "postproc : fix function parameters for noise functions." 2016-07-15 17:33:53 +00:00
Jim Bankoski
0dc69c70f7 postproc : fix function parameters for noise functions.
Change-Id: I582b6307f28bfc987dcf8910379a52c6f679173c
2016-07-15 08:27:34 -07:00
James Zern
60ada7edb4 Merge "variance_test partial clean-up" 2016-07-15 07:31:30 +00:00
skal
f993ed5c86 variance_test partial clean-up
remove some (but not all yet!) tuple mis-use, and revamp the code a lot.
Factorize some common chores into MainTestClass.

Change-Id: Ia14f3924140e8545e4f10d0504475681baae8336
2016-07-14 22:06:44 +00:00
James Zern
6d2b79e3a2 Merge "vp9_intrapred_test: follow-up cleanup" 2016-07-14 19:48:52 +00:00
James Zern
a07bb84215 gtest-all.cc: quiet an unused variable warning
under windows / mingw builds

Change-Id: I93f9a5df77cea0c28d4afb272abcde5a9732e355
2016-07-13 18:36:17 -07:00
skal
3386ca7496 vp9_intrapred_test: follow-up cleanup
address a few comments from ce050afaf3

Change-Id: I5d8fc9dab35c4ee5ec3671134c4eef4ec241e309
2016-07-14 00:40:55 +00:00
James Bankoski
7eec1f31b5 Merge "postproc: noise style fixes." 2016-07-13 22:04:47 +00:00
Hui Su
8dd3bef7ef Merge "Revert "Eliminate isolated and small tail coefficients:"" 2016-07-13 21:30:12 +00:00
Pascal Massimino
8c7751e1c2 Merge "clean-up vp9_intrapred_test" 2016-07-13 21:26:15 +00:00
Yaowu Xu
d6197b621d Merge "Fix encoder crashes for odd size input" 2016-07-13 20:05:09 +00:00
Jim Bankoski
e736691a6d postproc: noise style fixes.
Change-Id: Ifdcb36b8e77b65faeeb10644256e175acb32275d
2016-07-13 12:39:01 -07:00
skal
ce050afaf3 clean-up vp9_intrapred_test
remove tuple and overkill VP9IntraPredBase class.

Change-Id: I85b85bdd33d7fe417895e75f77db219f713dfea3
2016-07-13 18:16:32 +00:00
hui su
248f6ad771 Revert "Eliminate isolated and small tail coefficients:"
This reverts commit ff19cdafdb.

Change-Id: I81f68870ca27a1ff683ee22090530b6997815fb2
2016-07-13 11:14:44 -07:00
Jingning Han
fed14a3e94 Merge "Disable trellis optimization when lossless is on" 2016-07-13 16:01:01 +00:00
James Bankoski
e93f2fdb83 Merge "postproc - move filling of noise buffer to vpx_dsp." 2016-07-13 15:31:17 +00:00
Jim Bankoski
2ca24b0075 postproc - move filling of noise buffer to vpx_dsp.
Change-Id: I63ba35dc0ae9286c9812367a531e01d79a4c1635
2016-07-13 07:35:25 -07:00
Jim Bankoski
b24373fec2 deblock: missing const on extern const.
Change-Id: I0df08f7c431daf939e266f008bf5158b0c97358b
2016-07-13 07:27:29 -07:00
Jim Bankoski
6f424a768e vp9_postproc.c missing extern.
BUG=webm:1256

Change-Id: I5271e71bc53cce033fb906040643dcdd5ccb2381
2016-07-12 17:47:49 -07:00
James Zern
6a3ff0b617 Merge changes from topic 'webp-thread-update'
* changes:
  vpx_thread: use CreateThread for windows phone
  vpx_thread: use WaitForSingleObjectEx if available
  vpx_thread: use InitializeCriticalSectionEx if available
  vpx_thread: use native windows cond var if available
  vpx_thread.[hc]: update webp source reference
2016-07-13 00:08:05 +00:00
Yaowu Xu
98431cde07 Fix encoder crashes for odd size input
Change-Id: Id5c30c419282369cc8c3280d9a70b34a859a71d8
2016-07-12 11:11:26 -07:00
Jacky Chen
19c157afe2 Merge "vp9 svc: Reuse scaled_temp in two stage downscaling." 2016-07-12 17:59:09 +00:00
JackyChen
110a2ddc9b vp9 svc: Reuse scaled_temp in two stage downscaling.
This change eliminates redundant computation in the two stage
downscaling, which saves ~1% encoding time in 3-layer svc encoding.

Change-Id: Ib4b218811b68499a740af1f9b7b5a5445e28d671
2016-07-12 10:09:55 -07:00
Jingning Han
efccbc9fb5 Disable trellis optimization when lossless is on
Disable trellis coefficient optimization when the lossless mode
is turned on.

Change-Id: I9001bf626e86dc3c8c32331ede04fd39036e5f7c
2016-07-12 09:00:16 -07:00
Jim Bankoski
88e6951465 deblock filter : moved from vp8 code branch
The deblocking filters used in vp8 have been moved to vpx_dsp for
use by both vp8 and vp9.

Change-Id: I5209d76edafc894b550f751fc76d3aa6799b392d
2016-07-12 05:53:00 -07:00
James Zern
45ed7effed Merge "remove *debugmodes.c from the default build" 2016-07-11 23:49:29 +00:00
Scott LaVarnway
2e93fcf893 Merge "vp9_rd_pick_intra_mode_sb(): set interp_filter to" 2016-07-11 22:31:06 +00:00
paulwilkins
3a986eac57 Sample points to reduce encode overhead.
Only noise filter sampled points in first pass to reduce
any first pass speed overhead.

Change-Id: Ic80d4400e59146d1c3332336c4350faf28ff8b17
2016-07-11 11:45:52 +01:00
Scott LaVarnway
ed7786869a vp9_rd_pick_intra_mode_sb(): set interp_filter to
SWITCHABLE_FILTERS.  This is a partial fix for the build
issues with Change 357240.

Change-Id: I4e507c196175bae729a4f1397878ec8776b0146c
2016-07-09 09:47:34 -07:00
Yaowu Xu
5adb43b8be Fix non-highbitdepth coding path for HBD build
Change-Id: I38eb42b8d051924a7cd1ccc3421a4057cf6e170f
2016-07-08 11:26:34 -07:00
Marco Paniconi
20946cdd3b Merge "vp9: Adjustment of gfu_boost and af_ratio for 1 pass vbr." 2016-07-08 16:26:06 +00:00
Yaowu Xu
dc008cc17d Merge "Enable HBD support in real time encoding path" 2016-07-07 22:32:48 +00:00
Marco
cc431ad50a vp9: Adjustment of gfu_boost and af_ratio for 1 pass vbr.
Modify the gfu_boost and af_ratio setting based on the
average frame motion level.

Change only affects 1 pass vbr.

Metrics overall positive on ytlive set.
On average up by ~1%, several clips up by 2-4%.

Change-Id: Ic18c49eb2df74cb4986b63cdb11be36d86ab5e8d
2016-07-07 15:18:14 -07:00
Marco Paniconi
a75965fa94 Merge "vp9: Adjustment to mv bias for non-rd pickmode." 2016-07-07 21:07:37 +00:00
Jingning Han
2f28f9072e Enable coeff optimization for intra modes
This further improves the coding performance by
lowres 0.3%
midres 0.5%
hdres  0.6%

Change-Id: I6a03b6da210b9cbc261474bad4a103e0ba021c68
2016-07-07 12:25:41 -07:00
Jingning Han
44354ee7bf Use precise context to estimate coeff rate cost
Use the precise context to estimate the zero token cost in trellis
optimization process. This improves the speed 0 coding performance
by 0.15% for lowres and 0.1% for midres. It improves the speed 1
coding performance by 0.2% for midres and hdres.

Change-Id: I59c7c08702fc79dc4f8534b64ca594da909e2c91
2016-07-07 12:25:33 -07:00
Jingning Han
62aa642d71 Enable uniform quantization with trellis optimization in speed 0
This commit allows the inter prediction residual to use uniform
quantization followed by trellis coefficient optimization in
speed 0. It improves the coding performance by

lowres 0.79%
midres 1.07%
hdres  1.44%

Change-Id: I46ef8cfe042a4ccc7a0055515012cd6cbf5c9619
2016-07-07 12:25:33 -07:00
Jingning Han
541eb78994 Refactor coeff_cost() function
Move the operations that update the context buffers outside this
function. The coeff_cost() takes all input as const value and returns
the coefficient cost.

This makes preparation for the next coefficient optimization CLs.

Change-Id: I850eec6e5470b91ea84646ff26b9231b09f70a0c
2016-07-07 18:09:39 +00:00
Jingning Han
7c1fdf02cd Merge "Support measure distortion in the pixel domain" 2016-07-07 18:09:20 +00:00
Marco
f451b404ea vp9: Adjustment to mv bias for non-rd pickmode.
Replace the existing mv bias with a bias only for
NEWMV, and based on the motion vector difference of
its top/left neighbors.

For cbr non-screen-content mode.

Change-Id: I8a8cf56347cfa23e9ffd8ead69eec8746c8f9e09
2016-07-07 10:33:06 -07:00
Yunqing Wang
9976ff8c79 Merge "Fix Visual Studio build warning" 2016-07-07 16:48:49 +00:00
Yunqing Wang
9ef37860cd Fix Visual Studio build warning
Fixed signed/unsigned mismatch warning.

Change-Id: I1634d0634de752f4b8baa8059e8f3e2891fa53b6
2016-07-07 08:43:57 -07:00
paulwilkins
2580e7d63e Noise energy Experiment in first pass.
Use a measure of noise energy to adjust Q estimate and
arf filter strength.

Gains 0.3-0.5% on Lowres and Netflix sets.
Hdres and Midres neutral.

Change-Id: Ic0de552e7b6763e70eeeaa3651619831b423e151
2016-07-07 14:50:21 +01:00
Paul Wilkins
f037cf80c9 Merge "Add experimental spatial de-noise filter on key frames." 2016-07-07 13:30:07 +00:00
Jingning Han
e357b9efe0 Support measure distortion in the pixel domain
Use pixel domain distortion metric in speed 0. This improves the
compression performance by 0.3% for both low and high resolution
test sets.

Change-Id: I5b5b7115960de73f0b5e5d0c69db305e490e6f1d
2016-07-06 18:25:17 -07:00
Yaowu Xu
884c2ddc48 Enable HBD support in real time encoding path
BUG=webm:1223

Change-Id: If83a613784e3b2a33c9c93f9ad0ba39dd4d23056
2016-07-06 14:18:37 -07:00
Debargha Mukherjee
adbad6092f Merge "Remove decode asserts from better-hw-compatibility" 2016-07-06 20:55:29 +00:00
Jacky Chen
aa6108382e Merge "vp9: Choose the scheme for modeling rd for 32x32 based on skin color." 2016-07-06 20:03:55 +00:00
Debargha Mukherjee
4b6e4e1813 Remove decode asserts from better-hw-compatibility
Safer to have the decoder operate normally and have
better-hw-compatibility only implement encoding changes.
Fixes some test failures.

Change-Id: I0dd70d002e4e893992f0cd59774b9363e6f7fe76
2016-07-06 12:26:38 -07:00
Yunqing Wang
a921444fdb Merge "Modify the name of vp9cx_set_ref example" 2016-07-06 18:55:15 +00:00
JackyChen
2678aefc48 vp9: Choose the scheme for modeling rd for 32x32 based on skin color.
For real time CBR mode, use model_rd_for_sb_y for 32x32 if the sb is
a skin sb to avoid visual regression on the slowly moving face.

Refer to the cl: https://chromium-review.googlesource.com/#/c/356020/

Change-Id: I42c36666b2b474ce5ee274239d52ae8ab400fd46
2016-07-06 11:12:03 -07:00
Min Ye
ff19cdafdb Eliminate isolated and small tail coefficients:
Improve hdres PSNR by 0.696%
Improve midres PSNR by 0.313%
Improve lowres PSNR by 0.142%

Change-Id: Icabde78aa9689f539f6a03ec09f712c20758796c
2016-07-06 11:08:23 -07:00
Yunqing Wang
fba5c354ad Modify the name of vp9cx_set_ref example
Modified the name of vp9cx_set_ref example so that the test script
ran correctly.

Change-Id: I0ab2de66220b0a88b7af7ea1633a088ab78dd9ff
2016-07-06 10:05:51 -07:00
Jingning Han
51aad61c8c Merge "Remove txfrm_block_to_raster_xy() from vp9 encoder" 2016-07-06 16:00:18 +00:00
Yunqing Wang
825bb86044 Merge "Make set_reference control API work in VP9" 2016-07-06 00:08:52 +00:00
Jingning Han
14011f037d Remove txfrm_block_to_raster_xy() from vp9 encoder
The transform block row and column positions are always available
outside the callees. There is no need to re-compute these values
again. This approach has been used by the decoder. This commit
removes txfrm_block_to_raster_xy() function.

Change-Id: I5b90f91a0d8b7c35cfa7d171da9edf8202630108
2016-07-04 18:41:47 -07:00
James Zern
5afa3b9150 Merge "improve vpx_filter_block1d* based on replace paddsw+psrlw to pmulhrsw" 2016-07-02 03:08:33 +00:00
James Zern
3197172405 Merge "Update vpx subpixel 1d filter ssse3 asm" 2016-07-02 03:08:17 +00:00
James Zern
3007081a87 vpx_thread: use CreateThread for windows phone
BUG=b/29583578

original webp change:

commit d2afe974f9d751de144ef09d31255aea13b442c0
Author: James Zern <jzern@google.com>
Date:   Mon Nov 23 20:41:26 2015 -0800

    thread: use CreateThread for windows phone

    _beginthreadex is unavailable for winrt/uwp

    Change-Id: Ie7412a568278ac67f0047f1764e2521193d74d4d

100644 blob 93f7622797f05f6acc1126e8296c481d276e4047 src/utils/thread.c
100644 blob 840831185502d42a3246e4b7ff870121c8064791 src/utils/thread.h

Change-Id: Iade8fff6367b45534986c77ebe61abeb45bce0f8
2016-07-01 19:36:58 -07:00
James Zern
7954e67bb8 vpx_thread: use WaitForSingleObjectEx if available
BUG=b/29583578

original webp change:

commit 0fd0e12bfe83f16ce4f1c038b251ccbc13c62ac2
Author: James Zern <jzern@google.com>
Date:   Mon Nov 23 20:40:26 2015 -0800

    thread: use WaitForSingleObjectEx if available

    Windows XP and up

    Change-Id: Ie1a46a82722b8624437c8aba0aa4566a4b0b3f57

100644 blob d58f74e5523dbc985fc531cf5f0833f1e9157cf0 src/utils/thread.c
100644 blob 840831185502d42a3246e4b7ff870121c8064791 src/utils/thread.h

Change-Id: If165c38b378c6e0c55e17a1b071efd3ec3e7dcdd
2016-07-01 19:36:58 -07:00
James Zern
a48d42a804 vpx_thread: use InitializeCriticalSectionEx if available
BUG=b/29583578

original webp change:

commit 63fadc9ffacc77d4617526a50c696d21d558a70b
Author: James Zern <jzern@google.com>
Date:   Mon Nov 23 20:38:46 2015 -0800

    thread: use InitializeCriticalSectionEx if available

    Windows Vista / Server 2008 and up

    Change-Id: I32c5b4e5384d614c5a821ef511293ff014c67966

100644 blob f84207d89b3a6bb98bfe8f3fa55cad72dfd061ff src/utils/thread.c
100644 blob 840831185502d42a3246e4b7ff870121c8064791 src/utils/thread.h

Change-Id: I9ce49b3a86857267e504cd8ceab503b7b441d614
2016-07-01 19:36:58 -07:00
James Zern
8cc525e82b vpx_thread: use native windows cond var if available
BUG=b/29583578

original webp change:

commit 110ad5835ecd66995d0e7f66dca1b90dea595f5a
Author: James Zern <jzern@google.com>
Date:   Mon Nov 23 19:49:58 2015 -0800

    thread: use native windows cond var if available

    Vista / Server 2008 and up. no speed difference observed.

    Change-Id: Ice19704777cb679b290dc107a751a0f36dd0c0a9

100644 blob 4fc372b7bc6980a9ed3618c8cce5b67ed7b0f412 src/utils/thread.c
100644 blob 840831185502d42a3246e4b7ff870121c8064791 src/utils/thread.h

Change-Id: Iede7ae8a7184e4b17a4050b33956918fc84e15b5
2016-07-01 19:36:58 -07:00
James Zern
d4d6c58e37 vpx_thread.[hc]: update webp source reference
+ drop the blob hash, the updated reference will be updated in the
commit message

BUG=b/29583578

Change-Id: Ifabbe52a2f07ac29e1881f5c8a62d7f3eb3c2c04
2016-07-01 19:36:58 -07:00
James Zern
d2b40894be remove *debugmodes.c from the default build
these are debug-only modules that can be added in manually when needed.
leave a reference in vp8_common.mk / vp9_common.mk for easy addition.

quiets -Wmissing-prototypes warning

BUG=b/29584271

Change-Id: Ifc8637d877edfbd562b34dc5c540428bba7951fc
2016-07-01 19:10:04 -07:00
Yunqing Wang
0a075cb39c Make set_reference control API work in VP9
Moved the API patch from NextGenv2. An example was included.
To try it, for example, run the following command:
$ examples/vpx_cx_set_ref vp9 352 288 in.yuv out.ivf 4 30

Change-Id: I4cf8f23b86d7ebd85ffd2630dcfbd799c0b88101
2016-07-01 17:58:02 -07:00
James Zern
3ef9c0ba03 vp8/common/reconintra4x4.c: add missing include
quiets -Wmissing-prototypes warning

BUG=b/29584271

Change-Id: I806e3475ebee579dce0073dd1784a7c2899e7de0
2016-07-01 16:20:42 -07:00
James Bankoski
f5a15f270a Merge "Revert "libyuv: update to 2f101fdb"" 2016-07-01 19:14:53 +00:00
James Bankoski
c5372cf077 Revert "libyuv: update to 2f101fdb"
Compile failures on the Linux platform.

BUG=webm:1253

This reverts commit aa81375d73.

Change-Id: Ibab2c4827bc21518dc03c6e9716b5015cff56fc7
2016-07-01 19:14:28 +00:00
Johann Koenig
e616012d69 Merge changes I59a11921,I296a0b81,I397d7753
* changes:
  configure: remove x86inc.asm distinction
  test: remove x86inc.asm distinction
  vpx_dsp: remove x86inc.asm distinction
2016-07-01 18:13:41 +00:00
James Zern
fbbd3f0d8d Merge "convolve_test: fix byte offsets in hbd build" 2016-07-01 01:54:30 +00:00
Jacky Chen
ee78c541a4 Merge "vp9 postproc: Bug fix and code clean." 2016-06-30 21:59:44 +00:00
James Bankoski
892ebd9760 Merge "libyuv: update to 2f101fdb" 2016-06-30 19:11:30 +00:00
Johann
571f00cb95 configure: remove x86inc.asm distinction
BUG=b:29583530

Change-Id: I59a1192142e89a6a36b906f65a491a734e603617
2016-06-30 11:14:14 -07:00
Johann
0266e70c52 test: remove x86inc.asm distinction
BUG=b:29583530

Change-Id: I296a0b81755e3086bc0a40cb126d0200ff03c095
2016-06-30 11:14:10 -07:00
Johann Koenig
3c41d7358c Merge "vp9: remove x86inc.asm distinction" 2016-06-30 17:42:17 +00:00
Johann Koenig
89771f2c2c Merge "Require x86inc.asm" 2016-06-30 17:41:33 +00:00
Paul Wilkins
1d3f1983b2 Merge "Fix error in get_ul_intra_threshold() for 10/12 bit." 2016-06-30 16:26:14 +00:00
Paul Wilkins
f7c2d2a3de Merge "Fix error in get_smooth_intra_threshold() for 10/12 bit." 2016-06-30 16:25:55 +00:00
Jim Bankoski
aa81375d73 libyuv: update to 2f101fdb
Fixes color issue when scaling without breaking mingw.

BUG=https://bugs.chromium.org/p/libyuv/issues/detail?id=605
BUG=https://bugs.chromium.org/p/webm/issues/detail?id=1252

Change-Id: Ifba747feb0c6a08f2b353b820a24c6c145d440ad
2016-06-30 13:25:39 +00:00
paulwilkins
e25d6252a4 Fix error in get_ul_intra_threshold() for 10/12 bit.
The scaling of the threshold for 10 and 12 bit here appears
to be in the wrong direction. For 10 and 12 bit we expect sse
values to be higher and hence the threshold used should be
scaled up, not down.

Change-Id: I2678116652b539aef48100e0f22873edd4f5a786
2016-06-30 13:38:57 +01:00
paulwilkins
f9a3d08f1b Fix error in get_smooth_intra_threshold() for 10/12 bit.
This function seems to scale the threshold for testing an
SSE value in the wrong direction for 10 and 12 bit inputs.

Also for a true SSE the scalings should probably be << 4 and 8

Change-Id: Iba8047b3f70d04aa46d9688a824f3d49c1c58e90
2016-06-30 13:34:11 +01:00
Jacky Chen
e85607410e Merge "vp9: Change the scheme for modeling rd for 32x32 on newmv_last mode." 2016-06-30 05:59:46 +00:00
James Zern
f5a6079141 convolve_test: fix byte offsets in hbd build
CONVERT_TO_BYTEPTR(x) was corrected in:
003a9d2 Port metric computation changes from nextgenv2
to use the more common (x) within the expansion. offsets should occur
after converting the pointer to the desired type.

+ factorized some common expressions

Change-Id: I171c3faaa5606d098e984baa9aa74bb36042f57f
2016-06-29 20:39:07 -07:00
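
For reference, the macro pair involved (this follows vpx_dsp's convention of hiding uint16_t buffers behind shifted byte pointers; treat the snippet as a sketch):

  #include <stdint.h>

  #define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
  #define CONVERT_TO_BYTEPTR(x) ((uint8_t *)(((uintptr_t)(x)) >> 1))

  static uint16_t read_hbd(const uint8_t *buf8, int i) {
    /* offset applied after conversion, in uint16_t units */
    return CONVERT_TO_SHORTPTR(buf8)[i];
  }
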
Johann
1b833d63d9 vpx_dsp: remove x86inc.asm distinction
BUG=b:29583530

Change-Id: I397d77536b0d3cee0a92cdfe8b76bc4e434d0720
2016-06-29 18:55:58 -07:00
Johann
fe96dbda15 vp9: remove x86inc.asm distinction
BUG=b:29583530

Change-Id: I952da3fc0d4716dec897be0d2e9806af6612722b
2016-06-29 18:55:28 -07:00
Johann
d11c97e8e2 Require x86inc.asm
Force enable x86inc.asm when building for x86. Previously there were
compatibility issues so a flag was added to simplify disabling this
code.

The known issues have been resolved and x86inc.asm is the preferred
abstraction layer (over x86_abi_support.asm).

BUG=b:29583530

Change-Id: Ib935e97b37ffb22d7af72ba0f04564ae6280f1fd
2016-06-29 18:55:12 -07:00
James Zern
0462263765 configure: restore vs_version variable
inadvertently lost in the final patchset of:
078dff7 configure: remove old visual studio support (<2010)

this prevents an empty CONFIG_VS_VERSION and avoids make failure

Change-Id: I529d52eca59329e2715309efd63d80f0e1fed462
2016-06-29 16:57:28 -07:00
JackyChen
5fc2d6cb9f vp9: Change the scheme for modeling rd for 32x32 on newmv_last mode.
For real time CBR mode, use model_rd_for_sb_y for 32x32 if the mode is
newmv last, which is less aggressive in skipping transform and
quantization, to avoid quality regression in some conditions.

Change-Id: Ifa30be587f2a8a4a7f182a172de6ce277c0f8556
2016-06-29 16:28:15 -07:00
James Bankoski
c8f6ed77b9 Merge "Revert "libyuv: update to b8ddb5a2"" 2016-06-29 23:20:25 +00:00
James Bankoski
291033032e Revert "libyuv: update to b8ddb5a2"
This reverts commit b8f83282f8.

The update was to the wrong version and still has:

BUG=webm:1252

Change-Id: I80f3a7c0581ab5e2dd1a84f7840e51d7c362afac
2016-06-29 23:09:10 +00:00
James Zern
3a6a81fc9a Merge changes I9433d858,Iafd05637,If08ce6ca
* changes:
  tests: remove redundant round() definition
  remove visual studio < 2010 workarounds
  configure: remove old visual studio support (<2010)
2016-06-29 23:07:16 +00:00
Yaowu Xu
b458f42966 Merge "Remove effectless initialization" 2016-06-29 22:51:14 +00:00
James Zern
0a64929f19 tests: remove redundant round() definition
use vpx_ports/msvc.h for compatibility

BUG=b/29583530

Change-Id: I9433d8586cd0b790e7f4d697304298feafe801f1
2016-06-29 14:57:47 -07:00
Yaowu Xu
c02a4beed8 Merge "Prevent negative variance" 2016-06-29 20:53:37 +00:00
Linfeng Zhang
6b350766bd Update vpx subpixel 1d filter ssse3 asm
Speed tests show the new vertical filters degrade performance on a Celeron
Chromebook. Added "X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON" to control
which vertical-filter code is activated. For now, simply activate the code
that shows no degradation on Celeron. Later there should be two sets of
ssse3 vertical filter functions, with a jump table choosing based on CPU type.

Change-Id: Iba2f1f2fe059a9d142c396d03a6b8d2d3b981e87
2016-06-29 13:48:41 -07:00
Yaowu Xu
63a37d16f3 Prevent negative variance
Due to rounding, hbd variance may become negative. This commit adds a
check that clamps negative values to 0.

Change-Id: I610d9c8aa2d4eebe7bc5f2c5624a9e3cadad4c94
2016-06-29 11:08:17 -07:00
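
A minimal sketch of the clamp described above (illustrative names, not the libvpx routine):

#include <stdint.h>

/* variance = sse - sum^2/n; after the extra rounding in high-bitdepth
 * builds the intermediate can dip just below zero, so clamp it. */
static uint32_t variance_clamped(int64_t sse, int64_t sum, int n) {
  const int64_t var = sse - (sum * sum) / n;
  return var < 0 ? 0 : (uint32_t)var;
}
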
James Bankoski
f14c323b4c Merge "libyuv: update to b8ddb5a2" 2016-06-29 17:58:40 +00:00
Jim Bankoski
b8f83282f8 libyuv: update to b8ddb5a2
Fixes color issue when scaling without breaking mingw.

BUG=https://bugs.chromium.org/p/libyuv/issues/detail?id=605
BUG=https://bugs.chromium.org/p/webm/issues/detail?id=1252

Change-Id: I3920c5664def7ae7a23f60fb160d26d23bc86a27
2016-06-29 17:53:14 +00:00
paulwilkins
be013eb396 Add experimental spatial de-noise filter on key frames.
For forced key frames in particular this helps to make them
blend better with the surrounding frames where noise tends
to be suppressed by a combination of quantization and alt
ref filtering.

Currently disabled by default under an IFDEF flag pending
wider testing.

Change-Id: I971b5cc2b2a4b9e1f11fe06c67ef073f01b25056
2016-06-29 17:25:41 +01:00
Scott LaVarnway
74bb78df82 Merge "VP9: handle_inter_mode()... Use interp_filter" 2016-06-29 11:41:52 +00:00
James Zern
c125f4a594 remove visual studio < 2010 workarounds
BUG=b/29583530

Change-Id: Iafd05637eb65f4da54a9c857e79204a77646858a
2016-06-28 20:58:49 -07:00
James Zern
078dff72ca configure: remove old visual studio support (<2010)
BUG=b/29583530

Change-Id: If08ce6ca352f377ac4db6b9b1909b507bba6d872
2016-06-28 20:40:22 -07:00
jackychen
6b4463dc1f vp9 postproc: Bug fix and code clean.
Bug fix: the crash is caused by not allocating a buffer for prev_mip in
postproc_state. prev_mip in postproc_state is only used for MFQE; other
postproc modules, such as deblocking, should not use it.

BUG=webm:1251

Change-Id: I3120d2f50603b4a2d400e92d583960a513953a28
2016-06-28 16:13:44 -07:00
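
The shape of the fix, as a hedged sketch (simplified, hypothetical types and flags; the real ones live in the vp9 postproc code):

#include <stddef.h>

struct postproc_state { void *prev_mip; };       /* simplified */
enum { PP_DEBLOCK = 1 << 0, PP_MFQE = 1 << 1 };  /* hypothetical flags */

/* Only the MFQE path may rely on prev_mip; deblocking and the other
 * postproc modules must work even when it was never allocated. */
static int may_run_mfqe(const struct postproc_state *st, int flags) {
  return (flags & PP_MFQE) && st->prev_mip != NULL;
}
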
Scott LaVarnway
feb7e9a372 VP9: handle_inter_mode()... Use interp_filter
only if above/left is inter.

Change-Id: I0cc1f926425c021c84536df8271e9ee5f3f87caf
2016-06-28 14:09:59 -07:00
Jacky Chen
d004c64013 Merge "vp9: Increase thr_var for 32x32 blocks in var-based partitioning." 2016-06-28 20:54:06 +00:00
Jacky Chen
4736e5f9d1 Merge "vp9: Move chroma sensitivity check out from choose_partitioning." 2016-06-28 20:53:23 +00:00
Yaowu Xu
43ae6c1e22 Remove effectless initialization
Change-Id: Iec117841a7ecf6f99d2b718057d8646e221c5c64
2016-06-28 12:28:45 -07:00
James Zern
0afe5e405d Merge "*.asm: normalize label format" 2016-06-28 19:22:10 +00:00
jackychen
91038e0eb6 vp9: Move chroma sensitivity check out from choose_partitioning.
Change-Id: Ie78185a30cac4d1841be3708bd23e6505d3733b6
2016-06-28 09:58:51 -07:00
Yaowu Xu
b2d690187e Merge "psnr.c: use int64_t for sum of differences" 2016-06-28 16:55:44 +00:00
Yaowu Xu
d34b49d7b9 psnr.c: use int64_t for sum of differences
Since the values can be negative.

Change-Id: Idda69e9fb47bb34696aeb20170341a0191c5d85e
2016-06-28 09:53:11 -07:00
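
A sketch of the point (illustrative, not the psnr.c code): per-pixel differences are signed, so the accumulator must be a signed 64-bit type.

#include <stdint.h>

static int64_t sum_diff(const uint8_t *a, const uint8_t *b, int n) {
  int64_t sum = 0; /* may legitimately go negative and grow large */
  for (int i = 0; i < n; ++i) sum += (int64_t)a[i] - b[i];
  return sum;
}
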
Parag Salasakar
10b4753179 Merge "mips added p6600 cpu support" 2016-06-28 08:45:01 +00:00
James Zern
f51f67602e *.asm: normalize label format
add a trailing ':'; though it's optional with the tools we support, it's
more common to use it to mark a label. this also quiets the
orphan-labels warning with nasm/yasm.

BUG=b/29583530

Change-Id: I46e95255e12026dd542d9838e2dd3fbddf7b56e2
2016-06-27 19:46:57 -07:00
James Bankoski
32ac7cabdf Merge "Revert "libyuv: update to 1b3e4aee47"" 2016-06-27 22:59:11 +00:00
James Bankoski
7f2628152a Revert "libyuv: update to 1b3e4aee47"
This reverts commit 0c6caf187c.

BUG=https://bugs.chromium.org/p/webm/issues/detail?id=1252

Fails mingw_64 builds.

Change-Id: I83e7204bf1be48b499dc32b2597693b95ec49d06
2016-06-27 22:29:52 +00:00
jackychen
8cbd4f8701 vp9: Increase thr_var for 32x32 blocks in var-based partitioning.
For real-time mode, increase the variance threshold for 32x32 blocks in
var-based partitioning for resolutions >= 720p, so that partitioning is
more likely to stay at 32x32 at high resolutions, which accelerates
encoding with little/no PSNR drop.

PSNR effect on different speed settings:
speed 8 rtc: 0.02 overall PSNR drop, 0.285% SSIM drop
speed 7 rtc: 0.196% overall PSNR increase, 0.066% SSIM increase
speed 5 rtc_derf: no effect.

Speed up:
gips_motion_WHD, 1mbps: 2.5% faster on speed 7, 2.6% faster on speed8
gips_stat_WHD, 1mbps: 4.6% faster on speed 7, 5.6% faster on speed8

Change-Id: Ie7c33c4d2dd7d09294917e031357fc5476c3a4bb
2016-06-27 14:44:27 -07:00
James Bankoski
71aacf39c7 Merge "libyuv: update to 1b3e4aee47" 2016-06-27 19:32:50 +00:00
Yaowu Xu
7676defca9 Merge "Port metric computation changes from nextgenv2" 2016-06-27 19:18:00 +00:00
Min Chen
b2fb48cfcf improve vpx_filter_block1d* based on replace paddsw+psrlw to pmulhrsw
Change-Id: I14c0c2e54d0b0584df88e9a3f0a256ec096bea6e
2016-06-27 17:50:45 +00:00
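
Scalar models of the two roundings involved (the commit changes SSSE3 assembly; this C sketch only illustrates the arithmetic, assuming vpx's 7-bit filter precision):

#include <stdint.h>

/* paddsw+psrlw style: widen, add the rounding bias, shift right. */
static int16_t round_shift(int32_t sum) {
  return (int16_t)((sum + 64) >> 7);
}

/* pmulhrsw computes (((a * b) >> 14) + 1) >> 1 per 16-bit lane: the
 * multiply and the rounding shift fused into a single instruction. */
static int16_t mulhrs(int16_t a, int16_t b) {
  return (int16_t)(((((int32_t)a * b) >> 14) + 1) >> 1);
}
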
Parag Salasakar
7c184c6e1f mips added p6600 cpu support
Removed -funroll-loops

Change-Id: I6684bcac62902c10f945a6dcc4ed803203fcd829
2016-06-27 13:02:55 +05:30
Yaowu Xu
b9ec759bc2 Fix ubsan warnings: vp9/encoder/vp9_pickmode.c
This commit fixes a number of integer out-of-range issues in the HBD build.

BUG=webm:1219

Change-Id: Ib4192dc74a500e1b86c37a399114c7f6d4ed5185
2016-06-27 05:53:46 +00:00
James Zern
913081ab02 Merge "s/UINT32_MAX/UINT_MAX/" 2016-06-25 21:09:55 +00:00
James Zern
ca88d22f39 s/UINT32_MAX/UINT_MAX/
provides better toolchain compatibility

Change-Id: I8561a6de668a68ff54fe3886a4ee6300f0ae9c04
2016-06-25 12:15:51 -07:00
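
The portability point, illustrated (the commit doesn't name the failing toolchains; the reasoning below is an assumption): UINT_MAX comes from <limits.h> and is available in every C/C++ implementation, whereas UINT32_MAX needs <stdint.h> and, on some older C++ toolchains, __STDC_LIMIT_MACROS defined before inclusion. On the platforms libvpx targets, unsigned int is 32 bits, so the substitution preserves the value.

#include <limits.h> /* UINT_MAX: universally available */

unsigned int saturating_add(unsigned int a, unsigned int b) {
  return (a > UINT_MAX - b) ? UINT_MAX : a + b; /* no <stdint.h> needed */
}
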
James Zern
1c0a9f36f1 vp9_pickmode: revert rd modeling change for hbd
Avoids a segfault in high-bitdepth builds.
This restores the condition to its state prior to:
7991241 vp9: Change the scheme for modeling rd for bsize 32x32.

BUG=webm:1250

Change-Id: I6183d5b34cb89dfbf27b7bb589812148a72cd7de
2016-06-25 11:40:26 -07:00
James Zern
cfd5e0221c Revert "Update vpx subpixel 1d filter ssse3 asm"
This reverts commit 1517fb74fd.

Fixes a segfault in windows x64 builds.

Change-Id: I6a6959cd7e64a28376849a9f2b11fc852a7c1fbe
2016-06-25 11:37:20 -07:00
Jacky Chen
168eea5d60 Merge "vp9: Change the scheme for modeling rd for bsize 32x32." 2016-06-25 00:43:40 +00:00
James Zern
922751e059 Merge "datarate_test,DatarateTestLarge: normalize bits type" 2016-06-25 00:36:05 +00:00
Jacky Chen
723e357ead Merge "vp9: Code clean, move low temp var logic out of choose_partitioning." 2016-06-24 22:00:49 +00:00
Jim Bankoski
0c6caf187c libyuv: update to 1b3e4aee47
Color issue when scaling. https://codereview.chromium.org/2084533006/

Change-Id: I84d74346f754c02a5b770b87b6e0b6885d03bb20
2016-06-24 21:57:48 +00:00
James Zern
b34705f64f Merge "cosmetics: Beautify whitespaces and line wrapping" 2016-06-24 21:51:01 +00:00
James Zern
efad6feb9a Merge "cosmetics: Change few types to their posix version" 2016-06-24 21:50:45 +00:00
James Zern
9e5f355daf Merge "cosmetics: Make few conditions clearer" 2016-06-24 21:50:32 +00:00
Yaowu Xu
003a9d20ad Port metric computation changes from nextgenv2
Change-Id: I4aceffcdf7af59ffeb51984f0345c3a4c7e76a9f
2016-06-24 13:52:50 -07:00
jackychen
dd07443f72 vp9: Code clean, move low temp var logic out of choose_partitioning.
Change-Id: I7093e74131e0964471c9993c1e972b4617c4731d
2016-06-24 13:38:22 -07:00
jackychen
7991241a50 vp9: Change the scheme for modeling rd for bsize 32x32.
For real-time CBR mode, use model_rd_for_sb_y_large instead of
model_rd_for_sb_y for 32x32 blocks. In the former model, the transform
might be skipped more aggressively in some conditions, which speeds
up encoding with only a small PSNR/SSIM drop on the rtc test set.
No obvious visual quality regression.

PSNR effect on different speed settings:
speed 8 rtc:  0.129% overall PSNR drop, 0.137% SSIM drop
speed 7 rtc:  0.135% overall PSNR drop, 0.062% SSIM drop
speed 5 rtc_derf: 0.105% overall PSNR drop, 0.095% SSIM drop

Speed up:
gips_motion_WHD, 1mbps: 3.29% faster on speed 7, 2.56% faster on speed8
gips_stat_WHD, 1mbps: 2.17% faster on speed 7, 1.62% faster on speed8

BUG=webm:1250

Change-Id: I818babce5b8549b4b1a7c3978df8591bffde7173
2016-06-24 12:09:13 -07:00
Marco
b582cf0ea9 vp9-svc: Remove some unneeded code/comment.
Change-Id: I710707296042d8586109760544ef68e40ae486c3
2016-06-24 11:43:11 -07:00
Yury Gitman
67611119b5 cosmetics: Beautify whitespaces and line wrapping
Change-Id: I9afa02cae671bd3527cf344695e53d0cc767f549
2016-06-24 10:18:06 -07:00
Yury Gitman
3b2e2f2f77 cosmetics: Change few types to their posix version
Change-Id: I6d7bc9ed7396e7b0d63ee97bfa473fdea002f9ee
2016-06-24 10:18:06 -07:00
Yury Gitman
79436fadfb cosmetics: Make few conditions clearer
Change-Id: Ib024b3e42efc7ce1af56824a4644fdefcd45b215
2016-06-24 10:17:51 -07:00
Yaowu Xu
7ed1d54ab4 Merge "Revert "vp9: Change the scheme for modeling rd for bsize 32x32."" 2016-06-24 16:05:55 +00:00
Yaowu Xu
26daa30da4 Merge "Rationalize type to avoid integer out of range" 2016-06-24 13:58:36 +00:00
Yaowu Xu
7738bcb350 Rationalize type to avoid integer out of range
BUG=webm:1250

Change-Id: Id5bb2762ca1bf996ba4f9a60eec977a7994c1d94
2016-06-24 13:58:02 +00:00
James Zern
73b11ec876 datarate_test,DatarateTestLarge: normalize bits type
quiets a msvc warning:
conversion from 'const int64_t' to 'size_t', possible loss of data

Change-Id: I90a2ac6b040454dac7434fc9b63b98c42ea127b1
2016-06-23 23:29:26 -07:00
James Zern
d4596485be Revert "vp9: Change the scheme for modeling rd for bsize 32x32."
This reverts commit 5c29ee726e.

Causes segfaults in VP9/EndToEndTestLarge.EndtoEndPSNRTest.

BUG=webm:1250

Change-Id: I8a30e97be30589abdb76820b5c3c37c46cd6cafb
2016-06-23 15:59:25 -07:00
Johann Koenig
57adf3d573 Merge "configure: clean up var style and set_all usage" 2016-06-23 22:59:21 +00:00
Johann
74a61b5ab9 configure: clean up var style and set_all usage
Use quotes whenever possible and {} always for variables.

Replace multiple set_all calls with *able_feature().

Change-Id: If579d3f718bd4133cf1592b4554a8ed00cf9f2d3
2016-06-23 22:15:13 +00:00
Vignesh Venkatasubramanian
692fe74deb Merge "vp9: Fix potential SEGV in decoder_peek_si_internal" 2016-06-23 21:33:13 +00:00
Linfeng Zhang
bdeb5febe4 Merge "Update vpx subpixel 1d filter ssse3 asm" 2016-06-23 19:08:04 +00:00
Johann Koenig
9eeb1f2fc3 Merge "Fail early when android target does not include --sdk-path" 2016-06-23 19:04:52 +00:00
Angie Chiang
424982bc41 Merge "set interp_filter to SWITCHABLE_FILTER for intra block" 2016-06-23 18:56:27 +00:00
Johann Koenig
5e9c5dfdf0 Merge changes Ifddff89d,I827dfe59,Idca7ef45
* changes:
  vp8 machine setup: mark unused variable
  vp8 realtime encoder: mark unused variable
  vp8 error concealment: remove unused variables
2016-06-23 17:55:34 +00:00
Vignesh Venkatasubramanian
aa1c813c43 vp9: Fix potential SEGV in decoder_peek_si_internal
decoder_peek_si_internal could potentially read more bytes than
what actually exists in the input buffer. We check for the buffer
size to be at least 8, but we try to read up to 10 bytes in the
worst case. A well crafted file could thus cause a segfault.
The change that likely introduced this bug was:
https://chromium-review.googlesource.com/#/c/70439 (git hash:
7c43fb6)

BUG=chromium:621095

Change-Id: Id74880cfdded44caaa45bbdbaac859c09d3db752
2016-06-23 09:39:26 -07:00
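
The shape of the defensive check (a hypothetical helper, not the actual decoder_peek_si_internal code): every header byte read is bounds-checked against the caller-supplied size, since the minimum-size check alone does not cover the up-to-10-byte worst case.

#include <stddef.h>
#include <stdint.h>

static int read_byte(const uint8_t *data, size_t data_sz, size_t i,
                     uint8_t *out) {
  if (i >= data_sz) return -1; /* would read past the buffer: fail */
  *out = data[i];
  return 0;
}
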
Alex Converse
6e4b73125b Merge "vpx_lpf_horizontal_4_sse2: Remove dead load." 2016-06-23 16:20:36 +00:00
Johann
310073868e Fail early when android target does not include --sdk-path
Change-Id: I07e7e63476a2e32e3aae123abdee8b7bbbdc6a8c
2016-06-23 13:48:18 +00:00
Johann Koenig
cc1524aa90 Merge "Add default flags for arm64/armv8 builds" 2016-06-23 13:47:28 +00:00
Johann
6c6eb16bb9 vp8 machine setup: mark unused variable
When building without multithreading and for a non-arm, non-x86 system,
ctx is unused.

Cleans up -Wextra warning:
unused parameter ‘ctx’ [-Werror=unused-parameter]

Change-Id: Ifddff89d2ebd45f7d71e3d415a8f2415dd818957
2016-06-23 13:46:20 +00:00
Johann
3b2c3cb366 vp8 realtime encoder: mark unused variable
'duration' is not used in realtime-only mode.

Cleans up -Wextra warning:
unused parameter 'duration' [-Wunused-parameter]

Change-Id: I827dfe59ebcdc72c5a93fdf7e5aca063433914b1
2016-06-23 13:46:00 +00:00
Johann
55f3740d76 vp8 error concealment: remove unused variables
vp8_conceal_corrupt_mb is an empty function. Remove it entirely.

Cleans up -Wextra warnings:
unused parameter 'mi_stride' [-Wunused-parameter]
unused parameter 'xd' [-Wunused-parameter]

Change-Id: Idca7ef4508fae2b4b76a40d44507522a72ccc2c8
2016-06-22 18:29:03 -07:00
Alex Converse
83db21b2fd vpx_lpf_horizontal_4_sse2: Remove dead load.
Change-Id: I51026c52baa1f0881fcd5b68e1fdf08a2dc0916e
2016-06-22 18:17:41 -07:00
Angie Chiang
d9c417cb49 set interp_filter to SWITCHABLE_FILTER for intra block
In vp9_pick_inter_mode(), instead of using
vp9_get_pred_context_switchable_interp(xd) to assign filter_ref,
we use a less strict condition for assigning filter_ref.
This reduces the probability of leaving filter_ref unassigned
and then skipping the filter search.

Overall PSNR gain 0.074% for rtc dataset

Details:
Low    Mid     High
0.185% -0.008% -0.082%

Change-Id: Id5c5ab38d3766c213d5681e17b4d1afd1529e676
2016-06-22 17:19:43 -07:00
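
A hedged sketch of the relaxed condition (simplified, hypothetical types; the real logic walks vp9's MODE_INFO neighbors):

#include <stddef.h>

typedef struct { int is_inter; int interp_filter; } mode_info_t; /* simplified */
enum { SWITCHABLE_FILTERS = 3 }; /* value assumed for illustration */

/* Take the filter from whichever inter neighbor exists; fall back only
 * when neither does, so the filter search is skipped less often. */
static int pick_filter_ref(const mode_info_t *above, const mode_info_t *left) {
  if (above != NULL && above->is_inter) return above->interp_filter;
  if (left != NULL && left->is_inter) return left->interp_filter;
  return SWITCHABLE_FILTERS;
}
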
Alex Converse
b2597527a5 Merge "Repack vp9_token_state." 2016-06-23 00:17:23 +00:00
Jacky Chen
8496390e73 Merge "vp9: Change the scheme for modeling rd for bsize 32x32." 2016-06-22 23:50:46 +00:00
Johann
ac27b062b0 Add default flags for arm64/armv8 builds
Allows building simple targets with sane default flags.

For example, using the Android arm64 toolchain from the NDK:
https://developer.android.com/ndk/guides/standalone_toolchain.html
./build/tools/make-standalone-toolchain.sh --arch=arm64 \
  --platform=android-24 --install-dir=/tmp/arm64
CROSS=/tmp/arm64/bin/aarch64-linux-android- \
  ~/libvpx/configure --target=arm64-linux-gcc --disable-multithread

BUG=webm:1143

Change-Id: I06f5a7564f5382cf1a4bad41aef4308566c53adf
2016-06-22 23:17:17 +00:00
James Zern
527a9fea76 Merge "remove vp10" 2016-06-22 22:35:57 +00:00
Linfeng Zhang
1517fb74fd Update vpx subpixel 1d filter ssse3 asm
Speed tests show the new vertical filters degrade performance on Celeron
Chromebooks. Added "X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON" to control
which vertical filter code is activated; for now, simply activate the
code that shows no degradation on Celeron. Later there should be two sets
of ssse3 vertical filter functions, with a jump table to choose between
them based on CPU type.

Change-Id: I37e3e9c5694737d9134a6bce6698d3e43f8fc962
2016-06-22 13:15:00 -07:00
jackychen
5c29ee726e vp9: Change the scheme for modeling rd for bsize 32x32.
For real-time CBR mode, use model_rd_for_sb_y_large instead of
model_rd_for_sb_y for 32x32 blocks. In the former model, the transform
might be skipped more aggressively in some conditions, which speeds
up encoding with only a small PSNR/SSIM drop on the rtc test set.
No obvious visual quality regression.

PSNR effect on different speed setting:
speed 8 rtc:  0.129% overall PSNR drop, 0.137% SSIM drop
speed 7 rtc:  0.135% overall PSNR drop, 0.062% SSIM drop
speed 5 rtc_derf: 0.105% overall PSNR drop, 0.095% SSIM drop

Speed up:
gips_motion_WHD, 1mbps: 3.29% faster on speed 7, 2.56% faster on speed8
gips_stat_WHD, 1mbps: 2.17% faster on speed 7, 1.62% faster on speed8

Change-Id: I902f62def225ea01c145d7e5a93497398b8f5edf
2016-06-22 11:17:56 -07:00
Alex Converse
50d3629c61 Repack vp9_token_state.
Reduces size from 32 bytes to 24 bytes on x86_64.

Change-Id: I8a22552343a1fc916117f35267fe6a295250f742
2016-06-20 12:56:32 -07:00
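
The repacking idea in miniature (hypothetical fields, not the real vp9_token_state layout): ordering members from widest to narrowest removes alignment padding on x86_64, and eliminating two 4-byte pads accounts exactly for the 32 -> 24 byte change.

#include <stdint.h>
#include <stdio.h>

struct padded {   /* 4 + pad4 + 8 + 4 + pad4 + 8 = 32 bytes on x86_64 */
  int32_t a;
  int64_t b;
  int32_t c;
  int64_t d;
};

struct repacked { /* 8 + 8 + 4 + 4 = 24 bytes on x86_64 */
  int64_t b;
  int64_t d;
  int32_t a;
  int32_t c;
};

int main(void) {
  printf("%zu -> %zu\n", sizeof(struct padded), sizeof(struct repacked));
  return 0;
}
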
James Zern
67edc5e83b remove vp10
development has moved to the nextgenv2 branch and a snapshot from here
was used to seed aomedia

BUG=b/29457125

Change-Id: Iedaca11ec7870fb3a4e50b2c9ea0c2b056a0d3c0
2016-06-17 18:26:08 -07:00
1239 changed files with 159749 additions and 186890 deletions

.clang-format

@@ -1,11 +1,10 @@
---
Language: Cpp
# BasedOnStyle: Google
# Generated with clang-format 3.8.1
# Generated with clang-format 3.7.1
AccessModifierOffset: -1
AlignAfterOpenBracket: Align
AlignAfterOpenBracket: true
AlignConsecutiveAssignments: false
AlignConsecutiveDeclarations: false
AlignEscapedNewlinesLeft: true
AlignOperands: true
AlignTrailingComments: true
@@ -16,23 +15,10 @@ AllowShortFunctionsOnASingleLine: All
AllowShortIfStatementsOnASingleLine: true
AllowShortLoopsOnASingleLine: true
AlwaysBreakAfterDefinitionReturnType: None
AlwaysBreakAfterReturnType: None
AlwaysBreakBeforeMultilineStrings: true
AlwaysBreakTemplateDeclarations: true
BinPackArguments: true
BinPackParameters: true
BraceWrapping:
AfterClass: false
AfterControlStatement: false
AfterEnum: false
AfterFunction: false
AfterNamespace: false
AfterObjCDeclaration: false
AfterStruct: false
AfterUnion: false
BeforeCatch: false
BeforeElse: false
IndentBraces: false
BreakBeforeBinaryOperators: None
BreakBeforeBraces: Attach
BreakBeforeTernaryOperators: true
@@ -47,13 +33,6 @@ DerivePointerAlignment: false
DisableFormat: false
ExperimentalAutoDetectBinPacking: false
ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ]
IncludeCategories:
- Regex: '^<.*\.h>'
Priority: 1
- Regex: '^<.*'
Priority: 2
- Regex: '.*'
Priority: 3
IndentCaseLabels: true
IndentWidth: 2
IndentWrappedFunctionNames: false
@@ -72,8 +51,6 @@ PenaltyBreakString: 1000
PenaltyExcessCharacter: 1000000
PenaltyReturnTypeOnItsOwnLine: 200
PointerAlignment: Right
ReflowComments: true
SortIncludes: false
SpaceAfterCStyleCast: false
SpaceBeforeAssignmentOperators: true
SpaceBeforeParens: ControlStatements

.gitignore

@@ -29,36 +29,37 @@
/examples/decode_with_drops
/examples/decode_with_partial_drops
/examples/example_xma
/examples/lossless_encoder
/examples/postproc
/examples/resize_util
/examples/set_maps
/examples/simple_decoder
/examples/simple_encoder
/examples/twopass_encoder
/examples/aom_cx_set_ref
/examples/av1_spatial_scalable_encoder
/examples/aom_temporal_scalable_patterns
/examples/aom_temporal_svc_encoder
/examples/vp8_multi_resolution_encoder
/examples/vp8cx_set_ref
/examples/vp9_lossless_encoder
/examples/vp9_spatial_scalable_encoder
/examples/vpx_temporal_scalable_patterns
/examples/vpx_temporal_svc_encoder
/ivfdec
/ivfdec.dox
/ivfenc
/ivfenc.dox
/libaom.so*
/libaom.ver
/libvpx.so*
/libvpx.ver
/samples.dox
/test_intra_pred_speed
/test_libaom
/aom_api1_migration.dox
/av1_rtcd.h
/aom.pc
/aom_config.c
/aom_config.h
/aom_dsp_rtcd.h
/aom_scale_rtcd.h
/aom_version.h
/aomdec
/aomdec.dox
/aomenc
/aomenc.dox
/test_libvpx
/vp8_api1_migration.dox
/vp[89x]_rtcd.h
/vpx.pc
/vpx_config.c
/vpx_config.h
/vpx_dsp_rtcd.h
/vpx_scale_rtcd.h
/vpx_version.h
/vpxdec
/vpxdec.dox
/vpxenc
/vpxenc.dox
TAGS

.mailmap

@@ -3,6 +3,7 @@ Aex Converse <aconverse@google.com>
Aex Converse <aconverse@google.com> <alex.converse@gmail.com>
Alexis Ballier <aballier@gentoo.org> <alexis.ballier@gmail.com>
Alpha Lam <hclam@google.com> <hclam@chromium.org>
Daniele Castagna <dcastagna@chromium.org> <dcastagna@google.com>
Deb Mukherjee <debargha@google.com>
Erik Niemeyer <erik.a.niemeyer@intel.com> <erik.a.niemeyer@gmail.com>
Guillaume Martres <gmartres@google.com> <smarter3@gmail.com>
@@ -13,12 +14,15 @@ Jim Bankoski <jimbankoski@google.com>
Johann Koenig <johannkoenig@google.com>
Johann Koenig <johannkoenig@google.com> <johann.koenig@duck.com>
Johann Koenig <johannkoenig@google.com> <johann.koenig@gmail.com>
Johann Koenig <johannkoenig@google.com> <johannkoenig@chromium.org>
John Koleszar <jkoleszar@google.com>
Joshua Litt <joshualitt@google.com> <joshualitt@chromium.org>
Marco Paniconi <marpan@google.com>
Marco Paniconi <marpan@google.com> <marpan@chromium.org>
Pascal Massimino <pascal.massimino@gmail.com>
Paul Wilkins <paulwilkins@google.com>
Peter de Rivaz <peter.derivaz@gmail.com>
Peter de Rivaz <peter.derivaz@gmail.com> <peter.derivaz@argondesign.com>
Ralph Giles <giles@xiph.org> <giles@entropywave.com>
Ralph Giles <giles@xiph.org> <giles@mozilla.com>
Ronald S. Bultje <rsbultje@gmail.com> <rbultje@google.com>
@@ -26,7 +30,8 @@ Sami Pietilä <samipietila@google.com>
Tamar Levy <tamar.levy@intel.com>
Tamar Levy <tamar.levy@intel.com> <levytamar82@gmail.com>
Tero Rintaluoma <teror@google.com> <tero.rintaluoma@on2.com>
Timothy B. Terriberry <tterribe@xiph.org> Tim Terriberry <tterriberry@mozilla.com>
Timothy B. Terriberry <tterribe@xiph.org> <tterriberry@mozilla.com>
Tom Finegan <tomfinegan@google.com>
Tom Finegan <tomfinegan@google.com> <tomfinegan@chromium.org>
Yaowu Xu <yaowu@google.com> <yaowu@xuyaowu.com>
Yaowu Xu <yaowu@google.com> <Yaowu Xu>

AUTHORS

@@ -24,6 +24,7 @@ changjun.yang <changjun.yang@intel.com>
Charles 'Buck' Krasic <ckrasic@google.com>
chm <chm@rock-chips.com>
Christian Duvivier <cduvivier@google.com>
Daniele Castagna <dcastagna@chromium.org>
Daniel Kang <ddkang@google.com>
Deb Mukherjee <debargha@google.com>
Dim Temp <dimtemp0@gmail.com>
@@ -56,7 +57,7 @@ James Zern <jzern@google.com>
Jan Gerber <j@mailb.org>
Jan Kratochvil <jan.kratochvil@redhat.com>
Janne Salonen <jsalonen@google.com>
Jean-Marc Valin <jmvalin@jmvalin.ca>
Jean-Yves Avenard <jyavenard@mozilla.com>
Jeff Faust <jfaust@google.com>
Jeff Muizelaar <jmuizelaar@mozilla.com>
Jeff Petkau <jpet@chromium.org>
@@ -65,7 +66,6 @@ Jian Zhou <zhoujian@google.com>
Jim Bankoski <jimbankoski@google.com>
Jingning Han <jingning@google.com>
Joey Parrish <joeyparrish@google.com>
Johann Koenig <johannkoenig@chromium.org>
Johann Koenig <johannkoenig@google.com>
John Koleszar <jkoleszar@google.com>
Johnny Klonaris <google@jawknee.com>
@@ -77,6 +77,7 @@ Justin Clift <justin@salasaga.org>
Justin Lebar <justin.lebar@gmail.com>
KO Myung-Hun <komh@chollian.net>
Lawrence Velázquez <larryv@macports.org>
Linfeng Zhang <linfengz@google.com>
Lou Quillio <louquillio@google.com>
Luca Barbato <lu_zero@gentoo.org>
Makoto Kato <makoto.kt@gmail.com>
@@ -92,7 +93,6 @@ Mike Hommey <mhommey@mozilla.com>
Mikhal Shemer <mikhal@google.com>
Minghai Shang <minghai@google.com>
Morton Jonuschat <yabawock@gmail.com>
Nathan E. Egge <negge@dgql.org>
Nico Weber <thakis@chromium.org>
Parag Salasakar <img.mips1@gmail.com>
Pascal Massimino <pascal.massimino@gmail.com>
@@ -101,7 +101,6 @@ Paul Wilkins <paulwilkins@google.com>
Pavol Rusnak <stick@gk2.sk>
Paweł Hajdan <phajdan@google.com>
Pengchong Jin <pengchong@google.com>
Peter de Rivaz <peter.derivaz@argondesign.com>
Peter de Rivaz <peter.derivaz@gmail.com>
Philip Jägenstedt <philipj@opera.com>
Priit Laes <plaes@plaes.org>
@@ -121,7 +120,6 @@ Sergey Ulanov <sergeyu@chromium.org>
Shimon Doodkin <helpmepro1@gmail.com>
Shunyao Li <shunyaoli@google.com>
Stefan Holmer <holmer@google.com>
Steinar Midtskogen <stemidts@cisco.com>
Suman Sunkara <sunkaras@google.com>
Taekhyun Kim <takim@nvidia.com>
Takanori MATSUURA <t.matsuu@gmail.com>
@@ -129,16 +127,16 @@ Tamar Levy <tamar.levy@intel.com>
Tao Bai <michaelbai@chromium.org>
Tero Rintaluoma <teror@google.com>
Thijs Vermeir <thijsvermeir@gmail.com>
Thomas Daede <tdaede@mozilla.com>
Thomas Davies <thdavies@cisco.com>
Thomas <thdavies@cisco.com>
Tim Kopp <tkopp@google.com>
Timothy B. Terriberry <tterribe@xiph.org>
Tom Finegan <tomfinegan@google.com>
Tristan Matthews <le.businessman@gmail.com>
Tristan Matthews <tmatth@videolan.org>
Vignesh Venkatasubramanian <vigneshv@google.com>
Yaowu Xu <yaowu@google.com>
Yi Luo <luoyi@google.com>
Yongzhe Wang <yongzhe@google.com>
Yunqing Wang <yunqingwang@google.com>
Yury Gitman <yuryg@google.com>
Zoe Liu <zoeliu@google.com>
Google Inc.
The Mozilla Foundation
The Xiph.Org Foundation

CHANGELOG

@@ -1,9 +1,33 @@
Next Release
- Incompatible changes:
The AV1 encoder's default keyframe interval changed to 128 from 9999.
2016-07-20 v1.6.0 "Khaki Campbell Duck"
This release improves upon the VP9 encoder and speeds up the encoding and
decoding processes.
- Upgrading:
This release is ABI incompatible with 1.5.0 due to a new 'color_range' enum
in vpx_image and some minor changes to the VP8_COMP structure.
The default key frame interval for VP9 has changed from 128 to 9999.
- Enhancement:
A core focus has been performance for low end Intel processors. SSSE3
instructions such as 'pshufb' have been avoided and instructions have been
reordered to better accommodate the more constrained pipelines.
As a result, devices based on Celeron processors have seen substantial
decoding improvements. From Indian Runner Duck to Javan Whistling Duck,
decoding speed improved between 10 and 30%. Between Javan Whistling Duck
and Khaki Campbell Duck, it improved another 10 to 15%.
While Celeron benefited most, Core-i5 also improved 5% and 10% between the
respective releases.
Realtime performance for WebRTC for both speed and quality has received a
lot of attention.
- Bug Fixes:
A number of fuzzing issues, found variously by Mozilla, Chromium and others,
have been fixed and we strongly recommend updating.
2016-04-07 v0.1.0 "AOMedia Codec 1"
This release is the first Alliance for Open Media codec.
2015-11-09 v1.5.0 "Javan Whistling Duck"
This release improves upon the VP9 encoder and speeds up the encoding and
decoding processes.

CMakeLists.txt

@@ -1,270 +0,0 @@
##
## Copyright (c) 2016, Alliance for Open Media. All rights reserved
##
## This source code is subject to the terms of the BSD 2 Clause License and
## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
## was not distributed with this source code in the LICENSE file, you can
## obtain it at www.aomedia.org/license/software. If the Alliance for Open
## Media Patent License 1.0 was not distributed with this source code in the
## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
##
cmake_minimum_required(VERSION 3.2)
project(AOM C CXX)
set(AOM_ROOT "${CMAKE_CURRENT_SOURCE_DIR}")
set(AOM_CONFIG_DIR "${CMAKE_CURRENT_BINARY_DIR}")
include("${AOM_ROOT}/build/cmake/aom_configure.cmake")
set(AOM_SRCS
"${AOM_CONFIG_DIR}/aom_config.c"
"${AOM_CONFIG_DIR}/aom_config.h"
"${AOM_ROOT}/aom/aom.h"
"${AOM_ROOT}/aom/aom_codec.h"
"${AOM_ROOT}/aom/aom_decoder.h"
"${AOM_ROOT}/aom/aom_encoder.h"
"${AOM_ROOT}/aom/aom_frame_buffer.h"
"${AOM_ROOT}/aom/aom_image.h"
"${AOM_ROOT}/aom/aom_integer.h"
"${AOM_ROOT}/aom/aomcx.h"
"${AOM_ROOT}/aom/aomdx.h"
"${AOM_ROOT}/aom/internal/aom_codec_internal.h"
"${AOM_ROOT}/aom/src/aom_codec.c"
"${AOM_ROOT}/aom/src/aom_decoder.c"
"${AOM_ROOT}/aom/src/aom_encoder.c"
"${AOM_ROOT}/aom/src/aom_image.c")
set(AOM_DSP_SRCS
"${AOM_ROOT}/aom_dsp/aom_convolve.c"
"${AOM_ROOT}/aom_dsp/aom_convolve.h"
"${AOM_ROOT}/aom_dsp/aom_dsp_common.h"
"${AOM_ROOT}/aom_dsp/aom_dsp_rtcd.c"
"${AOM_ROOT}/aom_dsp/aom_filter.h"
"${AOM_ROOT}/aom_dsp/aom_simd.c"
"${AOM_ROOT}/aom_dsp/aom_simd.h"
"${AOM_ROOT}/aom_dsp/aom_simd_inline.h"
"${AOM_ROOT}/aom_dsp/avg.c"
"${AOM_ROOT}/aom_dsp/bitreader.h"
"${AOM_ROOT}/aom_dsp/bitreader_buffer.c"
"${AOM_ROOT}/aom_dsp/bitreader_buffer.h"
"${AOM_ROOT}/aom_dsp/bitwriter.h"
"${AOM_ROOT}/aom_dsp/bitwriter_buffer.c"
"${AOM_ROOT}/aom_dsp/bitwriter_buffer.h"
"${AOM_ROOT}/aom_dsp/blend.h"
"${AOM_ROOT}/aom_dsp/blend_a64_hmask.c"
"${AOM_ROOT}/aom_dsp/blend_a64_mask.c"
"${AOM_ROOT}/aom_dsp/blend_a64_vmask.c"
"${AOM_ROOT}/aom_dsp/dkboolreader.c"
"${AOM_ROOT}/aom_dsp/dkboolreader.h"
"${AOM_ROOT}/aom_dsp/dkboolwriter.c"
"${AOM_ROOT}/aom_dsp/dkboolwriter.h"
"${AOM_ROOT}/aom_dsp/fwd_txfm.c"
"${AOM_ROOT}/aom_dsp/fwd_txfm.h"
"${AOM_ROOT}/aom_dsp/intrapred.c"
"${AOM_ROOT}/aom_dsp/inv_txfm.c"
"${AOM_ROOT}/aom_dsp/inv_txfm.h"
"${AOM_ROOT}/aom_dsp/loopfilter.c"
"${AOM_ROOT}/aom_dsp/prob.c"
"${AOM_ROOT}/aom_dsp/prob.h"
"${AOM_ROOT}/aom_dsp/psnr.c"
"${AOM_ROOT}/aom_dsp/psnr.h"
"${AOM_ROOT}/aom_dsp/quantize.c"
"${AOM_ROOT}/aom_dsp/quantize.h"
"${AOM_ROOT}/aom_dsp/sad.c"
"${AOM_ROOT}/aom_dsp/simd/v128_intrinsics.h"
"${AOM_ROOT}/aom_dsp/simd/v128_intrinsics_c.h"
"${AOM_ROOT}/aom_dsp/simd/v256_intrinsics.h"
"${AOM_ROOT}/aom_dsp/simd/v256_intrinsics_c.h"
"${AOM_ROOT}/aom_dsp/simd/v64_intrinsics.h"
"${AOM_ROOT}/aom_dsp/simd/v64_intrinsics_c.h"
"${AOM_ROOT}/aom_dsp/subtract.c"
"${AOM_ROOT}/aom_dsp/txfm_common.h"
"${AOM_ROOT}/aom_dsp/variance.c"
"${AOM_ROOT}/aom_dsp/variance.h")
set(AOM_MEM_SRCS
"${AOM_ROOT}/aom_mem/aom_mem.c"
"${AOM_ROOT}/aom_mem/aom_mem.h"
"${AOM_ROOT}/aom_mem/include/aom_mem_intrnl.h")
set(AOM_SCALE_SRCS
"${AOM_ROOT}/aom_scale/aom_scale.h"
"${AOM_ROOT}/aom_scale/aom_scale_rtcd.c"
"${AOM_ROOT}/aom_scale/generic/aom_scale.c"
"${AOM_ROOT}/aom_scale/generic/gen_scalers.c"
"${AOM_ROOT}/aom_scale/generic/yv12config.c"
"${AOM_ROOT}/aom_scale/generic/yv12extend.c"
"${AOM_ROOT}/aom_scale/yv12config.h")
# TODO(tomfinegan): Extract aom_ports from aom_util if possible.
set(AOM_UTIL_SRCS
"${AOM_ROOT}/aom_ports/aom_once.h"
"${AOM_ROOT}/aom_ports/aom_timer.h"
"${AOM_ROOT}/aom_ports/bitops.h"
"${AOM_ROOT}/aom_ports/emmintrin_compat.h"
"${AOM_ROOT}/aom_ports/mem.h"
"${AOM_ROOT}/aom_ports/mem_ops.h"
"${AOM_ROOT}/aom_ports/mem_ops_aligned.h"
"${AOM_ROOT}/aom_ports/msvc.h"
"${AOM_ROOT}/aom_ports/system_state.h"
"${AOM_ROOT}/aom_util/aom_thread.c"
"${AOM_ROOT}/aom_util/aom_thread.h"
"${AOM_ROOT}/aom_util/endian_inl.h")
set(AOM_AV1_COMMON_SRCS
"${AOM_ROOT}/av1/av1_iface_common.h"
"${AOM_ROOT}/av1/common/alloccommon.c"
"${AOM_ROOT}/av1/common/alloccommon.h"
"${AOM_ROOT}/av1/common/av1_fwd_txfm.c"
"${AOM_ROOT}/av1/common/av1_fwd_txfm.h"
"${AOM_ROOT}/av1/common/av1_inv_txfm.c"
"${AOM_ROOT}/av1/common/av1_inv_txfm.h"
"${AOM_ROOT}/av1/common/av1_rtcd.c"
"${AOM_ROOT}/av1/common/blockd.c"
"${AOM_ROOT}/av1/common/blockd.h"
"${AOM_ROOT}/av1/common/common.h"
"${AOM_ROOT}/av1/common/common_data.h"
"${AOM_ROOT}/av1/common/convolve.c"
"${AOM_ROOT}/av1/common/convolve.h"
"${AOM_ROOT}/av1/common/debugmodes.c"
"${AOM_ROOT}/av1/common/entropy.c"
"${AOM_ROOT}/av1/common/entropy.h"
"${AOM_ROOT}/av1/common/entropymode.c"
"${AOM_ROOT}/av1/common/entropymode.h"
"${AOM_ROOT}/av1/common/entropymv.c"
"${AOM_ROOT}/av1/common/entropymv.h"
"${AOM_ROOT}/av1/common/enums.h"
"${AOM_ROOT}/av1/common/filter.c"
"${AOM_ROOT}/av1/common/filter.h"
"${AOM_ROOT}/av1/common/frame_buffers.c"
"${AOM_ROOT}/av1/common/frame_buffers.h"
"${AOM_ROOT}/av1/common/idct.c"
"${AOM_ROOT}/av1/common/idct.h"
"${AOM_ROOT}/av1/common/loopfilter.c"
"${AOM_ROOT}/av1/common/loopfilter.h"
"${AOM_ROOT}/av1/common/mv.h"
"${AOM_ROOT}/av1/common/mvref_common.c"
"${AOM_ROOT}/av1/common/mvref_common.h"
"${AOM_ROOT}/av1/common/odintrin.c"
"${AOM_ROOT}/av1/common/odintrin.h"
"${AOM_ROOT}/av1/common/onyxc_int.h"
"${AOM_ROOT}/av1/common/pred_common.c"
"${AOM_ROOT}/av1/common/pred_common.h"
"${AOM_ROOT}/av1/common/quant_common.c"
"${AOM_ROOT}/av1/common/quant_common.h"
"${AOM_ROOT}/av1/common/reconinter.c"
"${AOM_ROOT}/av1/common/reconinter.h"
"${AOM_ROOT}/av1/common/reconintra.c"
"${AOM_ROOT}/av1/common/reconintra.h"
"${AOM_ROOT}/av1/common/scale.c"
"${AOM_ROOT}/av1/common/scale.h"
"${AOM_ROOT}/av1/common/scan.c"
"${AOM_ROOT}/av1/common/scan.h"
"${AOM_ROOT}/av1/common/seg_common.c"
"${AOM_ROOT}/av1/common/seg_common.h"
"${AOM_ROOT}/av1/common/thread_common.c"
"${AOM_ROOT}/av1/common/thread_common.h"
"${AOM_ROOT}/av1/common/tile_common.c"
"${AOM_ROOT}/av1/common/tile_common.h")
set(AOM_AV1_DECODER_SRCS
"${AOM_ROOT}/av1/av1_dx_iface.c"
"${AOM_ROOT}/av1/decoder/decodeframe.c"
"${AOM_ROOT}/av1/decoder/decodeframe.h"
"${AOM_ROOT}/av1/decoder/decodemv.c"
"${AOM_ROOT}/av1/decoder/decodemv.h"
"${AOM_ROOT}/av1/decoder/decoder.c"
"${AOM_ROOT}/av1/decoder/decoder.h"
"${AOM_ROOT}/av1/decoder/detokenize.c"
"${AOM_ROOT}/av1/decoder/detokenize.h"
"${AOM_ROOT}/av1/decoder/dsubexp.c"
"${AOM_ROOT}/av1/decoder/dsubexp.h"
"${AOM_ROOT}/av1/decoder/dthread.c"
"${AOM_ROOT}/av1/decoder/dthread.h")
set(AOM_AV1_ENCODER_SRCS
"${AOM_ROOT}/av1/av1_cx_iface.c"
"${AOM_ROOT}/av1/encoder/aq_complexity.c"
"${AOM_ROOT}/av1/encoder/aq_complexity.h"
"${AOM_ROOT}/av1/encoder/aq_cyclicrefresh.c"
"${AOM_ROOT}/av1/encoder/aq_cyclicrefresh.h"
"${AOM_ROOT}/av1/encoder/aq_variance.c"
"${AOM_ROOT}/av1/encoder/aq_variance.h"
"${AOM_ROOT}/av1/encoder/bitstream.c"
"${AOM_ROOT}/av1/encoder/bitstream.h"
"${AOM_ROOT}/av1/encoder/block.h"
"${AOM_ROOT}/av1/encoder/context_tree.c"
"${AOM_ROOT}/av1/encoder/context_tree.h"
"${AOM_ROOT}/av1/encoder/cost.c"
"${AOM_ROOT}/av1/encoder/cost.h"
"${AOM_ROOT}/av1/encoder/dct.c"
"${AOM_ROOT}/av1/encoder/encodeframe.c"
"${AOM_ROOT}/av1/encoder/encodeframe.h"
"${AOM_ROOT}/av1/encoder/encodemb.c"
"${AOM_ROOT}/av1/encoder/encodemb.h"
"${AOM_ROOT}/av1/encoder/encodemv.c"
"${AOM_ROOT}/av1/encoder/encodemv.h"
"${AOM_ROOT}/av1/encoder/encoder.c"
"${AOM_ROOT}/av1/encoder/encoder.h"
"${AOM_ROOT}/av1/encoder/ethread.c"
"${AOM_ROOT}/av1/encoder/ethread.h"
"${AOM_ROOT}/av1/encoder/extend.c"
"${AOM_ROOT}/av1/encoder/extend.h"
"${AOM_ROOT}/av1/encoder/firstpass.c"
"${AOM_ROOT}/av1/encoder/firstpass.h"
"${AOM_ROOT}/av1/encoder/hybrid_fwd_txfm.c"
"${AOM_ROOT}/av1/encoder/hybrid_fwd_txfm.h"
"${AOM_ROOT}/av1/encoder/lookahead.c"
"${AOM_ROOT}/av1/encoder/lookahead.h"
"${AOM_ROOT}/av1/encoder/mbgraph.c"
"${AOM_ROOT}/av1/encoder/mbgraph.h"
"${AOM_ROOT}/av1/encoder/mcomp.c"
"${AOM_ROOT}/av1/encoder/mcomp.h"
"${AOM_ROOT}/av1/encoder/picklpf.c"
"${AOM_ROOT}/av1/encoder/picklpf.h"
"${AOM_ROOT}/av1/encoder/quantize.c"
"${AOM_ROOT}/av1/encoder/quantize.h"
"${AOM_ROOT}/av1/encoder/ratectrl.c"
"${AOM_ROOT}/av1/encoder/ratectrl.h"
"${AOM_ROOT}/av1/encoder/rd.c"
"${AOM_ROOT}/av1/encoder/rd.h"
"${AOM_ROOT}/av1/encoder/rdopt.c"
"${AOM_ROOT}/av1/encoder/rdopt.h"
"${AOM_ROOT}/av1/encoder/resize.c"
"${AOM_ROOT}/av1/encoder/resize.h"
"${AOM_ROOT}/av1/encoder/segmentation.c"
"${AOM_ROOT}/av1/encoder/segmentation.h"
"${AOM_ROOT}/av1/encoder/speed_features.c"
"${AOM_ROOT}/av1/encoder/speed_features.h"
"${AOM_ROOT}/av1/encoder/subexp.c"
"${AOM_ROOT}/av1/encoder/subexp.h"
"${AOM_ROOT}/av1/encoder/temporal_filter.c"
"${AOM_ROOT}/av1/encoder/temporal_filter.h"
"${AOM_ROOT}/av1/encoder/tokenize.c"
"${AOM_ROOT}/av1/encoder/tokenize.h"
"${AOM_ROOT}/av1/encoder/treewriter.c"
"${AOM_ROOT}/av1/encoder/treewriter.h")
# Targets
add_library(aom_dsp ${AOM_DSP_SRCS})
include_directories(${AOM_ROOT} ${AOM_CONFIG_DIR})
add_library(aom_mem ${AOM_MEM_SRCS})
add_library(aom_scale ${AOM_SCALE_SRCS})
include_directories(${AOM_ROOT} ${AOM_CONFIG_DIR})
add_library(aom_util ${AOM_UTIL_SRCS})
add_library(aom_av1_decoder ${AOM_AV1_DECODER_SRCS})
add_library(aom_av1_encoder ${AOM_AV1_ENCODER_SRCS})
add_library(aom ${AOM_SRCS})
target_link_libraries(aom LINK_PUBLIC
aom_dsp
aom_mem
aom_scale
aom_util
aom_av1_decoder
aom_av1_encoder)
add_executable(simple_decoder examples/simple_decoder.c)
include_directories(${AOM_ROOT})
target_link_libraries(simple_decoder LINK_PUBLIC aom)
add_executable(simple_encoder examples/simple_encoder.c)
include_directories(${AOM_ROOT})
target_link_libraries(simple_encoder LINK_PUBLIC aom)

LICENSE

@@ -1,27 +1,31 @@
Copyright (c) 2016, Alliance for Open Media. All rights reserved.
Copyright (c) 2010, The WebM Project authors. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
* Neither the name of Google, nor the WebM Project, nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

PATENTS

@@ -1,108 +1,23 @@
Alliance for Open Media Patent License 1.0
Additional IP Rights Grant (Patents)
------------------------------------
1. License Terms.
1.1. Patent License. Subject to the terms and conditions of this License, each
Licensor, on behalf of itself and successors in interest and assigns,
grants Licensee a non-sublicensable, perpetual, worldwide, non-exclusive,
no-charge, royalty-free, irrevocable (except as expressly stated in this
License) patent license to its Necessary Claims to make, use, sell, offer
for sale, import or distribute any Implementation.
1.2. Conditions.
1.2.1. Availability. As a condition to the grant of rights to Licensee to make,
sell, offer for sale, import or distribute an Implementation under
Section 1.1, Licensee must make its Necessary Claims available under
this License, and must reproduce this License with any Implementation
as follows:
a. For distribution in source code, by including this License in the
root directory of the source code with its Implementation.
b. For distribution in any other form (including binary, object form,
and/or hardware description code (e.g., HDL, RTL, Gate Level Netlist,
GDSII, etc.)), by including this License in the documentation, legal
notices, and/or other written materials provided with the
Implementation.
1.2.2. Additional Conditions. This license is directly from Licensor to
Licensee. Licensee acknowledges as a condition of benefiting from it
that no rights from Licensor are received from suppliers, distributors,
or otherwise in connection with this License.
1.3. Defensive Termination. If any Licensee, its Affiliates, or its agents
initiates patent litigation or files, maintains, or voluntarily
participates in a lawsuit against another entity or any person asserting
that any Implementation infringes Necessary Claims, any patent licenses
granted under this License directly to the Licensee are immediately
terminated as of the date of the initiation of action unless 1) that suit
was in response to a corresponding suit regarding an Implementation first
brought against an initiating entity, or 2) that suit was brought to
enforce the terms of this License (including intervention in a third-party
action by a Licensee).
1.4. Disclaimers. The Reference Implementation and Specification are provided
"AS IS" and without warranty. The entire risk as to implementing or
otherwise using the Reference Implementation or Specification is assumed
by the implementer and user. Licensor expressly disclaims any warranties
(express, implied, or otherwise), including implied warranties of
merchantability, non-infringement, fitness for a particular purpose, or
title, related to the material. IN NO EVENT WILL LICENSOR BE LIABLE TO
ANY OTHER PARTY FOR LOST PROFITS OR ANY FORM OF INDIRECT, SPECIAL,
INCIDENTAL, OR CONSEQUENTIAL DAMAGES OF ANY CHARACTER FROM ANY CAUSES OF
ACTION OF ANY KIND WITH RESPECT TO THIS LICENSE, WHETHER BASED ON BREACH
OF CONTRACT, TORT (INCLUDING NEGLIGENCE), OR OTHERWISE, AND WHETHER OR
NOT THE OTHER PARTRY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
2. Definitions.
2.1. Affiliate. "Affiliate" means an entity that directly or indirectly
Controls, is Controlled by, or is under common Control of that party.
2.2. Control. "Control" means direct or indirect control of more than 50% of
the voting power to elect directors of that corporation, or for any other
entity, the power to direct management of such entity.
2.3. Decoder. "Decoder" means any decoder that conforms fully with all
non-optional portions of the Specification.
2.4. Encoder. "Encoder" means any encoder that produces a bitstream that can
be decoded by a Decoder only to the extent it produces such a bitstream.
2.5. Final Deliverable. "Final Deliverable" means the final version of a
deliverable approved by the Alliance for Open Media as a Final
Deliverable.
2.6. Implementation. "Implementation" means any implementation, including the
Reference Implementation, that is an Encoder and/or a Decoder. An
Implementation also includes components of an Implementation only to the
extent they are used as part of an Implementation.
2.7. License. "License" means this license.
2.8. Licensee. "Licensee" means any person or entity who exercises patent
rights granted under this License.
2.9. Licensor. "Licensor" means (i) any Licensee that makes, sells, offers
for sale, imports or distributes any Implementation, or (ii) a person
or entity that has a licensing obligation to the Implementation as a
result of its membership and/or participation in the Alliance for Open
Media working group that developed the Specification.
2.10. Necessary Claims. "Necessary Claims" means all claims of patents or
patent applications, (a) that currently or at any time in the future,
are owned or controlled by the Licensor, and (b) (i) would be an
Essential Claim as defined by the W3C Policy as of February 5, 2004
(https://www.w3.org/Consortium/Patent-Policy-20040205/#def-essential)
as if the Specification was a W3C Recommendation; or (ii) are infringed
by the Reference Implementation.
2.11. Reference Implementation. "Reference Implementation" means an Encoder
and/or Decoder released by the Alliance for Open Media as a Final
Deliverable.
2.12. Specification. "Specification" means the specification designated by
the Alliance for Open Media as a Final Deliverable for which this
License was issued.
"These implementations" means the copyrightable works that implement the WebM
codecs distributed by Google as part of the WebM Project.
Google hereby grants to you a perpetual, worldwide, non-exclusive, no-charge,
royalty-free, irrevocable (except as stated in this section) patent license to
make, have made, use, offer to sell, sell, import, transfer, and otherwise
run, modify and propagate the contents of these implementations of WebM, where
such license applies only to those patent claims, both currently owned by
Google and acquired in the future, licensable by Google that are necessarily
infringed by these implementations of WebM. This grant does not include claims
that would be infringed only as a consequence of further modification of these
implementations. If you or your agent or exclusive licensee institute or order
or agree to the institution of patent litigation or any other patent
enforcement activity against any entity (including a cross-claim or
counterclaim in a lawsuit) alleging that any of these implementations of WebM
or any code incorporated within any of these implementations of WebM
constitute direct or contributory patent infringement, or inducement of
patent infringement, then any patent rights granted to you under this License
for these implementations of WebM shall terminate as of the date such
litigation is filed.

README

@@ -1,6 +1,6 @@
README - 23 March 2015
README - 20 July 2016
Welcome to the WebM VP8/AV1 Codec SDK!
Welcome to the WebM VP8/VP9 Codec SDK!
COMPILING THE APPLICATIONS/LIBRARIES:
The build system used is similar to autotools. Building generally consists of
@@ -33,13 +33,13 @@ COMPILING THE APPLICATIONS/LIBRARIES:
$ mkdir build
$ cd build
$ ../libaom/configure <options>
$ ../libvpx/configure <options>
$ make
3. Configuration options
The 'configure' script supports a number of options. The --help option can be
used to get a list of supported options:
$ ../libaom/configure --help
$ ../libvpx/configure --help
4. Cross development
For cross development, the most notable option is the --target option. The
@@ -47,10 +47,8 @@ COMPILING THE APPLICATIONS/LIBRARIES:
--help output of the configure script. As of this writing, the list of
available targets is:
armv6-linux-rvct
armv6-linux-gcc
armv6-none-rvct
arm64-darwin-gcc
arm64-linux-gcc
armv7-android-gcc
armv7-darwin-gcc
armv7-linux-rvct
@@ -60,6 +58,7 @@ COMPILING THE APPLICATIONS/LIBRARIES:
armv7-win32-vs12
armv7-win32-vs14
armv7s-darwin-gcc
armv8-linux-gcc
mips32-linux-gcc
mips64-linux-gcc
sparc-solaris-gcc
@@ -73,6 +72,7 @@ COMPILING THE APPLICATIONS/LIBRARIES:
x86-darwin12-gcc
x86-darwin13-gcc
x86-darwin14-gcc
x86-darwin15-gcc
x86-iphonesimulator-gcc
x86-linux-gcc
x86-linux-icc
@@ -90,6 +90,7 @@ COMPILING THE APPLICATIONS/LIBRARIES:
x86_64-darwin12-gcc
x86_64-darwin13-gcc
x86_64-darwin14-gcc
x86_64-darwin15-gcc
x86_64-iphonesimulator-gcc
x86_64-linux-gcc
x86_64-linux-icc
@@ -108,7 +109,7 @@ COMPILING THE APPLICATIONS/LIBRARIES:
toolchain, the following command could be used (note, POSIX SH syntax, adapt
to your shell as necessary):
$ CROSS=mipsel-linux-uclibc- ../libaom/configure
$ CROSS=mipsel-linux-uclibc- ../libvpx/configure
In addition, the executables to be invoked can be overridden by specifying the
environment variables: CC, AR, LD, AS, STRIP, NM. Additional flags can be
@@ -119,13 +120,13 @@ COMPILING THE APPLICATIONS/LIBRARIES:
This defaults to config.log. This should give a good indication of what went
wrong. If not, contact us for support.
VP8/AV1 TEST VECTORS:
VP8/VP9 TEST VECTORS:
The test vectors can be downloaded and verified using the build system after
running configure. To specify an alternate directory the
LIBAOM_TEST_DATA_PATH environment variable can be used.
LIBVPX_TEST_DATA_PATH environment variable can be used.
$ ./configure --enable-unit-tests
$ LIBAOM_TEST_DATA_PATH=../-test-data make testdata
$ LIBVPX_TEST_DATA_PATH=../libvpx-test-data make testdata
CODE STYLE:
The coding style used by this project is enforced with clang-format using the
@@ -144,5 +145,5 @@ CODE STYLE:
SUPPORT
This library is an open source project supported by its community. Please
please email webm-discuss@webmproject.org for help.
email webm-discuss@webmproject.org for help.

aom/aom.h

@@ -1,160 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
/*!\defgroup aom AOM
* \ingroup codecs
* AOM is aom's newest video compression algorithm that uses motion
* compensated prediction, Discrete Cosine Transform (DCT) coding of the
* prediction error signal and context dependent entropy coding techniques
* based on arithmetic principles. It features:
* - YUV 4:2:0 image format
* - Macro-block based coding (16x16 luma plus two 8x8 chroma)
* - 1/4 (1/8) pixel accuracy motion compensated prediction
* - 4x4 DCT transform
* - 128 level linear quantizer
* - In loop deblocking filter
* - Context-based entropy coding
*
* @{
*/
/*!\file
* \brief Provides controls common to both the AOM encoder and decoder.
*/
#ifndef AOM_AOM_H_
#define AOM_AOM_H_
#include "./aom_codec.h"
#include "./aom_image.h"
#ifdef __cplusplus
extern "C" {
#endif
/*!\brief Control functions
*
* The set of macros define the control functions of AOM interface
*/
enum aom_com_control_id {
/*!\brief pass in an external frame into decoder to be used as reference frame
*/
AOM_SET_REFERENCE = 1,
AOM_COPY_REFERENCE = 2, /**< get a copy of reference frame from the decoder */
AOM_SET_POSTPROC = 3, /**< set the decoder's post processing settings */
AOM_SET_DBG_COLOR_REF_FRAME =
4, /**< set the reference frames to color for each macroblock */
AOM_SET_DBG_COLOR_MB_MODES = 5, /**< set which macro block modes to color */
AOM_SET_DBG_COLOR_B_MODES = 6, /**< set which blocks modes to color */
AOM_SET_DBG_DISPLAY_MV = 7, /**< set which motion vector modes to draw */
/* TODO(jkoleszar): The encoder incorrectly reuses some of these values (5+)
* for its control ids. These should be migrated to something like the
* AOM_DECODER_CTRL_ID_START range next time we're ready to break the ABI.
*/
AV1_GET_REFERENCE = 128, /**< get a pointer to a reference frame */
AOM_COMMON_CTRL_ID_MAX,
AV1_GET_NEW_FRAME_IMAGE = 192, /**< get a pointer to the new frame */
AOM_DECODER_CTRL_ID_START = 256
};
/*!\brief post process flags
*
* The set of macros define AOM decoder post processing flags
*/
enum aom_postproc_level {
AOM_NOFILTERING = 0,
AOM_DEBLOCK = 1 << 0,
AOM_DEMACROBLOCK = 1 << 1,
AOM_ADDNOISE = 1 << 2,
AOM_DEBUG_TXT_FRAME_INFO = 1 << 3, /**< print frame information */
AOM_DEBUG_TXT_MBLK_MODES =
1 << 4, /**< print macro block modes over each macro block */
AOM_DEBUG_TXT_DC_DIFF = 1 << 5, /**< print dc diff for each macro block */
AOM_DEBUG_TXT_RATE_INFO = 1 << 6, /**< print video rate info (encoder only) */
AOM_MFQE = 1 << 10
};
/*!\brief post process flags
*
* This define a structure that describe the post processing settings. For
* the best objective measure (using the PSNR metric) set post_proc_flag
* to AOM_DEBLOCK and deblocking_level to 1.
*/
typedef struct aom_postproc_cfg {
/*!\brief the types of post processing to be done, should be combination of
* "aom_postproc_level" */
int post_proc_flag;
int deblocking_level; /**< the strength of deblocking, valid range [0, 16] */
int noise_level; /**< the strength of additive noise, valid range [0, 16] */
} aom_postproc_cfg_t;
/*!\brief reference frame type
*
* The set of macros define the type of AOM reference frames
*/
typedef enum aom_ref_frame_type {
AOM_LAST_FRAME = 1,
AOM_GOLD_FRAME = 2,
AOM_ALTR_FRAME = 4
} aom_ref_frame_type_t;
/*!\brief reference frame data struct
*
* Define the data struct to access aom reference frames.
*/
typedef struct aom_ref_frame {
aom_ref_frame_type_t frame_type; /**< which reference frame */
aom_image_t img; /**< reference frame data in image format */
} aom_ref_frame_t;
/*!\brief AV1 specific reference frame data struct
*
* Define the data struct to access av1 reference frames.
*/
typedef struct av1_ref_frame {
int idx; /**< frame index to get (input) */
aom_image_t img; /**< img structure to populate (output) */
} av1_ref_frame_t;
/*!\cond */
/*!\brief aom decoder control function parameter type
*
* defines the data type for each of AOM decoder control function requires
*/
AOM_CTRL_USE_TYPE(AOM_SET_REFERENCE, aom_ref_frame_t *)
#define AOM_CTRL_AOM_SET_REFERENCE
AOM_CTRL_USE_TYPE(AOM_COPY_REFERENCE, aom_ref_frame_t *)
#define AOM_CTRL_AOM_COPY_REFERENCE
AOM_CTRL_USE_TYPE(AOM_SET_POSTPROC, aom_postproc_cfg_t *)
#define AOM_CTRL_AOM_SET_POSTPROC
AOM_CTRL_USE_TYPE(AOM_SET_DBG_COLOR_REF_FRAME, int)
#define AOM_CTRL_AOM_SET_DBG_COLOR_REF_FRAME
AOM_CTRL_USE_TYPE(AOM_SET_DBG_COLOR_MB_MODES, int)
#define AOM_CTRL_AOM_SET_DBG_COLOR_MB_MODES
AOM_CTRL_USE_TYPE(AOM_SET_DBG_COLOR_B_MODES, int)
#define AOM_CTRL_AOM_SET_DBG_COLOR_B_MODES
AOM_CTRL_USE_TYPE(AOM_SET_DBG_DISPLAY_MV, int)
#define AOM_CTRL_AOM_SET_DBG_DISPLAY_MV
AOM_CTRL_USE_TYPE(AV1_GET_REFERENCE, av1_ref_frame_t *)
#define AOM_CTRL_AV1_GET_REFERENCE
AOM_CTRL_USE_TYPE(AV1_GET_NEW_FRAME_IMAGE, aom_image_t *)
#define AOM_CTRL_AV1_GET_NEW_FRAME_IMAGE
/*!\endcond */
/*! @} - end defgroup aom */
#ifdef __cplusplus
} // extern "C"
#endif
#endif // AOM_AOM_H_

aom/aom_codec.mk

@@ -1,42 +0,0 @@
##
## Copyright (c) 2016, Alliance for Open Media. All rights reserved
##
## This source code is subject to the terms of the BSD 2 Clause License and
## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
## was not distributed with this source code in the LICENSE file, you can
## obtain it at www.aomedia.org/license/software. If the Alliance for Open
## Media Patent License 1.0 was not distributed with this source code in the
## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
##
API_EXPORTS += exports
API_SRCS-$(CONFIG_AV1_ENCODER) += aom.h
API_SRCS-$(CONFIG_AV1_ENCODER) += aomcx.h
API_DOC_SRCS-$(CONFIG_AV1_ENCODER) += aom.h
API_DOC_SRCS-$(CONFIG_AV1_ENCODER) += aomcx.h
API_SRCS-$(CONFIG_AV1_DECODER) += aom.h
API_SRCS-$(CONFIG_AV1_DECODER) += aomdx.h
API_DOC_SRCS-$(CONFIG_AV1_DECODER) += aom.h
API_DOC_SRCS-$(CONFIG_AV1_DECODER) += aomdx.h
API_DOC_SRCS-yes += aom_codec.h
API_DOC_SRCS-yes += aom_decoder.h
API_DOC_SRCS-yes += aom_encoder.h
API_DOC_SRCS-yes += aom_frame_buffer.h
API_DOC_SRCS-yes += aom_image.h
API_SRCS-yes += src/aom_decoder.c
API_SRCS-yes += aom_decoder.h
API_SRCS-yes += src/aom_encoder.c
API_SRCS-yes += aom_encoder.h
API_SRCS-yes += internal/aom_codec_internal.h
API_SRCS-yes += src/aom_codec.c
API_SRCS-yes += src/aom_image.c
API_SRCS-yes += aom_codec.h
API_SRCS-yes += aom_codec.mk
API_SRCS-yes += aom_frame_buffer.h
API_SRCS-yes += aom_image.h
API_SRCS-yes += aom_integer.h

aom/aomcx.h

@@ -1,759 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifndef AOM_AOMCX_H_
#define AOM_AOMCX_H_
/*!\defgroup aom_encoder AOMedia AOM/AV1 Encoder
* \ingroup aom
*
* @{
*/
#include "./aom.h"
#include "./aom_encoder.h"
/*!\file
* \brief Provides definitions for using AOM or AV1 encoder algorithm within the
* aom Codec Interface.
*/
#ifdef __cplusplus
extern "C" {
#endif
/*!\name Algorithm interface for AV1
*
* This interface provides the capability to encode raw AV1 streams.
* @{
*/
extern aom_codec_iface_t aom_codec_av1_cx_algo;
extern aom_codec_iface_t *aom_codec_av1_cx(void);
/*!@} - end algorithm interface member group*/
/*
* Algorithm Flags
*/
/*!\brief Don't reference the last frame
*
* When this flag is set, the encoder will not use the last frame as a
* predictor. When not set, the encoder will choose whether to use the
* last frame or not automatically.
*/
#define AOM_EFLAG_NO_REF_LAST (1 << 16)
/*!\brief Don't reference the golden frame
*
* When this flag is set, the encoder will not use the golden frame as a
* predictor. When not set, the encoder will choose whether to use the
* golden frame or not automatically.
*/
#define AOM_EFLAG_NO_REF_GF (1 << 17)
/*!\brief Don't reference the alternate reference frame
*
* When this flag is set, the encoder will not use the alt ref frame as a
* predictor. When not set, the encoder will choose whether to use the
* alt ref frame or not automatically.
*/
#define AOM_EFLAG_NO_REF_ARF (1 << 21)
/*!\brief Don't update the last frame
*
* When this flag is set, the encoder will not update the last frame with
* the contents of the current frame.
*/
#define AOM_EFLAG_NO_UPD_LAST (1 << 18)
/*!\brief Don't update the golden frame
*
* When this flag is set, the encoder will not update the golden frame with
* the contents of the current frame.
*/
#define AOM_EFLAG_NO_UPD_GF (1 << 22)
/*!\brief Don't update the alternate reference frame
*
* When this flag is set, the encoder will not update the alt ref frame with
* the contents of the current frame.
*/
#define AOM_EFLAG_NO_UPD_ARF (1 << 23)
/*!\brief Force golden frame update
*
* When this flag is set, the encoder copy the contents of the current frame
* to the golden frame buffer.
*/
#define AOM_EFLAG_FORCE_GF (1 << 19)
/*!\brief Force alternate reference frame update
*
* When this flag is set, the encoder copy the contents of the current frame
* to the alternate reference frame buffer.
*/
#define AOM_EFLAG_FORCE_ARF (1 << 24)
/*!\brief Disable entropy update
*
* When this flag is set, the encoder will not update its internal entropy
* model based on the entropy of this frame.
*/
#define AOM_EFLAG_NO_UPD_ENTROPY (1 << 20)
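/* Usage sketch (illustrative): the AOM_EFLAG_* values above are OR'd together
* and passed in the `flags` argument of aom_codec_encode(), declared in
* aom_encoder.h; `ctx`, `img` and `pts` below are assumed to come from an
* already-initialized encoder. For example, to encode a frame that neither
* references nor updates the golden frame:
*
*   const aom_enc_frame_flags_t flags = AOM_EFLAG_NO_REF_GF |
*                                       AOM_EFLAG_NO_UPD_GF;
*   aom_codec_encode(&ctx, img, pts, 1, flags, AOM_DL_GOOD_QUALITY);
*/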
/*!\brief AVx encoder control functions
*
* This set of macros defines the control functions available for the AVx
* encoder interface.
*
* \sa #aom_codec_control
*/
enum aome_enc_control_id {
/*!\brief Codec control function to set which reference frames the encoder can use.
*
* Supported in codecs: VP8, AV1
*/
AOME_USE_REFERENCE = 7,
/*!\brief Codec control function to pass an ROI map to the encoder.
*
* Supported in codecs: VP8, AV1
*/
AOME_SET_ROI_MAP = 8,
/*!\brief Codec control function to pass an Active map to the encoder.
*
* Supported in codecs: VP8, AV1
*/
AOME_SET_ACTIVEMAP,
/*!\brief Codec control function to set encoder scaling mode.
*
* Supported in codecs: VP8, AV1
*/
AOME_SET_SCALEMODE = 11,
/*!\brief Codec control function to set encoder internal speed settings.
*
* Changes in this value influence, among other things, the encoder's selection
* of motion estimation methods. Values greater than 0 will increase encoder
* speed at the expense of quality.
*
* \note Valid range for VP8: -16..16
* \note Valid range for AV1: -8..8
*
* Supported in codecs: VP8, AV1
*/
AOME_SET_CPUUSED = 13,
/*!\brief Codec control function to enable automatic setting and use of alt-ref frames.
*
* Supported in codecs: VP8, AV1
*/
AOME_SET_ENABLEAUTOALTREF,
#if CONFIG_EXT_REFS
/*!\brief Codec control function to enable automatic setting and use of
* bwd-pred frames.
*
* Supported in codecs: AV1
*/
AOME_SET_ENABLEAUTOBWDREF,
#endif // CONFIG_EXT_REFS
/*!\brief control function to set noise sensitivity
*
* 0: off, 1: OnYOnly, 2: OnYUV,
* 3: OnYUVAggressive, 4: Adaptive
*
* Supported in codecs: VP8
*/
AOME_SET_NOISE_SENSITIVITY,
/*!\brief Codec control function to set sharpness.
*
* Supported in codecs: VP8, AV1
*/
AOME_SET_SHARPNESS,
/*!\brief Codec control function to set the threshold for MBs treated as static.
*
* Supported in codecs: VP8, AV1
*/
AOME_SET_STATIC_THRESHOLD,
/*!\brief Codec control function to set the number of token partitions.
*
* Supported in codecs: VP8
*/
AOME_SET_TOKEN_PARTITIONS,
/*!\brief Codec control function to get last quantizer chosen by the encoder.
*
* The return value uses the internal quantizer scale defined by the codec.
*
* Supported in codecs: VP8, AV1
*/
AOME_GET_LAST_QUANTIZER,
/*!\brief Codec control function to get last quantizer chosen by the encoder.
*
* Return value uses the 0..63 scale as used by the rc_*_quantizer config
* parameters.
*
* Supported in codecs: VP8, AV1
*/
AOME_GET_LAST_QUANTIZER_64,
/*!\brief Codec control function to set the maximum number of frames used to create an arf.
*
* Supported in codecs: VP8, AV1
*/
AOME_SET_ARNR_MAXFRAMES,
/*!\brief Codec control function to set the filter strength for the arf.
*
* Supported in codecs: VP8, AV1
*/
AOME_SET_ARNR_STRENGTH,
/*!\deprecated control function to set the filter type to use for the arf. */
AOME_SET_ARNR_TYPE,
/*!\brief Codec control function to set visual tuning.
*
* Supported in codecs: VP8, AV1
*/
AOME_SET_TUNING,
/*!\brief Codec control function to set constrained quality level.
*
* \attention For this value to be used, aom_codec_enc_cfg_t::g_usage must be
* set to #AOM_CQ.
* \note Valid range: 0..63
*
* Supported in codecs: VP8, AV1
*/
AOME_SET_CQ_LEVEL,
/*!\brief Codec control function to set Max data rate for Intra frames.
*
* This value controls additional clamping on the maximum size of a
* keyframe. It is expressed as a percentage of the average
* per-frame bitrate, with the special (and default) value 0 meaning
* unlimited, or no additional clamping beyond the codec's built-in
* algorithm.
*
* For example, to allocate no more than 4.5 frames worth of bitrate
* to a keyframe, set this to 450.
*
* Supported in codecs: VP8, AV1
*/
AOME_SET_MAX_INTRA_BITRATE_PCT,
/*!\brief Codec control function to set reference and update frame flags.
*
* Supported in codecs: VP8
*/
AOME_SET_FRAME_FLAGS,
/*!\brief Codec control function to set max data rate for Inter frames.
*
* This value controls additional clamping on the maximum size of an
* inter frame. It is expressed as a percentage of the average
* per-frame bitrate, with the special (and default) value 0 meaning
* unlimited, or no additional clamping beyond the codec's built-in
* algorithm.
*
* For example, to allow no more than 4.5 frames worth of bitrate
* to an inter frame, set this to 450.
*
* Supported in codecs: AV1
*/
AV1E_SET_MAX_INTER_BITRATE_PCT,
/*!\brief Boost percentage for Golden Frame in CBR mode.
*
* This value controls the amount of boost given to Golden Frame in
* CBR mode. It is expressed as a percentage of the average
* per-frame bitrate, with the special (and default) value 0 meaning
* the feature is off, i.e., no golden frame boost in CBR mode and
* average bitrate target is used.
*
* For example, to allow 100% more bits, i.e, 2X, in a golden frame
* than average frame, set this to 100.
*
* Supported in codecs: AV1
*/
AV1E_SET_GF_CBR_BOOST_PCT,
/*!\brief Codec control function to set encoder screen content mode.
*
* 0: off, 1: On, 2: On with more aggressive rate control.
*
* Supported in codecs: VP8
*/
AOME_SET_SCREEN_CONTENT_MODE,
/*!\brief Codec control function to set lossless encoding mode.
*
* AV1 can operate in lossless encoding mode, in which the bitstream
* produced will be able to decode and reconstruct a perfect copy of
* input source. This control function provides a means to switch the encoder
* into lossless coding mode (1) or normal coding mode (0), which may be lossy.
* 0 = lossy coding mode
* 1 = lossless coding mode
*
* By default, the encoder operates in normal coding mode (which may be lossy).
*
* Supported in codecs: AV1
*/
AV1E_SET_LOSSLESS,
#if CONFIG_AOM_QM
/*!\brief Codec control function to encode with quantisation matrices.
*
* AOM can operate with default quantisation matrices dependent on
* quantisation level and block type.
* 0 = do not use quantisation matrices
* 1 = use quantisation matrices
*
* By default, the encoder operates without quantisation matrices.
*
* Supported in codecs: AOM
*/
AV1E_SET_ENABLE_QM,
/*!\brief Codec control function to set the min quant matrix flatness.
*
* AOM can operate with different ranges of quantisation matrices.
* As quantisation levels increase, the matrices get flatter. This
* control sets the minimum level of flatness from which the matrices
* are determined.
*
* By default, the encoder sets this minimum at half the available
* range.
*
* Supported in codecs: AOM
*/
AV1E_SET_QM_MIN,
/*!\brief Codec control function to set the max quant matrix flatness.
*
* AOM can operate with different ranges of quantisation matrices.
* As quantisation levels increase, the matrices get flatter. This
* control sets the maximum level of flatness possible.
*
* By default, the encoder sets this maximum at the top of the
* available range.
*
* Supported in codecs: AOM
*/
AV1E_SET_QM_MAX,
#endif
/*!\brief Codec control function to set number of tile columns.
*
* In encoding and decoding, AV1 allows an input image frame to be partitioned
* into separate vertical tile columns, which can be encoded or decoded
* independently. This enables easy implementation of parallel encoding and
* decoding. This control requests the encoder to use column tiles in
* encoding an input frame, with number of tile columns (in Log2 unit) as
* the parameter:
* 0 = 1 tile column
* 1 = 2 tile columns
* 2 = 4 tile columns
* .....
* n = 2**n tile columns
* The requested number of tile columns will be capped by the encoder based on
* the image size (the minimum width of a tile column is 256 pixels, the
* maximum is 4096).
*
* By default, the value is 0, i.e. a single tile column for the entire image.
*
* Supported in codecs: AV1
*/
AV1E_SET_TILE_COLUMNS,
/*!\brief Codec control function to set number of tile rows.
*
* In encoding and decoding, AV1 allows an input image frame to be partitioned
* into separate horizontal tile rows. Tile rows are encoded or decoded
* sequentially. Even though encoding/decoding of later tile rows depends on
* earlier ones, this allows the encoder to output data packets for tile rows
* prior to completely processing all tile rows in a frame, thereby reducing
* the latency in processing between input and output. The parameter
* for this control describes the number of tile rows, which has a valid
* range [0, 2]:
* 0 = 1 tile row
* 1 = 2 tile rows
* 2 = 4 tile rows
*
* By default, the value is 0, i.e. a single tile row for the entire image.
*
* Supported in codecs: AV1
*/
AV1E_SET_TILE_ROWS,
/*!\brief Codec control function to enable frame parallel decoding feature.
*
* AV1 has a bitstream feature to reduce decoding dependency between frames
* by turning off backward update of probability context used in encoding
* and decoding. This allows staged parallel processing of more than one
* video frames in the decoder. This control function provides a means to
* turn this feature on or off for bitstreams produced by the encoder.
*
* By default, this feature is off.
*
* Supported in codecs: AV1
*/
AV1E_SET_FRAME_PARALLEL_DECODING,
/*!\brief Codec control function to set adaptive quantization mode.
*
* AV1 has a segment-based feature that allows the encoder to adaptively change
* the quantization parameter for each segment within a frame to improve the
* subjective quality. This control makes the encoder operate in one of
* several supported AQ modes.
*
* By default, the encoder operates with AQ mode 0 (adaptive quantization off).
*
* Supported in codecs: AV1
*/
AV1E_SET_AQ_MODE,
/*!\brief Codec control function to enable/disable periodic Q boost.
*
* One AV1 encoder speed feature is to enable quality boost by lowering
* frame level Q periodically. This control function provides a means to
* turn this feature on or off.
* 0 = off
* 1 = on
*
* By default, the encoder is allowed to use this feature for appropriate
* encoding modes.
*
* Supported in codecs: AV1
*/
AV1E_SET_FRAME_PERIODIC_BOOST,
/*!\brief Codec control function to set noise sensitivity.
*
* 0: off, 1: On(YOnly)
*
* Supported in codecs: AV1
*/
AV1E_SET_NOISE_SENSITIVITY,
/*!\brief Codec control function to set content type.
* \note Valid parameter range:
* AOM_CONTENT_DEFAULT = Regular video content (Default)
* AOM_CONTENT_SCREEN = Screen capture content
*
* Supported in codecs: AV1
*/
AV1E_SET_TUNE_CONTENT,
/*!\brief Codec control function to set color space info.
* \note Valid ranges: 0..7, default is "UNKNOWN".
* 0 = UNKNOWN,
* 1 = BT_601
* 2 = BT_709
* 3 = SMPTE_170
* 4 = SMPTE_240
* 5 = BT_2020
* 6 = RESERVED
* 7 = SRGB
*
* Supported in codecs: AV1
*/
AV1E_SET_COLOR_SPACE,
/*!\brief Codec control function to set minimum interval between GF/ARF frames
*
* By default the value is set to 4.
*
* Supported in codecs: AV1
*/
AV1E_SET_MIN_GF_INTERVAL,
/*!\brief Codec control function to set maximum interval between GF/ARF frames
*
* By default the value is set to 16.
*
* Supported in codecs: AV1
*/
AV1E_SET_MAX_GF_INTERVAL,
/*!\brief Codec control function to get an Active map back from the encoder.
*
* Supported in codecs: AV1
*/
AV1E_GET_ACTIVEMAP,
/*!\brief Codec control function to set color range bit.
* \note Valid ranges: 0..1, default is 0
* 0 = Limited range (16..235 or HBD equivalent)
* 1 = Full range (0..255 or HBD equivalent)
*
* Supported in codecs: AV1
*/
AV1E_SET_COLOR_RANGE,
/*!\brief Codec control function to set intended rendering image size.
*
* By default, this is identical to the image size in pixels.
*
* Supported in codecs: AV1
*/
AV1E_SET_RENDER_SIZE,
/*!\brief Codec control function to set target level.
*
* 255: off (default); 0: only keep level stats; 10: target for level 1.0;
* 11: target for level 1.1; ... 62: target for level 6.2
*
* Supported in codecs: AV1
*/
AV1E_SET_TARGET_LEVEL,
/*!\brief Codec control function to get bitstream level.
*
* Supported in codecs: AV1
*/
AV1E_GET_LEVEL,
/*!\brief Codec control function to set intended superblock size.
*
* By default, the superblock size is determined separately for each
* frame by the encoder.
*
* Supported in codecs: AV1
*/
AV1E_SET_SUPERBLOCK_SIZE,
};
/*!\brief aom 1-D scaling mode
*
* This set of constants defines the 1-D aom scaling modes
*/
typedef enum aom_scaling_mode_1d {
AOME_NORMAL = 0,
AOME_FOURFIVE = 1,
AOME_THREEFIVE = 2,
AOME_ONETWO = 3
} AOM_SCALING_MODE;
/*!\brief aom region of interest map
*
* This defines the data structures for the region-of-interest map
*
*/
typedef struct aom_roi_map {
/*! An id between 0 and 3 for each 16x16 region within a frame. */
unsigned char *roi_map;
unsigned int rows; /**< Number of rows. */
unsigned int cols; /**< Number of columns. */
// TODO(paulwilkins): broken for AV1 which has 8 segments
// q and loop filter deltas for each segment
// (see MAX_MB_SEGMENTS)
int delta_q[4]; /**< Quantizer deltas. */
int delta_lf[4]; /**< Loop filter deltas. */
/*! Static breakout threshold for each segment. */
unsigned int static_threshold[4];
} aom_roi_map_t;
/*!\brief aom active region map
*
* This defines the data structures for the active region map
*
*/
typedef struct aom_active_map {
/*!\brief Specifies on (1) or off (0) for each 16x16 region within a frame */
unsigned char *active_map;
unsigned int rows; /**< number of rows */
unsigned int cols; /**< number of cols */
} aom_active_map_t;
/*!\brief aom image scaling mode
*
* This defines the data structure for image scaling mode
*
*/
typedef struct aom_scaling_mode {
AOM_SCALING_MODE h_scaling_mode; /**< horizontal scaling mode */
AOM_SCALING_MODE v_scaling_mode; /**< vertical scaling mode */
} aom_scaling_mode_t;
/*!\brief VP8 token partition mode
*
* This defines the VP8 partitioning mode for compressed data, i.e., the number of
* sub-streams in the bitstream. Used for parallelized decoding.
*
*/
typedef enum {
AOM_ONE_TOKENPARTITION = 0,
AOM_TWO_TOKENPARTITION = 1,
AOM_FOUR_TOKENPARTITION = 2,
AOM_EIGHT_TOKENPARTITION = 3
} aome_token_partitions;
/*!\brief AV1 encoder content type */
typedef enum {
AOM_CONTENT_DEFAULT,
AOM_CONTENT_SCREEN,
AOM_CONTENT_INVALID
} aom_tune_content;
/*!\brief VP8 model tuning parameters
*
* Changes the encoder to tune for certain types of input material.
*
*/
typedef enum { AOM_TUNE_PSNR, AOM_TUNE_SSIM } aom_tune_metric;
/*!\cond */
/*!\brief AOM encoder control function parameter type
*
* Defines the data types that AOME/AV1E control functions take. Note that
* additional common controls are defined in aom.h
*
*/
AOM_CTRL_USE_TYPE_DEPRECATED(AOME_USE_REFERENCE, int)
#define AOM_CTRL_AOME_USE_REFERENCE
AOM_CTRL_USE_TYPE(AOME_SET_FRAME_FLAGS, int)
#define AOM_CTRL_AOME_SET_FRAME_FLAGS
AOM_CTRL_USE_TYPE(AOME_SET_ROI_MAP, aom_roi_map_t *)
#define AOM_CTRL_AOME_SET_ROI_MAP
AOM_CTRL_USE_TYPE(AOME_SET_ACTIVEMAP, aom_active_map_t *)
#define AOM_CTRL_AOME_SET_ACTIVEMAP
AOM_CTRL_USE_TYPE(AOME_SET_SCALEMODE, aom_scaling_mode_t *)
#define AOM_CTRL_AOME_SET_SCALEMODE
AOM_CTRL_USE_TYPE(AOME_SET_CPUUSED, int)
#define AOM_CTRL_AOME_SET_CPUUSED
AOM_CTRL_USE_TYPE(AOME_SET_ENABLEAUTOALTREF, unsigned int)
#define AOM_CTRL_AOME_SET_ENABLEAUTOALTREF
#if CONFIG_EXT_REFS
AOM_CTRL_USE_TYPE(AOME_SET_ENABLEAUTOBWDREF, unsigned int)
#define AOM_CTRL_AOME_SET_ENABLEAUTOBWDREF
#endif // CONFIG_EXT_REFS
AOM_CTRL_USE_TYPE(AOME_SET_NOISE_SENSITIVITY, unsigned int)
#define AOM_CTRL_AOME_SET_NOISE_SENSITIVITY
AOM_CTRL_USE_TYPE(AOME_SET_SHARPNESS, unsigned int)
#define AOM_CTRL_AOME_SET_SHARPNESS
AOM_CTRL_USE_TYPE(AOME_SET_STATIC_THRESHOLD, unsigned int)
#define AOM_CTRL_AOME_SET_STATIC_THRESHOLD
AOM_CTRL_USE_TYPE(AOME_SET_TOKEN_PARTITIONS, int) /* aome_token_partitions */
#define AOM_CTRL_AOME_SET_TOKEN_PARTITIONS
AOM_CTRL_USE_TYPE(AOME_SET_ARNR_MAXFRAMES, unsigned int)
#define AOM_CTRL_AOME_SET_ARNR_MAXFRAMES
AOM_CTRL_USE_TYPE(AOME_SET_ARNR_STRENGTH, unsigned int)
#define AOM_CTRL_AOME_SET_ARNR_STRENGTH
AOM_CTRL_USE_TYPE_DEPRECATED(AOME_SET_ARNR_TYPE, unsigned int)
#define AOM_CTRL_AOME_SET_ARNR_TYPE
AOM_CTRL_USE_TYPE(AOME_SET_TUNING, int) /* aom_tune_metric */
#define AOM_CTRL_AOME_SET_TUNING
AOM_CTRL_USE_TYPE(AOME_SET_CQ_LEVEL, unsigned int)
#define AOM_CTRL_AOME_SET_CQ_LEVEL
AOM_CTRL_USE_TYPE(AV1E_SET_TILE_COLUMNS, int)
#define AOM_CTRL_AV1E_SET_TILE_COLUMNS
AOM_CTRL_USE_TYPE(AV1E_SET_TILE_ROWS, int)
#define AOM_CTRL_AV1E_SET_TILE_ROWS
AOM_CTRL_USE_TYPE(AOME_GET_LAST_QUANTIZER, int *)
#define AOM_CTRL_AOME_GET_LAST_QUANTIZER
AOM_CTRL_USE_TYPE(AOME_GET_LAST_QUANTIZER_64, int *)
#define AOM_CTRL_AOME_GET_LAST_QUANTIZER_64
AOM_CTRL_USE_TYPE(AOME_SET_MAX_INTRA_BITRATE_PCT, unsigned int)
#define AOM_CTRL_AOME_SET_MAX_INTRA_BITRATE_PCT
AOM_CTRL_USE_TYPE(AOME_SET_MAX_INTER_BITRATE_PCT, unsigned int)
#define AOM_CTRL_AOME_SET_MAX_INTER_BITRATE_PCT
AOM_CTRL_USE_TYPE(AOME_SET_SCREEN_CONTENT_MODE, unsigned int)
#define AOM_CTRL_AOME_SET_SCREEN_CONTENT_MODE
AOM_CTRL_USE_TYPE(AV1E_SET_GF_CBR_BOOST_PCT, unsigned int)
#define AOM_CTRL_AV1E_SET_GF_CBR_BOOST_PCT
AOM_CTRL_USE_TYPE(AV1E_SET_LOSSLESS, unsigned int)
#define AOM_CTRL_AV1E_SET_LOSSLESS
#if CONFIG_AOM_QM
AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_QM, unsigned int)
#define AOM_CTRL_AV1E_SET_ENABLE_QM
AOM_CTRL_USE_TYPE(AV1E_SET_QM_MIN, unsigned int)
#define AOM_CTRL_AV1E_SET_QM_MIN
AOM_CTRL_USE_TYPE(AV1E_SET_QM_MAX, unsigned int)
#define AOM_CTRL_AV1E_SET_QM_MAX
#endif
AOM_CTRL_USE_TYPE(AV1E_SET_FRAME_PARALLEL_DECODING, unsigned int)
#define AOM_CTRL_AV1E_SET_FRAME_PARALLEL_DECODING
AOM_CTRL_USE_TYPE(AV1E_SET_AQ_MODE, unsigned int)
#define AOM_CTRL_AV1E_SET_AQ_MODE
AOM_CTRL_USE_TYPE(AV1E_SET_FRAME_PERIODIC_BOOST, unsigned int)
#define AOM_CTRL_AV1E_SET_FRAME_PERIODIC_BOOST
AOM_CTRL_USE_TYPE(AV1E_SET_NOISE_SENSITIVITY, unsigned int)
#define AOM_CTRL_AV1E_SET_NOISE_SENSITIVITY
AOM_CTRL_USE_TYPE(AV1E_SET_TUNE_CONTENT, int) /* aom_tune_content */
#define AOM_CTRL_AV1E_SET_TUNE_CONTENT
AOM_CTRL_USE_TYPE(AV1E_SET_COLOR_SPACE, int)
#define AOM_CTRL_AV1E_SET_COLOR_SPACE
AOM_CTRL_USE_TYPE(AV1E_SET_MIN_GF_INTERVAL, unsigned int)
#define AOM_CTRL_AV1E_SET_MIN_GF_INTERVAL
AOM_CTRL_USE_TYPE(AV1E_SET_MAX_GF_INTERVAL, unsigned int)
#define AOM_CTRL_AV1E_SET_MAX_GF_INTERVAL
AOM_CTRL_USE_TYPE(AV1E_GET_ACTIVEMAP, aom_active_map_t *)
#define AOM_CTRL_AV1E_GET_ACTIVEMAP
AOM_CTRL_USE_TYPE(AV1E_SET_COLOR_RANGE, int)
#define AOM_CTRL_AV1E_SET_COLOR_RANGE
/*!\brief
*
* TODO(rbultje): add support for this control in ffmpeg
*/
#define AOM_CTRL_AV1E_SET_RENDER_SIZE
AOM_CTRL_USE_TYPE(AV1E_SET_RENDER_SIZE, int *)
AOM_CTRL_USE_TYPE(AV1E_SET_SUPERBLOCK_SIZE, unsigned int)
#define AOM_CTRL_AV1E_SET_SUPERBLOCK_SIZE
AOM_CTRL_USE_TYPE(AV1E_SET_TARGET_LEVEL, unsigned int)
#define AOM_CTRL_AV1E_SET_TARGET_LEVEL
AOM_CTRL_USE_TYPE(AV1E_GET_LEVEL, int *)
#define AOM_CTRL_AV1E_GET_LEVEL
/*!\endcond */
/*! @} - end defgroup aom_encoder */
#ifdef __cplusplus
} // extern "C"
#endif
#endif // AOM_AOMCX_H_
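A minimal usage sketch for the controls above, assuming an encoder context already opened with aom_codec_enc_init() and the aom_codec_control() macro from aom_codec.h (error handling omitted). Note that AV1E_SET_TILE_COLUMNS takes log2 units, so the value 2 requests four tile columns:

#include "aom/aomcx.h"

static void configure_encoder(aom_codec_ctx_t *ctx) {
  /* Trade quality for speed; the valid AV1 range is -8..8. */
  aom_codec_control(ctx, AOME_SET_CPUUSED, 4);
  /* Log2 units: 2 => 4 tile columns (capped by the encoder for small frames). */
  aom_codec_control(ctx, AV1E_SET_TILE_COLUMNS, 2);
  /* Cap keyframes at 4.5x the average per-frame bitrate. */
  aom_codec_control(ctx, AOME_SET_MAX_INTRA_BITRATE_PCT, 450);
}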


@@ -1,191 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
/*!\defgroup aom_decoder AOMedia AOM/AV1 Decoder
* \ingroup aom
*
* @{
*/
/*!\file
* \brief Provides definitions for using AOM or AV1 within the aom Decoder
* interface.
*/
#ifndef AOM_AOMDX_H_
#define AOM_AOMDX_H_
#ifdef __cplusplus
extern "C" {
#endif
/* Include controls common to both the encoder and decoder */
#include "./aom.h"
/*!\name Algorithm interface for AV1
*
* This interface provides the capability to decode AV1 streams.
* @{
*/
extern aom_codec_iface_t aom_codec_av1_dx_algo;
extern aom_codec_iface_t *aom_codec_av1_dx(void);
/*!@} - end algorithm interface member group*/
/** Data structure that stores bit accounting for debugging
*/
typedef struct Accounting Accounting;
/*!\enum aom_dec_control_id
* \brief AOM decoder control functions
*
* This set of macros defines the control functions available for the AOM
* decoder interface.
*
* \sa #aom_codec_control
*/
enum aom_dec_control_id {
/** control function to get info on which reference frames were updated
* by the last decode
*/
AOMD_GET_LAST_REF_UPDATES = AOM_DECODER_CTRL_ID_START,
/** check if the indicated frame is corrupted */
AOMD_GET_FRAME_CORRUPTED,
/** control function to get info on which reference frames were used
* by the last decode
*/
AOMD_GET_LAST_REF_USED,
/** decryption function to decrypt encoded buffer data immediately
* before decoding. Takes an aom_decrypt_init, which contains
* a callback function and opaque context pointer.
*/
AOMD_SET_DECRYPTOR,
/** control function to get the dimensions that the current frame is decoded
* at. This may be different to the intended display size for the frame as
* specified in the wrapper or frame header (see AV1D_GET_DISPLAY_SIZE). */
AV1D_GET_FRAME_SIZE,
/** control function to get the current frame's intended display dimensions
* (as specified in the wrapper or frame header). This may be different to
* the decoded dimensions of this frame (see AV1D_GET_FRAME_SIZE). */
AV1D_GET_DISPLAY_SIZE,
/** control function to get the bit depth of the stream. */
AV1D_GET_BIT_DEPTH,
/** control function to set the byte alignment of the planes in the reference
* buffers. Valid values are powers of 2, from 32 to 1024. A value of 0 sets
* legacy alignment, i.e. the Y plane is aligned to 32 bytes, the U plane
* directly follows the Y plane, and the V plane directly follows the U plane.
* The default value is 0.
*/
AV1_SET_BYTE_ALIGNMENT,
/** control function to invert the decoding order to be from right to left.
* The function is used in a test to confirm the decoding independence of tile
* columns. The function may be used in applications where this order
* of decoding is desired.
*
* TODO(yaowu): Rework the unit test that uses this control, and in a future
* release, this test-only control shall be removed.
*/
AV1_INVERT_TILE_DECODE_ORDER,
/** control function to set the skip loop filter flag. Valid values are
* integers. The decoder will skip the loop filter when its value is set to
* nonzero. If the loop filter is skipped the decoder may accumulate decode
* artifacts. The default value is 0.
*/
AV1_SET_SKIP_LOOP_FILTER,
/** control function to retrieve a pointer to the Accounting struct. When
* compiled without --enable-accounting, this returns AOM_CODEC_INCAPABLE.
* If called before a frame has been decoded, this returns AOM_CODEC_ERROR.
* The caller should ensure that AOM_CODEC_OK is returned before attempting
* to dereference the Accounting pointer.
*/
AV1_GET_ACCOUNTING,
AOM_DECODER_CTRL_ID_MAX,
/** control function to set the range of tile decoding. A value greater than
* or equal to zero indicates that only the specified row/column is decoded.
* A value of -1 indicates that the whole row/column is decoded. As a special
* case, if both values are -1, the whole frame is decoded.
*/
AV1_SET_DECODE_TILE_ROW,
AV1_SET_DECODE_TILE_COL
};
/** Decrypt n bytes of data from input -> output, using the decrypt_state
* passed in AOMD_SET_DECRYPTOR.
*/
typedef void (*aom_decrypt_cb)(void *decrypt_state, const unsigned char *input,
unsigned char *output, int count);
/*!\brief Structure to hold decryption state
*
* Defines a structure to hold the decryption state and access function.
*/
typedef struct aom_decrypt_init {
/*! Decrypt callback. */
aom_decrypt_cb decrypt_cb;
/*! Decryption state. */
void *decrypt_state;
} aom_decrypt_init;
/*!\brief A deprecated alias for aom_decrypt_init.
*/
typedef aom_decrypt_init aom_decrypt_init;
/*!\cond */
/*!\brief AOM decoder control function parameter type
*
* Defines the data types that AOMD control functions take. Note that
* additional common controls are defined in aom.h
*
*/
AOM_CTRL_USE_TYPE(AOMD_GET_LAST_REF_UPDATES, int *)
#define AOM_CTRL_AOMD_GET_LAST_REF_UPDATES
AOM_CTRL_USE_TYPE(AOMD_GET_FRAME_CORRUPTED, int *)
#define AOM_CTRL_AOMD_GET_FRAME_CORRUPTED
AOM_CTRL_USE_TYPE(AOMD_GET_LAST_REF_USED, int *)
#define AOM_CTRL_AOMD_GET_LAST_REF_USED
AOM_CTRL_USE_TYPE(AOMD_SET_DECRYPTOR, aom_decrypt_init *)
#define AOM_CTRL_AOMD_SET_DECRYPTOR
AOM_CTRL_USE_TYPE(AV1D_GET_DISPLAY_SIZE, int *)
#define AOM_CTRL_AV1D_GET_DISPLAY_SIZE
AOM_CTRL_USE_TYPE(AV1D_GET_BIT_DEPTH, unsigned int *)
#define AOM_CTRL_AV1D_GET_BIT_DEPTH
AOM_CTRL_USE_TYPE(AV1D_GET_FRAME_SIZE, int *)
#define AOM_CTRL_AV1D_GET_FRAME_SIZE
AOM_CTRL_USE_TYPE(AV1_INVERT_TILE_DECODE_ORDER, int)
#define AOM_CTRL_AV1_INVERT_TILE_DECODE_ORDER
AOM_CTRL_USE_TYPE(AV1_GET_ACCOUNTING, Accounting **)
#define AOM_CTRL_AV1_GET_ACCOUNTING
AOM_CTRL_USE_TYPE(AV1_SET_DECODE_TILE_ROW, int)
#define AOM_CTRL_AV1_SET_DECODE_TILE_ROW
AOM_CTRL_USE_TYPE(AV1_SET_DECODE_TILE_COL, int)
#define AOM_CTRL_AV1_SET_DECODE_TILE_COL
/*!\endcond */
/*! @} - end defgroup aom_decoder */
#ifdef __cplusplus
} // extern "C"
#endif
#endif // AOM_AOMDX_H_
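A sketch of wiring up AOMD_SET_DECRYPTOR with the aom_decrypt_init structure above; the XOR "cipher" and helper names are purely illustrative, and `ctx` is assumed to be an initialized decoder context:

#include "aom/aomdx.h"

static void xor_decrypt(void *decrypt_state, const unsigned char *input,
                        unsigned char *output, int count) {
  const unsigned char key = *(const unsigned char *)decrypt_state;
  int i;
  for (i = 0; i < count; ++i) output[i] = input[i] ^ key;
}

static void attach_decryptor(aom_codec_ctx_t *ctx, unsigned char *key) {
  aom_decrypt_init init = { xor_decrypt, key };
  aom_codec_control(ctx, AOMD_SET_DECRYPTOR, &init);
}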


@@ -1,16 +0,0 @@
text aom_codec_build_config
text aom_codec_control_
text aom_codec_destroy
text aom_codec_err_to_string
text aom_codec_error
text aom_codec_error_detail
text aom_codec_get_caps
text aom_codec_iface_name
text aom_codec_version
text aom_codec_version_extra_str
text aom_codec_version_str
text aom_img_alloc
text aom_img_flip
text aom_img_free
text aom_img_set_rect
text aom_img_wrap


@@ -1,8 +0,0 @@
text aom_codec_dec_init_ver
text aom_codec_decode
text aom_codec_get_frame
text aom_codec_get_stream_info
text aom_codec_peek_stream_info
text aom_codec_register_put_frame_cb
text aom_codec_register_put_slice_cb
text aom_codec_set_frame_buffer_functions


@@ -1,9 +0,0 @@
text aom_codec_enc_config_default
text aom_codec_enc_config_set
text aom_codec_enc_init_multi_ver
text aom_codec_enc_init_ver
text aom_codec_encode
text aom_codec_get_cx_data
text aom_codec_get_global_headers
text aom_codec_get_preview_frame
text aom_codec_set_cx_data_buf


@@ -1,134 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
/*!\file
* \brief Provides the high level interface to wrap codec algorithms.
*
*/
#include <stdarg.h>
#include <stdlib.h>
#include "aom/aom_integer.h"
#include "aom/internal/aom_codec_internal.h"
#include "aom_version.h"
#define SAVE_STATUS(ctx, var) (ctx ? (ctx->err = var) : var)
int aom_codec_version(void) { return VERSION_PACKED; }
const char *aom_codec_version_str(void) { return VERSION_STRING_NOSP; }
const char *aom_codec_version_extra_str(void) { return VERSION_EXTRA; }
const char *aom_codec_iface_name(aom_codec_iface_t *iface) {
return iface ? iface->name : "<invalid interface>";
}
const char *aom_codec_err_to_string(aom_codec_err_t err) {
switch (err) {
case AOM_CODEC_OK: return "Success";
case AOM_CODEC_ERROR: return "Unspecified internal error";
case AOM_CODEC_MEM_ERROR: return "Memory allocation error";
case AOM_CODEC_ABI_MISMATCH: return "ABI version mismatch";
case AOM_CODEC_INCAPABLE:
return "Codec does not implement requested capability";
case AOM_CODEC_UNSUP_BITSTREAM:
return "Bitstream not supported by this decoder";
case AOM_CODEC_UNSUP_FEATURE:
return "Bitstream required feature not supported by this decoder";
case AOM_CODEC_CORRUPT_FRAME: return "Corrupt frame detected";
case AOM_CODEC_INVALID_PARAM: return "Invalid parameter";
case AOM_CODEC_LIST_END: return "End of iterated list";
}
return "Unrecognized error code";
}
const char *aom_codec_error(aom_codec_ctx_t *ctx) {
return (ctx) ? aom_codec_err_to_string(ctx->err)
: aom_codec_err_to_string(AOM_CODEC_INVALID_PARAM);
}
const char *aom_codec_error_detail(aom_codec_ctx_t *ctx) {
if (ctx && ctx->err)
return ctx->priv ? ctx->priv->err_detail : ctx->err_detail;
return NULL;
}
aom_codec_err_t aom_codec_destroy(aom_codec_ctx_t *ctx) {
aom_codec_err_t res;
if (!ctx)
res = AOM_CODEC_INVALID_PARAM;
else if (!ctx->iface || !ctx->priv)
res = AOM_CODEC_ERROR;
else {
ctx->iface->destroy((aom_codec_alg_priv_t *)ctx->priv);
ctx->iface = NULL;
ctx->name = NULL;
ctx->priv = NULL;
res = AOM_CODEC_OK;
}
return SAVE_STATUS(ctx, res);
}
aom_codec_caps_t aom_codec_get_caps(aom_codec_iface_t *iface) {
return (iface) ? iface->caps : 0;
}
aom_codec_err_t aom_codec_control_(aom_codec_ctx_t *ctx, int ctrl_id, ...) {
aom_codec_err_t res;
if (!ctx || !ctrl_id)
res = AOM_CODEC_INVALID_PARAM;
else if (!ctx->iface || !ctx->priv || !ctx->iface->ctrl_maps)
res = AOM_CODEC_ERROR;
else {
aom_codec_ctrl_fn_map_t *entry;
res = AOM_CODEC_ERROR;
for (entry = ctx->iface->ctrl_maps; entry && entry->fn; entry++) {
if (!entry->ctrl_id || entry->ctrl_id == ctrl_id) {
va_list ap;
va_start(ap, ctrl_id);
res = entry->fn((aom_codec_alg_priv_t *)ctx->priv, ap);
va_end(ap);
break;
}
}
}
return SAVE_STATUS(ctx, res);
}
void aom_internal_error(struct aom_internal_error_info *info,
aom_codec_err_t error, const char *fmt, ...) {
va_list ap;
info->error_code = error;
info->has_detail = 0;
if (fmt) {
size_t sz = sizeof(info->detail);
info->has_detail = 1;
va_start(ap, fmt);
vsnprintf(info->detail, sz - 1, fmt, ap);
va_end(ap);
info->detail[sz - 1] = '\0';
}
if (info->setjmp) longjmp(info->jmp, info->error_code);
}
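aom_internal_error() cooperates with setjmp(): a codec entry point arms info->setjmp and info->jmp, after which deeply nested code can abort with a formatted message. A minimal sketch of that calling pattern (the aom_internal_error_info fields come from aom/internal/aom_codec_internal.h; guarded_entry is a hypothetical name):

#include <setjmp.h>

static aom_codec_err_t guarded_entry(struct aom_internal_error_info *info) {
  if (setjmp(info->jmp)) { /* re-entered via longjmp() from aom_internal_error() */
    info->setjmp = 0;
    return info->error_code;
  }
  info->setjmp = 1;
  /* Nested code may now fail fatally, e.g.:
   *   aom_internal_error(info, AOM_CODEC_CORRUPT_FRAME, "bad partition %d", i);
   */
  info->setjmp = 0;
  return AOM_CODEC_OK;
}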


@@ -1,189 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
/*!\file
* \brief Provides the high level interface to wrap decoder algorithms.
*
*/
#include <string.h>
#include "aom/internal/aom_codec_internal.h"
#define SAVE_STATUS(ctx, var) (ctx ? (ctx->err = var) : var)
static aom_codec_alg_priv_t *get_alg_priv(aom_codec_ctx_t *ctx) {
return (aom_codec_alg_priv_t *)ctx->priv;
}
aom_codec_err_t aom_codec_dec_init_ver(aom_codec_ctx_t *ctx,
aom_codec_iface_t *iface,
const aom_codec_dec_cfg_t *cfg,
aom_codec_flags_t flags, int ver) {
aom_codec_err_t res;
if (ver != AOM_DECODER_ABI_VERSION)
res = AOM_CODEC_ABI_MISMATCH;
else if (!ctx || !iface)
res = AOM_CODEC_INVALID_PARAM;
else if (iface->abi_version != AOM_CODEC_INTERNAL_ABI_VERSION)
res = AOM_CODEC_ABI_MISMATCH;
else if ((flags & AOM_CODEC_USE_POSTPROC) &&
!(iface->caps & AOM_CODEC_CAP_POSTPROC))
res = AOM_CODEC_INCAPABLE;
else if ((flags & AOM_CODEC_USE_ERROR_CONCEALMENT) &&
!(iface->caps & AOM_CODEC_CAP_ERROR_CONCEALMENT))
res = AOM_CODEC_INCAPABLE;
else if ((flags & AOM_CODEC_USE_INPUT_FRAGMENTS) &&
!(iface->caps & AOM_CODEC_CAP_INPUT_FRAGMENTS))
res = AOM_CODEC_INCAPABLE;
else if (!(iface->caps & AOM_CODEC_CAP_DECODER))
res = AOM_CODEC_INCAPABLE;
else {
memset(ctx, 0, sizeof(*ctx));
ctx->iface = iface;
ctx->name = iface->name;
ctx->priv = NULL;
ctx->init_flags = flags;
ctx->config.dec = cfg;
res = ctx->iface->init(ctx, NULL);
if (res) {
ctx->err_detail = ctx->priv ? ctx->priv->err_detail : NULL;
aom_codec_destroy(ctx);
}
}
return SAVE_STATUS(ctx, res);
}
aom_codec_err_t aom_codec_peek_stream_info(aom_codec_iface_t *iface,
const uint8_t *data,
unsigned int data_sz,
aom_codec_stream_info_t *si) {
aom_codec_err_t res;
if (!iface || !data || !data_sz || !si ||
si->sz < sizeof(aom_codec_stream_info_t))
res = AOM_CODEC_INVALID_PARAM;
else {
/* Set default/unknown values */
si->w = 0;
si->h = 0;
res = iface->dec.peek_si(data, data_sz, si);
}
return res;
}
aom_codec_err_t aom_codec_get_stream_info(aom_codec_ctx_t *ctx,
aom_codec_stream_info_t *si) {
aom_codec_err_t res;
if (!ctx || !si || si->sz < sizeof(aom_codec_stream_info_t))
res = AOM_CODEC_INVALID_PARAM;
else if (!ctx->iface || !ctx->priv)
res = AOM_CODEC_ERROR;
else {
/* Set default/unknown values */
si->w = 0;
si->h = 0;
res = ctx->iface->dec.get_si(get_alg_priv(ctx), si);
}
return SAVE_STATUS(ctx, res);
}
aom_codec_err_t aom_codec_decode(aom_codec_ctx_t *ctx, const uint8_t *data,
unsigned int data_sz, void *user_priv,
long deadline) {
aom_codec_err_t res;
/* Sanity checks */
/* NULL data ptr allowed if data_sz is 0 too */
if (!ctx || (!data && data_sz) || (data && !data_sz))
res = AOM_CODEC_INVALID_PARAM;
else if (!ctx->iface || !ctx->priv)
res = AOM_CODEC_ERROR;
else {
res = ctx->iface->dec.decode(get_alg_priv(ctx), data, data_sz, user_priv,
deadline);
}
return SAVE_STATUS(ctx, res);
}
aom_image_t *aom_codec_get_frame(aom_codec_ctx_t *ctx, aom_codec_iter_t *iter) {
aom_image_t *img;
if (!ctx || !iter || !ctx->iface || !ctx->priv)
img = NULL;
else
img = ctx->iface->dec.get_frame(get_alg_priv(ctx), iter);
return img;
}
aom_codec_err_t aom_codec_register_put_frame_cb(aom_codec_ctx_t *ctx,
aom_codec_put_frame_cb_fn_t cb,
void *user_priv) {
aom_codec_err_t res;
if (!ctx || !cb)
res = AOM_CODEC_INVALID_PARAM;
else if (!ctx->iface || !ctx->priv ||
!(ctx->iface->caps & AOM_CODEC_CAP_PUT_FRAME))
res = AOM_CODEC_ERROR;
else {
ctx->priv->dec.put_frame_cb.u.put_frame = cb;
ctx->priv->dec.put_frame_cb.user_priv = user_priv;
res = AOM_CODEC_OK;
}
return SAVE_STATUS(ctx, res);
}
aom_codec_err_t aom_codec_register_put_slice_cb(aom_codec_ctx_t *ctx,
aom_codec_put_slice_cb_fn_t cb,
void *user_priv) {
aom_codec_err_t res;
if (!ctx || !cb)
res = AOM_CODEC_INVALID_PARAM;
else if (!ctx->iface || !ctx->priv ||
!(ctx->iface->caps & AOM_CODEC_CAP_PUT_SLICE))
res = AOM_CODEC_ERROR;
else {
ctx->priv->dec.put_slice_cb.u.put_slice = cb;
ctx->priv->dec.put_slice_cb.user_priv = user_priv;
res = AOM_CODEC_OK;
}
return SAVE_STATUS(ctx, res);
}
aom_codec_err_t aom_codec_set_frame_buffer_functions(
aom_codec_ctx_t *ctx, aom_get_frame_buffer_cb_fn_t cb_get,
aom_release_frame_buffer_cb_fn_t cb_release, void *cb_priv) {
aom_codec_err_t res;
if (!ctx || !cb_get || !cb_release) {
res = AOM_CODEC_INVALID_PARAM;
} else if (!ctx->iface || !ctx->priv ||
!(ctx->iface->caps & AOM_CODEC_CAP_EXTERNAL_FRAME_BUFFER)) {
res = AOM_CODEC_ERROR;
} else {
res = ctx->iface->dec.set_fb_fn(get_alg_priv(ctx), cb_get, cb_release,
cb_priv);
}
return SAVE_STATUS(ctx, res);
}
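A typical caller of the wrappers above, as a sketch (aom_codec_dec_init() is the versioned convenience macro from aom_decoder.h; error handling is abbreviated and decode_all is a hypothetical name):

#include "aom/aom_decoder.h"
#include "aom/aomdx.h"

static void decode_all(const uint8_t *data, unsigned int data_sz) {
  aom_codec_ctx_t ctx;
  if (aom_codec_dec_init(&ctx, aom_codec_av1_dx(), NULL, 0) != AOM_CODEC_OK)
    return;
  if (aom_codec_decode(&ctx, data, data_sz, NULL, 0) == AOM_CODEC_OK) {
    aom_codec_iter_t iter = NULL;
    aom_image_t *img;
    while ((img = aom_codec_get_frame(&ctx, &iter)) != NULL) {
      /* Consume img->planes[AOM_PLANE_Y] etc., honoring img->stride[]. */
    }
  }
  aom_codec_destroy(&ctx);
}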


@@ -1,240 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <stdlib.h>
#include <string.h>
#include "aom/aom_image.h"
#include "aom/aom_integer.h"
#include "aom_mem/aom_mem.h"
static aom_image_t *img_alloc_helper(aom_image_t *img, aom_img_fmt_t fmt,
unsigned int d_w, unsigned int d_h,
unsigned int buf_align,
unsigned int stride_align,
unsigned char *img_data) {
unsigned int h, w, s, xcs, ycs, bps;
unsigned int stride_in_bytes;
int align;
/* Treat align==0 like align==1 */
if (!buf_align) buf_align = 1;
/* Validate alignment (must be power of 2) */
if (buf_align & (buf_align - 1)) goto fail;
/* Treat align==0 like align==1 */
if (!stride_align) stride_align = 1;
/* Validate alignment (must be power of 2) */
if (stride_align & (stride_align - 1)) goto fail;
/* Get sample size for this format */
switch (fmt) {
case AOM_IMG_FMT_RGB32:
case AOM_IMG_FMT_RGB32_LE:
case AOM_IMG_FMT_ARGB:
case AOM_IMG_FMT_ARGB_LE: bps = 32; break;
case AOM_IMG_FMT_RGB24:
case AOM_IMG_FMT_BGR24: bps = 24; break;
case AOM_IMG_FMT_RGB565:
case AOM_IMG_FMT_RGB565_LE:
case AOM_IMG_FMT_RGB555:
case AOM_IMG_FMT_RGB555_LE:
case AOM_IMG_FMT_UYVY:
case AOM_IMG_FMT_YUY2:
case AOM_IMG_FMT_YVYU: bps = 16; break;
case AOM_IMG_FMT_I420:
case AOM_IMG_FMT_YV12:
case AOM_IMG_FMT_AOMI420:
case AOM_IMG_FMT_AOMYV12: bps = 12; break;
case AOM_IMG_FMT_I422:
case AOM_IMG_FMT_I440: bps = 16; break;
case AOM_IMG_FMT_I444: bps = 24; break;
case AOM_IMG_FMT_I42016: bps = 24; break;
case AOM_IMG_FMT_I42216:
case AOM_IMG_FMT_I44016: bps = 32; break;
case AOM_IMG_FMT_I44416: bps = 48; break;
default: bps = 16; break;
}
/* Get chroma shift values for this format */
switch (fmt) {
case AOM_IMG_FMT_I420:
case AOM_IMG_FMT_YV12:
case AOM_IMG_FMT_AOMI420:
case AOM_IMG_FMT_AOMYV12:
case AOM_IMG_FMT_I422:
case AOM_IMG_FMT_I42016:
case AOM_IMG_FMT_I42216: xcs = 1; break;
default: xcs = 0; break;
}
switch (fmt) {
case AOM_IMG_FMT_I420:
case AOM_IMG_FMT_I440:
case AOM_IMG_FMT_YV12:
case AOM_IMG_FMT_AOMI420:
case AOM_IMG_FMT_AOMYV12:
case AOM_IMG_FMT_I42016:
case AOM_IMG_FMT_I44016: ycs = 1; break;
default: ycs = 0; break;
}
/* Calculate storage sizes given the chroma subsampling */
align = (1 << xcs) - 1;
w = (d_w + align) & ~align;
align = (1 << ycs) - 1;
h = (d_h + align) & ~align;
s = (fmt & AOM_IMG_FMT_PLANAR) ? w : bps * w / 8;
s = (s + stride_align - 1) & ~(stride_align - 1);
stride_in_bytes = (fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? s * 2 : s;
/* Allocate the new image */
if (!img) {
img = (aom_image_t *)calloc(1, sizeof(aom_image_t));
if (!img) goto fail;
img->self_allocd = 1;
} else {
memset(img, 0, sizeof(aom_image_t));
}
img->img_data = img_data;
if (!img_data) {
const uint64_t alloc_size = (fmt & AOM_IMG_FMT_PLANAR)
? (uint64_t)h * s * bps / 8
: (uint64_t)h * s;
if (alloc_size != (size_t)alloc_size) goto fail;
img->img_data = (uint8_t *)aom_memalign(buf_align, (size_t)alloc_size);
img->img_data_owner = 1;
}
if (!img->img_data) goto fail;
img->fmt = fmt;
img->bit_depth = (fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 16 : 8;
img->w = w;
img->h = h;
img->x_chroma_shift = xcs;
img->y_chroma_shift = ycs;
img->bps = bps;
/* Calculate strides */
img->stride[AOM_PLANE_Y] = img->stride[AOM_PLANE_ALPHA] = stride_in_bytes;
img->stride[AOM_PLANE_U] = img->stride[AOM_PLANE_V] = stride_in_bytes >> xcs;
/* Default viewport to entire image */
if (!aom_img_set_rect(img, 0, 0, d_w, d_h)) return img;
fail:
aom_img_free(img);
return NULL;
}
aom_image_t *aom_img_alloc(aom_image_t *img, aom_img_fmt_t fmt,
unsigned int d_w, unsigned int d_h,
unsigned int align) {
return img_alloc_helper(img, fmt, d_w, d_h, align, align, NULL);
}
aom_image_t *aom_img_wrap(aom_image_t *img, aom_img_fmt_t fmt, unsigned int d_w,
unsigned int d_h, unsigned int stride_align,
unsigned char *img_data) {
/* By setting buf_align = 1, we don't change buffer alignment in this
* function. */
return img_alloc_helper(img, fmt, d_w, d_h, 1, stride_align, img_data);
}
int aom_img_set_rect(aom_image_t *img, unsigned int x, unsigned int y,
unsigned int w, unsigned int h) {
unsigned char *data;
if (x + w <= img->w && y + h <= img->h) {
img->d_w = w;
img->d_h = h;
/* Calculate plane pointers */
if (!(img->fmt & AOM_IMG_FMT_PLANAR)) {
img->planes[AOM_PLANE_PACKED] =
img->img_data + x * img->bps / 8 + y * img->stride[AOM_PLANE_PACKED];
} else {
const int bytes_per_sample =
(img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 2 : 1;
data = img->img_data;
if (img->fmt & AOM_IMG_FMT_HAS_ALPHA) {
img->planes[AOM_PLANE_ALPHA] =
data + x * bytes_per_sample + y * img->stride[AOM_PLANE_ALPHA];
data += img->h * img->stride[AOM_PLANE_ALPHA];
}
img->planes[AOM_PLANE_Y] =
data + x * bytes_per_sample + y * img->stride[AOM_PLANE_Y];
data += img->h * img->stride[AOM_PLANE_Y];
if (!(img->fmt & AOM_IMG_FMT_UV_FLIP)) {
img->planes[AOM_PLANE_U] =
data + (x >> img->x_chroma_shift) * bytes_per_sample +
(y >> img->y_chroma_shift) * img->stride[AOM_PLANE_U];
data += (img->h >> img->y_chroma_shift) * img->stride[AOM_PLANE_U];
img->planes[AOM_PLANE_V] =
data + (x >> img->x_chroma_shift) * bytes_per_sample +
(y >> img->y_chroma_shift) * img->stride[AOM_PLANE_V];
} else {
img->planes[AOM_PLANE_V] =
data + (x >> img->x_chroma_shift) * bytes_per_sample +
(y >> img->y_chroma_shift) * img->stride[AOM_PLANE_V];
data += (img->h >> img->y_chroma_shift) * img->stride[AOM_PLANE_V];
img->planes[AOM_PLANE_U] =
data + (x >> img->x_chroma_shift) * bytes_per_sample +
(y >> img->y_chroma_shift) * img->stride[AOM_PLANE_U];
}
}
return 0;
}
return -1;
}
void aom_img_flip(aom_image_t *img) {
/* Note: In the pointer adjustment calculation below, we want the
* rhs to be promoted to a signed type. Section 6.3.1.8 of the ISO C99
* standard indicates that if the adjustment parameter is unsigned, the
* stride parameter will be promoted to unsigned, causing errors when
* the lhs is a larger type than the rhs.
*/
img->planes[AOM_PLANE_Y] += (signed)(img->d_h - 1) * img->stride[AOM_PLANE_Y];
img->stride[AOM_PLANE_Y] = -img->stride[AOM_PLANE_Y];
img->planes[AOM_PLANE_U] += (signed)((img->d_h >> img->y_chroma_shift) - 1) *
img->stride[AOM_PLANE_U];
img->stride[AOM_PLANE_U] = -img->stride[AOM_PLANE_U];
img->planes[AOM_PLANE_V] += (signed)((img->d_h >> img->y_chroma_shift) - 1) *
img->stride[AOM_PLANE_V];
img->stride[AOM_PLANE_V] = -img->stride[AOM_PLANE_V];
img->planes[AOM_PLANE_ALPHA] +=
(signed)(img->d_h - 1) * img->stride[AOM_PLANE_ALPHA];
img->stride[AOM_PLANE_ALPHA] = -img->stride[AOM_PLANE_ALPHA];
}
void aom_img_free(aom_image_t *img) {
if (img) {
if (img->img_data && img->img_data_owner) aom_free(img->img_data);
if (img->self_allocd) free(img);
}
}
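To make the stride arithmetic in img_alloc_helper() concrete, a small sketch: for AOM_IMG_FMT_I420 both chroma shifts are 1, so a 640x360 allocation with 32-byte alignment yields a 640-byte Y stride and 320-byte U/V strides (i420_stride_demo is a hypothetical name):

#include <assert.h>
#include "aom/aom_image.h"

static void i420_stride_demo(void) {
  aom_image_t img;
  if (aom_img_alloc(&img, AOM_IMG_FMT_I420, 640, 360, 32)) {
    assert(img.stride[AOM_PLANE_Y] == 640);
    assert(img.stride[AOM_PLANE_U] == 320 && img.stride[AOM_PLANE_V] == 320);
    aom_img_free(&img); /* frees the pixel buffer; img itself is caller-owned */
  }
}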


@@ -1,64 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <assert.h>
#include "./aom_config.h"
#include "aom/aom_integer.h"
#include "aom_dsp/ans.h"
#include "aom_dsp/prob.h"
static int find_largest(const aom_cdf_prob *const pdf_tab, int num_syms) {
int largest_idx = -1;
int largest_p = -1;
int i;
for (i = 0; i < num_syms; ++i) {
int p = pdf_tab[i];
if (p > largest_p) {
largest_p = p;
largest_idx = i;
}
}
return largest_idx;
}
void aom_rans_merge_prob8_pdf(aom_cdf_prob *const out_pdf,
const AnsP8 node_prob,
const aom_cdf_prob *const src_pdf, int in_syms) {
int i;
int adjustment = RANS_PRECISION;
const int round_fact = ANS_P8_PRECISION >> 1;
const AnsP8 p1 = ANS_P8_PRECISION - node_prob;
const int out_syms = in_syms + 1;
assert(src_pdf != out_pdf);
out_pdf[0] = node_prob << (RANS_PROB_BITS - ANS_P8_SHIFT);
adjustment -= out_pdf[0];
for (i = 0; i < in_syms; ++i) {
int p = (p1 * src_pdf[i] + round_fact) >> ANS_P8_SHIFT;
p = AOMMIN(p, (int)RANS_PRECISION - in_syms);
p = AOMMAX(p, 1);
out_pdf[i + 1] = p;
adjustment -= p;
}
// Adjust probabilities so they sum to the total probability
if (adjustment > 0) {
i = find_largest(out_pdf, out_syms);
out_pdf[i] += adjustment;
} else {
while (adjustment < 0) {
i = find_largest(out_pdf, out_syms);
--out_pdf[i];
assert(out_pdf[i] > 0);
adjustment++;
}
}
}
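A worked sketch of aom_rans_merge_prob8_pdf() with made-up probabilities: src is a 3-symbol pdf in RANS_PRECISION units, and a node symbol of probability 64/256 is prepended. out[0] becomes 64 << 7 = 8192, the remaining 192/256 scales src[], and here the result already sums to RANS_PRECISION, so no adjustment is needed:

static void merge_demo(void) {
  const aom_cdf_prob src[3] = { 16384, 12288, 4096 }; /* sums to 32768 */
  aom_cdf_prob out[4];
  aom_rans_merge_prob8_pdf(out, 64, src, 3);
  /* out[] == { 8192, 12288, 9216, 3072 }, summing to RANS_PRECISION. */
}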


@@ -1,44 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifndef AOM_DSP_ANS_H_
#define AOM_DSP_ANS_H_
// Constants, types and utilities for Asymmetric Numeral Systems
// http://arxiv.org/abs/1311.2540v2
#include <assert.h>
#include "./aom_config.h"
#include "aom/aom_integer.h"
#include "aom_dsp/prob.h"
#ifdef __cplusplus
extern "C" {
#endif // __cplusplus
typedef uint8_t AnsP8;
#define ANS_P8_PRECISION 256u
#define ANS_P8_SHIFT 8
#define RANS_PROB_BITS 15
#define RANS_PRECISION (1u << RANS_PROB_BITS)
// L_BASE % PRECISION must be 0. Increasing L_BASE beyond 2**15 will cause uabs
// to overflow.
#define L_BASE (RANS_PRECISION)
#define IO_BASE 256
// Range I = { L_BASE, L_BASE + 1, ..., L_BASE * IO_BASE - 1 }
void aom_rans_merge_prob8_pdf(aom_cdf_prob *const out_pdf,
const AnsP8 node_prob,
const aom_cdf_prob *const src_pdf, int in_syms);
#ifdef __cplusplus
} // extern "C"
#endif // __cplusplus
#endif // AOM_DSP_ANS_H_


@@ -1,146 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifndef AOM_DSP_ANSREADER_H_
#define AOM_DSP_ANSREADER_H_
// A uABS and rANS decoder implementation of Asymmetric Numeral Systems
// http://arxiv.org/abs/1311.2540v2
#include <assert.h>
#include "./aom_config.h"
#include "aom/aom_integer.h"
#include "aom_dsp/prob.h"
#include "aom_dsp/ans.h"
#include "aom_ports/mem_ops.h"
#if CONFIG_ACCOUNTING
#include "av1/common/accounting.h"
#endif
#ifdef __cplusplus
extern "C" {
#endif // __cplusplus
struct AnsDecoder {
const uint8_t *buf;
int buf_offset;
uint32_t state;
#if CONFIG_ACCOUNTING
Accounting *accounting;
#endif
};
static INLINE int uabs_read(struct AnsDecoder *ans, AnsP8 p0) {
AnsP8 p = ANS_P8_PRECISION - p0;
int s;
unsigned xp, sp;
unsigned state = ans->state;
while (state < L_BASE && ans->buf_offset > 0) {
state = state * IO_BASE + ans->buf[--ans->buf_offset];
}
sp = state * p;
xp = sp / ANS_P8_PRECISION;
s = (sp & 0xFF) >= p0;
if (s)
ans->state = xp;
else
ans->state = state - xp;
return s;
}
static INLINE int uabs_read_bit(struct AnsDecoder *ans) {
int s;
unsigned state = ans->state;
while (state < L_BASE && ans->buf_offset > 0) {
state = state * IO_BASE + ans->buf[--ans->buf_offset];
}
s = (int)(state & 1);
ans->state = state >> 1;
return s;
}
struct rans_dec_sym {
uint8_t val;
aom_cdf_prob prob;
aom_cdf_prob cum_prob; // not-inclusive
};
static INLINE void fetch_sym(struct rans_dec_sym *out, const aom_cdf_prob *cdf,
aom_cdf_prob rem) {
int i;
aom_cdf_prob cum_prob = 0, top_prob;
// TODO(skal): if critical, could be a binary search.
// Or, better, an O(1) alias-table.
for (i = 0; rem >= (top_prob = cdf[i]); ++i) {
cum_prob = top_prob;
}
out->val = i;
out->prob = top_prob - cum_prob;
out->cum_prob = cum_prob;
}
static INLINE int rans_read(struct AnsDecoder *ans, const aom_cdf_prob *tab) {
unsigned rem;
unsigned quo;
struct rans_dec_sym sym;
while (ans->state < L_BASE && ans->buf_offset > 0) {
ans->state = ans->state * IO_BASE + ans->buf[--ans->buf_offset];
}
quo = ans->state / RANS_PRECISION;
rem = ans->state % RANS_PRECISION;
fetch_sym(&sym, tab, rem);
ans->state = quo * sym.prob + rem - sym.cum_prob;
return sym.val;
}
static INLINE int ans_read_init(struct AnsDecoder *const ans,
const uint8_t *const buf, int offset) {
unsigned x;
if (offset < 1) return 1;
ans->buf = buf;
x = buf[offset - 1] >> 6;
if (x == 0) {
ans->buf_offset = offset - 1;
ans->state = buf[offset - 1] & 0x3F;
} else if (x == 1) {
if (offset < 2) return 1;
ans->buf_offset = offset - 2;
ans->state = mem_get_le16(buf + offset - 2) & 0x3FFF;
} else if (x == 2) {
if (offset < 3) return 1;
ans->buf_offset = offset - 3;
ans->state = mem_get_le24(buf + offset - 3) & 0x3FFFFF;
} else if ((buf[offset - 1] & 0xE0) == 0xE0) {
if (offset < 4) return 1;
ans->buf_offset = offset - 4;
ans->state = mem_get_le32(buf + offset - 4) & 0x1FFFFFFF;
} else {
// 110xxxxx implies this byte is a superframe marker
return 1;
}
#if CONFIG_ACCOUNTING
ans->accounting = NULL;
#endif
ans->state += L_BASE;
if (ans->state >= L_BASE * IO_BASE) return 1;
return 0;
}
static INLINE int ans_read_end(struct AnsDecoder *const ans) {
return ans->state == L_BASE;
}
static INLINE int ans_reader_has_error(const struct AnsDecoder *const ans) {
return ans->state < L_BASE && ans->buf_offset == 0;
}
#ifdef __cplusplus
} // extern "C"
#endif // __cplusplus
#endif // AOM_DSP_ANSREADER_H_
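The cdf table consumed by rans_read() holds running (inclusive) cumulative probabilities ending at RANS_PRECISION, since fetch_sym() scans until rem < cdf[i]. A sketch for a 3-symbol alphabet with probabilities {1/2, 1/4, 1/4}:

static const aom_cdf_prob cdf3[3] = { 16384, 24576, 32768 };
/* rans_read(&dec, cdf3) then returns 0, 1 or 2, for rem values falling in
 * [0, 16384), [16384, 24576) and [24576, 32768) respectively. */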


@@ -1,120 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifndef AOM_DSP_ANSWRITER_H_
#define AOM_DSP_ANSWRITER_H_
// A uABS and rANS encoder implementation of Asymmetric Numeral Systems
// http://arxiv.org/abs/1311.2540v2
#include <assert.h>
#include "./aom_config.h"
#include "aom/aom_integer.h"
#include "aom_dsp/ans.h"
#include "aom_dsp/prob.h"
#include "aom_ports/mem_ops.h"
#include "av1/common/odintrin.h"
#if RANS_PRECISION <= OD_DIVU_DMAX
#define ANS_DIVREM(quotient, remainder, dividend, divisor) \
do { \
quotient = OD_DIVU_SMALL((dividend), (divisor)); \
remainder = (dividend) - (quotient) * (divisor); \
} while (0)
#else
#define ANS_DIVREM(quotient, remainder, dividend, divisor) \
do { \
quotient = (dividend) / (divisor); \
remainder = (dividend) % (divisor); \
} while (0)
#endif
#define ANS_DIV8(dividend, divisor) OD_DIVU_SMALL((dividend), (divisor))
#ifdef __cplusplus
extern "C" {
#endif // __cplusplus
struct AnsCoder {
uint8_t *buf;
int buf_offset;
uint32_t state;
};
static INLINE void ans_write_init(struct AnsCoder *const ans,
uint8_t *const buf) {
ans->buf = buf;
ans->buf_offset = 0;
ans->state = L_BASE;
}
static INLINE int ans_write_end(struct AnsCoder *const ans) {
uint32_t state;
assert(ans->state >= L_BASE);
assert(ans->state < L_BASE * IO_BASE);
state = ans->state - L_BASE;
if (state < (1 << 6)) {
ans->buf[ans->buf_offset] = (0x00 << 6) + state;
return ans->buf_offset + 1;
} else if (state < (1 << 14)) {
mem_put_le16(ans->buf + ans->buf_offset, (0x01 << 14) + state);
return ans->buf_offset + 2;
} else if (state < (1 << 22)) {
mem_put_le24(ans->buf + ans->buf_offset, (0x02 << 22) + state);
return ans->buf_offset + 3;
} else if (state < (1 << 29)) {
mem_put_le32(ans->buf + ans->buf_offset, (0x07 << 29) + state);
return ans->buf_offset + 4;
} else {
assert(0 && "State is too large to be serialized");
return ans->buf_offset;
}
}
// uABS with normalization
static INLINE void uabs_write(struct AnsCoder *ans, int val, AnsP8 p0) {
AnsP8 p = ANS_P8_PRECISION - p0;
const unsigned l_s = val ? p : p0;
while (ans->state >= L_BASE / ANS_P8_PRECISION * IO_BASE * l_s) {
ans->buf[ans->buf_offset++] = ans->state % IO_BASE;
ans->state /= IO_BASE;
}
if (!val)
ans->state = ANS_DIV8(ans->state * ANS_P8_PRECISION, p0);
else
ans->state = ANS_DIV8((ans->state + 1) * ANS_P8_PRECISION + p - 1, p) - 1;
}
struct rans_sym {
aom_cdf_prob prob;
aom_cdf_prob cum_prob; // not-inclusive
};
// rANS with normalization
// sym->prob takes the place of l_s from the paper
// RANS_PRECISION takes the place of m
static INLINE void rans_write(struct AnsCoder *ans,
const struct rans_sym *const sym) {
const aom_cdf_prob p = sym->prob;
unsigned quot, rem;
while (ans->state >= L_BASE / RANS_PRECISION * IO_BASE * p) {
ans->buf[ans->buf_offset++] = ans->state % IO_BASE;
ans->state /= IO_BASE;
}
ANS_DIVREM(quot, rem, ans->state, p);
ans->state = quot * RANS_PRECISION + rem + sym->cum_prob;
}
#undef ANS_DIV8
#undef ANS_DIVREM
#ifdef __cplusplus
} // extern "C"
#endif // __cplusplus
#endif // AOM_DSP_ANSWRITER_H_
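A roundtrip sketch tying this writer to the decoder in ansreader.h. ANS is last-in first-out, so the encoder below writes the bits in reverse to make the decoder return them in forward order (assumes both headers and <assert.h> are included; uabs_roundtrip is a hypothetical name):

static void uabs_roundtrip(void) {
  uint8_t buf[64];
  struct AnsCoder enc;
  struct AnsDecoder dec;
  const int bits[3] = { 1, 0, 1 };
  const AnsP8 p0 = 160; /* P(bit == 0) = 160/256 */
  int i, sz;
  ans_write_init(&enc, buf);
  for (i = 2; i >= 0; --i) uabs_write(&enc, bits[i], p0);
  sz = ans_write_end(&enc);
  if (!ans_read_init(&dec, buf, sz)) {
    for (i = 0; i < 3; ++i) assert(uabs_read(&dec, p0) == bits[i]);
    assert(ans_read_end(&dec)); /* state returned exactly to L_BASE */
  }
}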


@@ -1,57 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifndef AOM_DSP_AOM_CONVOLVE_H_
#define AOM_DSP_AOM_CONVOLVE_H_
#include "./aom_config.h"
#include "aom/aom_integer.h"
#ifdef __cplusplus
extern "C" {
#endif
// Note: Fixed-size intermediate buffers place limits on the parameters of
// some functions. 2d filtering proceeds in 2 steps:
// (1) Interpolate horizontally into an intermediate buffer, temp.
// (2) Interpolate temp vertically to derive the sub-pixel result.
// Deriving the maximum number of rows in the temp buffer (135):
// --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
// --Largest block size is 64x64 pixels.
// --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
// original frame (in 1/16th pixel units).
// --Must round-up because block may be located at sub-pixel position.
// --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
// --ceil(((64 - 1) * 32 + 15) / 16) + 8 = 127 + 8 = 135.
#if CONFIG_AV1 && CONFIG_EXT_PARTITION
#define MAX_EXT_SIZE 263
#else
#define MAX_EXT_SIZE 135
#endif // CONFIG_AV1 && CONFIG_EXT_PARTITION
typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4, int w,
int h);
#if CONFIG_AOM_HIGHBITDEPTH
typedef void (*highbd_convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h, int bd);
#endif
#ifdef __cplusplus
} // extern "C"
#endif
#endif // AOM_DSP_AOM_CONVOLVE_H_


@@ -1,102 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifndef AOM_DSP_AOM_DSP_COMMON_H_
#define AOM_DSP_AOM_DSP_COMMON_H_
#include "./aom_config.h"
#include "aom/aom_integer.h"
#include "aom_ports/mem.h"
#ifdef __cplusplus
extern "C" {
#endif
#ifndef MAX_SB_SIZE
#if CONFIG_AV1 && CONFIG_EXT_PARTITION
#define MAX_SB_SIZE 128
#else
#define MAX_SB_SIZE 64
#endif // CONFIG_AV1 && CONFIG_EXT_PARTITION
#endif // ndef MAX_SB_SIZE
#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
#define IMPLIES(a, b) (!(a) || (b)) // Logical 'a implies b' (or 'a -> b')
#define IS_POWER_OF_TWO(x) (((x) & ((x)-1)) == 0)
// These can be used to give a hint about branch outcomes.
// This can have an effect, even if your target processor has a
// good branch predictor, as these hints can affect basic block
// ordering by the compiler.
#ifdef __GNUC__
#define LIKELY(v) __builtin_expect(v, 1)
#define UNLIKELY(v) __builtin_expect(v, 0)
#else
#define LIKELY(v) (v)
#define UNLIKELY(v) (v)
#endif
#define AOM_SWAP(type, a, b) \
do { \
type c = (b); \
b = a; \
a = c; \
} while (0)
#if CONFIG_AOM_QM
typedef uint16_t qm_val_t;
#define AOM_QM_BITS 6
#endif
#if CONFIG_AOM_HIGHBITDEPTH
// Note:
// tran_low_t is the datatype used for final transform coefficients.
// tran_high_t is the datatype used for intermediate transform stages.
typedef int64_t tran_high_t;
typedef int32_t tran_low_t;
#else
// Note:
// tran_low_t is the datatype used for final transform coefficients.
// tran_high_t is the datatype used for intermediate transform stages.
typedef int32_t tran_high_t;
typedef int16_t tran_low_t;
#endif // CONFIG_AOM_HIGHBITDEPTH
static INLINE uint8_t clip_pixel(int val) {
return (val > 255) ? 255 : (val < 0) ? 0 : val;
}
static INLINE int clamp(int value, int low, int high) {
return value < low ? low : (value > high ? high : value);
}
static INLINE double fclamp(double value, double low, double high) {
return value < low ? low : (value > high ? high : value);
}
#if CONFIG_AOM_HIGHBITDEPTH
static INLINE uint16_t clip_pixel_highbd(int val, int bd) {
switch (bd) {
case 8:
default: return (uint16_t)clamp(val, 0, 255);
case 10: return (uint16_t)clamp(val, 0, 1023);
case 12: return (uint16_t)clamp(val, 0, 4095);
}
}
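/* Editor's illustration, not from the original file: typical reconstruction
 * use of clip_pixel_highbd() -- predictor plus residual, clipped back to the
 * range implied by the bit depth. The name is hypothetical. */
static INLINE uint16_t illustrative_recon_pixel(uint16_t pred, int residual,
                                                int bd) {
  return clip_pixel_highbd((int)pred + residual, bd);
}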
#endif // CONFIG_AOM_HIGHBITDEPTH
#ifdef __cplusplus
} // extern "C"
#endif
#endif // AOM_DSP_AOM_DSP_COMMON_H_


@@ -1,16 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include "./aom_config.h"
#define RTCD_C
#include "./aom_dsp_rtcd.h"
#include "aom_ports/aom_once.h"
void aom_dsp_rtcd() { once(setup_rtcd_internal); }

File diff suppressed because it is too large


@@ -1,43 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifndef AOM_DSP_AOM_FILTER_H_
#define AOM_DSP_AOM_FILTER_H_
#include "aom/aom_integer.h"
#ifdef __cplusplus
extern "C" {
#endif
#define FILTER_BITS 7
#define SUBPEL_BITS 4
#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
#define SUBPEL_SHIFTS (1 << SUBPEL_BITS)
#define SUBPEL_TAPS 8
typedef int16_t InterpKernel[SUBPEL_TAPS];
#define BIL_SUBPEL_BITS 3
#define BIL_SUBPEL_SHIFTS (1 << BIL_SUBPEL_BITS)
// 2 tap bilinear filters
static const uint8_t bilinear_filters_2t[BIL_SUBPEL_SHIFTS][2] = {
{ 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
{ 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 },
};
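/* Editor's sketch, not part of the original header (assumes the INLINE macro
 * from ./aom_config.h): how one 2-tap kernel from the table above is applied,
 * with the usual round-and-shift by FILTER_BITS. The taps sum to 128, so the
 * result always fits in 8 bits. */
static INLINE uint8_t illustrative_bil_pixel(uint8_t a, uint8_t b,
                                             const uint8_t *filter) {
  return (uint8_t)((a * filter[0] + b * filter[1] +
                    (1 << (FILTER_BITS - 1))) >> FILTER_BITS);
}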
#ifdef __cplusplus
} // extern "C"
#endif
#endif // AOM_DSP_AOM_FILTER_H_


@@ -1,13 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
// Set to 1 to add some sanity checks in the fallback C code
const int simd_check = 1;


@@ -1,32 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifndef AOM_DSP_AOM_AOM_SIMD_H_
#define AOM_DSP_AOM_AOM_SIMD_H_
#include <stdint.h>
#if defined(_WIN32)
#include <intrin.h>
#endif
#include "./aom_config.h"
#include "./aom_simd_inline.h"
#if HAVE_NEON
#include "simd/v256_intrinsics_arm.h"
#elif HAVE_SSE2
#include "simd/v256_intrinsics_x86.h"
#else
#include "simd/v256_intrinsics.h"
#endif
#endif // AOM_DSP_AOM_AOM_SIMD_H_


@@ -1,21 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifndef AOM_DSP_AOM_SIMD_INLINE_H_
#define AOM_DSP_AOM_SIMD_INLINE_H_
#include "aom/aom_integer.h"
#ifndef SIMD_INLINE
#define SIMD_INLINE static AOM_FORCE_INLINE
#endif
#endif // AOM_DSP_AOM_SIMD_INLINE_H_


@@ -1,240 +0,0 @@
;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;
;
EXPORT |aom_filter_block2d_bil_first_pass_media|
EXPORT |aom_filter_block2d_bil_second_pass_media|
AREA |.text|, CODE, READONLY ; name this block of code
;-------------------------------------
; r0 unsigned char *src_ptr,
; r1 unsigned short *dst_ptr,
; r2 unsigned int src_pitch,
; r3 unsigned int height,
; stack unsigned int width,
; stack const short *aom_filter
;-------------------------------------
; The output is transposed and stored in the output array to make second pass filtering easier.
|aom_filter_block2d_bil_first_pass_media| PROC
stmdb sp!, {r4 - r11, lr}
ldr r11, [sp, #40] ; aom_filter address
ldr r4, [sp, #36] ; width
mov r12, r3 ; outer-loop counter
add r7, r2, r4 ; preload next row
pld [r0, r7]
sub r2, r2, r4 ; src increment for height loop
ldr r5, [r11] ; load up filter coefficients
mov r3, r3, lsl #1 ; height*2
add r3, r3, #2 ; plus 2 to make the output buffer pitch 4-byte aligned, since height is actually (height+1)
mov r11, r1 ; save dst_ptr for each row
cmp r5, #128 ; if filter coef = 128, then skip the filter
beq bil_null_1st_filter
|bil_height_loop_1st_v6|
ldrb r6, [r0] ; load source data
ldrb r7, [r0, #1]
ldrb r8, [r0, #2]
mov lr, r4, lsr #2 ; 4-in-parallel loop counter
|bil_width_loop_1st_v6|
ldrb r9, [r0, #3]
ldrb r10, [r0, #4]
pkhbt r6, r6, r7, lsl #16 ; src[1] | src[0]
pkhbt r7, r7, r8, lsl #16 ; src[2] | src[1]
smuad r6, r6, r5 ; apply the filter
pkhbt r8, r8, r9, lsl #16 ; src[3] | src[2]
smuad r7, r7, r5
pkhbt r9, r9, r10, lsl #16 ; src[4] | src[3]
smuad r8, r8, r5
smuad r9, r9, r5
add r0, r0, #4
subs lr, lr, #1
add r6, r6, #0x40 ; round_shift_and_clamp
add r7, r7, #0x40
usat r6, #16, r6, asr #7
usat r7, #16, r7, asr #7
strh r6, [r1], r3 ; result is transposed and stored
add r8, r8, #0x40 ; round_shift_and_clamp
strh r7, [r1], r3
add r9, r9, #0x40
usat r8, #16, r8, asr #7
usat r9, #16, r9, asr #7
strh r8, [r1], r3 ; result is transposed and stored
ldrneb r6, [r0] ; load source data
strh r9, [r1], r3
ldrneb r7, [r0, #1]
ldrneb r8, [r0, #2]
bne bil_width_loop_1st_v6
add r0, r0, r2 ; move to next input row
subs r12, r12, #1
add r9, r2, r4, lsl #1 ; adding back block width
pld [r0, r9] ; preload next row
add r11, r11, #2 ; move over to next column
mov r1, r11
bne bil_height_loop_1st_v6
ldmia sp!, {r4 - r11, pc}
|bil_null_1st_filter|
|bil_height_loop_null_1st|
mov lr, r4, lsr #2 ; loop counter
|bil_width_loop_null_1st|
ldrb r6, [r0] ; load data
ldrb r7, [r0, #1]
ldrb r8, [r0, #2]
ldrb r9, [r0, #3]
strh r6, [r1], r3 ; store it to immediate buffer
add r0, r0, #4
strh r7, [r1], r3
subs lr, lr, #1
strh r8, [r1], r3
strh r9, [r1], r3
bne bil_width_loop_null_1st
subs r12, r12, #1
add r0, r0, r2 ; move to next input line
add r11, r11, #2 ; move over to next column
mov r1, r11
bne bil_height_loop_null_1st
ldmia sp!, {r4 - r11, pc}
ENDP ; |aom_filter_block2d_bil_first_pass_media|
;---------------------------------
; r0 unsigned short *src_ptr,
; r1 unsigned char *dst_ptr,
; r2 int dst_pitch,
; r3 unsigned int height,
; stack unsigned int width,
; stack const short *aom_filter
;---------------------------------
|aom_filter_block2d_bil_second_pass_media| PROC
stmdb sp!, {r4 - r11, lr}
ldr r11, [sp, #40] ; aom_filter address
ldr r4, [sp, #36] ; width
ldr r5, [r11] ; load up filter coefficients
mov r12, r4 ; outer-loop counter = width, since we work on transposed data matrix
mov r11, r1
cmp r5, #128 ; if filter coef = 128, then skip the filter
beq bil_null_2nd_filter
|bil_height_loop_2nd|
ldr r6, [r0] ; load the data
ldr r8, [r0, #4]
ldrh r10, [r0, #8]
mov lr, r3, lsr #2 ; loop counter
|bil_width_loop_2nd|
pkhtb r7, r6, r8 ; src[1] | src[2]
pkhtb r9, r8, r10 ; src[3] | src[4]
smuad r6, r6, r5 ; apply filter
smuad r8, r8, r5 ; apply filter
subs lr, lr, #1
smuadx r7, r7, r5 ; apply filter
smuadx r9, r9, r5 ; apply filter
add r0, r0, #8
add r6, r6, #0x40 ; round_shift_and_clamp
add r7, r7, #0x40
usat r6, #8, r6, asr #7
usat r7, #8, r7, asr #7
strb r6, [r1], r2 ; the result is transposed back and stored
add r8, r8, #0x40 ; round_shift_and_clamp
strb r7, [r1], r2
add r9, r9, #0x40
usat r8, #8, r8, asr #7
usat r9, #8, r9, asr #7
strb r8, [r1], r2 ; the result is transposed back and stored
ldrne r6, [r0] ; load data
strb r9, [r1], r2
ldrne r8, [r0, #4]
ldrneh r10, [r0, #8]
bne bil_width_loop_2nd
subs r12, r12, #1
add r0, r0, #4 ; update src for next row
add r11, r11, #1
mov r1, r11
bne bil_height_loop_2nd
ldmia sp!, {r4 - r11, pc}
|bil_null_2nd_filter|
|bil_height_loop_null_2nd|
mov lr, r3, lsr #2
|bil_width_loop_null_2nd|
ldr r6, [r0], #4 ; load data
subs lr, lr, #1
ldr r8, [r0], #4
strb r6, [r1], r2 ; store data
mov r7, r6, lsr #16
strb r7, [r1], r2
mov r9, r8, lsr #16
strb r8, [r1], r2
strb r9, [r1], r2
bne bil_width_loop_null_2nd
subs r12, r12, #1
add r0, r0, #4
add r11, r11, #1
mov r1, r11
bne bil_height_loop_null_2nd
ldmia sp!, {r4 - r11, pc}
ENDP ; |aom_filter_block2d_bil_second_pass_media|
END
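A hedged C reference for the two-pass scheme above (editor's sketch, not part
of the diff; names are hypothetical): the first pass filters horizontally into
a widened intermediate with one extra row, and the second pass filters
vertically back down to 8 bits. The assembly additionally transposes between
passes, which this sketch omits.

#include <stdint.h>

static void illustrative_bil_two_pass(const uint8_t *src, int src_pitch,
                                      uint8_t *dst, int dst_pitch, int w,
                                      int h, const int16_t *hfilter,
                                      const int16_t *vfilter) {
  uint16_t mid[(16 + 1) * 16]; /* large enough for blocks up to 16x16 */
  int r, c;
  /* First pass: horizontal filter; h + 1 rows feed the vertical tap below. */
  for (r = 0; r < h + 1; ++r)
    for (c = 0; c < w; ++c)
      mid[r * w + c] =
          (uint16_t)((src[r * src_pitch + c] * hfilter[0] +
                      src[r * src_pitch + c + 1] * hfilter[1] + 64) >> 7);
  /* Second pass: vertical filter back down to 8-bit output. */
  for (r = 0; r < h; ++r)
    for (c = 0; c < w; ++c)
      dst[r * dst_pitch + c] =
          (uint8_t)((mid[r * w + c] * vfilter[0] +
                     mid[(r + 1) * w + c] * vfilter[1] + 64) >> 7);
}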


@@ -1,199 +0,0 @@
/*
* Copyright (c) 2016 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <arm_neon.h>
#include "./aom_dsp_rtcd.h"
static void hadamard8x8_one_pass(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2,
int16x8_t *a3, int16x8_t *a4, int16x8_t *a5,
int16x8_t *a6, int16x8_t *a7) {
const int16x8_t b0 = vaddq_s16(*a0, *a1);
const int16x8_t b1 = vsubq_s16(*a0, *a1);
const int16x8_t b2 = vaddq_s16(*a2, *a3);
const int16x8_t b3 = vsubq_s16(*a2, *a3);
const int16x8_t b4 = vaddq_s16(*a4, *a5);
const int16x8_t b5 = vsubq_s16(*a4, *a5);
const int16x8_t b6 = vaddq_s16(*a6, *a7);
const int16x8_t b7 = vsubq_s16(*a6, *a7);
const int16x8_t c0 = vaddq_s16(b0, b2);
const int16x8_t c1 = vaddq_s16(b1, b3);
const int16x8_t c2 = vsubq_s16(b0, b2);
const int16x8_t c3 = vsubq_s16(b1, b3);
const int16x8_t c4 = vaddq_s16(b4, b6);
const int16x8_t c5 = vaddq_s16(b5, b7);
const int16x8_t c6 = vsubq_s16(b4, b6);
const int16x8_t c7 = vsubq_s16(b5, b7);
*a0 = vaddq_s16(c0, c4);
*a1 = vsubq_s16(c2, c6);
*a2 = vsubq_s16(c0, c4);
*a3 = vaddq_s16(c2, c6);
*a4 = vaddq_s16(c3, c7);
*a5 = vsubq_s16(c3, c7);
*a6 = vsubq_s16(c1, c5);
*a7 = vaddq_s16(c1, c5);
}
// TODO(johannkoenig): Make a transpose library and dedup with idct. Consider
// reversing transpose order which may make it easier for the compiler to
// reconcile the vtrn.64 moves.
static void transpose8x8(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2,
int16x8_t *a3, int16x8_t *a4, int16x8_t *a5,
int16x8_t *a6, int16x8_t *a7) {
// Swap 64 bit elements. Goes from:
// a0: 00 01 02 03 04 05 06 07
// a1: 08 09 10 11 12 13 14 15
// a2: 16 17 18 19 20 21 22 23
// a3: 24 25 26 27 28 29 30 31
// a4: 32 33 34 35 36 37 38 39
// a5: 40 41 42 43 44 45 46 47
// a6: 48 49 50 51 52 53 54 55
// a7: 56 57 58 59 60 61 62 63
// to:
// a04_lo: 00 01 02 03 32 33 34 35
// a15_lo: 08 09 10 11 40 41 42 43
// a26_lo: 16 17 18 19 48 49 50 51
// a37_lo: 24 25 26 27 56 57 58 59
// a04_hi: 04 05 06 07 36 37 38 39
// a15_hi: 12 13 14 15 44 45 46 47
// a26_hi: 20 21 22 23 52 53 54 55
// a37_hi: 28 29 30 31 60 61 62 63
const int16x8_t a04_lo = vcombine_s16(vget_low_s16(*a0), vget_low_s16(*a4));
const int16x8_t a15_lo = vcombine_s16(vget_low_s16(*a1), vget_low_s16(*a5));
const int16x8_t a26_lo = vcombine_s16(vget_low_s16(*a2), vget_low_s16(*a6));
const int16x8_t a37_lo = vcombine_s16(vget_low_s16(*a3), vget_low_s16(*a7));
const int16x8_t a04_hi = vcombine_s16(vget_high_s16(*a0), vget_high_s16(*a4));
const int16x8_t a15_hi = vcombine_s16(vget_high_s16(*a1), vget_high_s16(*a5));
const int16x8_t a26_hi = vcombine_s16(vget_high_s16(*a2), vget_high_s16(*a6));
const int16x8_t a37_hi = vcombine_s16(vget_high_s16(*a3), vget_high_s16(*a7));
// Swap 32 bit elements resulting in:
// a0246_lo:
// 00 01 16 17 32 33 48 49
// 02 03 18 19 34 35 50 51
// a1357_lo:
// 08 09 24 25 40 41 56 57
// 10 11 26 27 42 43 58 59
// a0246_hi:
// 04 05 20 21 36 37 52 53
// 06 07 22 23 38 39 54 55
// a1657_hi:
// 12 13 28 29 44 45 60 61
// 14 15 30 31 46 47 62 63
const int32x4x2_t a0246_lo =
vtrnq_s32(vreinterpretq_s32_s16(a04_lo), vreinterpretq_s32_s16(a26_lo));
const int32x4x2_t a1357_lo =
vtrnq_s32(vreinterpretq_s32_s16(a15_lo), vreinterpretq_s32_s16(a37_lo));
const int32x4x2_t a0246_hi =
vtrnq_s32(vreinterpretq_s32_s16(a04_hi), vreinterpretq_s32_s16(a26_hi));
const int32x4x2_t a1357_hi =
vtrnq_s32(vreinterpretq_s32_s16(a15_hi), vreinterpretq_s32_s16(a37_hi));
// Swap 16 bit elements resulting in:
// b0:
// 00 08 16 24 32 40 48 56
// 01 09 17 25 33 41 49 57
// b1:
// 02 10 18 26 34 42 50 58
// 03 11 19 27 35 43 51 59
// b2:
// 04 12 20 28 36 44 52 60
// 05 13 21 29 37 45 53 61
// b3:
// 06 14 22 30 38 46 54 62
// 07 15 23 31 39 47 55 63
const int16x8x2_t b0 = vtrnq_s16(vreinterpretq_s16_s32(a0246_lo.val[0]),
vreinterpretq_s16_s32(a1357_lo.val[0]));
const int16x8x2_t b1 = vtrnq_s16(vreinterpretq_s16_s32(a0246_lo.val[1]),
vreinterpretq_s16_s32(a1357_lo.val[1]));
const int16x8x2_t b2 = vtrnq_s16(vreinterpretq_s16_s32(a0246_hi.val[0]),
vreinterpretq_s16_s32(a1357_hi.val[0]));
const int16x8x2_t b3 = vtrnq_s16(vreinterpretq_s16_s32(a0246_hi.val[1]),
vreinterpretq_s16_s32(a1357_hi.val[1]));
*a0 = b0.val[0];
*a1 = b0.val[1];
*a2 = b1.val[0];
*a3 = b1.val[1];
*a4 = b2.val[0];
*a5 = b2.val[1];
*a6 = b3.val[0];
*a7 = b3.val[1];
}
void aom_hadamard_8x8_neon(const int16_t *src_diff, int src_stride,
int16_t *coeff) {
int16x8_t a0 = vld1q_s16(src_diff);
int16x8_t a1 = vld1q_s16(src_diff + src_stride);
int16x8_t a2 = vld1q_s16(src_diff + 2 * src_stride);
int16x8_t a3 = vld1q_s16(src_diff + 3 * src_stride);
int16x8_t a4 = vld1q_s16(src_diff + 4 * src_stride);
int16x8_t a5 = vld1q_s16(src_diff + 5 * src_stride);
int16x8_t a6 = vld1q_s16(src_diff + 6 * src_stride);
int16x8_t a7 = vld1q_s16(src_diff + 7 * src_stride);
hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
transpose8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
// Skip the second transpose because it is not required.
vst1q_s16(coeff + 0, a0);
vst1q_s16(coeff + 8, a1);
vst1q_s16(coeff + 16, a2);
vst1q_s16(coeff + 24, a3);
vst1q_s16(coeff + 32, a4);
vst1q_s16(coeff + 40, a5);
vst1q_s16(coeff + 48, a6);
vst1q_s16(coeff + 56, a7);
}
void aom_hadamard_16x16_neon(const int16_t *src_diff, int src_stride,
int16_t *coeff) {
int i;
/* Rearrange 16x16 to 8x32 and remove stride.
* Top left first. */
aom_hadamard_8x8_neon(src_diff + 0 + 0 * src_stride, src_stride, coeff + 0);
/* Top right. */
aom_hadamard_8x8_neon(src_diff + 8 + 0 * src_stride, src_stride, coeff + 64);
/* Bottom left. */
aom_hadamard_8x8_neon(src_diff + 0 + 8 * src_stride, src_stride, coeff + 128);
/* Bottom right. */
aom_hadamard_8x8_neon(src_diff + 8 + 8 * src_stride, src_stride, coeff + 192);
for (i = 0; i < 64; i += 8) {
const int16x8_t a0 = vld1q_s16(coeff + 0);
const int16x8_t a1 = vld1q_s16(coeff + 64);
const int16x8_t a2 = vld1q_s16(coeff + 128);
const int16x8_t a3 = vld1q_s16(coeff + 192);
const int16x8_t b0 = vhaddq_s16(a0, a1);
const int16x8_t b1 = vhsubq_s16(a0, a1);
const int16x8_t b2 = vhaddq_s16(a2, a3);
const int16x8_t b3 = vhsubq_s16(a2, a3);
const int16x8_t c0 = vaddq_s16(b0, b2);
const int16x8_t c1 = vaddq_s16(b1, b3);
const int16x8_t c2 = vsubq_s16(b0, b2);
const int16x8_t c3 = vsubq_s16(b1, b3);
vst1q_s16(coeff + 0, c0);
vst1q_s16(coeff + 64, c1);
vst1q_s16(coeff + 128, c2);
vst1q_s16(coeff + 192, c3);
coeff += 8;
}
}
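/* Editor's sketch, not part of the original file: a scalar view of the
 * combine loop above. The four 8x8 results are merged with a halving first
 * stage, mirroring vhaddq_s16/vhsubq_s16, to keep the sums in 16-bit range.
 * Arithmetic right shift is assumed for negative values, matching what the
 * NEON halving ops do per lane. */
static void illustrative_hadamard16_combine(int16_t *coeff /* 256 values */) {
  int i;
  for (i = 0; i < 64; ++i) {
    const int a0 = coeff[i], a1 = coeff[i + 64];
    const int a2 = coeff[i + 128], a3 = coeff[i + 192];
    const int b0 = (a0 + a1) >> 1, b1 = (a0 - a1) >> 1;
    const int b2 = (a2 + a3) >> 1, b3 = (a2 - a3) >> 1;
    coeff[i] = (int16_t)(b0 + b2);
    coeff[i + 64] = (int16_t)(b1 + b3);
    coeff[i + 128] = (int16_t)(b0 - b2);
    coeff[i + 192] = (int16_t)(b1 - b3);
  }
}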


@@ -1,49 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <arm_neon.h>
#include "./aom_dsp_rtcd.h"
#include "./aom_config.h"
#include "aom/aom_integer.h"
void aom_lpf_vertical_4_dual_neon(uint8_t *s, int p, const uint8_t *blimit0,
const uint8_t *limit0, const uint8_t *thresh0,
const uint8_t *blimit1, const uint8_t *limit1,
const uint8_t *thresh1) {
aom_lpf_vertical_4_neon(s, p, blimit0, limit0, thresh0);
aom_lpf_vertical_4_neon(s + 8 * p, p, blimit1, limit1, thresh1);
}
#if HAVE_NEON_ASM
void aom_lpf_horizontal_8_dual_neon(
uint8_t *s, int p /* pitch */, const uint8_t *blimit0,
const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1,
const uint8_t *limit1, const uint8_t *thresh1) {
aom_lpf_horizontal_8_neon(s, p, blimit0, limit0, thresh0);
aom_lpf_horizontal_8_neon(s + 8, p, blimit1, limit1, thresh1);
}
void aom_lpf_vertical_8_dual_neon(uint8_t *s, int p, const uint8_t *blimit0,
const uint8_t *limit0, const uint8_t *thresh0,
const uint8_t *blimit1, const uint8_t *limit1,
const uint8_t *thresh1) {
aom_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0);
aom_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1);
}
void aom_lpf_vertical_16_dual_neon(uint8_t *s, int p, const uint8_t *blimit,
const uint8_t *limit,
const uint8_t *thresh) {
aom_lpf_vertical_16_neon(s, p, blimit, limit, thresh);
aom_lpf_vertical_16_neon(s + 8 * p, p, blimit, limit, thresh);
}
#endif // HAVE_NEON_ASM


@@ -1,98 +0,0 @@
;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;
;
EXPORT |aom_sad16x16_media|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 const unsigned char *src_ptr
; r1 int src_stride
; r2 const unsigned char *ref_ptr
; r3 int ref_stride
|aom_sad16x16_media| PROC
stmfd sp!, {r4-r12, lr}
pld [r0, r1, lsl #0]
pld [r2, r3, lsl #0]
pld [r0, r1, lsl #1]
pld [r2, r3, lsl #1]
mov r4, #0 ; sad = 0;
mov r5, #8 ; loop count
loop
; 1st row
ldr r6, [r0, #0x0] ; load 4 src pixels (1A)
ldr r8, [r2, #0x0] ; load 4 ref pixels (1A)
ldr r7, [r0, #0x4] ; load 4 src pixels (1A)
ldr r9, [r2, #0x4] ; load 4 ref pixels (1A)
ldr r10, [r0, #0x8] ; load 4 src pixels (1B)
ldr r11, [r0, #0xC] ; load 4 src pixels (1B)
usada8 r4, r8, r6, r4 ; calculate sad for 4 pixels
usad8 r8, r7, r9 ; calculate sad for 4 pixels
ldr r12, [r2, #0x8] ; load 4 ref pixels (1B)
ldr lr, [r2, #0xC] ; load 4 ref pixels (1B)
add r0, r0, r1 ; set src pointer to next row
add r2, r2, r3 ; set dst pointer to next row
pld [r0, r1, lsl #1]
pld [r2, r3, lsl #1]
usada8 r4, r10, r12, r4 ; calculate sad for 4 pixels
usada8 r8, r11, lr, r8 ; calculate sad for 4 pixels
ldr r6, [r0, #0x0] ; load 4 src pixels (2A)
ldr r7, [r0, #0x4] ; load 4 src pixels (2A)
add r4, r4, r8 ; add partial sad values
; 2nd row
ldr r8, [r2, #0x0] ; load 4 ref pixels (2A)
ldr r9, [r2, #0x4] ; load 4 ref pixels (2A)
ldr r10, [r0, #0x8] ; load 4 src pixels (2B)
ldr r11, [r0, #0xC] ; load 4 src pixels (2B)
usada8 r4, r6, r8, r4 ; calculate sad for 4 pixels
usad8 r8, r7, r9 ; calculate sad for 4 pixels
ldr r12, [r2, #0x8] ; load 4 ref pixels (2B)
ldr lr, [r2, #0xC] ; load 4 ref pixels (2B)
add r0, r0, r1 ; set src pointer to next row
add r2, r2, r3 ; set dst pointer to next row
usada8 r4, r10, r12, r4 ; calculate sad for 4 pixels
usada8 r8, r11, lr, r8 ; calculate sad for 4 pixels
pld [r0, r1, lsl #1]
pld [r2, r3, lsl #1]
subs r5, r5, #1 ; decrement loop counter
add r4, r4, r8 ; add partial sad values
bne loop
mov r0, r4 ; return sad
ldmfd sp!, {r4-r12, pc}
ENDP
END


@@ -1,39 +0,0 @@
;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;
;
EXPORT |aom_push_neon|
EXPORT |aom_pop_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
|aom_push_neon| PROC
vst1.i64 {d8, d9, d10, d11}, [r0]!
vst1.i64 {d12, d13, d14, d15}, [r0]!
bx lr
ENDP
|aom_pop_neon| PROC
vld1.i64 {d8, d9, d10, d11}, [r0]!
vld1.i64 {d12, d13, d14, d15}, [r0]!
bx lr
ENDP
END


@@ -1,81 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include "./aom_config.h"
#include "./aom_dsp_rtcd.h"
#include "aom/aom_integer.h"
#include "aom_ports/mem.h"
#if HAVE_MEDIA
static const int16_t bilinear_filters_media[8][2] = { { 128, 0 }, { 112, 16 },
{ 96, 32 }, { 80, 48 },
{ 64, 64 }, { 48, 80 },
{ 32, 96 }, { 16, 112 } };
extern void aom_filter_block2d_bil_first_pass_media(
const uint8_t *src_ptr, uint16_t *dst_ptr, uint32_t src_pitch,
uint32_t height, uint32_t width, const int16_t *filter);
extern void aom_filter_block2d_bil_second_pass_media(
const uint16_t *src_ptr, uint8_t *dst_ptr, int32_t src_pitch,
uint32_t height, uint32_t width, const int16_t *filter);
unsigned int aom_sub_pixel_variance8x8_media(
const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset,
const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse) {
uint16_t first_pass[10 * 8];
uint8_t second_pass[8 * 8];
const int16_t *HFilter, *VFilter;
HFilter = bilinear_filters_media[xoffset];
VFilter = bilinear_filters_media[yoffset];
aom_filter_block2d_bil_first_pass_media(src_ptr, first_pass,
src_pixels_per_line, 9, 8, HFilter);
aom_filter_block2d_bil_second_pass_media(first_pass, second_pass, 8, 8, 8,
VFilter);
return aom_variance8x8_media(second_pass, 8, dst_ptr, dst_pixels_per_line,
sse);
}
unsigned int aom_sub_pixel_variance16x16_media(
const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset,
const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse) {
uint16_t first_pass[36 * 16];
uint8_t second_pass[20 * 16];
const int16_t *HFilter, *VFilter;
unsigned int var;
if (xoffset == 4 && yoffset == 0) {
var = aom_variance_halfpixvar16x16_h_media(
src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse);
} else if (xoffset == 0 && yoffset == 4) {
var = aom_variance_halfpixvar16x16_v_media(
src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse);
} else if (xoffset == 4 && yoffset == 4) {
var = aom_variance_halfpixvar16x16_hv_media(
src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse);
} else {
HFilter = bilinear_filters_media[xoffset];
VFilter = bilinear_filters_media[yoffset];
aom_filter_block2d_bil_first_pass_media(
src_ptr, first_pass, src_pixels_per_line, 17, 16, HFilter);
aom_filter_block2d_bil_second_pass_media(first_pass, second_pass, 16, 16,
16, VFilter);
var = aom_variance16x16_media(second_pass, 16, dst_ptr, dst_pixels_per_line,
sse);
}
return var;
}
#endif // HAVE_MEDIA


@@ -1,185 +0,0 @@
;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;
;
EXPORT |aom_variance_halfpixvar16x16_h_media|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 unsigned char *src_ptr
; r1 int source_stride
; r2 unsigned char *ref_ptr
; r3 int recon_stride
; stack unsigned int *sse
|aom_variance_halfpixvar16x16_h_media| PROC
stmfd sp!, {r4-r12, lr}
pld [r0, r1, lsl #0]
pld [r2, r3, lsl #0]
mov r8, #0 ; initialize sum = 0
ldr r10, c80808080
mov r11, #0 ; initialize sse = 0
mov r12, #16 ; set loop counter to 16 (=block height)
mov lr, #0 ; constant zero
loop
; 1st 4 pixels
ldr r4, [r0, #0] ; load 4 src pixels
ldr r6, [r0, #1] ; load 4 src pixels with 1 byte offset
ldr r5, [r2, #0] ; load 4 ref pixels
; bilinear interpolation
mvn r6, r6
uhsub8 r4, r4, r6
eor r4, r4, r10
usub8 r6, r4, r5 ; calculate difference
pld [r0, r1, lsl #1]
sel r7, r6, lr ; select bytes with positive difference
usub8 r6, r5, r4 ; calculate difference with reversed operands
pld [r2, r3, lsl #1]
sel r6, r6, lr ; select bytes with negative difference
; calculate partial sums
usad8 r4, r7, lr ; calculate sum of positive differences
usad8 r5, r6, lr ; calculate sum of negative differences
orr r6, r6, r7 ; differences of all 4 pixels
; calculate total sum
adds r8, r8, r4 ; add positive differences to sum
subs r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
; 2nd 4 pixels
ldr r4, [r0, #4] ; load 4 src pixels
ldr r6, [r0, #5] ; load 4 src pixels with 1 byte offset
ldr r5, [r2, #4] ; load 4 ref pixels
; bilinear interpolation
mvn r6, r6
uhsub8 r4, r4, r6
eor r4, r4, r10
smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
usub8 r6, r4, r5 ; calculate difference
sel r7, r6, lr ; select bytes with positive difference
usub8 r6, r5, r4 ; calculate difference with reversed operands
sel r6, r6, lr ; select bytes with negative difference
; calculate partial sums
usad8 r4, r7, lr ; calculate sum of positive differences
usad8 r5, r6, lr ; calculate sum of negative differences
orr r6, r6, r7 ; differences of all 4 pixels
; calculate total sum
add r8, r8, r4 ; add positive differences to sum
sub r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
; 3rd 4 pixels
ldr r4, [r0, #8] ; load 4 src pixels
ldr r6, [r0, #9] ; load 4 src pixels with 1 byte offset
ldr r5, [r2, #8] ; load 4 ref pixels
; bilinear interpolation
mvn r6, r6
uhsub8 r4, r4, r6
eor r4, r4, r10
smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
usub8 r6, r4, r5 ; calculate difference
sel r7, r6, lr ; select bytes with positive difference
usub8 r6, r5, r4 ; calculate difference with reversed operands
sel r6, r6, lr ; select bytes with negative difference
; calculate partial sums
usad8 r4, r7, lr ; calculate sum of positive differences
usad8 r5, r6, lr ; calculate sum of negative differences
orr r6, r6, r7 ; differences of all 4 pixels
; calculate total sum
add r8, r8, r4 ; add positive differences to sum
sub r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
; 4th 4 pixels
ldr r4, [r0, #12] ; load 4 src pixels
ldr r6, [r0, #13] ; load 4 src pixels with 1 byte offset
ldr r5, [r2, #12] ; load 4 ref pixels
; bilinear interpolation
mvn r6, r6
uhsub8 r4, r4, r6
eor r4, r4, r10
smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
usub8 r6, r4, r5 ; calculate difference
add r0, r0, r1 ; set src_ptr to next row
sel r7, r6, lr ; select bytes with positive difference
usub8 r6, r5, r4 ; calculate difference with reversed operands
add r2, r2, r3 ; set dst_ptr to next row
sel r6, r6, lr ; select bytes with negative difference
; calculate partial sums
usad8 r4, r7, lr ; calculate sum of positive differences
usad8 r5, r6, lr ; calculate sum of negative differences
orr r6, r6, r7 ; differences of all 4 pixels
; calculate total sum
add r8, r8, r4 ; add positive differences to sum
sub r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
subs r12, r12, #1
bne loop
; return stuff
ldr r6, [sp, #40] ; get address of sse
mul r0, r8, r8 ; sum * sum
str r11, [r6] ; store sse
sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))
ldmfd sp!, {r4-r12, pc}
ENDP
c80808080
DCD 0x80808080
END
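The mvn/uhsub8/eor sequence above is a byte-wise rounded average. An editor's
C sketch of the identity it relies on (not part of the diff):

#include <stdint.h>

/* avg = (a + b + 1) >> 1 per byte; the assembly reaches the same value as
 * ((a - (255 - b)) >> 1) ^ 0x80, which never leaves 8 bits. */
static uint8_t illustrative_rounded_avg(uint8_t a, uint8_t b) {
  return (uint8_t)((a + b + 1) >> 1);
}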


@@ -1,225 +0,0 @@
;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;
;
EXPORT |aom_variance_halfpixvar16x16_hv_media|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 unsigned char *src_ptr
; r1 int source_stride
; r2 unsigned char *ref_ptr
; r3 int recon_stride
; stack unsigned int *sse
|aom_variance_halfpixvar16x16_hv_media| PROC
stmfd sp!, {r4-r12, lr}
pld [r0, r1, lsl #0]
pld [r2, r3, lsl #0]
mov r8, #0 ; initialize sum = 0
ldr r10, c80808080
mov r11, #0 ; initialize sse = 0
mov r12, #16 ; set loop counter to 16 (=block height)
mov lr, #0 ; constant zero
loop
add r9, r0, r1 ; pointer to pixels on the next row
; 1st 4 pixels
ldr r4, [r0, #0] ; load source pixels a, row N
ldr r6, [r0, #1] ; load source pixels b, row N
ldr r5, [r9, #0] ; load source pixels c, row N+1
ldr r7, [r9, #1] ; load source pixels d, row N+1
; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
mvn r6, r6
uhsub8 r4, r4, r6
eor r4, r4, r10
; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
mvn r7, r7
uhsub8 r5, r5, r7
eor r5, r5, r10
; z = (x + y + 1) >> 1, interpolate half pixel values vertically
mvn r5, r5
uhsub8 r4, r4, r5
ldr r5, [r2, #0] ; load 4 ref pixels
eor r4, r4, r10
usub8 r6, r4, r5 ; calculate difference
pld [r0, r1, lsl #1]
sel r7, r6, lr ; select bytes with positive difference
usub8 r6, r5, r4 ; calculate difference with reversed operands
pld [r2, r3, lsl #1]
sel r6, r6, lr ; select bytes with negative difference
; calculate partial sums
usad8 r4, r7, lr ; calculate sum of positive differences
usad8 r5, r6, lr ; calculate sum of negative differences
orr r6, r6, r7 ; differences of all 4 pixels
; calculate total sum
adds r8, r8, r4 ; add positive differences to sum
subs r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
; 2nd 4 pixels
ldr r4, [r0, #4] ; load source pixels a, row N
ldr r6, [r0, #5] ; load source pixels b, row N
ldr r5, [r9, #4] ; load source pixels c, row N+1
smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
ldr r7, [r9, #5] ; load source pixels d, row N+1
; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
mvn r6, r6
uhsub8 r4, r4, r6
eor r4, r4, r10
; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
mvn r7, r7
uhsub8 r5, r5, r7
eor r5, r5, r10
; z = (x + y + 1) >> 1, interpolate half pixel values vertically
mvn r5, r5
uhsub8 r4, r4, r5
ldr r5, [r2, #4] ; load 4 ref pixels
eor r4, r4, r10
usub8 r6, r4, r5 ; calculate difference
sel r7, r6, lr ; select bytes with positive difference
usub8 r6, r5, r4 ; calculate difference with reversed operands
sel r6, r6, lr ; select bytes with negative difference
; calculate partial sums
usad8 r4, r7, lr ; calculate sum of positive differences
usad8 r5, r6, lr ; calculate sum of negative differences
orr r6, r6, r7 ; differences of all 4 pixels
; calculate total sum
add r8, r8, r4 ; add positive differences to sum
sub r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
; 3rd 4 pixels
ldr r4, [r0, #8] ; load source pixels a, row N
ldr r6, [r0, #9] ; load source pixels b, row N
ldr r5, [r9, #8] ; load source pixels c, row N+1
smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
ldr r7, [r9, #9] ; load source pixels d, row N+1
; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
mvn r6, r6
uhsub8 r4, r4, r6
eor r4, r4, r10
; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
mvn r7, r7
uhsub8 r5, r5, r7
eor r5, r5, r10
; z = (x + y + 1) >> 1, interpolate half pixel values vertically
mvn r5, r5
uhsub8 r4, r4, r5
ldr r5, [r2, #8] ; load 4 ref pixels
eor r4, r4, r10
usub8 r6, r4, r5 ; calculate difference
sel r7, r6, lr ; select bytes with positive difference
usub8 r6, r5, r4 ; calculate difference with reversed operands
sel r6, r6, lr ; select bytes with negative difference
; calculate partial sums
usad8 r4, r7, lr ; calculate sum of positive differences
usad8 r5, r6, lr ; calculate sum of negative differences
orr r6, r6, r7 ; differences of all 4 pixels
; calculate total sum
add r8, r8, r4 ; add positive differences to sum
sub r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
; 4th 4 pixels
ldr r4, [r0, #12] ; load source pixels a, row N
ldr r6, [r0, #13] ; load source pixels b, row N
ldr r5, [r9, #12] ; load source pixels c, row N+1
smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
ldr r7, [r9, #13] ; load source pixels d, row N+1
; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
mvn r6, r6
uhsub8 r4, r4, r6
eor r4, r4, r10
; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
mvn r7, r7
uhsub8 r5, r5, r7
eor r5, r5, r10
; z = (x + y + 1) >> 1, interpolate half pixel values vertically
mvn r5, r5
uhsub8 r4, r4, r5
ldr r5, [r2, #12] ; load 4 ref pixels
eor r4, r4, r10
usub8 r6, r4, r5 ; calculate difference
add r0, r0, r1 ; set src_ptr to next row
sel r7, r6, lr ; select bytes with positive difference
usub8 r6, r5, r4 ; calculate difference with reversed operands
add r2, r2, r3 ; set dst_ptr to next row
sel r6, r6, lr ; select bytes with negative difference
; calculate partial sums
usad8 r4, r7, lr ; calculate sum of positive differences
usad8 r5, r6, lr ; calculate sum of negative differences
orr r6, r6, r7 ; differences of all 4 pixels
; calculate total sum
add r8, r8, r4 ; add positive differences to sum
sub r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
subs r12, r12, #1
smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
bne loop
; return stuff
ldr r6, [sp, #40] ; get address of sse
mul r0, r8, r8 ; sum * sum
str r11, [r6] ; store sse
sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))
ldmfd sp!, {r4-r12, pc}
ENDP
c80808080
DCD 0x80808080
END


@@ -1,187 +0,0 @@
;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;
;
EXPORT |aom_variance_halfpixvar16x16_v_media|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 unsigned char *src_ptr
; r1 int source_stride
; r2 unsigned char *ref_ptr
; r3 int recon_stride
; stack unsigned int *sse
|aom_variance_halfpixvar16x16_v_media| PROC
stmfd sp!, {r4-r12, lr}
pld [r0, r1, lsl #0]
pld [r2, r3, lsl #0]
mov r8, #0 ; initialize sum = 0
ldr r10, c80808080
mov r11, #0 ; initialize sse = 0
mov r12, #16 ; set loop counter to 16 (=block height)
mov lr, #0 ; constant zero
loop
add r9, r0, r1 ; set src pointer to next row
; 1st 4 pixels
ldr r4, [r0, #0] ; load 4 src pixels
ldr r6, [r9, #0] ; load 4 src pixels from next row
ldr r5, [r2, #0] ; load 4 ref pixels
; bilinear interpolation
mvn r6, r6
uhsub8 r4, r4, r6
eor r4, r4, r10
usub8 r6, r4, r5 ; calculate difference
pld [r0, r1, lsl #1]
sel r7, r6, lr ; select bytes with positive difference
usub8 r6, r5, r4 ; calculate difference with reversed operands
pld [r2, r3, lsl #1]
sel r6, r6, lr ; select bytes with negative difference
; calculate partial sums
usad8 r4, r7, lr ; calculate sum of positive differences
usad8 r5, r6, lr ; calculate sum of negative differences
orr r6, r6, r7 ; differences of all 4 pixels
; calculate total sum
adds r8, r8, r4 ; add positive differences to sum
subs r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
; 2nd 4 pixels
ldr r4, [r0, #4] ; load 4 src pixels
ldr r6, [r9, #4] ; load 4 src pixels from next row
ldr r5, [r2, #4] ; load 4 ref pixels
; bilinear interpolation
mvn r6, r6
uhsub8 r4, r4, r6
eor r4, r4, r10
smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
usub8 r6, r4, r5 ; calculate difference
sel r7, r6, lr ; select bytes with positive difference
usub8 r6, r5, r4 ; calculate difference with reversed operands
sel r6, r6, lr ; select bytes with negative difference
; calculate partial sums
usad8 r4, r7, lr ; calculate sum of positive differences
usad8 r5, r6, lr ; calculate sum of negative differences
orr r6, r6, r7 ; differences of all 4 pixels
; calculate total sum
add r8, r8, r4 ; add positive differences to sum
sub r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
; 3rd 4 pixels
ldr r4, [r0, #8] ; load 4 src pixels
ldr r6, [r9, #8] ; load 4 src pixels from next row
ldr r5, [r2, #8] ; load 4 ref pixels
; bilinear interpolation
mvn r6, r6
uhsub8 r4, r4, r6
eor r4, r4, r10
smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
usub8 r6, r4, r5 ; calculate difference
sel r7, r6, lr ; select bytes with positive difference
usub8 r6, r5, r4 ; calculate difference with reversed operands
sel r6, r6, lr ; select bytes with negative difference
; calculate partial sums
usad8 r4, r7, lr ; calculate sum of positive differences
usad8 r5, r6, lr ; calculate sum of negative differences
orr r6, r6, r7 ; differences of all 4 pixels
; calculate total sum
add r8, r8, r4 ; add positive differences to sum
sub r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
; 4th 4 pixels
ldr r4, [r0, #12] ; load 4 src pixels
ldr r6, [r9, #12] ; load 4 src pixels from next row
ldr r5, [r2, #12] ; load 4 ref pixels
; bilinear interpolation
mvn r6, r6
uhsub8 r4, r4, r6
eor r4, r4, r10
smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
usub8 r6, r4, r5 ; calculate difference
add r0, r0, r1 ; set src_ptr to next row
sel r7, r6, lr ; select bytes with positive difference
usub8 r6, r5, r4 ; calculate difference with reversed operands
add r2, r2, r3 ; set dst_ptr to next row
sel r6, r6, lr ; select bytes with negative difference
; calculate partial sums
usad8 r4, r7, lr ; calculate sum of positive differences
usad8 r5, r6, lr ; calculate sum of negative differences
orr r6, r6, r7 ; differences of all 4 pixels
; calculate total sum
add r8, r8, r4 ; add positive differences to sum
sub r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
subs r12, r12, #1
bne loop
; return stuff
ldr r6, [sp, #40] ; get address of sse
mul r0, r8, r8 ; sum * sum
str r11, [r6] ; store sse
sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))
ldmfd sp!, {r4-r12, pc}
ENDP
c80808080
DCD 0x80808080
END


@@ -1,361 +0,0 @@
;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;
;
EXPORT |aom_variance16x16_media|
EXPORT |aom_variance8x8_media|
EXPORT |aom_mse16x16_media|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 unsigned char *src_ptr
; r1 int source_stride
; r2 unsigned char *ref_ptr
; r3 int recon_stride
; stack unsigned int *sse
|aom_variance16x16_media| PROC
stmfd sp!, {r4-r12, lr}
pld [r0, r1, lsl #0]
pld [r2, r3, lsl #0]
mov r8, #0 ; initialize sum = 0
mov r11, #0 ; initialize sse = 0
mov r12, #16 ; set loop counter to 16 (=block height)
loop16x16
; 1st 4 pixels
ldr r4, [r0, #0] ; load 4 src pixels
ldr r5, [r2, #0] ; load 4 ref pixels
mov lr, #0 ; constant zero
usub8 r6, r4, r5 ; calculate difference
pld [r0, r1, lsl #1]
sel r7, r6, lr ; select bytes with positive difference
usub8 r9, r5, r4 ; calculate difference with reversed operands
pld [r2, r3, lsl #1]
sel r6, r9, lr ; select bytes with negative difference
; calculate partial sums
usad8 r4, r7, lr ; calculate sum of positive differences
usad8 r5, r6, lr ; calculate sum of negative differences
orr r6, r6, r7 ; differences of all 4 pixels
; calculate total sum
adds r8, r8, r4 ; add positive differences to sum
subs r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
uxtb16 r10, r6, ror #8 ; another two pixels to halfwords
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
; 2nd 4 pixels
ldr r4, [r0, #4] ; load 4 src pixels
ldr r5, [r2, #4] ; load 4 ref pixels
smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
usub8 r6, r4, r5 ; calculate difference
sel r7, r6, lr ; select bytes with positive difference
usub8 r9, r5, r4 ; calculate difference with reversed operands
sel r6, r9, lr ; select bytes with negative difference
; calculate partial sums
usad8 r4, r7, lr ; calculate sum of positive differences
usad8 r5, r6, lr ; calculate sum of negative differences
orr r6, r6, r7 ; differences of all 4 pixels
; calculate total sum
add r8, r8, r4 ; add positive differences to sum
sub r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
uxtb16 r10, r6, ror #8 ; another two pixels to halfwords
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
; 3rd 4 pixels
ldr r4, [r0, #8] ; load 4 src pixels
ldr r5, [r2, #8] ; load 4 ref pixels
smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
usub8 r6, r4, r5 ; calculate difference
sel r7, r6, lr ; select bytes with positive difference
usub8 r9, r5, r4 ; calculate difference with reversed operands
sel r6, r9, lr ; select bytes with negative difference
; calculate partial sums
usad8 r4, r7, lr ; calculate sum of positive differences
usad8 r5, r6, lr ; calculate sum of negative differences
orr r6, r6, r7 ; differences of all 4 pixels
; calculate total sum
add r8, r8, r4 ; add positive differences to sum
sub r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
uxtb16 r10, r6, ror #8 ; another two pixels to halfwords
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
; 4th 4 pixels
ldr r4, [r0, #12] ; load 4 src pixels
ldr r5, [r2, #12] ; load 4 ref pixels
smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
usub8 r6, r4, r5 ; calculate difference
add r0, r0, r1 ; set src_ptr to next row
sel r7, r6, lr ; select bytes with positive difference
usub8 r9, r5, r4 ; calculate difference with reversed operands
add r2, r2, r3 ; set dst_ptr to next row
sel r6, r9, lr ; select bytes with negative difference
; calculate partial sums
usad8 r4, r7, lr ; calculate sum of positive differences
usad8 r5, r6, lr ; calculate sum of negative differences
orr r6, r6, r7 ; differences of all 4 pixels
; calculate total sum
add r8, r8, r4 ; add positive differences to sum
sub r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
uxtb16 r10, r6, ror #8 ; another two pixels to halfwords
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
subs r12, r12, #1
bne loop16x16
; return stuff
ldr r6, [sp, #40] ; get address of sse
mul r0, r8, r8 ; sum * sum
str r11, [r6] ; store sse
sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))
ldmfd sp!, {r4-r12, pc}
ENDP
; r0 unsigned char *src_ptr
; r1 int source_stride
; r2 unsigned char *ref_ptr
; r3 int recon_stride
; stack unsigned int *sse
|aom_variance8x8_media| PROC
push {r4-r10, lr}
pld [r0, r1, lsl #0]
pld [r2, r3, lsl #0]
mov r12, #8 ; set loop counter to 8 (=block height)
mov r4, #0 ; initialize sum = 0
mov r5, #0 ; initialize sse = 0
loop8x8
; 1st 4 pixels
ldr r6, [r0, #0x0] ; load 4 src pixels
ldr r7, [r2, #0x0] ; load 4 ref pixels
mov lr, #0 ; constant zero
usub8 r8, r6, r7 ; calculate difference
pld [r0, r1, lsl #1]
sel r10, r8, lr ; select bytes with positive difference
usub8 r9, r7, r6 ; calculate difference with reversed operands
pld [r2, r3, lsl #1]
sel r8, r9, lr ; select bytes with negative difference
; calculate partial sums
usad8 r6, r10, lr ; calculate sum of positive differences
usad8 r7, r8, lr ; calculate sum of negative differences
orr r8, r8, r10 ; differences of all 4 pixels
; calculate total sum
add r4, r4, r6 ; add positive differences to sum
sub r4, r4, r7 ; subtract negative differences from sum
; calculate sse
uxtb16 r7, r8 ; byte (two pixels) to halfwords
uxtb16 r10, r8, ror #8 ; another two pixels to halfwords
smlad r5, r7, r7, r5 ; dual signed multiply, add and accumulate (1)
; 2nd 4 pixels
ldr r6, [r0, #0x4] ; load 4 src pixels
ldr r7, [r2, #0x4] ; load 4 ref pixels
smlad r5, r10, r10, r5 ; dual signed multiply, add and accumulate (2)
usub8 r8, r6, r7 ; calculate difference
add r0, r0, r1 ; set src_ptr to next row
sel r10, r8, lr ; select bytes with positive difference
usub8 r9, r7, r6 ; calculate difference with reversed operands
add r2, r2, r3 ; set dst_ptr to next row
sel r8, r9, lr ; select bytes with negative difference
; calculate partial sums
usad8 r6, r10, lr ; calculate sum of positive differences
usad8 r7, r8, lr ; calculate sum of negative differences
orr r8, r8, r10 ; differences of all 4 pixels
; calculate total sum
add r4, r4, r6 ; add positive differences to sum
sub r4, r4, r7 ; subtract negative differences from sum
; calculate sse
uxtb16 r7, r8 ; byte (two pixels) to halfwords
uxtb16 r10, r8, ror #8 ; another two pixels to halfwords
smlad r5, r7, r7, r5 ; dual signed multiply, add and accumulate (1)
subs r12, r12, #1 ; next row
smlad r5, r10, r10, r5 ; dual signed multiply, add and accumulate (2)
bne loop8x8
; return stuff
ldr r8, [sp, #32] ; get address of sse
mul r1, r4, r4 ; sum * sum
str r5, [r8] ; store sse
sub r0, r5, r1, ASR #6 ; return (sse - ((sum * sum) >> 6))
pop {r4-r10, pc}
ENDP
; r0 unsigned char *src_ptr
; r1 int source_stride
; r2 unsigned char *ref_ptr
; r3 int recon_stride
; stack unsigned int *sse
;
;note: Based on aom_variance16x16_media. In this function, the sum is never
; used, so that part of the calculation has been removed.
|aom_mse16x16_media| PROC
push {r4-r9, lr}
pld [r0, r1, lsl #0]
pld [r2, r3, lsl #0]
mov r12, #16 ; set loop counter to 16 (=block height)
mov r4, #0 ; initialize sse = 0
loopmse
; 1st 4 pixels
ldr r5, [r0, #0x0] ; load 4 src pixels
ldr r6, [r2, #0x0] ; load 4 ref pixels
mov lr, #0 ; constant zero
usub8 r8, r5, r6 ; calculate difference
pld [r0, r1, lsl #1]
sel r7, r8, lr ; select bytes with positive difference
usub8 r9, r6, r5 ; calculate difference with reversed operands
pld [r2, r3, lsl #1]
sel r8, r9, lr ; select bytes with negative difference
; calculate partial sums
usad8 r5, r7, lr ; calculate sum of positive differences
usad8 r6, r8, lr ; calculate sum of negative differences
orr r8, r8, r7 ; differences of all 4 pixels
ldr r5, [r0, #0x4] ; load 4 src pixels
; calculate sse
uxtb16 r6, r8 ; byte (two pixels) to halfwords
uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
; 2nd 4 pixels
ldr r6, [r2, #0x4] ; load 4 ref pixels
smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
usub8 r8, r5, r6 ; calculate difference
sel r7, r8, lr ; select bytes with positive difference
usub8 r9, r6, r5 ; calculate difference with reversed operands
sel r8, r9, lr ; select bytes with negative difference
; calculate partial sums
usad8 r5, r7, lr ; calculate sum of positive differences
usad8 r6, r8, lr ; calculate sum of negative differences
orr r8, r8, r7 ; differences of all 4 pixels
ldr r5, [r0, #0x8] ; load 4 src pixels
; calculate sse
uxtb16 r6, r8 ; byte (two pixels) to halfwords
uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
; 3rd 4 pixels
ldr r6, [r2, #0x8] ; load 4 ref pixels
smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
usub8 r8, r5, r6 ; calculate difference
sel r7, r8, lr ; select bytes with positive difference
usub8 r9, r6, r5 ; calculate difference with reversed operands
sel r8, r9, lr ; select bytes with negative difference
; calculate partial sums
usad8 r5, r7, lr ; calculate sum of positive differences
usad8 r6, r8, lr ; calculate sum of negative differences
orr r8, r8, r7 ; differences of all 4 pixels
ldr r5, [r0, #0xc] ; load 4 src pixels
; calculate sse
uxtb16 r6, r8 ; byte (two pixels) to halfwords
uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
; 4th 4 pixels
ldr r6, [r2, #0xc] ; load 4 ref pixels
smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
usub8 r8, r5, r6 ; calculate difference
add r0, r0, r1 ; set src_ptr to next row
sel r7, r8, lr ; select bytes with positive difference
usub8 r9, r6, r5 ; calculate difference with reversed operands
add r2, r2, r3 ; set dst_ptr to next row
sel r8, r9, lr ; select bytes with negative difference
; calculate partial sums
usad8 r5, r7, lr ; calculate sum of positive differences
usad8 r6, r8, lr ; calculate sum of negative differences
orr r8, r8, r7 ; differences of all 4 pixels
subs r12, r12, #1 ; next row
; calculate sse
uxtb16 r6, r8 ; byte (two pixels) to halfwords
uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
bne loopmse
; return stuff
ldr r1, [sp, #28] ; get address of sse
mov r0, r4 ; return sse
str r4, [r1] ; store sse
pop {r4-r9, pc}
ENDP
END
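For reference, an editor's C sketch (not part of the diff) of the scalar
identity the routines above compute: variance is the SSE minus the squared
pixel sum scaled by the block size -- >> 8 for 16x16 (256 pixels), >> 6 for
8x8 (64 pixels).

#include <stdint.h>

static unsigned int illustrative_variance16x16(const uint8_t *src,
                                               int src_stride,
                                               const uint8_t *ref,
                                               int ref_stride,
                                               unsigned int *sse) {
  int r, c, sum = 0;
  unsigned int s = 0;
  for (r = 0; r < 16; ++r) {
    for (c = 0; c < 16; ++c) {
      const int d = src[r * src_stride + c] - ref[r * ref_stride + c];
      sum += d;
      s += (unsigned int)(d * d);
    }
  }
  *sse = s;
  return s - (unsigned int)(((int64_t)sum * sum) >> 8);
}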


@@ -1,240 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifndef AOM_DSP_BITREADER_H_
#define AOM_DSP_BITREADER_H_
#include <assert.h>
#include <limits.h>
#include "./aom_config.h"
#if CONFIG_EC_ADAPT && !CONFIG_EC_MULTISYMBOL
#error "CONFIG_EC_ADAPT is enabled without enabling CONFIG_EC_MULTISYMBOL."
#endif
#include "aom/aomdx.h"
#include "aom/aom_integer.h"
#if CONFIG_ANS
#include "aom_dsp/ansreader.h"
#elif CONFIG_DAALA_EC
#include "aom_dsp/daalaboolreader.h"
#else
#include "aom_dsp/dkboolreader.h"
#endif
#include "aom_dsp/prob.h"
#include "av1/common/odintrin.h"
#if CONFIG_ACCOUNTING
#include "av1/common/accounting.h"
#define ACCT_STR_NAME acct_str
#define ACCT_STR_PARAM , const char *ACCT_STR_NAME
#define ACCT_STR_ARG(s) , s
#else
#define ACCT_STR_PARAM
#define ACCT_STR_ARG(s)
#endif
#define aom_read(r, prob, ACCT_STR_NAME) \
aom_read_(r, prob ACCT_STR_ARG(ACCT_STR_NAME))
#define aom_read_bit(r, ACCT_STR_NAME) \
aom_read_bit_(r ACCT_STR_ARG(ACCT_STR_NAME))
#define aom_read_tree(r, tree, probs, ACCT_STR_NAME) \
aom_read_tree_(r, tree, probs ACCT_STR_ARG(ACCT_STR_NAME))
#define aom_read_literal(r, bits, ACCT_STR_NAME) \
aom_read_literal_(r, bits ACCT_STR_ARG(ACCT_STR_NAME))
#define aom_read_tree_bits(r, tree, probs, ACCT_STR_NAME) \
aom_read_tree_bits_(r, tree, probs ACCT_STR_ARG(ACCT_STR_NAME))
#define aom_read_symbol(r, cdf, nsymbs, ACCT_STR_NAME) \
aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
#ifdef __cplusplus
extern "C" {
#endif
#if CONFIG_ANS
typedef struct AnsDecoder aom_reader;
#elif CONFIG_DAALA_EC
typedef struct daala_reader aom_reader;
#else
typedef struct aom_dk_reader aom_reader;
#endif
static INLINE int aom_reader_init(aom_reader *r, const uint8_t *buffer,
size_t size, aom_decrypt_cb decrypt_cb,
void *decrypt_state) {
#if CONFIG_ANS
(void)decrypt_cb;
(void)decrypt_state;
assert(size <= INT_MAX);
return ans_read_init(r, buffer, size);
#elif CONFIG_DAALA_EC
(void)decrypt_cb;
(void)decrypt_state;
return aom_daala_reader_init(r, buffer, size);
#else
return aom_dk_reader_init(r, buffer, size, decrypt_cb, decrypt_state);
#endif
}
static INLINE const uint8_t *aom_reader_find_end(aom_reader *r) {
#if CONFIG_ANS
(void)r;
assert(0 && "Use the raw buffer size with ANS");
return NULL;
#elif CONFIG_DAALA_EC
return aom_daala_reader_find_end(r);
#else
return aom_dk_reader_find_end(r);
#endif
}
static INLINE int aom_reader_has_error(aom_reader *r) {
#if CONFIG_ANS
return ans_reader_has_error(r);
#elif CONFIG_DAALA_EC
return aom_daala_reader_has_error(r);
#else
return aom_dk_reader_has_error(r);
#endif
}
// Returns the position in the bit reader in bits.
static INLINE uint32_t aom_reader_tell(const aom_reader *r) {
#if CONFIG_ANS
(void)r;
assert(0 && "aom_reader_tell() is unimplemented for ANS");
return 0;
#elif CONFIG_DAALA_EC
return aom_daala_reader_tell(r);
#else
return aom_dk_reader_tell(r);
#endif
}
// Returns the position in the bit reader in 1/8th bits.
static INLINE uint32_t aom_reader_tell_frac(const aom_reader *r) {
#if CONFIG_ANS
(void)r;
assert(0 && "aom_reader_tell_frac() is unimplemented for ANS");
return 0;
#elif CONFIG_DAALA_EC
return aom_daala_reader_tell_frac(r);
#else
return aom_dk_reader_tell_frac(r);
#endif
}
#if CONFIG_ACCOUNTING
static INLINE void aom_process_accounting(const aom_reader *r ACCT_STR_PARAM) {
if (r->accounting != NULL) {
uint32_t tell_frac;
tell_frac = aom_reader_tell_frac(r);
aom_accounting_record(r->accounting, ACCT_STR_NAME,
tell_frac - r->accounting->last_tell_frac);
r->accounting->last_tell_frac = tell_frac;
}
}
#endif
static INLINE int aom_read_(aom_reader *r, int prob ACCT_STR_PARAM) {
int ret;
#if CONFIG_ANS
ret = uabs_read(r, prob);
#elif CONFIG_DAALA_EC
ret = aom_daala_read(r, prob);
#else
ret = aom_dk_read(r, prob);
#endif
#if CONFIG_ACCOUNTING
if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
#endif
return ret;
}
static INLINE int aom_read_bit_(aom_reader *r ACCT_STR_PARAM) {
int ret;
#if CONFIG_ANS
ret = uabs_read_bit(r); // Non-trivial optimization at half probability
#else
ret = aom_read(r, 128, NULL); // aom_prob_half
#endif
#if CONFIG_ACCOUNTING
if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
#endif
return ret;
}
static INLINE int aom_read_literal_(aom_reader *r, int bits ACCT_STR_PARAM) {
int literal = 0, bit;
for (bit = bits - 1; bit >= 0; bit--) literal |= aom_read_bit(r, NULL) << bit;
#if CONFIG_ACCOUNTING
if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
#endif
return literal;
}
static INLINE int aom_read_tree_bits_(aom_reader *r, const aom_tree_index *tree,
const aom_prob *probs ACCT_STR_PARAM) {
aom_tree_index i = 0;
while ((i = tree[i + aom_read(r, probs[i >> 1], NULL)]) > 0) continue;
#if CONFIG_ACCOUNTING
if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
#endif
return -i;
}
static INLINE int aom_read_tree_(aom_reader *r, const aom_tree_index *tree,
const aom_prob *probs ACCT_STR_PARAM) {
int ret;
#if CONFIG_DAALA_EC
ret = daala_read_tree_bits(r, tree, probs);
#else
ret = aom_read_tree_bits(r, tree, probs, NULL);
#endif
#if CONFIG_ACCOUNTING
if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
#endif
return ret;
}
#if CONFIG_EC_MULTISYMBOL
static INLINE int aom_read_symbol_(aom_reader *r, aom_cdf_prob *cdf,
int nsymbs ACCT_STR_PARAM) {
int ret;
#if CONFIG_RANS
(void)nsymbs;
ret = rans_read(r, cdf);
#elif CONFIG_DAALA_EC
ret = daala_read_symbol(r, cdf, nsymbs);
#else
#error \
"CONFIG_EC_MULTISYMBOL is selected without a valid backing entropy " \
"coder. Enable daala_ec or ans for a valid configuration."
#endif
#if CONFIG_EC_ADAPT
update_cdf(cdf, ret, nsymbs);
#endif
#if CONFIG_ACCOUNTING
if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
#endif
return ret;
}
#endif // CONFIG_EC_MULTISYMBOL
#ifdef __cplusplus
} // extern "C"
#endif
#endif // AOM_DSP_BITREADER_H_
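The ACCT_STR_* machinery above makes the accounting string a zero-cost optional argument. As an illustration (no new API implied): with CONFIG_ACCOUNTING enabled, aom_read(r, p, "split") expands to aom_read_(r, p, "split") and the string reaches aom_process_accounting(); with it disabled, ACCT_STR_ARG(s) expands to nothing and the same call compiles to aom_read_(r, p), so call sites are unchanged either way.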


@@ -1,47 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include "./aom_config.h"
#include "./bitreader_buffer.h"
size_t aom_rb_bytes_read(struct aom_read_bit_buffer *rb) {
return (rb->bit_offset + 7) >> 3;
}
int aom_rb_read_bit(struct aom_read_bit_buffer *rb) {
const size_t off = rb->bit_offset;
const size_t p = off >> 3;
const int q = 7 - (int)(off & 0x7);
if (rb->bit_buffer + p < rb->bit_buffer_end) {
const int bit = (rb->bit_buffer[p] >> q) & 1;
rb->bit_offset = off + 1;
return bit;
} else {
rb->error_handler(rb->error_handler_data);
return 0;
}
}
int aom_rb_read_literal(struct aom_read_bit_buffer *rb, int bits) {
int value = 0, bit;
for (bit = bits - 1; bit >= 0; bit--) value |= aom_rb_read_bit(rb) << bit;
return value;
}
int aom_rb_read_signed_literal(struct aom_read_bit_buffer *rb, int bits) {
const int value = aom_rb_read_literal(rb, bits);
return aom_rb_read_bit(rb) ? -value : value;
}
int aom_rb_read_inv_signed_literal(struct aom_read_bit_buffer *rb, int bits) {
const int nbits = sizeof(unsigned) * 8 - bits - 1;
const unsigned value = (unsigned)aom_rb_read_literal(rb, bits + 1) << nbits;
return ((int)value) >> nbits;
}
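The inverse-signed-literal read above is a standard shift-based sign extension: the (bits + 1)-bit raw value is moved to the top of the word and an arithmetic right shift replicates its sign bit. A minimal sketch of the same trick, using a hypothetical helper that takes the raw value directly instead of a bit buffer:

/* Illustration of the sign extension in aom_rb_read_inv_signed_literal();
   raw holds (bits + 1) freshly read bits. */
static int sign_extend(unsigned raw, int bits) {
  const int nbits = (int)sizeof(unsigned) * 8 - bits - 1;
  return (int)(raw << nbits) >> nbits; /* arithmetic shift copies the sign bit */
}
/* sign_extend(0x16, 4) == -10, since 0b10110 is -10 in 5-bit two's complement. */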


@@ -1,48 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifndef AOM_DSP_BITREADER_BUFFER_H_
#define AOM_DSP_BITREADER_BUFFER_H_
#include <limits.h>
#include "aom/aom_integer.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef void (*aom_rb_error_handler)(void *data);
struct aom_read_bit_buffer {
const uint8_t *bit_buffer;
const uint8_t *bit_buffer_end;
size_t bit_offset;
void *error_handler_data;
aom_rb_error_handler error_handler;
};
size_t aom_rb_bytes_read(struct aom_read_bit_buffer *rb);
int aom_rb_read_bit(struct aom_read_bit_buffer *rb);
int aom_rb_read_literal(struct aom_read_bit_buffer *rb, int bits);
int aom_rb_read_signed_literal(struct aom_read_bit_buffer *rb, int bits);
int aom_rb_read_inv_signed_literal(struct aom_read_bit_buffer *rb, int bits);
#ifdef __cplusplus
} // extern "C"
#endif
#endif // AOM_DSP_BITREADER_BUFFER_H_


@@ -1,179 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifndef AOM_DSP_BITWRITER_H_
#define AOM_DSP_BITWRITER_H_
#include <assert.h>
#include "./aom_config.h"
#if CONFIG_EC_ADAPT && !CONFIG_EC_MULTISYMBOL
#error "CONFIG_EC_ADAPT is enabled without enabling CONFIG_EC_MULTISYMBOL"
#endif
#if CONFIG_ANS
#include "aom_dsp/buf_ans.h"
#elif CONFIG_DAALA_EC
#include "aom_dsp/daalaboolwriter.h"
#else
#include "aom_dsp/dkboolwriter.h"
#endif
#include "aom_dsp/prob.h"
#if CONFIG_RD_DEBUG
#include "av1/encoder/cost.h"
#endif
#ifdef __cplusplus
extern "C" {
#endif
#if CONFIG_ANS
typedef struct BufAnsCoder aom_writer;
#elif CONFIG_DAALA_EC
typedef struct daala_writer aom_writer;
#else
typedef struct aom_dk_writer aom_writer;
#endif
typedef struct TOKEN_STATS { int64_t cost; } TOKEN_STATS;
static INLINE void aom_start_encode(aom_writer *bc, uint8_t *buffer) {
#if CONFIG_ANS
(void)bc;
(void)buffer;
assert(0 && "buf_ans requires a more complicated startup procedure");
#elif CONFIG_DAALA_EC
aom_daala_start_encode(bc, buffer);
#else
aom_dk_start_encode(bc, buffer);
#endif
}
static INLINE void aom_stop_encode(aom_writer *bc) {
#if CONFIG_ANS
(void)bc;
assert(0 && "buf_ans requires a more complicated shutdown procedure");
#elif CONFIG_DAALA_EC
aom_daala_stop_encode(bc);
#else
aom_dk_stop_encode(bc);
#endif
}
static INLINE void aom_write(aom_writer *br, int bit, int probability) {
#if CONFIG_ANS
buf_uabs_write(br, bit, probability);
#elif CONFIG_DAALA_EC
aom_daala_write(br, bit, probability);
#else
aom_dk_write(br, bit, probability);
#endif
}
static INLINE void aom_write_record(aom_writer *br, int bit, int probability,
TOKEN_STATS *token_stats) {
aom_write(br, bit, probability);
#if CONFIG_RD_DEBUG
token_stats->cost += av1_cost_bit(probability, bit);
#else
(void)token_stats;
#endif
}
static INLINE void aom_write_bit(aom_writer *w, int bit) {
aom_write(w, bit, 128); // aom_prob_half
}
static INLINE void aom_write_bit_record(aom_writer *w, int bit,
TOKEN_STATS *token_stats) {
aom_write_record(w, bit, 128, token_stats); // aom_prob_half
}
static INLINE void aom_write_literal(aom_writer *w, int data, int bits) {
int bit;
for (bit = bits - 1; bit >= 0; bit--) aom_write_bit(w, 1 & (data >> bit));
}
static INLINE void aom_write_tree_bits(aom_writer *w, const aom_tree_index *tr,
const aom_prob *probs, int bits, int len,
aom_tree_index i) {
do {
const int bit = (bits >> --len) & 1;
aom_write(w, bit, probs[i >> 1]);
i = tr[i + bit];
} while (len);
}
static INLINE void aom_write_tree_bits_record(aom_writer *w,
const aom_tree_index *tr,
const aom_prob *probs, int bits,
int len, aom_tree_index i,
TOKEN_STATS *token_stats) {
do {
const int bit = (bits >> --len) & 1;
aom_write_record(w, bit, probs[i >> 1], token_stats);
i = tr[i + bit];
} while (len);
}
static INLINE void aom_write_tree(aom_writer *w, const aom_tree_index *tree,
const aom_prob *probs, int bits, int len,
aom_tree_index i) {
#if CONFIG_DAALA_EC
daala_write_tree_bits(w, tree, probs, bits, len, i);
#else
aom_write_tree_bits(w, tree, probs, bits, len, i);
#endif
}
static INLINE void aom_write_tree_record(aom_writer *w,
const aom_tree_index *tree,
const aom_prob *probs, int bits,
int len, aom_tree_index i,
TOKEN_STATS *token_stats) {
#if CONFIG_DAALA_EC
(void)token_stats;
daala_write_tree_bits(w, tree, probs, bits, len, i);
#else
aom_write_tree_bits_record(w, tree, probs, bits, len, i, token_stats);
#endif
}
#if CONFIG_EC_MULTISYMBOL
static INLINE void aom_write_symbol(aom_writer *w, int symb, aom_cdf_prob *cdf,
int nsymbs) {
#if CONFIG_RANS
struct rans_sym s;
(void)nsymbs;
assert(cdf);
s.cum_prob = symb > 0 ? cdf[symb - 1] : 0;
s.prob = cdf[symb] - s.cum_prob;
buf_rans_write(w, &s);
#elif CONFIG_DAALA_EC
daala_write_symbol(w, symb, cdf, nsymbs);
#else
#error \
"CONFIG_EC_MULTISYMBOL is selected without a valid backing entropy " \
"coder. Enable daala_ec or ans for a valid configuration."
#endif
#if CONFIG_EC_ADAPT
update_cdf(cdf, symb, nsymbs);
#endif
}
#endif // CONFIG_EC_MULTISYMBOL
#ifdef __cplusplus
} // extern "C"
#endif
#endif // AOM_DSP_BITWRITER_H_
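A small worked example of the literal writer above (values chosen for illustration): aom_write_literal(w, 5, 4) emits the bits 0, 1, 0, 1, most significant first, each at probability 128, which is exactly the sequence the matching read-literal loop on the decoder side reassembles.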


@@ -1,43 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <limits.h>
#include <stdlib.h>
#include "./aom_config.h"
#include "./bitwriter_buffer.h"
size_t aom_wb_bytes_written(const struct aom_write_bit_buffer *wb) {
return wb->bit_offset / CHAR_BIT + (wb->bit_offset % CHAR_BIT > 0);
}
void aom_wb_write_bit(struct aom_write_bit_buffer *wb, int bit) {
const int off = (int)wb->bit_offset;
const int p = off / CHAR_BIT;
const int q = CHAR_BIT - 1 - off % CHAR_BIT;
if (q == CHAR_BIT - 1) {
wb->bit_buffer[p] = bit << q;
} else {
wb->bit_buffer[p] &= ~(1 << q);
wb->bit_buffer[p] |= bit << q;
}
wb->bit_offset = off + 1;
}
void aom_wb_write_literal(struct aom_write_bit_buffer *wb, int data, int bits) {
int bit;
for (bit = bits - 1; bit >= 0; bit--) aom_wb_write_bit(wb, (data >> bit) & 1);
}
void aom_wb_write_inv_signed_literal(struct aom_write_bit_buffer *wb, int data,
int bits) {
aom_wb_write_literal(wb, data, bits + 1);
}
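The inverse-signed-literal writer pairs with the reader shown earlier: it stores the value in (bits + 1)-bit two's complement. For example, aom_wb_write_inv_signed_literal(wb, -10, 4) writes the low five bits of -10 (0b10110), which the reader's shift-based sign extension recovers as -10.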


@@ -1,39 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifndef AOM_DSP_BITWRITER_BUFFER_H_
#define AOM_DSP_BITWRITER_BUFFER_H_
#include "aom/aom_integer.h"
#ifdef __cplusplus
extern "C" {
#endif
struct aom_write_bit_buffer {
uint8_t *bit_buffer;
size_t bit_offset;
};
size_t aom_wb_bytes_written(const struct aom_write_bit_buffer *wb);
void aom_wb_write_bit(struct aom_write_bit_buffer *wb, int bit);
void aom_wb_write_literal(struct aom_write_bit_buffer *wb, int data, int bits);
void aom_wb_write_inv_signed_literal(struct aom_write_bit_buffer *wb, int data,
int bits);
#ifdef __cplusplus
} // extern "C"
#endif
#endif // AOM_DSP_BITWRITER_BUFFER_H_


@@ -1,42 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifndef AOM_DSP_BLEND_H_
#define AOM_DSP_BLEND_H_
#include "aom_ports/mem.h"
// Various blending functions and macros.
// See also the aom_blend_* functions in aom_dsp_rtcd.h
// Alpha blending with alpha values from the range [0, 64], where 64
// means use the first input and 0 means use the second input.
#define AOM_BLEND_A64_ROUND_BITS 6
#define AOM_BLEND_A64_MAX_ALPHA (1 << AOM_BLEND_A64_ROUND_BITS) // 64
#define AOM_BLEND_A64(a, v0, v1) \
ROUND_POWER_OF_TWO((a) * (v0) + (AOM_BLEND_A64_MAX_ALPHA - (a)) * (v1), \
AOM_BLEND_A64_ROUND_BITS)
// Alpha blending with alpha values from the range [0, 256], where 256
// means use the first input and 0 means use the second input.
#define AOM_BLEND_A256_ROUND_BITS 8
#define AOM_BLEND_A256_MAX_ALPHA (1 << AOM_BLEND_A256_ROUND_BITS) // 256
#define AOM_BLEND_A256(a, v0, v1) \
ROUND_POWER_OF_TWO((a) * (v0) + (AOM_BLEND_A256_MAX_ALPHA - (a)) * (v1), \
AOM_BLEND_A256_ROUND_BITS)
// Blending by averaging.
#define AOM_BLEND_AVG(v0, v1) ROUND_POWER_OF_TWO((v0) + (v1), 1)
#endif // AOM_DSP_BLEND_H_
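A quick numeric check of the A64 macro (illustrative values only): with a = 48, v0 = 200 and v1 = 100, AOM_BLEND_A64 evaluates ROUND_POWER_OF_TWO(48 * 200 + 16 * 100, 6) = (9600 + 1600 + 32) >> 6 = 175, i.e. three quarters of the way from the second input to the first, as the [0, 64] alpha convention implies.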


@@ -1,71 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <assert.h>
#include "aom/aom_integer.h"
#include "aom_ports/mem.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/blend.h"
#include "./aom_dsp_rtcd.h"
void aom_blend_a64_hmask_c(uint8_t *dst, uint32_t dst_stride,
const uint8_t *src0, uint32_t src0_stride,
const uint8_t *src1, uint32_t src1_stride,
const uint8_t *mask, int h, int w) {
int i, j;
assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
assert(h >= 1);
assert(w >= 1);
assert(IS_POWER_OF_TWO(h));
assert(IS_POWER_OF_TWO(w));
for (i = 0; i < h; ++i) {
for (j = 0; j < w; ++j) {
dst[i * dst_stride + j] = AOM_BLEND_A64(
mask[j], src0[i * src0_stride + j], src1[i * src1_stride + j]);
}
}
}
#if CONFIG_AOM_HIGHBITDEPTH
void aom_highbd_blend_a64_hmask_c(uint8_t *dst_8, uint32_t dst_stride,
const uint8_t *src0_8, uint32_t src0_stride,
const uint8_t *src1_8, uint32_t src1_stride,
const uint8_t *mask, int h, int w, int bd) {
int i, j;
uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8);
const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8);
(void)bd;
assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
assert(h >= 1);
assert(w >= 1);
assert(IS_POWER_OF_TWO(h));
assert(IS_POWER_OF_TWO(w));
assert(bd == 8 || bd == 10 || bd == 12);
for (i = 0; i < h; ++i) {
for (j = 0; j < w; ++j) {
dst[i * dst_stride + j] = AOM_BLEND_A64(
mask[j], src0[i * src0_stride + j], src1[i * src1_stride + j]);
}
}
}
#endif // CONFIG_AOM_HIGHBITDEPTH


@@ -1,145 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <assert.h>
#include "aom/aom_integer.h"
#include "aom_ports/mem.h"
#include "aom_dsp/blend.h"
#include "aom_dsp/aom_dsp_common.h"
#include "./aom_dsp_rtcd.h"
// Blending with alpha mask. Mask values come from the range [0, 64],
// as described for AOM_BLEND_A64 in aom_dsp/blend.h. src0 or src1 can
// be the same as dst, or dst can be different from both sources.
void aom_blend_a64_mask_c(uint8_t *dst, uint32_t dst_stride,
const uint8_t *src0, uint32_t src0_stride,
const uint8_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride, int h,
int w, int subh, int subw) {
int i, j;
assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
assert(h >= 1);
assert(w >= 1);
assert(IS_POWER_OF_TWO(h));
assert(IS_POWER_OF_TWO(w));
if (subw == 0 && subh == 0) {
for (i = 0; i < h; ++i) {
for (j = 0; j < w; ++j) {
const int m = mask[i * mask_stride + j];
dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
src1[i * src1_stride + j]);
}
}
} else if (subw == 1 && subh == 1) {
for (i = 0; i < h; ++i) {
for (j = 0; j < w; ++j) {
const int m = ROUND_POWER_OF_TWO(
mask[(2 * i) * mask_stride + (2 * j)] +
mask[(2 * i + 1) * mask_stride + (2 * j)] +
mask[(2 * i) * mask_stride + (2 * j + 1)] +
mask[(2 * i + 1) * mask_stride + (2 * j + 1)],
2);
dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
src1[i * src1_stride + j]);
}
}
} else if (subw == 1 && subh == 0) {
for (i = 0; i < h; ++i) {
for (j = 0; j < w; ++j) {
const int m = AOM_BLEND_AVG(mask[i * mask_stride + (2 * j)],
mask[i * mask_stride + (2 * j + 1)]);
dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
src1[i * src1_stride + j]);
}
}
} else {
for (i = 0; i < h; ++i) {
for (j = 0; j < w; ++j) {
const int m = AOM_BLEND_AVG(mask[(2 * i) * mask_stride + j],
mask[(2 * i + 1) * mask_stride + j]);
dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
src1[i * src1_stride + j]);
}
}
}
}
#if CONFIG_AOM_HIGHBITDEPTH
void aom_highbd_blend_a64_mask_c(uint8_t *dst_8, uint32_t dst_stride,
const uint8_t *src0_8, uint32_t src0_stride,
const uint8_t *src1_8, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride,
int h, int w, int subh, int subw, int bd) {
int i, j;
uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8);
const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8);
(void)bd;
assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
assert(h >= 1);
assert(w >= 1);
assert(IS_POWER_OF_TWO(h));
assert(IS_POWER_OF_TWO(w));
assert(bd == 8 || bd == 10 || bd == 12);
if (subw == 0 && subh == 0) {
for (i = 0; i < h; ++i) {
for (j = 0; j < w; ++j) {
const int m = mask[i * mask_stride + j];
dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
src1[i * src1_stride + j]);
}
}
} else if (subw == 1 && subh == 1) {
for (i = 0; i < h; ++i) {
for (j = 0; j < w; ++j) {
const int m = ROUND_POWER_OF_TWO(
mask[(2 * i) * mask_stride + (2 * j)] +
mask[(2 * i + 1) * mask_stride + (2 * j)] +
mask[(2 * i) * mask_stride + (2 * j + 1)] +
mask[(2 * i + 1) * mask_stride + (2 * j + 1)],
2);
dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
src1[i * src1_stride + j]);
}
}
} else if (subw == 1 && subh == 0) {
for (i = 0; i < h; ++i) {
for (j = 0; j < w; ++j) {
const int m = AOM_BLEND_AVG(mask[i * mask_stride + (2 * j)],
mask[i * mask_stride + (2 * j + 1)]);
dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
src1[i * src1_stride + j]);
}
}
} else {
for (i = 0; i < h; ++i) {
for (j = 0; j < w; ++j) {
const int m = AOM_BLEND_AVG(mask[(2 * i) * mask_stride + j],
mask[(2 * i + 1) * mask_stride + j]);
dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
src1[i * src1_stride + j]);
}
}
}
}
#endif // CONFIG_AOM_HIGHBITDEPTH


@@ -1,73 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <assert.h>
#include "aom/aom_integer.h"
#include "aom_ports/mem.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/blend.h"
#include "./aom_dsp_rtcd.h"
void aom_blend_a64_vmask_c(uint8_t *dst, uint32_t dst_stride,
const uint8_t *src0, uint32_t src0_stride,
const uint8_t *src1, uint32_t src1_stride,
const uint8_t *mask, int h, int w) {
int i, j;
assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
assert(h >= 1);
assert(w >= 1);
assert(IS_POWER_OF_TWO(h));
assert(IS_POWER_OF_TWO(w));
for (i = 0; i < h; ++i) {
const int m = mask[i];
for (j = 0; j < w; ++j) {
dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
src1[i * src1_stride + j]);
}
}
}
#if CONFIG_AOM_HIGHBITDEPTH
void aom_highbd_blend_a64_vmask_c(uint8_t *dst_8, uint32_t dst_stride,
const uint8_t *src0_8, uint32_t src0_stride,
const uint8_t *src1_8, uint32_t src1_stride,
const uint8_t *mask, int h, int w, int bd) {
int i, j;
uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8);
const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8);
(void)bd;
assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
assert(h >= 1);
assert(w >= 1);
assert(IS_POWER_OF_TWO(h));
assert(IS_POWER_OF_TWO(w));
assert(bd == 8 || bd == 10 || bd == 12);
for (i = 0; i < h; ++i) {
const int m = mask[i];
for (j = 0; j < w; ++j) {
dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
src1[i * src1_stride + j]);
}
}
}
#endif // CONFIG_AOM_HIGHBITDEPTH


@@ -1,42 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <string.h>
#include "aom_dsp/buf_ans.h"
#include "aom_mem/aom_mem.h"
#include "aom/internal/aom_codec_internal.h"
void aom_buf_ans_alloc(struct BufAnsCoder *c,
struct aom_internal_error_info *error, int size_hint) {
c->error = error;
c->size = size_hint;
AOM_CHECK_MEM_ERROR(error, c->buf, aom_malloc(c->size * sizeof(*c->buf)));
// Initialize to overfull to trigger the assert in write.
c->offset = c->size + 1;
}
void aom_buf_ans_free(struct BufAnsCoder *c) {
aom_free(c->buf);
c->buf = NULL;
c->size = 0;
}
void aom_buf_ans_grow(struct BufAnsCoder *c) {
struct buffered_ans_symbol *new_buf = NULL;
int new_size = c->size * 2;
AOM_CHECK_MEM_ERROR(c->error, new_buf,
aom_malloc(new_size * sizeof(*new_buf)));
memcpy(new_buf, c->buf, c->size * sizeof(*c->buf));
aom_free(c->buf);
c->buf = new_buf;
c->size = new_size;
}


@@ -1,112 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifndef AOM_DSP_BUF_ANS_H_
#define AOM_DSP_BUF_ANS_H_
// Buffered forward ANS writer.
// Symbols are written to the writer in forward (decode) order and serialized
// backwards due to ANS's stack-like behavior.
#include <assert.h>
#include "./aom_config.h"
#include "aom/aom_integer.h"
#include "aom_dsp/ans.h"
#include "aom_dsp/answriter.h"
#ifdef __cplusplus
extern "C" {
#endif // __cplusplus
#define ANS_METHOD_UABS 0
#define ANS_METHOD_RANS 1
struct buffered_ans_symbol {
unsigned int method : 1; // one of ANS_METHOD_UABS or ANS_METHOD_RANS
// TODO(aconverse): Should be possible to write this in terms of start for ABS
unsigned int val_start : RANS_PROB_BITS; // Boolean value for ABS
// start in symbol cycle for Rans
unsigned int prob : RANS_PROB_BITS; // Probability of this symbol
};
struct BufAnsCoder {
struct aom_internal_error_info *error;
struct buffered_ans_symbol *buf;
int size;
int offset;
};
void aom_buf_ans_alloc(struct BufAnsCoder *c,
struct aom_internal_error_info *error, int size_hint);
void aom_buf_ans_free(struct BufAnsCoder *c);
void aom_buf_ans_grow(struct BufAnsCoder *c);
static INLINE void buf_ans_write_reset(struct BufAnsCoder *const c) {
c->offset = 0;
}
static INLINE void buf_uabs_write(struct BufAnsCoder *const c, uint8_t val,
AnsP8 prob) {
assert(c->offset <= c->size);
if (c->offset == c->size) {
aom_buf_ans_grow(c);
}
c->buf[c->offset].method = ANS_METHOD_UABS;
c->buf[c->offset].val_start = val;
c->buf[c->offset].prob = prob;
++c->offset;
}
static INLINE void buf_rans_write(struct BufAnsCoder *const c,
const struct rans_sym *const sym) {
assert(c->offset <= c->size);
if (c->offset == c->size) {
aom_buf_ans_grow(c);
}
c->buf[c->offset].method = ANS_METHOD_RANS;
c->buf[c->offset].val_start = sym->cum_prob;
c->buf[c->offset].prob = sym->prob;
++c->offset;
}
static INLINE void buf_ans_flush(const struct BufAnsCoder *const c,
struct AnsCoder *ans) {
int offset;
for (offset = c->offset - 1; offset >= 0; --offset) {
if (c->buf[offset].method == ANS_METHOD_RANS) {
struct rans_sym sym;
sym.prob = c->buf[offset].prob;
sym.cum_prob = c->buf[offset].val_start;
rans_write(ans, &sym);
} else {
uabs_write(ans, (uint8_t)c->buf[offset].val_start,
(AnsP8)c->buf[offset].prob);
}
}
}
static INLINE void buf_uabs_write_bit(struct BufAnsCoder *c, int bit) {
buf_uabs_write(c, bit, 128);
}
static INLINE void buf_uabs_write_literal(struct BufAnsCoder *c, int literal,
int bits) {
int bit;
assert(bits < 31);
for (bit = bits - 1; bit >= 0; bit--)
buf_uabs_write_bit(c, 1 & (literal >> bit));
}
#ifdef __cplusplus
} // extern "C"
#endif // __cplusplus
#endif // AOM_DSP_BUF_ANS_H_
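Because ANS serializes as a stack, the buffering above is what restores first-in-first-out order for the decoder. A schematic of the intended call sequence, assuming an already initialized struct aom_internal_error_info error_info and struct AnsCoder ans (error handling omitted):

struct BufAnsCoder buf;
aom_buf_ans_alloc(&buf, &error_info, 1024);
buf_ans_write_reset(&buf);
buf_uabs_write(&buf, 1, 200); /* first symbol in decode order */
buf_uabs_write(&buf, 0, 30);  /* second symbol in decode order */
buf_ans_flush(&buf, &ans);    /* replayed in reverse: (0, 30), then (1, 200) */
aom_buf_ans_free(&buf);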


@@ -1,37 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include "aom_dsp/daalaboolreader.h"
int aom_daala_reader_init(daala_reader *r, const uint8_t *buffer, int size) {
if (size && !buffer) {
return 1;
}
r->buffer_end = buffer + size;
r->buffer = buffer;
od_ec_dec_init(&r->ec, buffer, size - 1);
#if CONFIG_ACCOUNTING
r->accounting = NULL;
#endif
return 0;
}
const uint8_t *aom_daala_reader_find_end(daala_reader *r) {
return r->buffer_end;
}
uint32_t aom_daala_reader_tell(const daala_reader *r) {
return od_ec_dec_tell(&r->ec);
}
uint32_t aom_daala_reader_tell_frac(const daala_reader *r) {
return od_ec_dec_tell_frac(&r->ec);
}


@@ -1,87 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifndef AOM_DSP_DAALABOOLREADER_H_
#define AOM_DSP_DAALABOOLREADER_H_
#include "aom/aom_integer.h"
#include "aom_dsp/entdec.h"
#include "aom_dsp/prob.h"
#if CONFIG_ACCOUNTING
#include "av1/common/accounting.h"
#endif
#ifdef __cplusplus
extern "C" {
#endif
struct daala_reader {
const uint8_t *buffer;
const uint8_t *buffer_end;
od_ec_dec ec;
#if CONFIG_ACCOUNTING
Accounting *accounting;
#endif
};
typedef struct daala_reader daala_reader;
int aom_daala_reader_init(daala_reader *r, const uint8_t *buffer, int size);
const uint8_t *aom_daala_reader_find_end(daala_reader *r);
uint32_t aom_daala_reader_tell(const daala_reader *r);
uint32_t aom_daala_reader_tell_frac(const daala_reader *r);
static INLINE int aom_daala_read(daala_reader *r, int prob) {
if (prob == 128) {
return od_ec_dec_bits(&r->ec, 1, "aom_bits");
} else {
int p = ((prob << 15) + (256 - prob)) >> 8;
return od_ec_decode_bool_q15(&r->ec, p);
}
}
static INLINE int aom_daala_read_bit(daala_reader *r) {
return aom_daala_read(r, 128);
}
static INLINE int aom_daala_reader_has_error(daala_reader *r) {
return r->ec.error;
}
static INLINE int daala_read_tree_bits(daala_reader *r,
const aom_tree_index *tree,
const aom_prob *probs) {
aom_tree_index i = 0;
do {
aom_cdf_prob cdf[16];
aom_tree_index index[16];
int path[16];
int dist[16];
int nsymbs;
int symb;
nsymbs = tree_to_cdf(tree, probs, i, cdf, index, path, dist);
symb = od_ec_decode_cdf_q15(&r->ec, cdf, nsymbs);
OD_ASSERT(symb >= 0 && symb < nsymbs);
i = index[symb];
} while (i > 0);
return -i;
}
static INLINE int daala_read_symbol(daala_reader *r, const aom_cdf_prob *cdf,
int nsymbs) {
return od_ec_decode_cdf_q15(&r->ec, cdf, nsymbs);
}
#ifdef __cplusplus
} // extern "C"
#endif
#endif
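The probability rescaling in aom_daala_read() deserves a worked example: p = ((prob << 15) + (256 - prob)) >> 8 maps the 8-bit probability of zero onto Q15, so prob = 128 gives ((128 << 15) + 128) >> 8 = 16384, exactly one half, while the extremes 1 and 255 give 128 and 32640, keeping p strictly inside (0, 32768) as od_ec_decode_bool_q15() requires.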


@@ -1,32 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <string.h>
#include "aom_dsp/daalaboolwriter.h"
void aom_daala_start_encode(daala_writer *br, uint8_t *source) {
br->buffer = source;
br->pos = 0;
od_ec_enc_init(&br->ec, 62025);
}
void aom_daala_stop_encode(daala_writer *br) {
uint32_t daala_bytes;
unsigned char *daala_data;
daala_data = od_ec_enc_done(&br->ec, &daala_bytes);
memcpy(br->buffer, daala_data, daala_bytes);
br->pos = daala_bytes;
/* Prevent ec bitstream from being detected as a superframe marker.
Must always be added, so that rawbits knows the exact length of the
bitstream. */
br->buffer[br->pos++] = 0;
od_ec_enc_clear(&br->ec);
}


@@ -1,90 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifndef AOM_DSP_DAALABOOLWRITER_H_
#define AOM_DSP_DAALABOOLWRITER_H_
#include "aom_dsp/entenc.h"
#include "aom_dsp/prob.h"
#ifdef __cplusplus
extern "C" {
#endif
struct daala_writer {
unsigned int pos;
uint8_t *buffer;
od_ec_enc ec;
};
typedef struct daala_writer daala_writer;
void aom_daala_start_encode(daala_writer *w, uint8_t *buffer);
void aom_daala_stop_encode(daala_writer *w);
static INLINE void aom_daala_write(daala_writer *w, int bit, int prob) {
if (prob == 128) {
od_ec_enc_bits(&w->ec, bit, 1);
} else {
int p = ((prob << 15) + (256 - prob)) >> 8;
od_ec_encode_bool_q15(&w->ec, bit, p);
}
}
static INLINE void daala_write_tree_bits(daala_writer *w,
const aom_tree_index *tree,
const aom_prob *probs, int bits,
int len, aom_tree_index i) {
aom_tree_index root;
root = i;
do {
aom_cdf_prob cdf[16];
aom_tree_index index[16];
int path[16];
int dist[16];
int nsymbs;
int symb;
int j;
/* Compute the CDF of the binary tree using the given probabilities. */
nsymbs = tree_to_cdf(tree, probs, root, cdf, index, path, dist);
/* Find the symbol to code. */
symb = -1;
for (j = 0; j < nsymbs; j++) {
/* If this symbol codes a leaf node, */
if (index[j] <= 0) {
if (len == dist[j] && path[j] == bits) {
symb = j;
break;
}
} else {
if (len > dist[j] && path[j] == bits >> (len - dist[j])) {
symb = j;
break;
}
}
}
OD_ASSERT(symb != -1);
od_ec_encode_cdf_q15(&w->ec, symb, cdf, nsymbs);
bits &= (1 << (len - dist[symb])) - 1;
len -= dist[symb];
} while (len);
}
static INLINE void daala_write_symbol(daala_writer *w, int symb,
const aom_cdf_prob *cdf, int nsymbs) {
od_ec_encode_cdf_q15(&w->ec, symb, cdf, nsymbs);
}
#ifdef __cplusplus
} // extern "C"
#endif
#endif


@@ -1,180 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifndef AOM_DSP_DKBOOLREADER_H_
#define AOM_DSP_DKBOOLREADER_H_
#include <assert.h>
#include <stddef.h>
#include <limits.h>
#include "./aom_config.h"
#if CONFIG_BITSTREAM_DEBUG
#include <assert.h>
#include <stdio.h>
#include "aom_util/debug_util.h"
#endif // CONFIG_BITSTREAM_DEBUG
#include "aom_ports/mem.h"
#include "aom/aomdx.h"
#include "aom/aom_integer.h"
#include "aom_dsp/prob.h"
#if CONFIG_ACCOUNTING
#include "av1/common/accounting.h"
#endif
#ifdef __cplusplus
extern "C" {
#endif
typedef size_t BD_VALUE;
#define BD_VALUE_SIZE ((int)sizeof(BD_VALUE) * CHAR_BIT)
// This is meant to be a large, positive constant that can still be efficiently
// loaded as an immediate (on platforms like ARM, for example).
// Even relatively modest values like 100 would work fine.
#define LOTS_OF_BITS 0x40000000
struct aom_dk_reader {
// Be careful when reordering this struct; it may impact the cache negatively.
BD_VALUE value;
unsigned int range;
int count;
const uint8_t *buffer_start;
const uint8_t *buffer_end;
const uint8_t *buffer;
aom_decrypt_cb decrypt_cb;
void *decrypt_state;
uint8_t clear_buffer[sizeof(BD_VALUE) + 1];
#if CONFIG_ACCOUNTING
Accounting *accounting;
#endif
};
int aom_dk_reader_init(struct aom_dk_reader *r, const uint8_t *buffer,
size_t size, aom_decrypt_cb decrypt_cb,
void *decrypt_state);
void aom_dk_reader_fill(struct aom_dk_reader *r);
const uint8_t *aom_dk_reader_find_end(struct aom_dk_reader *r);
static INLINE uint32_t aom_dk_reader_tell(const struct aom_dk_reader *r) {
const uint32_t bits_read = (r->buffer - r->buffer_start) * CHAR_BIT;
const int count =
(r->count < LOTS_OF_BITS) ? r->count : r->count - LOTS_OF_BITS;
assert(r->buffer >= r->buffer_start);
return bits_read - (count + CHAR_BIT);
}
/*The resolution of fractional-precision bit usage measurements, i.e.,
3 => 1/8th bits.*/
#define DK_BITRES (3)
static INLINE uint32_t aom_dk_reader_tell_frac(const struct aom_dk_reader *r) {
uint32_t num_bits;
uint32_t range;
int l;
int i;
num_bits = aom_dk_reader_tell(r) << DK_BITRES;
range = r->range;
l = 0;
for (i = DK_BITRES; i-- > 0;) {
int b;
range = range * range >> 7;
b = (int)(range >> 8);
l = l << 1 | b;
range >>= b;
}
return num_bits - l;
}
static INLINE int aom_dk_reader_has_error(struct aom_dk_reader *r) {
// Check if we have reached the end of the buffer.
//
// Variable 'count' stores the number of bits in the 'value' buffer, minus
// 8. The top byte is part of the algorithm, and the remainder is buffered
// to be shifted into it. So if count == 8, the top 16 bits of 'value' are
// occupied, 8 for the algorithm and 8 in the buffer.
//
// When reading a byte from the user's buffer, count is filled with 8 and
// one byte is filled into the value buffer. When we reach the end of the
// data, count is additionally filled with LOTS_OF_BITS. So when
// count == LOTS_OF_BITS - 1, the user's data has been exhausted.
//
// Returns 1 if we have tried to decode bits after the end of the stream was
// encountered, and 0 if there is no error.
return r->count > BD_VALUE_SIZE && r->count < LOTS_OF_BITS;
}
static INLINE int aom_dk_read(struct aom_dk_reader *r, int prob) {
unsigned int bit = 0;
BD_VALUE value;
BD_VALUE bigsplit;
int count;
unsigned int range;
unsigned int split = (r->range * prob + (256 - prob)) >> CHAR_BIT;
if (r->count < 0) aom_dk_reader_fill(r);
value = r->value;
count = r->count;
bigsplit = (BD_VALUE)split << (BD_VALUE_SIZE - CHAR_BIT);
range = split;
if (value >= bigsplit) {
range = r->range - split;
value = value - bigsplit;
bit = 1;
}
{
register int shift = aom_norm[range];
range <<= shift;
value <<= shift;
count -= shift;
}
r->value = value;
r->count = count;
r->range = range;
#if CONFIG_BITSTREAM_DEBUG
{
int ref_bit, ref_prob;
const int queue_r = bitstream_queue_get_read();
const int frame_idx = bitstream_queue_get_frame_read();
bitstream_queue_pop(&ref_bit, &ref_prob);
if (prob != ref_prob) {
fprintf(
stderr,
"\n *** prob error, frame_idx_r %d prob %d ref_prob %d queue_r %d\n",
frame_idx, prob, ref_prob, queue_r);
assert(0);
}
if ((int)bit != ref_bit) {
fprintf(stderr, "\n *** bit error, frame_idx_r %d bit %d ref_bit %d\n",
frame_idx, bit, ref_bit);
assert(0);
}
}
#endif // CONFIG_BITSTREAM_DEBUG
return bit;
}
#ifdef __cplusplus
} // extern "C"
#endif
#endif // AOM_DSP_DKBOOLREADER_H_
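The split computation in aom_dk_read() scales the 8-bit probability by the current range with a single multiply: for range = 255 and prob = 128, split = (255 * 128 + 128) >> 8 = 128, so the zero branch receives half the range, and the aom_norm lookup then supplies the shift that renormalizes the range in one step instead of a per-bit loop.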


@@ -1,44 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <assert.h>
#include "./dkboolwriter.h"
static INLINE void aom_dk_write_bit(aom_dk_writer *w, int bit) {
aom_dk_write(w, bit, 128); // aom_prob_half
}
void aom_dk_start_encode(aom_dk_writer *br, uint8_t *source) {
br->lowvalue = 0;
br->range = 255;
br->count = -24;
br->buffer = source;
br->pos = 0;
aom_dk_write_bit(br, 0);
}
void aom_dk_stop_encode(aom_dk_writer *br) {
int i;
#if CONFIG_BITSTREAM_DEBUG
bitstream_queue_set_skip_write(1);
#endif // CONFIG_BITSTREAM_DEBUG
for (i = 0; i < 32; i++) aom_dk_write_bit(br, 0);
#if CONFIG_BITSTREAM_DEBUG
bitstream_queue_set_skip_write(0);
#endif // CONFIG_BITSTREAM_DEBUG
// Ensure there's no ambiguous collision with any index marker bytes
if ((br->buffer[br->pos - 1] & 0xe0) == 0xc0) br->buffer[br->pos++] = 0;
}


@@ -1,104 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifndef AOM_DSP_DKBOOLWRITER_H_
#define AOM_DSP_DKBOOLWRITER_H_
#include "./aom_config.h"
#if CONFIG_BITSTREAM_DEBUG
#include <stdio.h>
#include "aom_util/debug_util.h"
#endif // CONFIG_BITSTREAM_DEBUG
#include "aom_dsp/prob.h"
#include "aom_ports/mem.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct aom_dk_writer {
unsigned int lowvalue;
unsigned int range;
int count;
unsigned int pos;
uint8_t *buffer;
} aom_dk_writer;
void aom_dk_start_encode(aom_dk_writer *bc, uint8_t *buffer);
void aom_dk_stop_encode(aom_dk_writer *bc);
static INLINE void aom_dk_write(aom_dk_writer *br, int bit, int probability) {
unsigned int split;
int count = br->count;
unsigned int range = br->range;
unsigned int lowvalue = br->lowvalue;
register int shift;
#if CONFIG_BITSTREAM_DEBUG
// int queue_r = 0;
// int frame_idx_r = 0;
// int queue_w = bitstream_queue_get_write();
// int frame_idx_w = bitstream_queue_get_frame_write();
// if (frame_idx_w == frame_idx_r && queue_w == queue_r) {
// fprintf(stderr, "\n *** bitstream queue at frame_idx_w %d queue_w %d\n",
// frame_idx_w, queue_w);
// }
bitstream_queue_push(bit, probability);
#endif // CONFIG_BITSTREAM_DEBUG
split = 1 + (((range - 1) * probability) >> 8);
range = split;
if (bit) {
lowvalue += split;
range = br->range - split;
}
shift = aom_norm[range];
range <<= shift;
count += shift;
if (count >= 0) {
int offset = shift - count;
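// Carry propagation: if adding split overflowed the top bit of lowvalue,
// walk back through any 0xff output bytes (turning them into 0x00) and
// increment the first earlier byte, which absorbs the carry.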
if ((lowvalue << (offset - 1)) & 0x80000000) {
int x = br->pos - 1;
while (x >= 0 && br->buffer[x] == 0xff) {
br->buffer[x] = 0;
x--;
}
br->buffer[x] += 1;
}
br->buffer[br->pos++] = (lowvalue >> (24 - offset));
lowvalue <<= offset;
shift = count;
lowvalue &= 0xffffff;
count -= 8;
}
lowvalue <<= shift;
br->count = count;
br->lowvalue = lowvalue;
br->range = range;
}
#ifdef __cplusplus
} // extern "C"
#endif
#endif // AOM_DSP_DKBOOLWRITER_H_


@@ -1,80 +0,0 @@
/*
* Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifdef HAVE_CONFIG_H
#include "./config.h"
#endif
#include "aom_dsp/entcode.h"
/*CDFs for uniform probability distributions of small sizes (2 through 16,
inclusive).*/
// clang-format off
const uint16_t OD_UNIFORM_CDFS_Q15[135] = {
16384, 32768,
10923, 21845, 32768,
8192, 16384, 24576, 32768,
6554, 13107, 19661, 26214, 32768,
5461, 10923, 16384, 21845, 27307, 32768,
4681, 9362, 14043, 18725, 23406, 28087, 32768,
4096, 8192, 12288, 16384, 20480, 24576, 28672, 32768,
3641, 7282, 10923, 14564, 18204, 21845, 25486, 29127, 32768,
3277, 6554, 9830, 13107, 16384, 19661, 22938, 26214, 29491, 32768,
2979, 5958, 8937, 11916, 14895, 17873, 20852, 23831, 26810, 29789, 32768,
2731, 5461, 8192, 10923, 13653, 16384, 19115, 21845, 24576, 27307, 30037,
32768,
2521, 5041, 7562, 10082, 12603, 15124, 17644, 20165, 22686, 25206, 27727,
30247, 32768,
2341, 4681, 7022, 9362, 11703, 14043, 16384, 18725, 21065, 23406, 25746,
28087, 30427, 32768,
2185, 4369, 6554, 8738, 10923, 13107, 15292, 17476, 19661, 21845, 24030,
26214, 28399, 30583, 32768,
2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432, 20480, 22528,
24576, 26624, 28672, 30720, 32768
};
// clang-format on
/*Given the current total integer number of bits used and the current value of
rng, computes the fraction number of bits used to OD_BITRES precision.
This is used by od_ec_enc_tell_frac() and od_ec_dec_tell_frac().
nbits_total: The number of whole bits currently used, i.e., the value
returned by od_ec_enc_tell() or od_ec_dec_tell().
rng: The current value of rng from either the encoder or decoder state.
Return: The number of bits scaled by 2**OD_BITRES.
This will always be slightly larger than the exact value (i.e., all
rounding error is in the positive direction).*/
uint32_t od_ec_tell_frac(uint32_t nbits_total, uint32_t rng) {
uint32_t nbits;
int l;
int i;
/*To handle the non-integral number of bits still left in the encoder/decoder
state, we compute the worst-case number of bits of val that must be
encoded to ensure that the value is inside the range for any possible
subsequent bits.
The computation here is independent of val itself (the decoder does not
even track that value), even though the real number of bits used after
od_ec_enc_done() may be 1 smaller if rng is a power of two and the
corresponding trailing bits of val are all zeros.
If we did try to track that special case, then coding a value with a
probability of 1/(1 << n) might sometimes appear to use more than n bits.
This may help explain the surprising result that a newly initialized
encoder or decoder claims to have used 1 bit.*/
nbits = nbits_total << OD_BITRES;
l = 0;
for (i = OD_BITRES; i-- > 0;) {
int b;
rng = rng * rng >> 15;
b = (int)(rng >> 16);
l = l << 1 | b;
rng >>= b;
}
return nbits - l;
}
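A worked index check for the uniform CDFs above: OD_UNIFORM_CDF_Q15(n), defined in the header that follows, computes an offset of n * (n - 1) / 2 - 1 into this flat array, so OD_UNIFORM_CDF_Q15(4) is base + 5 and points at the { 8192, 16384, 24576, 32768 } row; the n = 2 and n = 3 rows occupy indices 0-1 and 2-4 respectively.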


@@ -1,105 +0,0 @@
/*
* Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#if !defined(_entcode_H)
#define _entcode_H (1)
#include <limits.h>
#include <stddef.h>
#include "av1/common/odintrin.h"
/*Set this flag to 1 to enable a "reduced overhead" version of the entropy coder.
This uses a partition function that more accurately follows the input
probability estimates at the expense of some additional CPU cost (though
still an order of magnitude less than a full division).
In classic arithmetic coding, the partition function maps a value x in the
range [0, ft] to a value y in [0, r] with 0 < ft <= r via
y = x*r/ft.
Any deviation from this value increases coding inefficiency.
To avoid divisions, we require ft <= r < 2*ft (enforcing it by shifting up
ft if necessary), and replace that function with
y = x + OD_MINI(x, r - ft).
This counts values of x smaller than r - ft double compared to values larger
than r - ft, which over-estimates the probability of symbols at the start of
the alphabet, and under-estimates the probability of symbols at the end of
the alphabet.
The overall coding inefficiency assuming accurate probability models and
independent symbols is in the 1% range, which is similar to that of CABAC.
To reduce overhead even further, we split this into two cases:
1) r - ft > ft - (r - ft).
That is, we have more values of x that are double-counted than
single-counted.
In this case, we still double-count the first 2*r - 3*ft values of x, but
after that we alternate between single-counting and double-counting for
the rest.
2) r - ft < ft - (r - ft).
That is, we have more values of x that are single-counted than
double-counted.
In this case, we alternate between single-counting and double-counting for
the first 2*(r - ft) values of x, and single-count the rest.
For two equiprobable symbols in different places in the alphabet, this
reduces the maximum ratio of over-estimation to under-estimation from 2:1
for the previous partition function to either 4:3 or 3:2 (for each of the
two cases above, respectively), assuming symbol probabilities significantly
greater than 1/32768.
That reduces the worst-case per-symbol overhead from 1 bit to 0.58 bits.
The resulting function is
e = OD_MAXI(2*r - 3*ft, 0);
y = x + OD_MINI(x, e) + OD_MINI(OD_MAXI(x - e, 0) >> 1, r - ft).
Here, e is a value that is greater than 0 in case 1, and 0 in case 2.
This function is about 3 times as expensive to evaluate as the high-overhead
version, but still an order of magnitude cheaper than a division, since it
is composed only of very simple operations.
Because we want to fit in 16-bit registers and must use unsigned values to do
so, we use saturating subtraction to enforce the maximums with 0.
Enabling this reduces the measured overhead in ectest from 0.805% to 0.621%
(vs. 0.022% for the division-based partition function with r much greater
than ft).
It improves performance on ntt-short-1 by about 0.3%.*/
#define OD_EC_REDUCED_OVERHEAD (1)
/*OPT: od_ec_window must be at least 32 bits, but if you have fast arithmetic
on a larger type, you can speed up the decoder by using it here.*/
typedef uint32_t od_ec_window;
#define OD_EC_WINDOW_SIZE ((int)sizeof(od_ec_window) * CHAR_BIT)
/*Unsigned subtraction with unsigned saturation.
This implementation of the macro is intentionally chosen to increase the
number of common subexpressions in the reduced-overhead partition function.
This matters for C code, but it would not for hardware with a saturating
subtraction instruction.*/
#define OD_SUBSATU(a, b) ((a)-OD_MINI(a, b))
/*The number of bits to use for the range-coded part of unsigned integers.*/
#define OD_EC_UINT_BITS (4)
/*The resolution of fractional-precision bit usage measurements, i.e.,
3 => 1/8th bits.*/
#define OD_BITRES (3)
extern const uint16_t OD_UNIFORM_CDFS_Q15[135];
/*Returns a Q15 CDF for a uniform probability distribution of the given size.
n: The size of the distribution.
This must be at least 2, and no more than 16.*/
#define OD_UNIFORM_CDF_Q15(n) (OD_UNIFORM_CDFS_Q15 + ((n) * ((n)-1) >> 1) - 1)
/*See entcode.c for further documentation.*/
OD_WARN_UNUSED_RESULT uint32_t od_ec_tell_frac(uint32_t nbits_total,
uint32_t rng);
#endif
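The reduced-overhead discussion above is easier to follow with numbers. Below is a minimal sketch of the plain (high-overhead) partition function under its stated precondition ft <= r < 2*ft; partition is a hypothetical name used only for illustration:

/* y = x + OD_MINI(x, r - ft): maps [0, ft] onto [0, r] with no division. */
static unsigned partition(unsigned x, unsigned r, unsigned ft) {
  const unsigned d = r - ft; /* 0 <= d < ft by the precondition */
  return x + (x < d ? x : d);
}
/* partition(0, 48000, 32768) == 0 and partition(32768, 48000, 32768) == 48000;
   inputs below d = 15232 advance y by 2 per step (the double-counting the
   comment above describes) and the rest by 1. */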


@@ -1,494 +0,0 @@
/*
* Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifdef HAVE_CONFIG_H
#include "./config.h"
#endif
#include "aom_dsp/entdec.h"
/*A range decoder.
This is an entropy decoder based upon \cite{Mar79}, which is itself a
rediscovery of the FIFO arithmetic code introduced by \cite{Pas76}.
It is very similar to arithmetic encoding, except that encoding is done with
digits in any base, instead of with bits, and so it is faster when using
larger bases (i.e.: a byte).
The author claims an average waste of $\frac{1}{2}\log_b(2b)$ bits, where $b$
is the base, longer than the theoretical optimum, but to my knowledge there
is no published justification for this claim.
This only seems true when using near-infinite precision arithmetic so that
the process is carried out with no rounding errors.
An excellent description of implementation details is available at
http://www.arturocampos.com/ac_range.html
A recent work \cite{MNW98} which proposes several changes to arithmetic
encoding for efficiency actually re-discovers many of the principles
behind range encoding, and presents a good theoretical analysis of them.
End of stream is handled by writing out the smallest number of bits that
ensures that the stream will be correctly decoded regardless of the value of
any subsequent bits.
od_ec_dec_tell() can be used to determine how many bits were needed to decode
all the symbols thus far; other data can be packed in the remaining bits of
the input buffer.
@PHDTHESIS{Pas76,
author="Richard Clark Pasco",
title="Source coding algorithms for fast data compression",
school="Dept. of Electrical Engineering, Stanford University",
address="Stanford, CA",
month=May,
year=1976,
URL="http://www.richpasco.org/scaffdc.pdf"
}
@INPROCEEDINGS{Mar79,
author="Martin, G.N.N.",
title="Range encoding: an algorithm for removing redundancy from a digitised
message",
booktitle="Video & Data Recording Conference",
year=1979,
address="Southampton",
month=Jul,
URL="http://www.compressconsult.com/rangecoder/rngcod.pdf.gz"
}
@ARTICLE{MNW98,
author="Alistair Moffat and Radford Neal and Ian H. Witten",
title="Arithmetic Coding Revisited",
journal="{ACM} Transactions on Information Systems",
year=1998,
volume=16,
number=3,
pages="256--294",
month=Jul,
URL="http://researchcommons.waikato.ac.nz/bitstream/handle/10289/78/content.pdf"
}*/
/*This is meant to be a large, positive constant that can still be efficiently
loaded as an immediate (on platforms like ARM, for example).
Even relatively modest values like 100 would work fine.*/
#define OD_EC_LOTS_OF_BITS (0x4000)
static void od_ec_dec_refill(od_ec_dec *dec) {
int s;
od_ec_window dif;
int16_t cnt;
const unsigned char *bptr;
const unsigned char *end;
dif = dec->dif;
cnt = dec->cnt;
bptr = dec->bptr;
end = dec->end;
s = OD_EC_WINDOW_SIZE - 9 - (cnt + 15);
for (; s >= 0 && bptr < end; s -= 8, bptr++) {
OD_ASSERT(s <= OD_EC_WINDOW_SIZE - 8);
dif |= (od_ec_window)bptr[0] << s;
cnt += 8;
}
if (bptr >= end) {
dec->tell_offs += OD_EC_LOTS_OF_BITS - cnt;
cnt = OD_EC_LOTS_OF_BITS;
}
dec->dif = dif;
dec->cnt = cnt;
dec->bptr = bptr;
}
/*Takes updated dif and range values, renormalizes them so that
32768 <= rng < 65536 (reading more bytes from the stream into dif if
necessary), and stores them back in the decoder context.
dif: The new value of dif.
rng: The new value of the range.
ret: The value to return.
Return: ret.
This allows the compiler to jump to this function via a tail-call.*/
static int od_ec_dec_normalize(od_ec_dec *dec, od_ec_window dif, unsigned rng,
int ret) {
int d;
OD_ASSERT(rng <= 65535U);
d = 16 - OD_ILOG_NZ(rng);
dec->cnt -= d;
dec->dif = dif << d;
dec->rng = rng << d;
if (dec->cnt < 0) od_ec_dec_refill(dec);
return ret;
}
/*Initializes the decoder.
buf: The input buffer to use.
storage: The size in bytes of the input buffer.*/
void od_ec_dec_init(od_ec_dec *dec, const unsigned char *buf,
uint32_t storage) {
dec->buf = buf;
dec->eptr = buf + storage;
dec->end_window = 0;
dec->nend_bits = 0;
dec->tell_offs = 10 - (OD_EC_WINDOW_SIZE - 8);
dec->end = buf + storage;
dec->bptr = buf;
dec->dif = 0;
dec->rng = 0x8000;
dec->cnt = -15;
dec->error = 0;
od_ec_dec_refill(dec);
}
/*Decode a bit that has an fz/ft probability of being a zero.
fz: The probability that the bit is zero, scaled by ft.
ft: The total probability.
This must be at least 16384 and no more than 32768.
Return: The value decoded (0 or 1).*/
int od_ec_decode_bool(od_ec_dec *dec, unsigned fz, unsigned ft) {
od_ec_window dif;
od_ec_window vw;
unsigned r;
int s;
unsigned v;
int ret;
OD_ASSERT(0 < fz);
OD_ASSERT(fz < ft);
OD_ASSERT(16384 <= ft);
OD_ASSERT(ft <= 32768U);
dif = dec->dif;
r = dec->rng;
OD_ASSERT(dif >> (OD_EC_WINDOW_SIZE - 16) < r);
OD_ASSERT(ft <= r);
s = r - ft >= ft;
ft <<= s;
fz <<= s;
OD_ASSERT(r - ft < ft);
#if OD_EC_REDUCED_OVERHEAD
{
unsigned d;
unsigned e;
d = r - ft;
e = OD_SUBSATU(2 * d, ft);
v = fz + OD_MINI(fz, e) + OD_MINI(OD_SUBSATU(fz, e) >> 1, d);
}
#else
v = fz + OD_MINI(fz, r - ft);
#endif
vw = (od_ec_window)v << (OD_EC_WINDOW_SIZE - 16);
ret = dif >= vw;
if (ret) dif -= vw;
r = ret ? r - v : v;
return od_ec_dec_normalize(dec, dif, r, ret);
}
/*Decode a bit that has an fz probability of being a zero in Q15.
This is a simpler, lower overhead version of od_ec_decode_bool() for use when
ft == 32768.
To be decoded properly by this function, symbols cannot have been encoded by
od_ec_encode(), but must have been encoded with one of the equivalent _q15()
or _dyadic() functions instead.
fz: The probability that the bit is zero, scaled by 32768.
Return: The value decoded (0 or 1).*/
int od_ec_decode_bool_q15(od_ec_dec *dec, unsigned fz) {
od_ec_window dif;
od_ec_window vw;
unsigned r;
unsigned r_new;
unsigned v;
int ret;
OD_ASSERT(0 < fz);
OD_ASSERT(fz < 32768U);
dif = dec->dif;
r = dec->rng;
OD_ASSERT(dif >> (OD_EC_WINDOW_SIZE - 16) < r);
OD_ASSERT(32768U <= r);
v = fz * (uint32_t)r >> 15;
vw = (od_ec_window)v << (OD_EC_WINDOW_SIZE - 16);
ret = 0;
r_new = v;
if (dif >= vw) {
r_new = r - v;
dif -= vw;
ret = 1;
}
return od_ec_dec_normalize(dec, dif, r_new, ret);
}
/*Decodes a symbol given a cumulative distribution function (CDF) table.
cdf: The CDF, such that symbol s falls in the range
[s > 0 ? cdf[s - 1] : 0, cdf[s]).
The values must be monotonically non-decreasing, and cdf[nsyms - 1]
must be at least 16384, and no more than 32768.
nsyms: The number of symbols in the alphabet.
This should be at most 16.
Return: The decoded symbol s.*/
int od_ec_decode_cdf(od_ec_dec *dec, const uint16_t *cdf, int nsyms) {
od_ec_window dif;
unsigned r;
unsigned c;
unsigned d;
#if OD_EC_REDUCED_OVERHEAD
unsigned e;
#endif
int s;
unsigned u;
unsigned v;
unsigned q;
unsigned fl;
unsigned fh;
unsigned ft;
int ret;
dif = dec->dif;
r = dec->rng;
OD_ASSERT(dif >> (OD_EC_WINDOW_SIZE - 16) < r);
OD_ASSERT(nsyms > 0);
ft = cdf[nsyms - 1];
OD_ASSERT(16384 <= ft);
OD_ASSERT(ft <= 32768U);
OD_ASSERT(ft <= r);
s = r - ft >= ft;
ft <<= s;
d = r - ft;
OD_ASSERT(d < ft);
c = (unsigned)(dif >> (OD_EC_WINDOW_SIZE - 16));
q = OD_MAXI((int)(c >> 1), (int)(c - d));
#if OD_EC_REDUCED_OVERHEAD
e = OD_SUBSATU(2 * d, ft);
/*The correctness of this inverse partition function is not obvious, but it
was checked exhaustively for all possible values of r, ft, and c.
TODO: It should be possible to optimize this better than the compiler,
given that we do not care about the accuracy of negative results (as we
will not use them).
It would also be nice to get rid of the 32-bit dividend, as it requires a
32x32->64 bit multiply to invert.*/
q = OD_MAXI((int)q, (int)((2 * (int32_t)c + 1 - (int32_t)e) / 3));
#endif
q >>= s;
OD_ASSERT(q < (ft >> s));
fl = 0;
ret = 0;
for (fh = cdf[ret]; fh <= q; fh = cdf[++ret]) fl = fh;
OD_ASSERT(fh <= ft >> s);
fl <<= s;
fh <<= s;
#if OD_EC_REDUCED_OVERHEAD
u = fl + OD_MINI(fl, e) + OD_MINI(OD_SUBSATU(fl, e) >> 1, d);
v = fh + OD_MINI(fh, e) + OD_MINI(OD_SUBSATU(fh, e) >> 1, d);
#else
u = fl + OD_MINI(fl, d);
v = fh + OD_MINI(fh, d);
#endif
r = v - u;
dif -= (od_ec_window)u << (OD_EC_WINDOW_SIZE - 16);
return od_ec_dec_normalize(dec, dif, r, ret);
}
/*Decodes a symbol given a cumulative distribution function (CDF) table.
cdf: The CDF, such that symbol s falls in the range
[s > 0 ? cdf[s - 1] : 0, cdf[s]).
The values must be monotonically non-decreasing, and cdf[nsyms - 1]
must be at least 2, and no more than 32768.
nsyms: The number of symbols in the alphabet.
This should be at most 16.
Return: The decoded symbol s.*/
int od_ec_decode_cdf_unscaled(od_ec_dec *dec, const uint16_t *cdf, int nsyms) {
od_ec_window dif;
unsigned r;
unsigned c;
unsigned d;
#if OD_EC_REDUCED_OVERHEAD
unsigned e;
#endif
int s;
unsigned u;
unsigned v;
unsigned q;
unsigned fl;
unsigned fh;
unsigned ft;
int ret;
dif = dec->dif;
r = dec->rng;
OD_ASSERT(dif >> (OD_EC_WINDOW_SIZE - 16) < r);
OD_ASSERT(nsyms > 0);
ft = cdf[nsyms - 1];
OD_ASSERT(2 <= ft);
OD_ASSERT(ft <= 32768U);
s = 15 - OD_ILOG_NZ(ft - 1);
ft <<= s;
OD_ASSERT(ft <= r);
if (r - ft >= ft) {
ft <<= 1;
s++;
}
d = r - ft;
OD_ASSERT(d < ft);
c = (unsigned)(dif >> (OD_EC_WINDOW_SIZE - 16));
q = OD_MAXI((int)(c >> 1), (int)(c - d));
#if OD_EC_REDUCED_OVERHEAD
e = OD_SUBSATU(2 * d, ft);
/*TODO: See TODO above.*/
q = OD_MAXI((int)q, (int)((2 * (int32_t)c + 1 - (int32_t)e) / 3));
#endif
q >>= s;
OD_ASSERT(q < (ft >> s));
fl = 0;
ret = 0;
for (fh = cdf[ret]; fh <= q; fh = cdf[++ret]) fl = fh;
OD_ASSERT(fh <= ft >> s);
fl <<= s;
fh <<= s;
#if OD_EC_REDUCED_OVERHEAD
u = fl + OD_MINI(fl, e) + OD_MINI(OD_SUBSATU(fl, e) >> 1, d);
v = fh + OD_MINI(fh, e) + OD_MINI(OD_SUBSATU(fh, e) >> 1, d);
#else
u = fl + OD_MINI(fl, d);
v = fh + OD_MINI(fh, d);
#endif
r = v - u;
dif -= (od_ec_window)u << (OD_EC_WINDOW_SIZE - 16);
return od_ec_dec_normalize(dec, dif, r, ret);
}
/*Decodes a symbol given a cumulative distribution function (CDF) table that
sums to a power of two.
This is a simpler, lower overhead version of od_ec_decode_cdf() for use when
cdf[nsyms - 1] is a power of two.
To be decoded properly by this function, symbols cannot have been encoded by
od_ec_encode(), but must have been encoded with one of the equivalent _q15()
functions instead.
cdf: The CDF, such that symbol s falls in the range
[s > 0 ? cdf[s - 1] : 0, cdf[s]).
The values must be monotonically non-decreasing, and cdf[nsyms - 1]
must be exactly 1 << ftb.
nsyms: The number of symbols in the alphabet.
This should be at most 16.
ftb: The number of bits of precision in the cumulative distribution.
This must be no more than 15.
Return: The decoded symbol s.*/
int od_ec_decode_cdf_unscaled_dyadic(od_ec_dec *dec, const uint16_t *cdf,
int nsyms, unsigned ftb) {
od_ec_window dif;
unsigned r;
unsigned c;
unsigned u;
unsigned v;
int ret;
(void)nsyms;
dif = dec->dif;
r = dec->rng;
OD_ASSERT(dif >> (OD_EC_WINDOW_SIZE - 16) < r);
OD_ASSERT(ftb <= 15);
OD_ASSERT(cdf[nsyms - 1] == 1U << ftb);
OD_ASSERT(32768U <= r);
c = (unsigned)(dif >> (OD_EC_WINDOW_SIZE - 16));
v = 0;
ret = -1;
do {
u = v;
v = cdf[++ret] * (uint32_t)r >> ftb;
} while (v <= c);
OD_ASSERT(v <= r);
r = v - u;
dif -= (od_ec_window)u << (OD_EC_WINDOW_SIZE - 16);
return od_ec_dec_normalize(dec, dif, r, ret);
}
/*Decodes a symbol given a cumulative distribution function (CDF) table in Q15.
This is a simpler, lower overhead version of od_ec_decode_cdf() for use when
cdf[nsyms - 1] == 32768.
To be decoded properly by this function, symbols cannot have been encoded by
od_ec_encode(), but must have been encoded with one of the equivalent _q15()
or _dyadic() functions instead.
cdf: The CDF, such that symbol s falls in the range
[s > 0 ? cdf[s - 1] : 0, cdf[s]).
The values must be monotonically non-decreasing, and cdf[nsyms - 1]
must be 32768.
nsyms: The number of symbols in the alphabet.
This should be at most 16.
Return: The decoded symbol s.*/
int od_ec_decode_cdf_q15(od_ec_dec *dec, const uint16_t *cdf, int nsyms) {
return od_ec_decode_cdf_unscaled_dyadic(dec, cdf, nsyms, 15);
}
/*Extracts a raw unsigned integer with a non-power-of-2 range from the stream.
The integer must have been encoded with od_ec_enc_uint().
ft: The number of integers that can be decoded (one more than the max).
This must be at least 2, and no more than 2**29.
Return: The decoded bits.*/
uint32_t od_ec_dec_uint(od_ec_dec *dec, uint32_t ft) {
OD_ASSERT(ft >= 2);
OD_ASSERT(ft <= (uint32_t)1 << (25 + OD_EC_UINT_BITS));
if (ft > 1U << OD_EC_UINT_BITS) {
uint32_t t;
int ft1;
int ftb;
ft--;
ftb = OD_ILOG_NZ(ft) - OD_EC_UINT_BITS;
ft1 = (int)(ft >> ftb) + 1;
t = od_ec_decode_cdf_q15(dec, OD_UNIFORM_CDF_Q15(ft1), ft1);
t = t << ftb | od_ec_dec_bits(dec, ftb, "");
if (t <= ft) return t;
dec->error = 1;
return ft;
}
return od_ec_decode_cdf_q15(dec, OD_UNIFORM_CDF_Q15(ft), (int)ft);
}
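/*A worked example of the split above, assuming OD_EC_UINT_BITS is 4 (an
   assumption; see entcode.h): for ft == 1000 we get ft == 999 after the
   decrement, OD_ILOG_NZ(999) == 10, so ftb == 6 and ft1 == 16.
  The decoder then reads one 16-ary symbol t from a uniform CDF followed by 6
   raw bits and reconstructs t << 6 | bits in [0, 1023]; a result above 999
   can only come from a corrupt stream, so it sets dec->error.*/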
/*Extracts a sequence of raw bits from the stream.
The bits must have been encoded with od_ec_enc_bits().
ftb: The number of bits to extract.
This must be between 0 and 25, inclusive.
Return: The decoded bits.*/
uint32_t od_ec_dec_bits_(od_ec_dec *dec, unsigned ftb) {
od_ec_window window;
int available;
uint32_t ret;
OD_ASSERT(ftb <= 25);
window = dec->end_window;
available = dec->nend_bits;
if ((unsigned)available < ftb) {
const unsigned char *buf;
const unsigned char *eptr;
buf = dec->buf;
eptr = dec->eptr;
OD_ASSERT(available <= OD_EC_WINDOW_SIZE - 8);
do {
if (eptr <= buf) {
dec->tell_offs += OD_EC_LOTS_OF_BITS - available;
available = OD_EC_LOTS_OF_BITS;
break;
}
window |= (od_ec_window)*--eptr << available;
available += 8;
} while (available <= OD_EC_WINDOW_SIZE - 8);
dec->eptr = eptr;
}
ret = (uint32_t)window & (((uint32_t)1 << ftb) - 1);
window >>= ftb;
available -= ftb;
dec->end_window = window;
dec->nend_bits = available;
return ret;
}
/*Returns the number of bits "used" by the decoded symbols so far.
This same number can be computed in either the encoder or the decoder, and is
suitable for making coding decisions.
Return: The number of bits.
This will always be slightly larger than the exact value (i.e., all
rounding error is in the positive direction).*/
int od_ec_dec_tell(const od_ec_dec *dec) {
return ((dec->end - dec->eptr) + (dec->bptr - dec->buf)) * 8 - dec->cnt -
dec->nend_bits + dec->tell_offs;
}
/*Returns the number of bits "used" by the decoded symbols so far.
This same number can be computed in either the encoder or the decoder, and is
suitable for making coding decisions.
Return: The number of bits scaled by 2**OD_BITRES.
This will always be slightly larger than the exact value (i.e., all
rounding error is in the positive direction).*/
uint32_t od_ec_dec_tell_frac(const od_ec_dec *dec) {
return od_ec_tell_frac(od_ec_dec_tell(dec), dec->rng);
}


@@ -1,101 +0,0 @@
/*
* Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#if !defined(_entdec_H)
#define _entdec_H (1)
#include <limits.h>
#include "aom_dsp/entcode.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct od_ec_dec od_ec_dec;
#if OD_ACCOUNTING
#define OD_ACC_STR , char *acc_str
#define od_ec_dec_bits(dec, ftb, str) od_ec_dec_bits_(dec, ftb, str)
#else
#define OD_ACC_STR
#define od_ec_dec_bits(dec, ftb, str) od_ec_dec_bits_(dec, ftb)
#endif
/*The entropy decoder context.*/
struct od_ec_dec {
/*The start of the current input buffer.*/
const unsigned char *buf;
/*The read pointer for the raw bits.*/
const unsigned char *eptr;
/*Bits that will be read from/written at the end.*/
od_ec_window end_window;
/*Number of valid bits in end_window.*/
int nend_bits;
/*An offset used to keep track of tell after reaching the end of the stream.
This is constant throughout most of the decoding process, but becomes
important once we hit the end of the buffer and stop incrementing pointers
(and instead pretend cnt/nend_bits have lots of bits).*/
int32_t tell_offs;
/*The end of the current input buffer.*/
const unsigned char *end;
/*The read pointer for the entropy-coded bits.*/
const unsigned char *bptr;
/*The difference between the coded value and the low end of the current
range.*/
od_ec_window dif;
/*The number of values in the current range.*/
uint16_t rng;
/*The number of bits of data in the current value.*/
int16_t cnt;
/*Nonzero if an error occurred.*/
int error;
};
/*See entdec.c for further documentation.*/
void od_ec_dec_init(od_ec_dec *dec, const unsigned char *buf, uint32_t storage)
OD_ARG_NONNULL(1) OD_ARG_NONNULL(2);
OD_WARN_UNUSED_RESULT int od_ec_decode_bool(od_ec_dec *dec, unsigned fz,
unsigned ft) OD_ARG_NONNULL(1);
OD_WARN_UNUSED_RESULT int od_ec_decode_bool_q15(od_ec_dec *dec, unsigned fz)
OD_ARG_NONNULL(1);
OD_WARN_UNUSED_RESULT int od_ec_decode_cdf(od_ec_dec *dec, const uint16_t *cdf,
int nsyms) OD_ARG_NONNULL(1)
OD_ARG_NONNULL(2);
OD_WARN_UNUSED_RESULT int od_ec_decode_cdf_q15(od_ec_dec *dec,
const uint16_t *cdf, int nsyms)
OD_ARG_NONNULL(1) OD_ARG_NONNULL(2);
OD_WARN_UNUSED_RESULT int od_ec_decode_cdf_unscaled(od_ec_dec *dec,
const uint16_t *cdf,
int nsyms) OD_ARG_NONNULL(1)
OD_ARG_NONNULL(2);
OD_WARN_UNUSED_RESULT int od_ec_decode_cdf_unscaled_dyadic(od_ec_dec *dec,
const uint16_t *cdf,
int nsyms,
unsigned ftb)
OD_ARG_NONNULL(1) OD_ARG_NONNULL(2);
OD_WARN_UNUSED_RESULT uint32_t od_ec_dec_uint(od_ec_dec *dec, uint32_t ft)
OD_ARG_NONNULL(1);
OD_WARN_UNUSED_RESULT uint32_t od_ec_dec_bits_(od_ec_dec *dec, unsigned ftb)
OD_ARG_NONNULL(1);
OD_WARN_UNUSED_RESULT int od_ec_dec_tell(const od_ec_dec *dec)
OD_ARG_NONNULL(1);
OD_WARN_UNUSED_RESULT uint32_t od_ec_dec_tell_frac(const od_ec_dec *dec)
OD_ARG_NONNULL(1);
#ifdef __cplusplus
} // extern "C"
#endif
#endif


@@ -1,686 +0,0 @@
/*
* Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifdef HAVE_CONFIG_H
#include "./config.h"
#endif
#include <stdlib.h>
#include <string.h>
#include "aom_dsp/entenc.h"
/*A range encoder.
See entdec.c and the references for implementation details \cite{Mar79,MNW98}.
@INPROCEEDINGS{Mar79,
author="Martin, G.N.N.",
title="Range encoding: an algorithm for removing redundancy from a digitised
message",
booktitle="Video \& Data Recording Conference",
year=1979,
address="Southampton",
month=Jul,
URL="http://www.compressconsult.com/rangecoder/rngcod.pdf.gz"
}
@ARTICLE{MNW98,
author="Alistair Moffat and Radford Neal and Ian H. Witten",
title="Arithmetic Coding Revisited",
journal="{ACM} Transactions on Information Systems",
year=1998,
volume=16,
number=3,
pages="256--294",
month=Jul,
URL="http://researchcommons.waikato.ac.nz/bitstream/handle/10289/78/content.pdf"
}*/
/*Takes updated low and range values, renormalizes them so that
32768 <= rng < 65536 (flushing bytes from low to the pre-carry buffer if
necessary), and stores them back in the encoder context.
low: The new value of low.
rng: The new value of the range.*/
static void od_ec_enc_normalize(od_ec_enc *enc, od_ec_window low,
unsigned rng) {
int d;
int c;
int s;
c = enc->cnt;
OD_ASSERT(rng <= 65535U);
d = 16 - OD_ILOG_NZ(rng);
s = c + d;
/*TODO: Right now we flush every time we have at least one byte available.
Instead we should use an od_ec_window and flush right before we're about to
shift bits off the end of the window.
For a 32-bit window this is about the same amount of work, but for a 64-bit
window it should be a fair win.*/
if (s >= 0) {
uint16_t *buf;
uint32_t storage;
uint32_t offs;
unsigned m;
buf = enc->precarry_buf;
storage = enc->precarry_storage;
offs = enc->offs;
if (offs + 2 > storage) {
storage = 2 * storage + 2;
buf = (uint16_t *)realloc(buf, sizeof(*buf) * storage);
if (buf == NULL) {
enc->error = -1;
enc->offs = 0;
return;
}
enc->precarry_buf = buf;
enc->precarry_storage = storage;
}
c += 16;
m = (1 << c) - 1;
if (s >= 8) {
OD_ASSERT(offs < storage);
buf[offs++] = (uint16_t)(low >> c);
low &= m;
c -= 8;
m >>= 8;
}
OD_ASSERT(offs < storage);
buf[offs++] = (uint16_t)(low >> c);
s = c + d - 24;
low &= m;
enc->offs = offs;
}
enc->low = low << d;
enc->rng = rng << d;
enc->cnt = s;
}
/*Initializes the encoder.
size: The initial size of the buffer, in bytes.*/
void od_ec_enc_init(od_ec_enc *enc, uint32_t size) {
od_ec_enc_reset(enc);
enc->buf = (unsigned char *)malloc(sizeof(*enc->buf) * size);
enc->storage = size;
if (size > 0 && enc->buf == NULL) {
enc->storage = 0;
enc->error = -1;
}
enc->precarry_buf = (uint16_t *)malloc(sizeof(*enc->precarry_buf) * size);
enc->precarry_storage = size;
if (size > 0 && enc->precarry_buf == NULL) {
enc->precarry_storage = 0;
enc->error = -1;
}
}
/*Reinitializes the encoder.*/
void od_ec_enc_reset(od_ec_enc *enc) {
enc->end_offs = 0;
enc->end_window = 0;
enc->nend_bits = 0;
enc->offs = 0;
enc->low = 0;
enc->rng = 0x8000;
/*This is initialized to -9 so that it crosses zero after we've accumulated
one byte + one carry bit.*/
enc->cnt = -9;
enc->error = 0;
#if OD_MEASURE_EC_OVERHEAD
enc->entropy = 0;
enc->nb_symbols = 0;
#endif
}
/*Frees the buffers used by the encoder.*/
void od_ec_enc_clear(od_ec_enc *enc) {
free(enc->precarry_buf);
free(enc->buf);
}
/*Encodes a symbol given its scaled frequency information.
The frequency information must be discernible by the decoder, assuming it
has read only the previous symbols from the stream.
You can change the frequency information, or even the entire source alphabet,
so long as the decoder can tell from the context of the previously encoded
information that it is supposed to do so as well.
fl: The cumulative frequency of all symbols that come before the one to be
encoded.
fh: The cumulative frequency of all symbols up to and including the one to
be encoded.
Together with fl, this defines the range [fl, fh) in which the decoded
value will fall.
ft: The sum of the frequencies of all the symbols.
This must be at least 16384, and no more than 32768.*/
static void od_ec_encode(od_ec_enc *enc, unsigned fl, unsigned fh,
unsigned ft) {
od_ec_window l;
unsigned r;
int s;
unsigned d;
unsigned u;
unsigned v;
OD_ASSERT(fl < fh);
OD_ASSERT(fh <= ft);
OD_ASSERT(16384 <= ft);
OD_ASSERT(ft <= 32768U);
l = enc->low;
r = enc->rng;
OD_ASSERT(ft <= r);
s = r - ft >= ft;
ft <<= s;
fl <<= s;
fh <<= s;
d = r - ft;
OD_ASSERT(d < ft);
#if OD_EC_REDUCED_OVERHEAD
{
unsigned e;
e = OD_SUBSATU(2 * d, ft);
u = fl + OD_MINI(fl, e) + OD_MINI(OD_SUBSATU(fl, e) >> 1, d);
v = fh + OD_MINI(fh, e) + OD_MINI(OD_SUBSATU(fh, e) >> 1, d);
}
#else
u = fl + OD_MINI(fl, d);
v = fh + OD_MINI(fh, d);
#endif
r = v - u;
l += u;
od_ec_enc_normalize(enc, l, r);
#if OD_MEASURE_EC_OVERHEAD
enc->entropy -= OD_LOG2((double)(fh - fl) / ft);
enc->nb_symbols++;
#endif
}
/*Encodes a symbol given its frequency in Q15.
This is like od_ec_encode() when ft == 32768, but is simpler and has lower
overhead.
Symbols encoded with this function cannot be properly decoded with
od_ec_decode(), and must be decoded with one of the equivalent _q15()
functions instead.
fl: The cumulative frequency of all symbols that come before the one to be
encoded.
fh: The cumulative frequency of all symbols up to and including the one to
be encoded.*/
static void od_ec_encode_q15(od_ec_enc *enc, unsigned fl, unsigned fh) {
od_ec_window l;
unsigned r;
unsigned u;
unsigned v;
OD_ASSERT(fl < fh);
OD_ASSERT(fh <= 32768U);
l = enc->low;
r = enc->rng;
OD_ASSERT(32768U <= r);
u = fl * (uint32_t)r >> 15;
v = fh * (uint32_t)r >> 15;
r = v - u;
l += u;
od_ec_enc_normalize(enc, l, r);
#if OD_MEASURE_EC_OVERHEAD
enc->entropy -= OD_LOG2((double)(fh - fl) / 32768.);
enc->nb_symbols++;
#endif
}
/*Encodes a symbol given its frequency information with an arbitrary scale.
This operates just like od_ec_encode(), but does not require that ft be at
least 16384.
fl: The cumulative frequency of all symbols that come before the one to be
encoded.
fh: The cumulative frequency of all symbols up to and including the one to
be encoded.
ft: The sum of the frequencies of all the symbols.
This must be at least 2 and no more than 32768.*/
static void od_ec_encode_unscaled(od_ec_enc *enc, unsigned fl, unsigned fh,
unsigned ft) {
int s;
OD_ASSERT(fl < fh);
OD_ASSERT(fh <= ft);
OD_ASSERT(2 <= ft);
OD_ASSERT(ft <= 32768U);
s = 15 - OD_ILOG_NZ(ft - 1);
od_ec_encode(enc, fl << s, fh << s, ft << s);
}
/*Encode a bit that has an fz/ft probability of being a zero.
val: The value to encode (0 or 1).
fz: The probability that val is zero, scaled by ft.
ft: The total probability.
This must be at least 16384 and no more than 32768.*/
void od_ec_encode_bool(od_ec_enc *enc, int val, unsigned fz, unsigned ft) {
od_ec_window l;
unsigned r;
int s;
unsigned v;
OD_ASSERT(0 < fz);
OD_ASSERT(fz < ft);
OD_ASSERT(16384 <= ft);
OD_ASSERT(ft <= 32768U);
l = enc->low;
r = enc->rng;
OD_ASSERT(ft <= r);
s = r - ft >= ft;
ft <<= s;
fz <<= s;
OD_ASSERT(r - ft < ft);
#if OD_EC_REDUCED_OVERHEAD
{
unsigned d;
unsigned e;
d = r - ft;
e = OD_SUBSATU(2 * d, ft);
v = fz + OD_MINI(fz, e) + OD_MINI(OD_SUBSATU(fz, e) >> 1, d);
}
#else
v = fz + OD_MINI(fz, r - ft);
#endif
if (val) l += v;
r = val ? r - v : v;
od_ec_enc_normalize(enc, l, r);
#if OD_MEASURE_EC_OVERHEAD
enc->entropy -= OD_LOG2((double)(val ? ft - fz : fz) / ft);
enc->nb_symbols++;
#endif
}
/*Encode a bit that has an fz probability of being a zero in Q15.
This is a simpler, lower overhead version of od_ec_encode_bool() for use when
ft == 32768.
Symbols encoded with this function cannot be properly decoded with
od_ec_decode(), and must be decoded with one of the equivalent _q15()
functions instead.
val: The value to encode (0 or 1).
fz: The probability that val is zero, scaled by 32768.*/
void od_ec_encode_bool_q15(od_ec_enc *enc, int val, unsigned fz) {
od_ec_window l;
unsigned r;
unsigned v;
OD_ASSERT(0 < fz);
OD_ASSERT(fz < 32768U);
l = enc->low;
r = enc->rng;
OD_ASSERT(32768U <= r);
v = fz * (uint32_t)r >> 15;
if (val) l += v;
r = val ? r - v : v;
od_ec_enc_normalize(enc, l, r);
#if OD_MEASURE_EC_OVERHEAD
enc->entropy -= OD_LOG2((double)(val ? 32768 - fz : fz) / 32768.);
enc->nb_symbols++;
#endif
}
/*Encodes a symbol given a cumulative distribution function (CDF) table.
s: The index of the symbol to encode.
cdf: The CDF, such that symbol s falls in the range
[s > 0 ? cdf[s - 1] : 0, cdf[s]).
The values must be monotonically non-decreasing, and the last value
must be at least 16384, and no more than 32768.
nsyms: The number of symbols in the alphabet.
This should be at most 16.*/
void od_ec_encode_cdf(od_ec_enc *enc, int s, const uint16_t *cdf, int nsyms) {
OD_ASSERT(s >= 0);
OD_ASSERT(s < nsyms);
od_ec_encode(enc, s > 0 ? cdf[s - 1] : 0, cdf[s], cdf[nsyms - 1]);
}
/*Encodes a symbol given a cumulative distribution function (CDF) table in Q15.
This is a simpler, lower overhead version of od_ec_encode_cdf() for use when
cdf[nsyms - 1] == 32768.
Symbols encoded with this function cannot be properly decoded with
od_ec_decode(), and must be decoded with one of the equivalent _q15()
functions instead.
s: The index of the symbol to encode.
cdf: The CDF, such that symbol s falls in the range
[s > 0 ? cdf[s - 1] : 0, cdf[s]).
The values must be monotonically non-decreasing, and the last value
must be exactly 32768.
nsyms: The number of symbols in the alphabet.
This should be at most 16.*/
void od_ec_encode_cdf_q15(od_ec_enc *enc, int s, const uint16_t *cdf,
int nsyms) {
(void)nsyms;
OD_ASSERT(s >= 0);
OD_ASSERT(s < nsyms);
OD_ASSERT(cdf[nsyms - 1] == 32768U);
od_ec_encode_q15(enc, s > 0 ? cdf[s - 1] : 0, cdf[s]);
}
/*Encodes a symbol given a cumulative distribution function (CDF) table.
s: The index of the symbol to encode.
cdf: The CDF, such that symbol s falls in the range
[s > 0 ? cdf[s - 1] : 0, cdf[s]).
The values must be monotonically non-decreasing, and the last value
must be at least 2, and no more than 32768.
nsyms: The number of symbols in the alphabet.
This should be at most 16.*/
void od_ec_encode_cdf_unscaled(od_ec_enc *enc, int s, const uint16_t *cdf,
int nsyms) {
OD_ASSERT(s >= 0);
OD_ASSERT(s < nsyms);
od_ec_encode_unscaled(enc, s > 0 ? cdf[s - 1] : 0, cdf[s], cdf[nsyms - 1]);
}
/*Equivalent to od_ec_encode_cdf_q15() with the cdf scaled by
(1 << (15 - ftb)).
s: The index of the symbol to encode.
cdf: The CDF, such that symbol s falls in the range
[s > 0 ? cdf[s - 1] : 0, cdf[s]).
The values must be monotonically non-decreasing, and the last value
must be exactly 1 << ftb.
nsyms: The number of symbols in the alphabet.
This should be at most 16.
ftb: The number of bits of precision in the cumulative distribution.
This must be no more than 15.*/
void od_ec_encode_cdf_unscaled_dyadic(od_ec_enc *enc, int s,
const uint16_t *cdf, int nsyms,
unsigned ftb) {
(void)nsyms;
OD_ASSERT(s >= 0);
OD_ASSERT(s < nsyms);
OD_ASSERT(ftb <= 15);
OD_ASSERT(cdf[nsyms - 1] == 1U << ftb);
od_ec_encode_q15(enc, s > 0 ? cdf[s - 1] << (15 - ftb) : 0,
cdf[s] << (15 - ftb));
}
/*Encodes a raw unsigned integer in the stream.
fl: The integer to encode.
ft: The number of integers that can be encoded (one more than the max).
This must be at least 2, and no more than 2**29.*/
void od_ec_enc_uint(od_ec_enc *enc, uint32_t fl, uint32_t ft) {
OD_ASSERT(ft >= 2);
OD_ASSERT(fl < ft);
OD_ASSERT(ft <= (uint32_t)1 << (25 + OD_EC_UINT_BITS));
if (ft > 1U << OD_EC_UINT_BITS) {
int ft1;
int ftb;
ft--;
ftb = OD_ILOG_NZ(ft) - OD_EC_UINT_BITS;
ft1 = (int)(ft >> ftb) + 1;
od_ec_encode_cdf_q15(enc, (int)(fl >> ftb), OD_UNIFORM_CDF_Q15(ft1), ft1);
od_ec_enc_bits(enc, fl & (((uint32_t)1 << ftb) - 1), ftb);
} else {
od_ec_encode_cdf_q15(enc, (int)fl, OD_UNIFORM_CDF_Q15(ft), (int)ft);
}
}
/*Encodes a sequence of raw bits in the stream.
fl: The bits to encode.
ftb: The number of bits to encode.
This must be between 0 and 25, inclusive.*/
void od_ec_enc_bits(od_ec_enc *enc, uint32_t fl, unsigned ftb) {
od_ec_window end_window;
int nend_bits;
OD_ASSERT(ftb <= 25);
OD_ASSERT(fl < (uint32_t)1 << ftb);
#if OD_MEASURE_EC_OVERHEAD
enc->entropy += ftb;
#endif
end_window = enc->end_window;
nend_bits = enc->nend_bits;
if (nend_bits + ftb > OD_EC_WINDOW_SIZE) {
unsigned char *buf;
uint32_t storage;
uint32_t end_offs;
buf = enc->buf;
storage = enc->storage;
end_offs = enc->end_offs;
if (end_offs + (OD_EC_WINDOW_SIZE >> 3) >= storage) {
unsigned char *new_buf;
uint32_t new_storage;
new_storage = 2 * storage + (OD_EC_WINDOW_SIZE >> 3);
new_buf = (unsigned char *)malloc(sizeof(*new_buf) * new_storage);
if (new_buf == NULL) {
enc->error = -1;
enc->end_offs = 0;
return;
}
OD_COPY(new_buf + new_storage - end_offs, buf + storage - end_offs,
end_offs);
storage = new_storage;
free(buf);
enc->buf = buf = new_buf;
enc->storage = storage;
}
do {
OD_ASSERT(end_offs < storage);
buf[storage - ++end_offs] = (unsigned char)end_window;
end_window >>= 8;
nend_bits -= 8;
} while (nend_bits >= 8);
enc->end_offs = end_offs;
}
OD_ASSERT(nend_bits + ftb <= OD_EC_WINDOW_SIZE);
end_window |= (od_ec_window)fl << nend_bits;
nend_bits += ftb;
enc->end_window = end_window;
enc->nend_bits = nend_bits;
}
/*Overwrites a few bits at the very start of an existing stream, after they
have already been encoded.
This makes it possible to have a few flags up front, where it is easy for
decoders to access them without parsing the whole stream, even if their
values are not determined until late in the encoding process, without having
to buffer all the intermediate symbols in the encoder.
In order for this to work, at least nbits bits must have already been encoded
using probabilities that are an exact power of two.
The encoder can verify that the number of encoded bits is sufficient, but
check this latter condition.
val: The bits to encode (in the least nbits significant bits).
They will be decoded in order from most-significant to least.
nbits: The number of bits to overwrite.
This must be no more than 8.*/
void od_ec_enc_patch_initial_bits(od_ec_enc *enc, unsigned val, int nbits) {
int shift;
unsigned mask;
OD_ASSERT(nbits >= 0);
OD_ASSERT(nbits <= 8);
OD_ASSERT(val < 1U << nbits);
shift = 8 - nbits;
mask = ((1U << nbits) - 1) << shift;
if (enc->offs > 0) {
/*The first byte has been finalized.*/
enc->precarry_buf[0] =
(uint16_t)((enc->precarry_buf[0] & ~mask) | val << shift);
} else if (9 + enc->cnt + (enc->rng == 0x8000) > nbits) {
/*The first byte has yet to be output.*/
enc->low = (enc->low & ~((od_ec_window)mask << (16 + enc->cnt))) |
(od_ec_window)val << (16 + enc->cnt + shift);
} else {
/*The encoder hasn't even encoded nbits of data yet.*/
enc->error = -1;
}
}
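/*Illustrative only: a hypothetical use of the patching mechanism, assuming
   enc was freshly reset.
  Three placeholder bits are coded up front with probability one half (an
   exact power of two, as required) and overwritten once their true value is
   known.*/
static void example_patch_flags(od_ec_enc *enc, unsigned final_flags) {
  int i;
  /*Reserve 3 bits at the very start of the stream.*/
  for (i = 0; i < 3; i++) od_ec_encode_bool_q15(enc, 0, 16384);
  /*... encode the rest of the frame here ...*/
  /*Overwrite the reserved bits now that their value is known.*/
  od_ec_enc_patch_initial_bits(enc, final_flags & 7, 3);
}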
#if OD_MEASURE_EC_OVERHEAD
#include <stdio.h>
#endif
/*Indicates that there are no more symbols to encode.
All remaining output bytes are flushed to the output buffer.
od_ec_enc_reset() should be called before using the encoder again.
nbytes: Returns the size of the encoded data in the returned buffer.
Return: A pointer to the start of the final buffer, or NULL if there was an
encoding error.*/
unsigned char *od_ec_enc_done(od_ec_enc *enc, uint32_t *nbytes) {
unsigned char *out;
uint32_t storage;
uint16_t *buf;
uint32_t offs;
uint32_t end_offs;
int nend_bits;
od_ec_window m;
od_ec_window e;
od_ec_window l;
unsigned r;
int c;
int s;
if (enc->error) return NULL;
#if OD_MEASURE_EC_OVERHEAD
{
uint32_t tell;
/* Don't count the 1 bit we lose to raw bits as overhead. */
tell = od_ec_enc_tell(enc) - 1;
fprintf(stderr, "overhead: %f%%\n",
100 * (tell - enc->entropy) / enc->entropy);
fprintf(stderr, "efficiency: %f bits/symbol\n",
(double)tell / enc->nb_symbols);
}
#endif
/*We output the minimum number of bits that ensures that the symbols encoded
thus far will be decoded correctly regardless of the bits that follow.*/
l = enc->low;
r = enc->rng;
c = enc->cnt;
s = 9;
m = 0x7FFF;
e = (l + m) & ~m;
while ((e | m) >= l + r) {
s++;
m >>= 1;
e = (l + m) & ~m;
}
s += c;
offs = enc->offs;
buf = enc->precarry_buf;
if (s > 0) {
unsigned n;
storage = enc->precarry_storage;
if (offs + ((s + 7) >> 3) > storage) {
storage = storage * 2 + ((s + 7) >> 3);
buf = (uint16_t *)realloc(buf, sizeof(*buf) * storage);
if (buf == NULL) {
enc->error = -1;
return NULL;
}
enc->precarry_buf = buf;
enc->precarry_storage = storage;
}
n = (1 << (c + 16)) - 1;
do {
OD_ASSERT(offs < storage);
buf[offs++] = (uint16_t)(e >> (c + 16));
e &= n;
s -= 8;
c -= 8;
n >>= 8;
} while (s > 0);
}
/*Make sure there's enough room for the entropy-coded bits and the raw
bits.*/
out = enc->buf;
storage = enc->storage;
end_offs = enc->end_offs;
e = enc->end_window;
nend_bits = enc->nend_bits;
s = -s;
c = OD_MAXI((nend_bits - s + 7) >> 3, 0);
if (offs + end_offs + c > storage) {
storage = offs + end_offs + c;
out = (unsigned char *)realloc(out, sizeof(*out) * storage);
if (out == NULL) {
enc->error = -1;
return NULL;
}
OD_MOVE(out + storage - end_offs, out + enc->storage - end_offs, end_offs);
enc->buf = out;
enc->storage = storage;
}
/*If we have buffered raw bits, flush them as well.*/
while (nend_bits > s) {
OD_ASSERT(end_offs < storage);
out[storage - ++end_offs] = (unsigned char)e;
e >>= 8;
nend_bits -= 8;
}
*nbytes = offs + end_offs;
/*Perform carry propagation.*/
OD_ASSERT(offs + end_offs <= storage);
out = out + storage - (offs + end_offs);
c = 0;
end_offs = offs;
while (offs-- > 0) {
c = buf[offs] + c;
out[offs] = (unsigned char)c;
c >>= 8;
}
/*Add any remaining raw bits to the last byte.
There is guaranteed to be enough room, because nend_bits <= s.*/
OD_ASSERT(nend_bits <= 0 || end_offs > 0);
if (nend_bits > 0) out[end_offs - 1] |= (unsigned char)e;
/*Note: Unless there's an allocation error, if you keep encoding into the
current buffer and call this function again later, everything will work
just fine (you won't get a new packet out, but you will get a single
buffer with the new data appended to the old).
However, this function is O(N) where N is the amount of data coded so far,
so calling it more than once for a given packet is a bad idea.*/
return out;
}
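/*Illustrative only: the basic encoder lifecycle (not part of the original
   file), using only functions defined in this file.*/
static int example_encode_packet(unsigned char **out, uint32_t *nbytes) {
  od_ec_enc enc;
  unsigned char *packet;
  od_ec_enc_init(&enc, 1024);
  /*Encode one boolean with a 3/4 probability of being zero, then 6 raw
     bits.*/
  od_ec_encode_bool_q15(&enc, 1, 24576);
  od_ec_enc_bits(&enc, 0x2A, 6);
  packet = od_ec_enc_done(&enc, nbytes);
  if (packet == NULL) {
    od_ec_enc_clear(&enc);
    return -1;
  }
  /*packet points into enc's storage, so copy it out before clearing.*/
  *out = (unsigned char *)malloc(*nbytes);
  if (*out != NULL) memcpy(*out, packet, *nbytes);
  od_ec_enc_clear(&enc);
  return *out == NULL ? -1 : 0;
}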
/*Returns the number of bits "used" by the encoded symbols so far.
This same number can be computed in either the encoder or the decoder, and is
suitable for making coding decisions.
Warning: The value returned by this function can decrease compared to an
earlier call, even after encoding more data, if there is an encoding error
(i.e., a failure to allocate enough space for the output buffer).
Return: The number of bits.
This will always be slightly larger than the exact value (i.e., all
rounding error is in the positive direction).*/
int od_ec_enc_tell(const od_ec_enc *enc) {
/*The 10 here counteracts the offset of -9 baked into cnt, and adds 1 extra
bit, which we reserve for terminating the stream.*/
return (enc->offs + enc->end_offs) * 8 + enc->cnt + enc->nend_bits + 10;
}
/*Returns the number of bits "used" by the encoded symbols so far.
This same number can be computed in either the encoder or the decoder, and is
suitable for making coding decisions.
Warning: The value returned by this function can decrease compared to an
earlier call, even after encoding more data, if there is an encoding error
(i.e., a failure to allocate enough space for the output buffer).
Return: The number of bits scaled by 2**OD_BITRES.
This will always be slightly larger than the exact value (i.e., all
rounding error is in the positive direction).*/
uint32_t od_ec_enc_tell_frac(const od_ec_enc *enc) {
return od_ec_tell_frac(od_ec_enc_tell(enc), enc->rng);
}
/*Saves an entropy coder checkpoint to dst.
This allows an encoder to reverse a series of entropy coder
decisions if it decides that the information would have been
better coded some other way.*/
void od_ec_enc_checkpoint(od_ec_enc *dst, const od_ec_enc *src) {
OD_COPY(dst, src, 1);
}
/*Restores an entropy coder checkpoint saved by od_ec_enc_checkpoint.
This can only be used to restore from checkpoints earlier in the target
state's history: you cannot switch backwards and forwards or otherwise
switch to a state which isn't a causal ancestor of the current state.
Restore is also incompatible with patching the initial bits, as the
changes will remain in the restored version.*/
void od_ec_enc_rollback(od_ec_enc *dst, const od_ec_enc *src) {
unsigned char *buf;
uint32_t storage;
uint16_t *precarry_buf;
uint32_t precarry_storage;
OD_ASSERT(dst->storage >= src->storage);
OD_ASSERT(dst->precarry_storage >= src->precarry_storage);
buf = dst->buf;
storage = dst->storage;
precarry_buf = dst->precarry_buf;
precarry_storage = dst->precarry_storage;
OD_COPY(dst, src, 1);
dst->buf = buf;
dst->storage = storage;
dst->precarry_buf = precarry_buf;
dst->precarry_storage = precarry_storage;
}
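/*Illustrative only: the checkpoint/rollback pattern this API enables.
  The cost of two hypothetical alternatives is compared with
   od_ec_enc_tell_frac() and the cheaper one is kept; each rollback restores
   a causal ancestor of the current state, as required.*/
static void example_try_two_ways(od_ec_enc *enc, void (*encode_a)(od_ec_enc *),
                                 void (*encode_b)(od_ec_enc *)) {
  od_ec_enc checkpoint;
  uint32_t cost_a;
  od_ec_enc_checkpoint(&checkpoint, enc);
  encode_a(enc);
  cost_a = od_ec_enc_tell_frac(enc);
  od_ec_enc_rollback(enc, &checkpoint);
  encode_b(enc);
  if (od_ec_enc_tell_frac(enc) > cost_a) {
    /*Alternative B was more expensive: restore the checkpoint and redo A.*/
    od_ec_enc_rollback(enc, &checkpoint);
    encode_a(enc);
  }
}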


@@ -1,103 +0,0 @@
/*
* Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#if !defined(_entenc_H)
#define _entenc_H (1)
#include <stddef.h>
#include "aom_dsp/entcode.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct od_ec_enc od_ec_enc;
#define OD_MEASURE_EC_OVERHEAD (0)
/*The entropy encoder context.*/
struct od_ec_enc {
/*Buffered output.
This contains only the raw bits until the final call to od_ec_enc_done(),
where all the arithmetic-coded data gets prepended to it.*/
unsigned char *buf;
/*The size of the buffer.*/
uint32_t storage;
/*The offset at which the last byte containing raw bits was written.*/
uint32_t end_offs;
/*Bits that will be read from/written at the end.*/
od_ec_window end_window;
/*Number of valid bits in end_window.*/
int nend_bits;
/*A buffer for output bytes with their associated carry flags.*/
uint16_t *precarry_buf;
/*The size of the pre-carry buffer.*/
uint32_t precarry_storage;
/*The offset at which the next entropy-coded byte will be written.*/
uint32_t offs;
/*The low end of the current range.*/
od_ec_window low;
/*The number of values in the current range.*/
uint16_t rng;
/*The number of bits of data in the current value.*/
int16_t cnt;
/*Nonzero if an error occurred.*/
int error;
#if OD_MEASURE_EC_OVERHEAD
double entropy;
int nb_symbols;
#endif
};
/*See entenc.c for further documentation.*/
void od_ec_enc_init(od_ec_enc *enc, uint32_t size) OD_ARG_NONNULL(1);
void od_ec_enc_reset(od_ec_enc *enc) OD_ARG_NONNULL(1);
void od_ec_enc_clear(od_ec_enc *enc) OD_ARG_NONNULL(1);
void od_ec_encode_bool(od_ec_enc *enc, int val, unsigned fz, unsigned ft)
OD_ARG_NONNULL(1);
void od_ec_encode_bool_q15(od_ec_enc *enc, int val, unsigned fz_q15)
OD_ARG_NONNULL(1);
void od_ec_encode_cdf(od_ec_enc *enc, int s, const uint16_t *cdf, int nsyms)
OD_ARG_NONNULL(1) OD_ARG_NONNULL(3);
void od_ec_encode_cdf_q15(od_ec_enc *enc, int s, const uint16_t *cdf, int nsyms)
OD_ARG_NONNULL(1) OD_ARG_NONNULL(3);
void od_ec_encode_cdf_unscaled(od_ec_enc *enc, int s, const uint16_t *cdf,
int nsyms) OD_ARG_NONNULL(1) OD_ARG_NONNULL(3);
void od_ec_encode_cdf_unscaled_dyadic(od_ec_enc *enc, int s,
const uint16_t *cdf, int nsyms,
unsigned ftb) OD_ARG_NONNULL(1)
OD_ARG_NONNULL(3);
void od_ec_enc_uint(od_ec_enc *enc, uint32_t fl, uint32_t ft) OD_ARG_NONNULL(1);
void od_ec_enc_bits(od_ec_enc *enc, uint32_t fl, unsigned ftb)
OD_ARG_NONNULL(1);
void od_ec_enc_patch_initial_bits(od_ec_enc *enc, unsigned val, int nbits)
OD_ARG_NONNULL(1);
OD_WARN_UNUSED_RESULT unsigned char *od_ec_enc_done(od_ec_enc *enc,
uint32_t *nbytes)
OD_ARG_NONNULL(1) OD_ARG_NONNULL(2);
OD_WARN_UNUSED_RESULT int od_ec_enc_tell(const od_ec_enc *enc)
OD_ARG_NONNULL(1);
OD_WARN_UNUSED_RESULT uint32_t od_ec_enc_tell_frac(const od_ec_enc *enc)
OD_ARG_NONNULL(1);
void od_ec_enc_checkpoint(od_ec_enc *dst, const od_ec_enc *src);
void od_ec_enc_rollback(od_ec_enc *dst, const od_ec_enc *src);
#ifdef __cplusplus
} // extern "C"
#endif
#endif


@@ -1,26 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifndef AOM_DSP_FWD_TXFM_H_
#define AOM_DSP_FWD_TXFM_H_
#include "aom_dsp/txfm_common.h"
static INLINE tran_high_t fdct_round_shift(tran_high_t input) {
tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
// TODO(debargha, peter.derivaz): Find new bounds for this assert
// and make the bounds consts.
// assert(INT16_MIN <= rv && rv <= INT16_MAX);
return rv;
}
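/*ROUND_POWER_OF_TWO(v, n) computes ((v) + (1 << ((n)-1))) >> (n), so this
   divides input by 2**DCT_CONST_BITS with rounding rather than truncation.*/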
void aom_fdct32(const tran_high_t *input, tran_high_t *output, int round);
#endif // AOM_DSP_FWD_TXFM_H_


@@ -1,31 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include "aom_dsp/mips/common_dspr2.h"
#if HAVE_DSPR2
uint8_t aom_ff_cropTbl_a[256 + 2 * CROP_WIDTH];
uint8_t *aom_ff_cropTbl;
void aom_dsputil_static_init(void) {
int i;
for (i = 0; i < 256; i++) aom_ff_cropTbl_a[i + CROP_WIDTH] = i;
for (i = 0; i < CROP_WIDTH; i++) {
aom_ff_cropTbl_a[i] = 0;
aom_ff_cropTbl_a[i + CROP_WIDTH + 256] = 255;
}
aom_ff_cropTbl = &aom_ff_cropTbl_a[CROP_WIDTH];
}
#endif


@@ -1,226 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include "./aom_config.h"
#if CONFIG_EC_MULTISYMBOL
#include <string.h>
#endif
#include "aom_dsp/prob.h"
#if CONFIG_DAALA_EC
#include "aom_dsp/entcode.h"
#endif
const uint8_t aom_norm[256] = {
0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
static unsigned int tree_merge_probs_impl(unsigned int i,
const aom_tree_index *tree,
const aom_prob *pre_probs,
const unsigned int *counts,
aom_prob *probs) {
const int l = tree[i];
const unsigned int left_count =
(l <= 0) ? counts[-l]
: tree_merge_probs_impl(l, tree, pre_probs, counts, probs);
const int r = tree[i + 1];
const unsigned int right_count =
(r <= 0) ? counts[-r]
: tree_merge_probs_impl(r, tree, pre_probs, counts, probs);
const unsigned int ct[2] = { left_count, right_count };
probs[i >> 1] = mode_mv_merge_probs(pre_probs[i >> 1], ct);
return left_count + right_count;
}
void aom_tree_merge_probs(const aom_tree_index *tree, const aom_prob *pre_probs,
const unsigned int *counts, aom_prob *probs) {
tree_merge_probs_impl(0, tree, pre_probs, counts, probs);
}
#if CONFIG_EC_MULTISYMBOL
typedef struct tree_node tree_node;
struct tree_node {
aom_tree_index index;
uint8_t probs[16];
uint8_t prob;
int path;
int len;
int l;
int r;
aom_cdf_prob pdf;
};
/* Compute the probability of this node in Q24. */
static uint32_t tree_node_prob(tree_node n, int i) {
uint32_t prob;
/* 1.0 in Q24 */
prob = 16777216;
for (; i < n.len; i++) {
prob = prob * n.probs[i] >> 8;
}
return prob;
}
static int tree_node_cmp(tree_node a, tree_node b) {
int i;
uint32_t pa;
uint32_t pb;
for (i = 0; i < AOMMIN(a.len, b.len) && a.probs[i] == b.probs[i]; i++) {
}
pa = tree_node_prob(a, i);
pb = tree_node_prob(b, i);
return pa > pb ? 1 : pa < pb ? -1 : 0;
}
/* Given a Q15 probability for the symbol subtree rooted at tree[n], this
computes the probability of each symbol (defined as a node that has no
children). */
static aom_cdf_prob tree_node_compute_probs(tree_node *tree, int n,
aom_cdf_prob pdf) {
if (tree[n].l == 0) {
/* This prevents probability computations in Q15 that underflow from
producing a symbol that has zero probability. */
if (pdf == 0) pdf = 1;
tree[n].pdf = pdf;
return pdf;
} else {
/* We process the branch with the smaller probability first, so any
rounding remainder is absorbed by the larger branch. */
if (tree[n].prob < 128) {
aom_cdf_prob lp;
aom_cdf_prob rp;
lp = (((uint32_t)pdf) * tree[n].prob + 128) >> 8;
lp = tree_node_compute_probs(tree, tree[n].l, lp);
rp = tree_node_compute_probs(tree, tree[n].r, lp > pdf ? 0 : pdf - lp);
return lp + rp;
} else {
aom_cdf_prob rp;
aom_cdf_prob lp;
rp = (((uint32_t)pdf) * (256 - tree[n].prob) + 128) >> 8;
rp = tree_node_compute_probs(tree, tree[n].r, rp);
lp = tree_node_compute_probs(tree, tree[n].l, rp > pdf ? 0 : pdf - rp);
return lp + rp;
}
}
}
static int tree_node_extract(tree_node *tree, int n, int symb,
aom_cdf_prob *pdf, aom_tree_index *index,
int *path, int *len) {
if (tree[n].l == 0) {
pdf[symb] = tree[n].pdf;
if (index != NULL) index[symb] = tree[n].index;
if (path != NULL) path[symb] = tree[n].path;
if (len != NULL) len[symb] = tree[n].len;
return symb + 1;
} else {
symb = tree_node_extract(tree, tree[n].l, symb, pdf, index, path, len);
return tree_node_extract(tree, tree[n].r, symb, pdf, index, path, len);
}
}
int tree_to_cdf(const aom_tree_index *tree, const aom_prob *probs,
aom_tree_index root, aom_cdf_prob *cdf, aom_tree_index *index,
int *path, int *len) {
tree_node symb[2 * 16 - 1];
int nodes;
int next[16];
int size;
int nsymbs;
int i;
/* Create the root node with probability 1 in Q15. */
symb[0].index = root;
symb[0].path = 0;
symb[0].len = 0;
symb[0].l = symb[0].r = 0;
nodes = 1;
next[0] = 0;
size = 1;
nsymbs = 1;
while (size > 0 && nsymbs < 16) {
int m;
tree_node n;
aom_tree_index j;
uint8_t prob;
m = 0;
/* Find the internal node with the largest probability. */
for (i = 1; i < size; i++) {
if (tree_node_cmp(symb[next[i]], symb[next[m]]) > 0) m = i;
}
i = next[m];
memmove(&next[m], &next[m + 1], sizeof(*next) * (size - (m + 1)));
size--;
/* Split this symbol into two symbols */
n = symb[i];
j = n.index;
prob = probs[j >> 1];
/* Left */
n.index = tree[j];
n.path <<= 1;
n.len++;
n.probs[n.len - 1] = prob;
symb[nodes] = n;
if (n.index > 0) {
next[size++] = nodes;
}
/* Right */
n.index = tree[j + 1];
n.path += 1;
n.probs[n.len - 1] = 256 - prob;
symb[nodes + 1] = n;
if (n.index > 0) {
next[size++] = nodes + 1;
}
symb[i].prob = prob;
symb[i].l = nodes;
symb[i].r = nodes + 1;
nodes += 2;
nsymbs++;
}
/* Compute the probabilities of each symbol in Q15 */
tree_node_compute_probs(symb, 0, 32768);
/* Extract the cdf, index, path and length */
tree_node_extract(symb, 0, 0, cdf, index, path, len);
/* Convert to CDF */
for (i = 1; i < nsymbs; i++) {
cdf[i] = cdf[i - 1] + cdf[i];
}
return nsymbs;
}
/* This code assumes that the unique leaf nodes of tree are the integer values
0 to len - 1, and produces the forward and inverse mapping tables in ind[]
and inv[], respectively. */
void av1_indices_from_tree(int *ind, int *inv, int len,
const aom_tree_index *tree) {
int i;
int index;
for (i = index = 0; i < TREE_SIZE(len); i++) {
const aom_tree_index j = tree[i];
if (j <= 0) {
inv[index] = -j;
ind[-j] = index++;
}
}
}
#endif


@@ -1,158 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifndef AOM_DSP_PROB_H_
#define AOM_DSP_PROB_H_
#include "./aom_config.h"
#include "./aom_dsp_common.h"
#include "aom_ports/bitops.h"
#include "aom_ports/mem.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef uint8_t aom_prob;
// TODO(negge): Rename this aom_prob once we remove vpxbool.
typedef uint16_t aom_cdf_prob;
#define MAX_PROB 255
#define aom_prob_half ((aom_prob)128)
typedef int8_t aom_tree_index;
#define TREE_SIZE(leaf_count) (-2 + 2 * (leaf_count))
#define aom_complement(x) (255 - x)
#define MODE_MV_COUNT_SAT 20
/* We build coding trees compactly in arrays.
Each node of the tree is a pair of aom_tree_indices.
Array index often references a corresponding probability table.
Index <= 0 means done encoding/decoding and value = -Index,
Index > 0 means need another bit, specification at index.
Nonnegative indices are always even; processing begins at node 0. */
typedef const aom_tree_index aom_tree[];
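/* For illustration, a hypothetical 3-symbol tree (not part of this file)
   could be laid out as { 0, 2, -1, -2 }, with TREE_SIZE(3) == 4 entries:
   node 0 either terminates with symbol 0 (index <= 0, value = -index) or
   jumps to node 2, which chooses between symbols 1 and 2.
   probs[i >> 1] holds the aom_prob of taking the left branch at node i. */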
static INLINE aom_prob clip_prob(int p) {
return (p > 255) ? 255 : (p < 1) ? 1 : p;
}
static INLINE aom_prob get_prob(int num, int den) {
return (den == 0) ? 128u : clip_prob(((int64_t)num * 256 + (den >> 1)) / den);
}
static INLINE aom_prob get_binary_prob(int n0, int n1) {
return get_prob(n0, n0 + n1);
}
/* This function assumes prob1 and prob2 are already within [1,255] range. */
static INLINE aom_prob weighted_prob(int prob1, int prob2, int factor) {
return ROUND_POWER_OF_TWO(prob1 * (256 - factor) + prob2 * factor, 8);
}
static INLINE aom_prob merge_probs(aom_prob pre_prob, const unsigned int ct[2],
unsigned int count_sat,
unsigned int max_update_factor) {
const aom_prob prob = get_binary_prob(ct[0], ct[1]);
const unsigned int count = AOMMIN(ct[0] + ct[1], count_sat);
const unsigned int factor = max_update_factor * count / count_sat;
return weighted_prob(pre_prob, prob, factor);
}
// MODE_MV_MAX_UPDATE_FACTOR (128) * count / MODE_MV_COUNT_SAT;
static const int count_to_update_factor[MODE_MV_COUNT_SAT + 1] = {
0, 6, 12, 19, 25, 32, 38, 44, 51, 57, 64,
70, 76, 83, 89, 96, 102, 108, 115, 121, 128
};
static INLINE aom_prob mode_mv_merge_probs(aom_prob pre_prob,
const unsigned int ct[2]) {
const unsigned int den = ct[0] + ct[1];
if (den == 0) {
return pre_prob;
} else {
const unsigned int count = AOMMIN(den, MODE_MV_COUNT_SAT);
const unsigned int factor = count_to_update_factor[count];
const aom_prob prob =
clip_prob(((int64_t)(ct[0]) * 256 + (den >> 1)) / den);
return weighted_prob(pre_prob, prob, factor);
}
}
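/* A worked example: with pre_prob == 128 and ct == { 30, 10 }, den == 40 is
   clamped to count == 20, so factor == 128; the observed probability is
   clip_prob((30 * 256 + 20) / 40) == 192, and the merged result is
   weighted_prob(128, 192, 128) == 160, halfway between the prior and the
   observation. */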
void aom_tree_merge_probs(const aom_tree_index *tree, const aom_prob *pre_probs,
const unsigned int *counts, aom_prob *probs);
#if CONFIG_EC_MULTISYMBOL
int tree_to_cdf(const aom_tree_index *tree, const aom_prob *probs,
aom_tree_index root, aom_cdf_prob *cdf, aom_tree_index *ind,
int *pth, int *len);
static INLINE void av1_tree_to_cdf(const aom_tree_index *tree,
const aom_prob *probs, aom_cdf_prob *cdf) {
aom_tree_index index[16];
int path[16];
int dist[16];
tree_to_cdf(tree, probs, 0, cdf, index, path, dist);
}
#define av1_tree_to_cdf_1D(tree, probs, cdf, u) \
do { \
int i; \
for (i = 0; i < u; i++) { \
av1_tree_to_cdf(tree, probs[i], cdf[i]); \
} \
} while (0)
#define av1_tree_to_cdf_2D(tree, probs, cdf, v, u) \
do { \
int j; \
int i; \
for (j = 0; j < v; j++) { \
for (i = 0; i < u; i++) { \
av1_tree_to_cdf(tree, probs[j][i], cdf[j][i]); \
} \
} \
} while (0)
void av1_indices_from_tree(int *ind, int *inv, int len,
const aom_tree_index *tree);
#endif
DECLARE_ALIGNED(16, extern const uint8_t, aom_norm[256]);
#if CONFIG_EC_ADAPT
static INLINE void update_cdf(aom_cdf_prob *cdf, int val, int nsymbs) {
const int rate = 4 + get_msb(nsymbs);
int i, diff, tmp;
for (i = 0; i < nsymbs; ++i) {
tmp = (i + 1) << (12 - rate);
cdf[i] -= ((cdf[i] - tmp) >> rate);
}
diff = 32768 - cdf[nsymbs - 1];
for (i = val; i < nsymbs; ++i) {
cdf[i] += diff;
}
}
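/* A worked example: for nsymbs == 2 the rate is 4 + get_msb(2) == 5.
   Starting from cdf == { 16384, 32768 } with val == 0, the decay step gives
   { 15876, 31752 }, diff == 1016, and adding diff from index val onward
   yields { 16892, 32768 }: the decoded symbol's probability grows while the
   CDF still terminates at exactly 32768. */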
#endif
#ifdef __cplusplus
} // extern "C"
#endif
#endif // AOM_DSP_PROB_H_


@@ -1,67 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifndef AOM_DSP_PSNR_H_
#define AOM_DSP_PSNR_H_
#include "aom_scale/yv12config.h"
#define MAX_PSNR 100.0
#ifdef __cplusplus
extern "C" {
#endif
typedef struct {
double psnr[4]; // total/y/u/v
uint64_t sse[4]; // total/y/u/v
uint32_t samples[4]; // total/y/u/v
} PSNR_STATS;
/*!\brief Converts SSE to PSNR
*
* Converts the sum of squared errors (SSE) to peak signal-to-noise ratio (PSNR).
*
* \param[in] samples Number of samples
* \param[in] peak Max sample value
* \param[in] sse Sum of squared errors
*/
double aom_sse_to_psnr(double samples, double peak, double sse);
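/* For reference, the conventional conversion is
   psnr = 10 * log10(samples * peak * peak / sse) for sse > 0, capped at
   MAX_PSNR; the implementation may differ in its handling of sse == 0 and
   other edge cases. */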
int64_t aom_get_y_sse_part(const YV12_BUFFER_CONFIG *a,
const YV12_BUFFER_CONFIG *b, int hstart, int width,
int vstart, int height);
int64_t aom_get_y_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b);
int64_t aom_get_u_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b);
int64_t aom_get_v_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b);
#if CONFIG_AOM_HIGHBITDEPTH
int64_t aom_highbd_get_y_sse_part(const YV12_BUFFER_CONFIG *a,
const YV12_BUFFER_CONFIG *b, int hstart,
int width, int vstart, int height);
int64_t aom_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a,
const YV12_BUFFER_CONFIG *b);
int64_t aom_highbd_get_u_sse(const YV12_BUFFER_CONFIG *a,
const YV12_BUFFER_CONFIG *b);
int64_t aom_highbd_get_v_sse(const YV12_BUFFER_CONFIG *a,
const YV12_BUFFER_CONFIG *b);
void aom_calc_highbd_psnr(const YV12_BUFFER_CONFIG *a,
const YV12_BUFFER_CONFIG *b, PSNR_STATS *psnr,
unsigned int bit_depth, unsigned int in_bit_depth);
#endif
void aom_calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b,
PSNR_STATS *psnr);
double aom_psnrhvs(const YV12_BUFFER_CONFIG *source,
const YV12_BUFFER_CONFIG *dest, double *phvs_y,
double *phvs_u, double *phvs_v, uint32_t bd, uint32_t in_bd);
#ifdef __cplusplus
} // extern "C"
#endif
#endif // AOM_DSP_PSNR_H_


@@ -1,686 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include "aom_dsp/quantize.h"
#include "aom_mem/aom_mem.h"
#if CONFIG_AOM_QM
void aom_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block,
const int16_t *round_ptr, const int16_t quant,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t dequant_ptr, uint16_t *eob_ptr,
const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr) {
const int rc = 0;
const int coeff = coeff_ptr[rc];
const int coeff_sign = (coeff >> 31);
const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
int64_t tmp, eob = -1;
int32_t tmp32;
int dequant =
(dequant_ptr * iqm_ptr[rc] + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
if (!skip_block) {
tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
tmp32 = (int32_t)((tmp * qm_ptr[rc] * quant) >> (16 + AOM_QM_BITS));
qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;
dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant;
if (tmp32) eob = 0;
}
*eob_ptr = eob + 1;
}
#if CONFIG_AOM_HIGHBITDEPTH
void aom_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs,
int skip_block, const int16_t *round_ptr,
const int16_t quant, tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr,
uint16_t *eob_ptr, const qm_val_t *qm_ptr,
const qm_val_t *iqm_ptr) {
int eob = -1;
int dequant =
(dequant_ptr * iqm_ptr[0] + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
if (!skip_block) {
const int coeff = coeff_ptr[0];
const int coeff_sign = (coeff >> 31);
const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
const int64_t tmp = abs_coeff + round_ptr[0];
const uint32_t abs_qcoeff =
(uint32_t)((tmp * qm_ptr[0] * quant) >> (16 + AOM_QM_BITS));
qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant;
if (abs_qcoeff) eob = 0;
}
*eob_ptr = eob + 1;
}
#endif
void aom_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
const int16_t *round_ptr, const int16_t quant,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t dequant_ptr, uint16_t *eob_ptr,
const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr) {
const int n_coeffs = 1024;
const int rc = 0;
const int coeff = coeff_ptr[rc];
const int coeff_sign = (coeff >> 31);
const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
int64_t tmp, eob = -1;
int32_t tmp32;
int dequant;
memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
if (!skip_block) {
tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1),
INT16_MIN, INT16_MAX);
tmp32 = (int32_t)((tmp * qm_ptr[rc] * quant) >> (15 + AOM_QM_BITS));
qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;
dequant =
(dequant_ptr * iqm_ptr[rc] + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
dqcoeff_ptr[rc] = (qcoeff_ptr[rc] * dequant) / 2;
if (tmp32) eob = 0;
}
*eob_ptr = eob + 1;
}
#if CONFIG_AOM_HIGHBITDEPTH
void aom_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
const int16_t *round_ptr, const int16_t quant,
tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr,
const int16_t dequant_ptr, uint16_t *eob_ptr,
const qm_val_t *qm_ptr,
const qm_val_t *iqm_ptr) {
const int n_coeffs = 1024;
int eob = -1;
int dequant;
memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
if (!skip_block) {
const int coeff = coeff_ptr[0];
const int coeff_sign = (coeff >> 31);
const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
const int64_t tmp = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[0], 1);
const uint32_t abs_qcoeff =
(uint32_t)((tmp * qm_ptr[0] * quant) >> (15 + AOM_QM_BITS));
qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
dequant =
(dequant_ptr * iqm_ptr[0] + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
dqcoeff_ptr[0] = (qcoeff_ptr[0] * dequant) / 2;
if (abs_qcoeff) eob = 0;
}
*eob_ptr = eob + 1;
}
#endif
void aom_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
int skip_block, const int16_t *zbin_ptr,
const int16_t *round_ptr, const int16_t *quant_ptr,
const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
uint16_t *eob_ptr, const int16_t *scan,
const int16_t *iscan, const qm_val_t *qm_ptr,
const qm_val_t *iqm_ptr) {
int i, non_zero_count = (int)n_coeffs, eob = -1;
const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] };
const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
(void)iscan;
memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
if (!skip_block) {
// Pre-scan pass
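// (Walk backwards from the highest-frequency coefficient in scan order and
// stop at the first one whose weighted value falls outside the zero-bin, so
// only the leading non_zero_count coefficients need full quantization.)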
for (i = (int)n_coeffs - 1; i >= 0; i--) {
const int rc = scan[i];
const qm_val_t wt = qm_ptr[rc];
const int coeff = coeff_ptr[rc] * wt;
if (coeff < (zbins[rc != 0] << AOM_QM_BITS) &&
coeff > (nzbins[rc != 0] << AOM_QM_BITS))
non_zero_count--;
else
break;
}
// Quantization pass: All coefficients with index >= non_zero_count are
// skippable. Note: non_zero_count can be zero.
for (i = 0; i < non_zero_count; i++) {
const int rc = scan[i];
const qm_val_t wt = qm_ptr[rc];
const int coeff = coeff_ptr[rc];
const int coeff_sign = (coeff >> 31);
const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
int dequant;
if (abs_coeff * wt >= (zbins[rc != 0] << AOM_QM_BITS)) {
int32_t tmp32;
int64_t tmp =
clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
tmp = tmp * wt;
tmp32 = ((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
quant_shift_ptr[rc != 0]) >>
(16 + AOM_QM_BITS); // quantization
dequant =
(dequant_ptr[rc != 0] * iqm_ptr[rc] + (1 << (AOM_QM_BITS - 1))) >>
AOM_QM_BITS;
qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;
dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant;
if (tmp32) eob = i;
}
}
}
*eob_ptr = eob + 1;
}
#if CONFIG_AOM_HIGHBITDEPTH
void aom_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
int skip_block, const int16_t *zbin_ptr,
const int16_t *round_ptr, const int16_t *quant_ptr,
const int16_t *quant_shift_ptr,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr, uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan,
const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr) {
int i, non_zero_count = (int)n_coeffs, eob = -1;
const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] };
const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
int dequant;
(void)iscan;
memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
if (!skip_block) {
// Pre-scan pass
for (i = (int)n_coeffs - 1; i >= 0; i--) {
const int rc = scan[i];
const qm_val_t wt = qm_ptr[rc];
const int coeff = coeff_ptr[rc] * wt;
if (coeff < (zbins[rc != 0] << AOM_QM_BITS) &&
coeff > (nzbins[rc != 0] << AOM_QM_BITS))
non_zero_count--;
else
break;
}
// Quantization pass: All coefficients with index >= non_zero_count are
// skippable. Note: non_zero_count can be zero.
for (i = 0; i < non_zero_count; i++) {
const int rc = scan[i];
const int coeff = coeff_ptr[rc];
const qm_val_t wt = qm_ptr[rc];
const int coeff_sign = (coeff >> 31);
const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
if (abs_coeff * wt >= (zbins[rc != 0] << AOM_QM_BITS)) {
const int64_t tmp1 = abs_coeff + round_ptr[rc != 0];
const int64_t tmpw = tmp1 * wt;
const int64_t tmp2 = ((tmpw * quant_ptr[rc != 0]) >> 16) + tmpw;
const uint32_t abs_qcoeff =
(uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> (16 + AOM_QM_BITS));
qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
dequant =
(dequant_ptr[rc != 0] * iqm_ptr[rc] + (1 << (AOM_QM_BITS - 1))) >>
AOM_QM_BITS;
dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant;
if (abs_qcoeff) eob = i;
}
}
}
*eob_ptr = eob + 1;
}
#endif
void aom_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
int skip_block, const int16_t *zbin_ptr,
const int16_t *round_ptr, const int16_t *quant_ptr,
const int16_t *quant_shift_ptr,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr, uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan,
const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr) {
const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], 1),
ROUND_POWER_OF_TWO(zbin_ptr[1], 1) };
const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
int idx = 0;
int idx_arr[1024];
int i, eob = -1;
int dequant;
(void)iscan;
memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
if (!skip_block) {
// Pre-scan pass
for (i = 0; i < n_coeffs; i++) {
const int rc = scan[i];
const qm_val_t wt = qm_ptr[rc];
const int coeff = coeff_ptr[rc] * wt;
// If the coefficient is out of the base ZBIN range, keep it for
// quantization.
if (coeff >= (zbins[rc != 0] << AOM_QM_BITS) ||
coeff <= (nzbins[rc != 0] << AOM_QM_BITS))
idx_arr[idx++] = i;
}
// Quantization pass: only process the coefficients selected in
// pre-scan pass. Note: idx can be zero.
for (i = 0; i < idx; i++) {
const int rc = scan[idx_arr[i]];
const int coeff = coeff_ptr[rc];
const int coeff_sign = (coeff >> 31);
const qm_val_t wt = qm_ptr[rc];
int64_t tmp;
int tmp32;
int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
tmp = clamp(abs_coeff, INT16_MIN, INT16_MAX);
tmp = tmp * wt;
tmp32 = ((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
quant_shift_ptr[rc != 0]) >>
(15 + AOM_QM_BITS);
qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;
dequant =
(dequant_ptr[rc != 0] * iqm_ptr[rc] + (1 << (AOM_QM_BITS - 1))) >>
AOM_QM_BITS;
dqcoeff_ptr[rc] = (qcoeff_ptr[rc] * dequant) / 2;
if (tmp32) eob = idx_arr[i];
}
}
*eob_ptr = eob + 1;
}
#if CONFIG_AOM_HIGHBITDEPTH
void aom_highbd_quantize_b_32x32_c(
const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
const qm_val_t *iqm_ptr) {
const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], 1),
ROUND_POWER_OF_TWO(zbin_ptr[1], 1) };
const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
int idx = 0;
int idx_arr[1024];
int i, eob = -1;
int dequant;
(void)iscan;
memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
if (!skip_block) {
// Pre-scan pass
for (i = 0; i < n_coeffs; i++) {
const int rc = scan[i];
const qm_val_t wt = qm_ptr[rc];
const int coeff = coeff_ptr[rc] * wt;
// If the coefficient is out of the base ZBIN range, keep it for
// quantization.
if (coeff >= (zbins[rc != 0] << AOM_QM_BITS) ||
coeff <= (nzbins[rc != 0] << AOM_QM_BITS))
idx_arr[idx++] = i;
}
// Quantization pass: only process the coefficients selected in
// pre-scan pass. Note: idx can be zero.
for (i = 0; i < idx; i++) {
const int rc = scan[idx_arr[i]];
const int coeff = coeff_ptr[rc];
const int coeff_sign = (coeff >> 31);
const qm_val_t wt = qm_ptr[rc];
const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
const int64_t tmp1 =
abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
const int64_t tmpw = tmp1 * wt;
const int64_t tmp2 = ((tmpw * quant_ptr[rc != 0]) >> 16) + tmpw;
const uint32_t abs_qcoeff =
(uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> (15 + AOM_QM_BITS));
qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
dequant =
(dequant_ptr[rc != 0] * iqm_ptr[rc] + (1 << (AOM_QM_BITS - 1))) >>
AOM_QM_BITS;
dqcoeff_ptr[rc] = (qcoeff_ptr[rc] * dequant) / 2;
if (abs_qcoeff) eob = idx_arr[i];
}
}
*eob_ptr = eob + 1;
}
#endif
#else
void aom_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block,
const int16_t *round_ptr, const int16_t quant,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t dequant_ptr, uint16_t *eob_ptr) {
const int rc = 0;
const int coeff = coeff_ptr[rc];
const int coeff_sign = (coeff >> 31);
const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
int tmp, eob = -1;
memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
if (!skip_block) {
tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
tmp = (tmp * quant) >> 16;
qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr;
if (tmp) eob = 0;
}
*eob_ptr = eob + 1;
}
#if CONFIG_AOM_HIGHBITDEPTH
void aom_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs,
int skip_block, const int16_t *round_ptr,
const int16_t quant, tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr,
uint16_t *eob_ptr) {
int eob = -1;
memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
if (!skip_block) {
const int coeff = coeff_ptr[0];
const int coeff_sign = (coeff >> 31);
const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
const int64_t tmp = abs_coeff + round_ptr[0];
const uint32_t abs_qcoeff = (uint32_t)((tmp * quant) >> 16);
qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr;
if (abs_qcoeff) eob = 0;
}
*eob_ptr = eob + 1;
}
#endif
void aom_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
const int16_t *round_ptr, const int16_t quant,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t dequant_ptr, uint16_t *eob_ptr) {
const int n_coeffs = 1024;
const int rc = 0;
const int coeff = coeff_ptr[rc];
const int coeff_sign = (coeff >> 31);
const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
int tmp, eob = -1;
memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
if (!skip_block) {
tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1),
INT16_MIN, INT16_MAX);
tmp = (tmp * quant) >> 15;
qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr / 2;
if (tmp) eob = 0;
}
*eob_ptr = eob + 1;
}
#if CONFIG_AOM_HIGHBITDEPTH
void aom_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
const int16_t *round_ptr, const int16_t quant,
tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr,
const int16_t dequant_ptr,
uint16_t *eob_ptr) {
const int n_coeffs = 1024;
int eob = -1;
memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
if (!skip_block) {
const int coeff = coeff_ptr[0];
const int coeff_sign = (coeff >> 31);
const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
const int64_t tmp = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[0], 1);
const uint32_t abs_qcoeff = (uint32_t)((tmp * quant) >> 15);
qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr / 2;
if (abs_qcoeff) eob = 0;
}
*eob_ptr = eob + 1;
}
#endif
void aom_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
int skip_block, const int16_t *zbin_ptr,
const int16_t *round_ptr, const int16_t *quant_ptr,
const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
uint16_t *eob_ptr, const int16_t *scan,
const int16_t *iscan) {
int i, non_zero_count = (int)n_coeffs, eob = -1;
const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] };
const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
(void)iscan;
memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
if (!skip_block) {
// Pre-scan pass
for (i = (int)n_coeffs - 1; i >= 0; i--) {
const int rc = scan[i];
const int coeff = coeff_ptr[rc];
if (coeff < zbins[rc != 0] && coeff > nzbins[rc != 0])
non_zero_count--;
else
break;
}
// Quantization pass: All coefficients with index >= non_zero_count are
// skippable. Note: non_zero_count can be zero.
for (i = 0; i < non_zero_count; i++) {
const int rc = scan[i];
const int coeff = coeff_ptr[rc];
const int coeff_sign = (coeff >> 31);
const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
if (abs_coeff >= zbins[rc != 0]) {
int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
tmp = ((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
quant_shift_ptr[rc != 0]) >>
16; // quantization
qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
if (tmp) eob = i;
}
}
}
*eob_ptr = eob + 1;
}
#if CONFIG_AOM_HIGHBITDEPTH
void aom_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
int skip_block, const int16_t *zbin_ptr,
const int16_t *round_ptr, const int16_t *quant_ptr,
const int16_t *quant_shift_ptr,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr, uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan) {
int i, non_zero_count = (int)n_coeffs, eob = -1;
const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] };
const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
(void)iscan;
memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
if (!skip_block) {
// Pre-scan pass
for (i = (int)n_coeffs - 1; i >= 0; i--) {
const int rc = scan[i];
const int coeff = coeff_ptr[rc];
if (coeff < zbins[rc != 0] && coeff > nzbins[rc != 0])
non_zero_count--;
else
break;
}
// Quantization pass: All coefficients with index >= non_zero_count are
// skippable. Note: non_zero_count can be zero.
for (i = 0; i < non_zero_count; i++) {
const int rc = scan[i];
const int coeff = coeff_ptr[rc];
const int coeff_sign = (coeff >> 31);
const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
if (abs_coeff >= zbins[rc != 0]) {
const int64_t tmp1 = abs_coeff + round_ptr[rc != 0];
const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
const uint32_t abs_qcoeff =
(uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 16);
qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
if (abs_qcoeff) eob = i;
}
}
}
*eob_ptr = eob + 1;
}
#endif
void aom_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
int skip_block, const int16_t *zbin_ptr,
const int16_t *round_ptr, const int16_t *quant_ptr,
const int16_t *quant_shift_ptr,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr, uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan) {
const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], 1),
ROUND_POWER_OF_TWO(zbin_ptr[1], 1) };
const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
int idx = 0;
int idx_arr[1024];
int i, eob = -1;
(void)iscan;
memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
if (!skip_block) {
// Pre-scan pass
for (i = 0; i < n_coeffs; i++) {
const int rc = scan[i];
const int coeff = coeff_ptr[rc];
// If the coefficient is out of the base ZBIN range, keep it for
// quantization.
if (coeff >= zbins[rc != 0] || coeff <= nzbins[rc != 0])
idx_arr[idx++] = i;
}
// Quantization pass: only process the coefficients selected in
// pre-scan pass. Note: idx can be zero.
for (i = 0; i < idx; i++) {
const int rc = scan[idx_arr[i]];
const int coeff = coeff_ptr[rc];
const int coeff_sign = (coeff >> 31);
int tmp;
int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
abs_coeff = clamp(abs_coeff, INT16_MIN, INT16_MAX);
tmp = ((((abs_coeff * quant_ptr[rc != 0]) >> 16) + abs_coeff) *
quant_shift_ptr[rc != 0]) >>
15;
qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
if (tmp) eob = idx_arr[i];
}
}
*eob_ptr = eob + 1;
}
#if CONFIG_AOM_HIGHBITDEPTH
void aom_highbd_quantize_b_32x32_c(
const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan) {
const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], 1),
ROUND_POWER_OF_TWO(zbin_ptr[1], 1) };
const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
int idx = 0;
int idx_arr[1024];
int i, eob = -1;
(void)iscan;
memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
if (!skip_block) {
// Pre-scan pass
for (i = 0; i < n_coeffs; i++) {
const int rc = scan[i];
const int coeff = coeff_ptr[rc];
// If the coefficient is out of the base ZBIN range, keep it for
// quantization.
if (coeff >= zbins[rc != 0] || coeff <= nzbins[rc != 0])
idx_arr[idx++] = i;
}
// Quantization pass: only process the coefficients selected in
// pre-scan pass. Note: idx can be zero.
for (i = 0; i < idx; i++) {
const int rc = scan[idx_arr[i]];
const int coeff = coeff_ptr[rc];
const int coeff_sign = (coeff >> 31);
const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
const int64_t tmp1 =
abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
const uint32_t abs_qcoeff =
(uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15);
qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
if (abs_qcoeff) eob = idx_arr[i];
}
}
*eob_ptr = eob + 1;
}
#endif
#endif

@@ -1,91 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifndef AOM_DSP_QUANTIZE_H_
#define AOM_DSP_QUANTIZE_H_
#include "./aom_config.h"
#include "aom_dsp/aom_dsp_common.h"
#ifdef __cplusplus
extern "C" {
#endif
#if CONFIG_AOM_QM
void aom_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block,
const int16_t *round_ptr, const int16_t quant_ptr,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t dequant_ptr, uint16_t *eob_ptr,
const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr);
void aom_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
const int16_t *round_ptr, const int16_t quant_ptr,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t dequant_ptr, uint16_t *eob_ptr,
const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr);
void aom_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
int skip_block, const int16_t *zbin_ptr,
const int16_t *round_ptr, const int16_t *quant_ptr,
const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
uint16_t *eob_ptr, const int16_t *scan,
const int16_t *iscan, const qm_val_t *qm_ptr,
const qm_val_t *iqm_ptr);
#if CONFIG_AOM_HIGHBITDEPTH
void aom_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs,
int skip_block, const int16_t *round_ptr,
const int16_t quant_ptr, tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr,
uint16_t *eob_ptr, const qm_val_t *qm_ptr,
const qm_val_t *iqm_ptr);
void aom_highbd_quantize_dc_32x32(
const tran_low_t *coeff_ptr, int skip_block, const int16_t *round_ptr,
const int16_t quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t dequant_ptr, uint16_t *eob_ptr, const qm_val_t *qm_ptr,
const qm_val_t *iqm_ptr);
void aom_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
int skip_block, const int16_t *zbin_ptr,
const int16_t *round_ptr, const int16_t *quant_ptr,
const int16_t *quant_shift_ptr,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr, uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan,
const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr);
#endif
#else
void aom_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block,
const int16_t *round_ptr, const int16_t quant_ptr,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t dequant_ptr, uint16_t *eob_ptr);
void aom_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
const int16_t *round_ptr, const int16_t quant_ptr,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t dequant_ptr, uint16_t *eob_ptr);
#if CONFIG_AOM_HIGHBITDEPTH
void aom_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs,
int skip_block, const int16_t *round_ptr,
const int16_t quant_ptr, tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr,
uint16_t *eob_ptr);
void aom_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
const int16_t *round_ptr,
const int16_t quant_ptr,
tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr,
const int16_t dequant_ptr, uint16_t *eob_ptr);
#endif
#endif
#ifdef __cplusplus
} // extern "C"
#endif
#endif // AOM_DSP_QUANTIZE_H_

@@ -1,512 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <stdlib.h>
#include "./aom_config.h"
#include "./aom_dsp_rtcd.h"
#include "aom/aom_integer.h"
#include "aom_ports/mem.h"
/* Sum the difference between every corresponding element of the buffers. */
static INLINE unsigned int sad(const uint8_t *a, int a_stride, const uint8_t *b,
int b_stride, int width, int height) {
int y, x;
unsigned int sad = 0;
for (y = 0; y < height; y++) {
for (x = 0; x < width; x++) sad += abs(a[x] - b[x]);
a += a_stride;
b += b_stride;
}
return sad;
}
#define sadMxN(m, n) \
unsigned int aom_sad##m##x##n##_c(const uint8_t *src, int src_stride, \
const uint8_t *ref, int ref_stride) { \
return sad(src, src_stride, ref, ref_stride, m, n); \
} \
unsigned int aom_sad##m##x##n##_avg_c(const uint8_t *src, int src_stride, \
const uint8_t *ref, int ref_stride, \
const uint8_t *second_pred) { \
uint8_t comp_pred[m * n]; \
aom_comp_avg_pred_c(comp_pred, second_pred, m, n, ref, ref_stride); \
return sad(src, src_stride, comp_pred, m, m, n); \
}
// Depending on call sites, pass **ref_array to avoid the & in the subsequent
// call and de-dup with the 4D variant below.
#define sadMxNxK(m, n, k) \
void aom_sad##m##x##n##x##k##_c(const uint8_t *src, int src_stride, \
const uint8_t *ref_array, int ref_stride, \
uint32_t *sad_array) { \
int i; \
for (i = 0; i < k; ++i) \
sad_array[i] = \
aom_sad##m##x##n##_c(src, src_stride, &ref_array[i], ref_stride); \
}
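// For reference, a sketch of what the macro generates: sadMxNxK(16, 16, 3)
// expands to the equivalent of
//   void aom_sad16x16x3_c(const uint8_t *src, int src_stride,
//                         const uint8_t *ref_array, int ref_stride,
//                         uint32_t *sad_array) {
//     int i;
//     for (i = 0; i < 3; ++i)
//       sad_array[i] =
//           aom_sad16x16_c(src, src_stride, &ref_array[i], ref_stride);
//   }
// i.e. k SADs against references offset by one byte each along ref_array.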
// This appears to be equivalent to the above when k == 4 and the refs are
// const.
#define sadMxNx4D(m, n) \
void aom_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \
const uint8_t *const ref_array[], \
int ref_stride, uint32_t *sad_array) { \
int i; \
for (i = 0; i < 4; ++i) \
sad_array[i] = \
aom_sad##m##x##n##_c(src, src_stride, ref_array[i], ref_stride); \
}
/* clang-format off */
#if CONFIG_AV1 && CONFIG_EXT_PARTITION
// 128x128
sadMxN(128, 128)
sadMxNxK(128, 128, 3)
sadMxNxK(128, 128, 8)
sadMxNx4D(128, 128)
// 128x64
sadMxN(128, 64)
sadMxNx4D(128, 64)
// 64x128
sadMxN(64, 128)
sadMxNx4D(64, 128)
#endif // CONFIG_AV1 && CONFIG_EXT_PARTITION
// 64x64
sadMxN(64, 64)
sadMxNxK(64, 64, 3)
sadMxNxK(64, 64, 8)
sadMxNx4D(64, 64)
// 64x32
sadMxN(64, 32)
sadMxNx4D(64, 32)
// 32x64
sadMxN(32, 64)
sadMxNx4D(32, 64)
// 32x32
sadMxN(32, 32)
sadMxNxK(32, 32, 3)
sadMxNxK(32, 32, 8)
sadMxNx4D(32, 32)
// 32x16
sadMxN(32, 16)
sadMxNx4D(32, 16)
// 16x32
sadMxN(16, 32)
sadMxNx4D(16, 32)
// 16x16
sadMxN(16, 16)
sadMxNxK(16, 16, 3)
sadMxNxK(16, 16, 8)
sadMxNx4D(16, 16)
// 16x8
sadMxN(16, 8)
sadMxNxK(16, 8, 3)
sadMxNxK(16, 8, 8)
sadMxNx4D(16, 8)
// 8x16
sadMxN(8, 16)
sadMxNxK(8, 16, 3)
sadMxNxK(8, 16, 8)
sadMxNx4D(8, 16)
// 8x8
sadMxN(8, 8)
sadMxNxK(8, 8, 3)
sadMxNxK(8, 8, 8)
sadMxNx4D(8, 8)
// 8x4
sadMxN(8, 4)
sadMxNxK(8, 4, 8)
sadMxNx4D(8, 4)
// 4x8
sadMxN(4, 8)
sadMxNxK(4, 8, 8)
sadMxNx4D(4, 8)
// 4x4
sadMxN(4, 4)
sadMxNxK(4, 4, 3)
sadMxNxK(4, 4, 8)
sadMxNx4D(4, 4)
/* clang-format on */
#if CONFIG_AOM_HIGHBITDEPTH
static INLINE unsigned int highbd_sad(const uint8_t *a8, int a_stride,
const uint8_t *b8, int b_stride, int width, int height) {
int y, x;
unsigned int sad = 0;
const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
for (y = 0; y < height; y++) {
for (x = 0; x < width; x++) sad += abs(a[x] - b[x]);
a += a_stride;
b += b_stride;
}
return sad;
}
static INLINE unsigned int highbd_sadb(const uint8_t *a8, int a_stride,
const uint16_t *b, int b_stride,
int width, int height) {
int y, x;
unsigned int sad = 0;
const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
for (y = 0; y < height; y++) {
for (x = 0; x < width; x++) sad += abs(a[x] - b[x]);
a += a_stride;
b += b_stride;
}
return sad;
}
#define highbd_sadMxN(m, n) \
unsigned int aom_highbd_sad##m##x##n##_c(const uint8_t *src, int src_stride, \
const uint8_t *ref, \
int ref_stride) { \
return highbd_sad(src, src_stride, ref, ref_stride, m, n); \
} \
unsigned int aom_highbd_sad##m##x##n##_avg_c( \
const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
const uint8_t *second_pred) { \
uint16_t comp_pred[m * n]; \
aom_highbd_comp_avg_pred_c(comp_pred, second_pred, m, n, ref, ref_stride); \
return highbd_sadb(src, src_stride, comp_pred, m, m, n); \
}
#define highbd_sadMxNxK(m, n, k) \
void aom_highbd_sad##m##x##n##x##k##_c( \
const uint8_t *src, int src_stride, const uint8_t *ref_array, \
int ref_stride, uint32_t *sad_array) { \
int i; \
for (i = 0; i < k; ++i) { \
sad_array[i] = aom_highbd_sad##m##x##n##_c(src, src_stride, \
&ref_array[i], ref_stride); \
} \
}
#define highbd_sadMxNx4D(m, n) \
void aom_highbd_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \
const uint8_t *const ref_array[], \
int ref_stride, uint32_t *sad_array) { \
int i; \
for (i = 0; i < 4; ++i) { \
sad_array[i] = aom_highbd_sad##m##x##n##_c(src, src_stride, \
ref_array[i], ref_stride); \
} \
}
/* clang-format off */
#if CONFIG_AV1 && CONFIG_EXT_PARTITION
// 128x128
highbd_sadMxN(128, 128)
highbd_sadMxNxK(128, 128, 3)
highbd_sadMxNxK(128, 128, 8)
highbd_sadMxNx4D(128, 128)
// 128x64
highbd_sadMxN(128, 64)
highbd_sadMxNx4D(128, 64)
// 64x128
highbd_sadMxN(64, 128)
highbd_sadMxNx4D(64, 128)
#endif // CONFIG_AV1 && CONFIG_EXT_PARTITION
// 64x64
highbd_sadMxN(64, 64)
highbd_sadMxNxK(64, 64, 3)
highbd_sadMxNxK(64, 64, 8)
highbd_sadMxNx4D(64, 64)
// 64x32
highbd_sadMxN(64, 32)
highbd_sadMxNx4D(64, 32)
// 32x64
highbd_sadMxN(32, 64)
highbd_sadMxNx4D(32, 64)
// 32x32
highbd_sadMxN(32, 32)
highbd_sadMxNxK(32, 32, 3)
highbd_sadMxNxK(32, 32, 8)
highbd_sadMxNx4D(32, 32)
// 32x16
highbd_sadMxN(32, 16)
highbd_sadMxNx4D(32, 16)
// 16x32
highbd_sadMxN(16, 32)
highbd_sadMxNx4D(16, 32)
// 16x16
highbd_sadMxN(16, 16)
highbd_sadMxNxK(16, 16, 3)
highbd_sadMxNxK(16, 16, 8)
highbd_sadMxNx4D(16, 16)
// 16x8
highbd_sadMxN(16, 8)
highbd_sadMxNxK(16, 8, 3)
highbd_sadMxNxK(16, 8, 8)
highbd_sadMxNx4D(16, 8)
// 8x16
highbd_sadMxN(8, 16)
highbd_sadMxNxK(8, 16, 3)
highbd_sadMxNxK(8, 16, 8)
highbd_sadMxNx4D(8, 16)
// 8x8
highbd_sadMxN(8, 8)
highbd_sadMxNxK(8, 8, 3)
highbd_sadMxNxK(8, 8, 8)
highbd_sadMxNx4D(8, 8)
// 8x4
highbd_sadMxN(8, 4)
highbd_sadMxNxK(8, 4, 8)
highbd_sadMxNx4D(8, 4)
// 4x8
highbd_sadMxN(4, 8)
highbd_sadMxNxK(4, 8, 8)
highbd_sadMxNx4D(4, 8)
// 4x4
highbd_sadMxN(4, 4)
highbd_sadMxNxK(4, 4, 3)
highbd_sadMxNxK(4, 4, 8)
highbd_sadMxNx4D(4, 4)
/* clang-format on */
#endif // CONFIG_AOM_HIGHBITDEPTH
#if CONFIG_AV1 && CONFIG_EXT_INTER
static INLINE unsigned int masked_sad(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride, const uint8_t *m, int m_stride,
int width, int height) {
int y, x;
unsigned int sad = 0;
for (y = 0; y < height; y++) {
for (x = 0; x < width; x++) sad += m[x] * abs(a[x] - b[x]);
a += a_stride;
b += b_stride;
m += m_stride;
}
sad = (sad + 31) >> 6;
return sad;
}
#define MASKSADMxN(m, n) \
unsigned int aom_masked_sad##m##x##n##_c( \
const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
const uint8_t *msk, int msk_stride) { \
return masked_sad(src, src_stride, ref, ref_stride, msk, msk_stride, m, \
n); \
}
/* clang-format off */
#if CONFIG_EXT_PARTITION
MASKSADMxN(128, 128)
MASKSADMxN(128, 64)
MASKSADMxN(64, 128)
#endif // CONFIG_EXT_PARTITION
MASKSADMxN(64, 64)
MASKSADMxN(64, 32)
MASKSADMxN(32, 64)
MASKSADMxN(32, 32)
MASKSADMxN(32, 16)
MASKSADMxN(16, 32)
MASKSADMxN(16, 16)
MASKSADMxN(16, 8)
MASKSADMxN(8, 16)
MASKSADMxN(8, 8)
MASKSADMxN(8, 4)
MASKSADMxN(4, 8)
MASKSADMxN(4, 4)
/* clang-format on */
#if CONFIG_AOM_HIGHBITDEPTH
static INLINE unsigned int highbd_masked_sad(
const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride,
const uint8_t *m, int m_stride, int width, int height) {
int y, x;
unsigned int sad = 0;
const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
for (y = 0; y < height; y++) {
for (x = 0; x < width; x++) sad += m[x] * abs(a[x] - b[x]);
a += a_stride;
b += b_stride;
m += m_stride;
}
sad = (sad + 31) >> 6;
return sad;
}
#define HIGHBD_MASKSADMXN(m, n) \
unsigned int aom_highbd_masked_sad##m##x##n##_c( \
const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
const uint8_t *msk, int msk_stride) { \
return highbd_masked_sad(src, src_stride, ref, ref_stride, msk, \
msk_stride, m, n); \
}
#if CONFIG_EXT_PARTITION
HIGHBD_MASKSADMXN(128, 128)
HIGHBD_MASKSADMXN(128, 64)
HIGHBD_MASKSADMXN(64, 128)
#endif // CONFIG_EXT_PARTITION
HIGHBD_MASKSADMXN(64, 64)
HIGHBD_MASKSADMXN(64, 32)
HIGHBD_MASKSADMXN(32, 64)
HIGHBD_MASKSADMXN(32, 32)
HIGHBD_MASKSADMXN(32, 16)
HIGHBD_MASKSADMXN(16, 32)
HIGHBD_MASKSADMXN(16, 16)
HIGHBD_MASKSADMXN(16, 8)
HIGHBD_MASKSADMXN(8, 16)
HIGHBD_MASKSADMXN(8, 8)
HIGHBD_MASKSADMXN(8, 4)
HIGHBD_MASKSADMXN(4, 8)
HIGHBD_MASKSADMXN(4, 4)
#endif // CONFIG_AOM_HIGHBITDEPTH
#endif // CONFIG_AV1 && CONFIG_EXT_INTER
#if CONFIG_AV1 && CONFIG_MOTION_VAR
// pre: predictor being evaluated
// wsrc: target weighted prediction (has been *4096 to keep precision)
// mask: 2d weights (scaled by 4096)
static INLINE unsigned int obmc_sad(const uint8_t *pre, int pre_stride,
const int32_t *wsrc, const int32_t *mask,
int width, int height) {
int y, x;
unsigned int sad = 0;
for (y = 0; y < height; y++) {
for (x = 0; x < width; x++)
sad += ROUND_POWER_OF_TWO(abs(wsrc[x] - pre[x] * mask[x]), 12);
pre += pre_stride;
wsrc += width;
mask += width;
}
return sad;
}
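// Sanity check (a sketch): with mask[x] == 4096 everywhere and
// wsrc[x] == src[x] * 4096, each term reduces to
//   ROUND_POWER_OF_TWO(abs(4096 * (src[x] - pre[x])), 12)
//     == abs(src[x] - pre[x])
// so obmc_sad degenerates to a plain SAD, as expected.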
#define OBMCSADMxN(m, n) \
unsigned int aom_obmc_sad##m##x##n##_c(const uint8_t *ref, int ref_stride, \
const int32_t *wsrc, \
const int32_t *mask) { \
return obmc_sad(ref, ref_stride, wsrc, mask, m, n); \
}
/* clang-format off */
#if CONFIG_EXT_PARTITION
OBMCSADMxN(128, 128)
OBMCSADMxN(128, 64)
OBMCSADMxN(64, 128)
#endif // CONFIG_EXT_PARTITION
OBMCSADMxN(64, 64)
OBMCSADMxN(64, 32)
OBMCSADMxN(32, 64)
OBMCSADMxN(32, 32)
OBMCSADMxN(32, 16)
OBMCSADMxN(16, 32)
OBMCSADMxN(16, 16)
OBMCSADMxN(16, 8)
OBMCSADMxN(8, 16)
OBMCSADMxN(8, 8)
OBMCSADMxN(8, 4)
OBMCSADMxN(4, 8)
OBMCSADMxN(4, 4)
/* clang-format on */
#if CONFIG_AOM_HIGHBITDEPTH
static INLINE unsigned int highbd_obmc_sad(const uint8_t *pre8, int pre_stride,
const int32_t *wsrc, const int32_t *mask,
int width, int height) {
int y, x;
unsigned int sad = 0;
const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
for (y = 0; y < height; y++) {
for (x = 0; x < width; x++)
sad += ROUND_POWER_OF_TWO(abs(wsrc[x] - pre[x] * mask[x]), 12);
pre += pre_stride;
wsrc += width;
mask += width;
}
return sad;
}
#define HIGHBD_OBMCSADMXN(m, n) \
unsigned int aom_highbd_obmc_sad##m##x##n##_c( \
const uint8_t *ref, int ref_stride, const int32_t *wsrc, \
const int32_t *mask) { \
return highbd_obmc_sad(ref, ref_stride, wsrc, mask, m, n); \
}
/* clang-format off */
#if CONFIG_EXT_PARTITION
HIGHBD_OBMCSADMXN(128, 128)
HIGHBD_OBMCSADMXN(128, 64)
HIGHBD_OBMCSADMXN(64, 128)
#endif // CONFIG_EXT_PARTITION
HIGHBD_OBMCSADMXN(64, 64)
HIGHBD_OBMCSADMXN(64, 32)
HIGHBD_OBMCSADMXN(32, 64)
HIGHBD_OBMCSADMXN(32, 32)
HIGHBD_OBMCSADMXN(32, 16)
HIGHBD_OBMCSADMXN(16, 32)
HIGHBD_OBMCSADMXN(16, 16)
HIGHBD_OBMCSADMXN(16, 8)
HIGHBD_OBMCSADMXN(8, 16)
HIGHBD_OBMCSADMXN(8, 8)
HIGHBD_OBMCSADMXN(8, 4)
HIGHBD_OBMCSADMXN(4, 8)
HIGHBD_OBMCSADMXN(4, 4)
/* clang-format on */
#endif // CONFIG_AOM_HIGHBITDEPTH
#endif // CONFIG_AV1 && CONFIG_MOTION_VAR

@@ -1,259 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifndef _V128_INTRINSICS_H
#define _V128_INTRINSICS_H
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "./v128_intrinsics_c.h"
#include "./v64_intrinsics.h"
/* Fallback to plain, unoptimised C. */
typedef c_v128 v128;
SIMD_INLINE uint32_t v128_low_u32(v128 a) { return c_v128_low_u32(a); }
SIMD_INLINE v64 v128_low_v64(v128 a) { return c_v128_low_v64(a); }
SIMD_INLINE v64 v128_high_v64(v128 a) { return c_v128_high_v64(a); }
SIMD_INLINE v128 v128_from_64(uint64_t hi, uint64_t lo) {
return c_v128_from_64(hi, lo);
}
SIMD_INLINE v128 v128_from_v64(v64 hi, v64 lo) {
return c_v128_from_v64(hi, lo);
}
SIMD_INLINE v128 v128_from_32(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
return c_v128_from_32(a, b, c, d);
}
SIMD_INLINE v128 v128_load_unaligned(const void *p) {
return c_v128_load_unaligned(p);
}
SIMD_INLINE v128 v128_load_aligned(const void *p) {
return c_v128_load_aligned(p);
}
SIMD_INLINE void v128_store_unaligned(void *p, v128 a) {
c_v128_store_unaligned(p, a);
}
SIMD_INLINE void v128_store_aligned(void *p, v128 a) {
c_v128_store_aligned(p, a);
}
SIMD_INLINE v128 v128_align(v128 a, v128 b, const unsigned int c) {
return c_v128_align(a, b, c);
}
SIMD_INLINE v128 v128_zero() { return c_v128_zero(); }
SIMD_INLINE v128 v128_dup_8(uint8_t x) { return c_v128_dup_8(x); }
SIMD_INLINE v128 v128_dup_16(uint16_t x) { return c_v128_dup_16(x); }
SIMD_INLINE v128 v128_dup_32(uint32_t x) { return c_v128_dup_32(x); }
typedef uint32_t sad128_internal;
SIMD_INLINE sad128_internal v128_sad_u8_init() { return c_v128_sad_u8_init(); }
SIMD_INLINE sad128_internal v128_sad_u8(sad128_internal s, v128 a, v128 b) {
return c_v128_sad_u8(s, a, b);
}
SIMD_INLINE uint32_t v128_sad_u8_sum(sad128_internal s) {
return c_v128_sad_u8_sum(s);
}
typedef uint32_t ssd128_internal;
SIMD_INLINE ssd128_internal v128_ssd_u8_init() { return c_v128_ssd_u8_init(); }
SIMD_INLINE ssd128_internal v128_ssd_u8(ssd128_internal s, v128 a, v128 b) {
return c_v128_ssd_u8(s, a, b);
}
SIMD_INLINE uint32_t v128_ssd_u8_sum(ssd128_internal s) {
return c_v128_ssd_u8_sum(s);
}
SIMD_INLINE int64_t v128_dotp_s16(v128 a, v128 b) {
return c_v128_dotp_s16(a, b);
}
SIMD_INLINE uint64_t v128_hadd_u8(v128 a) { return c_v128_hadd_u8(a); }
SIMD_INLINE v128 v128_or(v128 a, v128 b) { return c_v128_or(a, b); }
SIMD_INLINE v128 v128_xor(v128 a, v128 b) { return c_v128_xor(a, b); }
SIMD_INLINE v128 v128_and(v128 a, v128 b) { return c_v128_and(a, b); }
SIMD_INLINE v128 v128_andn(v128 a, v128 b) { return c_v128_andn(a, b); }
SIMD_INLINE v128 v128_add_8(v128 a, v128 b) { return c_v128_add_8(a, b); }
SIMD_INLINE v128 v128_add_16(v128 a, v128 b) { return c_v128_add_16(a, b); }
SIMD_INLINE v128 v128_sadd_s16(v128 a, v128 b) { return c_v128_sadd_s16(a, b); }
SIMD_INLINE v128 v128_add_32(v128 a, v128 b) { return c_v128_add_32(a, b); }
SIMD_INLINE v128 v128_padd_s16(v128 a) { return c_v128_padd_s16(a); }
SIMD_INLINE v128 v128_sub_8(v128 a, v128 b) { return c_v128_sub_8(a, b); }
SIMD_INLINE v128 v128_ssub_u8(v128 a, v128 b) { return c_v128_ssub_u8(a, b); }
SIMD_INLINE v128 v128_ssub_s8(v128 a, v128 b) { return c_v128_ssub_s8(a, b); }
SIMD_INLINE v128 v128_sub_16(v128 a, v128 b) { return c_v128_sub_16(a, b); }
SIMD_INLINE v128 v128_ssub_s16(v128 a, v128 b) { return c_v128_ssub_s16(a, b); }
SIMD_INLINE v128 v128_sub_32(v128 a, v128 b) { return c_v128_sub_32(a, b); }
SIMD_INLINE v128 v128_abs_s16(v128 a) { return c_v128_abs_s16(a); }
SIMD_INLINE v128 v128_mul_s16(v64 a, v64 b) { return c_v128_mul_s16(a, b); }
SIMD_INLINE v128 v128_mullo_s16(v128 a, v128 b) {
return c_v128_mullo_s16(a, b);
}
SIMD_INLINE v128 v128_mulhi_s16(v128 a, v128 b) {
return c_v128_mulhi_s16(a, b);
}
SIMD_INLINE v128 v128_mullo_s32(v128 a, v128 b) {
return c_v128_mullo_s32(a, b);
}
SIMD_INLINE v128 v128_madd_s16(v128 a, v128 b) { return c_v128_madd_s16(a, b); }
SIMD_INLINE v128 v128_madd_us8(v128 a, v128 b) { return c_v128_madd_us8(a, b); }
SIMD_INLINE v128 v128_avg_u8(v128 a, v128 b) { return c_v128_avg_u8(a, b); }
SIMD_INLINE v128 v128_rdavg_u8(v128 a, v128 b) { return c_v128_rdavg_u8(a, b); }
SIMD_INLINE v128 v128_avg_u16(v128 a, v128 b) { return c_v128_avg_u16(a, b); }
SIMD_INLINE v128 v128_min_u8(v128 a, v128 b) { return c_v128_min_u8(a, b); }
SIMD_INLINE v128 v128_max_u8(v128 a, v128 b) { return c_v128_max_u8(a, b); }
SIMD_INLINE v128 v128_min_s8(v128 a, v128 b) { return c_v128_min_s8(a, b); }
SIMD_INLINE v128 v128_max_s8(v128 a, v128 b) { return c_v128_max_s8(a, b); }
SIMD_INLINE v128 v128_min_s16(v128 a, v128 b) { return c_v128_min_s16(a, b); }
SIMD_INLINE v128 v128_max_s16(v128 a, v128 b) { return c_v128_max_s16(a, b); }
SIMD_INLINE v128 v128_ziplo_8(v128 a, v128 b) { return c_v128_ziplo_8(a, b); }
SIMD_INLINE v128 v128_ziphi_8(v128 a, v128 b) { return c_v128_ziphi_8(a, b); }
SIMD_INLINE v128 v128_ziplo_16(v128 a, v128 b) { return c_v128_ziplo_16(a, b); }
SIMD_INLINE v128 v128_ziphi_16(v128 a, v128 b) { return c_v128_ziphi_16(a, b); }
SIMD_INLINE v128 v128_ziplo_32(v128 a, v128 b) { return c_v128_ziplo_32(a, b); }
SIMD_INLINE v128 v128_ziphi_32(v128 a, v128 b) { return c_v128_ziphi_32(a, b); }
SIMD_INLINE v128 v128_ziplo_64(v128 a, v128 b) { return c_v128_ziplo_64(a, b); }
SIMD_INLINE v128 v128_ziphi_64(v128 a, v128 b) { return c_v128_ziphi_64(a, b); }
SIMD_INLINE v128 v128_zip_8(v64 a, v64 b) { return c_v128_zip_8(a, b); }
SIMD_INLINE v128 v128_zip_16(v64 a, v64 b) { return c_v128_zip_16(a, b); }
SIMD_INLINE v128 v128_zip_32(v64 a, v64 b) { return c_v128_zip_32(a, b); }
SIMD_INLINE v128 v128_unziplo_8(v128 a, v128 b) {
return c_v128_unziplo_8(a, b);
}
SIMD_INLINE v128 v128_unziphi_8(v128 a, v128 b) {
return c_v128_unziphi_8(a, b);
}
SIMD_INLINE v128 v128_unziplo_16(v128 a, v128 b) {
return c_v128_unziplo_16(a, b);
}
SIMD_INLINE v128 v128_unziphi_16(v128 a, v128 b) {
return c_v128_unziphi_16(a, b);
}
SIMD_INLINE v128 v128_unziplo_32(v128 a, v128 b) {
return c_v128_unziplo_32(a, b);
}
SIMD_INLINE v128 v128_unziphi_32(v128 a, v128 b) {
return c_v128_unziphi_32(a, b);
}
SIMD_INLINE v128 v128_unpack_u8_s16(v64 a) { return c_v128_unpack_u8_s16(a); }
SIMD_INLINE v128 v128_unpacklo_u8_s16(v128 a) {
return c_v128_unpacklo_u8_s16(a);
}
SIMD_INLINE v128 v128_unpackhi_u8_s16(v128 a) {
return c_v128_unpackhi_u8_s16(a);
}
SIMD_INLINE v128 v128_pack_s32_s16(v128 a, v128 b) {
return c_v128_pack_s32_s16(a, b);
}
SIMD_INLINE v128 v128_pack_s16_u8(v128 a, v128 b) {
return c_v128_pack_s16_u8(a, b);
}
SIMD_INLINE v128 v128_pack_s16_s8(v128 a, v128 b) {
return c_v128_pack_s16_s8(a, b);
}
SIMD_INLINE v128 v128_unpack_u16_s32(v64 a) { return c_v128_unpack_u16_s32(a); }
SIMD_INLINE v128 v128_unpack_s16_s32(v64 a) { return c_v128_unpack_s16_s32(a); }
SIMD_INLINE v128 v128_unpacklo_u16_s32(v128 a) {
return c_v128_unpacklo_u16_s32(a);
}
SIMD_INLINE v128 v128_unpacklo_s16_s32(v128 a) {
return c_v128_unpacklo_s16_s32(a);
}
SIMD_INLINE v128 v128_unpackhi_u16_s32(v128 a) {
return c_v128_unpackhi_u16_s32(a);
}
SIMD_INLINE v128 v128_unpackhi_s16_s32(v128 a) {
return c_v128_unpackhi_s16_s32(a);
}
SIMD_INLINE v128 v128_shuffle_8(v128 a, v128 pattern) {
return c_v128_shuffle_8(a, pattern);
}
SIMD_INLINE v128 v128_cmpgt_s8(v128 a, v128 b) { return c_v128_cmpgt_s8(a, b); }
SIMD_INLINE v128 v128_cmplt_s8(v128 a, v128 b) { return c_v128_cmplt_s8(a, b); }
SIMD_INLINE v128 v128_cmpeq_8(v128 a, v128 b) { return c_v128_cmpeq_8(a, b); }
SIMD_INLINE v128 v128_cmpgt_s16(v128 a, v128 b) {
return c_v128_cmpgt_s16(a, b);
}
SIMD_INLINE v128 v128_cmplt_s16(v128 a, v128 b) {
return c_v128_cmplt_s16(a, b);
}
SIMD_INLINE v128 v128_cmpeq_16(v128 a, v128 b) { return c_v128_cmpeq_16(a, b); }
SIMD_INLINE v128 v128_shl_8(v128 a, unsigned int c) {
return c_v128_shl_8(a, c);
}
SIMD_INLINE v128 v128_shr_u8(v128 a, unsigned int c) {
return c_v128_shr_u8(a, c);
}
SIMD_INLINE v128 v128_shr_s8(v128 a, unsigned int c) {
return c_v128_shr_s8(a, c);
}
SIMD_INLINE v128 v128_shl_16(v128 a, unsigned int c) {
return c_v128_shl_16(a, c);
}
SIMD_INLINE v128 v128_shr_u16(v128 a, unsigned int c) {
return c_v128_shr_u16(a, c);
}
SIMD_INLINE v128 v128_shr_s16(v128 a, unsigned int c) {
return c_v128_shr_s16(a, c);
}
SIMD_INLINE v128 v128_shl_32(v128 a, unsigned int c) {
return c_v128_shl_32(a, c);
}
SIMD_INLINE v128 v128_shr_u32(v128 a, unsigned int c) {
return c_v128_shr_u32(a, c);
}
SIMD_INLINE v128 v128_shr_s32(v128 a, unsigned int c) {
return c_v128_shr_s32(a, c);
}
SIMD_INLINE v128 v128_shr_n_byte(v128 a, const unsigned int n) {
return c_v128_shr_n_byte(a, n);
}
SIMD_INLINE v128 v128_shl_n_byte(v128 a, const unsigned int n) {
return c_v128_shl_n_byte(a, n);
}
SIMD_INLINE v128 v128_shl_n_8(v128 a, const unsigned int n) {
return c_v128_shl_n_8(a, n);
}
SIMD_INLINE v128 v128_shl_n_16(v128 a, const unsigned int n) {
return c_v128_shl_n_16(a, n);
}
SIMD_INLINE v128 v128_shl_n_32(v128 a, const unsigned int n) {
return c_v128_shl_n_32(a, n);
}
SIMD_INLINE v128 v128_shr_n_u8(v128 a, const unsigned int n) {
return c_v128_shr_n_u8(a, n);
}
SIMD_INLINE v128 v128_shr_n_u16(v128 a, const unsigned int n) {
return c_v128_shr_n_u16(a, n);
}
SIMD_INLINE v128 v128_shr_n_u32(v128 a, const unsigned int n) {
return c_v128_shr_n_u32(a, n);
}
SIMD_INLINE v128 v128_shr_n_s8(v128 a, const unsigned int n) {
return c_v128_shr_n_s8(a, n);
}
SIMD_INLINE v128 v128_shr_n_s16(v128 a, const unsigned int n) {
return c_v128_shr_n_s16(a, n);
}
SIMD_INLINE v128 v128_shr_n_s32(v128 a, const unsigned int n) {
return c_v128_shr_n_s32(a, n);
}
#endif /* _V128_INTRINSICS_H */

@@ -1,655 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifndef _V128_INTRINSICS_H
#define _V128_INTRINSICS_H
#include <arm_neon.h>
#include "./v64_intrinsics_arm.h"
typedef int64x2_t v128;
SIMD_INLINE uint32_t v128_low_u32(v128 a) {
return v64_low_u32(vget_low_s64(a));
}
SIMD_INLINE v64 v128_low_v64(v128 a) { return vget_low_s64(a); }
SIMD_INLINE v64 v128_high_v64(v128 a) { return vget_high_s64(a); }
SIMD_INLINE v128 v128_from_v64(v64 a, v64 b) { return vcombine_s64(b, a); }
SIMD_INLINE v128 v128_from_64(uint64_t a, uint64_t b) {
return vcombine_s64((uint64x1_t)b, (uint64x1_t)a);
}
SIMD_INLINE v128 v128_from_32(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
return vcombine_s64(v64_from_32(c, d), v64_from_32(a, b));
}
SIMD_INLINE v128 v128_load_aligned(const void *p) {
return vreinterpretq_s64_u8(vld1q_u8((const uint8_t *)p));
}
SIMD_INLINE v128 v128_load_unaligned(const void *p) {
return v128_load_aligned(p);
}
SIMD_INLINE void v128_store_aligned(void *p, v128 r) {
vst1q_u8((uint8_t *)p, vreinterpretq_u8_s64(r));
}
SIMD_INLINE void v128_store_unaligned(void *p, v128 r) {
vst1q_u8((uint8_t *)p, vreinterpretq_u8_s64(r));
}
SIMD_INLINE v128 v128_align(v128 a, v128 b, const unsigned int c) {
// The following functions require an immediate.
// Some compilers will check this during optimisation, others won't.
#if __OPTIMIZE__ && !__clang__
return c ? vreinterpretq_s64_s8(
vextq_s8(vreinterpretq_s8_s64(b), vreinterpretq_s8_s64(a), c))
: b;
#else
return c < 8 ? v128_from_v64(v64_align(v128_low_v64(a), v128_high_v64(b), c),
v64_align(v128_high_v64(b), v128_low_v64(b), c))
: v128_from_v64(
v64_align(v128_high_v64(a), v128_low_v64(a), c - 8),
v64_align(v128_low_v64(a), v128_high_v64(b), c - 8));
#endif
}
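// E.g. (a sketch, little-endian lane order): v128_align(a, b, 3) yields
// b[3..15] in the low 13 byte lanes followed by a[0..2] in the top 3,
// matching the vextq_s8 path above.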
SIMD_INLINE v128 v128_zero() { return vreinterpretq_s64_u8(vdupq_n_u8(0)); }
SIMD_INLINE v128 v128_ones() { return vreinterpretq_s64_u8(vdupq_n_u8(-1)); }
SIMD_INLINE v128 v128_dup_8(uint8_t x) {
return vreinterpretq_s64_u8(vdupq_n_u8(x));
}
SIMD_INLINE v128 v128_dup_16(uint16_t x) {
return vreinterpretq_s64_u16(vdupq_n_u16(x));
}
SIMD_INLINE v128 v128_dup_32(uint32_t x) {
return vreinterpretq_s64_u32(vdupq_n_u32(x));
}
SIMD_INLINE int64_t v128_dotp_s16(v128 a, v128 b) {
return v64_dotp_s16(vget_high_s64(a), vget_high_s64(b)) +
v64_dotp_s16(vget_low_s64(a), vget_low_s64(b));
}
SIMD_INLINE uint64_t v128_hadd_u8(v128 x) {
uint64x2_t t = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vreinterpretq_u8_s64(x))));
return vget_lane_s32(
vreinterpret_s32_u64(vadd_u64(vget_high_u64(t), vget_low_u64(t))), 0);
}
SIMD_INLINE v128 v128_padd_s16(v128 a) {
return vreinterpretq_s64_s32(vpaddlq_s16(vreinterpretq_s16_s64(a)));
}
typedef struct { sad64_internal hi, lo; } sad128_internal;
SIMD_INLINE sad128_internal v128_sad_u8_init() {
sad128_internal s;
s.hi = s.lo = vdupq_n_u16(0);
return s;
}
/* Implementation dependent return value. Result must be finalised with
v128_sad_u8_sum().
The result for more than 32 v128_sad_u8() calls is undefined. */
SIMD_INLINE sad128_internal v128_sad_u8(sad128_internal s, v128 a, v128 b) {
sad128_internal r;
r.hi = v64_sad_u8(s.hi, vget_high_s64(a), vget_high_s64(b));
r.lo = v64_sad_u8(s.lo, vget_low_s64(a), vget_low_s64(b));
return r;
}
SIMD_INLINE uint32_t v128_sad_u8_sum(sad128_internal s) {
return (uint32_t)(v64_sad_u8_sum(s.hi) + v64_sad_u8_sum(s.lo));
}
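/* Typical usage (a sketch; a, b, rows and the strides are hypothetical):
 *   sad128_internal s = v128_sad_u8_init();
 *   for (i = 0; i < rows; i++)  // rows <= 32, per the note above
 *     s = v128_sad_u8(s, v128_load_unaligned(a + i * a_stride),
 *                     v128_load_unaligned(b + i * b_stride));
 *   sad = v128_sad_u8_sum(s);
 * The ssd128_internal accumulator below follows the same
 * init/accumulate/sum pattern. */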
typedef struct { ssd64_internal hi, lo; } ssd128_internal;
SIMD_INLINE ssd128_internal v128_ssd_u8_init() {
ssd128_internal s;
s.hi = s.lo = (ssd64_internal)(uint64_t)0;
return s;
}
/* Implementation dependent return value. Result must be finalised with
* v128_ssd_u8_sum(). */
SIMD_INLINE ssd128_internal v128_ssd_u8(ssd128_internal s, v128 a, v128 b) {
ssd128_internal r;
r.hi = v64_ssd_u8(s.hi, vget_high_s64(a), vget_high_s64(b));
r.lo = v64_ssd_u8(s.lo, vget_low_s64(a), vget_low_s64(b));
return r;
}
SIMD_INLINE uint32_t v128_ssd_u8_sum(ssd128_internal s) {
return (uint32_t)(v64_ssd_u8_sum(s.hi) + v64_ssd_u8_sum(s.lo));
}
SIMD_INLINE v128 v128_or(v128 x, v128 y) { return vorrq_s64(x, y); }
SIMD_INLINE v128 v128_xor(v128 x, v128 y) { return veorq_s64(x, y); }
SIMD_INLINE v128 v128_and(v128 x, v128 y) { return vandq_s64(x, y); }
SIMD_INLINE v128 v128_andn(v128 x, v128 y) { return vbicq_s64(x, y); }
SIMD_INLINE v128 v128_add_8(v128 x, v128 y) {
return vreinterpretq_s64_u8(
vaddq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
}
SIMD_INLINE v128 v128_add_16(v128 x, v128 y) {
return vreinterpretq_s64_s16(
vaddq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y)));
}
SIMD_INLINE v128 v128_sadd_s16(v128 x, v128 y) {
return vreinterpretq_s64_s16(
vqaddq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y)));
}
SIMD_INLINE v128 v128_add_32(v128 x, v128 y) {
return vreinterpretq_s64_u32(
vaddq_u32(vreinterpretq_u32_s64(x), vreinterpretq_u32_s64(y)));
}
SIMD_INLINE v128 v128_sub_8(v128 x, v128 y) {
return vreinterpretq_s64_u8(
vsubq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
}
SIMD_INLINE v128 v128_sub_u8(v128 x, v128 y) {
return vreinterpretq_s64_u8(
vqsubq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
}
SIMD_INLINE v128 v128_sub_16(v128 x, v128 y) {
return vreinterpretq_s64_s16(
vsubq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y)));
}
SIMD_INLINE v128 v128_ssub_s16(v128 x, v128 y) {
return vreinterpretq_s64_s16(
vqsubq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y)));
}
SIMD_INLINE v128 v128_ssub_u8(v128 x, v128 y) {
return vreinterpretq_s64_u8(
vqsubq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
}
SIMD_INLINE v128 v128_ssub_s8(v128 x, v128 y) {
return vreinterpretq_s64_s8(
vqsubq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y)));
}
SIMD_INLINE v128 v128_sub_32(v128 x, v128 y) {
return vreinterpretq_s64_s32(
vsubq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y)));
}
SIMD_INLINE v128 v128_abs_s16(v128 x) {
return vreinterpretq_s64_s16(vabsq_s16(vreinterpretq_s16_s64(x)));
}
SIMD_INLINE v128 v128_mul_s16(v64 a, v64 b) {
return vreinterpretq_s64_s32(
vmull_s16(vreinterpret_s16_s64(a), vreinterpret_s16_s64(b)));
}
SIMD_INLINE v128 v128_mullo_s16(v128 a, v128 b) {
return vreinterpretq_s64_s16(
vmulq_s16(vreinterpretq_s16_s64(a), vreinterpretq_s16_s64(b)));
}
SIMD_INLINE v128 v128_mulhi_s16(v128 a, v128 b) {
return v128_from_v64(v64_mulhi_s16(vget_high_s64(a), vget_high_s64(b)),
v64_mulhi_s16(vget_low_s64(a), vget_low_s64(b)));
}
SIMD_INLINE v128 v128_mullo_s32(v128 a, v128 b) {
return vreinterpretq_s64_s32(
vmulq_s32(vreinterpretq_s32_s64(a), vreinterpretq_s32_s64(b)));
}
SIMD_INLINE v128 v128_madd_s16(v128 a, v128 b) {
return v128_from_v64(v64_madd_s16(vget_high_s64(a), vget_high_s64(b)),
v64_madd_s16(vget_low_s64(a), vget_low_s64(b)));
}
SIMD_INLINE v128 v128_madd_us8(v128 a, v128 b) {
return v128_from_v64(v64_madd_us8(vget_high_s64(a), vget_high_s64(b)),
v64_madd_us8(vget_low_s64(a), vget_low_s64(b)));
}
SIMD_INLINE v128 v128_avg_u8(v128 x, v128 y) {
return vreinterpretq_s64_u8(
vrhaddq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
}
SIMD_INLINE v128 v128_rdavg_u8(v128 x, v128 y) {
return vreinterpretq_s64_u8(
vhaddq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
}
SIMD_INLINE v128 v128_avg_u16(v128 x, v128 y) {
return vreinterpretq_s64_u16(
vrhaddq_u16(vreinterpretq_u16_s64(x), vreinterpretq_u16_s64(y)));
}
SIMD_INLINE v128 v128_min_u8(v128 x, v128 y) {
return vreinterpretq_s64_u8(
vminq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
}
SIMD_INLINE v128 v128_max_u8(v128 x, v128 y) {
return vreinterpretq_s64_u8(
vmaxq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
}
SIMD_INLINE v128 v128_min_s8(v128 x, v128 y) {
return vreinterpretq_s64_s8(
vminq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y)));
}
SIMD_INLINE v128 v128_max_s8(v128 x, v128 y) {
return vreinterpretq_s64_s8(
vmaxq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y)));
}
SIMD_INLINE v128 v128_min_s16(v128 x, v128 y) {
return vreinterpretq_s64_s16(
vminq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y)));
}
SIMD_INLINE v128 v128_max_s16(v128 x, v128 y) {
return vreinterpretq_s64_s16(
vmaxq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y)));
}
SIMD_INLINE v128 v128_ziplo_8(v128 x, v128 y) {
uint8x16x2_t r = vzipq_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x));
return vreinterpretq_s64_u8(r.val[0]);
}
SIMD_INLINE v128 v128_ziphi_8(v128 x, v128 y) {
uint8x16x2_t r = vzipq_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x));
return vreinterpretq_s64_u8(r.val[1]);
}
SIMD_INLINE v128 v128_zip_8(v64 x, v64 y) {
uint8x8x2_t r = vzip_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x));
return vreinterpretq_s64_u8(vcombine_u8(r.val[0], r.val[1]));
}
SIMD_INLINE v128 v128_ziplo_16(v128 x, v128 y) {
int16x8x2_t r = vzipq_s16(vreinterpretq_s16_s64(y), vreinterpretq_s16_s64(x));
return vreinterpretq_s64_s16(r.val[0]);
}
SIMD_INLINE v128 v128_ziphi_16(v128 x, v128 y) {
int16x8x2_t r = vzipq_s16(vreinterpretq_s16_s64(y), vreinterpretq_s16_s64(x));
return vreinterpretq_s64_s16(r.val[1]);
}
SIMD_INLINE v128 v128_zip_16(v64 x, v64 y) {
uint16x4x2_t r = vzip_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x));
return vreinterpretq_s64_u16(vcombine_u16(r.val[0], r.val[1]));
}
SIMD_INLINE v128 v128_ziplo_32(v128 x, v128 y) {
int32x4x2_t r = vzipq_s32(vreinterpretq_s32_s64(y), vreinterpretq_s32_s64(x));
return vreinterpretq_s64_s32(r.val[0]);
}
SIMD_INLINE v128 v128_ziphi_32(v128 x, v128 y) {
int32x4x2_t r = vzipq_s32(vreinterpretq_s32_s64(y), vreinterpretq_s32_s64(x));
return vreinterpretq_s64_s32(r.val[1]);
}
SIMD_INLINE v128 v128_zip_32(v64 x, v64 y) {
uint32x2x2_t r = vzip_u32(vreinterpret_u32_s64(y), vreinterpret_u32_s64(x));
return vreinterpretq_s64_u32(vcombine_u32(r.val[0], r.val[1]));
}
SIMD_INLINE v128 v128_ziplo_64(v128 a, v128 b) {
return v128_from_v64(vget_low_u64((uint64x2_t)a),
vget_low_u64((uint64x2_t)b));
}
SIMD_INLINE v128 v128_ziphi_64(v128 a, v128 b) {
return v128_from_v64(vget_high_u64((uint64x2_t)a),
vget_high_u64((uint64x2_t)b));
}
SIMD_INLINE v128 v128_unziplo_8(v128 x, v128 y) {
uint8x16x2_t r = vuzpq_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x));
return vreinterpretq_s64_u8(r.val[0]);
}
SIMD_INLINE v128 v128_unziphi_8(v128 x, v128 y) {
uint8x16x2_t r = vuzpq_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x));
return vreinterpretq_s64_u8(r.val[1]);
}
SIMD_INLINE v128 v128_unziplo_16(v128 x, v128 y) {
uint16x8x2_t r =
vuzpq_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x));
return vreinterpretq_s64_u16(r.val[0]);
}
SIMD_INLINE v128 v128_unziphi_16(v128 x, v128 y) {
uint16x8x2_t r =
vuzpq_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x));
return vreinterpretq_s64_u16(r.val[1]);
}
SIMD_INLINE v128 v128_unziplo_32(v128 x, v128 y) {
uint32x4x2_t r =
vuzpq_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x));
return vreinterpretq_s64_u32(r.val[0]);
}
SIMD_INLINE v128 v128_unziphi_32(v128 x, v128 y) {
uint32x4x2_t r =
vuzpq_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x));
return vreinterpretq_s64_u32(r.val[1]);
}
SIMD_INLINE v128 v128_unpack_u8_s16(v64 a) {
return vreinterpretq_s64_u16(vmovl_u8(vreinterpret_u8_s64(a)));
}
SIMD_INLINE v128 v128_unpacklo_u8_s16(v128 a) {
return vreinterpretq_s64_u16(vmovl_u8(vreinterpret_u8_s64(vget_low_s64(a))));
}
SIMD_INLINE v128 v128_unpackhi_u8_s16(v128 a) {
return vreinterpretq_s64_u16(vmovl_u8(vreinterpret_u8_s64(vget_high_s64(a))));
}
SIMD_INLINE v128 v128_pack_s32_s16(v128 a, v128 b) {
return v128_from_v64(
vreinterpret_s64_s16(vqmovn_s32(vreinterpretq_s32_s64(a))),
vreinterpret_s64_s16(vqmovn_s32(vreinterpretq_s32_s64(b))));
}
SIMD_INLINE v128 v128_pack_s16_u8(v128 a, v128 b) {
return v128_from_v64(
vreinterpret_s64_u8(vqmovun_s16(vreinterpretq_s16_s64(a))),
vreinterpret_s64_u8(vqmovun_s16(vreinterpretq_s16_s64(b))));
}
SIMD_INLINE v128 v128_pack_s16_s8(v128 a, v128 b) {
return v128_from_v64(
vreinterpret_s64_s8(vqmovn_s16(vreinterpretq_s16_s64(a))),
vreinterpret_s64_s8(vqmovn_s16(vreinterpretq_s16_s64(b))));
}
SIMD_INLINE v128 v128_unpack_u16_s32(v64 a) {
return vreinterpretq_s64_u32(vmovl_u16(vreinterpret_u16_s64(a)));
}
SIMD_INLINE v128 v128_unpack_s16_s32(v64 a) {
return vreinterpretq_s64_s32(vmovl_s16(vreinterpret_s16_s64(a)));
}
SIMD_INLINE v128 v128_unpacklo_u16_s32(v128 a) {
return vreinterpretq_s64_u32(
vmovl_u16(vreinterpret_u16_s64(vget_low_s64(a))));
}
SIMD_INLINE v128 v128_unpacklo_s16_s32(v128 a) {
return vreinterpretq_s64_s32(
vmovl_s16(vreinterpret_s16_s64(vget_low_s64(a))));
}
SIMD_INLINE v128 v128_unpackhi_u16_s32(v128 a) {
return vreinterpretq_s64_u32(
vmovl_u16(vreinterpret_u16_s64(vget_high_s64(a))));
}
SIMD_INLINE v128 v128_unpackhi_s16_s32(v128 a) {
return vreinterpretq_s64_s32(
vmovl_s16(vreinterpret_s16_s64(vget_high_s64(a))));
}
SIMD_INLINE v128 v128_shuffle_8(v128 x, v128 pattern) {
return v128_from_64(
(uint64_t)vreinterpret_s64_u8(
vtbl2_u8((uint8x8x2_t){ { vget_low_u8(vreinterpretq_u8_s64(x)),
vget_high_u8(vreinterpretq_u8_s64(x)) } },
vreinterpret_u8_s64(vget_high_s64(pattern)))),
(uint64_t)vreinterpret_s64_u8(
vtbl2_u8((uint8x8x2_t){ { vget_low_u8(vreinterpretq_u8_s64(x)),
vget_high_u8(vreinterpretq_u8_s64(x)) } },
vreinterpret_u8_s64(vget_low_s64(pattern)))));
}
SIMD_INLINE v128 v128_cmpgt_s8(v128 x, v128 y) {
return vreinterpretq_s64_u8(
vcgtq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y)));
}
SIMD_INLINE v128 v128_cmplt_s8(v128 x, v128 y) {
return vreinterpretq_s64_u8(
vcltq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y)));
}
SIMD_INLINE v128 v128_cmpeq_8(v128 x, v128 y) {
return vreinterpretq_s64_u8(
vceqq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
}
SIMD_INLINE v128 v128_cmpgt_s16(v128 x, v128 y) {
return vreinterpretq_s64_u16(
vcgtq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y)));
}
SIMD_INLINE v128 v128_cmplt_s16(v128 x, v128 y) {
return vreinterpretq_s64_u16(
vcltq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y)));
}
SIMD_INLINE v128 v128_cmpeq_16(v128 x, v128 y) {
return vreinterpretq_s64_u16(
vceqq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y)));
}
SIMD_INLINE v128 v128_shl_8(v128 a, unsigned int c) {
return (c > 7) ? v128_zero() : vreinterpretq_s64_u8(vshlq_u8(
vreinterpretq_u8_s64(a), vdupq_n_s8(c)));
}
SIMD_INLINE v128 v128_shr_u8(v128 a, unsigned int c) {
return (c > 7) ? v128_zero() : vreinterpretq_s64_u8(vshlq_u8(
vreinterpretq_u8_s64(a), vdupq_n_s8(-c)));
}
SIMD_INLINE v128 v128_shr_s8(v128 a, unsigned int c) {
return (c > 7) ? v128_ones() : vreinterpretq_s64_s8(vshlq_s8(
vreinterpretq_s8_s64(a), vdupq_n_s8(-c)));
}
SIMD_INLINE v128 v128_shl_16(v128 a, unsigned int c) {
return (c > 15) ? v128_zero()
: vreinterpretq_s64_u16(
vshlq_u16(vreinterpretq_u16_s64(a), vdupq_n_s16(c)));
}
SIMD_INLINE v128 v128_shr_u16(v128 a, unsigned int c) {
return (c > 15) ? v128_zero()
: vreinterpretq_s64_u16(
vshlq_u16(vreinterpretq_u16_s64(a), vdupq_n_s16(-c)));
}
SIMD_INLINE v128 v128_shr_s16(v128 a, unsigned int c) {
return (c > 15) ? v128_ones()
: vreinterpretq_s64_s16(
vshlq_s16(vreinterpretq_s16_s64(a), vdupq_n_s16(-c)));
}
SIMD_INLINE v128 v128_shl_32(v128 a, unsigned int c) {
return (c > 31) ? v128_zero()
: vreinterpretq_s64_u32(
vshlq_u32(vreinterpretq_u32_s64(a), vdupq_n_s32(c)));
}
SIMD_INLINE v128 v128_shr_u32(v128 a, unsigned int c) {
return (c > 31) ? v128_zero()
: vreinterpretq_s64_u32(
vshlq_u32(vreinterpretq_u32_s64(a), vdupq_n_s32(-c)));
}
SIMD_INLINE v128 v128_shr_s32(v128 a, unsigned int c) {
return (c > 31) ? v128_ones()
: vreinterpretq_s64_s32(
vshlq_s32(vreinterpretq_s32_s64(a), vdupq_n_s32(-c)));
}
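/* The functions below rely on immediate-operand intrinsics (vshl_n_u64 and
   friends), which are only usable here when the compiler can fold the shift
   count to a constant while optimising; the guard conservatively excludes
   clang and otherwise falls back to the generic v64 helpers further down. */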
#if __OPTIMIZE__ && !__clang__
SIMD_INLINE v128 v128_shl_n_byte(v128 a, const unsigned int n) {
return n < 8
? v128_from_64(
(uint64_t)vorr_u64(
vshl_n_u64(vreinterpret_u64_s64(vget_high_s64(a)),
n * 8),
vshr_n_u64(vreinterpret_u64_s64(vget_low_s64(a)),
(8 - n) * 8)),
(uint64_t)vshl_n_u64(vreinterpret_u64_s64(vget_low_s64(a)),
n * 8))
: (n == 8 ? v128_from_64(
(uint64_t)vreinterpret_u64_s64(vget_low_s64(a)), 0)
: v128_from_64((uint64_t)vshl_n_u64(
vreinterpret_u64_s64(vget_low_s64(a)),
(n - 8) * 8),
0));
}
SIMD_INLINE v128 v128_shr_n_byte(v128 a, const unsigned int n) {
return n < 8
? v128_from_64(
vshr_n_u64(vreinterpret_u64_s64(vget_high_s64(a)), n * 8),
vorr_u64(
vshr_n_u64(vreinterpret_u64_s64(vget_low_s64(a)), n * 8),
vshl_n_u64(vreinterpret_u64_s64(vget_high_s64(a)),
(8 - n) * 8)))
: (n == 8
? v128_from_64(0, vreinterpret_u64_s64(vget_high_s64(a)))
: v128_from_64(
0, vshr_n_u64(vreinterpret_u64_s64(vget_high_s64(a)),
(n - 8) * 8)));
}
SIMD_INLINE v128 v128_shl_n_8(v128 a, const unsigned int c) {
return vreinterpretq_s64_u8(vshlq_n_u8(vreinterpretq_u8_s64(a), c));
}
SIMD_INLINE v128 v128_shr_n_u8(v128 a, const unsigned int c) {
return vreinterpretq_s64_u8(vshrq_n_u8(vreinterpretq_u8_s64(a), c));
}
SIMD_INLINE v128 v128_shr_n_s8(v128 a, const unsigned int c) {
return vreinterpretq_s64_s8(vshrq_n_s8(vreinterpretq_s8_s64(a), c));
}
SIMD_INLINE v128 v128_shl_n_16(v128 a, const unsigned int c) {
return vreinterpretq_s64_u16(vshlq_n_u16(vreinterpretq_u16_s64(a), c));
}
SIMD_INLINE v128 v128_shr_n_u16(v128 a, const unsigned int c) {
return vreinterpretq_s64_u16(vshrq_n_u16(vreinterpretq_u16_s64(a), c));
}
SIMD_INLINE v128 v128_shr_n_s16(v128 a, const unsigned int c) {
return vreinterpretq_s64_s16(vshrq_n_s16(vreinterpretq_s16_s64(a), c));
}
SIMD_INLINE v128 v128_shl_n_32(v128 a, const unsigned int c) {
return vreinterpretq_s64_u32(vshlq_n_u32(vreinterpretq_u32_s64(a), c));
}
SIMD_INLINE v128 v128_shr_n_u32(v128 a, const unsigned int c) {
return vreinterpretq_s64_u32(vshrq_n_u32(vreinterpretq_u32_s64(a), c));
}
SIMD_INLINE v128 v128_shr_n_s32(v128 a, const unsigned int c) {
return vreinterpretq_s64_s32(vshrq_n_s32(vreinterpretq_s32_s64(a), c));
}
#else
SIMD_INLINE v128 v128_shl_n_byte(v128 a, const unsigned int n) {
if (n < 8)
return v128_from_v64(v64_or(v64_shl_n_byte(v128_high_v64(a), n),
v64_shr_n_byte(v128_low_v64(a), 8 - n)),
v64_shl_n_byte(v128_low_v64(a), n));
else
return v128_from_v64(v64_shl_n_byte(v128_low_v64(a), n - 8), v64_zero());
}
SIMD_INLINE v128 v128_shr_n_byte(v128 a, const unsigned int n) {
if (n < 8)
return v128_from_v64(v64_shr_n_byte(v128_high_v64(a), n),
v64_or(v64_shr_n_byte(v128_low_v64(a), n),
v64_shl_n_byte(v128_high_v64(a), 8 - n)));
else
return v128_from_v64(v64_zero(), v64_shr_n_byte(v128_high_v64(a), n - 8));
}
SIMD_INLINE v128 v128_shl_n_8(v128 a, const unsigned int c) {
return v128_shl_8(a, c);
}
SIMD_INLINE v128 v128_shr_n_u8(v128 a, const unsigned int c) {
return v128_shr_u8(a, c);
}
SIMD_INLINE v128 v128_shr_n_s8(v128 a, const unsigned int c) {
return v128_shr_s8(a, c);
}
SIMD_INLINE v128 v128_shl_n_16(v128 a, const unsigned int c) {
return v128_shl_16(a, c);
}
SIMD_INLINE v128 v128_shr_n_u16(v128 a, const unsigned int c) {
return v128_shr_u16(a, c);
}
SIMD_INLINE v128 v128_shr_n_s16(v128 a, const unsigned int c) {
return v128_shr_s16(a, c);
}
SIMD_INLINE v128 v128_shl_n_32(v128 a, const unsigned int c) {
return v128_shl_32(a, c);
}
SIMD_INLINE v128 v128_shr_n_u32(v128 a, const unsigned int c) {
return v128_shr_u32(a, c);
}
SIMD_INLINE v128 v128_shr_n_s32(v128 a, const unsigned int c) {
return v128_shr_s32(a, c);
}
#endif
#endif /* _V128_INTRINSICS_H */

View File


@@ -1,684 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifndef _V128_INTRINSICS_C_H
#define _V128_INTRINSICS_C_H
#include <stdio.h>
#include <stdlib.h>
#include "./v64_intrinsics_c.h"
#include "./aom_config.h"
typedef union {
uint8_t u8[16];
uint16_t u16[8];
uint32_t u32[4];
uint64_t u64[2];
int8_t s8[16];
int16_t s16[8];
int32_t s32[4];
int64_t s64[2];
c_v64 v64[2];
} c_v128;
SIMD_INLINE uint32_t c_v128_low_u32(c_v128 a) { return a.u32[0]; }
SIMD_INLINE c_v64 c_v128_low_v64(c_v128 a) { return a.v64[0]; }
SIMD_INLINE c_v64 c_v128_high_v64(c_v128 a) { return a.v64[1]; }
SIMD_INLINE c_v128 c_v128_from_64(uint64_t hi, uint64_t lo) {
c_v128 t;
t.u64[1] = hi;
t.u64[0] = lo;
return t;
}
SIMD_INLINE c_v128 c_v128_from_v64(c_v64 hi, c_v64 lo) {
c_v128 t;
t.v64[1] = hi;
t.v64[0] = lo;
return t;
}
SIMD_INLINE c_v128 c_v128_from_32(uint32_t a, uint32_t b, uint32_t c,
uint32_t d) {
c_v128 t;
t.u32[3] = a;
t.u32[2] = b;
t.u32[1] = c;
t.u32[0] = d;
return t;
}
SIMD_INLINE c_v128 c_v128_load_unaligned(const void *p) {
c_v128 t;
uint8_t *pp = (uint8_t *)p;
uint8_t *q = (uint8_t *)&t;
int c;
for (c = 0; c < 16; c++) q[c] = pp[c];
return t;
}
SIMD_INLINE c_v128 c_v128_load_aligned(const void *p) {
if (simd_check && (uintptr_t)p & 15) {
fprintf(stderr, "Error: unaligned v128 load at %p\n", p);
abort();
}
return c_v128_load_unaligned(p);
}
SIMD_INLINE void c_v128_store_unaligned(void *p, c_v128 a) {
uint8_t *pp = (uint8_t *)p;
uint8_t *q = (uint8_t *)&a;
int c;
for (c = 0; c < 16; c++) pp[c] = q[c];
}
SIMD_INLINE void c_v128_store_aligned(void *p, c_v128 a) {
if (simd_check && (uintptr_t)p & 15) {
fprintf(stderr, "Error: unaligned v128 store at %p\n", p);
abort();
}
c_v128_store_unaligned(p, a);
}
SIMD_INLINE c_v128 c_v128_zero() {
c_v128 t;
t.u64[1] = t.u64[0] = 0;
return t;
}
SIMD_INLINE c_v128 c_v128_dup_8(uint8_t x) {
c_v128 t;
t.v64[1] = t.v64[0] = c_v64_dup_8(x);
return t;
}
SIMD_INLINE c_v128 c_v128_dup_16(uint16_t x) {
c_v128 t;
t.v64[1] = t.v64[0] = c_v64_dup_16(x);
return t;
}
SIMD_INLINE c_v128 c_v128_dup_32(uint32_t x) {
c_v128 t;
t.v64[1] = t.v64[0] = c_v64_dup_32(x);
return t;
}
SIMD_INLINE int64_t c_v128_dotp_s16(c_v128 a, c_v128 b) {
return c_v64_dotp_s16(a.v64[1], b.v64[1]) +
c_v64_dotp_s16(a.v64[0], b.v64[0]);
}
SIMD_INLINE uint64_t c_v128_hadd_u8(c_v128 a) {
return c_v64_hadd_u8(a.v64[1]) + c_v64_hadd_u8(a.v64[0]);
}
typedef uint32_t c_sad128_internal;
SIMD_INLINE c_sad128_internal c_v128_sad_u8_init() { return 0; }
/* Implementation dependent return value. Result must be finalised with
v128_sad_u8_sum().
The result for more than 32 v128_sad_u8() calls is undefined. */
SIMD_INLINE c_sad128_internal c_v128_sad_u8(c_sad128_internal s, c_v128 a,
c_v128 b) {
int c;
for (c = 0; c < 16; c++)
s += a.u8[c] > b.u8[c] ? a.u8[c] - b.u8[c] : b.u8[c] - a.u8[c];
return s;
}
SIMD_INLINE uint32_t c_v128_sad_u8_sum(c_sad128_internal s) { return s; }
typedef uint32_t c_ssd128_internal;
SIMD_INLINE c_ssd128_internal c_v128_ssd_u8_init() { return 0; }
/* Implementation dependent return value. Result must be finalised with
* v128_ssd_u8_sum(). */
SIMD_INLINE c_ssd128_internal c_v128_ssd_u8(c_ssd128_internal s, c_v128 a,
c_v128 b) {
int c;
for (c = 0; c < 16; c++) s += (a.u8[c] - b.u8[c]) * (a.u8[c] - b.u8[c]);
return s;
}
SIMD_INLINE uint32_t c_v128_ssd_u8_sum(c_ssd128_internal s) { return s; }
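/* Usage sketch (illustrative only, not part of the original header): the SAD
   and SSD accumulators above follow the same init/accumulate/finalise
   protocol. A hypothetical 16x16 block SAD built on these helpers could look
   like this: */
SIMD_INLINE uint32_t c_v128_example_sad_16x16(const uint8_t *src,
                                              int src_stride,
                                              const uint8_t *ref,
                                              int ref_stride) {
  c_sad128_internal s = c_v128_sad_u8_init();
  int r;
  /* 16 rows of 16 bytes stays well inside the documented 32-call limit. */
  for (r = 0; r < 16; r++)
    s = c_v128_sad_u8(s, c_v128_load_unaligned(src + r * src_stride),
                      c_v128_load_unaligned(ref + r * ref_stride));
  return c_v128_sad_u8_sum(s);
}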
SIMD_INLINE c_v128 c_v128_or(c_v128 a, c_v128 b) {
return c_v128_from_v64(c_v64_or(a.v64[1], b.v64[1]),
c_v64_or(a.v64[0], b.v64[0]));
}
SIMD_INLINE c_v128 c_v128_xor(c_v128 a, c_v128 b) {
return c_v128_from_v64(c_v64_xor(a.v64[1], b.v64[1]),
c_v64_xor(a.v64[0], b.v64[0]));
}
SIMD_INLINE c_v128 c_v128_and(c_v128 a, c_v128 b) {
return c_v128_from_v64(c_v64_and(a.v64[1], b.v64[1]),
c_v64_and(a.v64[0], b.v64[0]));
}
SIMD_INLINE c_v128 c_v128_andn(c_v128 a, c_v128 b) {
return c_v128_from_v64(c_v64_andn(a.v64[1], b.v64[1]),
c_v64_andn(a.v64[0], b.v64[0]));
}
SIMD_INLINE c_v128 c_v128_add_8(c_v128 a, c_v128 b) {
return c_v128_from_v64(c_v64_add_8(a.v64[1], b.v64[1]),
c_v64_add_8(a.v64[0], b.v64[0]));
}
SIMD_INLINE c_v128 c_v128_add_16(c_v128 a, c_v128 b) {
return c_v128_from_v64(c_v64_add_16(a.v64[1], b.v64[1]),
c_v64_add_16(a.v64[0], b.v64[0]));
}
SIMD_INLINE c_v128 c_v128_sadd_s16(c_v128 a, c_v128 b) {
return c_v128_from_v64(c_v64_sadd_s16(a.v64[1], b.v64[1]),
c_v64_sadd_s16(a.v64[0], b.v64[0]));
}
SIMD_INLINE c_v128 c_v128_add_32(c_v128 a, c_v128 b) {
return c_v128_from_v64(c_v64_add_32(a.v64[1], b.v64[1]),
c_v64_add_32(a.v64[0], b.v64[0]));
}
SIMD_INLINE c_v128 c_v128_padd_s16(c_v128 a) {
c_v128 t;
t.s32[0] = (int32_t)a.s16[0] + (int32_t)a.s16[1];
t.s32[1] = (int32_t)a.s16[2] + (int32_t)a.s16[3];
t.s32[2] = (int32_t)a.s16[4] + (int32_t)a.s16[5];
t.s32[3] = (int32_t)a.s16[6] + (int32_t)a.s16[7];
return t;
}
SIMD_INLINE c_v128 c_v128_sub_8(c_v128 a, c_v128 b) {
return c_v128_from_v64(c_v64_sub_8(a.v64[1], b.v64[1]),
c_v64_sub_8(a.v64[0], b.v64[0]));
}
SIMD_INLINE c_v128 c_v128_ssub_u8(c_v128 a, c_v128 b) {
return c_v128_from_v64(c_v64_ssub_u8(a.v64[1], b.v64[1]),
c_v64_ssub_u8(a.v64[0], b.v64[0]));
}
SIMD_INLINE c_v128 c_v128_ssub_s8(c_v128 a, c_v128 b) {
return c_v128_from_v64(c_v64_ssub_s8(a.v64[1], b.v64[1]),
c_v64_ssub_s8(a.v64[0], b.v64[0]));
}
SIMD_INLINE c_v128 c_v128_sub_16(c_v128 a, c_v128 b) {
return c_v128_from_v64(c_v64_sub_16(a.v64[1], b.v64[1]),
c_v64_sub_16(a.v64[0], b.v64[0]));
}
SIMD_INLINE c_v128 c_v128_ssub_s16(c_v128 a, c_v128 b) {
return c_v128_from_v64(c_v64_ssub_s16(a.v64[1], b.v64[1]),
c_v64_ssub_s16(a.v64[0], b.v64[0]));
}
SIMD_INLINE c_v128 c_v128_sub_32(c_v128 a, c_v128 b) {
return c_v128_from_v64(c_v64_sub_32(a.v64[1], b.v64[1]),
c_v64_sub_32(a.v64[0], b.v64[0]));
}
SIMD_INLINE c_v128 c_v128_abs_s16(c_v128 a) {
return c_v128_from_v64(c_v64_abs_s16(a.v64[1]), c_v64_abs_s16(a.v64[0]));
}
SIMD_INLINE c_v128 c_v128_mul_s16(c_v64 a, c_v64 b) {
c_v64 lo_bits = c_v64_mullo_s16(a, b);
c_v64 hi_bits = c_v64_mulhi_s16(a, b);
return c_v128_from_v64(c_v64_ziphi_16(hi_bits, lo_bits),
c_v64_ziplo_16(hi_bits, lo_bits));
}
SIMD_INLINE c_v128 c_v128_mullo_s16(c_v128 a, c_v128 b) {
return c_v128_from_v64(c_v64_mullo_s16(a.v64[1], b.v64[1]),
c_v64_mullo_s16(a.v64[0], b.v64[0]));
}
SIMD_INLINE c_v128 c_v128_mulhi_s16(c_v128 a, c_v128 b) {
return c_v128_from_v64(c_v64_mulhi_s16(a.v64[1], b.v64[1]),
c_v64_mulhi_s16(a.v64[0], b.v64[0]));
}
SIMD_INLINE c_v128 c_v128_mullo_s32(c_v128 a, c_v128 b) {
return c_v128_from_v64(c_v64_mullo_s32(a.v64[1], b.v64[1]),
c_v64_mullo_s32(a.v64[0], b.v64[0]));
}
SIMD_INLINE c_v128 c_v128_madd_s16(c_v128 a, c_v128 b) {
return c_v128_from_v64(c_v64_madd_s16(a.v64[1], b.v64[1]),
c_v64_madd_s16(a.v64[0], b.v64[0]));
}
SIMD_INLINE c_v128 c_v128_madd_us8(c_v128 a, c_v128 b) {
return c_v128_from_v64(c_v64_madd_us8(a.v64[1], b.v64[1]),
c_v64_madd_us8(a.v64[0], b.v64[0]));
}
SIMD_INLINE c_v128 c_v128_avg_u8(c_v128 a, c_v128 b) {
return c_v128_from_v64(c_v64_avg_u8(a.v64[1], b.v64[1]),
c_v64_avg_u8(a.v64[0], b.v64[0]));
}
SIMD_INLINE c_v128 c_v128_rdavg_u8(c_v128 a, c_v128 b) {
return c_v128_from_v64(c_v64_rdavg_u8(a.v64[1], b.v64[1]),
c_v64_rdavg_u8(a.v64[0], b.v64[0]));
}
SIMD_INLINE c_v128 c_v128_avg_u16(c_v128 a, c_v128 b) {
return c_v128_from_v64(c_v64_avg_u16(a.v64[1], b.v64[1]),
c_v64_avg_u16(a.v64[0], b.v64[0]));
}
SIMD_INLINE c_v128 c_v128_min_u8(c_v128 a, c_v128 b) {
return c_v128_from_v64(c_v64_min_u8(a.v64[1], b.v64[1]),
c_v64_min_u8(a.v64[0], b.v64[0]));
}
SIMD_INLINE c_v128 c_v128_max_u8(c_v128 a, c_v128 b) {
return c_v128_from_v64(c_v64_max_u8(a.v64[1], b.v64[1]),
c_v64_max_u8(a.v64[0], b.v64[0]));
}
SIMD_INLINE c_v128 c_v128_min_s8(c_v128 a, c_v128 b) {
return c_v128_from_v64(c_v64_min_s8(a.v64[1], b.v64[1]),
c_v64_min_s8(a.v64[0], b.v64[0]));
}
SIMD_INLINE c_v128 c_v128_max_s8(c_v128 a, c_v128 b) {
return c_v128_from_v64(c_v64_max_s8(a.v64[1], b.v64[1]),
c_v64_max_s8(a.v64[0], b.v64[0]));
}
SIMD_INLINE c_v128 c_v128_min_s16(c_v128 a, c_v128 b) {
return c_v128_from_v64(c_v64_min_s16(a.v64[1], b.v64[1]),
c_v64_min_s16(a.v64[0], b.v64[0]));
}
SIMD_INLINE c_v128 c_v128_max_s16(c_v128 a, c_v128 b) {
return c_v128_from_v64(c_v64_max_s16(a.v64[1], b.v64[1]),
c_v64_max_s16(a.v64[0], b.v64[0]));
}
SIMD_INLINE c_v128 c_v128_ziplo_8(c_v128 a, c_v128 b) {
return c_v128_from_v64(c_v64_ziphi_8(a.v64[0], b.v64[0]),
c_v64_ziplo_8(a.v64[0], b.v64[0]));
}
SIMD_INLINE c_v128 c_v128_ziphi_8(c_v128 a, c_v128 b) {
return c_v128_from_v64(c_v64_ziphi_8(a.v64[1], b.v64[1]),
c_v64_ziplo_8(a.v64[1], b.v64[1]));
}
SIMD_INLINE c_v128 c_v128_ziplo_16(c_v128 a, c_v128 b) {
return c_v128_from_v64(c_v64_ziphi_16(a.v64[0], b.v64[0]),
c_v64_ziplo_16(a.v64[0], b.v64[0]));
}
SIMD_INLINE c_v128 c_v128_ziphi_16(c_v128 a, c_v128 b) {
return c_v128_from_v64(c_v64_ziphi_16(a.v64[1], b.v64[1]),
c_v64_ziplo_16(a.v64[1], b.v64[1]));
}
SIMD_INLINE c_v128 c_v128_ziplo_32(c_v128 a, c_v128 b) {
return c_v128_from_v64(c_v64_ziphi_32(a.v64[0], b.v64[0]),
c_v64_ziplo_32(a.v64[0], b.v64[0]));
}
SIMD_INLINE c_v128 c_v128_ziphi_32(c_v128 a, c_v128 b) {
return c_v128_from_v64(c_v64_ziphi_32(a.v64[1], b.v64[1]),
c_v64_ziplo_32(a.v64[1], b.v64[1]));
}
SIMD_INLINE c_v128 c_v128_ziplo_64(c_v128 a, c_v128 b) {
return c_v128_from_v64(a.v64[0], b.v64[0]);
}
SIMD_INLINE c_v128 c_v128_ziphi_64(c_v128 a, c_v128 b) {
return c_v128_from_v64(a.v64[1], b.v64[1]);
}
SIMD_INLINE c_v128 c_v128_zip_8(c_v64 a, c_v64 b) {
return c_v128_from_v64(c_v64_ziphi_8(a, b), c_v64_ziplo_8(a, b));
}
SIMD_INLINE c_v128 c_v128_zip_16(c_v64 a, c_v64 b) {
return c_v128_from_v64(c_v64_ziphi_16(a, b), c_v64_ziplo_16(a, b));
}
SIMD_INLINE c_v128 c_v128_zip_32(c_v64 a, c_v64 b) {
return c_v128_from_v64(c_v64_ziphi_32(a, b), c_v64_ziplo_32(a, b));
}
SIMD_INLINE c_v128 _c_v128_unzip_8(c_v128 a, c_v128 b, int mode) {
c_v128 t;
if (mode) {
t.u8[15] = b.u8[15];
t.u8[14] = b.u8[13];
t.u8[13] = b.u8[11];
t.u8[12] = b.u8[9];
t.u8[11] = b.u8[7];
t.u8[10] = b.u8[5];
t.u8[9] = b.u8[3];
t.u8[8] = b.u8[1];
t.u8[7] = a.u8[15];
t.u8[6] = a.u8[13];
t.u8[5] = a.u8[11];
t.u8[4] = a.u8[9];
t.u8[3] = a.u8[7];
t.u8[2] = a.u8[5];
t.u8[1] = a.u8[3];
t.u8[0] = a.u8[1];
} else {
t.u8[15] = a.u8[14];
t.u8[14] = a.u8[12];
t.u8[13] = a.u8[10];
t.u8[12] = a.u8[8];
t.u8[11] = a.u8[6];
t.u8[10] = a.u8[4];
t.u8[9] = a.u8[2];
t.u8[8] = a.u8[0];
t.u8[7] = b.u8[14];
t.u8[6] = b.u8[12];
t.u8[5] = b.u8[10];
t.u8[4] = b.u8[8];
t.u8[3] = b.u8[6];
t.u8[2] = b.u8[4];
t.u8[1] = b.u8[2];
t.u8[0] = b.u8[0];
}
return t;
}
SIMD_INLINE c_v128 c_v128_unziplo_8(c_v128 a, c_v128 b) {
return CONFIG_BIG_ENDIAN ? _c_v128_unzip_8(a, b, 1)
: _c_v128_unzip_8(a, b, 0);
}
SIMD_INLINE c_v128 c_v128_unziphi_8(c_v128 a, c_v128 b) {
return CONFIG_BIG_ENDIAN ? _c_v128_unzip_8(b, a, 0)
: _c_v128_unzip_8(b, a, 1);
}
SIMD_INLINE c_v128 _c_v128_unzip_16(c_v128 a, c_v128 b, int mode) {
c_v128 t;
if (mode) {
t.u16[7] = b.u16[7];
t.u16[6] = b.u16[5];
t.u16[5] = b.u16[3];
t.u16[4] = b.u16[1];
t.u16[3] = a.u16[7];
t.u16[2] = a.u16[5];
t.u16[1] = a.u16[3];
t.u16[0] = a.u16[1];
} else {
t.u16[7] = a.u16[6];
t.u16[6] = a.u16[4];
t.u16[5] = a.u16[2];
t.u16[4] = a.u16[0];
t.u16[3] = b.u16[6];
t.u16[2] = b.u16[4];
t.u16[1] = b.u16[2];
t.u16[0] = b.u16[0];
}
return t;
}
SIMD_INLINE c_v128 c_v128_unziplo_16(c_v128 a, c_v128 b) {
return CONFIG_BIG_ENDIAN ? _c_v128_unzip_16(a, b, 1)
: _c_v128_unzip_16(a, b, 0);
}
SIMD_INLINE c_v128 c_v128_unziphi_16(c_v128 a, c_v128 b) {
return CONFIG_BIG_ENDIAN ? _c_v128_unzip_16(b, a, 0)
: _c_v128_unzip_16(b, a, 1);
}
SIMD_INLINE c_v128 _c_v128_unzip_32(c_v128 a, c_v128 b, int mode) {
c_v128 t;
if (mode) {
t.u32[3] = b.u32[3];
t.u32[2] = b.u32[1];
t.u32[1] = a.u32[3];
t.u32[0] = a.u32[1];
} else {
t.u32[3] = a.u32[2];
t.u32[2] = a.u32[0];
t.u32[1] = b.u32[2];
t.u32[0] = b.u32[0];
}
return t;
}
SIMD_INLINE c_v128 c_v128_unziplo_32(c_v128 a, c_v128 b) {
return CONFIG_BIG_ENDIAN ? _c_v128_unzip_32(a, b, 1)
: _c_v128_unzip_32(a, b, 0);
}
SIMD_INLINE c_v128 c_v128_unziphi_32(c_v128 a, c_v128 b) {
return CONFIG_BIG_ENDIAN ? _c_v128_unzip_32(b, a, 0)
: _c_v128_unzip_32(b, a, 1);
}
SIMD_INLINE c_v128 c_v128_unpack_u8_s16(c_v64 a) {
return c_v128_from_v64(c_v64_unpackhi_u8_s16(a), c_v64_unpacklo_u8_s16(a));
}
SIMD_INLINE c_v128 c_v128_unpacklo_u8_s16(c_v128 a) {
return c_v128_from_v64(c_v64_unpackhi_u8_s16(a.v64[0]),
c_v64_unpacklo_u8_s16(a.v64[0]));
}
SIMD_INLINE c_v128 c_v128_unpackhi_u8_s16(c_v128 a) {
return c_v128_from_v64(c_v64_unpackhi_u8_s16(a.v64[1]),
c_v64_unpacklo_u8_s16(a.v64[1]));
}
SIMD_INLINE c_v128 c_v128_pack_s32_s16(c_v128 a, c_v128 b) {
return c_v128_from_v64(c_v64_pack_s32_s16(a.v64[1], a.v64[0]),
c_v64_pack_s32_s16(b.v64[1], b.v64[0]));
}
SIMD_INLINE c_v128 c_v128_pack_s16_u8(c_v128 a, c_v128 b) {
return c_v128_from_v64(c_v64_pack_s16_u8(a.v64[1], a.v64[0]),
c_v64_pack_s16_u8(b.v64[1], b.v64[0]));
}
SIMD_INLINE c_v128 c_v128_pack_s16_s8(c_v128 a, c_v128 b) {
return c_v128_from_v64(c_v64_pack_s16_s8(a.v64[1], a.v64[0]),
c_v64_pack_s16_s8(b.v64[1], b.v64[0]));
}
SIMD_INLINE c_v128 c_v128_unpack_u16_s32(c_v64 a) {
return c_v128_from_v64(c_v64_unpackhi_u16_s32(a), c_v64_unpacklo_u16_s32(a));
}
SIMD_INLINE c_v128 c_v128_unpack_s16_s32(c_v64 a) {
return c_v128_from_v64(c_v64_unpackhi_s16_s32(a), c_v64_unpacklo_s16_s32(a));
}
SIMD_INLINE c_v128 c_v128_unpacklo_u16_s32(c_v128 a) {
return c_v128_from_v64(c_v64_unpackhi_u16_s32(a.v64[0]),
c_v64_unpacklo_u16_s32(a.v64[0]));
}
SIMD_INLINE c_v128 c_v128_unpacklo_s16_s32(c_v128 a) {
return c_v128_from_v64(c_v64_unpackhi_s16_s32(a.v64[0]),
c_v64_unpacklo_s16_s32(a.v64[0]));
}
SIMD_INLINE c_v128 c_v128_unpackhi_u16_s32(c_v128 a) {
return c_v128_from_v64(c_v64_unpackhi_u16_s32(a.v64[1]),
c_v64_unpacklo_u16_s32(a.v64[1]));
}
SIMD_INLINE c_v128 c_v128_unpackhi_s16_s32(c_v128 a) {
return c_v128_from_v64(c_v64_unpackhi_s16_s32(a.v64[1]),
c_v64_unpacklo_s16_s32(a.v64[1]));
}
SIMD_INLINE c_v128 c_v128_shuffle_8(c_v128 a, c_v128 pattern) {
c_v128 t;
int c;
for (c = 0; c < 16; c++) {
if (pattern.u8[c] & ~15) {
fprintf(stderr, "Undefined v128_shuffle_8 index %d/%d\n", pattern.u8[c],
c);
abort();
}
t.u8[c] = a.u8[CONFIG_BIG_ENDIAN ? 15 - (pattern.u8[c] & 15)
: pattern.u8[c] & 15];
}
return t;
}
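/* Illustrative sketch (not in the original header): lane c of the result is
   a.u8[pattern.u8[c]], so a descending index pattern reverses the byte
   order. Assumes a little-endian host, where c_v128_from_32(..., 0x0c0d0e0f)
   places index 15 in lane 0. */
SIMD_INLINE c_v128 c_v128_example_byte_reverse(c_v128 a) {
  c_v128 pattern =
      c_v128_from_32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f);
  return c_v128_shuffle_8(a, pattern);
}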
SIMD_INLINE c_v128 c_v128_cmpgt_s8(c_v128 a, c_v128 b) {
return c_v128_from_v64(c_v64_cmpgt_s8(a.v64[1], b.v64[1]),
c_v64_cmpgt_s8(a.v64[0], b.v64[0]));
}
SIMD_INLINE c_v128 c_v128_cmplt_s8(c_v128 a, c_v128 b) {
return c_v128_from_v64(c_v64_cmplt_s8(a.v64[1], b.v64[1]),
c_v64_cmplt_s8(a.v64[0], b.v64[0]));
}
SIMD_INLINE c_v128 c_v128_cmpeq_8(c_v128 a, c_v128 b) {
return c_v128_from_v64(c_v64_cmpeq_8(a.v64[1], b.v64[1]),
c_v64_cmpeq_8(a.v64[0], b.v64[0]));
}
SIMD_INLINE c_v128 c_v128_cmpgt_s16(c_v128 a, c_v128 b) {
return c_v128_from_v64(c_v64_cmpgt_s16(a.v64[1], b.v64[1]),
c_v64_cmpgt_s16(a.v64[0], b.v64[0]));
}
SIMD_INLINE c_v128 c_v128_cmplt_s16(c_v128 a, c_v128 b) {
return c_v128_from_v64(c_v64_cmplt_s16(a.v64[1], b.v64[1]),
c_v64_cmplt_s16(a.v64[0], b.v64[0]));
}
SIMD_INLINE c_v128 c_v128_cmpeq_16(c_v128 a, c_v128 b) {
return c_v128_from_v64(c_v64_cmpeq_16(a.v64[1], b.v64[1]),
c_v64_cmpeq_16(a.v64[0], b.v64[0]));
}
SIMD_INLINE c_v128 c_v128_shl_n_byte(c_v128 a, const unsigned int n) {
if (n < 8)
return c_v128_from_v64(c_v64_or(c_v64_shl_n_byte(a.v64[1], n),
c_v64_shr_n_byte(a.v64[0], 8 - n)),
c_v64_shl_n_byte(a.v64[0], n));
else
return c_v128_from_v64(c_v64_shl_n_byte(a.v64[0], n - 8), c_v64_zero());
}
SIMD_INLINE c_v128 c_v128_shr_n_byte(c_v128 a, const unsigned int n) {
if (n < 8)
return c_v128_from_v64(c_v64_shr_n_byte(a.v64[1], n),
c_v64_or(c_v64_shr_n_byte(a.v64[0], n),
c_v64_shl_n_byte(a.v64[1], 8 - n)));
else
return c_v128_from_v64(c_v64_zero(), c_v64_shr_n_byte(a.v64[1], n - 8));
}
SIMD_INLINE c_v128 c_v128_align(c_v128 a, c_v128 b, const unsigned int c) {
if (simd_check && c > 15) {
fprintf(stderr, "Error: undefined alignment %d\n", c);
abort();
}
return c ? c_v128_or(c_v128_shr_n_byte(b, c), c_v128_shl_n_byte(a, 16 - c))
: b;
}
SIMD_INLINE c_v128 c_v128_shl_8(c_v128 a, const unsigned int c) {
return c_v128_from_v64(c_v64_shl_8(a.v64[1], c), c_v64_shl_8(a.v64[0], c));
}
SIMD_INLINE c_v128 c_v128_shr_u8(c_v128 a, const unsigned int c) {
return c_v128_from_v64(c_v64_shr_u8(a.v64[1], c), c_v64_shr_u8(a.v64[0], c));
}
SIMD_INLINE c_v128 c_v128_shr_s8(c_v128 a, const unsigned int c) {
return c_v128_from_v64(c_v64_shr_s8(a.v64[1], c), c_v64_shr_s8(a.v64[0], c));
}
SIMD_INLINE c_v128 c_v128_shl_16(c_v128 a, const unsigned int c) {
return c_v128_from_v64(c_v64_shl_16(a.v64[1], c), c_v64_shl_16(a.v64[0], c));
}
SIMD_INLINE c_v128 c_v128_shr_u16(c_v128 a, const unsigned int c) {
return c_v128_from_v64(c_v64_shr_u16(a.v64[1], c),
c_v64_shr_u16(a.v64[0], c));
}
SIMD_INLINE c_v128 c_v128_shr_s16(c_v128 a, const unsigned int c) {
return c_v128_from_v64(c_v64_shr_s16(a.v64[1], c),
c_v64_shr_s16(a.v64[0], c));
}
SIMD_INLINE c_v128 c_v128_shl_32(c_v128 a, const unsigned int c) {
return c_v128_from_v64(c_v64_shl_32(a.v64[1], c), c_v64_shl_32(a.v64[0], c));
}
SIMD_INLINE c_v128 c_v128_shr_u32(c_v128 a, const unsigned int c) {
return c_v128_from_v64(c_v64_shr_u32(a.v64[1], c),
c_v64_shr_u32(a.v64[0], c));
}
SIMD_INLINE c_v128 c_v128_shr_s32(c_v128 a, const unsigned int c) {
return c_v128_from_v64(c_v64_shr_s32(a.v64[1], c),
c_v64_shr_s32(a.v64[0], c));
}
SIMD_INLINE c_v128 c_v128_shl_n_8(c_v128 a, const unsigned int n) {
return c_v128_shl_8(a, n);
}
SIMD_INLINE c_v128 c_v128_shl_n_16(c_v128 a, const unsigned int n) {
return c_v128_shl_16(a, n);
}
SIMD_INLINE c_v128 c_v128_shl_n_32(c_v128 a, const unsigned int n) {
return c_v128_shl_32(a, n);
}
SIMD_INLINE c_v128 c_v128_shr_n_u8(c_v128 a, const unsigned int n) {
return c_v128_shr_u8(a, n);
}
SIMD_INLINE c_v128 c_v128_shr_n_u16(c_v128 a, const unsigned int n) {
return c_v128_shr_u16(a, n);
}
SIMD_INLINE c_v128 c_v128_shr_n_u32(c_v128 a, const unsigned int n) {
return c_v128_shr_u32(a, n);
}
SIMD_INLINE c_v128 c_v128_shr_n_s8(c_v128 a, const unsigned int n) {
return c_v128_shr_s8(a, n);
}
SIMD_INLINE c_v128 c_v128_shr_n_s16(c_v128 a, const unsigned int n) {
return c_v128_shr_s16(a, n);
}
SIMD_INLINE c_v128 c_v128_shr_n_s32(c_v128 a, const unsigned int n) {
return c_v128_shr_s32(a, n);
}
#endif /* _V128_INTRINSICS_C_H */


@@ -1,488 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifndef _V128_INTRINSICS_H
#define _V128_INTRINSICS_H
#include "./v64_intrinsics_x86.h"
typedef __m128i v128;
SIMD_INLINE uint32_t v128_low_u32(v128 a) {
return (uint32_t)_mm_cvtsi128_si32(a);
}
SIMD_INLINE v64 v128_low_v64(v128 a) {
return _mm_unpacklo_epi64(a, v64_zero());
}
SIMD_INLINE v64 v128_high_v64(v128 a) { return _mm_srli_si128(a, 8); }
SIMD_INLINE v128 v128_from_v64(v64 a, v64 b) {
return _mm_unpacklo_epi64(b, a);
}
SIMD_INLINE v128 v128_from_64(uint64_t a, uint64_t b) {
return v128_from_v64(v64_from_64(a), v64_from_64(b));
}
SIMD_INLINE v128 v128_from_32(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
return _mm_set_epi32(a, b, c, d);
}
SIMD_INLINE v128 v128_load_aligned(const void *p) {
return _mm_load_si128((__m128i *)p);
}
SIMD_INLINE v128 v128_load_unaligned(const void *p) {
#if defined(__SSSE3__)
return (__m128i)_mm_lddqu_si128((__m128i *)p);
#else
return _mm_loadu_si128((__m128i *)p);
#endif
}
SIMD_INLINE void v128_store_aligned(void *p, v128 a) {
_mm_store_si128((__m128i *)p, a);
}
SIMD_INLINE void v128_store_unaligned(void *p, v128 a) {
_mm_storeu_si128((__m128i *)p, a);
}
// The following function requires an immediate.
// Some compilers will check this during optimisation, others won't.
#if __OPTIMIZE__ && !__clang__
#if defined(__SSSE3__)
SIMD_INLINE v128 v128_align(v128 a, v128 b, const unsigned int c) {
return c ? _mm_alignr_epi8(a, b, c) : b;
}
#else
#define v128_align(a, b, c) \
((c) ? _mm_or_si128(_mm_srli_si128(b, c), _mm_slli_si128(a, 16 - (c))) : (b))
#endif
#else
#if defined(__SSSE3__)
#define v128_align(a, b, c) ((c) ? _mm_alignr_epi8(a, b, c) : (b))
#else
#define v128_align(a, b, c) \
((c) ? _mm_or_si128(_mm_srli_si128(b, c), _mm_slli_si128(a, 16 - (c))) : (b))
#endif
#endif
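/* Usage note (illustrative, not part of the original header): the alignment
   amount must be a compile-time constant, since _mm_alignr_epi8 encodes it
   as an immediate; a variable count may fail to compile on some toolchains. */
SIMD_INLINE v128 v128_example_align_by_4(v128 a, v128 b) {
  return v128_align(a, b, 4); /* OK: literal constant folds to an immediate */
}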
SIMD_INLINE v128 v128_zero() { return _mm_setzero_si128(); }
SIMD_INLINE v128 v128_dup_8(uint8_t x) { return _mm_set1_epi8(x); }
SIMD_INLINE v128 v128_dup_16(uint16_t x) { return _mm_set1_epi16(x); }
SIMD_INLINE v128 v128_dup_32(uint32_t x) { return _mm_set1_epi32(x); }
SIMD_INLINE v128 v128_add_8(v128 a, v128 b) { return _mm_add_epi8(a, b); }
SIMD_INLINE v128 v128_add_16(v128 a, v128 b) { return _mm_add_epi16(a, b); }
SIMD_INLINE v128 v128_sadd_s16(v128 a, v128 b) { return _mm_adds_epi16(a, b); }
SIMD_INLINE v128 v128_add_32(v128 a, v128 b) { return _mm_add_epi32(a, b); }
SIMD_INLINE v128 v128_padd_s16(v128 a) {
return _mm_madd_epi16(a, _mm_set1_epi16(1));
}
SIMD_INLINE v128 v128_sub_8(v128 a, v128 b) { return _mm_sub_epi8(a, b); }
SIMD_INLINE v128 v128_ssub_u8(v128 a, v128 b) { return _mm_subs_epu8(a, b); }
SIMD_INLINE v128 v128_ssub_s8(v128 a, v128 b) { return _mm_subs_epi8(a, b); }
SIMD_INLINE v128 v128_sub_16(v128 a, v128 b) { return _mm_sub_epi16(a, b); }
SIMD_INLINE v128 v128_ssub_s16(v128 a, v128 b) { return _mm_subs_epi16(a, b); }
SIMD_INLINE v128 v128_sub_32(v128 a, v128 b) { return _mm_sub_epi32(a, b); }
SIMD_INLINE v128 v128_abs_s16(v128 a) {
#if defined(__SSSE3__)
return _mm_abs_epi16(a);
#else
return _mm_max_epi16(a, _mm_sub_epi16(_mm_setzero_si128(), a));
#endif
}
SIMD_INLINE v128 v128_ziplo_8(v128 a, v128 b) {
return _mm_unpacklo_epi8(b, a);
}
SIMD_INLINE v128 v128_ziphi_8(v128 a, v128 b) {
return _mm_unpackhi_epi8(b, a);
}
SIMD_INLINE v128 v128_ziplo_16(v128 a, v128 b) {
return _mm_unpacklo_epi16(b, a);
}
SIMD_INLINE v128 v128_ziphi_16(v128 a, v128 b) {
return _mm_unpackhi_epi16(b, a);
}
SIMD_INLINE v128 v128_ziplo_32(v128 a, v128 b) {
return _mm_unpacklo_epi32(b, a);
}
SIMD_INLINE v128 v128_ziphi_32(v128 a, v128 b) {
return _mm_unpackhi_epi32(b, a);
}
SIMD_INLINE v128 v128_ziplo_64(v128 a, v128 b) {
return _mm_unpacklo_epi64(b, a);
}
SIMD_INLINE v128 v128_ziphi_64(v128 a, v128 b) {
return _mm_unpackhi_epi64(b, a);
}
SIMD_INLINE v128 v128_zip_8(v64 a, v64 b) { return _mm_unpacklo_epi8(b, a); }
SIMD_INLINE v128 v128_zip_16(v64 a, v64 b) { return _mm_unpacklo_epi16(b, a); }
SIMD_INLINE v128 v128_zip_32(v64 a, v64 b) { return _mm_unpacklo_epi32(b, a); }
SIMD_INLINE v128 v128_unziphi_8(v128 a, v128 b) {
return _mm_packs_epi16(_mm_srai_epi16(b, 8), _mm_srai_epi16(a, 8));
}
SIMD_INLINE v128 v128_unziplo_8(v128 a, v128 b) {
#if defined(__SSSE3__)
#ifdef __x86_64__
v128 order = _mm_cvtsi64_si128(0x0e0c0a0806040200LL);
#else
v128 order = _mm_set_epi32(0, 0, 0x0e0c0a08, 0x06040200);
#endif
return _mm_unpacklo_epi64(_mm_shuffle_epi8(b, order),
_mm_shuffle_epi8(a, order));
#else
return v128_unziphi_8(_mm_slli_si128(a, 1), _mm_slli_si128(b, 1));
#endif
}
SIMD_INLINE v128 v128_unziphi_16(v128 a, v128 b) {
return _mm_packs_epi32(_mm_srai_epi32(b, 16), _mm_srai_epi32(a, 16));
}
SIMD_INLINE v128 v128_unziplo_16(v128 a, v128 b) {
#if defined(__SSSE3__)
#ifdef __x86_64__
v128 order = _mm_cvtsi64_si128(0x0d0c090805040100LL);
#else
v128 order = _mm_set_epi32(0, 0, 0x0d0c0908, 0x05040100);
#endif
return _mm_unpacklo_epi64(_mm_shuffle_epi8(b, order),
_mm_shuffle_epi8(a, order));
#else
return v128_unziphi_16(_mm_slli_si128(a, 2), _mm_slli_si128(b, 2));
#endif
}
SIMD_INLINE v128 v128_unziphi_32(v128 a, v128 b) {
return _mm_castps_si128(_mm_shuffle_ps(
_mm_castsi128_ps(b), _mm_castsi128_ps(a), _MM_SHUFFLE(3, 1, 3, 1)));
}
SIMD_INLINE v128 v128_unziplo_32(v128 a, v128 b) {
return _mm_castps_si128(_mm_shuffle_ps(
_mm_castsi128_ps(b), _mm_castsi128_ps(a), _MM_SHUFFLE(2, 0, 2, 0)));
}
SIMD_INLINE v128 v128_unpack_u8_s16(v64 a) {
return _mm_unpacklo_epi8(a, _mm_setzero_si128());
}
SIMD_INLINE v128 v128_unpacklo_u8_s16(v128 a) {
return _mm_unpacklo_epi8(a, _mm_setzero_si128());
}
SIMD_INLINE v128 v128_unpackhi_u8_s16(v128 a) {
return _mm_unpackhi_epi8(a, _mm_setzero_si128());
}
SIMD_INLINE v128 v128_pack_s32_s16(v128 a, v128 b) {
return _mm_packs_epi32(b, a);
}
SIMD_INLINE v128 v128_pack_s16_u8(v128 a, v128 b) {
return _mm_packus_epi16(b, a);
}
SIMD_INLINE v128 v128_pack_s16_s8(v128 a, v128 b) {
return _mm_packs_epi16(b, a);
}
SIMD_INLINE v128 v128_unpack_u16_s32(v64 a) {
return _mm_unpacklo_epi16(a, _mm_setzero_si128());
}
SIMD_INLINE v128 v128_unpack_s16_s32(v64 a) {
return _mm_srai_epi32(_mm_unpacklo_epi16(a, a), 16);
}
SIMD_INLINE v128 v128_unpacklo_u16_s32(v128 a) {
return _mm_unpacklo_epi16(a, _mm_setzero_si128());
}
SIMD_INLINE v128 v128_unpacklo_s16_s32(v128 a) {
return _mm_srai_epi32(_mm_unpacklo_epi16(a, a), 16);
}
SIMD_INLINE v128 v128_unpackhi_u16_s32(v128 a) {
return _mm_unpackhi_epi16(a, _mm_setzero_si128());
}
SIMD_INLINE v128 v128_unpackhi_s16_s32(v128 a) {
return _mm_srai_epi32(_mm_unpackhi_epi16(a, a), 16);
}
SIMD_INLINE v128 v128_shuffle_8(v128 x, v128 pattern) {
#if defined(__SSSE3__)
return _mm_shuffle_epi8(x, pattern);
#else
v128 output;
unsigned char *input = (unsigned char *)&x;
unsigned char *index = (unsigned char *)&pattern;
char *selected = (char *)&output;
int counter;
for (counter = 0; counter < 16; counter++) {
selected[counter] = input[index[counter] & 15];
}
return output;
#endif
}
SIMD_INLINE int64_t v128_dotp_s16(v128 a, v128 b) {
v128 r = _mm_madd_epi16(a, b);
#if defined(__SSE4_1__) && defined(__x86_64__)
v128 c = _mm_add_epi64(_mm_cvtepi32_epi64(r),
_mm_cvtepi32_epi64(_mm_srli_si128(r, 8)));
return _mm_cvtsi128_si64(_mm_add_epi64(c, _mm_srli_si128(c, 8)));
#else
return (int64_t)_mm_cvtsi128_si32(r) +
(int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 4)) +
(int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 8)) +
(int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 12));
#endif
}
SIMD_INLINE uint64_t v128_hadd_u8(v128 a) {
v128 t = _mm_sad_epu8(a, _mm_setzero_si128());
return v64_low_u32(v128_low_v64(t)) + v64_low_u32(v128_high_v64(t));
}
typedef v128 sad128_internal;
SIMD_INLINE sad128_internal v128_sad_u8_init() { return _mm_setzero_si128(); }
/* Implementation dependent return value. Result must be finalised with
v128_sad_u8_sum().
The result for more than 32 v128_sad_u8() calls is undefined. */
SIMD_INLINE sad128_internal v128_sad_u8(sad128_internal s, v128 a, v128 b) {
return _mm_add_epi64(s, _mm_sad_epu8(a, b));
}
SIMD_INLINE uint32_t v128_sad_u8_sum(sad128_internal s) {
return v128_low_u32(_mm_add_epi32(s, _mm_unpackhi_epi64(s, s)));
}
typedef v128 ssd128_internal;
SIMD_INLINE ssd128_internal v128_ssd_u8_init() { return _mm_setzero_si128(); }
/* Implementation dependent return value. Result must be finalised with
 * v128_ssd_u8_sum(). */
SIMD_INLINE ssd128_internal v128_ssd_u8(ssd128_internal s, v128 a, v128 b) {
v128 l = _mm_sub_epi16(_mm_unpacklo_epi8(a, _mm_setzero_si128()),
_mm_unpacklo_epi8(b, _mm_setzero_si128()));
v128 h = _mm_sub_epi16(_mm_unpackhi_epi8(a, _mm_setzero_si128()),
_mm_unpackhi_epi8(b, _mm_setzero_si128()));
v128 rl = _mm_madd_epi16(l, l);
v128 rh = _mm_madd_epi16(h, h);
v128 c = _mm_cvtsi32_si128(32);
rl = _mm_add_epi32(rl, _mm_srli_si128(rl, 8));
rl = _mm_add_epi32(rl, _mm_srli_si128(rl, 4));
rh = _mm_add_epi32(rh, _mm_srli_si128(rh, 8));
rh = _mm_add_epi32(rh, _mm_srli_si128(rh, 4));
return _mm_add_epi64(
s, _mm_srl_epi64(_mm_sll_epi64(_mm_unpacklo_epi64(rl, rh), c), c));
}
SIMD_INLINE uint32_t v128_ssd_u8_sum(ssd128_internal s) {
return v128_low_u32(_mm_add_epi32(s, _mm_unpackhi_epi64(s, s)));
}
SIMD_INLINE v128 v128_or(v128 a, v128 b) { return _mm_or_si128(a, b); }
SIMD_INLINE v128 v128_xor(v128 a, v128 b) { return _mm_xor_si128(a, b); }
SIMD_INLINE v128 v128_and(v128 a, v128 b) { return _mm_and_si128(a, b); }
SIMD_INLINE v128 v128_andn(v128 a, v128 b) { return _mm_andnot_si128(b, a); }
SIMD_INLINE v128 v128_mul_s16(v64 a, v64 b) {
v64 lo_bits = v64_mullo_s16(a, b);
v64 hi_bits = v64_mulhi_s16(a, b);
return v128_from_v64(v64_ziphi_16(hi_bits, lo_bits),
v64_ziplo_16(hi_bits, lo_bits));
}
SIMD_INLINE v128 v128_mullo_s16(v128 a, v128 b) {
return _mm_mullo_epi16(a, b);
}
SIMD_INLINE v128 v128_mulhi_s16(v128 a, v128 b) {
return _mm_mulhi_epi16(a, b);
}
SIMD_INLINE v128 v128_mullo_s32(v128 a, v128 b) {
#if defined(__SSE4_1__)
return _mm_mullo_epi32(a, b);
#else
return _mm_unpacklo_epi32(
_mm_shuffle_epi32(_mm_mul_epu32(a, b), 8),
_mm_shuffle_epi32(
_mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4)), 8));
#endif
}
SIMD_INLINE v128 v128_madd_s16(v128 a, v128 b) { return _mm_madd_epi16(a, b); }
SIMD_INLINE v128 v128_madd_us8(v128 a, v128 b) {
#if defined(__SSSE3__)
return _mm_maddubs_epi16(a, b);
#else
return _mm_packs_epi32(
_mm_madd_epi16(_mm_unpacklo_epi8(a, _mm_setzero_si128()),
_mm_srai_epi16(_mm_unpacklo_epi8(b, b), 8)),
_mm_madd_epi16(_mm_unpackhi_epi8(a, _mm_setzero_si128()),
_mm_srai_epi16(_mm_unpackhi_epi8(b, b), 8)));
#endif
}
SIMD_INLINE v128 v128_avg_u8(v128 a, v128 b) { return _mm_avg_epu8(a, b); }
SIMD_INLINE v128 v128_rdavg_u8(v128 a, v128 b) {
return _mm_sub_epi8(_mm_avg_epu8(a, b),
_mm_and_si128(_mm_xor_si128(a, b), v128_dup_8(1)));
}
SIMD_INLINE v128 v128_avg_u16(v128 a, v128 b) { return _mm_avg_epu16(a, b); }
SIMD_INLINE v128 v128_min_u8(v128 a, v128 b) { return _mm_min_epu8(a, b); }
SIMD_INLINE v128 v128_max_u8(v128 a, v128 b) { return _mm_max_epu8(a, b); }
SIMD_INLINE v128 v128_min_s8(v128 a, v128 b) {
#if defined(__SSE4_1__)
return _mm_min_epi8(a, b);
#else
v128 mask = _mm_cmplt_epi8(a, b);
return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a));
#endif
}
SIMD_INLINE v128 v128_max_s8(v128 a, v128 b) {
#if defined(__SSE4_1__)
return _mm_max_epi8(a, b);
#else
v128 mask = _mm_cmplt_epi8(b, a);
return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a));
#endif
}
SIMD_INLINE v128 v128_min_s16(v128 a, v128 b) { return _mm_min_epi16(a, b); }
SIMD_INLINE v128 v128_max_s16(v128 a, v128 b) { return _mm_max_epi16(a, b); }
SIMD_INLINE v128 v128_cmpgt_s8(v128 a, v128 b) { return _mm_cmpgt_epi8(a, b); }
SIMD_INLINE v128 v128_cmplt_s8(v128 a, v128 b) { return _mm_cmplt_epi8(a, b); }
SIMD_INLINE v128 v128_cmpeq_8(v128 a, v128 b) { return _mm_cmpeq_epi8(a, b); }
SIMD_INLINE v128 v128_cmpgt_s16(v128 a, v128 b) {
return _mm_cmpgt_epi16(a, b);
}
SIMD_INLINE v128 v128_cmplt_s16(v128 a, v128 b) {
return _mm_cmplt_epi16(a, b);
}
SIMD_INLINE v128 v128_cmpeq_16(v128 a, v128 b) { return _mm_cmpeq_epi16(a, b); }
SIMD_INLINE v128 v128_shl_8(v128 a, unsigned int c) {
return _mm_and_si128(_mm_set1_epi8((uint8_t)(0xff << c)),
_mm_sll_epi16(a, _mm_cvtsi32_si128(c)));
}
SIMD_INLINE v128 v128_shr_u8(v128 a, unsigned int c) {
return _mm_and_si128(_mm_set1_epi8(0xff >> c),
_mm_srl_epi16(a, _mm_cvtsi32_si128(c)));
}
SIMD_INLINE v128 v128_shr_s8(v128 a, unsigned int c) {
__m128i x = _mm_cvtsi32_si128(c + 8);
return _mm_packs_epi16(_mm_sra_epi16(_mm_unpacklo_epi8(a, a), x),
_mm_sra_epi16(_mm_unpackhi_epi8(a, a), x));
}
SIMD_INLINE v128 v128_shl_16(v128 a, unsigned int c) {
return _mm_sll_epi16(a, _mm_cvtsi32_si128(c));
}
SIMD_INLINE v128 v128_shr_u16(v128 a, unsigned int c) {
return _mm_srl_epi16(a, _mm_cvtsi32_si128(c));
}
SIMD_INLINE v128 v128_shr_s16(v128 a, unsigned int c) {
return _mm_sra_epi16(a, _mm_cvtsi32_si128(c));
}
SIMD_INLINE v128 v128_shl_32(v128 a, unsigned int c) {
return _mm_sll_epi32(a, _mm_cvtsi32_si128(c));
}
SIMD_INLINE v128 v128_shr_u32(v128 a, unsigned int c) {
return _mm_srl_epi32(a, _mm_cvtsi32_si128(c));
}
SIMD_INLINE v128 v128_shr_s32(v128 a, unsigned int c) {
return _mm_sra_epi32(a, _mm_cvtsi32_si128(c));
}
/* These intrinsics require immediate values, so we must use #defines
to enforce that. */
#define v128_shl_n_byte(a, c) _mm_slli_si128(a, c)
#define v128_shr_n_byte(a, c) _mm_srli_si128(a, c)
#define v128_shl_n_8(a, c) \
_mm_and_si128(_mm_set1_epi8((uint8_t)(0xff << (c))), _mm_slli_epi16(a, c))
#define v128_shr_n_u8(a, c) \
_mm_and_si128(_mm_set1_epi8(0xff >> (c)), _mm_srli_epi16(a, c))
#define v128_shr_n_s8(a, c) \
_mm_packs_epi16(_mm_srai_epi16(_mm_unpacklo_epi8(a, a), (c) + 8), \
_mm_srai_epi16(_mm_unpackhi_epi8(a, a), (c) + 8))
#define v128_shl_n_16(a, c) _mm_slli_epi16(a, c)
#define v128_shr_n_u16(a, c) _mm_srli_epi16(a, c)
#define v128_shr_n_s16(a, c) _mm_srai_epi16(a, c)
#define v128_shl_n_32(a, c) _mm_slli_epi32(a, c)
#define v128_shr_n_u32(a, c) _mm_srli_epi32(a, c)
#define v128_shr_n_s32(a, c) _mm_srai_epi32(a, c)
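/* Usage sketch (illustrative, not part of the original header): the _n_
   variants require literal shift counts since they map to immediate-operand
   instructions; use the run-time forms above (v128_shr_u16 etc.) when the
   count is a variable. */
SIMD_INLINE v128 v128_example_scale_down_u16(v128 a) {
  return v128_shr_n_u16(a, 2); /* shift each u16 lane right by 2 (divide by 4) */
}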
#endif /* _V128_INTRINSICS_H */


@@ -1,274 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifndef _V256_INTRINSICS_H
#define _V256_INTRINSICS_H
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "./v256_intrinsics_c.h"
#include "./v128_intrinsics.h"
#include "./v64_intrinsics.h"
/* Fallback to plain, unoptimised C. */
typedef c_v256 v256;
SIMD_INLINE uint32_t v256_low_u32(v256 a) { return c_v256_low_u32(a); }
SIMD_INLINE v64 v256_low_v64(v256 a) { return c_v256_low_v64(a); }
SIMD_INLINE v128 v256_low_v128(v256 a) { return c_v256_low_v128(a); }
SIMD_INLINE v128 v256_high_v128(v256 a) { return c_v256_high_v128(a); }
SIMD_INLINE v256 v256_from_v128(v128 hi, v128 lo) {
return c_v256_from_v128(hi, lo);
}
SIMD_INLINE v256 v256_from_64(uint64_t a, uint64_t b, uint64_t c, uint64_t d) {
return c_v256_from_64(a, b, c, d);
}
SIMD_INLINE v256 v256_from_v64(v64 a, v64 b, v64 c, v64 d) {
return c_v256_from_v64(a, b, c, d);
}
SIMD_INLINE v256 v256_load_unaligned(const void *p) {
return c_v256_load_unaligned(p);
}
SIMD_INLINE v256 v256_load_aligned(const void *p) {
return c_v256_load_aligned(p);
}
SIMD_INLINE void v256_store_unaligned(void *p, v256 a) {
c_v256_store_unaligned(p, a);
}
SIMD_INLINE void v256_store_aligned(void *p, v256 a) {
c_v256_store_aligned(p, a);
}
SIMD_INLINE v256 v256_align(v256 a, v256 b, const unsigned int c) {
return c_v256_align(a, b, c);
}
SIMD_INLINE v256 v256_zero() { return c_v256_zero(); }
SIMD_INLINE v256 v256_dup_8(uint8_t x) { return c_v256_dup_8(x); }
SIMD_INLINE v256 v256_dup_16(uint16_t x) { return c_v256_dup_16(x); }
SIMD_INLINE v256 v256_dup_32(uint32_t x) { return c_v256_dup_32(x); }
typedef uint32_t sad256_internal;
SIMD_INLINE sad256_internal v256_sad_u8_init() { return c_v256_sad_u8_init(); }
SIMD_INLINE sad256_internal v256_sad_u8(sad256_internal s, v256 a, v256 b) {
return c_v256_sad_u8(s, a, b);
}
SIMD_INLINE uint32_t v256_sad_u8_sum(sad256_internal s) {
return c_v256_sad_u8_sum(s);
}
typedef uint32_t ssd256_internal;
SIMD_INLINE ssd256_internal v256_ssd_u8_init() { return c_v256_ssd_u8_init(); }
SIMD_INLINE ssd256_internal v256_ssd_u8(ssd256_internal s, v256 a, v256 b) {
return c_v256_ssd_u8(s, a, b);
}
SIMD_INLINE uint32_t v256_ssd_u8_sum(ssd256_internal s) {
return c_v256_ssd_u8_sum(s);
}
SIMD_INLINE int64_t v256_dotp_s16(v256 a, v256 b) {
return c_v256_dotp_s16(a, b);
}
SIMD_INLINE uint64_t v256_hadd_u8(v256 a) { return c_v256_hadd_u8(a); }
SIMD_INLINE v256 v256_or(v256 a, v256 b) { return c_v256_or(a, b); }
SIMD_INLINE v256 v256_xor(v256 a, v256 b) { return c_v256_xor(a, b); }
SIMD_INLINE v256 v256_and(v256 a, v256 b) { return c_v256_and(a, b); }
SIMD_INLINE v256 v256_andn(v256 a, v256 b) { return c_v256_andn(a, b); }
SIMD_INLINE v256 v256_add_8(v256 a, v256 b) { return c_v256_add_8(a, b); }
SIMD_INLINE v256 v256_add_16(v256 a, v256 b) { return c_v256_add_16(a, b); }
SIMD_INLINE v256 v256_sadd_s16(v256 a, v256 b) { return c_v256_sadd_s16(a, b); }
SIMD_INLINE v256 v256_add_32(v256 a, v256 b) { return c_v256_add_32(a, b); }
SIMD_INLINE v256 v256_padd_s16(v256 a) { return c_v256_padd_s16(a); }
SIMD_INLINE v256 v256_sub_8(v256 a, v256 b) { return c_v256_sub_8(a, b); }
SIMD_INLINE v256 v256_ssub_u8(v256 a, v256 b) { return c_v256_ssub_u8(a, b); }
SIMD_INLINE v256 v256_ssub_s8(v256 a, v256 b) { return c_v256_ssub_s8(a, b); }
SIMD_INLINE v256 v256_sub_16(v256 a, v256 b) { return c_v256_sub_16(a, b); }
SIMD_INLINE v256 v256_ssub_s16(v256 a, v256 b) { return c_v256_ssub_s16(a, b); }
SIMD_INLINE v256 v256_sub_32(v256 a, v256 b) { return c_v256_sub_32(a, b); }
SIMD_INLINE v256 v256_abs_s16(v256 a) { return c_v256_abs_s16(a); }
SIMD_INLINE v256 v256_mul_s16(v128 a, v128 b) { return c_v256_mul_s16(a, b); }
SIMD_INLINE v256 v256_mullo_s16(v256 a, v256 b) {
return c_v256_mullo_s16(a, b);
}
SIMD_INLINE v256 v256_mulhi_s16(v256 a, v256 b) {
return c_v256_mulhi_s16(a, b);
}
SIMD_INLINE v256 v256_mullo_s32(v256 a, v256 b) {
return c_v256_mullo_s32(a, b);
}
SIMD_INLINE v256 v256_madd_s16(v256 a, v256 b) { return c_v256_madd_s16(a, b); }
SIMD_INLINE v256 v256_madd_us8(v256 a, v256 b) { return c_v256_madd_us8(a, b); }
SIMD_INLINE v256 v256_avg_u8(v256 a, v256 b) { return c_v256_avg_u8(a, b); }
SIMD_INLINE v256 v256_rdavg_u8(v256 a, v256 b) { return c_v256_rdavg_u8(a, b); }
SIMD_INLINE v256 v256_avg_u16(v256 a, v256 b) { return c_v256_avg_u16(a, b); }
SIMD_INLINE v256 v256_min_u8(v256 a, v256 b) { return c_v256_min_u8(a, b); }
SIMD_INLINE v256 v256_max_u8(v256 a, v256 b) { return c_v256_max_u8(a, b); }
SIMD_INLINE v256 v256_min_s8(v256 a, v256 b) { return c_v256_min_s8(a, b); }
SIMD_INLINE v256 v256_max_s8(v256 a, v256 b) { return c_v256_max_s8(a, b); }
SIMD_INLINE v256 v256_min_s16(v256 a, v256 b) { return c_v256_min_s16(a, b); }
SIMD_INLINE v256 v256_max_s16(v256 a, v256 b) { return c_v256_max_s16(a, b); }
SIMD_INLINE v256 v256_ziplo_8(v256 a, v256 b) { return c_v256_ziplo_8(a, b); }
SIMD_INLINE v256 v256_ziphi_8(v256 a, v256 b) { return c_v256_ziphi_8(a, b); }
SIMD_INLINE v256 v256_ziplo_16(v256 a, v256 b) { return c_v256_ziplo_16(a, b); }
SIMD_INLINE v256 v256_ziphi_16(v256 a, v256 b) { return c_v256_ziphi_16(a, b); }
SIMD_INLINE v256 v256_ziplo_32(v256 a, v256 b) { return c_v256_ziplo_32(a, b); }
SIMD_INLINE v256 v256_ziphi_32(v256 a, v256 b) { return c_v256_ziphi_32(a, b); }
SIMD_INLINE v256 v256_ziplo_64(v256 a, v256 b) { return c_v256_ziplo_64(a, b); }
SIMD_INLINE v256 v256_ziphi_64(v256 a, v256 b) { return c_v256_ziphi_64(a, b); }
SIMD_INLINE v256 v256_ziplo_128(v256 a, v256 b) {
return c_v256_ziplo_128(a, b);
}
SIMD_INLINE v256 v256_ziphi_128(v256 a, v256 b) {
return c_v256_ziphi_128(a, b);
}
SIMD_INLINE v256 v256_zip_8(v128 a, v128 b) { return c_v256_zip_8(a, b); }
SIMD_INLINE v256 v256_zip_16(v128 a, v128 b) { return c_v256_zip_16(a, b); }
SIMD_INLINE v256 v256_zip_32(v128 a, v128 b) { return c_v256_zip_32(a, b); }
SIMD_INLINE v256 v256_unziplo_8(v256 a, v256 b) {
return c_v256_unziplo_8(a, b);
}
SIMD_INLINE v256 v256_unziphi_8(v256 a, v256 b) {
return c_v256_unziphi_8(a, b);
}
SIMD_INLINE v256 v256_unziplo_16(v256 a, v256 b) {
return c_v256_unziplo_16(a, b);
}
SIMD_INLINE v256 v256_unziphi_16(v256 a, v256 b) {
return c_v256_unziphi_16(a, b);
}
SIMD_INLINE v256 v256_unziplo_32(v256 a, v256 b) {
return c_v256_unziplo_32(a, b);
}
SIMD_INLINE v256 v256_unziphi_32(v256 a, v256 b) {
return c_v256_unziphi_32(a, b);
}
SIMD_INLINE v256 v256_unpack_u8_s16(v128 a) { return c_v256_unpack_u8_s16(a); }
SIMD_INLINE v256 v256_unpacklo_u8_s16(v256 a) {
return c_v256_unpacklo_u8_s16(a);
}
SIMD_INLINE v256 v256_unpackhi_u8_s16(v256 a) {
return c_v256_unpackhi_u8_s16(a);
}
SIMD_INLINE v256 v256_pack_s32_s16(v256 a, v256 b) {
return c_v256_pack_s32_s16(a, b);
}
SIMD_INLINE v256 v256_pack_s16_u8(v256 a, v256 b) {
return c_v256_pack_s16_u8(a, b);
}
SIMD_INLINE v256 v256_pack_s16_s8(v256 a, v256 b) {
return c_v256_pack_s16_s8(a, b);
}
SIMD_INLINE v256 v256_unpack_u16_s32(v128 a) {
return c_v256_unpack_u16_s32(a);
}
SIMD_INLINE v256 v256_unpack_s16_s32(v128 a) {
return c_v256_unpack_s16_s32(a);
}
SIMD_INLINE v256 v256_unpacklo_u16_s32(v256 a) {
return c_v256_unpacklo_u16_s32(a);
}
SIMD_INLINE v256 v256_unpacklo_s16_s32(v256 a) {
return c_v256_unpacklo_s16_s32(a);
}
SIMD_INLINE v256 v256_unpackhi_u16_s32(v256 a) {
return c_v256_unpackhi_u16_s32(a);
}
SIMD_INLINE v256 v256_unpackhi_s16_s32(v256 a) {
return c_v256_unpackhi_s16_s32(a);
}
SIMD_INLINE v256 v256_shuffle_8(v256 a, v256 pattern) {
return c_v256_shuffle_8(a, pattern);
}
SIMD_INLINE v256 v256_pshuffle_8(v256 a, v256 pattern) {
return c_v256_pshuffle_8(a, pattern);
}
SIMD_INLINE v256 v256_cmpgt_s8(v256 a, v256 b) { return c_v256_cmpgt_s8(a, b); }
SIMD_INLINE v256 v256_cmplt_s8(v256 a, v256 b) { return c_v256_cmplt_s8(a, b); }
SIMD_INLINE v256 v256_cmpeq_8(v256 a, v256 b) { return c_v256_cmpeq_8(a, b); }
SIMD_INLINE v256 v256_cmpgt_s16(v256 a, v256 b) {
return c_v256_cmpgt_s16(a, b);
}
SIMD_INLINE v256 v256_cmplt_s16(v256 a, v256 b) {
return c_v256_cmplt_s16(a, b);
}
SIMD_INLINE v256 v256_cmpeq_16(v256 a, v256 b) { return c_v256_cmpeq_16(a, b); }
SIMD_INLINE v256 v256_shl_8(v256 a, unsigned int c) {
return c_v256_shl_8(a, c);
}
SIMD_INLINE v256 v256_shr_u8(v256 a, unsigned int c) {
return c_v256_shr_u8(a, c);
}
SIMD_INLINE v256 v256_shr_s8(v256 a, unsigned int c) {
return c_v256_shr_s8(a, c);
}
SIMD_INLINE v256 v256_shl_16(v256 a, unsigned int c) {
return c_v256_shl_16(a, c);
}
SIMD_INLINE v256 v256_shr_u16(v256 a, unsigned int c) {
return c_v256_shr_u16(a, c);
}
SIMD_INLINE v256 v256_shr_s16(v256 a, unsigned int c) {
return c_v256_shr_s16(a, c);
}
SIMD_INLINE v256 v256_shl_32(v256 a, unsigned int c) {
return c_v256_shl_32(a, c);
}
SIMD_INLINE v256 v256_shr_u32(v256 a, unsigned int c) {
return c_v256_shr_u32(a, c);
}
SIMD_INLINE v256 v256_shr_s32(v256 a, unsigned int c) {
return c_v256_shr_s32(a, c);
}
SIMD_INLINE v256 v256_shr_n_byte(v256 a, const unsigned int n) {
return c_v256_shr_n_byte(a, n);
}
SIMD_INLINE v256 v256_shl_n_byte(v256 a, const unsigned int n) {
return c_v256_shl_n_byte(a, n);
}
SIMD_INLINE v256 v256_shl_n_8(v256 a, const unsigned int n) {
return c_v256_shl_n_8(a, n);
}
SIMD_INLINE v256 v256_shl_n_16(v256 a, const unsigned int n) {
return c_v256_shl_n_16(a, n);
}
SIMD_INLINE v256 v256_shl_n_32(v256 a, const unsigned int n) {
return c_v256_shl_n_32(a, n);
}
SIMD_INLINE v256 v256_shr_n_u8(v256 a, const unsigned int n) {
return c_v256_shr_n_u8(a, n);
}
SIMD_INLINE v256 v256_shr_n_u16(v256 a, const unsigned int n) {
return c_v256_shr_n_u16(a, n);
}
SIMD_INLINE v256 v256_shr_n_u32(v256 a, const unsigned int n) {
return c_v256_shr_n_u32(a, n);
}
SIMD_INLINE v256 v256_shr_n_s8(v256 a, const unsigned int n) {
return c_v256_shr_n_s8(a, n);
}
SIMD_INLINE v256 v256_shr_n_s16(v256 a, const unsigned int n) {
return c_v256_shr_n_s16(a, n);
}
SIMD_INLINE v256 v256_shr_n_s32(v256 a, const unsigned int n) {
return c_v256_shr_n_s32(a, n);
}
#endif /* _V256_INTRINSICS_H */


@@ -1,17 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifndef _V256_INTRINSICS_H
#define _V256_INTRINSICS_H
#include "./v256_intrinsics_v128.h"
#endif /* _V256_INTRINSICS_H */


@@ -1,701 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifndef _V256_INTRINSICS_C_H
#define _V256_INTRINSICS_C_H
#include <stdio.h>
#include <stdlib.h>
#include "./v128_intrinsics_c.h"
#include "./aom_config.h"
typedef union {
uint8_t u8[32];
uint16_t u16[16];
uint32_t u32[8];
uint64_t u64[4];
int8_t s8[32];
int16_t s16[16];
int32_t s32[8];
int64_t s64[4];
c_v64 v64[4];
c_v128 v128[2];
} c_v256;
SIMD_INLINE uint32_t c_v256_low_u32(c_v256 a) { return a.u32[0]; }
SIMD_INLINE c_v64 c_v256_low_v64(c_v256 a) { return a.v64[0]; }
SIMD_INLINE c_v128 c_v256_low_v128(c_v256 a) { return a.v128[0]; }
SIMD_INLINE c_v128 c_v256_high_v128(c_v256 a) { return a.v128[1]; }
SIMD_INLINE c_v256 c_v256_from_v128(c_v128 hi, c_v128 lo) {
c_v256 t;
t.v128[1] = hi;
t.v128[0] = lo;
return t;
}
SIMD_INLINE c_v256 c_v256_from_64(uint64_t a, uint64_t b, uint64_t c,
uint64_t d) {
c_v256 t;
t.u64[3] = a;
t.u64[2] = b;
t.u64[1] = c;
t.u64[0] = d;
return t;
}
SIMD_INLINE c_v256 c_v256_from_v64(c_v64 a, c_v64 b, c_v64 c, c_v64 d) {
c_v256 t;
t.u64[3] = a.u64;
t.u64[2] = b.u64;
t.u64[1] = c.u64;
t.u64[0] = d.u64;
return t;
}
SIMD_INLINE c_v256 c_v256_load_unaligned(const void *p) {
c_v256 t;
uint8_t *pp = (uint8_t *)p;
uint8_t *q = (uint8_t *)&t;
int c;
for (c = 0; c < 32; c++) q[c] = pp[c];
return t;
}
SIMD_INLINE c_v256 c_v256_load_aligned(const void *p) {
if (simd_check && (uintptr_t)p & 31) {
fprintf(stderr, "Error: unaligned v256 load at %p\n", p);
abort();
}
return c_v256_load_unaligned(p);
}
SIMD_INLINE void c_v256_store_unaligned(void *p, c_v256 a) {
uint8_t *pp = (uint8_t *)p;
uint8_t *q = (uint8_t *)&a;
int c;
for (c = 0; c < 32; c++) pp[c] = q[c];
}
SIMD_INLINE void c_v256_store_aligned(void *p, c_v256 a) {
if (simd_check && (uintptr_t)p & 31) {
fprintf(stderr, "Error: unaligned v256 store at %p\n", p);
abort();
}
c_v256_store_unaligned(p, a);
}
SIMD_INLINE c_v256 c_v256_zero() {
c_v256 t;
t.u64[3] = t.u64[2] = t.u64[1] = t.u64[0] = 0;
return t;
}
SIMD_INLINE c_v256 c_v256_dup_8(uint8_t x) {
c_v256 t;
t.v64[3] = t.v64[2] = t.v64[1] = t.v64[0] = c_v64_dup_8(x);
return t;
}
SIMD_INLINE c_v256 c_v256_dup_16(uint16_t x) {
c_v256 t;
t.v64[3] = t.v64[2] = t.v64[1] = t.v64[0] = c_v64_dup_16(x);
return t;
}
SIMD_INLINE c_v256 c_v256_dup_32(uint32_t x) {
c_v256 t;
t.v64[3] = t.v64[2] = t.v64[1] = t.v64[0] = c_v64_dup_32(x);
return t;
}
SIMD_INLINE int64_t c_v256_dotp_s16(c_v256 a, c_v256 b) {
return c_v128_dotp_s16(a.v128[1], b.v128[1]) +
c_v128_dotp_s16(a.v128[0], b.v128[0]);
}
SIMD_INLINE uint64_t c_v256_hadd_u8(c_v256 a) {
return c_v128_hadd_u8(a.v128[1]) + c_v128_hadd_u8(a.v128[0]);
}
typedef uint32_t c_sad256_internal;
SIMD_INLINE c_sad256_internal c_v256_sad_u8_init() { return 0; }
/* Implementation dependent return value. Result must be finalised with
v256_sad_u8_sum().
The result for more than 16 v256_sad_u8() calls is undefined. */
SIMD_INLINE c_sad256_internal c_v256_sad_u8(c_sad256_internal s, c_v256 a,
c_v256 b) {
int c;
for (c = 0; c < 32; c++)
s += a.u8[c] > b.u8[c] ? a.u8[c] - b.u8[c] : b.u8[c] - a.u8[c];
return s;
}
SIMD_INLINE uint32_t c_v256_sad_u8_sum(c_sad256_internal s) { return s; }
typedef uint32_t c_ssd256_internal;
SIMD_INLINE c_ssd256_internal c_v256_ssd_u8_init() { return 0; }
/* Implementation dependent return value. Result must be finalised with
* v256_ssd_u8_sum(). */
SIMD_INLINE c_ssd256_internal c_v256_ssd_u8(c_ssd256_internal s, c_v256 a,
c_v256 b) {
int c;
for (c = 0; c < 32; c++) s += (a.u8[c] - b.u8[c]) * (a.u8[c] - b.u8[c]);
return s;
}
SIMD_INLINE uint32_t c_v256_ssd_u8_sum(c_ssd256_internal s) { return s; }
SIMD_INLINE c_v256 c_v256_or(c_v256 a, c_v256 b) {
return c_v256_from_v128(c_v128_or(a.v128[1], b.v128[1]),
c_v128_or(a.v128[0], b.v128[0]));
}
SIMD_INLINE c_v256 c_v256_xor(c_v256 a, c_v256 b) {
return c_v256_from_v128(c_v128_xor(a.v128[1], b.v128[1]),
c_v128_xor(a.v128[0], b.v128[0]));
}
SIMD_INLINE c_v256 c_v256_and(c_v256 a, c_v256 b) {
return c_v256_from_v128(c_v128_and(a.v128[1], b.v128[1]),
c_v128_and(a.v128[0], b.v128[0]));
}
SIMD_INLINE c_v256 c_v256_andn(c_v256 a, c_v256 b) {
return c_v256_from_v128(c_v128_andn(a.v128[1], b.v128[1]),
c_v128_andn(a.v128[0], b.v128[0]));
}
SIMD_INLINE c_v256 c_v256_add_8(c_v256 a, c_v256 b) {
return c_v256_from_v128(c_v128_add_8(a.v128[1], b.v128[1]),
c_v128_add_8(a.v128[0], b.v128[0]));
}
SIMD_INLINE c_v256 c_v256_add_16(c_v256 a, c_v256 b) {
return c_v256_from_v128(c_v128_add_16(a.v128[1], b.v128[1]),
c_v128_add_16(a.v128[0], b.v128[0]));
}
SIMD_INLINE c_v256 c_v256_sadd_s16(c_v256 a, c_v256 b) {
return c_v256_from_v128(c_v128_sadd_s16(a.v128[1], b.v128[1]),
c_v128_sadd_s16(a.v128[0], b.v128[0]));
}
SIMD_INLINE c_v256 c_v256_add_32(c_v256 a, c_v256 b) {
return c_v256_from_v128(c_v128_add_32(a.v128[1], b.v128[1]),
c_v128_add_32(a.v128[0], b.v128[0]));
}
SIMD_INLINE c_v256 c_v256_padd_s16(c_v256 a) {
c_v256 t;
t.s32[0] = (int32_t)a.s16[0] + (int32_t)a.s16[1];
t.s32[1] = (int32_t)a.s16[2] + (int32_t)a.s16[3];
t.s32[2] = (int32_t)a.s16[4] + (int32_t)a.s16[5];
t.s32[3] = (int32_t)a.s16[6] + (int32_t)a.s16[7];
t.s32[4] = (int32_t)a.s16[8] + (int32_t)a.s16[9];
t.s32[5] = (int32_t)a.s16[10] + (int32_t)a.s16[11];
t.s32[6] = (int32_t)a.s16[12] + (int32_t)a.s16[13];
t.s32[7] = (int32_t)a.s16[14] + (int32_t)a.s16[15];
return t;
}
SIMD_INLINE c_v256 c_v256_sub_8(c_v256 a, c_v256 b) {
return c_v256_from_v128(c_v128_sub_8(a.v128[1], b.v128[1]),
c_v128_sub_8(a.v128[0], b.v128[0]));
}
SIMD_INLINE c_v256 c_v256_ssub_u8(c_v256 a, c_v256 b) {
return c_v256_from_v128(c_v128_ssub_u8(a.v128[1], b.v128[1]),
c_v128_ssub_u8(a.v128[0], b.v128[0]));
}
SIMD_INLINE c_v256 c_v256_ssub_s8(c_v256 a, c_v256 b) {
return c_v256_from_v128(c_v128_ssub_s8(a.v128[1], b.v128[1]),
c_v128_ssub_s8(a.v128[0], b.v128[0]));
}
SIMD_INLINE c_v256 c_v256_sub_16(c_v256 a, c_v256 b) {
return c_v256_from_v128(c_v128_sub_16(a.v128[1], b.v128[1]),
c_v128_sub_16(a.v128[0], b.v128[0]));
}
SIMD_INLINE c_v256 c_v256_ssub_s16(c_v256 a, c_v256 b) {
return c_v256_from_v128(c_v128_ssub_s16(a.v128[1], b.v128[1]),
c_v128_ssub_s16(a.v128[0], b.v128[0]));
}
SIMD_INLINE c_v256 c_v256_sub_32(c_v256 a, c_v256 b) {
return c_v256_from_v128(c_v128_sub_32(a.v128[1], b.v128[1]),
c_v128_sub_32(a.v128[0], b.v128[0]));
}
SIMD_INLINE c_v256 c_v256_abs_s16(c_v256 a) {
return c_v256_from_v128(c_v128_abs_s16(a.v128[1]), c_v128_abs_s16(a.v128[0]));
}
SIMD_INLINE c_v256 c_v256_mul_s16(c_v128 a, c_v128 b) {
c_v128 lo_bits = c_v128_mullo_s16(a, b);
c_v128 hi_bits = c_v128_mulhi_s16(a, b);
return c_v256_from_v128(c_v128_ziphi_16(hi_bits, lo_bits),
c_v128_ziplo_16(hi_bits, lo_bits));
}
SIMD_INLINE c_v256 c_v256_mullo_s16(c_v256 a, c_v256 b) {
return c_v256_from_v128(c_v128_mullo_s16(a.v128[1], b.v128[1]),
c_v128_mullo_s16(a.v128[0], b.v128[0]));
}
SIMD_INLINE c_v256 c_v256_mulhi_s16(c_v256 a, c_v256 b) {
return c_v256_from_v128(c_v128_mulhi_s16(a.v128[1], b.v128[1]),
c_v128_mulhi_s16(a.v128[0], b.v128[0]));
}
SIMD_INLINE c_v256 c_v256_mullo_s32(c_v256 a, c_v256 b) {
return c_v256_from_v128(c_v128_mullo_s32(a.v128[1], b.v128[1]),
c_v128_mullo_s32(a.v128[0], b.v128[0]));
}
SIMD_INLINE c_v256 c_v256_madd_s16(c_v256 a, c_v256 b) {
return c_v256_from_v128(c_v128_madd_s16(a.v128[1], b.v128[1]),
c_v128_madd_s16(a.v128[0], b.v128[0]));
}
SIMD_INLINE c_v256 c_v256_madd_us8(c_v256 a, c_v256 b) {
return c_v256_from_v128(c_v128_madd_us8(a.v128[1], b.v128[1]),
c_v128_madd_us8(a.v128[0], b.v128[0]));
}
SIMD_INLINE c_v256 c_v256_avg_u8(c_v256 a, c_v256 b) {
return c_v256_from_v128(c_v128_avg_u8(a.v128[1], b.v128[1]),
c_v128_avg_u8(a.v128[0], b.v128[0]));
}
SIMD_INLINE c_v256 c_v256_rdavg_u8(c_v256 a, c_v256 b) {
return c_v256_from_v128(c_v128_rdavg_u8(a.v128[1], b.v128[1]),
c_v128_rdavg_u8(a.v128[0], b.v128[0]));
}
SIMD_INLINE c_v256 c_v256_avg_u16(c_v256 a, c_v256 b) {
return c_v256_from_v128(c_v128_avg_u16(a.v128[1], b.v128[1]),
c_v128_avg_u16(a.v128[0], b.v128[0]));
}
SIMD_INLINE c_v256 c_v256_min_u8(c_v256 a, c_v256 b) {
return c_v256_from_v128(c_v128_min_u8(a.v128[1], b.v128[1]),
c_v128_min_u8(a.v128[0], b.v128[0]));
}
SIMD_INLINE c_v256 c_v256_max_u8(c_v256 a, c_v256 b) {
return c_v256_from_v128(c_v128_max_u8(a.v128[1], b.v128[1]),
c_v128_max_u8(a.v128[0], b.v128[0]));
}
SIMD_INLINE c_v256 c_v256_min_s8(c_v256 a, c_v256 b) {
return c_v256_from_v128(c_v128_min_s8(a.v128[1], b.v128[1]),
c_v128_min_s8(a.v128[0], b.v128[0]));
}
SIMD_INLINE c_v256 c_v256_max_s8(c_v256 a, c_v256 b) {
return c_v256_from_v128(c_v128_max_s8(a.v128[1], b.v128[1]),
c_v128_max_s8(a.v128[0], b.v128[0]));
}
SIMD_INLINE c_v256 c_v256_min_s16(c_v256 a, c_v256 b) {
return c_v256_from_v128(c_v128_min_s16(a.v128[1], b.v128[1]),
c_v128_min_s16(a.v128[0], b.v128[0]));
}
SIMD_INLINE c_v256 c_v256_max_s16(c_v256 a, c_v256 b) {
return c_v256_from_v128(c_v128_max_s16(a.v128[1], b.v128[1]),
c_v128_max_s16(a.v128[0], b.v128[0]));
}
SIMD_INLINE c_v256 c_v256_ziplo_8(c_v256 a, c_v256 b) {
return c_v256_from_v128(c_v128_ziphi_8(a.v128[0], b.v128[0]),
c_v128_ziplo_8(a.v128[0], b.v128[0]));
}
SIMD_INLINE c_v256 c_v256_ziphi_8(c_v256 a, c_v256 b) {
return c_v256_from_v128(c_v128_ziphi_8(a.v128[1], b.v128[1]),
c_v128_ziplo_8(a.v128[1], b.v128[1]));
}
SIMD_INLINE c_v256 c_v256_ziplo_16(c_v256 a, c_v256 b) {
return c_v256_from_v128(c_v128_ziphi_16(a.v128[0], b.v128[0]),
c_v128_ziplo_16(a.v128[0], b.v128[0]));
}
SIMD_INLINE c_v256 c_v256_ziphi_16(c_v256 a, c_v256 b) {
return c_v256_from_v128(c_v128_ziphi_16(a.v128[1], b.v128[1]),
c_v128_ziplo_16(a.v128[1], b.v128[1]));
}
SIMD_INLINE c_v256 c_v256_ziplo_32(c_v256 a, c_v256 b) {
return c_v256_from_v128(c_v128_ziphi_32(a.v128[0], b.v128[0]),
c_v128_ziplo_32(a.v128[0], b.v128[0]));
}
SIMD_INLINE c_v256 c_v256_ziphi_32(c_v256 a, c_v256 b) {
return c_v256_from_v128(c_v128_ziphi_32(a.v128[1], b.v128[1]),
c_v128_ziplo_32(a.v128[1], b.v128[1]));
}
SIMD_INLINE c_v256 c_v256_ziplo_64(c_v256 a, c_v256 b) {
return c_v256_from_v128(c_v128_ziphi_64(a.v128[0], b.v128[0]),
c_v128_ziplo_64(a.v128[0], b.v128[0]));
}
SIMD_INLINE c_v256 c_v256_ziphi_64(c_v256 a, c_v256 b) {
return c_v256_from_v128(c_v128_ziphi_64(a.v128[1], b.v128[1]),
c_v128_ziplo_64(a.v128[1], b.v128[1]));
}
SIMD_INLINE c_v256 c_v256_ziplo_128(c_v256 a, c_v256 b) {
return c_v256_from_v128(a.v128[0], b.v128[0]);
}
SIMD_INLINE c_v256 c_v256_ziphi_128(c_v256 a, c_v256 b) {
return c_v256_from_v128(a.v128[1], b.v128[1]);
}
SIMD_INLINE c_v256 c_v256_zip_8(c_v128 a, c_v128 b) {
return c_v256_from_v128(c_v128_ziphi_8(a, b), c_v128_ziplo_8(a, b));
}
SIMD_INLINE c_v256 c_v256_zip_16(c_v128 a, c_v128 b) {
return c_v256_from_v128(c_v128_ziphi_16(a, b), c_v128_ziplo_16(a, b));
}
SIMD_INLINE c_v256 c_v256_zip_32(c_v128 a, c_v128 b) {
return c_v256_from_v128(c_v128_ziphi_32(a, b), c_v128_ziplo_32(a, b));
}
SIMD_INLINE c_v256 _c_v256_unzip_8(c_v256 a, c_v256 b, int mode) {
c_v256 t;
int i;
if (mode) {
for (i = 0; i < 16; i++) {
t.u8[i] = a.u8[i * 2 + 1];
t.u8[i + 16] = b.u8[i * 2 + 1];
}
} else {
for (i = 0; i < 16; i++) {
t.u8[i] = b.u8[i * 2];
t.u8[i + 16] = a.u8[i * 2];
}
}
return t;
}
SIMD_INLINE c_v256 c_v256_unziplo_8(c_v256 a, c_v256 b) {
return CONFIG_BIG_ENDIAN ? _c_v256_unzip_8(a, b, 1)
: _c_v256_unzip_8(a, b, 0);
}
SIMD_INLINE c_v256 c_v256_unziphi_8(c_v256 a, c_v256 b) {
return CONFIG_BIG_ENDIAN ? _c_v256_unzip_8(b, a, 0)
: _c_v256_unzip_8(b, a, 1);
}
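/* For example, on a little-endian target c_v256_unziplo_8(a, b) returns the
even-indexed bytes { b0, b2, ..., b30, a0, a2, ..., a30 } and
c_v256_unziphi_8(a, b) the odd-indexed ones. */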
SIMD_INLINE c_v256 _c_v256_unzip_16(c_v256 a, c_v256 b, int mode) {
c_v256 t;
int i;
if (mode) {
for (i = 0; i < 8; i++) {
t.u16[i] = a.u16[i * 2 + 1];
t.u16[i + 8] = b.u16[i * 2 + 1];
}
} else {
for (i = 0; i < 8; i++) {
t.u16[i] = b.u16[i * 2];
t.u16[i + 8] = a.u16[i * 2];
}
}
return t;
}
SIMD_INLINE c_v256 c_v256_unziplo_16(c_v256 a, c_v256 b) {
return CONFIG_BIG_ENDIAN ? _c_v256_unzip_16(a, b, 1)
: _c_v256_unzip_16(a, b, 0);
}
SIMD_INLINE c_v256 c_v256_unziphi_16(c_v256 a, c_v256 b) {
return CONFIG_BIG_ENDIAN ? _c_v256_unzip_16(b, a, 0)
: _c_v256_unzip_16(b, a, 1);
}
SIMD_INLINE c_v256 _c_v256_unzip_32(c_v256 a, c_v256 b, int mode) {
c_v256 t;
if (mode) {
t.u32[7] = b.u32[7];
t.u32[6] = b.u32[5];
t.u32[5] = b.u32[3];
t.u32[4] = b.u32[1];
t.u32[3] = a.u32[7];
t.u32[2] = a.u32[5];
t.u32[1] = a.u32[3];
t.u32[0] = a.u32[1];
} else {
t.u32[7] = a.u32[6];
t.u32[6] = a.u32[4];
t.u32[5] = a.u32[2];
t.u32[4] = a.u32[0];
t.u32[3] = b.u32[6];
t.u32[2] = b.u32[4];
t.u32[1] = b.u32[2];
t.u32[0] = b.u32[0];
}
return t;
}
SIMD_INLINE c_v256 c_v256_unziplo_32(c_v256 a, c_v256 b) {
return CONFIG_BIG_ENDIAN ? _c_v256_unzip_32(a, b, 1)
: _c_v256_unzip_32(a, b, 0);
}
SIMD_INLINE c_v256 c_v256_unziphi_32(c_v256 a, c_v256 b) {
return CONFIG_BIG_ENDIAN ? _c_v256_unzip_32(b, a, 0)
: _c_v256_unzip_32(b, a, 1);
}
SIMD_INLINE c_v256 c_v256_unpack_u8_s16(c_v128 a) {
return c_v256_from_v128(c_v128_unpackhi_u8_s16(a), c_v128_unpacklo_u8_s16(a));
}
SIMD_INLINE c_v256 c_v256_unpacklo_u8_s16(c_v256 a) {
return c_v256_from_v128(c_v128_unpackhi_u8_s16(a.v128[0]),
c_v128_unpacklo_u8_s16(a.v128[0]));
}
SIMD_INLINE c_v256 c_v256_unpackhi_u8_s16(c_v256 a) {
return c_v256_from_v128(c_v128_unpackhi_u8_s16(a.v128[1]),
c_v128_unpacklo_u8_s16(a.v128[1]));
}
SIMD_INLINE c_v256 c_v256_pack_s32_s16(c_v256 a, c_v256 b) {
return c_v256_from_v128(c_v128_pack_s32_s16(a.v128[1], a.v128[0]),
c_v128_pack_s32_s16(b.v128[1], b.v128[0]));
}
SIMD_INLINE c_v256 c_v256_pack_s16_u8(c_v256 a, c_v256 b) {
return c_v256_from_v128(c_v128_pack_s16_u8(a.v128[1], a.v128[0]),
c_v128_pack_s16_u8(b.v128[1], b.v128[0]));
}
SIMD_INLINE c_v256 c_v256_pack_s16_s8(c_v256 a, c_v256 b) {
return c_v256_from_v128(c_v128_pack_s16_s8(a.v128[1], a.v128[0]),
c_v128_pack_s16_s8(b.v128[1], b.v128[0]));
}
SIMD_INLINE c_v256 c_v256_unpack_u16_s32(c_v128 a) {
return c_v256_from_v128(c_v128_unpackhi_u16_s32(a),
c_v128_unpacklo_u16_s32(a));
}
SIMD_INLINE c_v256 c_v256_unpack_s16_s32(c_v128 a) {
return c_v256_from_v128(c_v128_unpackhi_s16_s32(a),
c_v128_unpacklo_s16_s32(a));
}
SIMD_INLINE c_v256 c_v256_unpacklo_u16_s32(c_v256 a) {
return c_v256_from_v128(c_v128_unpackhi_u16_s32(a.v128[0]),
c_v128_unpacklo_u16_s32(a.v128[0]));
}
SIMD_INLINE c_v256 c_v256_unpacklo_s16_s32(c_v256 a) {
return c_v256_from_v128(c_v128_unpackhi_s16_s32(a.v128[0]),
c_v128_unpacklo_s16_s32(a.v128[0]));
}
SIMD_INLINE c_v256 c_v256_unpackhi_u16_s32(c_v256 a) {
return c_v256_from_v128(c_v128_unpackhi_u16_s32(a.v128[1]),
c_v128_unpacklo_u16_s32(a.v128[1]));
}
SIMD_INLINE c_v256 c_v256_unpackhi_s16_s32(c_v256 a) {
return c_v256_from_v128(c_v128_unpackhi_s16_s32(a.v128[1]),
c_v128_unpacklo_s16_s32(a.v128[1]));
}
SIMD_INLINE c_v256 c_v256_shuffle_8(c_v256 a, c_v256 pattern) {
c_v256 t;
int c;
for (c = 0; c < 32; c++) {
if (pattern.u8[c] & ~31) {
fprintf(stderr, "Undefined v256_shuffle_8 index %d/%d\n", pattern.u8[c],
c);
abort();
}
t.u8[c] = a.u8[CONFIG_BIG_ENDIAN ? 31 - (pattern.u8[c] & 31)
: pattern.u8[c] & 31];
}
return t;
}
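/* For example, a pattern of { 31, 30, ..., 1, 0 } reverses the byte order of
a on a little-endian target; any index outside 0..31 aborts in this
reference implementation. */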
// Pairwise / dual-lane shuffle: shuffle two 128 bit lanes.
SIMD_INLINE c_v256 c_v256_pshuffle_8(c_v256 a, c_v256 pattern) {
return c_v256_from_v128(
c_v128_shuffle_8(c_v256_high_v128(a), c_v256_high_v128(pattern)),
c_v128_shuffle_8(c_v256_low_v128(a), c_v256_low_v128(pattern)));
}
SIMD_INLINE c_v256 c_v256_cmpgt_s8(c_v256 a, c_v256 b) {
return c_v256_from_v128(c_v128_cmpgt_s8(a.v128[1], b.v128[1]),
c_v128_cmpgt_s8(a.v128[0], b.v128[0]));
}
SIMD_INLINE c_v256 c_v256_cmplt_s8(c_v256 a, c_v256 b) {
return c_v256_from_v128(c_v128_cmplt_s8(a.v128[1], b.v128[1]),
c_v128_cmplt_s8(a.v128[0], b.v128[0]));
}
SIMD_INLINE c_v256 c_v256_cmpeq_8(c_v256 a, c_v256 b) {
return c_v256_from_v128(c_v128_cmpeq_8(a.v128[1], b.v128[1]),
c_v128_cmpeq_8(a.v128[0], b.v128[0]));
}
SIMD_INLINE c_v256 c_v256_cmpgt_s16(c_v256 a, c_v256 b) {
return c_v256_from_v128(c_v128_cmpgt_s16(a.v128[1], b.v128[1]),
c_v128_cmpgt_s16(a.v128[0], b.v128[0]));
}
SIMD_INLINE c_v256 c_v256_cmplt_s16(c_v256 a, c_v256 b) {
return c_v256_from_v128(c_v128_cmplt_s16(a.v128[1], b.v128[1]),
c_v128_cmplt_s16(a.v128[0], b.v128[0]));
}
SIMD_INLINE c_v256 c_v256_cmpeq_16(c_v256 a, c_v256 b) {
return c_v256_from_v128(c_v128_cmpeq_16(a.v128[1], b.v128[1]),
c_v128_cmpeq_16(a.v128[0], b.v128[0]));
}
SIMD_INLINE c_v256 c_v256_shl_n_byte(c_v256 a, const unsigned int n) {
if (n < 16)
return c_v256_from_v128(c_v128_or(c_v128_shl_n_byte(a.v128[1], n),
c_v128_shr_n_byte(a.v128[0], 16 - n)),
c_v128_shl_n_byte(a.v128[0], n));
else if (n > 16)
return c_v256_from_v128(c_v128_shl_n_byte(a.v128[0], n - 16),
c_v128_zero());
else
return c_v256_from_v128(c_v256_low_v128(a), c_v128_zero());
}
SIMD_INLINE c_v256 c_v256_shr_n_byte(c_v256 a, const unsigned int n) {
if (n < 16)
return c_v256_from_v128(c_v128_shr_n_byte(a.v128[1], n),
c_v128_or(c_v128_shr_n_byte(a.v128[0], n),
c_v128_shl_n_byte(a.v128[1], 16 - n)));
else if (n > 16)
return c_v256_from_v128(c_v128_zero(),
c_v128_shr_n_byte(a.v128[1], n - 16));
else
return c_v256_from_v128(c_v128_zero(), c_v256_high_v128(a));
}
SIMD_INLINE c_v256 c_v256_align(c_v256 a, c_v256 b, const unsigned int c) {
if (simd_check && c > 31) {
fprintf(stderr, "Error: undefined alignment %u\n", c);
abort();
}
return c ? c_v256_or(c_v256_shr_n_byte(b, c), c_v256_shl_n_byte(a, 32 - c))
: b;
}
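/* For example, c_v256_align(a, b, 1) returns bytes 1..31 of b followed by
byte 0 of a, i.e. a 32 byte window starting one byte into the (a, b) pair. */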
SIMD_INLINE c_v256 c_v256_shl_8(c_v256 a, const unsigned int c) {
return c_v256_from_v128(c_v128_shl_8(a.v128[1], c),
c_v128_shl_8(a.v128[0], c));
}
SIMD_INLINE c_v256 c_v256_shr_u8(c_v256 a, const unsigned int c) {
return c_v256_from_v128(c_v128_shr_u8(a.v128[1], c),
c_v128_shr_u8(a.v128[0], c));
}
SIMD_INLINE c_v256 c_v256_shr_s8(c_v256 a, const unsigned int c) {
return c_v256_from_v128(c_v128_shr_s8(a.v128[1], c),
c_v128_shr_s8(a.v128[0], c));
}
SIMD_INLINE c_v256 c_v256_shl_16(c_v256 a, const unsigned int c) {
return c_v256_from_v128(c_v128_shl_16(a.v128[1], c),
c_v128_shl_16(a.v128[0], c));
}
SIMD_INLINE c_v256 c_v256_shr_u16(c_v256 a, const unsigned int c) {
return c_v256_from_v128(c_v128_shr_u16(a.v128[1], c),
c_v128_shr_u16(a.v128[0], c));
}
SIMD_INLINE c_v256 c_v256_shr_s16(c_v256 a, const unsigned int c) {
return c_v256_from_v128(c_v128_shr_s16(a.v128[1], c),
c_v128_shr_s16(a.v128[0], c));
}
SIMD_INLINE c_v256 c_v256_shl_32(c_v256 a, const unsigned int c) {
return c_v256_from_v128(c_v128_shl_32(a.v128[1], c),
c_v128_shl_32(a.v128[0], c));
}
SIMD_INLINE c_v256 c_v256_shr_u32(c_v256 a, const unsigned int c) {
return c_v256_from_v128(c_v128_shr_u32(a.v128[1], c),
c_v128_shr_u32(a.v128[0], c));
}
SIMD_INLINE c_v256 c_v256_shr_s32(c_v256 a, const unsigned int c) {
return c_v256_from_v128(c_v128_shr_s32(a.v128[1], c),
c_v128_shr_s32(a.v128[0], c));
}
SIMD_INLINE c_v256 c_v256_shl_n_8(c_v256 a, const unsigned int n) {
return c_v256_shl_8(a, n);
}
SIMD_INLINE c_v256 c_v256_shl_n_16(c_v256 a, const unsigned int n) {
return c_v256_shl_16(a, n);
}
SIMD_INLINE c_v256 c_v256_shl_n_32(c_v256 a, const unsigned int n) {
return c_v256_shl_32(a, n);
}
SIMD_INLINE c_v256 c_v256_shr_n_u8(c_v256 a, const unsigned int n) {
return c_v256_shr_u8(a, n);
}
SIMD_INLINE c_v256 c_v256_shr_n_u16(c_v256 a, const unsigned int n) {
return c_v256_shr_u16(a, n);
}
SIMD_INLINE c_v256 c_v256_shr_n_u32(c_v256 a, const unsigned int n) {
return c_v256_shr_u32(a, n);
}
SIMD_INLINE c_v256 c_v256_shr_n_s8(c_v256 a, const unsigned int n) {
return c_v256_shr_s8(a, n);
}
SIMD_INLINE c_v256 c_v256_shr_n_s16(c_v256 a, const unsigned int n) {
return c_v256_shr_s16(a, n);
}
SIMD_INLINE c_v256 c_v256_shr_n_s32(c_v256 a, const unsigned int n) {
return c_v256_shr_s32(a, n);
}
#endif /* _V256_INTRINSICS_C_H */


@@ -1,525 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifndef _V256_INTRINSICS_V128_H
#define _V256_INTRINSICS_V128_H
#if HAVE_NEON
#include "./v128_intrinsics_arm.h"
#elif HAVE_SSE2
#include "./v128_intrinsics_x86.h"
#else
#include "./v128_intrinsics.h"
#endif
typedef struct { v128 lo, hi; } v256;
SIMD_INLINE uint32_t v256_low_u32(v256 a) { return v128_low_u32(a.lo); }
SIMD_INLINE v64 v256_low_v64(v256 a) { return v128_low_v64(a.lo); }
SIMD_INLINE v128 v256_low_v128(v256 a) { return a.lo; }
SIMD_INLINE v128 v256_high_v128(v256 a) { return a.hi; }
SIMD_INLINE v256 v256_from_v128(v128 hi, v128 lo) {
v256 t;
t.hi = hi;
t.lo = lo;
return t;
}
SIMD_INLINE v256 v256_from_64(uint64_t a, uint64_t b, uint64_t c, uint64_t d) {
return v256_from_v128(v128_from_64(a, b), v128_from_64(c, d));
}
SIMD_INLINE v256 v256_from_v64(v64 a, v64 b, v64 c, v64 d) {
return v256_from_v128(v128_from_v64(a, b), v128_from_v64(c, d));
}
SIMD_INLINE v256 v256_load_unaligned(const void *p) {
return v256_from_v128(v128_load_unaligned((uint8_t *)p + 16),
v128_load_unaligned(p));
}
SIMD_INLINE v256 v256_load_aligned(const void *p) {
return v256_from_v128(v128_load_aligned((uint8_t *)p + 16),
v128_load_aligned(p));
}
SIMD_INLINE void v256_store_unaligned(void *p, v256 a) {
v128_store_unaligned(p, a.lo);
v128_store_unaligned((uint8_t *)p + 16, a.hi);
}
SIMD_INLINE void v256_store_aligned(void *p, v256 a) {
v128_store_aligned(p, a.lo);
v128_store_aligned((uint8_t *)p + 16, a.hi);
}
SIMD_INLINE v256 v256_zero() {
return v256_from_v128(v128_zero(), v128_zero());
}
SIMD_INLINE v256 v256_dup_8(uint8_t x) {
v128 t = v128_dup_8(x);
return v256_from_v128(t, t);
}
SIMD_INLINE v256 v256_dup_16(uint16_t x) {
v128 t = v128_dup_16(x);
return v256_from_v128(t, t);
}
SIMD_INLINE v256 v256_dup_32(uint32_t x) {
v128 t = v128_dup_32(x);
return v256_from_v128(t, t);
}
SIMD_INLINE int64_t v256_dotp_s16(v256 a, v256 b) {
return v128_dotp_s16(a.hi, b.hi) + v128_dotp_s16(a.lo, b.lo);
}
SIMD_INLINE uint64_t v256_hadd_u8(v256 a) {
return v128_hadd_u8(a.hi) + v128_hadd_u8(a.lo);
}
typedef struct {
sad128_internal hi;
sad128_internal lo;
} sad256_internal;
SIMD_INLINE sad256_internal v256_sad_u8_init() {
sad256_internal t;
t.hi = v128_sad_u8_init();
t.lo = v128_sad_u8_init();
return t;
}
/* Implementation dependent return value. Result must be finalised with
v256_sad_u8_sum().
The result for more than 16 v256_sad_u8() calls is undefined. */
SIMD_INLINE sad256_internal v256_sad_u8(sad256_internal s, v256 a, v256 b) {
sad256_internal t;
t.hi = v128_sad_u8(s.hi, a.hi, b.hi);
t.lo = v128_sad_u8(s.lo, a.lo, b.lo);
return t;
}
SIMD_INLINE uint32_t v256_sad_u8_sum(sad256_internal s) {
return v128_sad_u8_sum(s.hi) + v128_sad_u8_sum(s.lo);
}
typedef struct {
ssd128_internal hi;
ssd128_internal lo;
} ssd256_internal;
SIMD_INLINE ssd256_internal v256_ssd_u8_init() {
ssd256_internal t;
t.hi = v128_ssd_u8_init();
t.lo = v128_ssd_u8_init();
return t;
}
/* Implementation dependent return value. Result must be finalised with
* v256_ssd_u8_sum(). */
SIMD_INLINE ssd256_internal v256_ssd_u8(ssd256_internal s, v256 a, v256 b) {
ssd256_internal t;
t.hi = v128_ssd_u8(s.hi, a.hi, b.hi);
t.lo = v128_ssd_u8(s.lo, a.lo, b.lo);
return t;
}
SIMD_INLINE uint32_t v256_ssd_u8_sum(ssd256_internal s) {
return v128_ssd_u8_sum(s.hi) + v128_ssd_u8_sum(s.lo);
}
SIMD_INLINE v256 v256_or(v256 a, v256 b) {
return v256_from_v128(v128_or(a.hi, b.hi), v128_or(a.lo, b.lo));
}
SIMD_INLINE v256 v256_xor(v256 a, v256 b) {
return v256_from_v128(v128_xor(a.hi, b.hi), v128_xor(a.lo, b.lo));
}
SIMD_INLINE v256 v256_and(v256 a, v256 b) {
return v256_from_v128(v128_and(a.hi, b.hi), v128_and(a.lo, b.lo));
}
SIMD_INLINE v256 v256_andn(v256 a, v256 b) {
return v256_from_v128(v128_andn(a.hi, b.hi), v128_andn(a.lo, b.lo));
}
SIMD_INLINE v256 v256_add_8(v256 a, v256 b) {
return v256_from_v128(v128_add_8(a.hi, b.hi), v128_add_8(a.lo, b.lo));
}
SIMD_INLINE v256 v256_add_16(v256 a, v256 b) {
return v256_from_v128(v128_add_16(a.hi, b.hi), v128_add_16(a.lo, b.lo));
}
SIMD_INLINE v256 v256_sadd_s16(v256 a, v256 b) {
return v256_from_v128(v128_sadd_s16(a.hi, b.hi), v128_sadd_s16(a.lo, b.lo));
}
SIMD_INLINE v256 v256_add_32(v256 a, v256 b) {
return v256_from_v128(v128_add_32(a.hi, b.hi), v128_add_32(a.lo, b.lo));
}
SIMD_INLINE v256 v256_padd_s16(v256 a) {
return v256_from_v128(v128_padd_s16(a.hi), v128_padd_s16(a.lo));
}
SIMD_INLINE v256 v256_sub_8(v256 a, v256 b) {
return v256_from_v128(v128_sub_8(a.hi, b.hi), v128_sub_8(a.lo, b.lo));
}
SIMD_INLINE v256 v256_ssub_u8(v256 a, v256 b) {
return v256_from_v128(v128_ssub_u8(a.hi, b.hi), v128_ssub_u8(a.lo, b.lo));
}
SIMD_INLINE v256 v256_ssub_s8(v256 a, v256 b) {
return v256_from_v128(v128_ssub_s8(a.hi, b.hi), v128_ssub_s8(a.lo, b.lo));
}
SIMD_INLINE v256 v256_sub_16(v256 a, v256 b) {
return v256_from_v128(v128_sub_16(a.hi, b.hi), v128_sub_16(a.lo, b.lo));
}
SIMD_INLINE v256 v256_ssub_s16(v256 a, v256 b) {
return v256_from_v128(v128_ssub_s16(a.hi, b.hi), v128_ssub_s16(a.lo, b.lo));
}
SIMD_INLINE v256 v256_sub_32(v256 a, v256 b) {
return v256_from_v128(v128_sub_32(a.hi, b.hi), v128_sub_32(a.lo, b.lo));
}
SIMD_INLINE v256 v256_abs_s16(v256 a) {
return v256_from_v128(v128_abs_s16(a.hi), v128_abs_s16(a.lo));
}
SIMD_INLINE v256 v256_mul_s16(v128 a, v128 b) {
v128 lo_bits = v128_mullo_s16(a, b);
v128 hi_bits = v128_mulhi_s16(a, b);
return v256_from_v128(v128_ziphi_16(hi_bits, lo_bits),
v128_ziplo_16(hi_bits, lo_bits));
}
SIMD_INLINE v256 v256_mullo_s16(v256 a, v256 b) {
return v256_from_v128(v128_mullo_s16(a.hi, b.hi), v128_mullo_s16(a.lo, b.lo));
}
SIMD_INLINE v256 v256_mulhi_s16(v256 a, v256 b) {
return v256_from_v128(v128_mulhi_s16(a.hi, b.hi), v128_mulhi_s16(a.lo, b.lo));
}
SIMD_INLINE v256 v256_mullo_s32(v256 a, v256 b) {
return v256_from_v128(v128_mullo_s32(a.hi, b.hi), v128_mullo_s32(a.lo, b.lo));
}
SIMD_INLINE v256 v256_madd_s16(v256 a, v256 b) {
return v256_from_v128(v128_madd_s16(a.hi, b.hi), v128_madd_s16(a.lo, b.lo));
}
SIMD_INLINE v256 v256_madd_us8(v256 a, v256 b) {
return v256_from_v128(v128_madd_us8(a.hi, b.hi), v128_madd_us8(a.lo, b.lo));
}
SIMD_INLINE v256 v256_avg_u8(v256 a, v256 b) {
return v256_from_v128(v128_avg_u8(a.hi, b.hi), v128_avg_u8(a.lo, b.lo));
}
SIMD_INLINE v256 v256_rdavg_u8(v256 a, v256 b) {
return v256_from_v128(v128_rdavg_u8(a.hi, b.hi), v128_rdavg_u8(a.lo, b.lo));
}
SIMD_INLINE v256 v256_avg_u16(v256 a, v256 b) {
return v256_from_v128(v128_avg_u16(a.hi, b.hi), v128_avg_u16(a.lo, b.lo));
}
SIMD_INLINE v256 v256_min_u8(v256 a, v256 b) {
return v256_from_v128(v128_min_u8(a.hi, b.hi), v128_min_u8(a.lo, b.lo));
}
SIMD_INLINE v256 v256_max_u8(v256 a, v256 b) {
return v256_from_v128(v128_max_u8(a.hi, b.hi), v128_max_u8(a.lo, b.lo));
}
SIMD_INLINE v256 v256_min_s8(v256 a, v256 b) {
return v256_from_v128(v128_min_s8(a.hi, b.hi), v128_min_s8(a.lo, b.lo));
}
SIMD_INLINE v256 v256_max_s8(v256 a, v256 b) {
return v256_from_v128(v128_max_s8(a.hi, b.hi), v128_max_s8(a.lo, b.lo));
}
SIMD_INLINE v256 v256_min_s16(v256 a, v256 b) {
return v256_from_v128(v128_min_s16(a.hi, b.hi), v128_min_s16(a.lo, b.lo));
}
SIMD_INLINE v256 v256_max_s16(v256 a, v256 b) {
return v256_from_v128(v128_max_s16(a.hi, b.hi), v128_max_s16(a.lo, b.lo));
}
SIMD_INLINE v256 v256_ziplo_8(v256 a, v256 b) {
return v256_from_v128(v128_ziphi_8(a.lo, b.lo), v128_ziplo_8(a.lo, b.lo));
}
SIMD_INLINE v256 v256_ziphi_8(v256 a, v256 b) {
return v256_from_v128(v128_ziphi_8(a.hi, b.hi), v128_ziplo_8(a.hi, b.hi));
}
SIMD_INLINE v256 v256_ziplo_16(v256 a, v256 b) {
return v256_from_v128(v128_ziphi_16(a.lo, b.lo), v128_ziplo_16(a.lo, b.lo));
}
SIMD_INLINE v256 v256_ziphi_16(v256 a, v256 b) {
return v256_from_v128(v128_ziphi_16(a.hi, b.hi), v128_ziplo_16(a.hi, b.hi));
}
SIMD_INLINE v256 v256_ziplo_32(v256 a, v256 b) {
return v256_from_v128(v128_ziphi_32(a.lo, b.lo), v128_ziplo_32(a.lo, b.lo));
}
SIMD_INLINE v256 v256_ziphi_32(v256 a, v256 b) {
return v256_from_v128(v128_ziphi_32(a.hi, b.hi), v128_ziplo_32(a.hi, b.hi));
}
SIMD_INLINE v256 v256_ziplo_64(v256 a, v256 b) {
return v256_from_v128(v128_ziphi_64(a.lo, b.lo), v128_ziplo_64(a.lo, b.lo));
}
SIMD_INLINE v256 v256_ziphi_64(v256 a, v256 b) {
return v256_from_v128(v128_ziphi_64(a.hi, b.hi), v128_ziplo_64(a.hi, b.hi));
}
SIMD_INLINE v256 v256_ziplo_128(v256 a, v256 b) {
return v256_from_v128(a.lo, b.lo);
}
SIMD_INLINE v256 v256_ziphi_128(v256 a, v256 b) {
return v256_from_v128(a.hi, b.hi);
}
SIMD_INLINE v256 v256_zip_8(v128 a, v128 b) {
return v256_from_v128(v128_ziphi_8(a, b), v128_ziplo_8(a, b));
}
SIMD_INLINE v256 v256_zip_16(v128 a, v128 b) {
return v256_from_v128(v128_ziphi_16(a, b), v128_ziplo_16(a, b));
}
SIMD_INLINE v256 v256_zip_32(v128 a, v128 b) {
return v256_from_v128(v128_ziphi_32(a, b), v128_ziplo_32(a, b));
}
SIMD_INLINE v256 v256_unziplo_8(v256 a, v256 b) {
return v256_from_v128(v128_unziplo_8(a.hi, a.lo), v128_unziplo_8(b.hi, b.lo));
}
SIMD_INLINE v256 v256_unziphi_8(v256 a, v256 b) {
return v256_from_v128(v128_unziphi_8(a.hi, a.lo), v128_unziphi_8(b.hi, b.lo));
}
SIMD_INLINE v256 v256_unziplo_16(v256 a, v256 b) {
return v256_from_v128(v128_unziplo_16(a.hi, a.lo),
v128_unziplo_16(b.hi, b.lo));
}
SIMD_INLINE v256 v256_unziphi_16(v256 a, v256 b) {
return v256_from_v128(v128_unziphi_16(a.hi, a.lo),
v128_unziphi_16(b.hi, b.lo));
}
SIMD_INLINE v256 v256_unziplo_32(v256 a, v256 b) {
return v256_from_v128(v128_unziplo_32(a.hi, a.lo),
v128_unziplo_32(b.hi, b.lo));
}
SIMD_INLINE v256 v256_unziphi_32(v256 a, v256 b) {
return v256_from_v128(v128_unziphi_32(a.hi, a.lo),
v128_unziphi_32(b.hi, b.lo));
}
SIMD_INLINE v256 v256_unpack_u8_s16(v128 a) {
return v256_from_v128(v128_unpackhi_u8_s16(a), v128_unpacklo_u8_s16(a));
}
SIMD_INLINE v256 v256_unpacklo_u8_s16(v256 a) {
return v256_from_v128(v128_unpackhi_u8_s16(a.lo), v128_unpacklo_u8_s16(a.lo));
}
SIMD_INLINE v256 v256_unpackhi_u8_s16(v256 a) {
return v256_from_v128(v128_unpackhi_u8_s16(a.hi), v128_unpacklo_u8_s16(a.hi));
}
SIMD_INLINE v256 v256_pack_s32_s16(v256 a, v256 b) {
return v256_from_v128(v128_pack_s32_s16(a.hi, a.lo),
v128_pack_s32_s16(b.hi, b.lo));
}
SIMD_INLINE v256 v256_pack_s16_u8(v256 a, v256 b) {
return v256_from_v128(v128_pack_s16_u8(a.hi, a.lo),
v128_pack_s16_u8(b.hi, b.lo));
}
SIMD_INLINE v256 v256_pack_s16_s8(v256 a, v256 b) {
return v256_from_v128(v128_pack_s16_s8(a.hi, a.lo),
v128_pack_s16_s8(b.hi, b.lo));
}
SIMD_INLINE v256 v256_unpack_u16_s32(v128 a) {
return v256_from_v128(v128_unpackhi_u16_s32(a), v128_unpacklo_u16_s32(a));
}
SIMD_INLINE v256 v256_unpack_s16_s32(v128 a) {
return v256_from_v128(v128_unpackhi_s16_s32(a), v128_unpacklo_s16_s32(a));
}
SIMD_INLINE v256 v256_unpacklo_u16_s32(v256 a) {
return v256_from_v128(v128_unpackhi_u16_s32(a.lo),
v128_unpacklo_u16_s32(a.lo));
}
SIMD_INLINE v256 v256_unpacklo_s16_s32(v256 a) {
return v256_from_v128(v128_unpackhi_s16_s32(a.lo),
v128_unpacklo_s16_s32(a.lo));
}
SIMD_INLINE v256 v256_unpackhi_u16_s32(v256 a) {
return v256_from_v128(v128_unpackhi_u16_s32(a.hi),
v128_unpacklo_u16_s32(a.hi));
}
SIMD_INLINE v256 v256_unpackhi_s16_s32(v256 a) {
return v256_from_v128(v128_unpackhi_s16_s32(a.hi),
v128_unpacklo_s16_s32(a.hi));
}
SIMD_INLINE v256 v256_shuffle_8(v256 a, v256 pattern) {
v128 c16 = v128_dup_8(16);
v128 maskhi = v128_cmplt_s8(pattern.hi, c16);
v128 masklo = v128_cmplt_s8(pattern.lo, c16);
return v256_from_v128(
v128_or(
v128_and(v128_shuffle_8(a.lo, pattern.hi), maskhi),
v128_andn(v128_shuffle_8(a.hi, v128_sub_8(pattern.hi, c16)), maskhi)),
v128_or(v128_and(v128_shuffle_8(a.lo, pattern.lo), masklo),
v128_andn(v128_shuffle_8(a.hi, v128_sub_8(pattern.lo, c16)),
masklo)));
}
SIMD_INLINE v256 v256_pshuffle_8(v256 a, v256 pattern) {
return v256_from_v128(
v128_shuffle_8(v256_high_v128(a), v256_high_v128(pattern)),
v128_shuffle_8(v256_low_v128(a), v256_low_v128(pattern)));
}
SIMD_INLINE v256 v256_cmpgt_s8(v256 a, v256 b) {
return v256_from_v128(v128_cmpgt_s8(a.hi, b.hi), v128_cmpgt_s8(a.lo, b.lo));
}
SIMD_INLINE v256 v256_cmplt_s8(v256 a, v256 b) {
return v256_from_v128(v128_cmplt_s8(a.hi, b.hi), v128_cmplt_s8(a.lo, b.lo));
}
SIMD_INLINE v256 v256_cmpeq_8(v256 a, v256 b) {
return v256_from_v128(v128_cmpeq_8(a.hi, b.hi), v128_cmpeq_8(a.lo, b.lo));
}
SIMD_INLINE v256 v256_cmpgt_s16(v256 a, v256 b) {
return v256_from_v128(v128_cmpgt_s16(a.hi, b.hi), v128_cmpgt_s16(a.lo, b.lo));
}
SIMD_INLINE v256 v256_cmplt_s16(v256 a, v256 b) {
return v256_from_v128(v128_cmplt_s16(a.hi, b.hi), v128_cmplt_s16(a.lo, b.lo));
}
SIMD_INLINE v256 v256_cmpeq_16(v256 a, v256 b) {
return v256_from_v128(v128_cmpeq_16(a.hi, b.hi), v128_cmpeq_16(a.lo, b.lo));
}
SIMD_INLINE v256 v256_shl_8(v256 a, const unsigned int c) {
return v256_from_v128(v128_shl_8(a.hi, c), v128_shl_8(a.lo, c));
}
SIMD_INLINE v256 v256_shr_u8(v256 a, const unsigned int c) {
return v256_from_v128(v128_shr_u8(a.hi, c), v128_shr_u8(a.lo, c));
}
SIMD_INLINE v256 v256_shr_s8(v256 a, const unsigned int c) {
return v256_from_v128(v128_shr_s8(a.hi, c), v128_shr_s8(a.lo, c));
}
SIMD_INLINE v256 v256_shl_16(v256 a, const unsigned int c) {
return v256_from_v128(v128_shl_16(a.hi, c), v128_shl_16(a.lo, c));
}
SIMD_INLINE v256 v256_shr_u16(v256 a, const unsigned int c) {
return v256_from_v128(v128_shr_u16(a.hi, c), v128_shr_u16(a.lo, c));
}
SIMD_INLINE v256 v256_shr_s16(v256 a, const unsigned int c) {
return v256_from_v128(v128_shr_s16(a.hi, c), v128_shr_s16(a.lo, c));
}
SIMD_INLINE v256 v256_shl_32(v256 a, const unsigned int c) {
return v256_from_v128(v128_shl_32(a.hi, c), v128_shl_32(a.lo, c));
}
SIMD_INLINE v256 v256_shr_u32(v256 a, const unsigned int c) {
return v256_from_v128(v128_shr_u32(a.hi, c), v128_shr_u32(a.lo, c));
}
SIMD_INLINE v256 v256_shr_s32(v256 a, const unsigned int c) {
return v256_from_v128(v128_shr_s32(a.hi, c), v128_shr_s32(a.lo, c));
}
/* These intrinsics require immediate values, so we must use #defines
to enforce that. */
#define v256_shl_n_byte(a, n) \
((n) < 16 ? v256_from_v128(v128_or(v128_shl_n_byte(a.hi, n), \
v128_shr_n_byte(a.lo, 16 - (n))), \
v128_shl_n_byte(a.lo, (n))) \
: v256_from_v128((n) > 16 ? v128_shl_n_byte(a.lo, (n)-16) : a.lo, \
v128_zero()))
#define v256_shr_n_byte(a, n) \
((n) < 16 ? v256_from_v128(v128_shr_n_byte(a.hi, n), \
v128_or(v128_shr_n_byte(a.lo, n), \
v128_shl_n_byte(a.hi, 16 - (n)))) \
: v256_from_v128(v128_zero(), \
(n) > 16 ? v128_shr_n_byte(a.hi, (n)-16) : a.hi))
#define v256_align(a, b, c) \
((c) ? v256_or(v256_shr_n_byte(b, c), v256_shl_n_byte(a, 32 - (c))) : b)
#define v256_shl_n_8(a, n) \
v256_from_v128(v128_shl_n_8(a.hi, n), v128_shl_n_8(a.lo, n))
#define v256_shl_n_16(a, n) \
v256_from_v128(v128_shl_n_16(a.hi, n), v128_shl_n_16(a.lo, n))
#define v256_shl_n_32(a, n) \
v256_from_v128(v128_shl_n_32(a.hi, n), v128_shl_n_32(a.lo, n))
#define v256_shr_n_u8(a, n) \
v256_from_v128(v128_shr_n_u8(a.hi, n), v128_shr_n_u8(a.lo, n))
#define v256_shr_n_u16(a, n) \
v256_from_v128(v128_shr_n_u16(a.hi, n), v128_shr_n_u16(a.lo, n))
#define v256_shr_n_u32(a, n) \
v256_from_v128(v128_shr_n_u32(a.hi, n), v128_shr_n_u32(a.lo, n))
#define v256_shr_n_s8(a, n) \
v256_from_v128(v128_shr_n_s8(a.hi, n), v128_shr_n_s8(a.lo, n))
#define v256_shr_n_s16(a, n) \
v256_from_v128(v128_shr_n_s16(a.hi, n), v128_shr_n_s16(a.lo, n))
#define v256_shr_n_s32(a, n) \
v256_from_v128(v128_shr_n_s32(a.hi, n), v128_shr_n_s32(a.lo, n))
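/* Example (illustrative): the count handed to the _n_ forms must be a
compile-time constant, e.g. v256_shl_n_16(x, 2); a count only known at run
time must use the plain v256_shl_16(x, bits) form above instead. */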
#endif /* _V256_INTRINSICS_V128_H */


@@ -1,528 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifndef _V256_INTRINSICS_H
#define _V256_INTRINSICS_H
#if !defined(__AVX2__)
#include "./v256_intrinsics_v128.h"
#else
// The __m256i type seems to cause problems for g++'s mangling prior to
// version 5, but adding -fabi-version=0 fixes this.
#if !defined(__clang__) && __GNUC__ < 5 && defined(__AVX2__) && \
defined(__cplusplus)
#pragma GCC optimize "-fabi-version=0"
#endif
#include <immintrin.h>
#include "./v128_intrinsics_x86.h"
typedef __m256i v256;
SIMD_INLINE uint32_t v256_low_u32(v256 a) {
return (uint32_t)_mm_cvtsi128_si32(_mm256_extracti128_si256(a, 0));
}
SIMD_INLINE v64 v256_low_v64(v256 a) {
return _mm_unpacklo_epi64(_mm256_extracti128_si256(a, 0), v64_zero());
}
SIMD_INLINE v128 v256_low_v128(v256 a) {
return _mm256_extracti128_si256(a, 0);
}
SIMD_INLINE v128 v256_high_v128(v256 a) {
return _mm256_extracti128_si256(a, 1);
}
SIMD_INLINE v256 v256_from_v128(v128 a, v128 b) {
// gcc seems to be missing _mm256_set_m128i()
return _mm256_insertf128_si256(
_mm256_insertf128_si256(_mm256_setzero_si256(), b, 0), a, 1);
}
SIMD_INLINE v256 v256_from_v64(v64 a, v64 b, v64 c, v64 d) {
return v256_from_v128(v128_from_v64(a, b), v128_from_v64(c, d));
}
SIMD_INLINE v256 v256_from_64(uint64_t a, uint64_t b, uint64_t c, uint64_t d) {
return v256_from_v128(v128_from_64(a, b), v128_from_64(c, d));
}
SIMD_INLINE v256 v256_load_aligned(const void *p) {
return _mm256_load_si256((const __m256i *)p);
}
SIMD_INLINE v256 v256_load_unaligned(const void *p) {
return _mm256_loadu_si256((const __m256i *)p);
}
SIMD_INLINE void v256_store_aligned(void *p, v256 a) {
_mm256_store_si256((__m256i *)p, a);
}
SIMD_INLINE void v256_store_unaligned(void *p, v256 a) {
_mm256_storeu_si256((__m256i *)p, a);
}
SIMD_INLINE v256 v256_zero() { return _mm256_setzero_si256(); }
SIMD_INLINE v256 v256_dup_8(uint8_t x) { return _mm256_set1_epi8(x); }
SIMD_INLINE v256 v256_dup_16(uint16_t x) { return _mm256_set1_epi16(x); }
SIMD_INLINE v256 v256_dup_32(uint32_t x) { return _mm256_set1_epi32(x); }
SIMD_INLINE v256 v256_add_8(v256 a, v256 b) { return _mm256_add_epi8(a, b); }
SIMD_INLINE v256 v256_add_16(v256 a, v256 b) { return _mm256_add_epi16(a, b); }
SIMD_INLINE v256 v256_sadd_s16(v256 a, v256 b) {
return _mm256_adds_epi16(a, b);
}
SIMD_INLINE v256 v256_add_32(v256 a, v256 b) { return _mm256_add_epi32(a, b); }
SIMD_INLINE v256 v256_padd_s16(v256 a) {
return _mm256_madd_epi16(a, _mm256_set1_epi16(1));
}
SIMD_INLINE v256 v256_sub_8(v256 a, v256 b) { return _mm256_sub_epi8(a, b); }
SIMD_INLINE v256 v256_ssub_u8(v256 a, v256 b) { return _mm256_subs_epu8(a, b); }
SIMD_INLINE v256 v256_ssub_s8(v256 a, v256 b) { return _mm256_subs_epi8(a, b); }
SIMD_INLINE v256 v256_sub_16(v256 a, v256 b) { return _mm256_sub_epi16(a, b); }
SIMD_INLINE v256 v256_ssub_s16(v256 a, v256 b) {
return _mm256_subs_epi16(a, b);
}
SIMD_INLINE v256 v256_sub_32(v256 a, v256 b) { return _mm256_sub_epi32(a, b); }
SIMD_INLINE v256 v256_abs_s16(v256 a) { return _mm256_abs_epi16(a); }
// AVX doesn't have the direct intrinsics to zip/unzip 8, 16, 32 bit
// lanes of lower or upper halves of a 256bit vector because the
// unpack/pack intrinsics operate on the 256 bit input vector as 2
// independent 128 bit vectors.
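// For example, _mm256_unpacklo_epi8(a, b) interleaves the low eight bytes of
// each 128 bit lane separately rather than the low 16 bytes of the whole
// vector, so the helpers below recombine explicit 128 bit halves instead.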
SIMD_INLINE v256 v256_ziplo_8(v256 a, v256 b) {
return v256_from_v128(v128_ziphi_8(v256_low_v128(a), v256_low_v128(b)),
v128_ziplo_8(v256_low_v128(a), v256_low_v128(b)));
}
SIMD_INLINE v256 v256_ziphi_8(v256 a, v256 b) {
return v256_from_v128(v128_ziphi_8(v256_high_v128(a), v256_high_v128(b)),
v128_ziplo_8(v256_high_v128(a), v256_high_v128(b)));
}
SIMD_INLINE v256 v256_ziplo_16(v256 a, v256 b) {
return v256_from_v128(v128_ziphi_16(v256_low_v128(a), v256_low_v128(b)),
v128_ziplo_16(v256_low_v128(a), v256_low_v128(b)));
}
SIMD_INLINE v256 v256_ziphi_16(v256 a, v256 b) {
return v256_from_v128(v128_ziphi_16(v256_high_v128(a), v256_high_v128(b)),
v128_ziplo_16(v256_high_v128(a), v256_high_v128(b)));
}
SIMD_INLINE v256 v256_ziplo_32(v256 a, v256 b) {
return v256_from_v128(v128_ziphi_32(v256_low_v128(a), v256_low_v128(b)),
v128_ziplo_32(v256_low_v128(a), v256_low_v128(b)));
}
SIMD_INLINE v256 v256_ziphi_32(v256 a, v256 b) {
return v256_from_v128(v128_ziphi_32(v256_high_v128(a), v256_high_v128(b)),
v128_ziplo_32(v256_high_v128(a), v256_high_v128(b)));
}
SIMD_INLINE v256 v256_ziplo_64(v256 a, v256 b) {
return v256_from_v128(v128_ziphi_64(v256_low_v128(a), v256_low_v128(b)),
v128_ziplo_64(v256_low_v128(a), v256_low_v128(b)));
}
SIMD_INLINE v256 v256_ziphi_64(v256 a, v256 b) {
return v256_from_v128(v128_ziphi_64(v256_high_v128(a), v256_high_v128(b)),
v128_ziplo_64(v256_high_v128(a), v256_high_v128(b)));
}
SIMD_INLINE v256 v256_ziplo_128(v256 a, v256 b) {
return v256_from_v128(v256_low_v128(a), v256_low_v128(b));
}
SIMD_INLINE v256 v256_ziphi_128(v256 a, v256 b) {
return v256_from_v128(v256_high_v128(a), v256_high_v128(b));
}
SIMD_INLINE v256 v256_zip_8(v128 a, v128 b) {
return v256_from_v128(v128_ziphi_8(a, b), v128_ziplo_8(a, b));
}
SIMD_INLINE v256 v256_zip_16(v128 a, v128 b) {
return v256_from_v128(v128_ziphi_16(a, b), v128_ziplo_16(a, b));
}
SIMD_INLINE v256 v256_zip_32(v128 a, v128 b) {
return v256_from_v128(v128_ziphi_32(a, b), v128_ziplo_32(a, b));
}
SIMD_INLINE v256 v256_unziplo_8(v256 a, v256 b) {
return v256_from_v128(v128_unziplo_8(v256_high_v128(a), v256_low_v128(a)),
v128_unziplo_8(v256_high_v128(b), v256_low_v128(b)));
}
SIMD_INLINE v256 v256_unziphi_8(v256 a, v256 b) {
return v256_from_v128(v128_unziphi_8(v256_high_v128(a), v256_low_v128(a)),
v128_unziphi_8(v256_high_v128(b), v256_low_v128(b)));
}
SIMD_INLINE v256 v256_unziplo_16(v256 a, v256 b) {
return v256_from_v128(v128_unziplo_16(v256_high_v128(a), v256_low_v128(a)),
v128_unziplo_16(v256_high_v128(b), v256_low_v128(b)));
}
SIMD_INLINE v256 v256_unziphi_16(v256 a, v256 b) {
return v256_from_v128(v128_unziphi_16(v256_high_v128(a), v256_low_v128(a)),
v128_unziphi_16(v256_high_v128(b), v256_low_v128(b)));
}
SIMD_INLINE v256 v256_unziplo_32(v256 a, v256 b) {
return v256_from_v128(v128_unziplo_32(v256_high_v128(a), v256_low_v128(a)),
v128_unziplo_32(v256_high_v128(b), v256_low_v128(b)));
}
SIMD_INLINE v256 v256_unziphi_32(v256 a, v256 b) {
return v256_from_v128(v128_unziphi_32(v256_high_v128(a), v256_low_v128(a)),
v128_unziphi_32(v256_high_v128(b), v256_low_v128(b)));
}
SIMD_INLINE v256 v256_unpack_u8_s16(v128 a) {
return v256_from_v128(v128_unpackhi_u8_s16(a), v128_unpacklo_u8_s16(a));
}
SIMD_INLINE v256 v256_unpacklo_u8_s16(v256 a) {
return v256_from_v128(v128_unpackhi_u8_s16(v256_low_v128(a)),
v128_unpacklo_u8_s16(v256_low_v128(a)));
}
SIMD_INLINE v256 v256_unpackhi_u8_s16(v256 a) {
return v256_from_v128(v128_unpackhi_u8_s16(v256_high_v128(a)),
v128_unpacklo_u8_s16(v256_high_v128(a)));
}
SIMD_INLINE v256 v256_pack_s32_s16(v256 a, v256 b) {
return v256_from_v128(v128_pack_s32_s16(v256_high_v128(a), v256_low_v128(a)),
v128_pack_s32_s16(v256_high_v128(b), v256_low_v128(b)));
}
SIMD_INLINE v256 v256_pack_s16_u8(v256 a, v256 b) {
return v256_from_v128(v128_pack_s16_u8(v256_high_v128(a), v256_low_v128(a)),
v128_pack_s16_u8(v256_high_v128(b), v256_low_v128(b)));
}
SIMD_INLINE v256 v256_pack_s16_s8(v256 a, v256 b) {
return v256_from_v128(v128_pack_s16_s8(v256_high_v128(a), v256_low_v128(a)),
v128_pack_s16_s8(v256_high_v128(b), v256_low_v128(b)));
}
SIMD_INLINE v256 v256_unpack_u16_s32(v128 a) {
return v256_from_v128(v128_unpackhi_u16_s32(a), v128_unpacklo_u16_s32(a));
}
SIMD_INLINE v256 v256_unpack_s16_s32(v128 a) {
return v256_from_v128(v128_unpackhi_s16_s32(a), v128_unpacklo_s16_s32(a));
}
SIMD_INLINE v256 v256_unpacklo_u16_s32(v256 a) {
return v256_from_v128(v128_unpackhi_u16_s32(v256_low_v128(a)),
v128_unpacklo_u16_s32(v256_low_v128(a)));
}
SIMD_INLINE v256 v256_unpacklo_s16_s32(v256 a) {
return v256_from_v128(v128_unpackhi_s16_s32(v256_low_v128(a)),
v128_unpacklo_s16_s32(v256_low_v128(a)));
}
SIMD_INLINE v256 v256_unpackhi_u16_s32(v256 a) {
return v256_from_v128(v128_unpackhi_u16_s32(v256_high_v128(a)),
v128_unpacklo_u16_s32(v256_high_v128(a)));
}
SIMD_INLINE v256 v256_unpackhi_s16_s32(v256 a) {
return v256_from_v128(v128_unpackhi_s16_s32(v256_high_v128(a)),
v128_unpacklo_s16_s32(v256_high_v128(a)));
}
SIMD_INLINE v256 v256_shuffle_8(v256 a, v256 pattern) {
v128 c16 = v128_dup_8(16);
v128 hi = v256_high_v128(pattern);
v128 lo = v256_low_v128(pattern);
v128 maskhi = v128_cmplt_s8(hi, c16);
v128 masklo = v128_cmplt_s8(lo, c16);
return v256_from_v128(
v128_or(v128_and(v128_shuffle_8(v256_low_v128(a), hi), maskhi),
v128_andn(v128_shuffle_8(v256_high_v128(a), v128_sub_8(hi, c16)),
maskhi)),
v128_or(v128_and(v128_shuffle_8(v256_low_v128(a), lo), masklo),
v128_andn(v128_shuffle_8(v256_high_v128(a), v128_sub_8(lo, c16)),
masklo)));
}
SIMD_INLINE v256 v256_pshuffle_8(v256 a, v256 pattern) {
return _mm256_shuffle_epi8(a, pattern);
}
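// (_mm256_shuffle_epi8 shuffles each 128 bit lane independently, which is
// exactly the pairwise semantics required here.)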
SIMD_INLINE int64_t v256_dotp_s16(v256 a, v256 b) {
v256 r = _mm256_madd_epi16(a, b);
#if defined(__x86_64__)
v128 t;
r = _mm256_add_epi64(_mm256_cvtepi32_epi64(v256_high_v128(r)),
_mm256_cvtepi32_epi64(v256_low_v128(r)));
t = v256_low_v128(_mm256_add_epi64(
r, _mm256_permute2x128_si256(r, r, _MM_SHUFFLE(2, 0, 0, 1))));
return _mm_cvtsi128_si64(_mm_add_epi64(t, _mm_srli_si128(t, 8)));
#else
v128 l = v256_low_v128(r);
v128 h = v256_high_v128(r);
return (int64_t)_mm_cvtsi128_si32(l) +
(int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 4)) +
(int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 8)) +
(int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 12)) +
(int64_t)_mm_cvtsi128_si32(h) +
(int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 4)) +
(int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 8)) +
(int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 12));
#endif
}
SIMD_INLINE uint64_t v256_hadd_u8(v256 a) {
v256 t = _mm256_sad_epu8(a, _mm256_setzero_si256());
v128 lo = v256_low_v128(t);
v128 hi = v256_high_v128(t);
lo = v128_add_32(lo, hi);
return v64_low_u32(v128_low_v64(lo)) + v64_low_u32(v128_high_v64(lo));
}
typedef v256 sad256_internal;
SIMD_INLINE sad256_internal v256_sad_u8_init() {
return _mm256_setzero_si256();
}
/* Implementation dependent return value. Result must be finalised with
v256_sad_u8_sum().
The result for more than 32 v256_sad_u8() calls is undefined. */
SIMD_INLINE sad256_internal v256_sad_u8(sad256_internal s, v256 a, v256 b) {
return _mm256_add_epi64(s, _mm256_sad_epu8(a, b));
}
SIMD_INLINE uint32_t v256_sad_u8_sum(sad256_internal s) {
v256 t = _mm256_add_epi32(s, _mm256_unpackhi_epi64(s, s));
return v128_low_u32(_mm_add_epi32(v256_high_v128(t), v256_low_v128(t)));
}
typedef v256 ssd256_internal;
SIMD_INLINE ssd256_internal v256_ssd_u8_init() {
return _mm256_setzero_si256();
}
/* Implementation dependent return value. Result must be finalised with
* v256_ssd_u8_sum(). */
SIMD_INLINE ssd256_internal v256_ssd_u8(ssd256_internal s, v256 a, v256 b) {
v256 l = _mm256_sub_epi16(_mm256_unpacklo_epi8(a, _mm256_setzero_si256()),
_mm256_unpacklo_epi8(b, _mm256_setzero_si256()));
v256 h = _mm256_sub_epi16(_mm256_unpackhi_epi8(a, _mm256_setzero_si256()),
_mm256_unpackhi_epi8(b, _mm256_setzero_si256()));
v256 rl = _mm256_madd_epi16(l, l);
v256 rh = _mm256_madd_epi16(h, h);
v128 c = _mm_cvtsi32_si128(32);
rl = _mm256_add_epi32(rl, _mm256_srli_si256(rl, 8));
rl = _mm256_add_epi32(rl, _mm256_srli_si256(rl, 4));
rh = _mm256_add_epi32(rh, _mm256_srli_si256(rh, 8));
rh = _mm256_add_epi32(rh, _mm256_srli_si256(rh, 4));
return _mm256_add_epi64(
s,
_mm256_srl_epi64(_mm256_sll_epi64(_mm256_unpacklo_epi64(rl, rh), c), c));
}
SIMD_INLINE uint32_t v256_ssd_u8_sum(ssd256_internal s) {
v256 t = _mm256_add_epi32(s, _mm256_unpackhi_epi64(s, s));
return v128_low_u32(_mm_add_epi32(v256_high_v128(t), v256_low_v128(t)));
}
SIMD_INLINE v256 v256_or(v256 a, v256 b) { return _mm256_or_si256(a, b); }
SIMD_INLINE v256 v256_xor(v256 a, v256 b) { return _mm256_xor_si256(a, b); }
SIMD_INLINE v256 v256_and(v256 a, v256 b) { return _mm256_and_si256(a, b); }
SIMD_INLINE v256 v256_andn(v256 a, v256 b) { return _mm256_andnot_si256(b, a); }
SIMD_INLINE v256 v256_mul_s16(v64 a, v64 b) {
v128 lo_bits = v128_mullo_s16(a, b);
v128 hi_bits = v128_mulhi_s16(a, b);
return v256_from_v128(v128_ziphi_16(hi_bits, lo_bits),
v128_ziplo_16(hi_bits, lo_bits));
}
SIMD_INLINE v256 v256_mullo_s16(v256 a, v256 b) {
return _mm256_mullo_epi16(a, b);
}
SIMD_INLINE v256 v256_mulhi_s16(v256 a, v256 b) {
return _mm256_mulhi_epi16(a, b);
}
SIMD_INLINE v256 v256_mullo_s32(v256 a, v256 b) {
return _mm256_mullo_epi32(a, b);
}
SIMD_INLINE v256 v256_madd_s16(v256 a, v256 b) {
return _mm256_madd_epi16(a, b);
}
SIMD_INLINE v256 v256_madd_us8(v256 a, v256 b) {
return _mm256_maddubs_epi16(a, b);
}
SIMD_INLINE v256 v256_avg_u8(v256 a, v256 b) { return _mm256_avg_epu8(a, b); }
SIMD_INLINE v256 v256_rdavg_u8(v256 a, v256 b) {
return _mm256_sub_epi8(
_mm256_avg_epu8(a, b),
_mm256_and_si256(_mm256_xor_si256(a, b), v256_dup_8(1)));
}
SIMD_INLINE v256 v256_avg_u16(v256 a, v256 b) { return _mm256_avg_epu16(a, b); }
SIMD_INLINE v256 v256_min_u8(v256 a, v256 b) { return _mm256_min_epu8(a, b); }
SIMD_INLINE v256 v256_max_u8(v256 a, v256 b) { return _mm256_max_epu8(a, b); }
SIMD_INLINE v256 v256_min_s8(v256 a, v256 b) { return _mm256_min_epi8(a, b); }
SIMD_INLINE v256 v256_max_s8(v256 a, v256 b) { return _mm256_max_epi8(a, b); }
SIMD_INLINE v256 v256_min_s16(v256 a, v256 b) { return _mm256_min_epi16(a, b); }
SIMD_INLINE v256 v256_max_s16(v256 a, v256 b) { return _mm256_max_epi16(a, b); }
SIMD_INLINE v256 v256_cmpgt_s8(v256 a, v256 b) {
return _mm256_cmpgt_epi8(a, b);
}
SIMD_INLINE v256 v256_cmplt_s8(v256 a, v256 b) {
return v256_andn(_mm256_cmpgt_epi8(b, a), _mm256_cmpeq_epi8(b, a));
}
SIMD_INLINE v256 v256_cmpeq_8(v256 a, v256 b) {
return _mm256_cmpeq_epi8(a, b);
}
SIMD_INLINE v256 v256_cmpgt_s16(v256 a, v256 b) {
return _mm256_cmpgt_epi16(a, b);
}
SIMD_INLINE v256 v256_cmplt_s16(v256 a, v256 b) {
return v256_andn(_mm256_cmpgt_epi16(b, a), _mm256_cmpeq_epi16(b, a));
}
SIMD_INLINE v256 v256_cmpeq_16(v256 a, v256 b) {
return _mm256_cmpeq_epi16(a, b);
}
SIMD_INLINE v256 v256_shl_8(v256 a, unsigned int c) {
return _mm256_and_si256(_mm256_set1_epi8((uint8_t)(0xff << c)),
_mm256_sll_epi16(a, _mm_cvtsi32_si128(c)));
}
SIMD_INLINE v256 v256_shr_u8(v256 a, unsigned int c) {
return _mm256_and_si256(_mm256_set1_epi8(0xff >> c),
_mm256_srl_epi16(a, _mm_cvtsi32_si128(c)));
}
SIMD_INLINE v256 v256_shr_s8(v256 a, unsigned int c) {
__m128i x = _mm_cvtsi32_si128(c + 8);
return _mm256_packs_epi16(_mm256_sra_epi16(_mm256_unpacklo_epi8(a, a), x),
_mm256_sra_epi16(_mm256_unpackhi_epi8(a, a), x));
}
SIMD_INLINE v256 v256_shl_16(v256 a, unsigned int c) {
return _mm256_sll_epi16(a, _mm_cvtsi32_si128(c));
}
SIMD_INLINE v256 v256_shr_u16(v256 a, unsigned int c) {
return _mm256_srl_epi16(a, _mm_cvtsi32_si128(c));
}
SIMD_INLINE v256 v256_shr_s16(v256 a, unsigned int c) {
return _mm256_sra_epi16(a, _mm_cvtsi32_si128(c));
}
SIMD_INLINE v256 v256_shl_32(v256 a, unsigned int c) {
return _mm256_sll_epi32(a, _mm_cvtsi32_si128(c));
}
SIMD_INLINE v256 v256_shr_u32(v256 a, unsigned int c) {
return _mm256_srl_epi32(a, _mm_cvtsi32_si128(c));
}
SIMD_INLINE v256 v256_shr_s32(v256 a, unsigned int c) {
return _mm256_sra_epi32(a, _mm_cvtsi32_si128(c));
}
/* These intrinsics require immediate values, so we must use #defines
to enforce that. */
// _mm256_slli_si256 works on 128 bit lanes and can't be used
#define v256_shl_n_byte(a, n) \
((n) < 16 \
? v256_from_v128(v128_or(v128_shl_n_byte(v256_high_v128(a), n), \
v128_shr_n_byte(v256_low_v128(a), 16 - (n))), \
v128_shl_n_byte(v256_low_v128(a), n)) \
: v256_from_v128(v128_shl_n_byte(v256_low_v128(a), (n)-16), \
v128_zero()))
// _mm256_srli_si256 works on 128 bit lanes and can't be used
#define v256_shr_n_byte(a, n) \
((n) < 16 \
? _mm256_alignr_epi8( \
_mm256_permute2x128_si256(a, a, _MM_SHUFFLE(2, 0, 0, 1)), a, n) \
: ((n) > 16 \
? _mm256_srli_si256( \
_mm256_permute2x128_si256(a, a, _MM_SHUFFLE(2, 0, 0, 1)), \
(n)-16) \
: _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(2, 0, 0, 1))))
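// Note: _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(2, 0, 0, 1)) moves the
// high 128 bit lane into the low lane and zeroes the high lane, i.e. a
// 16 byte right shift of the full 256 bit vector.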
// _mm256_alignr_epi8 works on two 128 bit lanes and can't be used
#define v256_align(a, b, c) \
((c) ? v256_or(v256_shr_n_byte(b, c), v256_shl_n_byte(a, 32 - c)) : b)
#define v256_shl_n_8(a, c) \
_mm256_and_si256(_mm256_set1_epi8((uint8_t)(0xff << (c))), \
_mm256_slli_epi16(a, c))
#define v256_shr_n_u8(a, c) \
_mm256_and_si256(_mm256_set1_epi8(0xff >> (c)), _mm256_srli_epi16(a, c))
#define v256_shr_n_s8(a, c) \
_mm256_packs_epi16(_mm256_srai_epi16(_mm256_unpacklo_epi8(a, a), (c) + 8), \
_mm256_srai_epi16(_mm256_unpackhi_epi8(a, a), (c) + 8))
#define v256_shl_n_16(a, c) _mm256_slli_epi16(a, c)
#define v256_shr_n_u16(a, c) _mm256_srli_epi16(a, c)
#define v256_shr_n_s16(a, c) _mm256_srai_epi16(a, c)
#define v256_shl_n_32(a, c) _mm256_slli_epi32(a, c)
#define v256_shr_n_u32(a, c) _mm256_srli_epi32(a, c)
#define v256_shr_n_s32(a, c) _mm256_srai_epi32(a, c)
#endif
#endif /* _V256_INTRINSICS_H */


@@ -1,221 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifndef _V64_INTRINSICS_H
#define _V64_INTRINSICS_H
#include <stdio.h>
#include <stdlib.h>
#include "./v64_intrinsics_c.h"
/* Fallback to plain, unoptimised C. */
typedef c_v64 v64;
SIMD_INLINE uint32_t v64_low_u32(v64 a) { return c_v64_low_u32(a); }
SIMD_INLINE uint32_t v64_high_u32(v64 a) { return c_v64_high_u32(a); }
SIMD_INLINE int32_t v64_low_s32(v64 a) { return c_v64_low_s32(a); }
SIMD_INLINE int32_t v64_high_s32(v64 a) { return c_v64_high_s32(a); }
SIMD_INLINE v64 v64_from_32(uint32_t x, uint32_t y) {
return c_v64_from_32(x, y);
}
SIMD_INLINE v64 v64_from_64(uint64_t x) { return c_v64_from_64(x); }
SIMD_INLINE uint64_t v64_u64(v64 x) { return c_v64_u64(x); }
SIMD_INLINE v64 v64_from_16(uint16_t a, uint16_t b, uint16_t c, uint16_t d) {
return c_v64_from_16(a, b, c, d);
}
SIMD_INLINE uint32_t u32_load_unaligned(const void *p) {
return c_u32_load_unaligned(p);
}
SIMD_INLINE uint32_t u32_load_aligned(const void *p) {
return c_u32_load_aligned(p);
}
SIMD_INLINE void u32_store_unaligned(void *p, uint32_t a) {
c_u32_store_unaligned(p, a);
}
SIMD_INLINE void u32_store_aligned(void *p, uint32_t a) {
c_u32_store_aligned(p, a);
}
SIMD_INLINE v64 v64_load_unaligned(const void *p) {
return c_v64_load_unaligned(p);
}
SIMD_INLINE v64 v64_load_aligned(const void *p) {
return c_v64_load_aligned(p);
}
SIMD_INLINE void v64_store_unaligned(void *p, v64 a) {
c_v64_store_unaligned(p, a);
}
SIMD_INLINE void v64_store_aligned(void *p, v64 a) {
c_v64_store_aligned(p, a);
}
SIMD_INLINE v64 v64_align(v64 a, v64 b, const unsigned int c) {
return c_v64_align(a, b, c);
}
SIMD_INLINE v64 v64_zero() { return c_v64_zero(); }
SIMD_INLINE v64 v64_dup_8(uint8_t x) { return c_v64_dup_8(x); }
SIMD_INLINE v64 v64_dup_16(uint16_t x) { return c_v64_dup_16(x); }
SIMD_INLINE v64 v64_dup_32(uint32_t x) { return c_v64_dup_32(x); }
SIMD_INLINE v64 v64_add_8(v64 a, v64 b) { return c_v64_add_8(a, b); }
SIMD_INLINE v64 v64_add_16(v64 a, v64 b) { return c_v64_add_16(a, b); }
SIMD_INLINE v64 v64_sadd_s16(v64 a, v64 b) { return c_v64_sadd_s16(a, b); }
SIMD_INLINE v64 v64_add_32(v64 a, v64 b) { return c_v64_add_32(a, b); }
SIMD_INLINE v64 v64_sub_8(v64 a, v64 b) { return c_v64_sub_8(a, b); }
SIMD_INLINE v64 v64_ssub_u8(v64 a, v64 b) { return c_v64_ssub_u8(a, b); }
SIMD_INLINE v64 v64_ssub_s8(v64 a, v64 b) { return c_v64_ssub_s8(a, b); }
SIMD_INLINE v64 v64_sub_16(v64 a, v64 b) { return c_v64_sub_16(a, b); }
SIMD_INLINE v64 v64_ssub_s16(v64 a, v64 b) { return c_v64_ssub_s16(a, b); }
SIMD_INLINE v64 v64_sub_32(v64 a, v64 b) { return c_v64_sub_32(a, b); }
SIMD_INLINE v64 v64_abs_s16(v64 a) { return c_v64_abs_s16(a); }
SIMD_INLINE v64 v64_ziplo_8(v64 a, v64 b) { return c_v64_ziplo_8(a, b); }
SIMD_INLINE v64 v64_ziphi_8(v64 a, v64 b) { return c_v64_ziphi_8(a, b); }
SIMD_INLINE v64 v64_ziplo_16(v64 a, v64 b) { return c_v64_ziplo_16(a, b); }
SIMD_INLINE v64 v64_ziphi_16(v64 a, v64 b) { return c_v64_ziphi_16(a, b); }
SIMD_INLINE v64 v64_ziplo_32(v64 a, v64 b) { return c_v64_ziplo_32(a, b); }
SIMD_INLINE v64 v64_ziphi_32(v64 a, v64 b) { return c_v64_ziphi_32(a, b); }
SIMD_INLINE v64 v64_unziplo_8(v64 a, v64 b) { return c_v64_unziplo_8(a, b); }
SIMD_INLINE v64 v64_unziphi_8(v64 a, v64 b) { return c_v64_unziphi_8(a, b); }
SIMD_INLINE v64 v64_unziplo_16(v64 a, v64 b) { return c_v64_unziplo_16(a, b); }
SIMD_INLINE v64 v64_unziphi_16(v64 a, v64 b) { return c_v64_unziphi_16(a, b); }
SIMD_INLINE v64 v64_unpacklo_u8_s16(v64 a) { return c_v64_unpacklo_u8_s16(a); }
SIMD_INLINE v64 v64_unpackhi_u8_s16(v64 a) { return c_v64_unpackhi_u8_s16(a); }
SIMD_INLINE v64 v64_pack_s32_s16(v64 a, v64 b) {
return c_v64_pack_s32_s16(a, b);
}
SIMD_INLINE v64 v64_pack_s16_u8(v64 a, v64 b) {
return c_v64_pack_s16_u8(a, b);
}
SIMD_INLINE v64 v64_pack_s16_s8(v64 a, v64 b) {
return c_v64_pack_s16_s8(a, b);
}
SIMD_INLINE v64 v64_unpacklo_u16_s32(v64 a) {
return c_v64_unpacklo_u16_s32(a);
}
SIMD_INLINE v64 v64_unpacklo_s16_s32(v64 a) {
return c_v64_unpacklo_s16_s32(a);
}
SIMD_INLINE v64 v64_unpackhi_u16_s32(v64 a) {
return c_v64_unpackhi_u16_s32(a);
}
SIMD_INLINE v64 v64_unpackhi_s16_s32(v64 a) {
return c_v64_unpackhi_s16_s32(a);
}
SIMD_INLINE v64 v64_shuffle_8(v64 a, v64 pattern) {
return c_v64_shuffle_8(a, pattern);
}
typedef uint32_t sad64_internal;
SIMD_INLINE sad64_internal v64_sad_u8_init() { return c_v64_sad_u8_init(); }
SIMD_INLINE sad64_internal v64_sad_u8(sad64_internal s, v64 a, v64 b) {
return c_v64_sad_u8(s, a, b);
}
SIMD_INLINE uint32_t v64_sad_u8_sum(sad64_internal s) {
return c_v64_sad_u8_sum(s);
}
typedef uint32_t ssd64_internal;
SIMD_INLINE ssd64_internal v64_ssd_u8_init() { return c_v64_ssd_u8_init(); }
SIMD_INLINE ssd64_internal v64_ssd_u8(ssd64_internal s, v64 a, v64 b) {
return c_v64_ssd_u8(s, a, b);
}
SIMD_INLINE uint32_t v64_ssd_u8_sum(ssd64_internal s) {
return c_v64_ssd_u8_sum(s);
}
SIMD_INLINE int64_t v64_dotp_su8(v64 a, v64 b) { return c_v64_dotp_su8(a, b); }
SIMD_INLINE int64_t v64_dotp_s16(v64 a, v64 b) { return c_v64_dotp_s16(a, b); }
SIMD_INLINE uint64_t v64_hadd_u8(v64 a) { return c_v64_hadd_u8(a); }
SIMD_INLINE int64_t v64_hadd_s16(v64 a) { return c_v64_hadd_s16(a); }
SIMD_INLINE v64 v64_or(v64 a, v64 b) { return c_v64_or(a, b); }
SIMD_INLINE v64 v64_xor(v64 a, v64 b) { return c_v64_xor(a, b); }
SIMD_INLINE v64 v64_and(v64 a, v64 b) { return c_v64_and(a, b); }
SIMD_INLINE v64 v64_andn(v64 a, v64 b) { return c_v64_andn(a, b); }
SIMD_INLINE v64 v64_mullo_s16(v64 a, v64 b) { return c_v64_mullo_s16(a, b); }
SIMD_INLINE v64 v64_mulhi_s16(v64 a, v64 b) { return c_v64_mulhi_s16(a, b); }
SIMD_INLINE v64 v64_mullo_s32(v64 a, v64 b) { return c_v64_mullo_s32(a, b); }
SIMD_INLINE v64 v64_madd_s16(v64 a, v64 b) { return c_v64_madd_s16(a, b); }
SIMD_INLINE v64 v64_madd_us8(v64 a, v64 b) { return c_v64_madd_us8(a, b); }
SIMD_INLINE v64 v64_avg_u8(v64 a, v64 b) { return c_v64_avg_u8(a, b); }
SIMD_INLINE v64 v64_rdavg_u8(v64 a, v64 b) { return c_v64_rdavg_u8(a, b); }
SIMD_INLINE v64 v64_avg_u16(v64 a, v64 b) { return c_v64_avg_u16(a, b); }
SIMD_INLINE v64 v64_min_u8(v64 a, v64 b) { return c_v64_min_u8(a, b); }
SIMD_INLINE v64 v64_max_u8(v64 a, v64 b) { return c_v64_max_u8(a, b); }
SIMD_INLINE v64 v64_min_s8(v64 a, v64 b) { return c_v64_min_s8(a, b); }
SIMD_INLINE v64 v64_max_s8(v64 a, v64 b) { return c_v64_max_s8(a, b); }
SIMD_INLINE v64 v64_min_s16(v64 a, v64 b) { return c_v64_min_s16(a, b); }
SIMD_INLINE v64 v64_max_s16(v64 a, v64 b) { return c_v64_max_s16(a, b); }
SIMD_INLINE v64 v64_cmpgt_s8(v64 a, v64 b) { return c_v64_cmpgt_s8(a, b); }
SIMD_INLINE v64 v64_cmplt_s8(v64 a, v64 b) { return c_v64_cmplt_s8(a, b); }
SIMD_INLINE v64 v64_cmpeq_8(v64 a, v64 b) { return c_v64_cmpeq_8(a, b); }
SIMD_INLINE v64 v64_cmpgt_s16(v64 a, v64 b) { return c_v64_cmpgt_s16(a, b); }
SIMD_INLINE v64 v64_cmplt_s16(v64 a, v64 b) { return c_v64_cmplt_s16(a, b); }
SIMD_INLINE v64 v64_cmpeq_16(v64 a, v64 b) { return c_v64_cmpeq_16(a, b); }
SIMD_INLINE v64 v64_shl_8(v64 a, unsigned int n) { return c_v64_shl_8(a, n); }
SIMD_INLINE v64 v64_shr_u8(v64 a, unsigned int n) { return c_v64_shr_u8(a, n); }
SIMD_INLINE v64 v64_shr_s8(v64 a, unsigned int n) { return c_v64_shr_s8(a, n); }
SIMD_INLINE v64 v64_shl_16(v64 a, unsigned int n) { return c_v64_shl_16(a, n); }
SIMD_INLINE v64 v64_shr_u16(v64 a, unsigned int n) {
return c_v64_shr_u16(a, n);
}
SIMD_INLINE v64 v64_shr_s16(v64 a, unsigned int n) {
return c_v64_shr_s16(a, n);
}
SIMD_INLINE v64 v64_shl_32(v64 a, unsigned int n) { return c_v64_shl_32(a, n); }
SIMD_INLINE v64 v64_shr_u32(v64 a, unsigned int n) {
return c_v64_shr_u32(a, n);
}
SIMD_INLINE v64 v64_shr_s32(v64 a, unsigned int n) {
return c_v64_shr_s32(a, n);
}
SIMD_INLINE v64 v64_shr_n_byte(v64 a, const unsigned int n) {
return c_v64_shr_n_byte(a, n);
}
SIMD_INLINE v64 v64_shl_n_byte(v64 a, const unsigned int n) {
return c_v64_shl_n_byte(a, n);
}
SIMD_INLINE v64 v64_shl_n_8(v64 a, const unsigned int c) {
return c_v64_shl_n_8(a, c);
}
SIMD_INLINE v64 v64_shr_n_u8(v64 a, const unsigned int c) {
return c_v64_shr_n_u8(a, c);
}
SIMD_INLINE v64 v64_shr_n_s8(v64 a, const unsigned int c) {
return c_v64_shr_n_s8(a, c);
}
SIMD_INLINE v64 v64_shl_n_16(v64 a, const unsigned int c) {
return c_v64_shl_n_16(a, c);
}
SIMD_INLINE v64 v64_shr_n_u16(v64 a, const unsigned int c) {
return c_v64_shr_n_u16(a, c);
}
SIMD_INLINE v64 v64_shr_n_s16(v64 a, const unsigned int c) {
return c_v64_shr_n_s16(a, c);
}
SIMD_INLINE v64 v64_shl_n_32(v64 a, const unsigned int c) {
return c_v64_shl_n_32(a, c);
}
SIMD_INLINE v64 v64_shr_n_u32(v64 a, const unsigned int c) {
return c_v64_shr_n_u32(a, c);
}
SIMD_INLINE v64 v64_shr_n_s32(v64 a, const unsigned int c) {
return c_v64_shr_n_s32(a, c);
}
#endif /* _V64_INTRINSICS_H */


@@ -1,578 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifndef _V64_INTRINSICS_H
#define _V64_INTRINSICS_H
#include <arm_neon.h>
#include "./v64_intrinsics_arm.h"
#include "aom_ports/arm.h"
#ifdef AOM_INCOMPATIBLE_GCC
#error Incompatible gcc
#endif
typedef int64x1_t v64;
SIMD_INLINE uint32_t v64_low_u32(v64 a) {
return vget_lane_u32(vreinterpret_u32_s64(a), 0);
}
SIMD_INLINE uint32_t v64_high_u32(v64 a) {
return vget_lane_u32(vreinterpret_u32_s64(a), 1);
}
SIMD_INLINE int32_t v64_low_s32(v64 a) {
return vget_lane_s32(vreinterpret_s32_s64(a), 0);
}
SIMD_INLINE int32_t v64_high_s32(v64 a) {
return vget_lane_s32(vreinterpret_s32_s64(a), 1);
}
SIMD_INLINE v64 v64_from_16(uint16_t a, uint16_t b, uint16_t c, uint16_t d) {
return vcreate_s64((uint64_t)a << 48 | (uint64_t)b << 32 | (uint64_t)c << 16 |
d);
}
SIMD_INLINE v64 v64_from_32(uint32_t x, uint32_t y) {
return vcreate_s64((uint64_t)x << 32 | y);
}
SIMD_INLINE v64 v64_from_64(uint64_t x) { return vcreate_s64(x); }
SIMD_INLINE uint64_t v64_u64(v64 x) { return (uint64_t)x; }
SIMD_INLINE uint32_t u32_load_aligned(const void *p) {
return *((uint32_t *)p);
}
SIMD_INLINE uint32_t u32_load_unaligned(const void *p) {
return vget_lane_u32(vreinterpret_u32_u8(vld1_u8((const uint8_t *)p)), 0);
}
SIMD_INLINE void u32_store_aligned(void *p, uint32_t a) {
*((uint32_t *)p) = a;
}
SIMD_INLINE void u32_store_unaligned(void *p, uint32_t a) {
#if __clang__
vst1_lane_u32((uint32_t *)p, vreinterpret_u32_s64((uint64x1_t)(uint64_t)a),
0);
#elif __CC_ARM
  *((__packed uint32_t *)p) = a;
#elif __GNUC__
*((__attribute((packed)) uint32_t *)p) = a;
#else
vst1_lane_u32((uint32_t *)p, vreinterpret_u32_s64((uint64x1_t)(uint64_t)a),
0);
#endif
}
SIMD_INLINE v64 v64_load_aligned(const void *p) {
return vreinterpret_s64_u8(vld1_u8((const uint8_t *)p));
}
SIMD_INLINE v64 v64_load_unaligned(const void *p) {
return v64_load_aligned(p);
}
SIMD_INLINE void v64_store_aligned(void *p, v64 r) {
vst1_u8((uint8_t *)p, vreinterpret_u8_s64(r));
}
SIMD_INLINE void v64_store_unaligned(void *p, v64 r) {
vst1_u8((uint8_t *)p, vreinterpret_u8_s64(r));
}
// The following function requires an immediate.
// Some compilers will check this if it's optimising, others won't.
SIMD_INLINE v64 v64_align(v64 a, v64 b, const unsigned int c) {
#if __OPTIMIZE__ && !__clang__
return c ? vreinterpret_s64_s8(
vext_s8(vreinterpret_s8_s64(b), vreinterpret_s8_s64(a), c))
: b;
#else
  return c ? v64_from_64(((uint64_t)b >> c * 8) | ((uint64_t)a << (8 - c) * 8))
           : b;
#endif
}
SIMD_INLINE v64 v64_zero() { return vreinterpret_s64_u8(vdup_n_u8(0)); }
SIMD_INLINE v64 v64_ones() { return vreinterpret_s64_u8(vdup_n_u8(-1)); }
SIMD_INLINE v64 v64_dup_8(uint8_t x) {
return vreinterpret_s64_u8(vdup_n_u8(x));
}
SIMD_INLINE v64 v64_dup_16(uint16_t x) {
return vreinterpret_s64_u16(vdup_n_u16(x));
}
SIMD_INLINE v64 v64_dup_32(uint32_t x) {
return vreinterpret_s64_u32(vdup_n_u32(x));
}
SIMD_INLINE int64_t v64_dotp_su8(v64 x, v64 y) {
int64x2_t r = vpaddlq_s32(vpaddlq_s16(
vmulq_s16(vmovl_s8(vreinterpret_s8_s64(x)),
vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(y))))));
return (int64_t)vadd_s64(vget_high_s64(r), vget_low_s64(r));
}
SIMD_INLINE int64_t v64_dotp_s16(v64 x, v64 y) {
int64x2_t r =
vpaddlq_s32(vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
return (int64_t)(vget_high_s64(r) + vget_low_s64(r));
}
SIMD_INLINE uint64_t v64_hadd_u8(v64 x) {
return (uint64_t)vpaddl_u32(vpaddl_u16(vpaddl_u8(vreinterpret_u8_s64(x))));
}
SIMD_INLINE int64_t v64_hadd_s16(v64 a) {
return (int64_t)vpaddl_s32(vpaddl_s16(vreinterpret_s16_s64(a)));
}
typedef uint16x8_t sad64_internal;
SIMD_INLINE sad64_internal v64_sad_u8_init() { return vdupq_n_u16(0); }
/* Implementation dependent return value. Result must be finalised with
v64_sad_u8_sum().
The result for more than 32 v64_sad_u8() calls is undefined. */
SIMD_INLINE sad64_internal v64_sad_u8(sad64_internal s, v64 a, v64 b) {
return vabal_u8(s, vreinterpret_u8_s64(a), vreinterpret_u8_s64(b));
}
SIMD_INLINE uint32_t v64_sad_u8_sum(sad64_internal s) {
uint64x2_t r = vpaddlq_u32(vpaddlq_u16(s));
return (uint32_t)(uint64_t)(vget_high_u64(r) + vget_low_u64(r));
}
typedef int64x1_t ssd64_internal;
SIMD_INLINE ssd64_internal v64_ssd_u8_init() {
return (ssd64_internal)(uint64_t)0;
}
/* Implementation dependent return value. Result must be finalised with
* v64_ssd_u8_sum(). */
SIMD_INLINE ssd64_internal v64_ssd_u8(ssd64_internal s, v64 a, v64 b) {
uint8x8_t t = vabd_u8(vreinterpret_u8_s64(a), vreinterpret_u8_s64(b));
uint64x2_t r = vpaddlq_u32(vpaddlq_u16(vmull_u8(t, t)));
return vadd_u64(s, vadd_u64(vget_high_u64(r), vget_low_u64(r)));
}
SIMD_INLINE uint32_t v64_ssd_u8_sum(ssd64_internal s) {
return (uint32_t)(uint64_t)s;
}
SIMD_INLINE v64 v64_or(v64 x, v64 y) { return vorr_s64(x, y); }
SIMD_INLINE v64 v64_xor(v64 x, v64 y) { return veor_s64(x, y); }
SIMD_INLINE v64 v64_and(v64 x, v64 y) { return vand_s64(x, y); }
SIMD_INLINE v64 v64_andn(v64 x, v64 y) { return vbic_s64(x, y); }
SIMD_INLINE v64 v64_add_8(v64 x, v64 y) {
return vreinterpret_s64_u8(
vadd_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
}
SIMD_INLINE v64 v64_add_16(v64 x, v64 y) {
return vreinterpret_s64_s16(
vadd_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
}
SIMD_INLINE v64 v64_sadd_s16(v64 x, v64 y) {
return vreinterpret_s64_s16(
vqadd_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
}
SIMD_INLINE v64 v64_add_32(v64 x, v64 y) {
return vreinterpret_s64_u32(
vadd_u32(vreinterpret_u32_s64(x), vreinterpret_u32_s64(y)));
}
SIMD_INLINE v64 v64_sub_8(v64 x, v64 y) {
return vreinterpret_s64_u8(
vsub_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
}
SIMD_INLINE v64 v64_sub_16(v64 x, v64 y) {
return vreinterpret_s64_s16(
vsub_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
}
SIMD_INLINE v64 v64_ssub_s16(v64 x, v64 y) {
return vreinterpret_s64_s16(
vqsub_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
}
SIMD_INLINE v64 v64_ssub_u8(v64 x, v64 y) {
return vreinterpret_s64_u8(
vqsub_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
}
SIMD_INLINE v64 v64_ssub_s8(v64 x, v64 y) {
return vreinterpret_s64_s8(
vqsub_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y)));
}
SIMD_INLINE v64 v64_sub_32(v64 x, v64 y) {
return vreinterpret_s64_s32(
vsub_s32(vreinterpret_s32_s64(x), vreinterpret_s32_s64(y)));
}
SIMD_INLINE v64 v64_abs_s16(v64 x) {
return vreinterpret_s64_s16(vabs_s16(vreinterpret_s16_s64(x)));
}
SIMD_INLINE v64 v64_mullo_s16(v64 x, v64 y) {
return vreinterpret_s64_s16(
vmul_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
}
SIMD_INLINE v64 v64_mulhi_s16(v64 x, v64 y) {
return vreinterpret_s64_s16(vmovn_s32(vshrq_n_s32(
vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)), 16)));
}
SIMD_INLINE v64 v64_mullo_s32(v64 x, v64 y) {
return vreinterpret_s64_s32(
vmul_s32(vreinterpret_s32_s64(x), vreinterpret_s32_s64(y)));
}
SIMD_INLINE v64 v64_madd_s16(v64 x, v64 y) {
int32x4_t t = vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y));
return vreinterpret_s64_s32(
vpadd_s32(vreinterpret_s32_s64(vget_low_s64(vreinterpretq_s64_s32(t))),
vreinterpret_s32_s64(vget_high_s64(vreinterpretq_s64_s32(t)))));
}
SIMD_INLINE v64 v64_madd_us8(v64 x, v64 y) {
return vreinterpret_s64_s16(vqmovn_s32(vpaddlq_s16(
vaddq_s16(vmull_s8(vadd_s8(vreinterpret_s8_s64(x), vdup_n_s8(-128)),
vreinterpret_s8_s64(y)),
vshlq_n_s16(vmovl_s8(vreinterpret_s8_s64(y)), 7)))));
}
SIMD_INLINE v64 v64_avg_u8(v64 x, v64 y) {
return vreinterpret_s64_u8(
vrhadd_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
}
SIMD_INLINE v64 v64_rdavg_u8(v64 x, v64 y) {
return vreinterpret_s64_u8(
vhadd_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
}
SIMD_INLINE v64 v64_avg_u16(v64 x, v64 y) {
return vreinterpret_s64_u16(
vrhadd_u16(vreinterpret_u16_s64(x), vreinterpret_u16_s64(y)));
}
SIMD_INLINE v64 v64_max_u8(v64 x, v64 y) {
return vreinterpret_s64_u8(
vmax_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
}
SIMD_INLINE v64 v64_min_u8(v64 x, v64 y) {
return vreinterpret_s64_u8(
vmin_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
}
SIMD_INLINE v64 v64_max_s8(v64 x, v64 y) {
return vreinterpret_s64_s8(
vmax_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y)));
}
SIMD_INLINE v64 v64_min_s8(v64 x, v64 y) {
return vreinterpret_s64_s8(
vmin_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y)));
}
SIMD_INLINE v64 v64_max_s16(v64 x, v64 y) {
return vreinterpret_s64_s16(
vmax_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
}
SIMD_INLINE v64 v64_min_s16(v64 x, v64 y) {
return vreinterpret_s64_s16(
vmin_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
}
SIMD_INLINE v64 v64_ziplo_8(v64 x, v64 y) {
uint8x8x2_t r = vzip_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x));
return vreinterpret_s64_u8(r.val[0]);
}
SIMD_INLINE v64 v64_ziphi_8(v64 x, v64 y) {
uint8x8x2_t r = vzip_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x));
return vreinterpret_s64_u8(r.val[1]);
}
SIMD_INLINE v64 v64_ziplo_16(v64 x, v64 y) {
int16x4x2_t r = vzip_s16(vreinterpret_s16_s64(y), vreinterpret_s16_s64(x));
return vreinterpret_s64_s16(r.val[0]);
}
SIMD_INLINE v64 v64_ziphi_16(v64 x, v64 y) {
int16x4x2_t r = vzip_s16(vreinterpret_s16_s64(y), vreinterpret_s16_s64(x));
return vreinterpret_s64_s16(r.val[1]);
}
SIMD_INLINE v64 v64_ziplo_32(v64 x, v64 y) {
int32x2x2_t r = vzip_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x));
return vreinterpret_s64_s32(r.val[0]);
}
SIMD_INLINE v64 v64_ziphi_32(v64 x, v64 y) {
int32x2x2_t r = vzip_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x));
return vreinterpret_s64_s32(r.val[1]);
}
SIMD_INLINE v64 v64_unpacklo_u8_s16(v64 a) {
return vreinterpret_s64_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_s64(a))));
}
SIMD_INLINE v64 v64_unpackhi_u8_s16(v64 a) {
return vreinterpret_s64_u16(vget_high_u16(vmovl_u8(vreinterpret_u8_s64(a))));
}
SIMD_INLINE v64 v64_pack_s32_s16(v64 x, v64 y) {
return vreinterpret_s64_s16(vqmovn_s32(
vcombine_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x))));
}
SIMD_INLINE v64 v64_pack_s16_u8(v64 x, v64 y) {
return vreinterpret_s64_u8(vqmovun_s16(vreinterpretq_s16_s32(
vcombine_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x)))));
}
SIMD_INLINE v64 v64_pack_s16_s8(v64 x, v64 y) {
return vreinterpret_s64_s8(vqmovn_s16(vreinterpretq_s16_s32(
vcombine_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x)))));
}
SIMD_INLINE v64 v64_unziplo_8(v64 x, v64 y) {
uint8x8x2_t r = vuzp_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x));
return vreinterpret_s64_u8(r.val[0]);
}
SIMD_INLINE v64 v64_unziphi_8(v64 x, v64 y) {
uint8x8x2_t r = vuzp_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x));
return vreinterpret_s64_u8(r.val[1]);
}
SIMD_INLINE v64 v64_unziplo_16(v64 x, v64 y) {
uint16x4x2_t r = vuzp_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x));
return vreinterpret_s64_u16(r.val[0]);
}
SIMD_INLINE v64 v64_unziphi_16(v64 x, v64 y) {
uint16x4x2_t r = vuzp_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x));
return vreinterpret_s64_u16(r.val[1]);
}
SIMD_INLINE v64 v64_unpacklo_s16_s32(v64 x) {
return vreinterpret_s64_s32(vget_low_s32(vmovl_s16(vreinterpret_s16_s64(x))));
}
SIMD_INLINE v64 v64_unpacklo_u16_s32(v64 x) {
return vreinterpret_s64_u32(vget_low_u32(vmovl_u16(vreinterpret_u16_s64(x))));
}
SIMD_INLINE v64 v64_unpackhi_s16_s32(v64 x) {
return vreinterpret_s64_s32(
vget_high_s32(vmovl_s16(vreinterpret_s16_s64(x))));
}
SIMD_INLINE v64 v64_unpackhi_u16_s32(v64 x) {
return vreinterpret_s64_u32(
vget_high_u32(vmovl_u16(vreinterpret_u16_s64(x))));
}
SIMD_INLINE v64 v64_shuffle_8(v64 x, v64 pattern) {
return vreinterpret_s64_u8(
vtbl1_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(pattern)));
}
SIMD_INLINE v64 v64_cmpgt_s8(v64 x, v64 y) {
return vreinterpret_s64_u8(
vcgt_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y)));
}
SIMD_INLINE v64 v64_cmplt_s8(v64 x, v64 y) {
return vreinterpret_s64_u8(
vclt_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y)));
}
SIMD_INLINE v64 v64_cmpeq_8(v64 x, v64 y) {
return vreinterpret_s64_u8(
vceq_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
}
SIMD_INLINE v64 v64_cmpgt_s16(v64 x, v64 y) {
return vreinterpret_s64_u16(
vcgt_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
}
SIMD_INLINE v64 v64_cmplt_s16(v64 x, v64 y) {
return vreinterpret_s64_u16(
vclt_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
}
SIMD_INLINE v64 v64_cmpeq_16(v64 x, v64 y) {
return vreinterpret_s64_u16(
vceq_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
}
SIMD_INLINE v64 v64_shl_8(v64 a, unsigned int c) {
return vreinterpret_s64_u8(vshl_u8(vreinterpret_u8_s64(a), vdup_n_s8(c)));
}
SIMD_INLINE v64 v64_shr_u8(v64 a, unsigned int c) {
  return vreinterpret_s64_u8(
      vshl_u8(vreinterpret_u8_s64(a), vdup_n_s8(-(int)c)));
}
SIMD_INLINE v64 v64_shr_s8(v64 a, unsigned int c) {
  return vreinterpret_s64_s8(
      vshl_s8(vreinterpret_s8_s64(a), vdup_n_s8(-(int)c)));
}
SIMD_INLINE v64 v64_shl_16(v64 a, unsigned int c) {
return vreinterpret_s64_u16(vshl_u16(vreinterpret_u16_s64(a), vdup_n_s16(c)));
}
SIMD_INLINE v64 v64_shr_u16(v64 a, unsigned int c) {
return vreinterpret_s64_u16(
vshl_u16(vreinterpret_u16_s64(a), vdup_n_s16(-(int)c)));
}
SIMD_INLINE v64 v64_shr_s16(v64 a, unsigned int c) {
return vreinterpret_s64_s16(
vshl_s16(vreinterpret_s16_s64(a), vdup_n_s16(-(int)c)));
}
SIMD_INLINE v64 v64_shl_32(v64 a, unsigned int c) {
return vreinterpret_s64_u32(vshl_u32(vreinterpret_u32_s64(a), vdup_n_s32(c)));
}
SIMD_INLINE v64 v64_shr_u32(v64 a, unsigned int c) {
return vreinterpret_s64_u32(
vshl_u32(vreinterpret_u32_s64(a), vdup_n_s32(-(int)c)));
}
SIMD_INLINE v64 v64_shr_s32(v64 a, unsigned int c) {
return vreinterpret_s64_s32(
vshl_s32(vreinterpret_s32_s64(a), vdup_n_s32(-(int)c)));
}
// The following functions require an immediate.
// Some compilers will check this during optimisation, others won't.
#if __OPTIMIZE__ && !__clang__
SIMD_INLINE v64 v64_shl_n_byte(v64 a, const unsigned int c) {
return vshl_n_s64(a, c * 8);
}
SIMD_INLINE v64 v64_shr_n_byte(v64 a, const unsigned int c) {
return c ? (v64)vshr_n_u64(vreinterpret_u64_s64(a), c * 8) : a;
}
SIMD_INLINE v64 v64_shl_n_8(v64 a, const unsigned int c) {
return vreinterpret_s64_u8(vshl_n_u8(vreinterpret_u8_s64(a), c));
}
SIMD_INLINE v64 v64_shr_n_u8(v64 a, const unsigned int c) {
return vreinterpret_s64_u8(vshr_n_u8(vreinterpret_u8_s64(a), c));
}
SIMD_INLINE v64 v64_shr_n_s8(v64 a, const unsigned int c) {
return vreinterpret_s64_s8(vshr_n_s8(vreinterpret_s8_s64(a), c));
}
SIMD_INLINE v64 v64_shl_n_16(v64 a, const unsigned int c) {
return vreinterpret_s64_u16(vshl_n_u16(vreinterpret_u16_s64(a), c));
}
SIMD_INLINE v64 v64_shr_n_u16(v64 a, const unsigned int c) {
return vreinterpret_s64_u16(vshr_n_u16(vreinterpret_u16_s64(a), c));
}
SIMD_INLINE v64 v64_shr_n_s16(v64 a, const unsigned int c) {
return vreinterpret_s64_s16(vshr_n_s16(vreinterpret_s16_s64(a), c));
}
SIMD_INLINE v64 v64_shl_n_32(v64 a, const unsigned int c) {
return vreinterpret_s64_u32(vshl_n_u32(vreinterpret_u32_s64(a), c));
}
SIMD_INLINE v64 v64_shr_n_u32(v64 a, const unsigned int c) {
return vreinterpret_s64_u32(vshr_n_u32(vreinterpret_u32_s64(a), c));
}
SIMD_INLINE v64 v64_shr_n_s32(v64 a, const unsigned int c) {
return vreinterpret_s64_s32(vshr_n_s32(vreinterpret_s32_s64(a), c));
}
#else
SIMD_INLINE v64 v64_shl_n_byte(v64 a, const unsigned int c) {
return v64_from_64(v64_u64(a) << c * 8);
}
SIMD_INLINE v64 v64_shr_n_byte(v64 a, const unsigned int c) {
return v64_from_64(v64_u64(a) >> c * 8);
}
SIMD_INLINE v64 v64_shl_n_8(v64 a, const unsigned int c) {
return v64_shl_8(a, c);
}
SIMD_INLINE v64 v64_shr_n_u8(v64 a, const unsigned int c) {
return v64_shr_u8(a, c);
}
SIMD_INLINE v64 v64_shr_n_s8(v64 a, const unsigned int c) {
return v64_shr_s8(a, c);
}
SIMD_INLINE v64 v64_shl_n_16(v64 a, const unsigned int c) {
return v64_shl_16(a, c);
}
SIMD_INLINE v64 v64_shr_n_u16(v64 a, const unsigned int c) {
return v64_shr_u16(a, c);
}
SIMD_INLINE v64 v64_shr_n_s16(v64 a, const unsigned int c) {
return v64_shr_s16(a, c);
}
SIMD_INLINE v64 v64_shl_n_32(v64 a, const unsigned int c) {
return v64_shl_32(a, c);
}
SIMD_INLINE v64 v64_shr_n_u32(v64 a, const unsigned int c) {
return v64_shr_u32(a, c);
}
SIMD_INLINE v64 v64_shr_n_s32(v64 a, const unsigned int c) {
return v64_shr_s32(a, c);
}
#endif
#endif /* _V64_INTRINSICS_H */
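As the comments above note, the _n_ variants map to NEON immediate-shift instructions under optimisation, so their count must be a compile-time constant; the plain forms take a runtime count. A short sketch of the distinction (helper names are illustrative):

static v64 scale_up_fixed(v64 x) {
  return v64_shl_n_16(x, 2); /* literal count: compiles to vshl_n_u16 */
}

static v64 scale_up_runtime(v64 x, unsigned int bits) {
  /* A variable count must use the non-immediate form; v64_shl_n_16(x, bits)
     may fail to build at -O2 because vshl_n_u16 requires an immediate. */
  return v64_shl_16(x, bits);
}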


@@ -1,887 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifndef _V64_INTRINSICS_C_H
#define _V64_INTRINSICS_C_H
/* Note: This implements the intrinsics in plain, unoptimised C.
Intended for reference, porting or debugging. */
#include <stdio.h>
#include <stdlib.h>
#include "./aom_config.h"
extern const int simd_check;
typedef union {
uint8_t u8[8];
uint16_t u16[4];
uint32_t u32[2];
uint64_t u64;
int8_t s8[8];
int16_t s16[4];
int32_t s32[2];
int64_t s64;
} c_v64;
SIMD_INLINE uint32_t c_v64_low_u32(c_v64 a) { return a.u32[CONFIG_BIG_ENDIAN]; }
SIMD_INLINE uint32_t c_v64_high_u32(c_v64 a) {
return a.u32[!CONFIG_BIG_ENDIAN];
}
SIMD_INLINE int32_t c_v64_low_s32(c_v64 a) { return a.s32[CONFIG_BIG_ENDIAN]; }
SIMD_INLINE int32_t c_v64_high_s32(c_v64 a) {
return a.s32[!CONFIG_BIG_ENDIAN];
}
SIMD_INLINE c_v64 c_v64_from_32(uint32_t x, uint32_t y) {
c_v64 t;
t.u32[!CONFIG_BIG_ENDIAN] = x;
t.u32[CONFIG_BIG_ENDIAN] = y;
return t;
}
SIMD_INLINE c_v64 c_v64_from_64(uint64_t x) {
c_v64 t;
t.u64 = x;
return t;
}
SIMD_INLINE uint64_t c_v64_u64(c_v64 x) { return x.u64; }
SIMD_INLINE c_v64 c_v64_from_16(uint16_t a, uint16_t b, uint16_t c,
uint16_t d) {
c_v64 t;
if (CONFIG_BIG_ENDIAN) {
t.u16[0] = a;
t.u16[1] = b;
t.u16[2] = c;
t.u16[3] = d;
} else {
t.u16[3] = a;
t.u16[2] = b;
t.u16[1] = c;
t.u16[0] = d;
}
return t;
}
SIMD_INLINE uint32_t c_u32_load_unaligned(const void *p) {
uint32_t t;
  const uint8_t *pp = (const uint8_t *)p;
uint8_t *q = (uint8_t *)&t;
int c;
for (c = 0; c < 4; c++) q[c] = pp[c];
return t;
}
SIMD_INLINE void c_u32_store_unaligned(void *p, uint32_t a) {
uint8_t *pp = (uint8_t *)p;
uint8_t *q = (uint8_t *)&a;
int c;
for (c = 0; c < 4; c++) pp[c] = q[c];
}
SIMD_INLINE uint32_t c_u32_load_aligned(const void *p) {
if (simd_check && (uintptr_t)p & 3) {
fprintf(stderr, "Error: Unaligned u32 load at %p\n", p);
abort();
}
return c_u32_load_unaligned(p);
}
SIMD_INLINE void c_u32_store_aligned(void *p, uint32_t a) {
if (simd_check && (uintptr_t)p & 3) {
fprintf(stderr, "Error: Unaligned u32 store at %p\n", p);
abort();
}
c_u32_store_unaligned(p, a);
}
SIMD_INLINE c_v64 c_v64_load_unaligned(const void *p) {
c_v64 t;
  const uint8_t *pp = (const uint8_t *)p;
uint8_t *q = (uint8_t *)&t;
int c;
for (c = 0; c < 8; c++) q[c] = pp[c];
return t;
}
SIMD_INLINE c_v64 c_v64_load_aligned(const void *p) {
if (simd_check && (uintptr_t)p & 7) {
fprintf(stderr, "Error: Unaligned c_v64 load at %p\n", p);
abort();
}
return c_v64_load_unaligned(p);
}
SIMD_INLINE void c_v64_store_unaligned(void *p, c_v64 a) {
uint8_t *q = (uint8_t *)p;
uint8_t *r = (uint8_t *)&a;
int c;
for (c = 0; c < 8; c++) q[c] = r[c];
}
SIMD_INLINE void c_v64_store_aligned(void *p, c_v64 a) {
if (simd_check && (uintptr_t)p & 7) {
fprintf(stderr, "Error: Unaligned c_v64 store at %p\n", p);
abort();
}
c_v64_store_unaligned(p, a);
}
SIMD_INLINE c_v64 c_v64_zero() {
c_v64 t;
t.u64 = 0;
return t;
}
SIMD_INLINE c_v64 c_v64_dup_8(uint8_t x) {
c_v64 t;
t.u8[0] = t.u8[1] = t.u8[2] = t.u8[3] = t.u8[4] = t.u8[5] = t.u8[6] =
t.u8[7] = x;
return t;
}
SIMD_INLINE c_v64 c_v64_dup_16(uint16_t x) {
c_v64 t;
t.u16[0] = t.u16[1] = t.u16[2] = t.u16[3] = x;
return t;
}
SIMD_INLINE c_v64 c_v64_dup_32(uint32_t x) {
c_v64 t;
t.u32[0] = t.u32[1] = x;
return t;
}
SIMD_INLINE c_v64 c_v64_add_8(c_v64 a, c_v64 b) {
c_v64 t;
int c;
for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] + b.u8[c];
return t;
}
SIMD_INLINE c_v64 c_v64_add_16(c_v64 a, c_v64 b) {
c_v64 t;
int c;
for (c = 0; c < 4; c++) t.u16[c] = a.u16[c] + b.u16[c];
return t;
}
SIMD_INLINE c_v64 c_v64_sadd_s16(c_v64 a, c_v64 b) {
c_v64 t;
int c;
for (c = 0; c < 4; c++)
t.s16[c] = (int32_t)a.s16[c] + (int32_t)b.s16[c] > 32767
? 32767
: (int32_t)a.s16[c] + (int32_t)b.s16[c] < -32768
? -32768
: (int32_t)a.s16[c] + (int32_t)b.s16[c];
return t;
}
SIMD_INLINE c_v64 c_v64_add_32(c_v64 a, c_v64 b) {
c_v64 t;
t.u32[0] = a.u32[0] + b.u32[0];
t.u32[1] = a.u32[1] + b.u32[1];
return t;
}
SIMD_INLINE c_v64 c_v64_sub_8(c_v64 a, c_v64 b) {
c_v64 t;
int c;
for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] - b.u8[c];
return t;
}
SIMD_INLINE c_v64 c_v64_ssub_u8(c_v64 a, c_v64 b) {
c_v64 t;
int c;
for (c = 0; c < 8; c++)
t.u8[c] = (int32_t)((uint32_t)a.u8[c] - (uint32_t)b.u8[c]) < 0
? 0
: a.u8[c] - b.u8[c];
return t;
}
SIMD_INLINE c_v64 c_v64_ssub_s8(c_v64 a, c_v64 b) {
c_v64 t;
int c;
for (c = 0; c < 8; c++) {
int16_t d = (int16_t)a.s8[c] - (int16_t)b.s8[c];
t.s8[c] = d > 127 ? 127 : (d < -128 ? -128 : d);
}
return t;
}
SIMD_INLINE c_v64 c_v64_sub_16(c_v64 a, c_v64 b) {
c_v64 t;
int c;
for (c = 0; c < 4; c++) t.u16[c] = a.u16[c] - b.u16[c];
return t;
}
SIMD_INLINE c_v64 c_v64_ssub_s16(c_v64 a, c_v64 b) {
c_v64 t;
int c;
for (c = 0; c < 4; c++)
t.s16[c] = (int32_t)a.s16[c] - (int32_t)b.s16[c] < -32768
? -32768
: (int32_t)a.s16[c] - (int32_t)b.s16[c] > 32767
? 32767
: (int32_t)a.s16[c] - (int32_t)b.s16[c];
return t;
}
SIMD_INLINE c_v64 c_v64_sub_32(c_v64 a, c_v64 b) {
c_v64 t;
t.u32[0] = a.u32[0] - b.u32[0];
t.u32[1] = a.u32[1] - b.u32[1];
return t;
}
SIMD_INLINE c_v64 c_v64_abs_s16(c_v64 a) {
c_v64 t;
int c;
for (c = 0; c < 4; c++)
t.u16[c] = (int16_t)a.u16[c] > 0 ? a.u16[c] : -a.u16[c];
return t;
}
SIMD_INLINE c_v64 _c_v64_zip_8(c_v64 a, c_v64 b, int mode) {
c_v64 t;
if (mode) {
t.u8[7] = a.u8[7];
t.u8[6] = b.u8[7];
t.u8[5] = a.u8[6];
t.u8[4] = b.u8[6];
t.u8[3] = a.u8[5];
t.u8[2] = b.u8[5];
t.u8[1] = a.u8[4];
t.u8[0] = b.u8[4];
} else {
t.u8[7] = a.u8[3];
t.u8[6] = b.u8[3];
t.u8[5] = a.u8[2];
t.u8[4] = b.u8[2];
t.u8[3] = a.u8[1];
t.u8[2] = b.u8[1];
t.u8[1] = a.u8[0];
t.u8[0] = b.u8[0];
}
return t;
}
SIMD_INLINE c_v64 c_v64_ziplo_8(c_v64 a, c_v64 b) {
return CONFIG_BIG_ENDIAN ? _c_v64_zip_8(b, a, 1) : _c_v64_zip_8(a, b, 0);
}
SIMD_INLINE c_v64 c_v64_ziphi_8(c_v64 a, c_v64 b) {
return CONFIG_BIG_ENDIAN ? _c_v64_zip_8(b, a, 0) : _c_v64_zip_8(a, b, 1);
}
SIMD_INLINE c_v64 _c_v64_zip_16(c_v64 a, c_v64 b, int mode) {
c_v64 t;
if (mode) {
t.u16[3] = a.u16[3];
t.u16[2] = b.u16[3];
t.u16[1] = a.u16[2];
t.u16[0] = b.u16[2];
} else {
t.u16[3] = a.u16[1];
t.u16[2] = b.u16[1];
t.u16[1] = a.u16[0];
t.u16[0] = b.u16[0];
}
return t;
}
SIMD_INLINE c_v64 c_v64_ziplo_16(c_v64 a, c_v64 b) {
return CONFIG_BIG_ENDIAN ? _c_v64_zip_16(b, a, 1) : _c_v64_zip_16(a, b, 0);
}
SIMD_INLINE c_v64 c_v64_ziphi_16(c_v64 a, c_v64 b) {
return CONFIG_BIG_ENDIAN ? _c_v64_zip_16(b, a, 0) : _c_v64_zip_16(a, b, 1);
}
SIMD_INLINE c_v64 _c_v64_zip_32(c_v64 a, c_v64 b, int mode) {
c_v64 t;
if (mode) {
t.u32[1] = a.u32[1];
t.u32[0] = b.u32[1];
} else {
t.u32[1] = a.u32[0];
t.u32[0] = b.u32[0];
}
return t;
}
SIMD_INLINE c_v64 c_v64_ziplo_32(c_v64 a, c_v64 b) {
return CONFIG_BIG_ENDIAN ? _c_v64_zip_32(b, a, 1) : _c_v64_zip_32(a, b, 0);
}
SIMD_INLINE c_v64 c_v64_ziphi_32(c_v64 a, c_v64 b) {
return CONFIG_BIG_ENDIAN ? _c_v64_zip_32(b, a, 0) : _c_v64_zip_32(a, b, 1);
}
SIMD_INLINE c_v64 _c_v64_unzip_8(c_v64 a, c_v64 b, int mode) {
c_v64 t;
if (mode) {
t.u8[7] = b.u8[7];
t.u8[6] = b.u8[5];
t.u8[5] = b.u8[3];
t.u8[4] = b.u8[1];
t.u8[3] = a.u8[7];
t.u8[2] = a.u8[5];
t.u8[1] = a.u8[3];
t.u8[0] = a.u8[1];
} else {
t.u8[7] = a.u8[6];
t.u8[6] = a.u8[4];
t.u8[5] = a.u8[2];
t.u8[4] = a.u8[0];
t.u8[3] = b.u8[6];
t.u8[2] = b.u8[4];
t.u8[1] = b.u8[2];
t.u8[0] = b.u8[0];
}
return t;
}
SIMD_INLINE c_v64 c_v64_unziplo_8(c_v64 a, c_v64 b) {
return CONFIG_BIG_ENDIAN ? _c_v64_unzip_8(a, b, 1) : _c_v64_unzip_8(a, b, 0);
}
SIMD_INLINE c_v64 c_v64_unziphi_8(c_v64 a, c_v64 b) {
return CONFIG_BIG_ENDIAN ? _c_v64_unzip_8(b, a, 0) : _c_v64_unzip_8(b, a, 1);
}
SIMD_INLINE c_v64 _c_v64_unzip_16(c_v64 a, c_v64 b, int mode) {
c_v64 t;
if (mode) {
t.u16[3] = b.u16[3];
t.u16[2] = b.u16[1];
t.u16[1] = a.u16[3];
t.u16[0] = a.u16[1];
} else {
t.u16[3] = a.u16[2];
t.u16[2] = a.u16[0];
t.u16[1] = b.u16[2];
t.u16[0] = b.u16[0];
}
return t;
}
SIMD_INLINE c_v64 c_v64_unziplo_16(c_v64 a, c_v64 b) {
return CONFIG_BIG_ENDIAN ? _c_v64_unzip_16(a, b, 1)
: _c_v64_unzip_16(a, b, 0);
}
SIMD_INLINE c_v64 c_v64_unziphi_16(c_v64 a, c_v64 b) {
return CONFIG_BIG_ENDIAN ? _c_v64_unzip_16(b, a, 0)
: _c_v64_unzip_16(b, a, 1);
}
SIMD_INLINE c_v64 c_v64_unpacklo_u8_s16(c_v64 a) {
c_v64 t;
int endian = !!CONFIG_BIG_ENDIAN * 4;
t.s16[3] = (int16_t)a.u8[3 + endian];
t.s16[2] = (int16_t)a.u8[2 + endian];
t.s16[1] = (int16_t)a.u8[1 + endian];
t.s16[0] = (int16_t)a.u8[0 + endian];
return t;
}
SIMD_INLINE c_v64 c_v64_unpackhi_u8_s16(c_v64 a) {
c_v64 t;
int endian = !!CONFIG_BIG_ENDIAN * 4;
t.s16[3] = (int16_t)a.u8[7 - endian];
t.s16[2] = (int16_t)a.u8[6 - endian];
t.s16[1] = (int16_t)a.u8[5 - endian];
t.s16[0] = (int16_t)a.u8[4 - endian];
return t;
}
SIMD_INLINE c_v64 c_v64_pack_s32_s16(c_v64 a, c_v64 b) {
c_v64 t;
if (CONFIG_BIG_ENDIAN) {
c_v64 u = a;
a = b;
b = u;
}
t.s16[3] = a.s32[1] > 32767 ? 32767 : a.s32[1] < -32768 ? -32768 : a.s32[1];
t.s16[2] = a.s32[0] > 32767 ? 32767 : a.s32[0] < -32768 ? -32768 : a.s32[0];
t.s16[1] = b.s32[1] > 32767 ? 32767 : b.s32[1] < -32768 ? -32768 : b.s32[1];
t.s16[0] = b.s32[0] > 32767 ? 32767 : b.s32[0] < -32768 ? -32768 : b.s32[0];
return t;
}
SIMD_INLINE c_v64 c_v64_pack_s16_u8(c_v64 a, c_v64 b) {
c_v64 t;
if (CONFIG_BIG_ENDIAN) {
c_v64 u = a;
a = b;
b = u;
}
t.u8[7] = a.s16[3] > 255 ? 255 : a.s16[3] < 0 ? 0 : a.s16[3];
t.u8[6] = a.s16[2] > 255 ? 255 : a.s16[2] < 0 ? 0 : a.s16[2];
t.u8[5] = a.s16[1] > 255 ? 255 : a.s16[1] < 0 ? 0 : a.s16[1];
t.u8[4] = a.s16[0] > 255 ? 255 : a.s16[0] < 0 ? 0 : a.s16[0];
t.u8[3] = b.s16[3] > 255 ? 255 : b.s16[3] < 0 ? 0 : b.s16[3];
t.u8[2] = b.s16[2] > 255 ? 255 : b.s16[2] < 0 ? 0 : b.s16[2];
t.u8[1] = b.s16[1] > 255 ? 255 : b.s16[1] < 0 ? 0 : b.s16[1];
t.u8[0] = b.s16[0] > 255 ? 255 : b.s16[0] < 0 ? 0 : b.s16[0];
return t;
}
SIMD_INLINE c_v64 c_v64_pack_s16_s8(c_v64 a, c_v64 b) {
c_v64 t;
if (CONFIG_BIG_ENDIAN) {
c_v64 u = a;
a = b;
b = u;
}
t.u8[7] = a.s16[3] > 127 ? 127 : a.s16[3] < -128 ? 128 : a.s16[3];
t.u8[6] = a.s16[2] > 127 ? 127 : a.s16[2] < -128 ? 128 : a.s16[2];
t.u8[5] = a.s16[1] > 127 ? 127 : a.s16[1] < -128 ? 128 : a.s16[1];
t.u8[4] = a.s16[0] > 127 ? 127 : a.s16[0] < -128 ? 128 : a.s16[0];
t.u8[3] = b.s16[3] > 127 ? 127 : b.s16[3] < -128 ? 128 : b.s16[3];
t.u8[2] = b.s16[2] > 127 ? 127 : b.s16[2] < -128 ? 128 : b.s16[2];
t.u8[1] = b.s16[1] > 127 ? 127 : b.s16[1] < -128 ? 128 : b.s16[1];
t.u8[0] = b.s16[0] > 127 ? 127 : b.s16[0] < -128 ? 128 : b.s16[0];
return t;
}
SIMD_INLINE c_v64 c_v64_unpacklo_u16_s32(c_v64 a) {
c_v64 t;
t.s32[1] = a.u16[1 + !!CONFIG_BIG_ENDIAN * 2];
t.s32[0] = a.u16[0 + !!CONFIG_BIG_ENDIAN * 2];
return t;
}
SIMD_INLINE c_v64 c_v64_unpacklo_s16_s32(c_v64 a) {
c_v64 t;
t.s32[1] = a.s16[1 + !!CONFIG_BIG_ENDIAN * 2];
t.s32[0] = a.s16[0 + !!CONFIG_BIG_ENDIAN * 2];
return t;
}
SIMD_INLINE c_v64 c_v64_unpackhi_u16_s32(c_v64 a) {
c_v64 t;
t.s32[1] = a.u16[3 - !!CONFIG_BIG_ENDIAN * 2];
t.s32[0] = a.u16[2 - !!CONFIG_BIG_ENDIAN * 2];
return t;
}
SIMD_INLINE c_v64 c_v64_unpackhi_s16_s32(c_v64 a) {
c_v64 t;
t.s32[1] = a.s16[3 - !!CONFIG_BIG_ENDIAN * 2];
t.s32[0] = a.s16[2 - !!CONFIG_BIG_ENDIAN * 2];
return t;
}
SIMD_INLINE c_v64 c_v64_shuffle_8(c_v64 a, c_v64 pattern) {
c_v64 t;
int c;
for (c = 0; c < 8; c++) {
if (simd_check && (pattern.u8[c] & ~7)) {
fprintf(stderr, "Error: Undefined v64_shuffle_8 index %d/%d\n",
pattern.u8[c], c);
abort();
}
t.u8[c] =
a.u8[CONFIG_BIG_ENDIAN ? 7 - (pattern.u8[c] & 7) : pattern.u8[c] & 7];
}
return t;
}
SIMD_INLINE int64_t c_v64_dotp_su8(c_v64 a, c_v64 b) {
return a.s8[7] * b.u8[7] + a.s8[6] * b.u8[6] + a.s8[5] * b.u8[5] +
a.s8[4] * b.u8[4] + a.s8[3] * b.u8[3] + a.s8[2] * b.u8[2] +
a.s8[1] * b.u8[1] + a.s8[0] * b.u8[0];
}
SIMD_INLINE int64_t c_v64_dotp_s16(c_v64 a, c_v64 b) {
return (int64_t)(a.s16[3] * b.s16[3] + a.s16[2] * b.s16[2]) +
(int64_t)(a.s16[1] * b.s16[1] + a.s16[0] * b.s16[0]);
}
SIMD_INLINE uint64_t c_v64_hadd_u8(c_v64 a) {
return a.u8[7] + a.u8[6] + a.u8[5] + a.u8[4] + a.u8[3] + a.u8[2] + a.u8[1] +
a.u8[0];
}
SIMD_INLINE int64_t c_v64_hadd_s16(c_v64 a) {
return a.s16[3] + a.s16[2] + a.s16[1] + a.s16[0];
}
typedef uint32_t c_sad64_internal;
/* Implementation dependent return value. Result must be finalised with
v64_sad_u8_sum().
The result for more than 32 v64_sad_u8() calls is undefined. */
SIMD_INLINE c_sad64_internal c_v64_sad_u8_init() { return 0; }
SIMD_INLINE c_sad64_internal c_v64_sad_u8(c_sad64_internal s, c_v64 a,
c_v64 b) {
int c;
for (c = 0; c < 8; c++)
s += a.u8[c] > b.u8[c] ? a.u8[c] - b.u8[c] : b.u8[c] - a.u8[c];
return s;
}
SIMD_INLINE uint32_t c_v64_sad_u8_sum(c_sad64_internal s) { return s; }
typedef uint32_t c_ssd64_internal;
/* Implementation dependent return value. Result must be finalised with
* v64_ssd_u8_sum(). */
SIMD_INLINE c_ssd64_internal c_v64_ssd_u8_init() { return 0; }
SIMD_INLINE c_ssd64_internal c_v64_ssd_u8(c_ssd64_internal s, c_v64 a,
c_v64 b) {
int c;
for (c = 0; c < 8; c++) s += (a.u8[c] - b.u8[c]) * (a.u8[c] - b.u8[c]);
return s;
}
SIMD_INLINE uint32_t c_v64_ssd_u8_sum(c_ssd64_internal s) { return s; }
SIMD_INLINE c_v64 c_v64_or(c_v64 a, c_v64 b) {
c_v64 t;
t.u64 = a.u64 | b.u64;
return t;
}
SIMD_INLINE c_v64 c_v64_xor(c_v64 a, c_v64 b) {
c_v64 t;
t.u64 = a.u64 ^ b.u64;
return t;
}
SIMD_INLINE c_v64 c_v64_and(c_v64 a, c_v64 b) {
c_v64 t;
t.u64 = a.u64 & b.u64;
return t;
}
SIMD_INLINE c_v64 c_v64_andn(c_v64 a, c_v64 b) {
c_v64 t;
t.u64 = a.u64 & ~b.u64;
return t;
}
SIMD_INLINE c_v64 c_v64_mullo_s16(c_v64 a, c_v64 b) {
c_v64 t;
int c;
for (c = 0; c < 4; c++) t.s16[c] = (int16_t)(a.s16[c] * b.s16[c]);
return t;
}
SIMD_INLINE c_v64 c_v64_mulhi_s16(c_v64 a, c_v64 b) {
c_v64 t;
int c;
for (c = 0; c < 4; c++) t.s16[c] = (a.s16[c] * b.s16[c]) >> 16;
return t;
}
SIMD_INLINE c_v64 c_v64_mullo_s32(c_v64 a, c_v64 b) {
c_v64 t;
t.s32[0] = a.s32[0] * b.s32[0];
t.s32[1] = a.s32[1] * b.s32[1];
return t;
}
SIMD_INLINE c_v64 c_v64_madd_s16(c_v64 a, c_v64 b) {
c_v64 t;
t.s32[0] = a.s16[0] * b.s16[0] + a.s16[1] * b.s16[1];
t.s32[1] = a.s16[2] * b.s16[2] + a.s16[3] * b.s16[3];
return t;
}
SIMD_INLINE c_v64 c_v64_madd_us8(c_v64 a, c_v64 b) {
c_v64 t;
int32_t u;
u = a.u8[0] * b.s8[0] + a.u8[1] * b.s8[1];
t.s16[0] = u > 32767 ? 32767 : u < -32768 ? -32768 : u;
u = a.u8[2] * b.s8[2] + a.u8[3] * b.s8[3];
t.s16[1] = u > 32767 ? 32767 : u < -32768 ? -32768 : u;
u = a.u8[4] * b.s8[4] + a.u8[5] * b.s8[5];
t.s16[2] = u > 32767 ? 32767 : u < -32768 ? -32768 : u;
u = a.u8[6] * b.s8[6] + a.u8[7] * b.s8[7];
t.s16[3] = u > 32767 ? 32767 : u < -32768 ? -32768 : u;
return t;
}
SIMD_INLINE c_v64 c_v64_avg_u8(c_v64 a, c_v64 b) {
c_v64 t;
int c;
for (c = 0; c < 8; c++) t.u8[c] = (a.u8[c] + b.u8[c] + 1) >> 1;
return t;
}
SIMD_INLINE c_v64 c_v64_rdavg_u8(c_v64 a, c_v64 b) {
c_v64 t;
int c;
for (c = 0; c < 8; c++) t.u8[c] = (a.u8[c] + b.u8[c]) >> 1;
return t;
}
SIMD_INLINE c_v64 c_v64_avg_u16(c_v64 a, c_v64 b) {
c_v64 t;
int c;
for (c = 0; c < 4; c++) t.u16[c] = (a.u16[c] + b.u16[c] + 1) >> 1;
return t;
}
SIMD_INLINE c_v64 c_v64_min_u8(c_v64 a, c_v64 b) {
c_v64 t;
int c;
for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] > b.u8[c] ? b.u8[c] : a.u8[c];
return t;
}
SIMD_INLINE c_v64 c_v64_max_u8(c_v64 a, c_v64 b) {
c_v64 t;
int c;
for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] > b.u8[c] ? a.u8[c] : b.u8[c];
return t;
}
SIMD_INLINE c_v64 c_v64_min_s8(c_v64 a, c_v64 b) {
c_v64 t;
int c;
for (c = 0; c < 8; c++) t.s8[c] = a.s8[c] > b.s8[c] ? b.s8[c] : a.s8[c];
return t;
}
SIMD_INLINE c_v64 c_v64_max_s8(c_v64 a, c_v64 b) {
c_v64 t;
int c;
for (c = 0; c < 8; c++) t.s8[c] = a.s8[c] > b.s8[c] ? a.s8[c] : b.s8[c];
return t;
}
SIMD_INLINE c_v64 c_v64_min_s16(c_v64 a, c_v64 b) {
c_v64 t;
int c;
for (c = 0; c < 4; c++) t.s16[c] = a.s16[c] > b.s16[c] ? b.s16[c] : a.s16[c];
return t;
}
SIMD_INLINE c_v64 c_v64_max_s16(c_v64 a, c_v64 b) {
c_v64 t;
int c;
for (c = 0; c < 4; c++) t.s16[c] = a.s16[c] > b.s16[c] ? a.s16[c] : b.s16[c];
return t;
}
SIMD_INLINE c_v64 c_v64_cmpgt_s8(c_v64 a, c_v64 b) {
c_v64 t;
int c;
for (c = 0; c < 8; c++) t.s8[c] = -(a.s8[c] > b.s8[c]);
return t;
}
SIMD_INLINE c_v64 c_v64_cmplt_s8(c_v64 a, c_v64 b) {
c_v64 t;
int c;
for (c = 0; c < 8; c++) t.s8[c] = -(a.s8[c] < b.s8[c]);
return t;
}
SIMD_INLINE c_v64 c_v64_cmpeq_8(c_v64 a, c_v64 b) {
c_v64 t;
int c;
for (c = 0; c < 8; c++) t.s8[c] = -(a.u8[c] == b.u8[c]);
return t;
}
SIMD_INLINE c_v64 c_v64_cmpgt_s16(c_v64 a, c_v64 b) {
c_v64 t;
int c;
for (c = 0; c < 4; c++) t.s16[c] = -(a.s16[c] > b.s16[c]);
return t;
}
SIMD_INLINE c_v64 c_v64_cmplt_s16(c_v64 a, c_v64 b) {
c_v64 t;
int c;
for (c = 0; c < 4; c++) t.s16[c] = -(a.s16[c] < b.s16[c]);
return t;
}
SIMD_INLINE c_v64 c_v64_cmpeq_16(c_v64 a, c_v64 b) {
c_v64 t;
int c;
for (c = 0; c < 4; c++) t.s16[c] = -(a.u16[c] == b.u16[c]);
return t;
}
SIMD_INLINE c_v64 c_v64_shl_8(c_v64 a, unsigned int n) {
c_v64 t;
int c;
if (simd_check && n > 7) {
fprintf(stderr, "Error: Undefined u8 shift left %d\n", n);
abort();
}
for (c = 0; c < 8; c++) t.s8[c] = a.u8[c] << n;
return t;
}
SIMD_INLINE c_v64 c_v64_shr_u8(c_v64 a, unsigned int n) {
c_v64 t;
int c;
if (simd_check && n > 7) {
fprintf(stderr, "Error: Undefined u8 shift right %d\n", n);
abort();
}
for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] >> n;
return t;
}
SIMD_INLINE c_v64 c_v64_shr_s8(c_v64 a, unsigned int n) {
c_v64 t;
int c;
if (simd_check && n > 7) {
fprintf(stderr, "Error: Undefined s8 shift right %d\n", n);
abort();
}
for (c = 0; c < 8; c++) t.s8[c] = a.s8[c] >> n;
return t;
}
SIMD_INLINE c_v64 c_v64_shl_16(c_v64 a, unsigned int n) {
c_v64 t;
int c;
if (simd_check && n > 15) {
fprintf(stderr, "Error: Undefined u16 shift left %d\n", n);
abort();
}
for (c = 0; c < 4; c++) t.u16[c] = a.u16[c] << n;
return t;
}
SIMD_INLINE c_v64 c_v64_shr_u16(c_v64 a, unsigned int n) {
c_v64 t;
int c;
if (simd_check && n > 15) {
fprintf(stderr, "Error: Undefined u16 shift right %d\n", n);
abort();
}
for (c = 0; c < 4; c++) t.u16[c] = a.u16[c] >> n;
return t;
}
SIMD_INLINE c_v64 c_v64_shr_s16(c_v64 a, unsigned int n) {
c_v64 t;
int c;
if (simd_check && n > 15) {
fprintf(stderr, "Error: undefined s16 shift right %d\n", n);
abort();
}
for (c = 0; c < 4; c++) t.s16[c] = a.s16[c] >> n;
return t;
}
SIMD_INLINE c_v64 c_v64_shl_32(c_v64 a, unsigned int n) {
c_v64 t;
if (simd_check && n > 31) {
fprintf(stderr, "Error: undefined u32 shift left %d\n", n);
abort();
}
t.u32[1] = a.u32[1] << n;
t.u32[0] = a.u32[0] << n;
return t;
}
SIMD_INLINE c_v64 c_v64_shr_u32(c_v64 a, unsigned int n) {
c_v64 t;
if (simd_check && n > 31) {
fprintf(stderr, "Error: undefined u32 shift right %d\n", n);
abort();
}
t.u32[1] = a.u32[1] >> n;
t.u32[0] = a.u32[0] >> n;
return t;
}
SIMD_INLINE c_v64 c_v64_shr_s32(c_v64 a, unsigned int n) {
c_v64 t;
if (simd_check && n > 31) {
fprintf(stderr, "Error: undefined s32 shift right %d\n", n);
abort();
}
t.s32[1] = a.s32[1] >> n;
t.s32[0] = a.s32[0] >> n;
return t;
}
SIMD_INLINE c_v64 c_v64_shr_n_byte(c_v64 x, const unsigned int i) {
c_v64 t;
t.u64 = x.u64 >> i * 8;
return t;
}
SIMD_INLINE c_v64 c_v64_shl_n_byte(c_v64 x, const unsigned int i) {
c_v64 t;
t.u64 = x.u64 << i * 8;
return t;
}
SIMD_INLINE c_v64 c_v64_align(c_v64 a, c_v64 b, const unsigned int c) {
if (simd_check && c > 7) {
fprintf(stderr, "Error: undefined alignment %d\n", c);
abort();
}
return c ? c_v64_or(c_v64_shr_n_byte(b, c), c_v64_shl_n_byte(a, 8 - c)) : b;
}
SIMD_INLINE c_v64 c_v64_shl_n_8(c_v64 a, const unsigned int c) {
return c_v64_shl_8(a, c);
}
SIMD_INLINE c_v64 c_v64_shr_n_u8(c_v64 a, const unsigned int c) {
return c_v64_shr_u8(a, c);
}
SIMD_INLINE c_v64 c_v64_shr_n_s8(c_v64 a, const unsigned int c) {
return c_v64_shr_s8(a, c);
}
SIMD_INLINE c_v64 c_v64_shl_n_16(c_v64 a, const unsigned int c) {
return c_v64_shl_16(a, c);
}
SIMD_INLINE c_v64 c_v64_shr_n_u16(c_v64 a, const unsigned int c) {
return c_v64_shr_u16(a, c);
}
SIMD_INLINE c_v64 c_v64_shr_n_s16(c_v64 a, const unsigned int c) {
return c_v64_shr_s16(a, c);
}
SIMD_INLINE c_v64 c_v64_shl_n_32(c_v64 a, const unsigned int c) {
return c_v64_shl_32(a, c);
}
SIMD_INLINE c_v64 c_v64_shr_n_u32(c_v64 a, const unsigned int c) {
return c_v64_shr_u32(a, c);
}
SIMD_INLINE c_v64 c_v64_shr_n_s32(c_v64 a, const unsigned int c) {
return c_v64_shr_s32(a, c);
}
#endif /* _V64_INTRINSICS_C_H */
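The accumulator comments above imply a fixed usage pattern: initialise, accumulate at most 32 times, then finalise. A sketch of that pattern against the reference implementation (the wrapper itself is hypothetical):

static uint32_t sad_8xh(const uint8_t *a, int a_stride, const uint8_t *b,
                        int b_stride, int h) {
  c_sad64_internal acc = c_v64_sad_u8_init();
  int r;
  /* The result is undefined past 32 accumulations; taller blocks would
     need an intermediate c_v64_sad_u8_sum() and a fresh accumulator. */
  for (r = 0; r < h && r < 32; r++)
    acc = c_v64_sad_u8(acc, c_v64_load_unaligned(a + r * a_stride),
                       c_v64_load_unaligned(b + r * b_stride));
  return c_v64_sad_u8_sum(acc);
}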


@@ -1,451 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifndef _V64_INTRINSICS_H
#define _V64_INTRINSICS_H
#include <emmintrin.h>
#if defined(__SSSE3__)
#include <tmmintrin.h>
#endif
#if defined(__SSE4_1__)
#include <smmintrin.h>
#endif
typedef __m128i v64;
SIMD_INLINE uint32_t v64_low_u32(v64 a) {
return (uint32_t)_mm_cvtsi128_si32(a);
}
SIMD_INLINE uint32_t v64_high_u32(v64 a) {
return (uint32_t)_mm_cvtsi128_si32(_mm_srli_si128(a, 4));
}
SIMD_INLINE int32_t v64_low_s32(v64 a) { return (int32_t)_mm_cvtsi128_si32(a); }
SIMD_INLINE int32_t v64_high_s32(v64 a) {
return (int32_t)_mm_cvtsi128_si32(_mm_srli_si128(a, 4));
}
SIMD_INLINE v64 v64_from_16(uint16_t a, uint16_t b, uint16_t c, uint16_t d) {
return _mm_packs_epi32(
_mm_set_epi32((int16_t)a, (int16_t)b, (int16_t)c, (int16_t)d),
_mm_setzero_si128());
}
SIMD_INLINE v64 v64_from_32(uint32_t x, uint32_t y) {
return _mm_set_epi32(0, 0, x, y);
}
SIMD_INLINE v64 v64_from_64(uint64_t x) {
#ifdef __x86_64__
return _mm_cvtsi64_si128(x);
#else
return _mm_set_epi32(0, 0, x >> 32, (uint32_t)x);
#endif
}
SIMD_INLINE uint64_t v64_u64(v64 x) {
return (uint64_t)v64_low_u32(x) | ((uint64_t)v64_high_u32(x) << 32);
}
SIMD_INLINE uint32_t u32_load_aligned(const void *p) {
return *((uint32_t *)p);
}
SIMD_INLINE uint32_t u32_load_unaligned(const void *p) {
return *((uint32_t *)p);
}
SIMD_INLINE void u32_store_aligned(void *p, uint32_t a) {
*((uint32_t *)p) = a;
}
SIMD_INLINE void u32_store_unaligned(void *p, uint32_t a) {
*((uint32_t *)p) = a;
}
SIMD_INLINE v64 v64_load_aligned(const void *p) {
return _mm_loadl_epi64((__m128i *)p);
}
SIMD_INLINE v64 v64_load_unaligned(const void *p) {
return _mm_loadl_epi64((__m128i *)p);
}
SIMD_INLINE void v64_store_aligned(void *p, v64 a) {
_mm_storel_epi64((__m128i *)p, a);
}
SIMD_INLINE void v64_store_unaligned(void *p, v64 a) {
_mm_storel_epi64((__m128i *)p, a);
}
// The following function requires an immediate.
#if __OPTIMIZE__
#define v64_align(a, b, c) \
((c) ? _mm_srli_si128(_mm_unpacklo_epi64(b, a), (c)) : b)
#else
#define v64_align(a, b, c) \
((c) ? v64_from_64((v64_u64(b) >> (c)*8) | (v64_u64(a) << (8 - (c)) * 8)) \
: (b))
#endif
SIMD_INLINE v64 v64_zero() { return _mm_setzero_si128(); }
SIMD_INLINE v64 v64_dup_8(uint8_t x) { return _mm_set1_epi8(x); }
SIMD_INLINE v64 v64_dup_16(uint16_t x) { return _mm_set1_epi16(x); }
SIMD_INLINE v64 v64_dup_32(uint32_t x) { return _mm_set1_epi32(x); }
SIMD_INLINE v64 v64_add_8(v64 a, v64 b) { return _mm_add_epi8(a, b); }
SIMD_INLINE v64 v64_add_16(v64 a, v64 b) { return _mm_add_epi16(a, b); }
SIMD_INLINE v64 v64_sadd_s16(v64 a, v64 b) { return _mm_adds_epi16(a, b); }
SIMD_INLINE v64 v64_add_32(v64 a, v64 b) { return _mm_add_epi32(a, b); }
SIMD_INLINE v64 v64_sub_8(v64 a, v64 b) { return _mm_sub_epi8(a, b); }
SIMD_INLINE v64 v64_ssub_u8(v64 a, v64 b) { return _mm_subs_epu8(a, b); }
SIMD_INLINE v64 v64_ssub_s8(v64 a, v64 b) { return _mm_subs_epi8(a, b); }
SIMD_INLINE v64 v64_sub_16(v64 a, v64 b) { return _mm_sub_epi16(a, b); }
SIMD_INLINE v64 v64_ssub_s16(v64 a, v64 b) { return _mm_subs_epi16(a, b); }
SIMD_INLINE v64 v64_sub_32(v64 a, v64 b) { return _mm_sub_epi32(a, b); }
SIMD_INLINE v64 v64_abs_s16(v64 a) {
#if defined(__SSSE3__)
return _mm_abs_epi16(a);
#else
return _mm_max_epi16(a, _mm_sub_epi16(_mm_setzero_si128(), a));
#endif
}
SIMD_INLINE v64 v64_ziplo_8(v64 a, v64 b) { return _mm_unpacklo_epi8(b, a); }
SIMD_INLINE v64 v64_ziphi_8(v64 a, v64 b) {
return _mm_srli_si128(_mm_unpacklo_epi8(b, a), 8);
}
SIMD_INLINE v64 v64_ziplo_16(v64 a, v64 b) { return _mm_unpacklo_epi16(b, a); }
SIMD_INLINE v64 v64_ziphi_16(v64 a, v64 b) {
return _mm_srli_si128(_mm_unpacklo_epi16(b, a), 8);
}
SIMD_INLINE v64 v64_ziplo_32(v64 a, v64 b) { return _mm_unpacklo_epi32(b, a); }
SIMD_INLINE v64 v64_ziphi_32(v64 a, v64 b) {
return _mm_srli_si128(_mm_unpacklo_epi32(b, a), 8);
}
SIMD_INLINE v64 v64_pack_s32_s16(v64 a, v64 b) {
__m128i t = _mm_unpacklo_epi64(b, a);
return _mm_packs_epi32(t, t);
}
SIMD_INLINE v64 v64_pack_s16_u8(v64 a, v64 b) {
__m128i t = _mm_unpacklo_epi64(b, a);
return _mm_packus_epi16(t, t);
}
SIMD_INLINE v64 v64_pack_s16_s8(v64 a, v64 b) {
__m128i t = _mm_unpacklo_epi64(b, a);
return _mm_packs_epi16(t, t);
}
SIMD_INLINE v64 v64_unziphi_8(v64 a, v64 b) {
#if defined(__SSSE3__)
return _mm_shuffle_epi8(_mm_unpacklo_epi64(b, a),
v64_from_64(0x0f0d0b0907050301LL));
#else
return _mm_packus_epi16(
_mm_unpacklo_epi64(_mm_srli_epi16(b, 8), _mm_srli_epi16(a, 8)),
_mm_setzero_si128());
#endif
}
SIMD_INLINE v64 v64_unziplo_8(v64 a, v64 b) {
#if defined(__SSSE3__)
return _mm_shuffle_epi8(_mm_unpacklo_epi64(b, a),
v64_from_64(0x0e0c0a0806040200LL));
#else
return v64_unziphi_8(_mm_slli_si128(a, 1), _mm_slli_si128(b, 1));
#endif
}
SIMD_INLINE v64 v64_unziphi_16(v64 a, v64 b) {
#if defined(__SSSE3__)
return _mm_shuffle_epi8(_mm_unpacklo_epi64(b, a),
v64_from_64(0x0f0e0b0a07060302LL));
#else
return _mm_packs_epi32(
_mm_unpacklo_epi64(_mm_srai_epi32(b, 16), _mm_srai_epi32(a, 16)),
_mm_setzero_si128());
#endif
}
SIMD_INLINE v64 v64_unziplo_16(v64 a, v64 b) {
#if defined(__SSSE3__)
return _mm_shuffle_epi8(_mm_unpacklo_epi64(b, a),
v64_from_64(0x0d0c090805040100LL));
#else
return v64_unziphi_16(_mm_slli_si128(a, 2), _mm_slli_si128(b, 2));
#endif
}
SIMD_INLINE v64 v64_unpacklo_u8_s16(v64 a) {
return _mm_unpacklo_epi8(a, _mm_setzero_si128());
}
SIMD_INLINE v64 v64_unpackhi_u8_s16(v64 a) {
return _mm_srli_si128(_mm_unpacklo_epi8(a, _mm_setzero_si128()), 8);
}
SIMD_INLINE v64 v64_unpacklo_u16_s32(v64 a) {
return _mm_unpacklo_epi16(a, _mm_setzero_si128());
}
SIMD_INLINE v64 v64_unpacklo_s16_s32(v64 a) {
return _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), a), 16);
}
SIMD_INLINE v64 v64_unpackhi_u16_s32(v64 a) {
return _mm_srli_si128(_mm_unpacklo_epi16(a, _mm_setzero_si128()), 8);
}
SIMD_INLINE v64 v64_unpackhi_s16_s32(v64 a) {
return _mm_srli_si128(
_mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), a), 16), 8);
}
SIMD_INLINE v64 v64_shuffle_8(v64 x, v64 pattern) {
#if defined(__SSSE3__)
return _mm_shuffle_epi8(x, pattern);
#else
v64 output;
unsigned char *input = (unsigned char *)&x;
unsigned char *index = (unsigned char *)&pattern;
char *selected = (char *)&output;
int counter;
for (counter = 0; counter < 8; counter++) {
selected[counter] = input[index[counter]];
}
return output;
#endif
}
SIMD_INLINE int64_t v64_dotp_su8(v64 a, v64 b) {
__m128i r, r1, r2, z;
z = _mm_setzero_si128();
r1 = _mm_madd_epi16(_mm_slli_epi16(_mm_unpacklo_epi8(a, z), 8),
_mm_unpacklo_epi8(b, z));
r2 = _mm_srli_si128(r1, 8);
r = _mm_add_epi32(r1, r2);
r = _mm_add_epi32(r, _mm_srli_si128(r, 4));
return ((int32_t)v64_low_u32(r)) >> 8;
}
SIMD_INLINE int64_t v64_dotp_s16(v64 a, v64 b) {
__m128i r = _mm_madd_epi16(a, b);
#if defined(__SSE4_1__) && defined(__x86_64__)
__m128i x = _mm_cvtepi32_epi64(r);
return _mm_cvtsi128_si64(_mm_add_epi64(x, _mm_srli_si128(x, 8)));
#else
return (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 4)) +
(int64_t)_mm_cvtsi128_si32(r);
#endif
}
SIMD_INLINE uint64_t v64_hadd_u8(v64 a) {
return v64_low_u32(_mm_sad_epu8(a, _mm_setzero_si128()));
}
SIMD_INLINE int64_t v64_hadd_s16(v64 a) {
return v64_dotp_s16(a, v64_dup_16(1));
}
typedef v64 sad64_internal;
SIMD_INLINE sad64_internal v64_sad_u8_init() { return _mm_setzero_si128(); }
/* Implementation dependent return value. Result must be finalised with
v64_sad_u8_sum().
The result for more than 32 v64_sad_u8() calls is undefined. */
SIMD_INLINE sad64_internal v64_sad_u8(sad64_internal s, v64 a, v64 b) {
return _mm_add_epi64(s, _mm_sad_epu8(a, b));
}
SIMD_INLINE uint32_t v64_sad_u8_sum(sad64_internal s) { return v64_low_u32(s); }
typedef v64 ssd64_internal;
SIMD_INLINE ssd64_internal v64_ssd_u8_init() { return _mm_setzero_si128(); }
/* Implementation dependent return value. Result must be finalised with
* v64_ssd_u8_sum(). */
SIMD_INLINE ssd64_internal v64_ssd_u8(ssd64_internal s, v64 a, v64 b) {
v64 l = v64_sub_16(v64_ziplo_8(v64_zero(), a), v64_ziplo_8(v64_zero(), b));
v64 h = v64_sub_16(v64_ziphi_8(v64_zero(), a), v64_ziphi_8(v64_zero(), b));
v64 r = v64_add_32(_mm_madd_epi16(l, l), _mm_madd_epi16(h, h));
return _mm_add_epi64(
s, v64_ziplo_32(v64_zero(), _mm_add_epi32(r, _mm_srli_si128(r, 4))));
}
SIMD_INLINE uint32_t v64_ssd_u8_sum(ssd64_internal s) { return v64_low_u32(s); }
SIMD_INLINE v64 v64_or(v64 a, v64 b) { return _mm_or_si128(a, b); }
SIMD_INLINE v64 v64_xor(v64 a, v64 b) { return _mm_xor_si128(a, b); }
SIMD_INLINE v64 v64_and(v64 a, v64 b) { return _mm_and_si128(a, b); }
SIMD_INLINE v64 v64_andn(v64 a, v64 b) { return _mm_andnot_si128(b, a); }
SIMD_INLINE v64 v64_mullo_s16(v64 a, v64 b) { return _mm_mullo_epi16(a, b); }
SIMD_INLINE v64 v64_mulhi_s16(v64 a, v64 b) { return _mm_mulhi_epi16(a, b); }
SIMD_INLINE v64 v64_mullo_s32(v64 a, v64 b) {
#if defined(__SSE4_1__)
return _mm_mullo_epi32(a, b);
#else
return _mm_unpacklo_epi32(
_mm_mul_epu32(a, b),
_mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4)));
#endif
}
SIMD_INLINE v64 v64_madd_s16(v64 a, v64 b) { return _mm_madd_epi16(a, b); }
SIMD_INLINE v64 v64_madd_us8(v64 a, v64 b) {
#if defined(__SSSE3__)
return _mm_maddubs_epi16(a, b);
#else
__m128i t = _mm_madd_epi16(_mm_unpacklo_epi8(a, _mm_setzero_si128()),
_mm_srai_epi16(_mm_unpacklo_epi8(b, b), 8));
return _mm_packs_epi32(t, t);
#endif
}
SIMD_INLINE v64 v64_avg_u8(v64 a, v64 b) { return _mm_avg_epu8(a, b); }
SIMD_INLINE v64 v64_rdavg_u8(v64 a, v64 b) {
return _mm_sub_epi8(_mm_avg_epu8(a, b),
_mm_and_si128(_mm_xor_si128(a, b), v64_dup_8(1)));
}
SIMD_INLINE v64 v64_avg_u16(v64 a, v64 b) { return _mm_avg_epu16(a, b); }
SIMD_INLINE v64 v64_min_u8(v64 a, v64 b) { return _mm_min_epu8(a, b); }
SIMD_INLINE v64 v64_max_u8(v64 a, v64 b) { return _mm_max_epu8(a, b); }
SIMD_INLINE v64 v64_min_s8(v64 a, v64 b) {
#if defined(__SSE4_1__)
return _mm_min_epi8(a, b);
#else
v64 mask = _mm_cmplt_epi8(a, b);
return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a));
#endif
}
SIMD_INLINE v64 v64_max_s8(v64 a, v64 b) {
#if defined(__SSE4_1__)
return _mm_max_epi8(a, b);
#else
v64 mask = _mm_cmplt_epi8(b, a);
return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a));
#endif
}
SIMD_INLINE v64 v64_min_s16(v64 a, v64 b) { return _mm_min_epi16(a, b); }
SIMD_INLINE v64 v64_max_s16(v64 a, v64 b) { return _mm_max_epi16(a, b); }
SIMD_INLINE v64 v64_cmpgt_s8(v64 a, v64 b) { return _mm_cmpgt_epi8(a, b); }
SIMD_INLINE v64 v64_cmplt_s8(v64 a, v64 b) { return _mm_cmplt_epi8(a, b); }
SIMD_INLINE v64 v64_cmpeq_8(v64 a, v64 b) { return _mm_cmpeq_epi8(a, b); }
SIMD_INLINE v64 v64_cmpgt_s16(v64 a, v64 b) { return _mm_cmpgt_epi16(a, b); }
SIMD_INLINE v64 v64_cmplt_s16(v64 a, v64 b) { return _mm_cmplt_epi16(a, b); }
SIMD_INLINE v64 v64_cmpeq_16(v64 a, v64 b) { return _mm_cmpeq_epi16(a, b); }
SIMD_INLINE v64 v64_shl_8(v64 a, unsigned int c) {
return _mm_and_si128(_mm_set1_epi8((uint8_t)(0xff << c)),
_mm_sll_epi16(a, _mm_cvtsi32_si128(c)));
}
SIMD_INLINE v64 v64_shr_u8(v64 a, unsigned int c) {
return _mm_and_si128(_mm_set1_epi8(0xff >> c),
_mm_srl_epi16(a, _mm_cvtsi32_si128(c)));
}
SIMD_INLINE v64 v64_shr_s8(v64 a, unsigned int c) {
return _mm_packs_epi16(
_mm_sra_epi16(_mm_unpacklo_epi8(a, a), _mm_cvtsi32_si128(c + 8)), a);
}
SIMD_INLINE v64 v64_shl_16(v64 a, unsigned int c) {
return _mm_sll_epi16(a, _mm_cvtsi32_si128(c));
}
SIMD_INLINE v64 v64_shr_u16(v64 a, unsigned int c) {
return _mm_srl_epi16(a, _mm_cvtsi32_si128(c));
}
SIMD_INLINE v64 v64_shr_s16(v64 a, unsigned int c) {
return _mm_sra_epi16(a, _mm_cvtsi32_si128(c));
}
SIMD_INLINE v64 v64_shl_32(v64 a, unsigned int c) {
return _mm_sll_epi32(a, _mm_cvtsi32_si128(c));
}
SIMD_INLINE v64 v64_shr_u32(v64 a, unsigned int c) {
return _mm_srl_epi32(a, _mm_cvtsi32_si128(c));
}
SIMD_INLINE v64 v64_shr_s32(v64 a, unsigned int c) {
return _mm_sra_epi32(a, _mm_cvtsi32_si128(c));
}
/* These intrinsics require immediate values, so we must use #defines
to enforce that. */
#define v64_shl_n_byte(a, c) _mm_slli_si128(a, c)
#define v64_shr_n_byte(a, c) _mm_srli_si128(_mm_unpacklo_epi64(a, a), (c) + 8)
#define v64_shl_n_8(a, c) \
_mm_and_si128(_mm_set1_epi8((uint8_t)(0xff << (c))), _mm_slli_epi16(a, c))
#define v64_shr_n_u8(a, c) \
_mm_and_si128(_mm_set1_epi8(0xff >> (c)), _mm_srli_epi16(a, c))
#define v64_shr_n_s8(a, c) \
_mm_packs_epi16(_mm_srai_epi16(_mm_unpacklo_epi8(a, a), (c) + 8), a)
#define v64_shl_n_16(a, c) _mm_slli_epi16(a, c)
#define v64_shr_n_u16(a, c) _mm_srli_epi16(a, c)
#define v64_shr_n_s16(a, c) _mm_srai_epi16(a, c)
#define v64_shl_n_32(a, c) _mm_slli_epi32(a, c)
#define v64_shr_n_u32(a, c) _mm_srli_epi32(a, c)
#define v64_shr_n_s32(a, c) _mm_srai_epi32(a, c)
#endif /* _V64_INTRINSICS_H */
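A note on v64_align(a, b, c), which is easy to misread: it returns the 8-byte window starting c bytes into b, i.e. bytes c..7 of b followed by bytes 0..c-1 of a. A sketch of extracting an unaligned window from two aligned loads (the helper is hypothetical):

static v64 load_offset2(const uint8_t *p) { /* p 8-byte aligned */
  v64 lo = v64_load_aligned(p);     /* bytes p[0..7]  */
  v64 hi = v64_load_aligned(p + 8); /* bytes p[8..15] */
  return v64_align(hi, lo, 2);      /* bytes p[2..9]; count is an immediate */
}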

File diff suppressed because it is too large


@@ -1,124 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifndef AOM_DSP_VARIANCE_H_
#define AOM_DSP_VARIANCE_H_
#include "./aom_config.h"
#include "aom/aom_integer.h"
#ifdef __cplusplus
extern "C" {
#endif
#define FILTER_BITS 7
#define FILTER_WEIGHT 128
typedef unsigned int (*aom_sad_fn_t)(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride);
typedef unsigned int (*aom_sad_avg_fn_t)(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
const uint8_t *second_pred);
typedef void (*aom_copy32xn_fn_t)(const uint8_t *a, int a_stride, uint8_t *b,
int b_stride, int n);
typedef void (*aom_sad_multi_fn_t)(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
unsigned int *sad_array);
typedef void (*aom_sad_multi_d_fn_t)(const uint8_t *a, int a_stride,
const uint8_t *const b_array[],
int b_stride, unsigned int *sad_array);
typedef unsigned int (*aom_variance_fn_t)(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
unsigned int *sse);
typedef unsigned int (*aom_subpixvariance_fn_t)(const uint8_t *a, int a_stride,
int xoffset, int yoffset,
const uint8_t *b, int b_stride,
unsigned int *sse);
typedef unsigned int (*aom_subp_avg_variance_fn_t)(
const uint8_t *a, int a_stride, int xoffset, int yoffset, const uint8_t *b,
int b_stride, unsigned int *sse, const uint8_t *second_pred);
#if CONFIG_AV1 && CONFIG_EXT_INTER
typedef unsigned int (*aom_masked_sad_fn_t)(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
const uint8_t *msk_ptr,
int msk_stride);
typedef unsigned int (*aom_masked_variance_fn_t)(
const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride,
const uint8_t *msk, int msk_stride, unsigned int *sse);
typedef unsigned int (*aom_masked_subpixvariance_fn_t)(
const uint8_t *src, int src_stride, int xoffset, int yoffset,
const uint8_t *ref, int ref_stride, const uint8_t *msk, int msk_stride,
unsigned int *sse);
#endif // CONFIG_AV1 && CONFIG_EXT_INTER
#if CONFIG_AV1 && CONFIG_MOTION_VAR
typedef unsigned int (*aom_obmc_sad_fn_t)(const uint8_t *pred, int pred_stride,
const int32_t *wsrc,
const int32_t *msk);
typedef unsigned int (*aom_obmc_variance_fn_t)(const uint8_t *pred,
int pred_stride,
const int32_t *wsrc,
const int32_t *msk,
unsigned int *sse);
typedef unsigned int (*aom_obmc_subpixvariance_fn_t)(
const uint8_t *pred, int pred_stride, int xoffset, int yoffset,
const int32_t *wsrc, const int32_t *msk, unsigned int *sse);
#endif // CONFIG_AV1 && CONFIG_MOTION_VAR
#if CONFIG_AV1
typedef struct aom_variance_vtable {
aom_sad_fn_t sdf;
aom_sad_avg_fn_t sdaf;
aom_variance_fn_t vf;
aom_subpixvariance_fn_t svf;
aom_subp_avg_variance_fn_t svaf;
aom_sad_multi_fn_t sdx3f;
aom_sad_multi_fn_t sdx8f;
aom_sad_multi_d_fn_t sdx4df;
#if CONFIG_EXT_INTER
aom_masked_sad_fn_t msdf;
aom_masked_variance_fn_t mvf;
aom_masked_subpixvariance_fn_t msvf;
#endif // CONFIG_EXT_INTER
#if CONFIG_MOTION_VAR
aom_obmc_sad_fn_t osdf;
aom_obmc_variance_fn_t ovf;
aom_obmc_subpixvariance_fn_t osvf;
#endif // CONFIG_MOTION_VAR
} aom_variance_fn_ptr_t;
#endif // CONFIG_AV1
void aom_highbd_var_filter_block2d_bil_first_pass(
const uint8_t *src_ptr8, uint16_t *output_ptr,
unsigned int src_pixels_per_line, int pixel_step,
unsigned int output_height, unsigned int output_width,
const uint8_t *filter);
void aom_highbd_var_filter_block2d_bil_second_pass(
const uint16_t *src_ptr, uint16_t *output_ptr,
unsigned int src_pixels_per_line, unsigned int pixel_step,
unsigned int output_height, unsigned int output_width,
const uint8_t *filter);
#ifdef __cplusplus
} // extern "C"
#endif
#endif // AOM_DSP_VARIANCE_H_
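For context, a sketch of how the encoder consumes this table: one aom_variance_fn_ptr_t per block size is filled with the matching kernels, and callers dispatch through the fields rather than naming a kernel directly. The wrapper below is a hypothetical illustration, not an API from this header.

static unsigned int candidate_sse(const aom_variance_fn_ptr_t *fn,
                                  const uint8_t *src, int src_stride,
                                  const uint8_t *ref, int ref_stride) {
  unsigned int sse;
  fn->vf(src, src_stride, ref, ref_stride, &sse); /* aom_variance_fn_t */
  return sse;
}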


@@ -1,36 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include "aom/aom_integer.h"
#include "./aom_dsp_rtcd.h"
// To start out, just dispatch to the function using the 2D mask and
// pass mask stride as 0. This can be improved upon if necessary.
void aom_blend_a64_hmask_sse4_1(uint8_t *dst, uint32_t dst_stride,
const uint8_t *src0, uint32_t src0_stride,
const uint8_t *src1, uint32_t src1_stride,
const uint8_t *mask, int h, int w) {
aom_blend_a64_mask_sse4_1(dst, dst_stride, src0, src0_stride, src1,
src1_stride, mask, 0, h, w, 0, 0);
}
#if CONFIG_AOM_HIGHBITDEPTH
void aom_highbd_blend_a64_hmask_sse4_1(
uint8_t *dst_8, uint32_t dst_stride, const uint8_t *src0_8,
uint32_t src0_stride, const uint8_t *src1_8, uint32_t src1_stride,
const uint8_t *mask, int h, int w, int bd) {
aom_highbd_blend_a64_mask_sse4_1(dst_8, dst_stride, src0_8, src0_stride,
src1_8, src1_stride, mask, 0, h, w, 0, 0,
bd);
}
#endif // CONFIG_AOM_HIGHBITDEPTH
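A scalar sketch of what the mask_stride == 0 dispatch above computes: every output row re-reads the same one-dimensional mask, so the horizontal-mask blend is the 2-D kernel with its mask row replicated. This assumes the usual A64 blend definition, (m * a + (64 - m) * b + 32) >> 6, with AOM_BLEND_A64_MAX_ALPHA == 64.

static void blend_a64_hmask_ref(uint8_t *dst, uint32_t dst_stride,
                                const uint8_t *src0, uint32_t src0_stride,
                                const uint8_t *src1, uint32_t src1_stride,
                                const uint8_t *mask, int h, int w) {
  int i, j;
  for (i = 0; i < h; i++)
    for (j = 0; j < w; j++)
      dst[i * dst_stride + j] =
          (uint8_t)((mask[j] * src0[i * src0_stride + j] +
                     (64 - mask[j]) * src1[i * src1_stride + j] + 32) >> 6);
}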


@@ -1,924 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <smmintrin.h> // SSE4.1
#include <assert.h>
#include "aom/aom_integer.h"
#include "aom_ports/mem.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/blend.h"
#include "aom_dsp/x86/synonyms.h"
#include "aom_dsp/x86/blend_sse4.h"
#include "./aom_dsp_rtcd.h"
//////////////////////////////////////////////////////////////////////////////
// No sub-sampling
//////////////////////////////////////////////////////////////////////////////
static void blend_a64_mask_w4_sse4_1(uint8_t *dst, uint32_t dst_stride,
const uint8_t *src0, uint32_t src0_stride,
const uint8_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride,
int h, int w) {
const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
(void)w;
do {
const __m128i v_m0_b = xx_loadl_32(mask);
const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b);
const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w);
const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
xx_storel_32(dst, v_res_b);
dst += dst_stride;
src0 += src0_stride;
src1 += src1_stride;
mask += mask_stride;
} while (--h);
}
static void blend_a64_mask_w8_sse4_1(uint8_t *dst, uint32_t dst_stride,
const uint8_t *src0, uint32_t src0_stride,
const uint8_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride,
int h, int w) {
const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
(void)w;
do {
const __m128i v_m0_b = xx_loadl_64(mask);
const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b);
const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w);
const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
xx_storel_64(dst, v_res_b);
dst += dst_stride;
src0 += src0_stride;
src1 += src1_stride;
mask += mask_stride;
} while (--h);
}
static void blend_a64_mask_w16n_sse4_1(
uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride, int h, int w) {
const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
do {
int c;
for (c = 0; c < w; c += 16) {
const __m128i v_m0l_b = xx_loadl_64(mask + c);
const __m128i v_m0h_b = xx_loadl_64(mask + c + 8);
const __m128i v_m0l_w = _mm_cvtepu8_epi16(v_m0l_b);
const __m128i v_m0h_w = _mm_cvtepu8_epi16(v_m0h_b);
const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w);
const __m128i v_m1h_w = _mm_sub_epi16(v_maxval_w, v_m0h_w);
const __m128i v_resl_w = blend_8(src0 + c, src1 + c, v_m0l_w, v_m1l_w);
const __m128i v_resh_w =
blend_8(src0 + c + 8, src1 + c + 8, v_m0h_w, v_m1h_w);
const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);
xx_storeu_128(dst + c, v_res_b);
}
dst += dst_stride;
src0 += src0_stride;
src1 += src1_stride;
mask += mask_stride;
} while (--h);
}
//////////////////////////////////////////////////////////////////////////////
// Horizontal sub-sampling
//////////////////////////////////////////////////////////////////////////////
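// With horizontal sub-sampling the mask row is twice the output width, so
// each kernel averages adjacent mask pairs: _mm_avg_epu8 against a one-byte
// shift computes (m[2c] + m[2c + 1] + 1) >> 1 in every byte, and v_zmask_b
// keeps the even-indexed results as 16-bit lanes.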
static void blend_a64_mask_sx_w4_sse4_1(
uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride, int h, int w) {
const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
0xff, 0, 0xff, 0, 0xff, 0, 0xff);
const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
(void)w;
do {
const __m128i v_r_b = xx_loadl_64(mask);
const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1));
const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b);
const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w);
const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
xx_storel_32(dst, v_res_b);
dst += dst_stride;
src0 += src0_stride;
src1 += src1_stride;
mask += mask_stride;
} while (--h);
}
static void blend_a64_mask_sx_w8_sse4_1(
uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride, int h, int w) {
const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
0xff, 0, 0xff, 0, 0xff, 0, 0xff);
const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
(void)w;
do {
const __m128i v_r_b = xx_loadu_128(mask);
const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1));
const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b);
const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w);
const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
xx_storel_64(dst, v_res_b);
dst += dst_stride;
src0 += src0_stride;
src1 += src1_stride;
mask += mask_stride;
} while (--h);
}
static void blend_a64_mask_sx_w16n_sse4_1(
uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride, int h, int w) {
const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
0xff, 0, 0xff, 0, 0xff, 0, 0xff);
const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
do {
int c;
for (c = 0; c < w; c += 16) {
const __m128i v_rl_b = xx_loadu_128(mask + 2 * c);
const __m128i v_rh_b = xx_loadu_128(mask + 2 * c + 16);
const __m128i v_al_b = _mm_avg_epu8(v_rl_b, _mm_srli_si128(v_rl_b, 1));
const __m128i v_ah_b = _mm_avg_epu8(v_rh_b, _mm_srli_si128(v_rh_b, 1));
const __m128i v_m0l_w = _mm_and_si128(v_al_b, v_zmask_b);
const __m128i v_m0h_w = _mm_and_si128(v_ah_b, v_zmask_b);
const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w);
const __m128i v_m1h_w = _mm_sub_epi16(v_maxval_w, v_m0h_w);
const __m128i v_resl_w = blend_8(src0 + c, src1 + c, v_m0l_w, v_m1l_w);
const __m128i v_resh_w =
blend_8(src0 + c + 8, src1 + c + 8, v_m0h_w, v_m1h_w);
const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);
xx_storeu_128(dst + c, v_res_b);
}
dst += dst_stride;
src0 += src0_stride;
src1 += src1_stride;
mask += mask_stride;
} while (--h);
}
//////////////////////////////////////////////////////////////////////////////
// Vertical sub-sampling
//////////////////////////////////////////////////////////////////////////////
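// With vertical sub-sampling the mask has twice as many rows as the output,
// so each kernel averages two consecutive mask rows with _mm_avg_epu8
// ((a + b + 1) >> 1) and advances the mask pointer by 2 * mask_stride per
// output row.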
static void blend_a64_mask_sy_w4_sse4_1(
uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride, int h, int w) {
const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
(void)w;
do {
const __m128i v_ra_b = xx_loadl_32(mask);
const __m128i v_rb_b = xx_loadl_32(mask + mask_stride);
const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);
const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b);
const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w);
const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
xx_storel_32(dst, v_res_b);
dst += dst_stride;
src0 += src0_stride;
src1 += src1_stride;
mask += 2 * mask_stride;
} while (--h);
}
static void blend_a64_mask_sy_w8_sse4_1(
uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride, int h, int w) {
const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
(void)w;
do {
const __m128i v_ra_b = xx_loadl_64(mask);
const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);
const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b);
const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w);
const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
xx_storel_64(dst, v_res_b);
dst += dst_stride;
src0 += src0_stride;
src1 += src1_stride;
mask += 2 * mask_stride;
} while (--h);
}
static void blend_a64_mask_sy_w16n_sse4_1(
uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride, int h, int w) {
const __m128i v_zero = _mm_setzero_si128();
const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
do {
int c;
for (c = 0; c < w; c += 16) {
const __m128i v_ra_b = xx_loadu_128(mask + c);
const __m128i v_rb_b = xx_loadu_128(mask + c + mask_stride);
const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);
const __m128i v_m0l_w = _mm_cvtepu8_epi16(v_a_b);
const __m128i v_m0h_w = _mm_unpackhi_epi8(v_a_b, v_zero);
const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w);
const __m128i v_m1h_w = _mm_sub_epi16(v_maxval_w, v_m0h_w);
const __m128i v_resl_w = blend_8(src0 + c, src1 + c, v_m0l_w, v_m1l_w);
const __m128i v_resh_w =
blend_8(src0 + c + 8, src1 + c + 8, v_m0h_w, v_m1h_w);
const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);
xx_storeu_128(dst + c, v_res_b);
}
dst += dst_stride;
src0 += src0_stride;
src1 += src1_stride;
mask += 2 * mask_stride;
} while (--h);
}
//////////////////////////////////////////////////////////////////////////////
// Horizontal and Vertical sub-sampling
//////////////////////////////////////////////////////////////////////////////
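// With sub-sampling in both directions each output pixel covers four mask
// samples. Two mask rows are summed bytewise (safe, since A64 mask values
// are at most 64), even/odd horizontal pairs are then added as 16-bit
// lanes, and xx_roundn_epu16(sum, 2) yields the rounded average
// (sum + 2) >> 2.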
static void blend_a64_mask_sx_sy_w4_sse4_1(
uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride, int h, int w) {
const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
0xff, 0, 0xff, 0, 0xff, 0, 0xff);
const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
(void)w;
do {
const __m128i v_ra_b = xx_loadl_64(mask);
const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b);
const __m128i v_rvsb_w =
_mm_and_si128(_mm_srli_si128(v_rvs_b, 1), v_zmask_b);
const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w);
const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w);
const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
xx_storel_32(dst, v_res_b);
dst += dst_stride;
src0 += src0_stride;
src1 += src1_stride;
mask += 2 * mask_stride;
} while (--h);
}
static void blend_a64_mask_sx_sy_w8_sse4_1(
uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride, int h, int w) {
const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
0xff, 0, 0xff, 0, 0xff, 0, 0xff);
const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
(void)w;
do {
const __m128i v_ra_b = xx_loadu_128(mask);
const __m128i v_rb_b = xx_loadu_128(mask + mask_stride);
const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b);
const __m128i v_rvsb_w =
_mm_and_si128(_mm_srli_si128(v_rvs_b, 1), v_zmask_b);
const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w);
const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w);
const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
xx_storel_64(dst, v_res_b);
dst += dst_stride;
src0 += src0_stride;
src1 += src1_stride;
mask += 2 * mask_stride;
} while (--h);
}
static void blend_a64_mask_sx_sy_w16n_sse4_1(
uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride, int h, int w) {
const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
0xff, 0, 0xff, 0, 0xff, 0, 0xff);
const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
do {
int c;
for (c = 0; c < w; c += 16) {
const __m128i v_ral_b = xx_loadu_128(mask + 2 * c);
const __m128i v_rah_b = xx_loadu_128(mask + 2 * c + 16);
const __m128i v_rbl_b = xx_loadu_128(mask + mask_stride + 2 * c);
const __m128i v_rbh_b = xx_loadu_128(mask + mask_stride + 2 * c + 16);
const __m128i v_rvsl_b = _mm_add_epi8(v_ral_b, v_rbl_b);
const __m128i v_rvsh_b = _mm_add_epi8(v_rah_b, v_rbh_b);
const __m128i v_rvsal_w = _mm_and_si128(v_rvsl_b, v_zmask_b);
const __m128i v_rvsah_w = _mm_and_si128(v_rvsh_b, v_zmask_b);
const __m128i v_rvsbl_w =
_mm_and_si128(_mm_srli_si128(v_rvsl_b, 1), v_zmask_b);
const __m128i v_rvsbh_w =
_mm_and_si128(_mm_srli_si128(v_rvsh_b, 1), v_zmask_b);
const __m128i v_rsl_w = _mm_add_epi16(v_rvsal_w, v_rvsbl_w);
const __m128i v_rsh_w = _mm_add_epi16(v_rvsah_w, v_rvsbh_w);
const __m128i v_m0l_w = xx_roundn_epu16(v_rsl_w, 2);
const __m128i v_m0h_w = xx_roundn_epu16(v_rsh_w, 2);
const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w);
const __m128i v_m1h_w = _mm_sub_epi16(v_maxval_w, v_m0h_w);
const __m128i v_resl_w = blend_8(src0 + c, src1 + c, v_m0l_w, v_m1l_w);
const __m128i v_resh_w =
blend_8(src0 + c + 8, src1 + c + 8, v_m0h_w, v_m1h_w);
const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);
xx_storeu_128(dst + c, v_res_b);
}
dst += dst_stride;
src0 += src0_stride;
src1 += src1_stride;
mask += 2 * mask_stride;
} while (--h);
}
//////////////////////////////////////////////////////////////////////////////
// Dispatch
//////////////////////////////////////////////////////////////////////////////
void aom_blend_a64_mask_sse4_1(uint8_t *dst, uint32_t dst_stride,
const uint8_t *src0, uint32_t src0_stride,
const uint8_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride, int h,
int w, int suby, int subx) {
typedef void (*blend_fn)(
uint8_t * dst, uint32_t dst_stride, const uint8_t *src0,
uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride, int h, int w);
// Dimensions are: width_index X subx X suby
static const blend_fn blend[3][2][2] = {
{ // w % 16 == 0
{ blend_a64_mask_w16n_sse4_1, blend_a64_mask_sy_w16n_sse4_1 },
{ blend_a64_mask_sx_w16n_sse4_1, blend_a64_mask_sx_sy_w16n_sse4_1 } },
{ // w == 4
{ blend_a64_mask_w4_sse4_1, blend_a64_mask_sy_w4_sse4_1 },
{ blend_a64_mask_sx_w4_sse4_1, blend_a64_mask_sx_sy_w4_sse4_1 } },
{ // w == 8
{ blend_a64_mask_w8_sse4_1, blend_a64_mask_sy_w8_sse4_1 },
{ blend_a64_mask_sx_w8_sse4_1, blend_a64_mask_sx_sy_w8_sse4_1 } }
};
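// For the power-of-two widths the SIMD paths support, (w >> 2) & 3 maps any
// multiple of 16 to row 0, w == 4 to row 1 and w == 8 to row 2 of the table
// above. (h | w) & 3 is nonzero exactly when h or w is 1 or 2, which the
// SIMD kernels do not handle, hence the C fallback.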
assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
assert(h >= 1);
assert(w >= 1);
assert(IS_POWER_OF_TWO(h));
assert(IS_POWER_OF_TWO(w));
if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2)
aom_blend_a64_mask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride,
mask, mask_stride, h, w, suby, subx);
} else {
blend[(w >> 2) & 3][subx != 0][suby != 0](dst, dst_stride, src0,
src0_stride, src1, src1_stride,
mask, mask_stride, h, w);
}
}
#if CONFIG_AOM_HIGHBITDEPTH
//////////////////////////////////////////////////////////////////////////////
// No sub-sampling
//////////////////////////////////////////////////////////////////////////////
static INLINE void blend_a64_mask_bn_w4_sse4_1(
uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) {
const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
do {
const __m128i v_m0_b = xx_loadl_32(mask);
const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b);
const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
xx_storel_64(dst, v_res_w);
dst += dst_stride;
src0 += src0_stride;
src1 += src1_stride;
mask += mask_stride;
} while (--h);
}
static void blend_a64_mask_b10_w4_sse4_1(
uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride, int h, int w) {
(void)w;
blend_a64_mask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
src1_stride, mask, mask_stride, h, blend_4_b10);
}
static void blend_a64_mask_b12_w4_sse4_1(
uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride, int h, int w) {
(void)w;
blend_a64_mask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
src1_stride, mask, mask_stride, h, blend_4_b12);
}
static INLINE void blend_a64_mask_bn_w8n_sse4_1(
uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride, int h, int w,
blend_unit_fn blend) {
const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
do {
int c;
for (c = 0; c < w; c += 8) {
const __m128i v_m0_b = xx_loadl_64(mask + c);
const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b);
const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
xx_storeu_128(dst + c, v_res_w);
}
dst += dst_stride;
src0 += src0_stride;
src1 += src1_stride;
mask += mask_stride;
} while (--h);
}
static void blend_a64_mask_b10_w8n_sse4_1(
uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride, int h, int w) {
blend_a64_mask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
src1_stride, mask, mask_stride, h, w,
blend_8_b10);
}
static void blend_a64_mask_b12_w8n_sse4_1(
uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride, int h, int w) {
blend_a64_mask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
src1_stride, mask, mask_stride, h, w,
blend_8_b12);
}
//////////////////////////////////////////////////////////////////////////////
// Horizontal sub-sampling
//////////////////////////////////////////////////////////////////////////////
static INLINE void blend_a64_mask_bn_sx_w4_sse4_1(
uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) {
const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
0xff, 0, 0xff, 0, 0xff, 0, 0xff);
const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
do {
const __m128i v_r_b = xx_loadl_64(mask);
const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1));
const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b);
const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
xx_storel_64(dst, v_res_w);
dst += dst_stride;
src0 += src0_stride;
src1 += src1_stride;
mask += mask_stride;
} while (--h);
}
static void blend_a64_mask_b10_sx_w4_sse4_1(
uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride, int h, int w) {
(void)w;
blend_a64_mask_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
src1_stride, mask, mask_stride, h,
blend_4_b10);
}
static void blend_a64_mask_b12_sx_w4_sse4_1(
uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride, int h, int w) {
(void)w;
blend_a64_mask_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
src1_stride, mask, mask_stride, h,
blend_4_b12);
}
static INLINE void blend_a64_mask_bn_sx_w8n_sse4_1(
uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride, int h, int w,
blend_unit_fn blend) {
const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
0xff, 0, 0xff, 0, 0xff, 0, 0xff);
const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
do {
int c;
for (c = 0; c < w; c += 8) {
const __m128i v_r_b = xx_loadu_128(mask + 2 * c);
const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1));
const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b);
const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
xx_storeu_128(dst + c, v_res_w);
}
dst += dst_stride;
src0 += src0_stride;
src1 += src1_stride;
mask += mask_stride;
} while (--h);
}
static void blend_a64_mask_b10_sx_w8n_sse4_1(
uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride, int h, int w) {
blend_a64_mask_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
src1_stride, mask, mask_stride, h, w,
blend_8_b10);
}
static void blend_a64_mask_b12_sx_w8n_sse4_1(
uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride, int h, int w) {
blend_a64_mask_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
src1_stride, mask, mask_stride, h, w,
blend_8_b12);
}
//////////////////////////////////////////////////////////////////////////////
// Vertical sub-sampling
//////////////////////////////////////////////////////////////////////////////
static INLINE void blend_a64_mask_bn_sy_w4_sse4_1(
uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) {
const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
do {
const __m128i v_ra_b = xx_loadl_32(mask);
const __m128i v_rb_b = xx_loadl_32(mask + mask_stride);
const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);
const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b);
const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
xx_storel_64(dst, v_res_w);
dst += dst_stride;
src0 += src0_stride;
src1 += src1_stride;
mask += 2 * mask_stride;
} while (--h);
}
static void blend_a64_mask_b10_sy_w4_sse4_1(
uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride, int h, int w) {
(void)w;
blend_a64_mask_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
src1_stride, mask, mask_stride, h,
blend_4_b10);
}
static void blend_a64_mask_b12_sy_w4_sse4_1(
uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride, int h, int w) {
(void)w;
blend_a64_mask_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
src1_stride, mask, mask_stride, h,
blend_4_b12);
}
static INLINE void blend_a64_mask_bn_sy_w8n_sse4_1(
uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride, int h, int w,
blend_unit_fn blend) {
const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
do {
int c;
for (c = 0; c < w; c += 8) {
const __m128i v_ra_b = xx_loadl_64(mask + c);
const __m128i v_rb_b = xx_loadl_64(mask + c + mask_stride);
const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);
const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b);
const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
xx_storeu_128(dst + c, v_res_w);
}
dst += dst_stride;
src0 += src0_stride;
src1 += src1_stride;
mask += 2 * mask_stride;
} while (--h);
}
static void blend_a64_mask_b10_sy_w8n_sse4_1(
uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride, int h, int w) {
blend_a64_mask_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
src1_stride, mask, mask_stride, h, w,
blend_8_b10);
}
static void blend_a64_mask_b12_sy_w8n_sse4_1(
uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride, int h, int w) {
blend_a64_mask_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
src1_stride, mask, mask_stride, h, w,
blend_8_b12);
}
//////////////////////////////////////////////////////////////////////////////
// Horizontal and Vertical sub-sampling
//////////////////////////////////////////////////////////////////////////////
static INLINE void blend_a64_mask_bn_sx_sy_w4_sse4_1(
uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) {
const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
0xff, 0, 0xff, 0, 0xff, 0, 0xff);
const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
do {
const __m128i v_ra_b = xx_loadl_64(mask);
const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b);
const __m128i v_rvsb_w =
_mm_and_si128(_mm_srli_si128(v_rvs_b, 1), v_zmask_b);
const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w);
const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
xx_storel_64(dst, v_res_w);
dst += dst_stride;
src0 += src0_stride;
src1 += src1_stride;
mask += 2 * mask_stride;
} while (--h);
}
static void blend_a64_mask_b10_sx_sy_w4_sse4_1(
uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride, int h, int w) {
(void)w;
blend_a64_mask_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
src1_stride, mask, mask_stride, h,
blend_4_b10);
}
static void blend_a64_mask_b12_sx_sy_w4_sse4_1(
uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride, int h, int w) {
(void)w;
blend_a64_mask_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
src1_stride, mask, mask_stride, h,
blend_4_b12);
}
static INLINE void blend_a64_mask_bn_sx_sy_w8n_sse4_1(
uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride, int h, int w,
blend_unit_fn blend) {
const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
0xff, 0, 0xff, 0, 0xff, 0, 0xff);
const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
do {
int c;
for (c = 0; c < w; c += 8) {
const __m128i v_ra_b = xx_loadu_128(mask + 2 * c);
const __m128i v_rb_b = xx_loadu_128(mask + 2 * c + mask_stride);
const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b);
const __m128i v_rvsb_w =
_mm_and_si128(_mm_srli_si128(v_rvs_b, 1), v_zmask_b);
const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w);
const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
xx_storeu_128(dst + c, v_res_w);
}
dst += dst_stride;
src0 += src0_stride;
src1 += src1_stride;
mask += 2 * mask_stride;
} while (--h);
}
static void blend_a64_mask_b10_sx_sy_w8n_sse4_1(
uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride, int h, int w) {
blend_a64_mask_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
src1_stride, mask, mask_stride, h, w,
blend_8_b10);
}
static void blend_a64_mask_b12_sx_sy_w8n_sse4_1(
uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride, int h, int w) {
blend_a64_mask_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
src1_stride, mask, mask_stride, h, w,
blend_8_b12);
}
//////////////////////////////////////////////////////////////////////////////
// Dispatch
//////////////////////////////////////////////////////////////////////////////
void aom_highbd_blend_a64_mask_sse4_1(uint8_t *dst_8, uint32_t dst_stride,
const uint8_t *src0_8,
uint32_t src0_stride,
const uint8_t *src1_8,
uint32_t src1_stride, const uint8_t *mask,
uint32_t mask_stride, int h, int w,
int suby, int subx, int bd) {
typedef void (*blend_fn)(
uint16_t * dst, uint32_t dst_stride, const uint16_t *src0,
uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride, int h, int w);
// Dimensions are: bd_index X width_index X subx X suby
static const blend_fn blend[2][2][2][2] = {
{ // bd == 8 or 10
{ // w % 8 == 0
{ blend_a64_mask_b10_w8n_sse4_1, blend_a64_mask_b10_sy_w8n_sse4_1 },
{ blend_a64_mask_b10_sx_w8n_sse4_1,
blend_a64_mask_b10_sx_sy_w8n_sse4_1 } },
{ // w == 4
{ blend_a64_mask_b10_w4_sse4_1, blend_a64_mask_b10_sy_w4_sse4_1 },
{ blend_a64_mask_b10_sx_w4_sse4_1,
blend_a64_mask_b10_sx_sy_w4_sse4_1 } } },
{ // bd == 12
{ // w % 8 == 0
{ blend_a64_mask_b12_w8n_sse4_1, blend_a64_mask_b12_sy_w8n_sse4_1 },
{ blend_a64_mask_b12_sx_w8n_sse4_1,
blend_a64_mask_b12_sx_sy_w8n_sse4_1 } },
{ // w == 4
{ blend_a64_mask_b12_w4_sse4_1, blend_a64_mask_b12_sy_w4_sse4_1 },
{ blend_a64_mask_b12_sx_w4_sse4_1,
blend_a64_mask_b12_sx_sy_w4_sse4_1 } } }
};
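// Bit depths 8 and 10 share the b10 kernels (their 16-bit products are
// still exact), while 12-bit input needs the madd-based b12 kernels; hence
// the bd == 12 index. (w >> 2) & 1 selects the w == 4 row for width 4 and
// the w % 8 == 0 row for any larger power-of-two width.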
assert(IMPLIES(src0_8 == dst_8, src0_stride == dst_stride));
assert(IMPLIES(src1_8 == dst_8, src1_stride == dst_stride));
assert(h >= 1);
assert(w >= 1);
assert(IS_POWER_OF_TWO(h));
assert(IS_POWER_OF_TWO(w));
assert(bd == 8 || bd == 10 || bd == 12);
if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2)
aom_highbd_blend_a64_mask_c(dst_8, dst_stride, src0_8, src0_stride, src1_8,
src1_stride, mask, mask_stride, h, w, suby,
subx, bd);
} else {
uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8);
const uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8);
const uint16_t *const src1 = CONVERT_TO_SHORTPTR(src1_8);
blend[bd == 12][(w >> 2) & 1][subx != 0][suby != 0](
dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
mask_stride, h, w);
}
}
#endif // CONFIG_AOM_HIGHBITDEPTH


@@ -1,285 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <smmintrin.h> // SSE4.1
#include <assert.h>
#include "aom/aom_integer.h"
#include "aom_ports/mem.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/blend.h"
#include "aom_dsp/x86/synonyms.h"
#include "aom_dsp/x86/blend_sse4.h"
#include "./aom_dsp_rtcd.h"
//////////////////////////////////////////////////////////////////////////////
// Implementation - No sub-sampling
//////////////////////////////////////////////////////////////////////////////
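// A vertical mask holds one alpha value per row, so each kernel broadcasts
// *mask across all lanes with _mm_set1_epi16 and advances the mask pointer
// by one byte per output row.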
static void blend_a64_vmask_w4_sse4_1(uint8_t *dst, uint32_t dst_stride,
const uint8_t *src0, uint32_t src0_stride,
const uint8_t *src1, uint32_t src1_stride,
const uint8_t *mask, int h, int w) {
const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
(void)w;
do {
const __m128i v_m0_w = _mm_set1_epi16(*mask);
const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w);
const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
xx_storel_32(dst, v_res_b);
dst += dst_stride;
src0 += src0_stride;
src1 += src1_stride;
mask += 1;
} while (--h);
}
static void blend_a64_vmask_w8_sse4_1(uint8_t *dst, uint32_t dst_stride,
const uint8_t *src0, uint32_t src0_stride,
const uint8_t *src1, uint32_t src1_stride,
const uint8_t *mask, int h, int w) {
const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
(void)w;
do {
const __m128i v_m0_w = _mm_set1_epi16(*mask);
const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w);
const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
xx_storel_64(dst, v_res_b);
dst += dst_stride;
src0 += src0_stride;
src1 += src1_stride;
mask += 1;
} while (--h);
}
static void blend_a64_vmask_w16n_sse4_1(uint8_t *dst, uint32_t dst_stride,
const uint8_t *src0,
uint32_t src0_stride,
const uint8_t *src1,
uint32_t src1_stride,
const uint8_t *mask, int h, int w) {
const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
do {
int c;
const __m128i v_m0_w = _mm_set1_epi16(*mask);
const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
for (c = 0; c < w; c += 16) {
const __m128i v_resl_w = blend_8(src0 + c, src1 + c, v_m0_w, v_m1_w);
const __m128i v_resh_w =
blend_8(src0 + c + 8, src1 + c + 8, v_m0_w, v_m1_w);
const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);
xx_storeu_128(dst + c, v_res_b);
}
dst += dst_stride;
src0 += src0_stride;
src1 += src1_stride;
mask += 1;
} while (--h);
}
//////////////////////////////////////////////////////////////////////////////
// Dispatch
//////////////////////////////////////////////////////////////////////////////
void aom_blend_a64_vmask_sse4_1(uint8_t *dst, uint32_t dst_stride,
const uint8_t *src0, uint32_t src0_stride,
const uint8_t *src1, uint32_t src1_stride,
const uint8_t *mask, int h, int w) {
typedef void (*blend_fn)(uint8_t * dst, uint32_t dst_stride,
const uint8_t *src0, uint32_t src0_stride,
const uint8_t *src1, uint32_t src1_stride,
const uint8_t *mask, int h, int w);
// Dimension: width_index
static const blend_fn blend[9] = {
blend_a64_vmask_w16n_sse4_1, // w % 16 == 0
aom_blend_a64_vmask_c, // w == 1
aom_blend_a64_vmask_c, // w == 2
NULL, // INVALID
blend_a64_vmask_w4_sse4_1, // w == 4
NULL, // INVALID
NULL, // INVALID
NULL, // INVALID
blend_a64_vmask_w8_sse4_1, // w == 8
};
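// Since w is asserted to be a power of two, w & 0xf is 1, 2, 4 or 8 for
// small widths and 0 for any multiple of 16; the NULL entries at indices 3,
// 5, 6 and 7 are unreachable.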
assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
assert(h >= 1);
assert(w >= 1);
assert(IS_POWER_OF_TWO(h));
assert(IS_POWER_OF_TWO(w));
blend[w & 0xf](dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, h,
w);
}
#if CONFIG_AOM_HIGHBITDEPTH
//////////////////////////////////////////////////////////////////////////////
// Implementation - No sub-sampling
//////////////////////////////////////////////////////////////////////////////
static INLINE void blend_a64_vmask_bn_w4_sse4_1(
uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
const uint8_t *mask, int h, blend_unit_fn blend) {
const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
do {
const __m128i v_m0_w = _mm_set1_epi16(*mask);
const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
xx_storel_64(dst, v_res_w);
dst += dst_stride;
src0 += src0_stride;
src1 += src1_stride;
mask += 1;
} while (--h);
}
static void blend_a64_vmask_b10_w4_sse4_1(uint16_t *dst, uint32_t dst_stride,
const uint16_t *src0,
uint32_t src0_stride,
const uint16_t *src1,
uint32_t src1_stride,
const uint8_t *mask, int h, int w) {
(void)w;
blend_a64_vmask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
src1_stride, mask, h, blend_4_b10);
}
static void blend_a64_vmask_b12_w4_sse4_1(uint16_t *dst, uint32_t dst_stride,
const uint16_t *src0,
uint32_t src0_stride,
const uint16_t *src1,
uint32_t src1_stride,
const uint8_t *mask, int h, int w) {
(void)w;
blend_a64_vmask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
src1_stride, mask, h, blend_4_b12);
}
static INLINE void blend_a64_vmask_bn_w8n_sse4_1(
uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
const uint8_t *mask, int h, int w, blend_unit_fn blend) {
const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
do {
int c;
const __m128i v_m0_w = _mm_set1_epi16(*mask);
const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
for (c = 0; c < w; c += 8) {
const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
xx_storeu_128(dst + c, v_res_w);
}
dst += dst_stride;
src0 += src0_stride;
src1 += src1_stride;
mask += 1;
} while (--h);
}
static void blend_a64_vmask_b10_w8n_sse4_1(uint16_t *dst, uint32_t dst_stride,
const uint16_t *src0,
uint32_t src0_stride,
const uint16_t *src1,
uint32_t src1_stride,
const uint8_t *mask, int h, int w) {
blend_a64_vmask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
src1_stride, mask, h, w, blend_8_b10);
}
static void blend_a64_vmask_b12_w8n_sse4_1(uint16_t *dst, uint32_t dst_stride,
const uint16_t *src0,
uint32_t src0_stride,
const uint16_t *src1,
uint32_t src1_stride,
const uint8_t *mask, int h, int w) {
blend_a64_vmask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
src1_stride, mask, h, w, blend_8_b12);
}
//////////////////////////////////////////////////////////////////////////////
// Dispatch
//////////////////////////////////////////////////////////////////////////////
void aom_highbd_blend_a64_vmask_sse4_1(
uint8_t *dst_8, uint32_t dst_stride, const uint8_t *src0_8,
uint32_t src0_stride, const uint8_t *src1_8, uint32_t src1_stride,
const uint8_t *mask, int h, int w, int bd) {
typedef void (*blend_fn)(uint16_t * dst, uint32_t dst_stride,
const uint16_t *src0, uint32_t src0_stride,
const uint16_t *src1, uint32_t src1_stride,
const uint8_t *mask, int h, int w);
// Dimensions are: bd_index X width_index
static const blend_fn blend[2][2] = {
{
// bd == 8 or 10
blend_a64_vmask_b10_w8n_sse4_1, // w % 8 == 0
blend_a64_vmask_b10_w4_sse4_1, // w == 4
},
{
// bd == 12
blend_a64_vmask_b12_w8n_sse4_1, // w % 8 == 0
blend_a64_vmask_b12_w4_sse4_1, // w == 4
}
};
assert(IMPLIES(src0_8 == dst_8, src0_stride == dst_stride));
assert(IMPLIES(src1_8 == dst_8, src1_stride == dst_stride));
assert(h >= 1);
assert(w >= 1);
assert(IS_POWER_OF_TWO(h));
assert(IS_POWER_OF_TWO(w));
assert(bd == 8 || bd == 10 || bd == 12);
if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2)
aom_highbd_blend_a64_vmask_c(dst_8, dst_stride, src0_8, src0_stride, src1_8,
src1_stride, mask, h, w, bd);
} else {
uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8);
const uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8);
const uint16_t *const src1 = CONVERT_TO_SHORTPTR(src1_8);
blend[bd == 12][(w >> 2) & 1](dst, dst_stride, src0, src0_stride, src1,
src1_stride, mask, h, w);
}
}
#endif // CONFIG_AOM_HIGHBITDEPTH


@@ -1,146 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifndef AOM_DSP_X86_BLEND_SSE4_H_
#define AOM_DSP_X86_BLEND_SSE4_H_
#include "aom_dsp/blend.h"
#include "aom_dsp/x86/synonyms.h"
//////////////////////////////////////////////////////////////////////////////
// Common kernels
//////////////////////////////////////////////////////////////////////////////
static INLINE __m128i blend_4(const uint8_t *src0, const uint8_t *src1,
const __m128i v_m0_w, const __m128i v_m1_w) {
const __m128i v_s0_b = xx_loadl_32(src0);
const __m128i v_s1_b = xx_loadl_32(src1);
const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b);
const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b);
const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS);
return v_res_w;
}
static INLINE __m128i blend_8(const uint8_t *src0, const uint8_t *src1,
const __m128i v_m0_w, const __m128i v_m1_w) {
const __m128i v_s0_b = xx_loadl_64(src0);
const __m128i v_s1_b = xx_loadl_64(src1);
const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b);
const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b);
const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS);
return v_res_w;
}
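// blend_4 and blend_8 widen 8-bit samples to 16 bits and multiply with
// _mm_mullo_epi16 directly; because the two mask weights sum to 64,
// m0 * s0 + m1 * s1 is at most 64 * 255 = 16320, so the sum cannot overflow
// a 16-bit lane before the rounding shift by AOM_BLEND_A64_ROUND_BITS (6).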
#if CONFIG_AOM_HIGHBITDEPTH
typedef __m128i (*blend_unit_fn)(const uint16_t *src0, const uint16_t *src1,
const __m128i v_m0_w, const __m128i v_m1_w);
static INLINE __m128i blend_4_b10(const uint16_t *src0, const uint16_t *src1,
const __m128i v_m0_w, const __m128i v_m1_w) {
const __m128i v_s0_w = xx_loadl_64(src0);
const __m128i v_s1_w = xx_loadl_64(src1);
const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS);
return v_res_w;
}
static INLINE __m128i blend_8_b10(const uint16_t *src0, const uint16_t *src1,
const __m128i v_m0_w, const __m128i v_m1_w) {
const __m128i v_s0_w = xx_loadu_128(src0);
const __m128i v_s1_w = xx_loadu_128(src1);
const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS);
return v_res_w;
}
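// For up to 10-bit samples the mullo path above is still exact
// (64 * 1023 = 65472 fits an unsigned 16-bit lane), which is why bit depths
// 8 and 10 share these kernels. At 12 bits the products overflow 16 bits,
// so the b12 kernels below accumulate in 32 bits via _mm_madd_epi16.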
static INLINE __m128i blend_4_b12(const uint16_t *src0, const uint16_t *src1,
const __m128i v_m0_w, const __m128i v_m1_w) {
const __m128i v_s0_w = xx_loadl_64(src0);
const __m128i v_s1_w = xx_loadl_64(src1);
// Interleave
const __m128i v_m01_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w);
const __m128i v_s01_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w);
// Multiply-Add
const __m128i v_sum_d = _mm_madd_epi16(v_s01_w, v_m01_w);
// Scale
const __m128i v_ssum_d =
_mm_srli_epi32(v_sum_d, AOM_BLEND_A64_ROUND_BITS - 1);
// Pack
const __m128i v_pssum_d = _mm_packs_epi32(v_ssum_d, v_ssum_d);
// Round
const __m128i v_res_w = xx_round_epu16(v_pssum_d);
return v_res_w;
}
static INLINE __m128i blend_8_b12(const uint16_t *src0, const uint16_t *src1,
const __m128i v_m0_w, const __m128i v_m1_w) {
const __m128i v_s0_w = xx_loadu_128(src0);
const __m128i v_s1_w = xx_loadu_128(src1);
// Interleave
const __m128i v_m01l_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w);
const __m128i v_m01h_w = _mm_unpackhi_epi16(v_m0_w, v_m1_w);
const __m128i v_s01l_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w);
const __m128i v_s01h_w = _mm_unpackhi_epi16(v_s0_w, v_s1_w);
// Multiply-Add
const __m128i v_suml_d = _mm_madd_epi16(v_s01l_w, v_m01l_w);
const __m128i v_sumh_d = _mm_madd_epi16(v_s01h_w, v_m01h_w);
// Scale
const __m128i v_ssuml_d =
_mm_srli_epi32(v_suml_d, AOM_BLEND_A64_ROUND_BITS - 1);
const __m128i v_ssumh_d =
_mm_srli_epi32(v_sumh_d, AOM_BLEND_A64_ROUND_BITS - 1);
// Pack
const __m128i v_pssum_d = _mm_packs_epi32(v_ssuml_d, v_ssumh_d);
// Round
const __m128i v_res_w = xx_round_epu16(v_pssum_d);
return v_res_w;
}
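// In the b12 kernels the 32-bit madd sums are first shifted right by
// AOM_BLEND_A64_ROUND_BITS - 1 (5) so they fit signed 16-bit lanes for
// _mm_packs_epi32, and xx_round_epu16 (assuming the usual avg-with-zero
// helper from synonyms.h) supplies the final (x + 1) >> 1; together the
// pair is equivalent to the full rounded shift (sum + 32) >> 6.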
#endif // CONFIG_AOM_HIGHBITDEPTH
#endif // AOM_DSP_X86_BLEND_SSE4_H_


@@ -1,862 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <emmintrin.h> // SSE2
#include "aom_dsp/fwd_txfm.h"
#include "aom_dsp/txfm_common.h"
#include "aom_dsp/x86/txfm_common_sse2.h"
// Apply a 32-element forward DCT to 8 columns. This does not do any
// transposition of its output - the caller is expected to do that.
// The input buffers are the top and bottom halves of an 8x32 block.
void fdct32_8col(__m128i *in0, __m128i *in1) {
// Constants
// Apart from k__cospi_p16_p16, where all eight 16-bit lanes hold the same
// value, each constant packs a pair of cosine values repeated four times,
// by constructing the 32-bit constant corresponding to that pair.
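// Downstream, pairs of inputs are interleaved with _mm_unpacklo/hi_epi16
// and multiplied by these constants via _mm_madd_epi16, so each 32-bit lane
// evaluates a * c0 + b * c1 (the butterfly rotation of each DCT stage)
// before dct_const_round_shift brings the result back to 16 bits.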
const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
const __m128i k__cospi_p16_m16 = pair_set_epi16(+cospi_16_64, -cospi_16_64);
const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
const __m128i k__cospi_p24_p08 = pair_set_epi16(+cospi_24_64, cospi_8_64);
const __m128i k__cospi_p12_p20 = pair_set_epi16(+cospi_12_64, cospi_20_64);
const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
const __m128i k__cospi_p28_p04 = pair_set_epi16(+cospi_28_64, cospi_4_64);
const __m128i k__cospi_m28_m04 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
const __m128i k__cospi_m12_m20 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
const __m128i k__cospi_p30_p02 = pair_set_epi16(+cospi_30_64, cospi_2_64);
const __m128i k__cospi_p14_p18 = pair_set_epi16(+cospi_14_64, cospi_18_64);
const __m128i k__cospi_p22_p10 = pair_set_epi16(+cospi_22_64, cospi_10_64);
const __m128i k__cospi_p06_p26 = pair_set_epi16(+cospi_6_64, cospi_26_64);
const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
const __m128i k__cospi_p31_p01 = pair_set_epi16(+cospi_31_64, cospi_1_64);
const __m128i k__cospi_p15_p17 = pair_set_epi16(+cospi_15_64, cospi_17_64);
const __m128i k__cospi_p23_p09 = pair_set_epi16(+cospi_23_64, cospi_9_64);
const __m128i k__cospi_p07_p25 = pair_set_epi16(+cospi_7_64, cospi_25_64);
const __m128i k__cospi_m25_p07 = pair_set_epi16(-cospi_25_64, cospi_7_64);
const __m128i k__cospi_m09_p23 = pair_set_epi16(-cospi_9_64, cospi_23_64);
const __m128i k__cospi_m17_p15 = pair_set_epi16(-cospi_17_64, cospi_15_64);
const __m128i k__cospi_m01_p31 = pair_set_epi16(-cospi_1_64, cospi_31_64);
const __m128i k__cospi_p27_p05 = pair_set_epi16(+cospi_27_64, cospi_5_64);
const __m128i k__cospi_p11_p21 = pair_set_epi16(+cospi_11_64, cospi_21_64);
const __m128i k__cospi_p19_p13 = pair_set_epi16(+cospi_19_64, cospi_13_64);
const __m128i k__cospi_p03_p29 = pair_set_epi16(+cospi_3_64, cospi_29_64);
const __m128i k__cospi_m29_p03 = pair_set_epi16(-cospi_29_64, cospi_3_64);
const __m128i k__cospi_m13_p19 = pair_set_epi16(-cospi_13_64, cospi_19_64);
const __m128i k__cospi_m21_p11 = pair_set_epi16(-cospi_21_64, cospi_11_64);
const __m128i k__cospi_m05_p27 = pair_set_epi16(-cospi_5_64, cospi_27_64);
const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
__m128i step1[32];
__m128i step2[32];
__m128i step3[32];
__m128i out[32];
// Stage 1
{
const __m128i *ina = in0;
const __m128i *inb = in1 + 15;
__m128i *step1a = &step1[0];
__m128i *step1b = &step1[31];
const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + 1));
const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + 2));
const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + 3));
const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - 3));
const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - 2));
const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - 1));
const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
step1a[0] = _mm_add_epi16(ina0, inb0);
step1a[1] = _mm_add_epi16(ina1, inb1);
step1a[2] = _mm_add_epi16(ina2, inb2);
step1a[3] = _mm_add_epi16(ina3, inb3);
step1b[-3] = _mm_sub_epi16(ina3, inb3);
step1b[-2] = _mm_sub_epi16(ina2, inb2);
step1b[-1] = _mm_sub_epi16(ina1, inb1);
step1b[-0] = _mm_sub_epi16(ina0, inb0);
}
{
const __m128i *ina = in0 + 4;
const __m128i *inb = in1 + 11;
__m128i *step1a = &step1[4];
__m128i *step1b = &step1[27];
const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + 1));
const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + 2));
const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + 3));
const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - 3));
const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - 2));
const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - 1));
const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
step1a[0] = _mm_add_epi16(ina0, inb0);
step1a[1] = _mm_add_epi16(ina1, inb1);
step1a[2] = _mm_add_epi16(ina2, inb2);
step1a[3] = _mm_add_epi16(ina3, inb3);
step1b[-3] = _mm_sub_epi16(ina3, inb3);
step1b[-2] = _mm_sub_epi16(ina2, inb2);
step1b[-1] = _mm_sub_epi16(ina1, inb1);
step1b[-0] = _mm_sub_epi16(ina0, inb0);
}
{
const __m128i *ina = in0 + 8;
const __m128i *inb = in1 + 7;
__m128i *step1a = &step1[8];
__m128i *step1b = &step1[23];
const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + 1));
const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + 2));
const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + 3));
const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - 3));
const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - 2));
const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - 1));
const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
step1a[0] = _mm_add_epi16(ina0, inb0);
step1a[1] = _mm_add_epi16(ina1, inb1);
step1a[2] = _mm_add_epi16(ina2, inb2);
step1a[3] = _mm_add_epi16(ina3, inb3);
step1b[-3] = _mm_sub_epi16(ina3, inb3);
step1b[-2] = _mm_sub_epi16(ina2, inb2);
step1b[-1] = _mm_sub_epi16(ina1, inb1);
step1b[-0] = _mm_sub_epi16(ina0, inb0);
}
{
const __m128i *ina = in0 + 12;
const __m128i *inb = in1 + 3;
__m128i *step1a = &step1[12];
__m128i *step1b = &step1[19];
const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + 1));
const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + 2));
const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + 3));
const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - 3));
const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - 2));
const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - 1));
const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
step1a[0] = _mm_add_epi16(ina0, inb0);
step1a[1] = _mm_add_epi16(ina1, inb1);
step1a[2] = _mm_add_epi16(ina2, inb2);
step1a[3] = _mm_add_epi16(ina3, inb3);
step1b[-3] = _mm_sub_epi16(ina3, inb3);
step1b[-2] = _mm_sub_epi16(ina2, inb2);
step1b[-1] = _mm_sub_epi16(ina1, inb1);
step1b[-0] = _mm_sub_epi16(ina0, inb0);
}
// Stage 2
{
step2[0] = _mm_add_epi16(step1[0], step1[15]);
step2[1] = _mm_add_epi16(step1[1], step1[14]);
step2[2] = _mm_add_epi16(step1[2], step1[13]);
step2[3] = _mm_add_epi16(step1[3], step1[12]);
step2[4] = _mm_add_epi16(step1[4], step1[11]);
step2[5] = _mm_add_epi16(step1[5], step1[10]);
step2[6] = _mm_add_epi16(step1[6], step1[9]);
step2[7] = _mm_add_epi16(step1[7], step1[8]);
step2[8] = _mm_sub_epi16(step1[7], step1[8]);
step2[9] = _mm_sub_epi16(step1[6], step1[9]);
step2[10] = _mm_sub_epi16(step1[5], step1[10]);
step2[11] = _mm_sub_epi16(step1[4], step1[11]);
step2[12] = _mm_sub_epi16(step1[3], step1[12]);
step2[13] = _mm_sub_epi16(step1[2], step1[13]);
step2[14] = _mm_sub_epi16(step1[1], step1[14]);
step2[15] = _mm_sub_epi16(step1[0], step1[15]);
}
{
const __m128i s2_20_0 = _mm_unpacklo_epi16(step1[27], step1[20]);
const __m128i s2_20_1 = _mm_unpackhi_epi16(step1[27], step1[20]);
const __m128i s2_21_0 = _mm_unpacklo_epi16(step1[26], step1[21]);
const __m128i s2_21_1 = _mm_unpackhi_epi16(step1[26], step1[21]);
const __m128i s2_22_0 = _mm_unpacklo_epi16(step1[25], step1[22]);
const __m128i s2_22_1 = _mm_unpackhi_epi16(step1[25], step1[22]);
const __m128i s2_23_0 = _mm_unpacklo_epi16(step1[24], step1[23]);
const __m128i s2_23_1 = _mm_unpackhi_epi16(step1[24], step1[23]);
const __m128i s2_20_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_m16);
const __m128i s2_20_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_m16);
const __m128i s2_21_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_m16);
const __m128i s2_21_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_m16);
const __m128i s2_22_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_m16);
const __m128i s2_22_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_m16);
const __m128i s2_23_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_m16);
const __m128i s2_23_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_m16);
const __m128i s2_24_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_p16);
const __m128i s2_24_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_p16);
const __m128i s2_25_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_p16);
const __m128i s2_25_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_p16);
const __m128i s2_26_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_p16);
const __m128i s2_26_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_p16);
const __m128i s2_27_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_p16);
const __m128i s2_27_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_p16);
// dct_const_round_shift
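// DCT_CONST_ROUNDING is 1 << (DCT_CONST_BITS - 1), i.e. 8192 with
// DCT_CONST_BITS == 14 as defined in txfm_common.h, so each add + srai pair
// below rounds the fixed-point products: (x + 8192) >> 14 per 32-bit lane.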
const __m128i s2_20_4 = _mm_add_epi32(s2_20_2, k__DCT_CONST_ROUNDING);
const __m128i s2_20_5 = _mm_add_epi32(s2_20_3, k__DCT_CONST_ROUNDING);
const __m128i s2_21_4 = _mm_add_epi32(s2_21_2, k__DCT_CONST_ROUNDING);
const __m128i s2_21_5 = _mm_add_epi32(s2_21_3, k__DCT_CONST_ROUNDING);
const __m128i s2_22_4 = _mm_add_epi32(s2_22_2, k__DCT_CONST_ROUNDING);
const __m128i s2_22_5 = _mm_add_epi32(s2_22_3, k__DCT_CONST_ROUNDING);
const __m128i s2_23_4 = _mm_add_epi32(s2_23_2, k__DCT_CONST_ROUNDING);
const __m128i s2_23_5 = _mm_add_epi32(s2_23_3, k__DCT_CONST_ROUNDING);
const __m128i s2_24_4 = _mm_add_epi32(s2_24_2, k__DCT_CONST_ROUNDING);
const __m128i s2_24_5 = _mm_add_epi32(s2_24_3, k__DCT_CONST_ROUNDING);
const __m128i s2_25_4 = _mm_add_epi32(s2_25_2, k__DCT_CONST_ROUNDING);
const __m128i s2_25_5 = _mm_add_epi32(s2_25_3, k__DCT_CONST_ROUNDING);
const __m128i s2_26_4 = _mm_add_epi32(s2_26_2, k__DCT_CONST_ROUNDING);
const __m128i s2_26_5 = _mm_add_epi32(s2_26_3, k__DCT_CONST_ROUNDING);
const __m128i s2_27_4 = _mm_add_epi32(s2_27_2, k__DCT_CONST_ROUNDING);
const __m128i s2_27_5 = _mm_add_epi32(s2_27_3, k__DCT_CONST_ROUNDING);
const __m128i s2_20_6 = _mm_srai_epi32(s2_20_4, DCT_CONST_BITS);
const __m128i s2_20_7 = _mm_srai_epi32(s2_20_5, DCT_CONST_BITS);
const __m128i s2_21_6 = _mm_srai_epi32(s2_21_4, DCT_CONST_BITS);
const __m128i s2_21_7 = _mm_srai_epi32(s2_21_5, DCT_CONST_BITS);
const __m128i s2_22_6 = _mm_srai_epi32(s2_22_4, DCT_CONST_BITS);
const __m128i s2_22_7 = _mm_srai_epi32(s2_22_5, DCT_CONST_BITS);
const __m128i s2_23_6 = _mm_srai_epi32(s2_23_4, DCT_CONST_BITS);
const __m128i s2_23_7 = _mm_srai_epi32(s2_23_5, DCT_CONST_BITS);
const __m128i s2_24_6 = _mm_srai_epi32(s2_24_4, DCT_CONST_BITS);
const __m128i s2_24_7 = _mm_srai_epi32(s2_24_5, DCT_CONST_BITS);
const __m128i s2_25_6 = _mm_srai_epi32(s2_25_4, DCT_CONST_BITS);
const __m128i s2_25_7 = _mm_srai_epi32(s2_25_5, DCT_CONST_BITS);
const __m128i s2_26_6 = _mm_srai_epi32(s2_26_4, DCT_CONST_BITS);
const __m128i s2_26_7 = _mm_srai_epi32(s2_26_5, DCT_CONST_BITS);
const __m128i s2_27_6 = _mm_srai_epi32(s2_27_4, DCT_CONST_BITS);
const __m128i s2_27_7 = _mm_srai_epi32(s2_27_5, DCT_CONST_BITS);
// Combine
step2[20] = _mm_packs_epi32(s2_20_6, s2_20_7);
step2[21] = _mm_packs_epi32(s2_21_6, s2_21_7);
step2[22] = _mm_packs_epi32(s2_22_6, s2_22_7);
step2[23] = _mm_packs_epi32(s2_23_6, s2_23_7);
step2[24] = _mm_packs_epi32(s2_24_6, s2_24_7);
step2[25] = _mm_packs_epi32(s2_25_6, s2_25_7);
step2[26] = _mm_packs_epi32(s2_26_6, s2_26_7);
step2[27] = _mm_packs_epi32(s2_27_6, s2_27_7);
}
// Stage 3
{
step3[0] = _mm_add_epi16(step2[(8 - 1)], step2[0]);
step3[1] = _mm_add_epi16(step2[(8 - 2)], step2[1]);
step3[2] = _mm_add_epi16(step2[(8 - 3)], step2[2]);
step3[3] = _mm_add_epi16(step2[(8 - 4)], step2[3]);
step3[4] = _mm_sub_epi16(step2[(8 - 5)], step2[4]);
step3[5] = _mm_sub_epi16(step2[(8 - 6)], step2[5]);
step3[6] = _mm_sub_epi16(step2[(8 - 7)], step2[6]);
step3[7] = _mm_sub_epi16(step2[(8 - 8)], step2[7]);
}
{
const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]);
const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]);
const __m128i s3_11_0 = _mm_unpacklo_epi16(step2[12], step2[11]);
const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]);
const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16);
const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16);
const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16);
const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16);
const __m128i s3_12_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_p16);
const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_p16);
const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16);
const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16);
// dct_const_round_shift
const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING);
const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING);
const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING);
const __m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING);
const __m128i s3_12_4 = _mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING);
const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING);
const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING);
const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING);
const __m128i s3_10_6 = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS);
const __m128i s3_10_7 = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS);
const __m128i s3_11_6 = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS);
const __m128i s3_11_7 = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS);
const __m128i s3_12_6 = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS);
const __m128i s3_12_7 = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS);
const __m128i s3_13_6 = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS);
const __m128i s3_13_7 = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS);
// Combine
step3[10] = _mm_packs_epi32(s3_10_6, s3_10_7);
step3[11] = _mm_packs_epi32(s3_11_6, s3_11_7);
step3[12] = _mm_packs_epi32(s3_12_6, s3_12_7);
step3[13] = _mm_packs_epi32(s3_13_6, s3_13_7);
}
{
step3[16] = _mm_add_epi16(step2[23], step1[16]);
step3[17] = _mm_add_epi16(step2[22], step1[17]);
step3[18] = _mm_add_epi16(step2[21], step1[18]);
step3[19] = _mm_add_epi16(step2[20], step1[19]);
step3[20] = _mm_sub_epi16(step1[19], step2[20]);
step3[21] = _mm_sub_epi16(step1[18], step2[21]);
step3[22] = _mm_sub_epi16(step1[17], step2[22]);
step3[23] = _mm_sub_epi16(step1[16], step2[23]);
step3[24] = _mm_sub_epi16(step1[31], step2[24]);
step3[25] = _mm_sub_epi16(step1[30], step2[25]);
step3[26] = _mm_sub_epi16(step1[29], step2[26]);
step3[27] = _mm_sub_epi16(step1[28], step2[27]);
step3[28] = _mm_add_epi16(step2[27], step1[28]);
step3[29] = _mm_add_epi16(step2[26], step1[29]);
step3[30] = _mm_add_epi16(step2[25], step1[30]);
step3[31] = _mm_add_epi16(step2[24], step1[31]);
}
// Stage 4
{
step1[0] = _mm_add_epi16(step3[3], step3[0]);
step1[1] = _mm_add_epi16(step3[2], step3[1]);
step1[2] = _mm_sub_epi16(step3[1], step3[2]);
step1[3] = _mm_sub_epi16(step3[0], step3[3]);
step1[8] = _mm_add_epi16(step3[11], step2[8]);
step1[9] = _mm_add_epi16(step3[10], step2[9]);
step1[10] = _mm_sub_epi16(step2[9], step3[10]);
step1[11] = _mm_sub_epi16(step2[8], step3[11]);
step1[12] = _mm_sub_epi16(step2[15], step3[12]);
step1[13] = _mm_sub_epi16(step2[14], step3[13]);
step1[14] = _mm_add_epi16(step3[13], step2[14]);
step1[15] = _mm_add_epi16(step3[12], step2[15]);
}
{
const __m128i s1_05_0 = _mm_unpacklo_epi16(step3[6], step3[5]);
const __m128i s1_05_1 = _mm_unpackhi_epi16(step3[6], step3[5]);
const __m128i s1_05_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_m16);
const __m128i s1_05_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_m16);
const __m128i s1_06_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_p16);
const __m128i s1_06_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_p16);
// dct_const_round_shift
const __m128i s1_05_4 = _mm_add_epi32(s1_05_2, k__DCT_CONST_ROUNDING);
const __m128i s1_05_5 = _mm_add_epi32(s1_05_3, k__DCT_CONST_ROUNDING);
const __m128i s1_06_4 = _mm_add_epi32(s1_06_2, k__DCT_CONST_ROUNDING);
const __m128i s1_06_5 = _mm_add_epi32(s1_06_3, k__DCT_CONST_ROUNDING);
const __m128i s1_05_6 = _mm_srai_epi32(s1_05_4, DCT_CONST_BITS);
const __m128i s1_05_7 = _mm_srai_epi32(s1_05_5, DCT_CONST_BITS);
const __m128i s1_06_6 = _mm_srai_epi32(s1_06_4, DCT_CONST_BITS);
const __m128i s1_06_7 = _mm_srai_epi32(s1_06_5, DCT_CONST_BITS);
// Combine
step1[5] = _mm_packs_epi32(s1_05_6, s1_05_7);
step1[6] = _mm_packs_epi32(s1_06_6, s1_06_7);
}
{
const __m128i s1_18_0 = _mm_unpacklo_epi16(step3[18], step3[29]);
const __m128i s1_18_1 = _mm_unpackhi_epi16(step3[18], step3[29]);
const __m128i s1_19_0 = _mm_unpacklo_epi16(step3[19], step3[28]);
const __m128i s1_19_1 = _mm_unpackhi_epi16(step3[19], step3[28]);
const __m128i s1_20_0 = _mm_unpacklo_epi16(step3[20], step3[27]);
const __m128i s1_20_1 = _mm_unpackhi_epi16(step3[20], step3[27]);
const __m128i s1_21_0 = _mm_unpacklo_epi16(step3[21], step3[26]);
const __m128i s1_21_1 = _mm_unpackhi_epi16(step3[21], step3[26]);
const __m128i s1_18_2 = _mm_madd_epi16(s1_18_0, k__cospi_m08_p24);
const __m128i s1_18_3 = _mm_madd_epi16(s1_18_1, k__cospi_m08_p24);
const __m128i s1_19_2 = _mm_madd_epi16(s1_19_0, k__cospi_m08_p24);
const __m128i s1_19_3 = _mm_madd_epi16(s1_19_1, k__cospi_m08_p24);
const __m128i s1_20_2 = _mm_madd_epi16(s1_20_0, k__cospi_m24_m08);
const __m128i s1_20_3 = _mm_madd_epi16(s1_20_1, k__cospi_m24_m08);
const __m128i s1_21_2 = _mm_madd_epi16(s1_21_0, k__cospi_m24_m08);
const __m128i s1_21_3 = _mm_madd_epi16(s1_21_1, k__cospi_m24_m08);
const __m128i s1_26_2 = _mm_madd_epi16(s1_21_0, k__cospi_m08_p24);
const __m128i s1_26_3 = _mm_madd_epi16(s1_21_1, k__cospi_m08_p24);
const __m128i s1_27_2 = _mm_madd_epi16(s1_20_0, k__cospi_m08_p24);
const __m128i s1_27_3 = _mm_madd_epi16(s1_20_1, k__cospi_m08_p24);
const __m128i s1_28_2 = _mm_madd_epi16(s1_19_0, k__cospi_p24_p08);
const __m128i s1_28_3 = _mm_madd_epi16(s1_19_1, k__cospi_p24_p08);
const __m128i s1_29_2 = _mm_madd_epi16(s1_18_0, k__cospi_p24_p08);
const __m128i s1_29_3 = _mm_madd_epi16(s1_18_1, k__cospi_p24_p08);
// dct_const_round_shift
const __m128i s1_18_4 = _mm_add_epi32(s1_18_2, k__DCT_CONST_ROUNDING);
const __m128i s1_18_5 = _mm_add_epi32(s1_18_3, k__DCT_CONST_ROUNDING);
const __m128i s1_19_4 = _mm_add_epi32(s1_19_2, k__DCT_CONST_ROUNDING);
const __m128i s1_19_5 = _mm_add_epi32(s1_19_3, k__DCT_CONST_ROUNDING);
const __m128i s1_20_4 = _mm_add_epi32(s1_20_2, k__DCT_CONST_ROUNDING);
const __m128i s1_20_5 = _mm_add_epi32(s1_20_3, k__DCT_CONST_ROUNDING);
const __m128i s1_21_4 = _mm_add_epi32(s1_21_2, k__DCT_CONST_ROUNDING);
const __m128i s1_21_5 = _mm_add_epi32(s1_21_3, k__DCT_CONST_ROUNDING);
const __m128i s1_26_4 = _mm_add_epi32(s1_26_2, k__DCT_CONST_ROUNDING);
const __m128i s1_26_5 = _mm_add_epi32(s1_26_3, k__DCT_CONST_ROUNDING);
const __m128i s1_27_4 = _mm_add_epi32(s1_27_2, k__DCT_CONST_ROUNDING);
const __m128i s1_27_5 = _mm_add_epi32(s1_27_3, k__DCT_CONST_ROUNDING);
const __m128i s1_28_4 = _mm_add_epi32(s1_28_2, k__DCT_CONST_ROUNDING);
const __m128i s1_28_5 = _mm_add_epi32(s1_28_3, k__DCT_CONST_ROUNDING);
const __m128i s1_29_4 = _mm_add_epi32(s1_29_2, k__DCT_CONST_ROUNDING);
const __m128i s1_29_5 = _mm_add_epi32(s1_29_3, k__DCT_CONST_ROUNDING);
const __m128i s1_18_6 = _mm_srai_epi32(s1_18_4, DCT_CONST_BITS);
const __m128i s1_18_7 = _mm_srai_epi32(s1_18_5, DCT_CONST_BITS);
const __m128i s1_19_6 = _mm_srai_epi32(s1_19_4, DCT_CONST_BITS);
const __m128i s1_19_7 = _mm_srai_epi32(s1_19_5, DCT_CONST_BITS);
const __m128i s1_20_6 = _mm_srai_epi32(s1_20_4, DCT_CONST_BITS);
const __m128i s1_20_7 = _mm_srai_epi32(s1_20_5, DCT_CONST_BITS);
const __m128i s1_21_6 = _mm_srai_epi32(s1_21_4, DCT_CONST_BITS);
const __m128i s1_21_7 = _mm_srai_epi32(s1_21_5, DCT_CONST_BITS);
const __m128i s1_26_6 = _mm_srai_epi32(s1_26_4, DCT_CONST_BITS);
const __m128i s1_26_7 = _mm_srai_epi32(s1_26_5, DCT_CONST_BITS);
const __m128i s1_27_6 = _mm_srai_epi32(s1_27_4, DCT_CONST_BITS);
const __m128i s1_27_7 = _mm_srai_epi32(s1_27_5, DCT_CONST_BITS);
const __m128i s1_28_6 = _mm_srai_epi32(s1_28_4, DCT_CONST_BITS);
const __m128i s1_28_7 = _mm_srai_epi32(s1_28_5, DCT_CONST_BITS);
const __m128i s1_29_6 = _mm_srai_epi32(s1_29_4, DCT_CONST_BITS);
const __m128i s1_29_7 = _mm_srai_epi32(s1_29_5, DCT_CONST_BITS);
// Combine
step1[18] = _mm_packs_epi32(s1_18_6, s1_18_7);
step1[19] = _mm_packs_epi32(s1_19_6, s1_19_7);
step1[20] = _mm_packs_epi32(s1_20_6, s1_20_7);
step1[21] = _mm_packs_epi32(s1_21_6, s1_21_7);
step1[26] = _mm_packs_epi32(s1_26_6, s1_26_7);
step1[27] = _mm_packs_epi32(s1_27_6, s1_27_7);
step1[28] = _mm_packs_epi32(s1_28_6, s1_28_7);
step1[29] = _mm_packs_epi32(s1_29_6, s1_29_7);
}
// Stage 5
{
step2[4] = _mm_add_epi16(step1[5], step3[4]);
step2[5] = _mm_sub_epi16(step3[4], step1[5]);
step2[6] = _mm_sub_epi16(step3[7], step1[6]);
step2[7] = _mm_add_epi16(step1[6], step3[7]);
}
{
const __m128i out_00_0 = _mm_unpacklo_epi16(step1[0], step1[1]);
const __m128i out_00_1 = _mm_unpackhi_epi16(step1[0], step1[1]);
const __m128i out_08_0 = _mm_unpacklo_epi16(step1[2], step1[3]);
const __m128i out_08_1 = _mm_unpackhi_epi16(step1[2], step1[3]);
const __m128i out_00_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_p16);
const __m128i out_00_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_p16);
const __m128i out_16_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_m16);
const __m128i out_16_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_m16);
const __m128i out_08_2 = _mm_madd_epi16(out_08_0, k__cospi_p24_p08);
const __m128i out_08_3 = _mm_madd_epi16(out_08_1, k__cospi_p24_p08);
const __m128i out_24_2 = _mm_madd_epi16(out_08_0, k__cospi_m08_p24);
const __m128i out_24_3 = _mm_madd_epi16(out_08_1, k__cospi_m08_p24);
// dct_const_round_shift
const __m128i out_00_4 = _mm_add_epi32(out_00_2, k__DCT_CONST_ROUNDING);
const __m128i out_00_5 = _mm_add_epi32(out_00_3, k__DCT_CONST_ROUNDING);
const __m128i out_16_4 = _mm_add_epi32(out_16_2, k__DCT_CONST_ROUNDING);
const __m128i out_16_5 = _mm_add_epi32(out_16_3, k__DCT_CONST_ROUNDING);
const __m128i out_08_4 = _mm_add_epi32(out_08_2, k__DCT_CONST_ROUNDING);
const __m128i out_08_5 = _mm_add_epi32(out_08_3, k__DCT_CONST_ROUNDING);
const __m128i out_24_4 = _mm_add_epi32(out_24_2, k__DCT_CONST_ROUNDING);
const __m128i out_24_5 = _mm_add_epi32(out_24_3, k__DCT_CONST_ROUNDING);
const __m128i out_00_6 = _mm_srai_epi32(out_00_4, DCT_CONST_BITS);
const __m128i out_00_7 = _mm_srai_epi32(out_00_5, DCT_CONST_BITS);
const __m128i out_16_6 = _mm_srai_epi32(out_16_4, DCT_CONST_BITS);
const __m128i out_16_7 = _mm_srai_epi32(out_16_5, DCT_CONST_BITS);
const __m128i out_08_6 = _mm_srai_epi32(out_08_4, DCT_CONST_BITS);
const __m128i out_08_7 = _mm_srai_epi32(out_08_5, DCT_CONST_BITS);
const __m128i out_24_6 = _mm_srai_epi32(out_24_4, DCT_CONST_BITS);
const __m128i out_24_7 = _mm_srai_epi32(out_24_5, DCT_CONST_BITS);
// Combine
out[0] = _mm_packs_epi32(out_00_6, out_00_7);
out[16] = _mm_packs_epi32(out_16_6, out_16_7);
out[8] = _mm_packs_epi32(out_08_6, out_08_7);
out[24] = _mm_packs_epi32(out_24_6, out_24_7);
}
{
const __m128i s2_09_0 = _mm_unpacklo_epi16(step1[9], step1[14]);
const __m128i s2_09_1 = _mm_unpackhi_epi16(step1[9], step1[14]);
const __m128i s2_10_0 = _mm_unpacklo_epi16(step1[10], step1[13]);
const __m128i s2_10_1 = _mm_unpackhi_epi16(step1[10], step1[13]);
const __m128i s2_09_2 = _mm_madd_epi16(s2_09_0, k__cospi_m08_p24);
const __m128i s2_09_3 = _mm_madd_epi16(s2_09_1, k__cospi_m08_p24);
const __m128i s2_10_2 = _mm_madd_epi16(s2_10_0, k__cospi_m24_m08);
const __m128i s2_10_3 = _mm_madd_epi16(s2_10_1, k__cospi_m24_m08);
const __m128i s2_13_2 = _mm_madd_epi16(s2_10_0, k__cospi_m08_p24);
const __m128i s2_13_3 = _mm_madd_epi16(s2_10_1, k__cospi_m08_p24);
const __m128i s2_14_2 = _mm_madd_epi16(s2_09_0, k__cospi_p24_p08);
const __m128i s2_14_3 = _mm_madd_epi16(s2_09_1, k__cospi_p24_p08);
// dct_const_round_shift
const __m128i s2_09_4 = _mm_add_epi32(s2_09_2, k__DCT_CONST_ROUNDING);
const __m128i s2_09_5 = _mm_add_epi32(s2_09_3, k__DCT_CONST_ROUNDING);
const __m128i s2_10_4 = _mm_add_epi32(s2_10_2, k__DCT_CONST_ROUNDING);
const __m128i s2_10_5 = _mm_add_epi32(s2_10_3, k__DCT_CONST_ROUNDING);
const __m128i s2_13_4 = _mm_add_epi32(s2_13_2, k__DCT_CONST_ROUNDING);
const __m128i s2_13_5 = _mm_add_epi32(s2_13_3, k__DCT_CONST_ROUNDING);
const __m128i s2_14_4 = _mm_add_epi32(s2_14_2, k__DCT_CONST_ROUNDING);
const __m128i s2_14_5 = _mm_add_epi32(s2_14_3, k__DCT_CONST_ROUNDING);
const __m128i s2_09_6 = _mm_srai_epi32(s2_09_4, DCT_CONST_BITS);
const __m128i s2_09_7 = _mm_srai_epi32(s2_09_5, DCT_CONST_BITS);
const __m128i s2_10_6 = _mm_srai_epi32(s2_10_4, DCT_CONST_BITS);
const __m128i s2_10_7 = _mm_srai_epi32(s2_10_5, DCT_CONST_BITS);
const __m128i s2_13_6 = _mm_srai_epi32(s2_13_4, DCT_CONST_BITS);
const __m128i s2_13_7 = _mm_srai_epi32(s2_13_5, DCT_CONST_BITS);
const __m128i s2_14_6 = _mm_srai_epi32(s2_14_4, DCT_CONST_BITS);
const __m128i s2_14_7 = _mm_srai_epi32(s2_14_5, DCT_CONST_BITS);
// Combine
step2[9] = _mm_packs_epi32(s2_09_6, s2_09_7);
step2[10] = _mm_packs_epi32(s2_10_6, s2_10_7);
step2[13] = _mm_packs_epi32(s2_13_6, s2_13_7);
step2[14] = _mm_packs_epi32(s2_14_6, s2_14_7);
}
{
step2[16] = _mm_add_epi16(step1[19], step3[16]);
step2[17] = _mm_add_epi16(step1[18], step3[17]);
step2[18] = _mm_sub_epi16(step3[17], step1[18]);
step2[19] = _mm_sub_epi16(step3[16], step1[19]);
step2[20] = _mm_sub_epi16(step3[23], step1[20]);
step2[21] = _mm_sub_epi16(step3[22], step1[21]);
step2[22] = _mm_add_epi16(step1[21], step3[22]);
step2[23] = _mm_add_epi16(step1[20], step3[23]);
step2[24] = _mm_add_epi16(step1[27], step3[24]);
step2[25] = _mm_add_epi16(step1[26], step3[25]);
step2[26] = _mm_sub_epi16(step3[25], step1[26]);
step2[27] = _mm_sub_epi16(step3[24], step1[27]);
step2[28] = _mm_sub_epi16(step3[31], step1[28]);
step2[29] = _mm_sub_epi16(step3[30], step1[29]);
step2[30] = _mm_add_epi16(step1[29], step3[30]);
step2[31] = _mm_add_epi16(step1[28], step3[31]);
}
// Stage 6
{
const __m128i out_04_0 = _mm_unpacklo_epi16(step2[4], step2[7]);
const __m128i out_04_1 = _mm_unpackhi_epi16(step2[4], step2[7]);
const __m128i out_20_0 = _mm_unpacklo_epi16(step2[5], step2[6]);
const __m128i out_20_1 = _mm_unpackhi_epi16(step2[5], step2[6]);
const __m128i out_12_0 = _mm_unpacklo_epi16(step2[5], step2[6]);
const __m128i out_12_1 = _mm_unpackhi_epi16(step2[5], step2[6]);
const __m128i out_28_0 = _mm_unpacklo_epi16(step2[4], step2[7]);
const __m128i out_28_1 = _mm_unpackhi_epi16(step2[4], step2[7]);
const __m128i out_04_2 = _mm_madd_epi16(out_04_0, k__cospi_p28_p04);
const __m128i out_04_3 = _mm_madd_epi16(out_04_1, k__cospi_p28_p04);
const __m128i out_20_2 = _mm_madd_epi16(out_20_0, k__cospi_p12_p20);
const __m128i out_20_3 = _mm_madd_epi16(out_20_1, k__cospi_p12_p20);
const __m128i out_12_2 = _mm_madd_epi16(out_12_0, k__cospi_m20_p12);
const __m128i out_12_3 = _mm_madd_epi16(out_12_1, k__cospi_m20_p12);
const __m128i out_28_2 = _mm_madd_epi16(out_28_0, k__cospi_m04_p28);
const __m128i out_28_3 = _mm_madd_epi16(out_28_1, k__cospi_m04_p28);
// dct_const_round_shift
const __m128i out_04_4 = _mm_add_epi32(out_04_2, k__DCT_CONST_ROUNDING);
const __m128i out_04_5 = _mm_add_epi32(out_04_3, k__DCT_CONST_ROUNDING);
const __m128i out_20_4 = _mm_add_epi32(out_20_2, k__DCT_CONST_ROUNDING);
const __m128i out_20_5 = _mm_add_epi32(out_20_3, k__DCT_CONST_ROUNDING);
const __m128i out_12_4 = _mm_add_epi32(out_12_2, k__DCT_CONST_ROUNDING);
const __m128i out_12_5 = _mm_add_epi32(out_12_3, k__DCT_CONST_ROUNDING);
const __m128i out_28_4 = _mm_add_epi32(out_28_2, k__DCT_CONST_ROUNDING);
const __m128i out_28_5 = _mm_add_epi32(out_28_3, k__DCT_CONST_ROUNDING);
const __m128i out_04_6 = _mm_srai_epi32(out_04_4, DCT_CONST_BITS);
const __m128i out_04_7 = _mm_srai_epi32(out_04_5, DCT_CONST_BITS);
const __m128i out_20_6 = _mm_srai_epi32(out_20_4, DCT_CONST_BITS);
const __m128i out_20_7 = _mm_srai_epi32(out_20_5, DCT_CONST_BITS);
const __m128i out_12_6 = _mm_srai_epi32(out_12_4, DCT_CONST_BITS);
const __m128i out_12_7 = _mm_srai_epi32(out_12_5, DCT_CONST_BITS);
const __m128i out_28_6 = _mm_srai_epi32(out_28_4, DCT_CONST_BITS);
const __m128i out_28_7 = _mm_srai_epi32(out_28_5, DCT_CONST_BITS);
// Combine
out[4] = _mm_packs_epi32(out_04_6, out_04_7);
out[20] = _mm_packs_epi32(out_20_6, out_20_7);
out[12] = _mm_packs_epi32(out_12_6, out_12_7);
out[28] = _mm_packs_epi32(out_28_6, out_28_7);
}
{
step3[8] = _mm_add_epi16(step2[9], step1[8]);
step3[9] = _mm_sub_epi16(step1[8], step2[9]);
step3[10] = _mm_sub_epi16(step1[11], step2[10]);
step3[11] = _mm_add_epi16(step2[10], step1[11]);
step3[12] = _mm_add_epi16(step2[13], step1[12]);
step3[13] = _mm_sub_epi16(step1[12], step2[13]);
step3[14] = _mm_sub_epi16(step1[15], step2[14]);
step3[15] = _mm_add_epi16(step2[14], step1[15]);
}
{
const __m128i s3_17_0 = _mm_unpacklo_epi16(step2[17], step2[30]);
const __m128i s3_17_1 = _mm_unpackhi_epi16(step2[17], step2[30]);
const __m128i s3_18_0 = _mm_unpacklo_epi16(step2[18], step2[29]);
const __m128i s3_18_1 = _mm_unpackhi_epi16(step2[18], step2[29]);
const __m128i s3_21_0 = _mm_unpacklo_epi16(step2[21], step2[26]);
const __m128i s3_21_1 = _mm_unpackhi_epi16(step2[21], step2[26]);
const __m128i s3_22_0 = _mm_unpacklo_epi16(step2[22], step2[25]);
const __m128i s3_22_1 = _mm_unpackhi_epi16(step2[22], step2[25]);
const __m128i s3_17_2 = _mm_madd_epi16(s3_17_0, k__cospi_m04_p28);
const __m128i s3_17_3 = _mm_madd_epi16(s3_17_1, k__cospi_m04_p28);
const __m128i s3_18_2 = _mm_madd_epi16(s3_18_0, k__cospi_m28_m04);
const __m128i s3_18_3 = _mm_madd_epi16(s3_18_1, k__cospi_m28_m04);
const __m128i s3_21_2 = _mm_madd_epi16(s3_21_0, k__cospi_m20_p12);
const __m128i s3_21_3 = _mm_madd_epi16(s3_21_1, k__cospi_m20_p12);
const __m128i s3_22_2 = _mm_madd_epi16(s3_22_0, k__cospi_m12_m20);
const __m128i s3_22_3 = _mm_madd_epi16(s3_22_1, k__cospi_m12_m20);
const __m128i s3_25_2 = _mm_madd_epi16(s3_22_0, k__cospi_m20_p12);
const __m128i s3_25_3 = _mm_madd_epi16(s3_22_1, k__cospi_m20_p12);
const __m128i s3_26_2 = _mm_madd_epi16(s3_21_0, k__cospi_p12_p20);
const __m128i s3_26_3 = _mm_madd_epi16(s3_21_1, k__cospi_p12_p20);
const __m128i s3_29_2 = _mm_madd_epi16(s3_18_0, k__cospi_m04_p28);
const __m128i s3_29_3 = _mm_madd_epi16(s3_18_1, k__cospi_m04_p28);
const __m128i s3_30_2 = _mm_madd_epi16(s3_17_0, k__cospi_p28_p04);
const __m128i s3_30_3 = _mm_madd_epi16(s3_17_1, k__cospi_p28_p04);
// dct_const_round_shift
const __m128i s3_17_4 = _mm_add_epi32(s3_17_2, k__DCT_CONST_ROUNDING);
const __m128i s3_17_5 = _mm_add_epi32(s3_17_3, k__DCT_CONST_ROUNDING);
const __m128i s3_18_4 = _mm_add_epi32(s3_18_2, k__DCT_CONST_ROUNDING);
const __m128i s3_18_5 = _mm_add_epi32(s3_18_3, k__DCT_CONST_ROUNDING);
const __m128i s3_21_4 = _mm_add_epi32(s3_21_2, k__DCT_CONST_ROUNDING);
const __m128i s3_21_5 = _mm_add_epi32(s3_21_3, k__DCT_CONST_ROUNDING);
const __m128i s3_22_4 = _mm_add_epi32(s3_22_2, k__DCT_CONST_ROUNDING);
const __m128i s3_22_5 = _mm_add_epi32(s3_22_3, k__DCT_CONST_ROUNDING);
const __m128i s3_17_6 = _mm_srai_epi32(s3_17_4, DCT_CONST_BITS);
const __m128i s3_17_7 = _mm_srai_epi32(s3_17_5, DCT_CONST_BITS);
const __m128i s3_18_6 = _mm_srai_epi32(s3_18_4, DCT_CONST_BITS);
const __m128i s3_18_7 = _mm_srai_epi32(s3_18_5, DCT_CONST_BITS);
const __m128i s3_21_6 = _mm_srai_epi32(s3_21_4, DCT_CONST_BITS);
const __m128i s3_21_7 = _mm_srai_epi32(s3_21_5, DCT_CONST_BITS);
const __m128i s3_22_6 = _mm_srai_epi32(s3_22_4, DCT_CONST_BITS);
const __m128i s3_22_7 = _mm_srai_epi32(s3_22_5, DCT_CONST_BITS);
const __m128i s3_25_4 = _mm_add_epi32(s3_25_2, k__DCT_CONST_ROUNDING);
const __m128i s3_25_5 = _mm_add_epi32(s3_25_3, k__DCT_CONST_ROUNDING);
const __m128i s3_26_4 = _mm_add_epi32(s3_26_2, k__DCT_CONST_ROUNDING);
const __m128i s3_26_5 = _mm_add_epi32(s3_26_3, k__DCT_CONST_ROUNDING);
const __m128i s3_29_4 = _mm_add_epi32(s3_29_2, k__DCT_CONST_ROUNDING);
const __m128i s3_29_5 = _mm_add_epi32(s3_29_3, k__DCT_CONST_ROUNDING);
const __m128i s3_30_4 = _mm_add_epi32(s3_30_2, k__DCT_CONST_ROUNDING);
const __m128i s3_30_5 = _mm_add_epi32(s3_30_3, k__DCT_CONST_ROUNDING);
const __m128i s3_25_6 = _mm_srai_epi32(s3_25_4, DCT_CONST_BITS);
const __m128i s3_25_7 = _mm_srai_epi32(s3_25_5, DCT_CONST_BITS);
const __m128i s3_26_6 = _mm_srai_epi32(s3_26_4, DCT_CONST_BITS);
const __m128i s3_26_7 = _mm_srai_epi32(s3_26_5, DCT_CONST_BITS);
const __m128i s3_29_6 = _mm_srai_epi32(s3_29_4, DCT_CONST_BITS);
const __m128i s3_29_7 = _mm_srai_epi32(s3_29_5, DCT_CONST_BITS);
const __m128i s3_30_6 = _mm_srai_epi32(s3_30_4, DCT_CONST_BITS);
const __m128i s3_30_7 = _mm_srai_epi32(s3_30_5, DCT_CONST_BITS);
// Combine
step3[17] = _mm_packs_epi32(s3_17_6, s3_17_7);
step3[18] = _mm_packs_epi32(s3_18_6, s3_18_7);
step3[21] = _mm_packs_epi32(s3_21_6, s3_21_7);
step3[22] = _mm_packs_epi32(s3_22_6, s3_22_7);
step3[25] = _mm_packs_epi32(s3_25_6, s3_25_7);
step3[26] = _mm_packs_epi32(s3_26_6, s3_26_7);
step3[29] = _mm_packs_epi32(s3_29_6, s3_29_7);
step3[30] = _mm_packs_epi32(s3_30_6, s3_30_7);
}
// Stage 7
{
const __m128i out_02_0 = _mm_unpacklo_epi16(step3[8], step3[15]);
const __m128i out_02_1 = _mm_unpackhi_epi16(step3[8], step3[15]);
const __m128i out_18_0 = _mm_unpacklo_epi16(step3[9], step3[14]);
const __m128i out_18_1 = _mm_unpackhi_epi16(step3[9], step3[14]);
const __m128i out_10_0 = _mm_unpacklo_epi16(step3[10], step3[13]);
const __m128i out_10_1 = _mm_unpackhi_epi16(step3[10], step3[13]);
const __m128i out_26_0 = _mm_unpacklo_epi16(step3[11], step3[12]);
const __m128i out_26_1 = _mm_unpackhi_epi16(step3[11], step3[12]);
const __m128i out_02_2 = _mm_madd_epi16(out_02_0, k__cospi_p30_p02);
const __m128i out_02_3 = _mm_madd_epi16(out_02_1, k__cospi_p30_p02);
const __m128i out_18_2 = _mm_madd_epi16(out_18_0, k__cospi_p14_p18);
const __m128i out_18_3 = _mm_madd_epi16(out_18_1, k__cospi_p14_p18);
const __m128i out_10_2 = _mm_madd_epi16(out_10_0, k__cospi_p22_p10);
const __m128i out_10_3 = _mm_madd_epi16(out_10_1, k__cospi_p22_p10);
const __m128i out_26_2 = _mm_madd_epi16(out_26_0, k__cospi_p06_p26);
const __m128i out_26_3 = _mm_madd_epi16(out_26_1, k__cospi_p06_p26);
const __m128i out_06_2 = _mm_madd_epi16(out_26_0, k__cospi_m26_p06);
const __m128i out_06_3 = _mm_madd_epi16(out_26_1, k__cospi_m26_p06);
const __m128i out_22_2 = _mm_madd_epi16(out_10_0, k__cospi_m10_p22);
const __m128i out_22_3 = _mm_madd_epi16(out_10_1, k__cospi_m10_p22);
const __m128i out_14_2 = _mm_madd_epi16(out_18_0, k__cospi_m18_p14);
const __m128i out_14_3 = _mm_madd_epi16(out_18_1, k__cospi_m18_p14);
const __m128i out_30_2 = _mm_madd_epi16(out_02_0, k__cospi_m02_p30);
const __m128i out_30_3 = _mm_madd_epi16(out_02_1, k__cospi_m02_p30);
// dct_const_round_shift
const __m128i out_02_4 = _mm_add_epi32(out_02_2, k__DCT_CONST_ROUNDING);
const __m128i out_02_5 = _mm_add_epi32(out_02_3, k__DCT_CONST_ROUNDING);
const __m128i out_18_4 = _mm_add_epi32(out_18_2, k__DCT_CONST_ROUNDING);
const __m128i out_18_5 = _mm_add_epi32(out_18_3, k__DCT_CONST_ROUNDING);
const __m128i out_10_4 = _mm_add_epi32(out_10_2, k__DCT_CONST_ROUNDING);
const __m128i out_10_5 = _mm_add_epi32(out_10_3, k__DCT_CONST_ROUNDING);
const __m128i out_26_4 = _mm_add_epi32(out_26_2, k__DCT_CONST_ROUNDING);
const __m128i out_26_5 = _mm_add_epi32(out_26_3, k__DCT_CONST_ROUNDING);
const __m128i out_06_4 = _mm_add_epi32(out_06_2, k__DCT_CONST_ROUNDING);
const __m128i out_06_5 = _mm_add_epi32(out_06_3, k__DCT_CONST_ROUNDING);
const __m128i out_22_4 = _mm_add_epi32(out_22_2, k__DCT_CONST_ROUNDING);
const __m128i out_22_5 = _mm_add_epi32(out_22_3, k__DCT_CONST_ROUNDING);
const __m128i out_14_4 = _mm_add_epi32(out_14_2, k__DCT_CONST_ROUNDING);
const __m128i out_14_5 = _mm_add_epi32(out_14_3, k__DCT_CONST_ROUNDING);
const __m128i out_30_4 = _mm_add_epi32(out_30_2, k__DCT_CONST_ROUNDING);
const __m128i out_30_5 = _mm_add_epi32(out_30_3, k__DCT_CONST_ROUNDING);
const __m128i out_02_6 = _mm_srai_epi32(out_02_4, DCT_CONST_BITS);
const __m128i out_02_7 = _mm_srai_epi32(out_02_5, DCT_CONST_BITS);
const __m128i out_18_6 = _mm_srai_epi32(out_18_4, DCT_CONST_BITS);
const __m128i out_18_7 = _mm_srai_epi32(out_18_5, DCT_CONST_BITS);
const __m128i out_10_6 = _mm_srai_epi32(out_10_4, DCT_CONST_BITS);
const __m128i out_10_7 = _mm_srai_epi32(out_10_5, DCT_CONST_BITS);
const __m128i out_26_6 = _mm_srai_epi32(out_26_4, DCT_CONST_BITS);
const __m128i out_26_7 = _mm_srai_epi32(out_26_5, DCT_CONST_BITS);
const __m128i out_06_6 = _mm_srai_epi32(out_06_4, DCT_CONST_BITS);
const __m128i out_06_7 = _mm_srai_epi32(out_06_5, DCT_CONST_BITS);
const __m128i out_22_6 = _mm_srai_epi32(out_22_4, DCT_CONST_BITS);
const __m128i out_22_7 = _mm_srai_epi32(out_22_5, DCT_CONST_BITS);
const __m128i out_14_6 = _mm_srai_epi32(out_14_4, DCT_CONST_BITS);
const __m128i out_14_7 = _mm_srai_epi32(out_14_5, DCT_CONST_BITS);
const __m128i out_30_6 = _mm_srai_epi32(out_30_4, DCT_CONST_BITS);
const __m128i out_30_7 = _mm_srai_epi32(out_30_5, DCT_CONST_BITS);
// Combine
out[2] = _mm_packs_epi32(out_02_6, out_02_7);
out[18] = _mm_packs_epi32(out_18_6, out_18_7);
out[10] = _mm_packs_epi32(out_10_6, out_10_7);
out[26] = _mm_packs_epi32(out_26_6, out_26_7);
out[6] = _mm_packs_epi32(out_06_6, out_06_7);
out[22] = _mm_packs_epi32(out_22_6, out_22_7);
out[14] = _mm_packs_epi32(out_14_6, out_14_7);
out[30] = _mm_packs_epi32(out_30_6, out_30_7);
}
{
step1[16] = _mm_add_epi16(step3[17], step2[16]);
step1[17] = _mm_sub_epi16(step2[16], step3[17]);
step1[18] = _mm_sub_epi16(step2[19], step3[18]);
step1[19] = _mm_add_epi16(step3[18], step2[19]);
step1[20] = _mm_add_epi16(step3[21], step2[20]);
step1[21] = _mm_sub_epi16(step2[20], step3[21]);
step1[22] = _mm_sub_epi16(step2[23], step3[22]);
step1[23] = _mm_add_epi16(step3[22], step2[23]);
step1[24] = _mm_add_epi16(step3[25], step2[24]);
step1[25] = _mm_sub_epi16(step2[24], step3[25]);
step1[26] = _mm_sub_epi16(step2[27], step3[26]);
step1[27] = _mm_add_epi16(step3[26], step2[27]);
step1[28] = _mm_add_epi16(step3[29], step2[28]);
step1[29] = _mm_sub_epi16(step2[28], step3[29]);
step1[30] = _mm_sub_epi16(step2[31], step3[30]);
step1[31] = _mm_add_epi16(step3[30], step2[31]);
}
// Final stage --- output indices are bit-reversed.
{
const __m128i out_01_0 = _mm_unpacklo_epi16(step1[16], step1[31]);
const __m128i out_01_1 = _mm_unpackhi_epi16(step1[16], step1[31]);
const __m128i out_17_0 = _mm_unpacklo_epi16(step1[17], step1[30]);
const __m128i out_17_1 = _mm_unpackhi_epi16(step1[17], step1[30]);
const __m128i out_09_0 = _mm_unpacklo_epi16(step1[18], step1[29]);
const __m128i out_09_1 = _mm_unpackhi_epi16(step1[18], step1[29]);
const __m128i out_25_0 = _mm_unpacklo_epi16(step1[19], step1[28]);
const __m128i out_25_1 = _mm_unpackhi_epi16(step1[19], step1[28]);
const __m128i out_01_2 = _mm_madd_epi16(out_01_0, k__cospi_p31_p01);
const __m128i out_01_3 = _mm_madd_epi16(out_01_1, k__cospi_p31_p01);
const __m128i out_17_2 = _mm_madd_epi16(out_17_0, k__cospi_p15_p17);
const __m128i out_17_3 = _mm_madd_epi16(out_17_1, k__cospi_p15_p17);
const __m128i out_09_2 = _mm_madd_epi16(out_09_0, k__cospi_p23_p09);
const __m128i out_09_3 = _mm_madd_epi16(out_09_1, k__cospi_p23_p09);
const __m128i out_25_2 = _mm_madd_epi16(out_25_0, k__cospi_p07_p25);
const __m128i out_25_3 = _mm_madd_epi16(out_25_1, k__cospi_p07_p25);
const __m128i out_07_2 = _mm_madd_epi16(out_25_0, k__cospi_m25_p07);
const __m128i out_07_3 = _mm_madd_epi16(out_25_1, k__cospi_m25_p07);
const __m128i out_23_2 = _mm_madd_epi16(out_09_0, k__cospi_m09_p23);
const __m128i out_23_3 = _mm_madd_epi16(out_09_1, k__cospi_m09_p23);
const __m128i out_15_2 = _mm_madd_epi16(out_17_0, k__cospi_m17_p15);
const __m128i out_15_3 = _mm_madd_epi16(out_17_1, k__cospi_m17_p15);
const __m128i out_31_2 = _mm_madd_epi16(out_01_0, k__cospi_m01_p31);
const __m128i out_31_3 = _mm_madd_epi16(out_01_1, k__cospi_m01_p31);
// dct_const_round_shift
const __m128i out_01_4 = _mm_add_epi32(out_01_2, k__DCT_CONST_ROUNDING);
const __m128i out_01_5 = _mm_add_epi32(out_01_3, k__DCT_CONST_ROUNDING);
const __m128i out_17_4 = _mm_add_epi32(out_17_2, k__DCT_CONST_ROUNDING);
const __m128i out_17_5 = _mm_add_epi32(out_17_3, k__DCT_CONST_ROUNDING);
const __m128i out_09_4 = _mm_add_epi32(out_09_2, k__DCT_CONST_ROUNDING);
const __m128i out_09_5 = _mm_add_epi32(out_09_3, k__DCT_CONST_ROUNDING);
const __m128i out_25_4 = _mm_add_epi32(out_25_2, k__DCT_CONST_ROUNDING);
const __m128i out_25_5 = _mm_add_epi32(out_25_3, k__DCT_CONST_ROUNDING);
const __m128i out_07_4 = _mm_add_epi32(out_07_2, k__DCT_CONST_ROUNDING);
const __m128i out_07_5 = _mm_add_epi32(out_07_3, k__DCT_CONST_ROUNDING);
const __m128i out_23_4 = _mm_add_epi32(out_23_2, k__DCT_CONST_ROUNDING);
const __m128i out_23_5 = _mm_add_epi32(out_23_3, k__DCT_CONST_ROUNDING);
const __m128i out_15_4 = _mm_add_epi32(out_15_2, k__DCT_CONST_ROUNDING);
const __m128i out_15_5 = _mm_add_epi32(out_15_3, k__DCT_CONST_ROUNDING);
const __m128i out_31_4 = _mm_add_epi32(out_31_2, k__DCT_CONST_ROUNDING);
const __m128i out_31_5 = _mm_add_epi32(out_31_3, k__DCT_CONST_ROUNDING);
const __m128i out_01_6 = _mm_srai_epi32(out_01_4, DCT_CONST_BITS);
const __m128i out_01_7 = _mm_srai_epi32(out_01_5, DCT_CONST_BITS);
const __m128i out_17_6 = _mm_srai_epi32(out_17_4, DCT_CONST_BITS);
const __m128i out_17_7 = _mm_srai_epi32(out_17_5, DCT_CONST_BITS);
const __m128i out_09_6 = _mm_srai_epi32(out_09_4, DCT_CONST_BITS);
const __m128i out_09_7 = _mm_srai_epi32(out_09_5, DCT_CONST_BITS);
const __m128i out_25_6 = _mm_srai_epi32(out_25_4, DCT_CONST_BITS);
const __m128i out_25_7 = _mm_srai_epi32(out_25_5, DCT_CONST_BITS);
const __m128i out_07_6 = _mm_srai_epi32(out_07_4, DCT_CONST_BITS);
const __m128i out_07_7 = _mm_srai_epi32(out_07_5, DCT_CONST_BITS);
const __m128i out_23_6 = _mm_srai_epi32(out_23_4, DCT_CONST_BITS);
const __m128i out_23_7 = _mm_srai_epi32(out_23_5, DCT_CONST_BITS);
const __m128i out_15_6 = _mm_srai_epi32(out_15_4, DCT_CONST_BITS);
const __m128i out_15_7 = _mm_srai_epi32(out_15_5, DCT_CONST_BITS);
const __m128i out_31_6 = _mm_srai_epi32(out_31_4, DCT_CONST_BITS);
const __m128i out_31_7 = _mm_srai_epi32(out_31_5, DCT_CONST_BITS);
// Combine
out[1] = _mm_packs_epi32(out_01_6, out_01_7);
out[17] = _mm_packs_epi32(out_17_6, out_17_7);
out[9] = _mm_packs_epi32(out_09_6, out_09_7);
out[25] = _mm_packs_epi32(out_25_6, out_25_7);
out[7] = _mm_packs_epi32(out_07_6, out_07_7);
out[23] = _mm_packs_epi32(out_23_6, out_23_7);
out[15] = _mm_packs_epi32(out_15_6, out_15_7);
out[31] = _mm_packs_epi32(out_31_6, out_31_7);
}
{
const __m128i out_05_0 = _mm_unpacklo_epi16(step1[20], step1[27]);
const __m128i out_05_1 = _mm_unpackhi_epi16(step1[20], step1[27]);
const __m128i out_21_0 = _mm_unpacklo_epi16(step1[21], step1[26]);
const __m128i out_21_1 = _mm_unpackhi_epi16(step1[21], step1[26]);
const __m128i out_13_0 = _mm_unpacklo_epi16(step1[22], step1[25]);
const __m128i out_13_1 = _mm_unpackhi_epi16(step1[22], step1[25]);
const __m128i out_29_0 = _mm_unpacklo_epi16(step1[23], step1[24]);
const __m128i out_29_1 = _mm_unpackhi_epi16(step1[23], step1[24]);
const __m128i out_05_2 = _mm_madd_epi16(out_05_0, k__cospi_p27_p05);
const __m128i out_05_3 = _mm_madd_epi16(out_05_1, k__cospi_p27_p05);
const __m128i out_21_2 = _mm_madd_epi16(out_21_0, k__cospi_p11_p21);
const __m128i out_21_3 = _mm_madd_epi16(out_21_1, k__cospi_p11_p21);
const __m128i out_13_2 = _mm_madd_epi16(out_13_0, k__cospi_p19_p13);
const __m128i out_13_3 = _mm_madd_epi16(out_13_1, k__cospi_p19_p13);
const __m128i out_29_2 = _mm_madd_epi16(out_29_0, k__cospi_p03_p29);
const __m128i out_29_3 = _mm_madd_epi16(out_29_1, k__cospi_p03_p29);
const __m128i out_03_2 = _mm_madd_epi16(out_29_0, k__cospi_m29_p03);
const __m128i out_03_3 = _mm_madd_epi16(out_29_1, k__cospi_m29_p03);
const __m128i out_19_2 = _mm_madd_epi16(out_13_0, k__cospi_m13_p19);
const __m128i out_19_3 = _mm_madd_epi16(out_13_1, k__cospi_m13_p19);
const __m128i out_11_2 = _mm_madd_epi16(out_21_0, k__cospi_m21_p11);
const __m128i out_11_3 = _mm_madd_epi16(out_21_1, k__cospi_m21_p11);
const __m128i out_27_2 = _mm_madd_epi16(out_05_0, k__cospi_m05_p27);
const __m128i out_27_3 = _mm_madd_epi16(out_05_1, k__cospi_m05_p27);
// dct_const_round_shift
const __m128i out_05_4 = _mm_add_epi32(out_05_2, k__DCT_CONST_ROUNDING);
const __m128i out_05_5 = _mm_add_epi32(out_05_3, k__DCT_CONST_ROUNDING);
const __m128i out_21_4 = _mm_add_epi32(out_21_2, k__DCT_CONST_ROUNDING);
const __m128i out_21_5 = _mm_add_epi32(out_21_3, k__DCT_CONST_ROUNDING);
const __m128i out_13_4 = _mm_add_epi32(out_13_2, k__DCT_CONST_ROUNDING);
const __m128i out_13_5 = _mm_add_epi32(out_13_3, k__DCT_CONST_ROUNDING);
const __m128i out_29_4 = _mm_add_epi32(out_29_2, k__DCT_CONST_ROUNDING);
const __m128i out_29_5 = _mm_add_epi32(out_29_3, k__DCT_CONST_ROUNDING);
const __m128i out_03_4 = _mm_add_epi32(out_03_2, k__DCT_CONST_ROUNDING);
const __m128i out_03_5 = _mm_add_epi32(out_03_3, k__DCT_CONST_ROUNDING);
const __m128i out_19_4 = _mm_add_epi32(out_19_2, k__DCT_CONST_ROUNDING);
const __m128i out_19_5 = _mm_add_epi32(out_19_3, k__DCT_CONST_ROUNDING);
const __m128i out_11_4 = _mm_add_epi32(out_11_2, k__DCT_CONST_ROUNDING);
const __m128i out_11_5 = _mm_add_epi32(out_11_3, k__DCT_CONST_ROUNDING);
const __m128i out_27_4 = _mm_add_epi32(out_27_2, k__DCT_CONST_ROUNDING);
const __m128i out_27_5 = _mm_add_epi32(out_27_3, k__DCT_CONST_ROUNDING);
const __m128i out_05_6 = _mm_srai_epi32(out_05_4, DCT_CONST_BITS);
const __m128i out_05_7 = _mm_srai_epi32(out_05_5, DCT_CONST_BITS);
const __m128i out_21_6 = _mm_srai_epi32(out_21_4, DCT_CONST_BITS);
const __m128i out_21_7 = _mm_srai_epi32(out_21_5, DCT_CONST_BITS);
const __m128i out_13_6 = _mm_srai_epi32(out_13_4, DCT_CONST_BITS);
const __m128i out_13_7 = _mm_srai_epi32(out_13_5, DCT_CONST_BITS);
const __m128i out_29_6 = _mm_srai_epi32(out_29_4, DCT_CONST_BITS);
const __m128i out_29_7 = _mm_srai_epi32(out_29_5, DCT_CONST_BITS);
const __m128i out_03_6 = _mm_srai_epi32(out_03_4, DCT_CONST_BITS);
const __m128i out_03_7 = _mm_srai_epi32(out_03_5, DCT_CONST_BITS);
const __m128i out_19_6 = _mm_srai_epi32(out_19_4, DCT_CONST_BITS);
const __m128i out_19_7 = _mm_srai_epi32(out_19_5, DCT_CONST_BITS);
const __m128i out_11_6 = _mm_srai_epi32(out_11_4, DCT_CONST_BITS);
const __m128i out_11_7 = _mm_srai_epi32(out_11_5, DCT_CONST_BITS);
const __m128i out_27_6 = _mm_srai_epi32(out_27_4, DCT_CONST_BITS);
const __m128i out_27_7 = _mm_srai_epi32(out_27_5, DCT_CONST_BITS);
// Combine
out[5] = _mm_packs_epi32(out_05_6, out_05_7);
out[21] = _mm_packs_epi32(out_21_6, out_21_7);
out[13] = _mm_packs_epi32(out_13_6, out_13_7);
out[29] = _mm_packs_epi32(out_29_6, out_29_7);
out[3] = _mm_packs_epi32(out_03_6, out_03_7);
out[19] = _mm_packs_epi32(out_19_6, out_19_7);
out[11] = _mm_packs_epi32(out_11_6, out_11_7);
out[27] = _mm_packs_epi32(out_27_6, out_27_7);
}
// Output results
{
int j;
for (j = 0; j < 16; ++j) {
_mm_storeu_si128((__m128i *)(in0 + j), out[j]);
_mm_storeu_si128((__m128i *)(in1 + j), out[j + 16]);
}
}
} // NOLINT
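
Every butterfly above follows the same fixed-point recipe: widen with an unpack, multiply-accumulate against a pair of cosine constants with _mm_madd_epi16, add a rounding constant, arithmetic-shift right, and pack back to 16 bits. A scalar model of that round-and-shift step, as a sketch (DCT_CONST_BITS is 14 in libvpx/libaom; the helper name is illustrative):

#define DCT_CONST_BITS 14
#define DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1))
/* Scalar equivalent of the add/srai pair applied after each
 * _mm_madd_epi16 above: round to nearest, then drop 14 fraction bits. */
static int dct_const_round_shift_model(int x) {
  return (x + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
}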


@@ -1,24 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include "./aom_config.h"
#define FDCT32x32_2D_AVX2 aom_fdct32x32_rd_avx2
#define FDCT32x32_HIGH_PRECISION 0
#include "aom_dsp/x86/fwd_dct32x32_impl_avx2.h"
#undef FDCT32x32_2D_AVX2
#undef FDCT32x32_HIGH_PRECISION
#define FDCT32x32_2D_AVX2 aom_fdct32x32_avx2
#define FDCT32x32_HIGH_PRECISION 1
#include "aom_dsp/x86/fwd_dct32x32_impl_avx2.h" // NOLINT
#undef FDCT32x32_2D_AVX2
#undef FDCT32x32_HIGH_PRECISION
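
The file above is just a stamp: it sets FDCT32x32_HIGH_PRECISION to 0 and includes the shared implementation header to emit aom_fdct32x32_rd_avx2, then repeats with the flag set to 1 for aom_fdct32x32_avx2. A single-file model of the same one-body, two-variants technique, with hypothetical names:

/* Hypothetical stand-in for the #define/#include/#undef pattern above:
 * one body, instantiated twice with different precision settings. */
#define DEFINE_ROUND_SHIFT(name, high_precision)                   \
  static int name(int v) {                                         \
    return (high_precision) ? ((v + (1 << 13)) >> 14) : (v >> 14); \
  }
DEFINE_ROUND_SHIFT(round_shift_hp, 1) /* like FDCT32x32_HIGH_PRECISION 1 */
DEFINE_ROUND_SHIFT(round_shift_rd, 0) /* like the _rd variant */
#undef DEFINE_ROUND_SHIFT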


@@ -1,35 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifndef AOM_DSP_X86_FWD_TXFM_AVX2_H
#define AOM_DSP_X86_FWD_TXFM_AVX2_H
#include "./aom_config.h"
static INLINE void storeu_output_avx2(const __m256i *coeff, tran_low_t *out) {
#if CONFIG_AOM_HIGHBITDEPTH
const __m256i zero = _mm256_setzero_si256();
const __m256i sign = _mm256_cmpgt_epi16(zero, *coeff);
__m256i x0 = _mm256_unpacklo_epi16(*coeff, sign);
__m256i x1 = _mm256_unpackhi_epi16(*coeff, sign);
__m256i y0 = _mm256_permute2x128_si256(x0, x1, 0x20);
__m256i y1 = _mm256_permute2x128_si256(x0, x1, 0x31);
_mm256_storeu_si256((__m256i *)out, y0);
_mm256_storeu_si256((__m256i *)(out + 8), y1);
#else
_mm256_storeu_si256((__m256i *)out, *coeff);
#endif
}
#endif // AOM_DSP_X86_FWD_TXFM_AVX2_H
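
In the CONFIG_AOM_HIGHBITDEPTH build, tran_low_t is 32 bits wide, so the sixteen 16-bit coefficients must be sign-extended on the way out: the cmpgt against zero manufactures sign words, the unpacks interleave them, and the two permute2x128 steps undo the lane-local ordering so the stores land in memory order. A scalar sketch of the same widening:

#include <stdint.h>
/* Scalar model of storeu_output_avx2 in the high-bit-depth build (a
 * sketch): widen 16 coefficients from int16_t to int32_t storage. */
static void storeu_output_model(const int16_t *coeff, int32_t *out) {
  int i;
  for (i = 0; i < 16; ++i) out[i] = coeff[i]; /* implicit sign extension */
}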


@@ -1,363 +0,0 @@
/*
* Copyright (c) 2016 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <assert.h>
#include <emmintrin.h>
#include <stddef.h>
#include "./aom_config.h"
#include "./aom_dsp_rtcd.h"
typedef void (*SubtractWxHFuncType)(int16_t *diff, ptrdiff_t diff_stride,
const uint16_t *src, ptrdiff_t src_stride,
const uint16_t *pred,
ptrdiff_t pred_stride);
static void subtract_4x4(int16_t *diff, ptrdiff_t diff_stride,
const uint16_t *src, ptrdiff_t src_stride,
const uint16_t *pred, ptrdiff_t pred_stride) {
__m128i u0, u1, u2, u3;
__m128i v0, v1, v2, v3;
__m128i x0, x1, x2, x3;
int64_t *store_diff = (int64_t *)(diff + 0 * diff_stride);
u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride));
u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride));
u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride));
u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride));
v0 = _mm_loadu_si128((__m128i const *)(pred + 0 * pred_stride));
v1 = _mm_loadu_si128((__m128i const *)(pred + 1 * pred_stride));
v2 = _mm_loadu_si128((__m128i const *)(pred + 2 * pred_stride));
v3 = _mm_loadu_si128((__m128i const *)(pred + 3 * pred_stride));
x0 = _mm_sub_epi16(u0, v0);
x1 = _mm_sub_epi16(u1, v1);
x2 = _mm_sub_epi16(u2, v2);
x3 = _mm_sub_epi16(u3, v3);
_mm_storel_epi64((__m128i *)store_diff, x0);
store_diff = (int64_t *)(diff + 1 * diff_stride);
_mm_storel_epi64((__m128i *)store_diff, x1);
store_diff = (int64_t *)(diff + 2 * diff_stride);
_mm_storel_epi64((__m128i *)store_diff, x2);
store_diff = (int64_t *)(diff + 3 * diff_stride);
_mm_storel_epi64((__m128i *)store_diff, x3);
}
static void subtract_4x8(int16_t *diff, ptrdiff_t diff_stride,
const uint16_t *src, ptrdiff_t src_stride,
const uint16_t *pred, ptrdiff_t pred_stride) {
__m128i u0, u1, u2, u3, u4, u5, u6, u7;
__m128i v0, v1, v2, v3, v4, v5, v6, v7;
__m128i x0, x1, x2, x3, x4, x5, x6, x7;
int64_t *store_diff = (int64_t *)(diff + 0 * diff_stride);
u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride));
u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride));
u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride));
u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride));
u4 = _mm_loadu_si128((__m128i const *)(src + 4 * src_stride));
u5 = _mm_loadu_si128((__m128i const *)(src + 5 * src_stride));
u6 = _mm_loadu_si128((__m128i const *)(src + 6 * src_stride));
u7 = _mm_loadu_si128((__m128i const *)(src + 7 * src_stride));
v0 = _mm_loadu_si128((__m128i const *)(pred + 0 * pred_stride));
v1 = _mm_loadu_si128((__m128i const *)(pred + 1 * pred_stride));
v2 = _mm_loadu_si128((__m128i const *)(pred + 2 * pred_stride));
v3 = _mm_loadu_si128((__m128i const *)(pred + 3 * pred_stride));
v4 = _mm_loadu_si128((__m128i const *)(pred + 4 * pred_stride));
v5 = _mm_loadu_si128((__m128i const *)(pred + 5 * pred_stride));
v6 = _mm_loadu_si128((__m128i const *)(pred + 6 * pred_stride));
v7 = _mm_loadu_si128((__m128i const *)(pred + 7 * pred_stride));
x0 = _mm_sub_epi16(u0, v0);
x1 = _mm_sub_epi16(u1, v1);
x2 = _mm_sub_epi16(u2, v2);
x3 = _mm_sub_epi16(u3, v3);
x4 = _mm_sub_epi16(u4, v4);
x5 = _mm_sub_epi16(u5, v5);
x6 = _mm_sub_epi16(u6, v6);
x7 = _mm_sub_epi16(u7, v7);
_mm_storel_epi64((__m128i *)store_diff, x0);
store_diff = (int64_t *)(diff + 1 * diff_stride);
_mm_storel_epi64((__m128i *)store_diff, x1);
store_diff = (int64_t *)(diff + 2 * diff_stride);
_mm_storel_epi64((__m128i *)store_diff, x2);
store_diff = (int64_t *)(diff + 3 * diff_stride);
_mm_storel_epi64((__m128i *)store_diff, x3);
store_diff = (int64_t *)(diff + 4 * diff_stride);
_mm_storel_epi64((__m128i *)store_diff, x4);
store_diff = (int64_t *)(diff + 5 * diff_stride);
_mm_storel_epi64((__m128i *)store_diff, x5);
store_diff = (int64_t *)(diff + 6 * diff_stride);
_mm_storel_epi64((__m128i *)store_diff, x6);
store_diff = (int64_t *)(diff + 7 * diff_stride);
_mm_storel_epi64((__m128i *)store_diff, x7);
}
static void subtract_8x4(int16_t *diff, ptrdiff_t diff_stride,
const uint16_t *src, ptrdiff_t src_stride,
const uint16_t *pred, ptrdiff_t pred_stride) {
__m128i u0, u1, u2, u3;
__m128i v0, v1, v2, v3;
__m128i x0, x1, x2, x3;
u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride));
u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride));
u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride));
u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride));
v0 = _mm_loadu_si128((__m128i const *)(pred + 0 * pred_stride));
v1 = _mm_loadu_si128((__m128i const *)(pred + 1 * pred_stride));
v2 = _mm_loadu_si128((__m128i const *)(pred + 2 * pred_stride));
v3 = _mm_loadu_si128((__m128i const *)(pred + 3 * pred_stride));
x0 = _mm_sub_epi16(u0, v0);
x1 = _mm_sub_epi16(u1, v1);
x2 = _mm_sub_epi16(u2, v2);
x3 = _mm_sub_epi16(u3, v3);
_mm_storeu_si128((__m128i *)(diff + 0 * diff_stride), x0);
_mm_storeu_si128((__m128i *)(diff + 1 * diff_stride), x1);
_mm_storeu_si128((__m128i *)(diff + 2 * diff_stride), x2);
_mm_storeu_si128((__m128i *)(diff + 3 * diff_stride), x3);
}
static void subtract_8x8(int16_t *diff, ptrdiff_t diff_stride,
const uint16_t *src, ptrdiff_t src_stride,
const uint16_t *pred, ptrdiff_t pred_stride) {
__m128i u0, u1, u2, u3, u4, u5, u6, u7;
__m128i v0, v1, v2, v3, v4, v5, v6, v7;
__m128i x0, x1, x2, x3, x4, x5, x6, x7;
u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride));
u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride));
u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride));
u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride));
u4 = _mm_loadu_si128((__m128i const *)(src + 4 * src_stride));
u5 = _mm_loadu_si128((__m128i const *)(src + 5 * src_stride));
u6 = _mm_loadu_si128((__m128i const *)(src + 6 * src_stride));
u7 = _mm_loadu_si128((__m128i const *)(src + 7 * src_stride));
v0 = _mm_loadu_si128((__m128i const *)(pred + 0 * pred_stride));
v1 = _mm_loadu_si128((__m128i const *)(pred + 1 * pred_stride));
v2 = _mm_loadu_si128((__m128i const *)(pred + 2 * pred_stride));
v3 = _mm_loadu_si128((__m128i const *)(pred + 3 * pred_stride));
v4 = _mm_loadu_si128((__m128i const *)(pred + 4 * pred_stride));
v5 = _mm_loadu_si128((__m128i const *)(pred + 5 * pred_stride));
v6 = _mm_loadu_si128((__m128i const *)(pred + 6 * pred_stride));
v7 = _mm_loadu_si128((__m128i const *)(pred + 7 * pred_stride));
x0 = _mm_sub_epi16(u0, v0);
x1 = _mm_sub_epi16(u1, v1);
x2 = _mm_sub_epi16(u2, v2);
x3 = _mm_sub_epi16(u3, v3);
x4 = _mm_sub_epi16(u4, v4);
x5 = _mm_sub_epi16(u5, v5);
x6 = _mm_sub_epi16(u6, v6);
x7 = _mm_sub_epi16(u7, v7);
_mm_storeu_si128((__m128i *)(diff + 0 * diff_stride), x0);
_mm_storeu_si128((__m128i *)(diff + 1 * diff_stride), x1);
_mm_storeu_si128((__m128i *)(diff + 2 * diff_stride), x2);
_mm_storeu_si128((__m128i *)(diff + 3 * diff_stride), x3);
_mm_storeu_si128((__m128i *)(diff + 4 * diff_stride), x4);
_mm_storeu_si128((__m128i *)(diff + 5 * diff_stride), x5);
_mm_storeu_si128((__m128i *)(diff + 6 * diff_stride), x6);
_mm_storeu_si128((__m128i *)(diff + 7 * diff_stride), x7);
}
static void subtract_8x16(int16_t *diff, ptrdiff_t diff_stride,
const uint16_t *src, ptrdiff_t src_stride,
const uint16_t *pred, ptrdiff_t pred_stride) {
subtract_8x8(diff, diff_stride, src, src_stride, pred, pred_stride);
diff += diff_stride << 3;
src += src_stride << 3;
pred += pred_stride << 3;
subtract_8x8(diff, diff_stride, src, src_stride, pred, pred_stride);
}
static void subtract_16x8(int16_t *diff, ptrdiff_t diff_stride,
const uint16_t *src, ptrdiff_t src_stride,
const uint16_t *pred, ptrdiff_t pred_stride) {
subtract_8x8(diff, diff_stride, src, src_stride, pred, pred_stride);
diff += 8;
src += 8;
pred += 8;
subtract_8x8(diff, diff_stride, src, src_stride, pred, pred_stride);
}
static void subtract_16x16(int16_t *diff, ptrdiff_t diff_stride,
const uint16_t *src, ptrdiff_t src_stride,
const uint16_t *pred, ptrdiff_t pred_stride) {
subtract_16x8(diff, diff_stride, src, src_stride, pred, pred_stride);
diff += diff_stride << 3;
src += src_stride << 3;
pred += pred_stride << 3;
subtract_16x8(diff, diff_stride, src, src_stride, pred, pred_stride);
}
static void subtract_16x32(int16_t *diff, ptrdiff_t diff_stride,
const uint16_t *src, ptrdiff_t src_stride,
const uint16_t *pred, ptrdiff_t pred_stride) {
subtract_16x16(diff, diff_stride, src, src_stride, pred, pred_stride);
diff += diff_stride << 4;
src += src_stride << 4;
pred += pred_stride << 4;
subtract_16x16(diff, diff_stride, src, src_stride, pred, pred_stride);
}
static void subtract_32x16(int16_t *diff, ptrdiff_t diff_stride,
const uint16_t *src, ptrdiff_t src_stride,
const uint16_t *pred, ptrdiff_t pred_stride) {
subtract_16x16(diff, diff_stride, src, src_stride, pred, pred_stride);
diff += 16;
src += 16;
pred += 16;
subtract_16x16(diff, diff_stride, src, src_stride, pred, pred_stride);
}
static void subtract_32x32(int16_t *diff, ptrdiff_t diff_stride,
const uint16_t *src, ptrdiff_t src_stride,
const uint16_t *pred, ptrdiff_t pred_stride) {
subtract_32x16(diff, diff_stride, src, src_stride, pred, pred_stride);
diff += diff_stride << 4;
src += src_stride << 4;
pred += pred_stride << 4;
subtract_32x16(diff, diff_stride, src, src_stride, pred, pred_stride);
}
static void subtract_32x64(int16_t *diff, ptrdiff_t diff_stride,
const uint16_t *src, ptrdiff_t src_stride,
const uint16_t *pred, ptrdiff_t pred_stride) {
subtract_32x32(diff, diff_stride, src, src_stride, pred, pred_stride);
diff += diff_stride << 5;
src += src_stride << 5;
pred += pred_stride << 5;
subtract_32x32(diff, diff_stride, src, src_stride, pred, pred_stride);
}
static void subtract_64x32(int16_t *diff, ptrdiff_t diff_stride,
const uint16_t *src, ptrdiff_t src_stride,
const uint16_t *pred, ptrdiff_t pred_stride) {
subtract_32x32(diff, diff_stride, src, src_stride, pred, pred_stride);
diff += 32;
src += 32;
pred += 32;
subtract_32x32(diff, diff_stride, src, src_stride, pred, pred_stride);
}
static void subtract_64x64(int16_t *diff, ptrdiff_t diff_stride,
const uint16_t *src, ptrdiff_t src_stride,
const uint16_t *pred, ptrdiff_t pred_stride) {
subtract_64x32(diff, diff_stride, src, src_stride, pred, pred_stride);
diff += diff_stride << 5;
src += src_stride << 5;
pred += pred_stride << 5;
subtract_64x32(diff, diff_stride, src, src_stride, pred, pred_stride);
}
static void subtract_64x128(int16_t *diff, ptrdiff_t diff_stride,
const uint16_t *src, ptrdiff_t src_stride,
const uint16_t *pred, ptrdiff_t pred_stride) {
subtract_64x64(diff, diff_stride, src, src_stride, pred, pred_stride);
diff += diff_stride << 6;
src += src_stride << 6;
pred += pred_stride << 6;
subtract_64x64(diff, diff_stride, src, src_stride, pred, pred_stride);
}
static void subtract_128x64(int16_t *diff, ptrdiff_t diff_stride,
const uint16_t *src, ptrdiff_t src_stride,
const uint16_t *pred, ptrdiff_t pred_stride) {
subtract_64x64(diff, diff_stride, src, src_stride, pred, pred_stride);
diff += 64;
src += 64;
pred += 64;
subtract_64x64(diff, diff_stride, src, src_stride, pred, pred_stride);
}
static void subtract_128x128(int16_t *diff, ptrdiff_t diff_stride,
const uint16_t *src, ptrdiff_t src_stride,
const uint16_t *pred, ptrdiff_t pred_stride) {
subtract_128x64(diff, diff_stride, src, src_stride, pred, pred_stride);
diff += diff_stride << 6;
src += src_stride << 6;
pred += pred_stride << 6;
subtract_128x64(diff, diff_stride, src, src_stride, pred, pred_stride);
}
static SubtractWxHFuncType getSubtractFunc(int rows, int cols) {
SubtractWxHFuncType ret_func_ptr = NULL;
if (rows == 4) {
if (cols == 4) {
ret_func_ptr = subtract_4x4;
} else if (cols == 8) {
ret_func_ptr = subtract_8x4;
}
} else if (rows == 8) {
if (cols == 4) {
ret_func_ptr = subtract_4x8;
} else if (cols == 8) {
ret_func_ptr = subtract_8x8;
} else if (cols == 16) {
ret_func_ptr = subtract_16x8;
}
} else if (rows == 16) {
if (cols == 8) {
ret_func_ptr = subtract_8x16;
} else if (cols == 16) {
ret_func_ptr = subtract_16x16;
} else if (cols == 32) {
ret_func_ptr = subtract_32x16;
}
} else if (rows == 32) {
if (cols == 16) {
ret_func_ptr = subtract_16x32;
} else if (cols == 32) {
ret_func_ptr = subtract_32x32;
} else if (cols == 64) {
ret_func_ptr = subtract_64x32;
}
} else if (rows == 64) {
if (cols == 32) {
ret_func_ptr = subtract_32x64;
} else if (cols == 64) {
ret_func_ptr = subtract_64x64;
} else if (cols == 128) {
ret_func_ptr = subtract_128x64;
}
} else if (rows == 128) {
if (cols == 64) {
ret_func_ptr = subtract_64x128;
} else if (cols == 128) {
ret_func_ptr = subtract_128x128;
}
}
if (!ret_func_ptr) {
assert(0);
}
return ret_func_ptr;
}
void aom_highbd_subtract_block_sse2(int rows, int cols, int16_t *diff,
ptrdiff_t diff_stride, const uint8_t *src8,
ptrdiff_t src_stride, const uint8_t *pred8,
ptrdiff_t pred_stride, int bd) {
uint16_t *src = CONVERT_TO_SHORTPTR(src8);
uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
SubtractWxHFuncType func;
(void)bd;
func = getSubtractFunc(rows, cols);
func(diff, diff_stride, src, src_stride, pred, pred_stride);
}
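
Each kernel above computes the elementwise 16-bit difference src - pred over a rows x cols block; the wider and taller sizes are built from two calls to the next size down, and getSubtractFunc dispatches on the shape. A plain scalar reference for what any of the kernels produces, as a sketch:

#include <stddef.h>
#include <stdint.h>
/* Scalar reference for the SIMD subtract kernels above (a sketch). */
static void subtract_block_model(int rows, int cols, int16_t *diff,
                                 ptrdiff_t diff_stride, const uint16_t *src,
                                 ptrdiff_t src_stride, const uint16_t *pred,
                                 ptrdiff_t pred_stride) {
  int r, c;
  for (r = 0; r < rows; ++r)
    for (c = 0; c < cols; ++c)
      diff[r * diff_stride + c] =
          (int16_t)(src[r * src_stride + c] - pred[r * pred_stride + c]);
}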


@@ -1,215 +0,0 @@
/*
* Copyright (c) 2016 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <smmintrin.h> /* SSE4.1 */
#include "./aom_config.h"
#include "./aom_dsp_rtcd.h"
#include "aom_dsp/variance.h"
#include "aom_dsp/aom_filter.h"
static INLINE void variance4x4_64_sse4_1(const uint8_t *a8, int a_stride,
const uint8_t *b8, int b_stride,
uint64_t *sse, int64_t *sum) {
__m128i u0, u1, u2, u3;
__m128i s0, s1, s2, s3;
__m128i t0, t1, x0, y0;
__m128i a0, a1, a2, a3;
__m128i b0, b1, b2, b3;
__m128i k_one_epi16 = _mm_set1_epi16((int16_t)1);
uint16_t *a = CONVERT_TO_SHORTPTR(a8);
uint16_t *b = CONVERT_TO_SHORTPTR(b8);
a0 = _mm_loadl_epi64((__m128i const *)(a + 0 * a_stride));
a1 = _mm_loadl_epi64((__m128i const *)(a + 1 * a_stride));
a2 = _mm_loadl_epi64((__m128i const *)(a + 2 * a_stride));
a3 = _mm_loadl_epi64((__m128i const *)(a + 3 * a_stride));
b0 = _mm_loadl_epi64((__m128i const *)(b + 0 * b_stride));
b1 = _mm_loadl_epi64((__m128i const *)(b + 1 * b_stride));
b2 = _mm_loadl_epi64((__m128i const *)(b + 2 * b_stride));
b3 = _mm_loadl_epi64((__m128i const *)(b + 3 * b_stride));
u0 = _mm_unpacklo_epi16(a0, a1);
u1 = _mm_unpacklo_epi16(a2, a3);
u2 = _mm_unpacklo_epi16(b0, b1);
u3 = _mm_unpacklo_epi16(b2, b3);
s0 = _mm_sub_epi16(u0, u2);
s1 = _mm_sub_epi16(u1, u3);
t0 = _mm_madd_epi16(s0, k_one_epi16);
t1 = _mm_madd_epi16(s1, k_one_epi16);
s2 = _mm_hadd_epi32(t0, t1);
s3 = _mm_hadd_epi32(s2, s2);
y0 = _mm_hadd_epi32(s3, s3);
t0 = _mm_madd_epi16(s0, s0);
t1 = _mm_madd_epi16(s1, s1);
s2 = _mm_hadd_epi32(t0, t1);
s3 = _mm_hadd_epi32(s2, s2);
x0 = _mm_hadd_epi32(s3, s3);
*sse = (uint64_t)_mm_extract_epi32(x0, 0);
*sum = (int64_t)_mm_extract_epi32(y0, 0);
}
uint32_t aom_highbd_8_variance4x4_sse4_1(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
uint32_t *sse) {
int64_t sum, diff;
uint64_t local_sse;
variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum);
*sse = (uint32_t)local_sse;
diff = (int64_t)*sse - ((sum * sum) >> 4);
return (diff >= 0) ? (uint32_t)diff : 0;
}
uint32_t aom_highbd_10_variance4x4_sse4_1(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
uint32_t *sse) {
int64_t sum, diff;
uint64_t local_sse;
variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum);
*sse = (uint32_t)ROUND_POWER_OF_TWO(local_sse, 4);
sum = ROUND_POWER_OF_TWO(sum, 2);
diff = (int64_t)*sse - ((sum * sum) >> 4);
return (diff >= 0) ? (uint32_t)diff : 0;
}
uint32_t aom_highbd_12_variance4x4_sse4_1(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
uint32_t *sse) {
int64_t sum, diff;
uint64_t local_sse;
variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum);
*sse = (uint32_t)ROUND_POWER_OF_TWO(local_sse, 8);
sum = ROUND_POWER_OF_TWO(sum, 4);
diff = (int64_t)*sse - ((sum * sum) >> 4);
return diff >= 0 ? (uint32_t)diff : 0;
}
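
All three wrappers compute variance = sse - sum*sum/16 over the 16 pixels of a 4x4 block; the 10- and 12-bit versions first scale sse down by 2*(bd-8) bits and sum by (bd-8) bits so the result lands back on the 8-bit scale. A scalar sketch of that shared tail (rounding as in ROUND_POWER_OF_TWO):

#include <stdint.h>
/* Scalar model of the 4x4 variance wrappers above (a sketch);
 * bd is 8, 10 or 12. */
static uint32_t variance4x4_model(uint64_t sse, int64_t sum, int bd) {
  const int n = bd - 8; /* extra bits beyond 8-bit depth */
  int64_t diff;
  if (n > 0) {
    sse = (sse + (1ull << (2 * n - 1))) >> (2 * n);
    sum = (sum + (1 << (n - 1))) >> n;
  }
  diff = (int64_t)sse - ((sum * sum) >> 4); /* sum^2 / 16 pixels */
  return diff >= 0 ? (uint32_t)diff : 0;
}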
// Sub-pixel
uint32_t aom_highbd_8_sub_pixel_variance4x4_sse4_1(
const uint8_t *src, int src_stride, int xoffset, int yoffset,
const uint8_t *dst, int dst_stride, uint32_t *sse) {
uint16_t fdata3[(4 + 1) * 4];
uint16_t temp2[4 * 4];
aom_highbd_var_filter_block2d_bil_first_pass(
src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
bilinear_filters_2t[yoffset]);
return aom_highbd_8_variance4x4(CONVERT_TO_BYTEPTR(temp2), 4, dst, dst_stride,
sse);
}
uint32_t aom_highbd_10_sub_pixel_variance4x4_sse4_1(
const uint8_t *src, int src_stride, int xoffset, int yoffset,
const uint8_t *dst, int dst_stride, uint32_t *sse) {
uint16_t fdata3[(4 + 1) * 4];
uint16_t temp2[4 * 4];
aom_highbd_var_filter_block2d_bil_first_pass(
src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
bilinear_filters_2t[yoffset]);
return aom_highbd_10_variance4x4(CONVERT_TO_BYTEPTR(temp2), 4, dst,
dst_stride, sse);
}
uint32_t aom_highbd_12_sub_pixel_variance4x4_sse4_1(
const uint8_t *src, int src_stride, int xoffset, int yoffset,
const uint8_t *dst, int dst_stride, uint32_t *sse) {
uint16_t fdata3[(4 + 1) * 4];
uint16_t temp2[4 * 4];
aom_highbd_var_filter_block2d_bil_first_pass(
src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
bilinear_filters_2t[yoffset]);
return aom_highbd_12_variance4x4(CONVERT_TO_BYTEPTR(temp2), 4, dst,
dst_stride, sse);
}
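
The sub-pixel variants interpolate in two separable passes: a horizontal two-tap bilinear filter over 4 + 1 rows into fdata3, then a vertical pass into temp2, after which the ordinary 4x4 variance is taken against dst. A scalar model of one two-tap filter step, as a sketch (assuming the bilinear_filters_2t weights sum to 128, i.e. a 7-bit filter):

#include <stdint.h>
/* Scalar model of a single two-tap bilinear filter step (a sketch):
 * f0 + f1 == 128, so the result is rounded back to pixel scale. */
static uint16_t bilinear_tap_model(uint16_t a, uint16_t b, int f0, int f1) {
  return (uint16_t)((a * f0 + b * f1 + 64) >> 7);
}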
// Sub-pixel average
uint32_t aom_highbd_8_sub_pixel_avg_variance4x4_sse4_1(
const uint8_t *src, int src_stride, int xoffset, int yoffset,
const uint8_t *dst, int dst_stride, uint32_t *sse,
const uint8_t *second_pred) {
uint16_t fdata3[(4 + 1) * 4];
uint16_t temp2[4 * 4];
DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]);
aom_highbd_var_filter_block2d_bil_first_pass(
src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
bilinear_filters_2t[yoffset]);
aom_highbd_comp_avg_pred(temp3, second_pred, 4, 4, CONVERT_TO_BYTEPTR(temp2),
4);
return aom_highbd_8_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst, dst_stride,
sse);
}
uint32_t aom_highbd_10_sub_pixel_avg_variance4x4_sse4_1(
const uint8_t *src, int src_stride, int xoffset, int yoffset,
const uint8_t *dst, int dst_stride, uint32_t *sse,
const uint8_t *second_pred) {
uint16_t fdata3[(4 + 1) * 4];
uint16_t temp2[4 * 4];
DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]);
aom_highbd_var_filter_block2d_bil_first_pass(
src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
bilinear_filters_2t[yoffset]);
aom_highbd_comp_avg_pred(temp3, second_pred, 4, 4, CONVERT_TO_BYTEPTR(temp2),
4);
return aom_highbd_10_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst,
dst_stride, sse);
}
uint32_t aom_highbd_12_sub_pixel_avg_variance4x4_sse4_1(
const uint8_t *src, int src_stride, int xoffset, int yoffset,
const uint8_t *dst, int dst_stride, uint32_t *sse,
const uint8_t *second_pred) {
uint16_t fdata3[(4 + 1) * 4];
uint16_t temp2[4 * 4];
DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]);
aom_highbd_var_filter_block2d_bil_first_pass(
src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
bilinear_filters_2t[yoffset]);
aom_highbd_comp_avg_pred(temp3, second_pred, 4, 4, CONVERT_TO_BYTEPTR(temp2),
4);
return aom_highbd_12_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst,
dst_stride, sse);
}
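
The _avg_ variants add one step before the variance: the interpolated prediction is averaged with second_pred via aom_highbd_comp_avg_pred. A scalar model of that compound averaging, as a sketch (rounded to nearest):

#include <stdint.h>
/* Scalar model of the compound-average step used above (a sketch). */
static void comp_avg_model(uint16_t *comp, const uint16_t *pred,
                           const uint16_t *second_pred, int n) {
  int i;
  for (i = 0; i < n; ++i)
    comp[i] = (uint16_t)((pred[i] + second_pred[i] + 1) >> 1);
}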


@@ -1,333 +0,0 @@
/*
* Copyright (c) 2015 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <stdlib.h>
#include <emmintrin.h>
#include <tmmintrin.h>
#include "aom_ports/mem.h"
#include "./aom_config.h"
#include "aom/aom_integer.h"
static INLINE __m128i width8_load_2rows(const uint8_t *ptr, int stride) {
__m128i temp1 = _mm_loadl_epi64((const __m128i *)ptr);
__m128i temp2 = _mm_loadl_epi64((const __m128i *)(ptr + stride));
return _mm_unpacklo_epi64(temp1, temp2);
}
static INLINE __m128i width4_load_4rows(const uint8_t *ptr, int stride) {
__m128i temp1 = _mm_cvtsi32_si128(*(const uint32_t *)ptr);
__m128i temp2 = _mm_cvtsi32_si128(*(const uint32_t *)(ptr + stride));
__m128i temp3 = _mm_unpacklo_epi32(temp1, temp2);
temp1 = _mm_cvtsi32_si128(*(const uint32_t *)(ptr + stride * 2));
temp2 = _mm_cvtsi32_si128(*(const uint32_t *)(ptr + stride * 3));
temp1 = _mm_unpacklo_epi32(temp1, temp2);
return _mm_unpacklo_epi64(temp3, temp1);
}
static INLINE unsigned int masked_sad_ssse3(const uint8_t *a_ptr, int a_stride,
const uint8_t *b_ptr, int b_stride,
const uint8_t *m_ptr, int m_stride,
int width, int height);
static INLINE unsigned int masked_sad8xh_ssse3(
const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride,
const uint8_t *m_ptr, int m_stride, int height);
static INLINE unsigned int masked_sad4xh_ssse3(
const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride,
const uint8_t *m_ptr, int m_stride, int height);
#define MASKSADMXN_SSSE3(m, n) \
unsigned int aom_masked_sad##m##x##n##_ssse3( \
const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
const uint8_t *msk, int msk_stride) { \
return masked_sad_ssse3(src, src_stride, ref, ref_stride, msk, msk_stride, \
m, n); \
}
#if CONFIG_EXT_PARTITION
MASKSADMXN_SSSE3(128, 128)
MASKSADMXN_SSSE3(128, 64)
MASKSADMXN_SSSE3(64, 128)
#endif // CONFIG_EXT_PARTITION
MASKSADMXN_SSSE3(64, 64)
MASKSADMXN_SSSE3(64, 32)
MASKSADMXN_SSSE3(32, 64)
MASKSADMXN_SSSE3(32, 32)
MASKSADMXN_SSSE3(32, 16)
MASKSADMXN_SSSE3(16, 32)
MASKSADMXN_SSSE3(16, 16)
MASKSADMXN_SSSE3(16, 8)
#define MASKSAD8XN_SSSE3(n) \
unsigned int aom_masked_sad8x##n##_ssse3( \
const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
const uint8_t *msk, int msk_stride) { \
return masked_sad8xh_ssse3(src, src_stride, ref, ref_stride, msk, \
msk_stride, n); \
}
MASKSAD8XN_SSSE3(16)
MASKSAD8XN_SSSE3(8)
MASKSAD8XN_SSSE3(4)
#define MASKSAD4XN_SSSE3(n) \
unsigned int aom_masked_sad4x##n##_ssse3( \
const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
const uint8_t *msk, int msk_stride) { \
return masked_sad4xh_ssse3(src, src_stride, ref, ref_stride, msk, \
msk_stride, n); \
}
MASKSAD4XN_SSSE3(8)
MASKSAD4XN_SSSE3(4)
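/*
 * Each macro invocation above stamps out a one-line wrapper over the shared
 * worker. For instance, MASKSADMXN_SSSE3(16, 8) expands to:
 *
 *   unsigned int aom_masked_sad16x8_ssse3(
 *       const uint8_t *src, int src_stride, const uint8_t *ref,
 *       int ref_stride, const uint8_t *msk, int msk_stride) {
 *     return masked_sad_ssse3(src, src_stride, ref, ref_stride, msk,
 *                             msk_stride, 16, 8);
 *   }
 */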
// For width a multiple of 16
// Assumes values in m are <=64
static INLINE unsigned int masked_sad_ssse3(const uint8_t *a_ptr, int a_stride,
const uint8_t *b_ptr, int b_stride,
const uint8_t *m_ptr, int m_stride,
int width, int height) {
int y, x;
__m128i a, b, m, temp1, temp2;
__m128i res = _mm_setzero_si128();
__m128i one = _mm_set1_epi16(1);
// For each row
for (y = 0; y < height; y++) {
// Covering the full width
for (x = 0; x < width; x += 16) {
// Load a, b, m in xmm registers
a = _mm_loadu_si128((const __m128i *)(a_ptr + x));
b = _mm_loadu_si128((const __m128i *)(b_ptr + x));
m = _mm_loadu_si128((const __m128i *)(m_ptr + x));
// Calculate the difference between a & b
temp1 = _mm_subs_epu8(a, b);
temp2 = _mm_subs_epu8(b, a);
temp1 = _mm_or_si128(temp1, temp2);
// Multiply by m and add together
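// (maddubs_epi16 saturates its signed 16-bit pair sums, but with
// |a - b| <= 255 and m <= 64 each pair sum is at most 2 * 255 * 64 = 32640,
// which is below INT16_MAX, so saturation never occurs here.)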
temp2 = _mm_maddubs_epi16(temp1, m);
// Pad out row result to 32 bit integers & add to running total
res = _mm_add_epi32(res, _mm_madd_epi16(temp2, one));
}
// Move onto the next row
a_ptr += a_stride;
b_ptr += b_stride;
m_ptr += m_stride;
}
res = _mm_hadd_epi32(res, _mm_setzero_si128());
res = _mm_hadd_epi32(res, _mm_setzero_si128());
// Normalize: mask weights are at most 64 (Q6), so sad = (sad + 31) >> 6
// rounds and divides by 64.
return (_mm_cvtsi128_si32(res) + 31) >> 6;
}
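/*
 * The vector kernel above matches this scalar reference (a sketch for
 * exposition; the name masked_sad_ref is not library API, but the per-pixel
 * weighting m * |a - b| and the (sum + 31) >> 6 normalization follow
 * directly from the code above).
 */
static unsigned int masked_sad_ref(const uint8_t *a, int a_stride,
                                   const uint8_t *b, int b_stride,
                                   const uint8_t *m, int m_stride,
                                   int width, int height) {
  unsigned int sad = 0;
  int y, x;
  for (y = 0; y < height; y++) {
    for (x = 0; x < width; x++)
      sad += m[x] * (unsigned int)abs(a[x] - b[x]); /* needs <stdlib.h> */
    a += a_stride;
    b += b_stride;
    m += m_stride;
  }
  /* Mask weights are Q6 (max 64): divide by 64 with rounding. */
  return (sad + 31) >> 6;
}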
static INLINE unsigned int masked_sad8xh_ssse3(
const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride,
const uint8_t *m_ptr, int m_stride, int height) {
int y;
__m128i a, b, m, temp1, temp2, row_res;
__m128i res = _mm_setzero_si128();
__m128i one = _mm_set1_epi16(1);
// Add the masked SAD for 2 rows at a time
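// (2 rows x 8 bytes = 16 bytes, so each pair of rows exactly fills one
// XMM register per load.)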
for (y = 0; y < height; y += 2) {
// Load a, b, m in xmm registers
a = width8_load_2rows(a_ptr, a_stride);
b = width8_load_2rows(b_ptr, b_stride);
m = width8_load_2rows(m_ptr, m_stride);
// Calculate the difference between a & b
temp1 = _mm_subs_epu8(a, b);
temp2 = _mm_subs_epu8(b, a);
temp1 = _mm_or_si128(temp1, temp2);
// Multiply by m and add together
row_res = _mm_maddubs_epi16(temp1, m);
// Pad out row result to 32 bit integers & add to running total
res = _mm_add_epi32(res, _mm_madd_epi16(row_res, one));
// Move onto the next rows
a_ptr += a_stride * 2;
b_ptr += b_stride * 2;
m_ptr += m_stride * 2;
}
res = _mm_hadd_epi32(res, _mm_setzero_si128());
res = _mm_hadd_epi32(res, _mm_setzero_si128());
// sad = (sad + 31) >> 6;
return (_mm_cvtsi128_si32(res) + 31) >> 6;
}
static INLINE unsigned int masked_sad4xh_ssse3(
const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride,
const uint8_t *m_ptr, int m_stride, int height) {
int y;
__m128i a, b, m, temp1, temp2, row_res;
__m128i res = _mm_setzero_si128();
__m128i one = _mm_set1_epi16(1);
// Add the masked SAD for 4 rows at a time
for (y = 0; y < height; y += 4) {
// Load a, b, m in xmm registers
a = width4_load_4rows(a_ptr, a_stride);
b = width4_load_4rows(b_ptr, b_stride);
m = width4_load_4rows(m_ptr, m_stride);
// Calculate the difference between a & b
temp1 = _mm_subs_epu8(a, b);
temp2 = _mm_subs_epu8(b, a);
temp1 = _mm_or_si128(temp1, temp2);
// Multiply by m and add together
row_res = _mm_maddubs_epi16(temp1, m);
// Pad out row result to 32 bit integers & add to running total
res = _mm_add_epi32(res, _mm_madd_epi16(row_res, one));
// Move onto the next rows
a_ptr += a_stride * 4;
b_ptr += b_stride * 4;
m_ptr += m_stride * 4;
}
// Horizontally add the four 32-bit partial sums.
res = _mm_hadd_epi32(res, _mm_setzero_si128());
res = _mm_hadd_epi32(res, _mm_setzero_si128());
// sad = (sad + 31) >> 6;
return (_mm_cvtsi128_si32(res) + 31) >> 6;
}
#if CONFIG_AOM_HIGHBITDEPTH
static INLINE __m128i highbd_width4_load_2rows(const uint16_t *ptr,
int stride) {
__m128i temp1 = _mm_loadl_epi64((const __m128i *)ptr);
__m128i temp2 = _mm_loadl_epi64((const __m128i *)(ptr + stride));
return _mm_unpacklo_epi64(temp1, temp2);
}
static INLINE unsigned int highbd_masked_sad_ssse3(
const uint8_t *a8_ptr, int a_stride, const uint8_t *b8_ptr, int b_stride,
const uint8_t *m_ptr, int m_stride, int width, int height);
static INLINE unsigned int highbd_masked_sad4xh_ssse3(
const uint8_t *a8_ptr, int a_stride, const uint8_t *b8_ptr, int b_stride,
const uint8_t *m_ptr, int m_stride, int height);
#define HIGHBD_MASKSADMXN_SSSE3(m, n) \
unsigned int aom_highbd_masked_sad##m##x##n##_ssse3( \
const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
const uint8_t *msk, int msk_stride) { \
return highbd_masked_sad_ssse3(src, src_stride, ref, ref_stride, msk, \
msk_stride, m, n); \
}
#if CONFIG_EXT_PARTITION
HIGHBD_MASKSADMXN_SSSE3(128, 128)
HIGHBD_MASKSADMXN_SSSE3(128, 64)
HIGHBD_MASKSADMXN_SSSE3(64, 128)
#endif // CONFIG_EXT_PARTITION
HIGHBD_MASKSADMXN_SSSE3(64, 64)
HIGHBD_MASKSADMXN_SSSE3(64, 32)
HIGHBD_MASKSADMXN_SSSE3(32, 64)
HIGHBD_MASKSADMXN_SSSE3(32, 32)
HIGHBD_MASKSADMXN_SSSE3(32, 16)
HIGHBD_MASKSADMXN_SSSE3(16, 32)
HIGHBD_MASKSADMXN_SSSE3(16, 16)
HIGHBD_MASKSADMXN_SSSE3(16, 8)
HIGHBD_MASKSADMXN_SSSE3(8, 16)
HIGHBD_MASKSADMXN_SSSE3(8, 8)
HIGHBD_MASKSADMXN_SSSE3(8, 4)
#define HIGHBD_MASKSAD4XN_SSSE3(n) \
unsigned int aom_highbd_masked_sad4x##n##_ssse3( \
const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
const uint8_t *msk, int msk_stride) { \
return highbd_masked_sad4xh_ssse3(src, src_stride, ref, ref_stride, msk, \
msk_stride, n); \
}
HIGHBD_MASKSAD4XN_SSSE3(8)
HIGHBD_MASKSAD4XN_SSSE3(4)
// For width a multiple of 8
// Assumes values in m are <=64
static INLINE unsigned int highbd_masked_sad_ssse3(
const uint8_t *a8_ptr, int a_stride, const uint8_t *b8_ptr, int b_stride,
const uint8_t *m_ptr, int m_stride, int width, int height) {
int y, x;
__m128i a, b, m, temp1, temp2;
const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8_ptr);
const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8_ptr);
__m128i res = _mm_setzero_si128();
// For each row
for (y = 0; y < height; y++) {
// Covering the full width
for (x = 0; x < width; x += 8) {
// Load a, b, m in xmm registers
a = _mm_loadu_si128((const __m128i *)(a_ptr + x));
b = _mm_loadu_si128((const __m128i *)(b_ptr + x));
m = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(m_ptr + x)),
_mm_setzero_si128());
// Calculate the difference between a & b
temp1 = _mm_subs_epu16(a, b);
temp2 = _mm_subs_epu16(b, a);
temp1 = _mm_or_si128(temp1, temp2);
// Multiply by m and add adjacent pairs straight into the 32-bit running
// total; |a - b| <= 4095 for 12-bit input and m <= 64, so both operands fit
// madd_epi16's signed 16-bit lanes.
res = _mm_add_epi32(res, _mm_madd_epi16(temp1, m));
}
// Move onto the next row
a_ptr += a_stride;
b_ptr += b_stride;
m_ptr += m_stride;
}
res = _mm_hadd_epi32(res, _mm_setzero_si128());
res = _mm_hadd_epi32(res, _mm_setzero_si128());
// sad = (sad + 31) >> 6;
return (_mm_cvtsi128_si32(res) + 31) >> 6;
}
static INLINE unsigned int highbd_masked_sad4xh_ssse3(
const uint8_t *a8_ptr, int a_stride, const uint8_t *b8_ptr, int b_stride,
const uint8_t *m_ptr, int m_stride, int height) {
int y;
__m128i a, b, m, temp1, temp2;
const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8_ptr);
const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8_ptr);
__m128i res = _mm_setzero_si128();
// Add the masked SAD for 2 rows at a time
for (y = 0; y < height; y += 2) {
// Load a, b, m in xmm registers
a = highbd_width4_load_2rows(a_ptr, a_stride);
b = highbd_width4_load_2rows(b_ptr, b_stride);
temp1 = _mm_loadl_epi64((const __m128i *)m_ptr);
temp2 = _mm_loadl_epi64((const __m128i *)(m_ptr + m_stride));
m = _mm_unpacklo_epi8(_mm_unpacklo_epi32(temp1, temp2),
_mm_setzero_si128());
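// (The two 4-wide mask rows are packed into the low 8 bytes, then
// zero-extended to eight 16-bit lanes to line up with the 16-bit pixels.)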
// Calculate the difference between a & b
temp1 = _mm_subs_epu16(a, b);
temp2 = _mm_subs_epu16(b, a);
temp1 = _mm_or_si128(temp1, temp2);
// Multiply by m and add together
res = _mm_add_epi32(res, _mm_madd_epi16(temp1, m));
// Move onto the next rows
a_ptr += a_stride * 2;
b_ptr += b_stride * 2;
m_ptr += m_stride * 2;
}
res = _mm_hadd_epi32(res, _mm_setzero_si128());
res = _mm_hadd_epi32(res, _mm_setzero_si128());
// sad = (sad + 31) >> 6;
return (_mm_cvtsi128_si32(res) + 31) >> 6;
}
#endif // CONFIG_AOM_HIGHBITDEPTH
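/*
 * A quick cross-check of one generated kernel against the scalar reference
 * sketched after masked_sad_ssse3 above. Illustrative only: this assumes
 * masked_sad_ref is visible in the same translation unit and that the file
 * is compiled with SSSE3 support; <stdlib.h> is already included above.
 */
#include <stdio.h>

int main(void) {
  enum { kW = 16, kH = 16, kStride = 32 };
  static uint8_t src[kH * kStride], ref[kH * kStride], msk[kH * kStride];
  unsigned int simd, scalar;
  int i;
  srand(1234);
  for (i = 0; i < kH * kStride; i++) {
    src[i] = (uint8_t)(rand() & 0xff);
    ref[i] = (uint8_t)(rand() & 0xff);
    msk[i] = (uint8_t)(rand() % 65); /* mask weights must stay <= 64 */
  }
  simd = aom_masked_sad16x16_ssse3(src, kStride, ref, kStride, msk, kStride);
  scalar = masked_sad_ref(src, kStride, ref, kStride, msk, kStride, kW, kH);
  printf("simd=%u scalar=%u %s\n", simd, scalar,
         simd == scalar ? "OK" : "MISMATCH");
  return simd == scalar ? 0 : 1;
}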
