Compare commits

..

1 Commits

Author SHA1 Message Date
multicoreware
313e5b9292 Add README_RS
Change-Id: I046b97ce4badb0730283fd44206c9e920cbea290
2014-04-02 15:10:35 -07:00
1471 changed files with 222815 additions and 357401 deletions

View File

@@ -1,91 +0,0 @@
---
Language: Cpp
# BasedOnStyle: Google
# Generated with clang-format 3.8.1
AccessModifierOffset: -1
AlignAfterOpenBracket: Align
AlignConsecutiveAssignments: false
AlignConsecutiveDeclarations: false
AlignEscapedNewlinesLeft: true
AlignOperands: true
AlignTrailingComments: true
AllowAllParametersOfDeclarationOnNextLine: true
AllowShortBlocksOnASingleLine: false
AllowShortCaseLabelsOnASingleLine: true
AllowShortFunctionsOnASingleLine: All
AllowShortIfStatementsOnASingleLine: true
AllowShortLoopsOnASingleLine: true
AlwaysBreakAfterDefinitionReturnType: None
AlwaysBreakAfterReturnType: None
AlwaysBreakBeforeMultilineStrings: true
AlwaysBreakTemplateDeclarations: true
BinPackArguments: true
BinPackParameters: true
BraceWrapping:
AfterClass: false
AfterControlStatement: false
AfterEnum: false
AfterFunction: false
AfterNamespace: false
AfterObjCDeclaration: false
AfterStruct: false
AfterUnion: false
BeforeCatch: false
BeforeElse: false
IndentBraces: false
BreakBeforeBinaryOperators: None
BreakBeforeBraces: Attach
BreakBeforeTernaryOperators: true
BreakConstructorInitializersBeforeComma: false
ColumnLimit: 80
CommentPragmas: '^ IWYU pragma:'
ConstructorInitializerAllOnOneLineOrOnePerLine: false
ConstructorInitializerIndentWidth: 4
ContinuationIndentWidth: 4
Cpp11BracedListStyle: false
DerivePointerAlignment: false
DisableFormat: false
ExperimentalAutoDetectBinPacking: false
ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ]
IncludeCategories:
- Regex: '^<.*\.h>'
Priority: 1
- Regex: '^<.*'
Priority: 2
- Regex: '.*'
Priority: 3
IndentCaseLabels: true
IndentWidth: 2
IndentWrappedFunctionNames: false
KeepEmptyLinesAtTheStartOfBlocks: false
MacroBlockBegin: ''
MacroBlockEnd: ''
MaxEmptyLinesToKeep: 1
NamespaceIndentation: None
ObjCBlockIndentWidth: 2
ObjCSpaceAfterProperty: false
ObjCSpaceBeforeProtocolList: false
PenaltyBreakBeforeFirstCallParameter: 1
PenaltyBreakComment: 300
PenaltyBreakFirstLessLess: 120
PenaltyBreakString: 1000
PenaltyExcessCharacter: 1000000
PenaltyReturnTypeOnItsOwnLine: 200
PointerAlignment: Right
ReflowComments: true
SortIncludes: false
SpaceAfterCStyleCast: false
SpaceBeforeAssignmentOperators: true
SpaceBeforeParens: ControlStatements
SpaceInEmptyParentheses: false
SpacesBeforeTrailingComments: 2
SpacesInAngles: false
SpacesInContainerLiterals: true
SpacesInCStyleCastParentheses: false
SpacesInParentheses: false
SpacesInSquareBrackets: false
Standard: Auto
TabWidth: 8
UseTab: Never
...

89
.gitignore vendored
View File

@@ -1,19 +1,15 @@
*.S
*.a
*.asm.s
*.d
*.gcda
*.gcno
*.gcda
*.o
*~
.cproject
.project
.settings
/*.ivf
/*.ivf.md5
/*-*.mk
/*.asm
/*.doxy
/*.ivf
/*.ivf.md5
/.bins
/.deps
/.docs
@@ -22,43 +18,58 @@
/Makefile
/config.log
/config.mk
/decode_to_md5
/decode_to_md5.c
/decode_to_md5.dox
/decode_with_drops
/decode_with_drops.c
/decode_with_drops.dox
/docs/
/doxyfile
/examples/*.dox
/examples/decode_to_md5
/examples/decode_with_drops
/examples/decode_with_partial_drops
/examples/example_xma
/examples/lossless_encoder
/examples/postproc
/examples/resize_util
/examples/set_maps
/examples/simple_decoder
/examples/simple_encoder
/examples/twopass_encoder
/examples/aom_cx_set_ref
/examples/av1_spatial_scalable_encoder
/examples/aom_temporal_scalable_patterns
/examples/aom_temporal_svc_encoder
/error_resilient
/error_resilient.c
/error_resilient.dox
/force_keyframe
/force_keyframe.c
/force_keyframe.dox
/ivfdec
/ivfdec.dox
/ivfenc
/ivfenc.dox
/libaom.so*
/libaom.ver
/libvpx.so*
/libvpx.ver
/obj_int_extract
/postproc
/postproc.c
/postproc.dox
/samples.dox
/test_intra_pred_speed
/test_libaom
/aom_api1_migration.dox
/av1_rtcd.h
/aom.pc
/aom_config.c
/aom_config.h
/aom_dsp_rtcd.h
/aom_scale_rtcd.h
/aom_version.h
/aomdec
/aomdec.dox
/aomenc
/aomenc.dox
/simple_decoder
/simple_decoder.c
/simple_decoder.dox
/simple_encoder
/simple_encoder.c
/simple_encoder.dox
/test_libvpx
/twopass_encoder
/twopass_encoder.c
/twopass_encoder.dox
/vp8_api1_migration.dox
/vp8_scalable_patterns
/vp8_scalable_patterns.dox
/vp8_set_maps
/vp8_set_maps.c
/vp8_set_maps.dox
/vp8cx_set_ref
/vp8cx_set_ref.c
/vp8cx_set_ref.dox
/vpx.pc
/vpx_config.c
/vpx_config.h
/vpx_rtcd.h
/vpx_version.h
/vpxdec
/vpxenc
TAGS
.cproject
.project
.settings

View File

@@ -1,32 +1,18 @@
Adrian Grange <agrange@google.com>
Aex Converse <aconverse@google.com>
Aex Converse <aconverse@google.com> <alex.converse@gmail.com>
Alexis Ballier <aballier@gentoo.org> <alexis.ballier@gmail.com>
Alpha Lam <hclam@google.com> <hclam@chromium.org>
Deb Mukherjee <debargha@google.com>
Erik Niemeyer <erik.a.niemeyer@intel.com> <erik.a.niemeyer@gmail.com>
Guillaume Martres <gmartres@google.com> <smarter3@gmail.com>
Hangyu Kuang <hkuang@google.com>
Hui Su <huisu@google.com>
Jacky Chen <jackychen@google.com>
Jim Bankoski <jimbankoski@google.com>
John Koleszar <jkoleszar@google.com>
Johann Koenig <johannkoenig@google.com>
Johann Koenig <johannkoenig@google.com> <johann.koenig@duck.com>
Johann Koenig <johannkoenig@google.com> <johann.koenig@gmail.com>
John Koleszar <jkoleszar@google.com>
Joshua Litt <joshualitt@google.com> <joshualitt@chromium.org>
Marco Paniconi <marpan@google.com>
Marco Paniconi <marpan@google.com> <marpan@chromium.org>
Johann Koenig <johannkoenig@google.com> <johannkoenig@dhcp-172-19-7-52.mtv.corp.google.com>
Pascal Massimino <pascal.massimino@gmail.com>
Paul Wilkins <paulwilkins@google.com>
Ralph Giles <giles@xiph.org> <giles@entropywave.com>
Ralph Giles <giles@xiph.org> <giles@mozilla.com>
Ronald S. Bultje <rsbultje@gmail.com> <rbultje@google.com>
Sami Pietilä <samipietila@google.com>
Tamar Levy <tamar.levy@intel.com>
Tamar Levy <tamar.levy@intel.com> <levytamar82@gmail.com>
Tero Rintaluoma <teror@google.com> <tero.rintaluoma@on2.com>
Timothy B. Terriberry <tterribe@xiph.org> Tim Terriberry <tterriberry@mozilla.com>
Tom Finegan <tomfinegan@google.com>
Tom Finegan <tomfinegan@google.com> <tomfinegan@chromium.org>
Ralph Giles <giles@xiph.org> <giles@entropywave.com>
Ralph Giles <giles@xiph.org> <giles@mozilla.com>
Alpha Lam <hclam@google.com> <hclam@chromium.org>
Deb Mukherjee <debargha@google.com>
Yaowu Xu <yaowu@google.com> <yaowu@xuyaowu.com>

56
AUTHORS
View File

@@ -3,11 +3,10 @@
Aaron Watry <awatry@gmail.com>
Abo Talib Mahfoodh <ab.mahfoodh@gmail.com>
Adam Xu <adam@xuyaowu.com>
Adrian Grange <agrange@google.com>
Aex Converse <aconverse@google.com>
Ahmad Sharif <asharif@google.com>
Alexander Voronov <avoronov@graphics.cs.msu.ru>
Alex Converse <alex.converse@gmail.com>
Alexis Ballier <aballier@gentoo.org>
Alok Ahuja <waveletcoeff@gmail.com>
Alpha Lam <hclam@google.com>
@@ -15,68 +14,44 @@ A.Mahfoodh <ab.mahfoodh@gmail.com>
Ami Fischman <fischman@chromium.org>
Andoni Morales Alastruey <ylatuya@gmail.com>
Andres Mejia <mcitadel@gmail.com>
Andrew Russell <anrussell@google.com>
Angie Chiang <angiebird@google.com>
Aron Rosenberg <arosenberg@logitech.com>
Attila Nagy <attilanagy@google.com>
Brion Vibber <bvibber@wikimedia.org>
changjun.yang <changjun.yang@intel.com>
Charles 'Buck' Krasic <ckrasic@google.com>
chm <chm@rock-chips.com>
Christian Duvivier <cduvivier@google.com>
Daniel Kang <ddkang@google.com>
Deb Mukherjee <debargha@google.com>
Dim Temp <dimtemp0@gmail.com>
Dmitry Kovalev <dkovalev@google.com>
Dragan Mrdjan <dmrdjan@mips.com>
Ed Baker <edward.baker@intel.com>
Ehsan Akhgari <ehsan.akhgari@gmail.com>
Erik Niemeyer <erik.a.niemeyer@intel.com>
Erik Niemeyer <erik.a.niemeyer@gmail.com>
Fabio Pedretti <fabio.ped@libero.it>
Frank Galligan <fgalligan@google.com>
Fredrik Söderquist <fs@opera.com>
Fritz Koenig <frkoenig@google.com>
Gaute Strokkenes <gaute.strokkenes@broadcom.com>
Geza Lore <gezalore@gmail.com>
Ghislain MARY <ghislainmary2@gmail.com>
Giuseppe Scrivano <gscrivano@gnu.org>
Gordana Cmiljanovic <gordana.cmiljanovic@imgtec.com>
Guillaume Martres <gmartres@google.com>
Guillermo Ballester Valor <gbvalor@gmail.com>
Hangyu Kuang <hkuang@google.com>
Hanno Böck <hanno@hboeck.de>
Henrik Lundin <hlundin@google.com>
Hui Su <huisu@google.com>
Ivan Maltz <ivanmaltz@google.com>
Jacek Caban <cjacek@gmail.com>
Jacky Chen <jackychen@google.com>
James Berry <jamesberry@google.com>
James Yu <james.yu@linaro.org>
James Zern <jzern@google.com>
Jan Gerber <j@mailb.org>
Jan Kratochvil <jan.kratochvil@redhat.com>
Janne Salonen <jsalonen@google.com>
Jean-Marc Valin <jmvalin@jmvalin.ca>
Jeff Faust <jfaust@google.com>
Jeff Muizelaar <jmuizelaar@mozilla.com>
Jeff Petkau <jpet@chromium.org>
Jia Jia <jia.jia@linaro.org>
Jian Zhou <zhoujian@google.com>
Jim Bankoski <jimbankoski@google.com>
Jingning Han <jingning@google.com>
Joey Parrish <joeyparrish@google.com>
Johann Koenig <johannkoenig@chromium.org>
Johann Koenig <johannkoenig@google.com>
John Koleszar <jkoleszar@google.com>
Johnny Klonaris <google@jawknee.com>
John Stark <jhnstrk@gmail.com>
Joshua Bleecher Snyder <josh@treelinelabs.com>
Joshua Litt <joshualitt@google.com>
Julia Robson <juliamrobson@gmail.com>
Justin Clift <justin@salasaga.org>
Justin Lebar <justin.lebar@gmail.com>
KO Myung-Hun <komh@chollian.net>
Lawrence Velázquez <larryv@macports.org>
Lou Quillio <louquillio@google.com>
Luca Barbato <lu_zero@gentoo.org>
Makoto Kato <makoto.kt@gmail.com>
@@ -90,55 +65,36 @@ Michael Kohler <michaelkohler@live.com>
Mike Frysinger <vapier@chromium.org>
Mike Hommey <mhommey@mozilla.com>
Mikhal Shemer <mikhal@google.com>
Minghai Shang <minghai@google.com>
Morton Jonuschat <yabawock@gmail.com>
Nathan E. Egge <negge@dgql.org>
Nico Weber <thakis@chromium.org>
Parag Salasakar <img.mips1@gmail.com>
Pascal Massimino <pascal.massimino@gmail.com>
Patrik Westin <patrik.westin@gmail.com>
Paul Wilkins <paulwilkins@google.com>
Pavol Rusnak <stick@gk2.sk>
Paweł Hajdan <phajdan@google.com>
Pengchong Jin <pengchong@google.com>
Peter de Rivaz <peter.derivaz@argondesign.com>
Peter de Rivaz <peter.derivaz@gmail.com>
Philip Jägenstedt <philipj@opera.com>
Priit Laes <plaes@plaes.org>
Rafael Ávila de Espíndola <rafael.espindola@gmail.com>
Rafaël Carré <funman@videolan.org>
Ralph Giles <giles@xiph.org>
Rob Bradford <rob@linux.intel.com>
Ronald S. Bultje <rsbultje@gmail.com>
Rui Ueyama <ruiu@google.com>
Ronald S. Bultje <rbultje@google.com>
Sami Pietilä <samipietila@google.com>
Sasi Inguva <isasi@google.com>
Scott Graham <scottmg@chromium.org>
Scott LaVarnway <slavarnway@google.com>
Sean McGovern <gseanmcg@gmail.com>
Sergey Kolomenkin <kolomenkin@gmail.com>
Sergey Ulanov <sergeyu@chromium.org>
Shimon Doodkin <helpmepro1@gmail.com>
Shunyao Li <shunyaoli@google.com>
Stefan Holmer <holmer@google.com>
Steinar Midtskogen <stemidts@cisco.com>
Suman Sunkara <sunkaras@google.com>
Taekhyun Kim <takim@nvidia.com>
Takanori MATSUURA <t.matsuu@gmail.com>
Tamar Levy <tamar.levy@intel.com>
Tao Bai <michaelbai@chromium.org>
Tero Rintaluoma <teror@google.com>
Thijs Vermeir <thijsvermeir@gmail.com>
Thomas Daede <tdaede@mozilla.com>
Thomas Davies <thdavies@cisco.com>
Thomas <thdavies@cisco.com>
Tim Kopp <tkopp@google.com>
Timothy B. Terriberry <tterribe@xiph.org>
Tom Finegan <tomfinegan@google.com>
Tristan Matthews <le.businessman@gmail.com>
Tristan Matthews <tmatth@videolan.org>
Vignesh Venkatasubramanian <vigneshv@google.com>
Yaowu Xu <yaowu@google.com>
Yongzhe Wang <yongzhe@google.com>
Yunqing Wang <yunqingwang@google.com>
Zoe Liu <zoeliu@google.com>
Google Inc.
The Mozilla Foundation
The Xiph.Org Foundation

View File

@@ -1,49 +1,3 @@
Next Release
- Incompatible changes:
The AV1 encoder's default keyframe interval changed to 128 from 9999.
2016-04-07 v0.1.0 "AOMedia Codec 1"
This release is the first Alliance for Open Media codec.
2015-11-09 v1.5.0 "Javan Whistling Duck"
This release improves upon the VP9 encoder and speeds up the encoding and
decoding processes.
- Upgrading:
This release is ABI incompatible with 1.4.0. It drops deprecated VP8
controls and adds a variety of VP9 controls for testing.
The vpxenc utility now prefers VP9 by default.
- Enhancements:
Faster VP9 encoding and decoding
Smaller library size by combining functions used by VP8 and VP9
- Bug Fixes:
A variety of fuzzing issues
2015-04-03 v1.4.0 "Indian Runner Duck"
This release includes significant improvements to the VP9 codec.
- Upgrading:
This release is ABI incompatible with 1.3.0. It drops the compatibility
layer, requiring VPX_IMG_FMT_* instead of IMG_FMT_*, and adds several codec
controls for VP9.
- Enhancements:
Faster VP9 encoding and decoding
Multithreaded VP9 decoding (tile and frame-based)
Multithreaded VP9 encoding - on by default
YUV 4:2:2 and 4:4:4 support in VP9
10 and 12bit support in VP9
64bit ARM support by replacing ARM assembly with intrinsics
- Bug Fixes:
Fixes a VP9 bitstream issue in Profile 1. This only affected non-YUV 4:2:0
files.
- Known Issues:
Frame Parallel decoding fails for segmented and non-420 files.
2013-11-15 v1.3.0 "Forest"
This release introduces the VP9 codec in a backward-compatible way.
All existing users of VP8 can continue to use the library without

View File

@@ -1,270 +0,0 @@
##
## Copyright (c) 2016, Alliance for Open Media. All rights reserved
##
## This source code is subject to the terms of the BSD 2 Clause License and
## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
## was not distributed with this source code in the LICENSE file, you can
## obtain it at www.aomedia.org/license/software. If the Alliance for Open
## Media Patent License 1.0 was not distributed with this source code in the
## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
##
cmake_minimum_required(VERSION 3.2)
project(AOM C CXX)
set(AOM_ROOT "${CMAKE_CURRENT_SOURCE_DIR}")
set(AOM_CONFIG_DIR "${CMAKE_CURRENT_BINARY_DIR}")
include("${AOM_ROOT}/build/cmake/aom_configure.cmake")
set(AOM_SRCS
"${AOM_CONFIG_DIR}/aom_config.c"
"${AOM_CONFIG_DIR}/aom_config.h"
"${AOM_ROOT}/aom/aom.h"
"${AOM_ROOT}/aom/aom_codec.h"
"${AOM_ROOT}/aom/aom_decoder.h"
"${AOM_ROOT}/aom/aom_encoder.h"
"${AOM_ROOT}/aom/aom_frame_buffer.h"
"${AOM_ROOT}/aom/aom_image.h"
"${AOM_ROOT}/aom/aom_integer.h"
"${AOM_ROOT}/aom/aomcx.h"
"${AOM_ROOT}/aom/aomdx.h"
"${AOM_ROOT}/aom/internal/aom_codec_internal.h"
"${AOM_ROOT}/aom/src/aom_codec.c"
"${AOM_ROOT}/aom/src/aom_decoder.c"
"${AOM_ROOT}/aom/src/aom_encoder.c"
"${AOM_ROOT}/aom/src/aom_image.c")
set(AOM_DSP_SRCS
"${AOM_ROOT}/aom_dsp/aom_convolve.c"
"${AOM_ROOT}/aom_dsp/aom_convolve.h"
"${AOM_ROOT}/aom_dsp/aom_dsp_common.h"
"${AOM_ROOT}/aom_dsp/aom_dsp_rtcd.c"
"${AOM_ROOT}/aom_dsp/aom_filter.h"
"${AOM_ROOT}/aom_dsp/aom_simd.c"
"${AOM_ROOT}/aom_dsp/aom_simd.h"
"${AOM_ROOT}/aom_dsp/aom_simd_inline.h"
"${AOM_ROOT}/aom_dsp/avg.c"
"${AOM_ROOT}/aom_dsp/bitreader.h"
"${AOM_ROOT}/aom_dsp/bitreader_buffer.c"
"${AOM_ROOT}/aom_dsp/bitreader_buffer.h"
"${AOM_ROOT}/aom_dsp/bitwriter.h"
"${AOM_ROOT}/aom_dsp/bitwriter_buffer.c"
"${AOM_ROOT}/aom_dsp/bitwriter_buffer.h"
"${AOM_ROOT}/aom_dsp/blend.h"
"${AOM_ROOT}/aom_dsp/blend_a64_hmask.c"
"${AOM_ROOT}/aom_dsp/blend_a64_mask.c"
"${AOM_ROOT}/aom_dsp/blend_a64_vmask.c"
"${AOM_ROOT}/aom_dsp/dkboolreader.c"
"${AOM_ROOT}/aom_dsp/dkboolreader.h"
"${AOM_ROOT}/aom_dsp/dkboolwriter.c"
"${AOM_ROOT}/aom_dsp/dkboolwriter.h"
"${AOM_ROOT}/aom_dsp/fwd_txfm.c"
"${AOM_ROOT}/aom_dsp/fwd_txfm.h"
"${AOM_ROOT}/aom_dsp/intrapred.c"
"${AOM_ROOT}/aom_dsp/inv_txfm.c"
"${AOM_ROOT}/aom_dsp/inv_txfm.h"
"${AOM_ROOT}/aom_dsp/loopfilter.c"
"${AOM_ROOT}/aom_dsp/prob.c"
"${AOM_ROOT}/aom_dsp/prob.h"
"${AOM_ROOT}/aom_dsp/psnr.c"
"${AOM_ROOT}/aom_dsp/psnr.h"
"${AOM_ROOT}/aom_dsp/quantize.c"
"${AOM_ROOT}/aom_dsp/quantize.h"
"${AOM_ROOT}/aom_dsp/sad.c"
"${AOM_ROOT}/aom_dsp/simd/v128_intrinsics.h"
"${AOM_ROOT}/aom_dsp/simd/v128_intrinsics_c.h"
"${AOM_ROOT}/aom_dsp/simd/v256_intrinsics.h"
"${AOM_ROOT}/aom_dsp/simd/v256_intrinsics_c.h"
"${AOM_ROOT}/aom_dsp/simd/v64_intrinsics.h"
"${AOM_ROOT}/aom_dsp/simd/v64_intrinsics_c.h"
"${AOM_ROOT}/aom_dsp/subtract.c"
"${AOM_ROOT}/aom_dsp/txfm_common.h"
"${AOM_ROOT}/aom_dsp/variance.c"
"${AOM_ROOT}/aom_dsp/variance.h")
set(AOM_MEM_SRCS
"${AOM_ROOT}/aom_mem/aom_mem.c"
"${AOM_ROOT}/aom_mem/aom_mem.h"
"${AOM_ROOT}/aom_mem/include/aom_mem_intrnl.h")
set(AOM_SCALE_SRCS
"${AOM_ROOT}/aom_scale/aom_scale.h"
"${AOM_ROOT}/aom_scale/aom_scale_rtcd.c"
"${AOM_ROOT}/aom_scale/generic/aom_scale.c"
"${AOM_ROOT}/aom_scale/generic/gen_scalers.c"
"${AOM_ROOT}/aom_scale/generic/yv12config.c"
"${AOM_ROOT}/aom_scale/generic/yv12extend.c"
"${AOM_ROOT}/aom_scale/yv12config.h")
# TODO(tomfinegan): Extract aom_ports from aom_util if possible.
set(AOM_UTIL_SRCS
"${AOM_ROOT}/aom_ports/aom_once.h"
"${AOM_ROOT}/aom_ports/aom_timer.h"
"${AOM_ROOT}/aom_ports/bitops.h"
"${AOM_ROOT}/aom_ports/emmintrin_compat.h"
"${AOM_ROOT}/aom_ports/mem.h"
"${AOM_ROOT}/aom_ports/mem_ops.h"
"${AOM_ROOT}/aom_ports/mem_ops_aligned.h"
"${AOM_ROOT}/aom_ports/msvc.h"
"${AOM_ROOT}/aom_ports/system_state.h"
"${AOM_ROOT}/aom_util/aom_thread.c"
"${AOM_ROOT}/aom_util/aom_thread.h"
"${AOM_ROOT}/aom_util/endian_inl.h")
set(AOM_AV1_COMMON_SRCS
"${AOM_ROOT}/av1/av1_iface_common.h"
"${AOM_ROOT}/av1/common/alloccommon.c"
"${AOM_ROOT}/av1/common/alloccommon.h"
"${AOM_ROOT}/av1/common/av1_fwd_txfm.c"
"${AOM_ROOT}/av1/common/av1_fwd_txfm.h"
"${AOM_ROOT}/av1/common/av1_inv_txfm.c"
"${AOM_ROOT}/av1/common/av1_inv_txfm.h"
"${AOM_ROOT}/av1/common/av1_rtcd.c"
"${AOM_ROOT}/av1/common/blockd.c"
"${AOM_ROOT}/av1/common/blockd.h"
"${AOM_ROOT}/av1/common/common.h"
"${AOM_ROOT}/av1/common/common_data.h"
"${AOM_ROOT}/av1/common/convolve.c"
"${AOM_ROOT}/av1/common/convolve.h"
"${AOM_ROOT}/av1/common/debugmodes.c"
"${AOM_ROOT}/av1/common/entropy.c"
"${AOM_ROOT}/av1/common/entropy.h"
"${AOM_ROOT}/av1/common/entropymode.c"
"${AOM_ROOT}/av1/common/entropymode.h"
"${AOM_ROOT}/av1/common/entropymv.c"
"${AOM_ROOT}/av1/common/entropymv.h"
"${AOM_ROOT}/av1/common/enums.h"
"${AOM_ROOT}/av1/common/filter.c"
"${AOM_ROOT}/av1/common/filter.h"
"${AOM_ROOT}/av1/common/frame_buffers.c"
"${AOM_ROOT}/av1/common/frame_buffers.h"
"${AOM_ROOT}/av1/common/idct.c"
"${AOM_ROOT}/av1/common/idct.h"
"${AOM_ROOT}/av1/common/loopfilter.c"
"${AOM_ROOT}/av1/common/loopfilter.h"
"${AOM_ROOT}/av1/common/mv.h"
"${AOM_ROOT}/av1/common/mvref_common.c"
"${AOM_ROOT}/av1/common/mvref_common.h"
"${AOM_ROOT}/av1/common/odintrin.c"
"${AOM_ROOT}/av1/common/odintrin.h"
"${AOM_ROOT}/av1/common/onyxc_int.h"
"${AOM_ROOT}/av1/common/pred_common.c"
"${AOM_ROOT}/av1/common/pred_common.h"
"${AOM_ROOT}/av1/common/quant_common.c"
"${AOM_ROOT}/av1/common/quant_common.h"
"${AOM_ROOT}/av1/common/reconinter.c"
"${AOM_ROOT}/av1/common/reconinter.h"
"${AOM_ROOT}/av1/common/reconintra.c"
"${AOM_ROOT}/av1/common/reconintra.h"
"${AOM_ROOT}/av1/common/scale.c"
"${AOM_ROOT}/av1/common/scale.h"
"${AOM_ROOT}/av1/common/scan.c"
"${AOM_ROOT}/av1/common/scan.h"
"${AOM_ROOT}/av1/common/seg_common.c"
"${AOM_ROOT}/av1/common/seg_common.h"
"${AOM_ROOT}/av1/common/thread_common.c"
"${AOM_ROOT}/av1/common/thread_common.h"
"${AOM_ROOT}/av1/common/tile_common.c"
"${AOM_ROOT}/av1/common/tile_common.h")
set(AOM_AV1_DECODER_SRCS
"${AOM_ROOT}/av1/av1_dx_iface.c"
"${AOM_ROOT}/av1/decoder/decodeframe.c"
"${AOM_ROOT}/av1/decoder/decodeframe.h"
"${AOM_ROOT}/av1/decoder/decodemv.c"
"${AOM_ROOT}/av1/decoder/decodemv.h"
"${AOM_ROOT}/av1/decoder/decoder.c"
"${AOM_ROOT}/av1/decoder/decoder.h"
"${AOM_ROOT}/av1/decoder/detokenize.c"
"${AOM_ROOT}/av1/decoder/detokenize.h"
"${AOM_ROOT}/av1/decoder/dsubexp.c"
"${AOM_ROOT}/av1/decoder/dsubexp.h"
"${AOM_ROOT}/av1/decoder/dthread.c"
"${AOM_ROOT}/av1/decoder/dthread.h")
set(AOM_AV1_ENCODER_SRCS
"${AOM_ROOT}/av1/av1_cx_iface.c"
"${AOM_ROOT}/av1/encoder/aq_complexity.c"
"${AOM_ROOT}/av1/encoder/aq_complexity.h"
"${AOM_ROOT}/av1/encoder/aq_cyclicrefresh.c"
"${AOM_ROOT}/av1/encoder/aq_cyclicrefresh.h"
"${AOM_ROOT}/av1/encoder/aq_variance.c"
"${AOM_ROOT}/av1/encoder/aq_variance.h"
"${AOM_ROOT}/av1/encoder/bitstream.c"
"${AOM_ROOT}/av1/encoder/bitstream.h"
"${AOM_ROOT}/av1/encoder/block.h"
"${AOM_ROOT}/av1/encoder/context_tree.c"
"${AOM_ROOT}/av1/encoder/context_tree.h"
"${AOM_ROOT}/av1/encoder/cost.c"
"${AOM_ROOT}/av1/encoder/cost.h"
"${AOM_ROOT}/av1/encoder/dct.c"
"${AOM_ROOT}/av1/encoder/encodeframe.c"
"${AOM_ROOT}/av1/encoder/encodeframe.h"
"${AOM_ROOT}/av1/encoder/encodemb.c"
"${AOM_ROOT}/av1/encoder/encodemb.h"
"${AOM_ROOT}/av1/encoder/encodemv.c"
"${AOM_ROOT}/av1/encoder/encodemv.h"
"${AOM_ROOT}/av1/encoder/encoder.c"
"${AOM_ROOT}/av1/encoder/encoder.h"
"${AOM_ROOT}/av1/encoder/ethread.c"
"${AOM_ROOT}/av1/encoder/ethread.h"
"${AOM_ROOT}/av1/encoder/extend.c"
"${AOM_ROOT}/av1/encoder/extend.h"
"${AOM_ROOT}/av1/encoder/firstpass.c"
"${AOM_ROOT}/av1/encoder/firstpass.h"
"${AOM_ROOT}/av1/encoder/hybrid_fwd_txfm.c"
"${AOM_ROOT}/av1/encoder/hybrid_fwd_txfm.h"
"${AOM_ROOT}/av1/encoder/lookahead.c"
"${AOM_ROOT}/av1/encoder/lookahead.h"
"${AOM_ROOT}/av1/encoder/mbgraph.c"
"${AOM_ROOT}/av1/encoder/mbgraph.h"
"${AOM_ROOT}/av1/encoder/mcomp.c"
"${AOM_ROOT}/av1/encoder/mcomp.h"
"${AOM_ROOT}/av1/encoder/picklpf.c"
"${AOM_ROOT}/av1/encoder/picklpf.h"
"${AOM_ROOT}/av1/encoder/quantize.c"
"${AOM_ROOT}/av1/encoder/quantize.h"
"${AOM_ROOT}/av1/encoder/ratectrl.c"
"${AOM_ROOT}/av1/encoder/ratectrl.h"
"${AOM_ROOT}/av1/encoder/rd.c"
"${AOM_ROOT}/av1/encoder/rd.h"
"${AOM_ROOT}/av1/encoder/rdopt.c"
"${AOM_ROOT}/av1/encoder/rdopt.h"
"${AOM_ROOT}/av1/encoder/resize.c"
"${AOM_ROOT}/av1/encoder/resize.h"
"${AOM_ROOT}/av1/encoder/segmentation.c"
"${AOM_ROOT}/av1/encoder/segmentation.h"
"${AOM_ROOT}/av1/encoder/speed_features.c"
"${AOM_ROOT}/av1/encoder/speed_features.h"
"${AOM_ROOT}/av1/encoder/subexp.c"
"${AOM_ROOT}/av1/encoder/subexp.h"
"${AOM_ROOT}/av1/encoder/temporal_filter.c"
"${AOM_ROOT}/av1/encoder/temporal_filter.h"
"${AOM_ROOT}/av1/encoder/tokenize.c"
"${AOM_ROOT}/av1/encoder/tokenize.h"
"${AOM_ROOT}/av1/encoder/treewriter.c"
"${AOM_ROOT}/av1/encoder/treewriter.h")
# Targets
add_library(aom_dsp ${AOM_DSP_SRCS})
include_directories(${AOM_ROOT} ${AOM_CONFIG_DIR})
add_library(aom_mem ${AOM_MEM_SRCS})
add_library(aom_scale ${AOM_SCALE_SRCS})
include_directories(${AOM_ROOT} ${AOM_CONFIG_DIR})
add_library(aom_util ${AOM_UTIL_SRCS})
add_library(aom_av1_decoder ${AOM_AV1_DECODER_SRCS})
add_library(aom_av1_encoder ${AOM_AV1_ENCODER_SRCS})
add_library(aom ${AOM_SRCS})
target_link_libraries(aom LINK_PUBLIC
aom_dsp
aom_mem
aom_scale
aom_util
aom_av1_decoder
aom_av1_encoder)
add_executable(simple_decoder examples/simple_decoder.c)
include_directories(${AOM_ROOT})
target_link_libraries(simple_decoder LINK_PUBLIC aom)
add_executable(simple_encoder examples/simple_encoder.c)
include_directories(${AOM_ROOT})
target_link_libraries(simple_encoder LINK_PUBLIC aom)

42
LICENSE
View File

@@ -1,27 +1,31 @@
Copyright (c) 2016, Alliance for Open Media. All rights reserved.
Copyright (c) 2010, The WebM Project authors. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
* Neither the name of Google, nor the WebM Project, nor the names
of its contributors may be used to endorse or promote products
derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

126
PATENTS
View File

@@ -1,108 +1,22 @@
Alliance for Open Media Patent License 1.0
Additional IP Rights Grant (Patents)
1. License Terms.
1.1. Patent License. Subject to the terms and conditions of this License, each
Licensor, on behalf of itself and successors in interest and assigns,
grants Licensee a non-sublicensable, perpetual, worldwide, non-exclusive,
no-charge, royalty-free, irrevocable (except as expressly stated in this
License) patent license to its Necessary Claims to make, use, sell, offer
for sale, import or distribute any Implementation.
1.2. Conditions.
1.2.1. Availability. As a condition to the grant of rights to Licensee to make,
sell, offer for sale, import or distribute an Implementation under
Section 1.1, Licensee must make its Necessary Claims available under
this License, and must reproduce this License with any Implementation
as follows:
a. For distribution in source code, by including this License in the
root directory of the source code with its Implementation.
b. For distribution in any other form (including binary, object form,
and/or hardware description code (e.g., HDL, RTL, Gate Level Netlist,
GDSII, etc.)), by including this License in the documentation, legal
notices, and/or other written materials provided with the
Implementation.
1.2.2. Additional Conditions. This license is directly from Licensor to
Licensee. Licensee acknowledges as a condition of benefiting from it
that no rights from Licensor are received from suppliers, distributors,
or otherwise in connection with this License.
1.3. Defensive Termination. If any Licensee, its Affiliates, or its agents
initiates patent litigation or files, maintains, or voluntarily
participates in a lawsuit against another entity or any person asserting
that any Implementation infringes Necessary Claims, any patent licenses
granted under this License directly to the Licensee are immediately
terminated as of the date of the initiation of action unless 1) that suit
was in response to a corresponding suit regarding an Implementation first
brought against an initiating entity, or 2) that suit was brought to
enforce the terms of this License (including intervention in a third-party
action by a Licensee).
1.4. Disclaimers. The Reference Implementation and Specification are provided
"AS IS" and without warranty. The entire risk as to implementing or
otherwise using the Reference Implementation or Specification is assumed
by the implementer and user. Licensor expressly disclaims any warranties
(express, implied, or otherwise), including implied warranties of
merchantability, non-infringement, fitness for a particular purpose, or
title, related to the material. IN NO EVENT WILL LICENSOR BE LIABLE TO
ANY OTHER PARTY FOR LOST PROFITS OR ANY FORM OF INDIRECT, SPECIAL,
INCIDENTAL, OR CONSEQUENTIAL DAMAGES OF ANY CHARACTER FROM ANY CAUSES OF
ACTION OF ANY KIND WITH RESPECT TO THIS LICENSE, WHETHER BASED ON BREACH
OF CONTRACT, TORT (INCLUDING NEGLIGENCE), OR OTHERWISE, AND WHETHER OR
NOT THE OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
2. Definitions.
2.1. Affiliate. "Affiliate" means an entity that directly or indirectly
Controls, is Controlled by, or is under common Control of that party.
2.2. Control. "Control" means direct or indirect control of more than 50% of
the voting power to elect directors of that corporation, or for any other
entity, the power to direct management of such entity.
2.3. Decoder. "Decoder" means any decoder that conforms fully with all
non-optional portions of the Specification.
2.4. Encoder. "Encoder" means any encoder that produces a bitstream that can
be decoded by a Decoder only to the extent it produces such a bitstream.
2.5. Final Deliverable. "Final Deliverable" means the final version of a
deliverable approved by the Alliance for Open Media as a Final
Deliverable.
2.6. Implementation. "Implementation" means any implementation, including the
Reference Implementation, that is an Encoder and/or a Decoder. An
Implementation also includes components of an Implementation only to the
extent they are used as part of an Implementation.
2.7. License. "License" means this license.
2.8. Licensee. "Licensee" means any person or entity who exercises patent
rights granted under this License.
2.9. Licensor. "Licensor" means (i) any Licensee that makes, sells, offers
for sale, imports or distributes any Implementation, or (ii) a person
or entity that has a licensing obligation to the Implementation as a
result of its membership and/or participation in the Alliance for Open
Media working group that developed the Specification.
2.10. Necessary Claims. "Necessary Claims" means all claims of patents or
patent applications, (a) that currently or at any time in the future,
are owned or controlled by the Licensor, and (b) (i) would be an
Essential Claim as defined by the W3C Policy as of February 5, 2004
(https://www.w3.org/Consortium/Patent-Policy-20040205/#def-essential)
as if the Specification was a W3C Recommendation; or (ii) are infringed
by the Reference Implementation.
2.11. Reference Implementation. "Reference Implementation" means an Encoder
and/or Decoder released by the Alliance for Open Media as a Final
Deliverable.
2.12. Specification. "Specification" means the specification designated by
the Alliance for Open Media as a Final Deliverable for which this
License was issued.
"This implementation" means the copyrightable works distributed by
Google as part of the WebM Project.
Google hereby grants to you a perpetual, worldwide, non-exclusive,
no-charge, royalty-free, irrevocable (except as stated in this section)
patent license to make, have made, use, offer to sell, sell, import,
transfer, and otherwise run, modify and propagate the contents of this
implementation of VP8, where such license applies only to those patent
claims, both currently owned by Google and acquired in the future,
licensable by Google that are necessarily infringed by this
implementation of VP8. This grant does not include claims that would be
infringed only as a consequence of further modification of this
implementation. If you or your agent or exclusive licensee institute or
order or agree to the institution of patent litigation against any
entity (including a cross-claim or counterclaim in a lawsuit) alleging
that this implementation of VP8 or any code incorporated within this
implementation of VP8 constitutes direct or contributory patent
infringement, or inducement of patent infringement, then any patent
rights granted to you under this License for this implementation of VP8
shall terminate as of the date such litigation is filed.

83
README
View File

@@ -1,6 +1,7 @@
README - 23 March 2015
vpx Multi-Format Codec SDK
README - 1 August 2013
Welcome to the WebM VP8/AV1 Codec SDK!
Welcome to the WebM VP8/VP9 Codec SDK!
COMPILING THE APPLICATIONS/LIBRARIES:
The build system used is similar to autotools. Building generally consists of
@@ -11,20 +12,22 @@ COMPILING THE APPLICATIONS/LIBRARIES:
* All x86 targets require the Yasm[1] assembler be installed.
* All Windows builds require that Cygwin[2] be installed.
* Building the documentation requires Doxygen[3]. If you do not
have this package, the install-docs option will be disabled.
* Downloading the data for the unit tests requires curl[4] and sha1sum.
* Building the documentation requires PHP[3] and Doxygen[4]. If you do not
have these packages, you must pass --disable-install-docs to the
configure script.
* Downloading the data for the unit tests requires curl[5] and sha1sum.
sha1sum is provided via the GNU coreutils, installed by default on
many *nix platforms, as well as MinGW and Cygwin. If coreutils is not
available, a compatible version of sha1sum can be built from
source[5]. These requirements are optional if not running the unit
source[6]. These requirements are optional if not running the unit
tests.
[1]: http://www.tortall.net/projects/yasm
[2]: http://www.cygwin.com
[3]: http://www.doxygen.org
[4]: http://curl.haxx.se
[5]: http://www.microbrew.org/tools/md5sha1sum/
[3]: http://php.net
[4]: http://www.doxygen.org
[5]: http://curl.haxx.se
[6]: http://www.microbrew.org/tools/md5sha1sum/
2. Out-of-tree builds
Out of tree builds are a supported method of building the application. For
@@ -33,13 +36,13 @@ COMPILING THE APPLICATIONS/LIBRARIES:
$ mkdir build
$ cd build
$ ../libaom/configure <options>
$ ../libvpx/configure <options>
$ make
3. Configuration options
The 'configure' script supports a number of options. The --help option can be
used to get a list of supported options:
$ ../libaom/configure --help
$ ../libvpx/configure --help
4. Cross development
For cross development, the most notable option is the --target option. The
@@ -47,10 +50,14 @@ COMPILING THE APPLICATIONS/LIBRARIES:
--help output of the configure script. As of this writing, the list of
available targets is:
armv5te-android-gcc
armv5te-linux-rvct
armv5te-linux-gcc
armv5te-none-rvct
armv6-darwin-gcc
armv6-linux-rvct
armv6-linux-gcc
armv6-none-rvct
arm64-darwin-gcc
armv7-android-gcc
armv7-darwin-gcc
armv7-linux-rvct
@@ -58,10 +65,13 @@ COMPILING THE APPLICATIONS/LIBRARIES:
armv7-none-rvct
armv7-win32-vs11
armv7-win32-vs12
armv7-win32-vs14
armv7s-darwin-gcc
mips32-linux-gcc
mips64-linux-gcc
ppc32-darwin8-gcc
ppc32-darwin9-gcc
ppc32-linux-gcc
ppc64-darwin8-gcc
ppc64-darwin9-gcc
ppc64-linux-gcc
sparc-solaris-gcc
x86-android-gcc
x86-darwin8-gcc
@@ -72,33 +82,37 @@ COMPILING THE APPLICATIONS/LIBRARIES:
x86-darwin11-gcc
x86-darwin12-gcc
x86-darwin13-gcc
x86-darwin14-gcc
x86-iphonesimulator-gcc
x86-linux-gcc
x86-linux-icc
x86-os2-gcc
x86-solaris-gcc
x86-win32-gcc
x86-win32-vs7
x86-win32-vs8
x86-win32-vs9
x86-win32-vs10
x86-win32-vs11
x86-win32-vs12
x86-win32-vs14
x86_64-android-gcc
x86_64-darwin9-gcc
x86_64-darwin10-gcc
x86_64-darwin11-gcc
x86_64-darwin12-gcc
x86_64-darwin13-gcc
x86_64-darwin14-gcc
x86_64-iphonesimulator-gcc
x86_64-linux-gcc
x86_64-linux-icc
x86_64-solaris-gcc
x86_64-win64-gcc
x86_64-win64-vs8
x86_64-win64-vs9
x86_64-win64-vs10
x86_64-win64-vs11
x86_64-win64-vs12
x86_64-win64-vs14
universal-darwin8-gcc
universal-darwin9-gcc
universal-darwin10-gcc
universal-darwin11-gcc
universal-darwin12-gcc
universal-darwin13-gcc
generic-gnu
The generic-gnu target, in conjunction with the CROSS environment variable,
@@ -108,7 +122,7 @@ COMPILING THE APPLICATIONS/LIBRARIES:
toolchain, the following command could be used (note, POSIX SH syntax, adapt
to your shell as necessary):
$ CROSS=mipsel-linux-uclibc- ../libaom/configure
$ CROSS=mipsel-linux-uclibc- ../libvpx/configure
In addition, the executables to be invoked can be overridden by specifying the
environment variables: CC, AR, LD, AS, STRIP, NM. Additional flags can be
@@ -119,29 +133,6 @@ COMPILING THE APPLICATIONS/LIBRARIES:
This defaults to config.log. This should give a good indication of what went
wrong. If not, contact us for support.
VP8/AV1 TEST VECTORS:
The test vectors can be downloaded and verified using the build system after
running configure. To specify an alternate directory the
LIBAOM_TEST_DATA_PATH environment variable can be used.
$ ./configure --enable-unit-tests
$ LIBAOM_TEST_DATA_PATH=../libaom-test-data make testdata
CODE STYLE:
The coding style used by this project is enforced with clang-format using the
configuration contained in the .clang-format file in the root of the
repository.
Before pushing changes for review you can format your code with:
# Apply clang-format to modified .c, .h and .cc files
$ clang-format -i --style=file \
$(git diff --name-only --diff-filter=ACMR '*.[hc]' '*.cc')
Check the .clang-format file for the version used to generate it if there is
any difference between your local formatting and the review system.
See also: http://clang.llvm.org/docs/ClangFormat.html
SUPPORT
This library is an open source project supported by its community. Please
email webm-discuss@webmproject.org for help.

37
README_RS Normal file
View File

@@ -0,0 +1,37 @@
Welcome, and thank you for reading this page!
How to run RenderScript in the VP9 decoder
1. Compile the RenderScript kernel (this version uses zero copy;
if the NDK RS module does not support it,
compilation will fail)
RenderScript needs g++, but VP9 is built with gcc.
So the RS code must be compiled separately as a shared (.so) library, which the VP9 project then loads and uses.
The RS code is in vp9/common/kernel/rs/; that folder contains a compile.sh script,
which can be run with sh to compile the RS kernel (requires NDK r9b or higher; the NDK must be configured in PATH).
If compilation succeeds, it generates:
vp9_rs_packing.so(vp9/common/kernel/rs/)
librs.inter_rs.so(vp9/common/kernel/rs/obj/)
libstlport_shared.so(vp9/common/kernel/rs/obj/)
2. Compile the VP9 project
Just compile it normally.
3. Run vpxdec with RenderScript (this version uses zero copy;
if the system RS module does not support it,
it will not work)
[1]: push vp9_rs_packing.so to /data/local/tmp/(or /vendor/lib, /system/lib,
need adb remount)
[2]: push librs.inter_rs.so to /data/local/tmp/(or /vendor/lib, /system/lib,
need adb remount)
[3]: push libstlport_shared.so to /data/local/tmp/(or /vendor/lib, /system/lib,
need adb remount)
[4]: push vpxdec to /data/local/tmp
[5]: adb shell (as root)
[6]: cd /data/local/tmp (on device)
[7]: export RSENABLE=1 (on device; if this is not set, RS will not be used)
[8]: export QCTENABLE=1 (on device; enables the QCT kernel)
[9]: export OPENABLE=1 (on device; enables the optimized kernel, which depends on the system and driver)
[10]:export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp (on device;
required if the .so files were pushed to /data/local/tmp)
[11]:run vpxdec normally (on device)

160
aom/aom.h
View File

@@ -1,160 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
/*!\defgroup aom AOM
* \ingroup codecs
* AOM is aom's newest video compression algorithm that uses motion
* compensated prediction, Discrete Cosine Transform (DCT) coding of the
* prediction error signal and context dependent entropy coding techniques
* based on arithmetic principles. It features:
* - YUV 4:2:0 image format
* - Macro-block based coding (16x16 luma plus two 8x8 chroma)
* - 1/4 (1/8) pixel accuracy motion compensated prediction
* - 4x4 DCT transform
* - 128 level linear quantizer
* - In loop deblocking filter
* - Context-based entropy coding
*
* @{
*/
/*!\file
* \brief Provides controls common to both the AOM encoder and decoder.
*/
#ifndef AOM_AOM_H_
#define AOM_AOM_H_
#include "./aom_codec.h"
#include "./aom_image.h"
#ifdef __cplusplus
extern "C" {
#endif
/*!\brief Control functions
 *
 * Identifiers of the controls that are common to the AOM encoder and decoder
 * interfaces.
 */
enum aom_com_control_id {
  /*!\brief Pass an external frame into the decoder to be used as a reference
   * frame */
  AOM_SET_REFERENCE = 1,

  /*!\brief Get a copy of a reference frame from the decoder */
  AOM_COPY_REFERENCE = 2,

  /*!\brief Set the decoder's post processing settings */
  AOM_SET_POSTPROC = 3,

  /*!\brief Set the reference frames to color for each macroblock */
  AOM_SET_DBG_COLOR_REF_FRAME = 4,

  /*!\brief Set which macro block modes to color */
  AOM_SET_DBG_COLOR_MB_MODES = 5,

  /*!\brief Set which blocks modes to color */
  AOM_SET_DBG_COLOR_B_MODES = 6,

  /*!\brief Set which motion vector modes to draw */
  AOM_SET_DBG_DISPLAY_MV = 7,

  /* TODO(jkoleszar): The encoder incorrectly reuses some of these values (5+)
   * for its control ids. These should be migrated to something like the
   * AOM_DECODER_CTRL_ID_START range next time we're ready to break the ABI.
   */

  /*!\brief Get a pointer to a reference frame */
  AV1_GET_REFERENCE = 128,

  /* No explicit value: takes the value following AV1_GET_REFERENCE. */
  AOM_COMMON_CTRL_ID_MAX,

  /*!\brief Get a pointer to the new frame */
  AV1_GET_NEW_FRAME_IMAGE = 192,

  AOM_DECODER_CTRL_ID_START = 256
};
/*!\brief Post process flags
 *
 * Bit flags selecting the AOM decoder post processing steps. Each enumerator
 * occupies its own bit so that values can be OR-ed together.
 */
enum aom_postproc_level {
  AOM_NOFILTERING = 0x0000,
  AOM_DEBLOCK = 0x0001,
  AOM_DEMACROBLOCK = 0x0002,
  AOM_ADDNOISE = 0x0004,

  /*!\brief Print frame information */
  AOM_DEBUG_TXT_FRAME_INFO = 0x0008,

  /*!\brief Print macro block modes over each macro block */
  AOM_DEBUG_TXT_MBLK_MODES = 0x0010,

  /*!\brief Print dc diff for each macro block */
  AOM_DEBUG_TXT_DC_DIFF = 0x0020,

  /*!\brief Print video rate info (encoder only) */
  AOM_DEBUG_TXT_RATE_INFO = 0x0040,

  AOM_MFQE = 0x0400
};
/*!\brief Post process configuration
 *
 * Describes the post processing settings. For the best objective measure
 * (using the PSNR metric) set post_proc_flag to AOM_DEBLOCK and
 * deblocking_level to 1.
 */
typedef struct aom_postproc_cfg {
  /*!\brief The types of post processing to be done; should be a combination
   * of "aom_postproc_level" values */
  int post_proc_flag;

  /*!\brief The strength of deblocking, valid range [0, 16] */
  int deblocking_level;

  /*!\brief The strength of additive noise, valid range [0, 16] */
  int noise_level;
} aom_postproc_cfg_t;
/*!\brief Reference frame type
 *
 * Enumerates the types of AOM reference frames.
 */
typedef enum aom_ref_frame_type {
  AOM_LAST_FRAME = 0x1,
  AOM_GOLD_FRAME = 0x2,
  AOM_ALTR_FRAME = 0x4
} aom_ref_frame_type_t;
/*!\brief Reference frame data struct
 *
 * Data structure used to access aom reference frames.
 */
typedef struct aom_ref_frame {
  /*!\brief Which reference frame */
  aom_ref_frame_type_t frame_type;

  /*!\brief Reference frame data in image format */
  aom_image_t img;
} aom_ref_frame_t;
/*!\brief AV1 specific reference frame data struct
 *
 * Data structure used to access av1 reference frames.
 */
typedef struct av1_ref_frame {
  /*!\brief Frame index to get (input) */
  int idx;

  /*!\brief Image structure to populate (output) */
  aom_image_t img;
} av1_ref_frame_t;
/*!\cond */
/*!\brief AOM control function parameter types
*
* Each AOM_CTRL_USE_TYPE(id, type) line below associates a control identifier
* with the data type of the argument that must accompany it. Each line is
* followed by an empty AOM_CTRL_<id> marker macro.
*
* NOTE(review): the marker macros presumably exist so that applications can
* test with #ifdef whether a given control is declared -- confirm.
*/
AOM_CTRL_USE_TYPE(AOM_SET_REFERENCE, aom_ref_frame_t *)
#define AOM_CTRL_AOM_SET_REFERENCE
AOM_CTRL_USE_TYPE(AOM_COPY_REFERENCE, aom_ref_frame_t *)
#define AOM_CTRL_AOM_COPY_REFERENCE
AOM_CTRL_USE_TYPE(AOM_SET_POSTPROC, aom_postproc_cfg_t *)
#define AOM_CTRL_AOM_SET_POSTPROC
AOM_CTRL_USE_TYPE(AOM_SET_DBG_COLOR_REF_FRAME, int)
#define AOM_CTRL_AOM_SET_DBG_COLOR_REF_FRAME
AOM_CTRL_USE_TYPE(AOM_SET_DBG_COLOR_MB_MODES, int)
#define AOM_CTRL_AOM_SET_DBG_COLOR_MB_MODES
AOM_CTRL_USE_TYPE(AOM_SET_DBG_COLOR_B_MODES, int)
#define AOM_CTRL_AOM_SET_DBG_COLOR_B_MODES
AOM_CTRL_USE_TYPE(AOM_SET_DBG_DISPLAY_MV, int)
#define AOM_CTRL_AOM_SET_DBG_DISPLAY_MV
AOM_CTRL_USE_TYPE(AV1_GET_REFERENCE, av1_ref_frame_t *)
#define AOM_CTRL_AV1_GET_REFERENCE
AOM_CTRL_USE_TYPE(AV1_GET_NEW_FRAME_IMAGE, aom_image_t *)
#define AOM_CTRL_AV1_GET_NEW_FRAME_IMAGE
/*!\endcond */
/*! @} - end defgroup aom */
#ifdef __cplusplus
} // extern "C"
#endif
#endif // AOM_AOM_H_

View File

@@ -1,487 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
/*!\defgroup codec Common Algorithm Interface
* This abstraction allows applications to easily support multiple video
* formats with minimal code duplication. This section describes the interface
* common to all codecs (both encoders and decoders).
* @{
*/
/*!\file
* \brief Describes the codec algorithm interface to applications.
*
* This file describes the interface between an application and a
* video codec algorithm.
*
* An application instantiates a specific codec instance by using
* aom_codec_init() and a pointer to the algorithm's interface structure:
* <pre>
* my_app.c:
* extern aom_codec_iface_t my_codec;
* {
* aom_codec_ctx_t algo;
* res = aom_codec_init(&algo, &my_codec);
* }
* </pre>
*
* Once initialized, the instance is managed using other functions from
* the aom_codec_* family.
*/
#ifndef AOM_AOM_CODEC_H_
#define AOM_AOM_CODEC_H_
#ifdef __cplusplus
extern "C" {
#endif
#include "./aom_integer.h"
#include "./aom_image.h"
/*!\brief Decorator indicating a function is deprecated */
#if !defined(DEPRECATED)
#if defined(__GNUC__) && __GNUC__
#define DEPRECATED __attribute__((deprecated))
#else
/* MSVC and every other compiler: the trailing decorator expands to nothing. */
#define DEPRECATED
#endif
#endif /* DEPRECATED */
#if !defined(DECLSPEC_DEPRECATED)
#if defined(_MSC_VER) && !(defined(__GNUC__) && __GNUC__)
/*!\brief \copydoc #DEPRECATED */
#define DECLSPEC_DEPRECATED __declspec(deprecated)
#else
/* GNU-compatible and unknown compilers: the leading decorator is empty. */
#define DECLSPEC_DEPRECATED /**< \copydoc #DEPRECATED */
#endif
#endif /* DECLSPEC_DEPRECATED */
/*!\brief Decorator indicating a function is potentially unused */
#ifndef UNUSED
#if defined(__GNUC__) || defined(__clang__)
#define UNUSED __attribute__((unused))
#else
#define UNUSED
#endif
#endif /* UNUSED */
/*!\brief Decorator indicating that given struct/union/enum is packed */
#if !defined(ATTRIBUTE_PACKED)
#if defined(__GNUC__) && __GNUC__
#define ATTRIBUTE_PACKED __attribute__((packed))
#else
/* MSVC and every other compiler: no packing attribute is emitted. */
#define ATTRIBUTE_PACKED
#endif
#endif /* ATTRIBUTE_PACKED */
/*!\brief Current ABI version number
*
* \internal
* If this file is altered in any way that changes the ABI, this value
* must be bumped. Examples include, but are not limited to, changing
* types, removing or reassigning enums, adding/removing/rearranging
* fields to structures
*/
#define AOM_CODEC_ABI_VERSION (3 + AOM_IMAGE_ABI_VERSION) /**<\hideinitializer*/
/*!\brief Algorithm return codes
 *
 * The numeric values are spelled out; they are the same values the
 * enumerators receive from implicit numbering.
 */
typedef enum {
  /*!\brief Operation completed without error */
  AOM_CODEC_OK = 0,

  /*!\brief Unspecified error */
  AOM_CODEC_ERROR = 1,

  /*!\brief Memory operation failed */
  AOM_CODEC_MEM_ERROR = 2,

  /*!\brief ABI version mismatch */
  AOM_CODEC_ABI_MISMATCH = 3,

  /*!\brief Algorithm does not have required capability */
  AOM_CODEC_INCAPABLE = 4,

  /*!\brief The given bitstream is not supported.
   *
   * The bitstream could not be parsed at the highest level, and the decoder
   * is unable to proceed. This error \ref SHOULD be treated as fatal to the
   * stream. */
  AOM_CODEC_UNSUP_BITSTREAM = 5,

  /*!\brief Encoded bitstream uses an unsupported feature
   *
   * The decoder does not implement a feature required by the encoder. This
   * return code should only be used for features that prevent future
   * pictures from being properly decoded. This error \ref MAY be treated as
   * fatal to the stream or \ref MAY be treated as fatal to the current GOP.
   */
  AOM_CODEC_UNSUP_FEATURE = 6,

  /*!\brief The coded data for this stream is corrupt or incomplete
   *
   * There was a problem decoding the current frame. This return code
   * should only be used for failures that prevent future pictures from
   * being properly decoded. This error \ref MAY be treated as fatal to the
   * stream or \ref MAY be treated as fatal to the current GOP. If decoding
   * is continued for the current GOP, artifacts may be present.
   */
  AOM_CODEC_CORRUPT_FRAME = 7,

  /*!\brief An application-supplied parameter is not valid. */
  AOM_CODEC_INVALID_PARAM = 8,

  /*!\brief An iterator reached the end of list. */
  AOM_CODEC_LIST_END = 9
} aom_codec_err_t;
/*! \brief Codec capabilities bitfield
*
* Each codec advertises the capabilities it supports as part of its
* ::aom_codec_iface_t interface structure. Capabilities are extra interfaces
* or functionality, and are not required to be supported.
*
* The available flags are specified by AOM_CODEC_CAP_* defines.
*/
typedef long aom_codec_caps_t;
#define AOM_CODEC_CAP_DECODER 0x1 /**< Is a decoder */
#define AOM_CODEC_CAP_ENCODER 0x2 /**< Is an encoder */
/*! \brief Initialization-time Feature Enabling
*
* Certain codec features must be known at initialization time, to allow for
* proper memory allocation.
*
* The available flags are specified by AOM_CODEC_USE_* defines.
*/
typedef long aom_codec_flags_t;
/*!\brief Codec interface structure.
*
* Contains function pointers and other data private to the codec
* implementation. This structure is opaque to the application.
*/
typedef const struct aom_codec_iface aom_codec_iface_t;
/*!\brief Codec private data structure.
*
* Contains data private to the codec implementation. This structure is opaque
* to the application.
*/
typedef struct aom_codec_priv aom_codec_priv_t;
/*!\brief Iterator
*
* Opaque storage used for iterating over lists.
*/
typedef const void *aom_codec_iter_t;
/*!\brief Codec context structure
*
* All codecs \ref MUST support this context structure fully. In general,
* this data should be considered private to the codec algorithm, and
* not be manipulated or examined by the calling application. Applications
* may reference the 'name' member to get a printable description of the
* algorithm.
*/
typedef struct aom_codec_ctx {
const char *name; /**< Printable interface name */
aom_codec_iface_t *iface; /**< Interface pointers */
aom_codec_err_t err; /**< Last returned error */
const char *err_detail; /**< Detailed info, if available */
aom_codec_flags_t init_flags; /**< Flags passed at init time */
union {
/** Decoder Configuration Pointer */
const struct aom_codec_dec_cfg *dec;
/** Encoder Configuration Pointer */
const struct aom_codec_enc_cfg *enc;
/** Untyped view of the same configuration pointer */
const void *raw;
} config; /**< Configuration pointer aliasing union */
aom_codec_priv_t *priv; /**< Algorithm private storage */
} aom_codec_ctx_t;
/*!\brief Bit depth for codec
 *
 * This enumeration determines the bit depth of the codec. Each enumerator's
 * value equals the number of bits it names.
 */
typedef enum aom_bit_depth {
  AOM_BITS_8 = 8,   /**< 8 bits */
  AOM_BITS_10 = 10, /**< 10 bits */
  /* No trailing comma after the last enumerator: pedantic C89 and C++03
   * compilers reject it, and this header is meant to be usable from C++. */
  AOM_BITS_12 = 12 /**< 12 bits */
} aom_bit_depth_t;
/*!\brief Superblock size selection.
 *
 * Selects the superblock size used for encoding: fixed at 64x64 pixels,
 * fixed at 128x128 pixels, or chosen dynamically by the encoder for each
 * frame.
 */
typedef enum aom_superblock_size {
  /*!\brief Always use 64x64 superblocks. */
  AOM_SUPERBLOCK_SIZE_64X64 = 0,

  /*!\brief Always use 128x128 superblocks. */
  AOM_SUPERBLOCK_SIZE_128X128 = 1,

  /*!\brief Select superblock size dynamically. */
  AOM_SUPERBLOCK_SIZE_DYNAMIC = 2
} aom_superblock_size_t;
/*
* Library Version Number Interface
*
* For example, see the following sample return values:
* aom_codec_version() (1<<16 | 2<<8 | 3)
* aom_codec_version_str() "v1.2.3-rc1-16-gec6a1ba"
* aom_codec_version_extra_str() "rc1-16-gec6a1ba"
*/
/*!\brief Return the version information (as an integer)
*
* Returns a packed encoding of the library version number. This will only
* include the major.minor.patch component of the version number. Note that
* this encoded value should be accessed through the macros provided, as the
* encoding may change in the future.
*
*/
int aom_codec_version(void);
/* The argument is parenthesized in each extractor so that an expression such
 * as `a | b` is shifted as a whole; without the parentheses `>>` would bind
 * to the right-hand operand only. */
/*!\brief Extract major from packed version */
#define AOM_VERSION_MAJOR(v) (((v) >> 16) & 0xff)
/*!\brief Extract minor from packed version */
#define AOM_VERSION_MINOR(v) (((v) >> 8) & 0xff)
/*!\brief Extract patch from packed version */
#define AOM_VERSION_PATCH(v) (((v) >> 0) & 0xff)
/*!\brief Return the version major number */
#define aom_codec_version_major() AOM_VERSION_MAJOR(aom_codec_version())
/*!\brief Return the version minor number */
#define aom_codec_version_minor() AOM_VERSION_MINOR(aom_codec_version())
/*!\brief Return the version patch number */
#define aom_codec_version_patch() AOM_VERSION_PATCH(aom_codec_version())
/*!\brief Return the version information (as a string)
*
* Returns a printable string containing the full library version number. This
* may contain additional text following the three digit version number, to
* indicate release candidates, prerelease versions, etc.
*
*/
const char *aom_codec_version_str(void);
/*!\brief Return the extra version information (as a string)
*
* Returns a printable "extra string". This is the component of the string
* returned by aom_codec_version_str() following the three digit version
* number.
*
*/
const char *aom_codec_version_extra_str(void);
/*!\brief Return the build configuration
*
* Returns a printable string containing an encoded version of the build
* configuration. This may be useful to aom support.
*
*/
const char *aom_codec_build_config(void);
/*!\brief Return the name for a given interface
*
* Returns a human readable string for name of the given codec interface.
*
* \param[in] iface Interface pointer
*
*/
const char *aom_codec_iface_name(aom_codec_iface_t *iface);
/*!\brief Convert error number to printable string
*
* Returns a human readable string for the last error returned by the
* algorithm. The returned error will be one line and will not contain
* any newline characters.
*
*
* \param[in] err Error number.
*
*/
const char *aom_codec_err_to_string(aom_codec_err_t err);
/*!\brief Retrieve error synopsis for codec context
*
* Returns a human readable string for the last error returned by the
* algorithm. The returned error will be one line and will not contain
* any newline characters.
*
*
* \param[in] ctx Pointer to this instance's context.
*
*/
const char *aom_codec_error(aom_codec_ctx_t *ctx);
/*!\brief Retrieve detailed error information for codec context
*
* Returns a human readable string providing detailed information about
* the last error.
*
* \param[in] ctx Pointer to this instance's context.
*
* \retval NULL
* No detailed information is available.
*/
const char *aom_codec_error_detail(aom_codec_ctx_t *ctx);
/* REQUIRED FUNCTIONS
*
* The following functions are required to be implemented for all codecs.
* They represent the base case functionality expected of all codecs.
*/
/*!\brief Destroy a codec instance
*
* Destroys a codec context, freeing any associated memory buffers.
*
* \param[in] ctx Pointer to this instance's context
*
* \retval #AOM_CODEC_OK
* The codec instance was destroyed.
* \retval #AOM_CODEC_MEM_ERROR
* Memory allocation failed.
*/
/* NOTE(review): the retval list above appears to have been copied from the
* initialization documentation; confirm against the implementation which
* error codes destroy can actually return. */
aom_codec_err_t aom_codec_destroy(aom_codec_ctx_t *ctx);
/*!\brief Get the capabilities of an algorithm.
*
* Retrieves the capabilities bitfield from the algorithm's interface.
*
* \param[in] iface Pointer to the algorithm interface
*
*/
aom_codec_caps_t aom_codec_get_caps(aom_codec_iface_t *iface);
/*!\brief Control algorithm
*
* This function is used to exchange algorithm specific data with the codec
* instance. This can be used to implement features specific to a particular
* algorithm.
*
* This wrapper function dispatches the request to the helper function
* associated with the given ctrl_id. It tries to call this function
* transparently, but will return #AOM_CODEC_ERROR if the request could not
* be dispatched.
*
* Note that this function should not be used directly. Call the
* #aom_codec_control wrapper macro instead.
*
* \param[in] ctx Pointer to this instance's context
* \param[in] ctrl_id Algorithm specific control identifier
*
* \retval #AOM_CODEC_OK
* The control request was processed.
* \retval #AOM_CODEC_ERROR
* The control request was not processed.
* \retval #AOM_CODEC_INVALID_PARAM
* The data was not valid.
*/
aom_codec_err_t aom_codec_control_(aom_codec_ctx_t *ctx, int ctrl_id, ...);
#if defined(AOM_DISABLE_CTRL_TYPECHECKS) && AOM_DISABLE_CTRL_TYPECHECKS
/* Type checking disabled: aom_codec_control() forwards straight to the
 * variadic aom_codec_control_() and the declaration macros expand to
 * nothing. */
#define aom_codec_control(ctx, id, data) aom_codec_control_(ctx, id, data)
#define AOM_CTRL_USE_TYPE(id, typ)
#define AOM_CTRL_USE_TYPE_DEPRECATED(id, typ)
/* Takes only the control id, like the type-checked AOM_CTRL_VOID(id), so a
 * declaration written for one configuration also preprocesses in the other. */
#define AOM_CTRL_VOID(id)
#else
/*!\brief aom_codec_control wrapper macro
*
* This macro allows for type safe conversions across the variadic parameter
* to aom_codec_control_().
*
* \internal
* It works by dispatching the call to the control function through a wrapper
* function named with the id parameter.
*/
#define aom_codec_control(ctx, id, data) \
aom_codec_control_##id(ctx, id, data) /**<\hideinitializer*/
/*!\brief aom_codec_control type definition macro
*
* This macro allows for type safe conversions across the variadic parameter
* to aom_codec_control_(). It defines the type of the argument for a given
* control identifier.
*
* \internal
* It defines a static function named aom_codec_control_<id> with
* the correctly typed arguments as a wrapper to the type-unsafe internal
* function. The wrapper is first declared with the UNUSED decorator and then
* defined, so a translation unit that never calls it is not expected to
* produce an unused-function warning.
* NOTE(review): the separate prototype is presumably needed because GCC does
* not accept a trailing attribute on a function definition -- confirm.
*/
#define AOM_CTRL_USE_TYPE(id, typ) \
static aom_codec_err_t aom_codec_control_##id(aom_codec_ctx_t *, int, typ) \
UNUSED; \
\
static aom_codec_err_t aom_codec_control_##id(aom_codec_ctx_t *ctx, \
int ctrl_id, typ data) { \
return aom_codec_control_(ctx, ctrl_id, data); \
} /**<\hideinitializer*/
/*!\brief aom_codec_control deprecated type definition macro
*
* Like #AOM_CTRL_USE_TYPE, but indicates that the specified control is
* deprecated and should not be used. Consult the documentation for your
* codec for more information.
*
* \internal
* It defines a static function with the correctly typed arguments as a
* wrapper to the type-unsafe internal function. The wrapper carries both
* deprecation decorators: DECLSPEC_DEPRECATED in leading position and
* DEPRECATED in trailing position. As defined in this header, at most one of
* the two is non-empty for a given compiler (DEPRECATED for GNU-compatible
* compilers, DECLSPEC_DEPRECATED for MSVC).
*/
#define AOM_CTRL_USE_TYPE_DEPRECATED(id, typ) \
DECLSPEC_DEPRECATED static aom_codec_err_t aom_codec_control_##id( \
aom_codec_ctx_t *, int, typ) DEPRECATED UNUSED; \
\
DECLSPEC_DEPRECATED static aom_codec_err_t aom_codec_control_##id( \
aom_codec_ctx_t *ctx, int ctrl_id, typ data) { \
return aom_codec_control_(ctx, ctrl_id, data); \
} /**<\hideinitializer*/
/*!\brief aom_codec_control void type definition macro
*
* This macro allows for type safe conversions across the variadic parameter
* to aom_codec_control_(). It indicates that a given control identifier takes
* no argument.
*
* \internal
* It defines a static function without a data argument as a wrapper to the
* type-unsafe internal function.
*
* NOTE(review): the type-checked aom_codec_control(ctx, id, data) macro in
* this header always forwards three arguments to aom_codec_control_<id>,
* while the wrapper generated here takes only two (ctx, ctrl_id). A control
* declared with this macro therefore cannot be invoked through
* aom_codec_control(); confirm the intended way to call such controls.
*/
#define AOM_CTRL_VOID(id) \
static aom_codec_err_t aom_codec_control_##id(aom_codec_ctx_t *, int) \
UNUSED; \
\
static aom_codec_err_t aom_codec_control_##id(aom_codec_ctx_t *ctx, \
int ctrl_id) { \
return aom_codec_control_(ctx, ctrl_id); \
} /**<\hideinitializer*/
#endif
/*!@} - end defgroup codec*/
#ifdef __cplusplus
}
#endif
#endif // AOM_AOM_CODEC_H_

View File

@@ -1,42 +0,0 @@
##
## Copyright (c) 2016, Alliance for Open Media. All rights reserved
##
## This source code is subject to the terms of the BSD 2 Clause License and
## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
## was not distributed with this source code in the LICENSE file, you can
## obtain it at www.aomedia.org/license/software. If the Alliance for Open
## Media Patent License 1.0 was not distributed with this source code in the
## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
##
# NOTE(review): `exports` is presumably the exported-symbol list that sits
# next to this makefile -- confirm.
API_EXPORTS += exports
# Encoder headers. The variable name is formed from the value of
# CONFIG_AV1_ENCODER, so these lines append to API_SRCS-yes / API_DOC_SRCS-yes
# only when that variable expands to "yes".
API_SRCS-$(CONFIG_AV1_ENCODER) += aom.h
API_SRCS-$(CONFIG_AV1_ENCODER) += aomcx.h
API_DOC_SRCS-$(CONFIG_AV1_ENCODER) += aom.h
API_DOC_SRCS-$(CONFIG_AV1_ENCODER) += aomcx.h
# Decoder headers, selected the same way through CONFIG_AV1_DECODER.
API_SRCS-$(CONFIG_AV1_DECODER) += aom.h
API_SRCS-$(CONFIG_AV1_DECODER) += aomdx.h
API_DOC_SRCS-$(CONFIG_AV1_DECODER) += aom.h
API_DOC_SRCS-$(CONFIG_AV1_DECODER) += aomdx.h
# Headers appended to the documentation source list unconditionally.
API_DOC_SRCS-yes += aom_codec.h
API_DOC_SRCS-yes += aom_decoder.h
API_DOC_SRCS-yes += aom_encoder.h
API_DOC_SRCS-yes += aom_frame_buffer.h
API_DOC_SRCS-yes += aom_image.h
# Sources and headers appended to the API source list unconditionally
# (including this makefile itself).
API_SRCS-yes += src/aom_decoder.c
API_SRCS-yes += aom_decoder.h
API_SRCS-yes += src/aom_encoder.c
API_SRCS-yes += aom_encoder.h
API_SRCS-yes += internal/aom_codec_internal.h
API_SRCS-yes += src/aom_codec.c
API_SRCS-yes += src/aom_image.c
API_SRCS-yes += aom_codec.h
API_SRCS-yes += aom_codec.mk
API_SRCS-yes += aom_frame_buffer.h
API_SRCS-yes += aom_image.h
API_SRCS-yes += aom_integer.h

View File

@@ -1,366 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifndef AOM_AOM_DECODER_H_
#define AOM_AOM_DECODER_H_
/*!\defgroup decoder Decoder Algorithm Interface
* \ingroup codec
* This abstraction allows applications using this decoder to easily support
* multiple video formats with minimal code duplication. This section describes
* the interface common to all decoders.
* @{
*/
/*!\file
* \brief Describes the decoder algorithm interface to applications.
*
* This file describes the interface between an application and a
* video decoder algorithm.
*
*/
#ifdef __cplusplus
extern "C" {
#endif
#include "./aom_codec.h"
#include "./aom_frame_buffer.h"
/*!\brief Current ABI version number
*
* \internal
* If this file is altered in any way that changes the ABI, this value
* must be bumped. Examples include, but are not limited to, changing
* types, removing or reassigning enums, adding/removing/rearranging
* fields to structures
*/
#define AOM_DECODER_ABI_VERSION \
(3 + AOM_CODEC_ABI_VERSION) /**<\hideinitializer*/
/*! \brief Decoder capabilities bitfield
*
* Each decoder advertises the capabilities it supports as part of its
* ::aom_codec_iface_t interface structure. Capabilities are extra interfaces
* or functionality, and are not required to be supported by a decoder.
*
* The available flags are specified by AOM_CODEC_CAP_* defines.
*/
#define AOM_CODEC_CAP_PUT_SLICE 0x10000 /**< Will issue put_slice callbacks */
#define AOM_CODEC_CAP_PUT_FRAME 0x20000 /**< Will issue put_frame callbacks */
#define AOM_CODEC_CAP_POSTPROC 0x40000 /**< Can postprocess decoded frame */
/*!\brief Can conceal errors due to packet loss */
#define AOM_CODEC_CAP_ERROR_CONCEALMENT 0x80000
/*!\brief Can receive encoded frames one fragment at a time */
#define AOM_CODEC_CAP_INPUT_FRAGMENTS 0x100000
/*!\brief Can support frame-based multi-threading */
#define AOM_CODEC_CAP_FRAME_THREADING 0x200000
/*!\brief Can support external frame buffers */
#define AOM_CODEC_CAP_EXTERNAL_FRAME_BUFFER 0x400000
/*! \brief Initialization-time Feature Enabling
*
* Certain codec features must be known at initialization time, to allow for
* proper memory allocation.
*
* The available flags are specified by AOM_CODEC_USE_* defines.
*/
#define AOM_CODEC_USE_POSTPROC 0x10000 /**< Postprocess decoded frame */
/*!\brief Conceal errors in decoded frames */
#define AOM_CODEC_USE_ERROR_CONCEALMENT 0x20000
/*!\brief The input frame should be passed to the decoder one fragment at a
* time */
#define AOM_CODEC_USE_INPUT_FRAGMENTS 0x40000
/*!\brief Enable frame-based multi-threading */
#define AOM_CODEC_USE_FRAME_THREADING 0x80000
/*!\brief Stream properties
*
* This structure is used to query or set properties of the decoded
* stream. Algorithms may extend this structure with data specific
* to their bitstream by setting the sz member appropriately.
*/
struct aom_codec_stream_info {
  unsigned int sz;    /**< Size of this structure */
  unsigned int w;     /**< Stream width; 0 means unknown/default */
  unsigned int h;     /**< Stream height; 0 means unknown/default */
  unsigned int is_kf; /**< Set when the current frame is a keyframe */
};
/** Alias for struct aom_codec_stream_info */
typedef struct aom_codec_stream_info aom_codec_stream_info_t;
/* REQUIRED FUNCTIONS
*
* The following functions are required to be implemented for all decoders.
* They represent the base case functionality expected of all decoders.
*/
/*!\brief Initialization Configurations
*
* This structure is used to pass init time configuration options to the
* decoder.
*/
struct aom_codec_dec_cfg {
  unsigned int threads; /**< Upper bound on threads to use, default 1 */
  unsigned int w;       /**< Width */
  unsigned int h;       /**< Height */
};
/** alias for struct aom_codec_dec_cfg */
typedef struct aom_codec_dec_cfg aom_codec_dec_cfg_t;
/*!\brief Initialize a decoder instance
*
* Initializes a decoder context using the given interface. Applications
* should call the aom_codec_dec_init convenience macro instead of this
* function directly, to ensure that the ABI version number parameter
* is properly initialized.
*
* If the library was configured with --disable-multithread, this call
* is not thread safe and should be guarded with a lock if being used
* in a multithreaded context.
*
* \param[in] ctx Pointer to this instance's context.
* \param[in] iface Pointer to the algorithm interface to use.
* \param[in] cfg Configuration to use, if known. May be NULL.
* \param[in] flags Bitfield of AOM_CODEC_USE_* flags
* \param[in] ver ABI version number. Must be set to
* AOM_DECODER_ABI_VERSION
* \retval #AOM_CODEC_OK
* The decoder algorithm initialized.
* \retval #AOM_CODEC_MEM_ERROR
* Memory allocation failed.
*/
aom_codec_err_t aom_codec_dec_init_ver(aom_codec_ctx_t *ctx,
aom_codec_iface_t *iface,
const aom_codec_dec_cfg_t *cfg,
aom_codec_flags_t flags, int ver);
/*!\brief Convenience macro for aom_codec_dec_init_ver()
*
* Ensures the ABI version parameter is properly set.
*/
#define aom_codec_dec_init(ctx, iface, cfg, flags) \
aom_codec_dec_init_ver(ctx, iface, cfg, flags, AOM_DECODER_ABI_VERSION)
/*!\brief Parse stream info from a buffer
*
* Performs high level parsing of the bitstream. Construction of a decoder
* context is not necessary. Can be used to determine if the bitstream is
* of the proper format, and to extract information from the stream.
*
* \param[in] iface Pointer to the algorithm interface
* \param[in] data Pointer to a block of data to parse
* \param[in] data_sz Size of the data buffer
* \param[in,out] si Pointer to stream info to update. The size member
* \ref MUST be properly initialized, but \ref MAY be
* clobbered by the algorithm. This parameter \ref MAY
* be NULL.
*
* \retval #AOM_CODEC_OK
* Bitstream is parsable and stream information updated
*/
aom_codec_err_t aom_codec_peek_stream_info(aom_codec_iface_t *iface,
const uint8_t *data,
unsigned int data_sz,
aom_codec_stream_info_t *si);
/*!\brief Return information about the current stream.
*
* Returns information about the stream that has been parsed during decoding.
*
* \param[in] ctx Pointer to this instance's context
* \param[in,out] si Pointer to stream info to update. The size member
* \ref MUST be properly initialized, but \ref MAY be
* clobbered by the algorithm. This parameter \ref MAY
* be NULL.
*
* \retval #AOM_CODEC_OK
* Bitstream is parsable and stream information updated
*/
aom_codec_err_t aom_codec_get_stream_info(aom_codec_ctx_t *ctx,
aom_codec_stream_info_t *si);
/*!\brief Decode data
*
* Processes a buffer of coded data. If the processing results in a new
* decoded frame becoming available, PUT_SLICE and PUT_FRAME events may be
* generated, as appropriate. Encoded data \ref MUST be passed in DTS (decode
* time stamp) order. Frames produced will always be in PTS (presentation
* time stamp) order.
* If the decoder is configured with AOM_CODEC_USE_INPUT_FRAGMENTS enabled,
* data and data_sz can contain a fragment of the encoded frame. Fragment
* \#n must contain at least partition \#n, but can also contain subsequent
* partitions (\#n+1 - \#n+i), and if so, fragments \#n+1, .., \#n+i must
* be empty. When no more data is available, this function should be called
* with NULL as data and 0 as data_sz. The memory passed to this function
* must be available until the frame has been decoded.
*
* \param[in] ctx Pointer to this instance's context
* \param[in] data Pointer to this block of new coded data. If
* NULL, an AOM_CODEC_CB_PUT_FRAME event is posted
* for the previously decoded frame.
* \param[in] data_sz Size of the coded data, in bytes.
* \param[in] user_priv Application specific data to associate with
* this frame.
* \param[in] deadline Soft deadline the decoder should attempt to meet,
* in us. Set to zero for unlimited.
*
* \return Returns #AOM_CODEC_OK if the coded data was processed completely
* and future pictures can be decoded without error. Otherwise,
* see the descriptions of the other error codes in ::aom_codec_err_t
* for recoverability capabilities.
*/
aom_codec_err_t aom_codec_decode(aom_codec_ctx_t *ctx, const uint8_t *data,
unsigned int data_sz, void *user_priv,
long deadline);
/*!\brief Decoded frames iterator
*
* Iterates over a list of the frames available for display. The iterator
* storage should be initialized to NULL to start the iteration. Iteration is
* complete when this function returns NULL.
*
* The list of available frames becomes valid upon completion of the
* aom_codec_decode call, and remains valid until the next call to
* aom_codec_decode.
*
* \param[in] ctx Pointer to this instance's context
* \param[in,out] iter Iterator storage, initialized to NULL
*
* \return Returns a pointer to an image, if one is ready for display. Frames
* produced will always be in PTS (presentation time stamp) order.
*/
aom_image_t *aom_codec_get_frame(aom_codec_ctx_t *ctx, aom_codec_iter_t *iter);
/*!\defgroup cap_put_frame Frame-Based Decoding Functions
*
* The following functions are required to be implemented for all decoders
* that advertise the AOM_CODEC_CAP_PUT_FRAME capability. Calling these
* functions
* for codecs that don't advertise this capability will result in an error
* code being returned, usually AOM_CODEC_ERROR
* @{
*/
/*!\brief put frame callback prototype
*
* This callback is invoked by the decoder to notify the application of
* the availability of decoded image data.
*/
typedef void (*aom_codec_put_frame_cb_fn_t)(void *user_priv,
const aom_image_t *img);
/*!\brief Register for notification of frame completion.
*
* Registers a given function to be called when a decoded frame is
* available.
*
* \param[in] ctx Pointer to this instance's context
* \param[in] cb Pointer to the callback function
* \param[in] user_priv User's private data
*
* \retval #AOM_CODEC_OK
* Callback successfully registered.
* \retval #AOM_CODEC_ERROR
* Decoder context not initialized, or algorithm not capable of
* posting frame completion.
*/
aom_codec_err_t aom_codec_register_put_frame_cb(aom_codec_ctx_t *ctx,
aom_codec_put_frame_cb_fn_t cb,
void *user_priv);
/*!@} - end defgroup cap_put_frame */
/*!\defgroup cap_put_slice Slice-Based Decoding Functions
*
* The following functions are required to be implemented for all decoders
* that advertise the AOM_CODEC_CAP_PUT_SLICE capability. Calling these
* functions
* for codecs that don't advertise this capability will result in an error
* code being returned, usually AOM_CODEC_ERROR
* @{
*/
/*!\brief put slice callback prototype
*
* This callback is invoked by the decoder to notify the application of
* the availability of partially decoded image data.
*/
typedef void (*aom_codec_put_slice_cb_fn_t)(void *user_priv,
const aom_image_t *img,
const aom_image_rect_t *valid,
const aom_image_rect_t *update);
/*!\brief Register for notification of slice completion.
*
* Registers a given function to be called when a decoded slice is
* available.
*
* \param[in] ctx Pointer to this instance's context
* \param[in] cb Pointer to the callback function
* \param[in] user_priv User's private data
*
* \retval #AOM_CODEC_OK
* Callback successfully registered.
* \retval #AOM_CODEC_ERROR
* Decoder context not initialized, or algorithm not capable of
* posting slice completion.
*/
aom_codec_err_t aom_codec_register_put_slice_cb(aom_codec_ctx_t *ctx,
aom_codec_put_slice_cb_fn_t cb,
void *user_priv);
/*!@} - end defgroup cap_put_slice*/
/*!\defgroup cap_external_frame_buffer External Frame Buffer Functions
*
* The following section is required to be implemented for all decoders
* that advertise the AOM_CODEC_CAP_EXTERNAL_FRAME_BUFFER capability.
* Calling this function for codecs that don't advertise this capability
* will result in an error code being returned, usually AOM_CODEC_ERROR.
*
* \note
* Currently this only works with AV1.
* @{
*/
/*!\brief Pass in external frame buffers for the decoder to use.
*
* Registers functions to be called when libaom needs a frame buffer
* to decode the current frame and a function to be called when libaom does
* not internally reference the frame buffer. This set function must
* be called before the first call to decode or libaom will assume the
* default behavior of allocating frame buffers internally.
*
* \param[in] ctx Pointer to this instance's context
* \param[in] cb_get Pointer to the get callback function
* \param[in] cb_release Pointer to the release callback function
* \param[in] cb_priv Callback's private data
*
* \retval #AOM_CODEC_OK
* External frame buffers will be used by libaom.
* \retval #AOM_CODEC_INVALID_PARAM
* One or more of the callbacks were NULL.
* \retval #AOM_CODEC_ERROR
* Decoder context not initialized, or algorithm not capable of
* using external frame buffers.
*
* \note
* When decoding AV1, the application may be required to pass in at least
* #AOM_MAXIMUM_WORK_BUFFERS external frame
* buffers.
*/
aom_codec_err_t aom_codec_set_frame_buffer_functions(
aom_codec_ctx_t *ctx, aom_get_frame_buffer_cb_fn_t cb_get,
aom_release_frame_buffer_cb_fn_t cb_release, void *cb_priv);
/*!@} - end defgroup cap_external_frame_buffer */
/*!@} - end defgroup decoder*/
#ifdef __cplusplus
}
#endif
#endif // AOM_AOM_DECODER_H_

View File

@@ -1,837 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifndef AOM_AOM_ENCODER_H_
#define AOM_AOM_ENCODER_H_
/*!\defgroup encoder Encoder Algorithm Interface
* \ingroup codec
* This abstraction allows applications using this encoder to easily support
* multiple video formats with minimal code duplication. This section describes
* the interface common to all encoders.
* @{
*/
/*!\file
* \brief Describes the encoder algorithm interface to applications.
*
* This file describes the interface between an application and a
* video encoder algorithm.
*
*/
#ifdef __cplusplus
extern "C" {
#endif
#include "./aom_codec.h"
/*!\brief Current ABI version number
*
* \internal
* If this file is altered in any way that changes the ABI, this value
* must be bumped. Examples include, but are not limited to, changing
* types, removing or reassigning enums, adding/removing/rearranging
* fields to structures
*/
#define AOM_ENCODER_ABI_VERSION \
(5 + AOM_CODEC_ABI_VERSION) /**<\hideinitializer*/
/*! \brief Encoder capabilities bitfield
*
* Each encoder advertises the capabilities it supports as part of its
* ::aom_codec_iface_t interface structure. Capabilities are extra
* interfaces or functionality, and are not required to be supported
* by an encoder.
*
* The available flags are specified by AOM_CODEC_CAP_* defines.
*/
#define AOM_CODEC_CAP_PSNR 0x10000 /**< Can issue PSNR packets */
/*! Can output one partition at a time. Each partition is returned in its
* own AOM_CODEC_CX_FRAME_PKT, with the FRAME_IS_FRAGMENT flag set for
* every partition but the last. In this mode all frames are always
* returned partition by partition.
*/
#define AOM_CODEC_CAP_OUTPUT_PARTITION 0x20000
/*! Can support input images at greater than 8 bitdepth.
*/
#define AOM_CODEC_CAP_HIGHBITDEPTH 0x40000
/*! \brief Initialization-time Feature Enabling
*
* Certain codec features must be known at initialization time, to allow
* for proper memory allocation.
*
* The available flags are specified by AOM_CODEC_USE_* defines.
*/
#define AOM_CODEC_USE_PSNR 0x10000 /**< Calculate PSNR on each frame */
/*!\brief Make the encoder output one partition at a time. */
#define AOM_CODEC_USE_OUTPUT_PARTITION 0x20000
#define AOM_CODEC_USE_HIGHBITDEPTH 0x40000 /**< Use high bitdepth */
/*!\brief Generic fixed size buffer structure
*
* This structure is able to hold a reference to any fixed size buffer.
*/
struct aom_fixed_buf {
  void *buf; /**< Pointer to the referenced data */
  size_t sz; /**< Buffer length, in chars */
};
/** alias for struct aom_fixed_buf */
typedef struct aom_fixed_buf aom_fixed_buf_t;
/*!\brief Time Stamp Type
*
* An integer, which when multiplied by the stream's time base, provides
* the absolute time of a sample.
*/
typedef int64_t aom_codec_pts_t;
/*!\brief Compressed Frame Flags
*
* This type represents a bitfield containing information about a compressed
* frame that may be useful to an application. The most significant 16 bits
* can be used by an algorithm to provide additional detail, for example to
* support frame types that are codec specific (MPEG-1 D-frames for example)
*/
typedef uint32_t aom_codec_frame_flags_t;
#define AOM_FRAME_IS_KEY 0x1 /**< frame is the start of a GOP */
/*!\brief frame can be dropped without affecting the stream (no future frame
* depends on this one) */
#define AOM_FRAME_IS_DROPPABLE 0x2
/*!\brief frame should be decoded but will not be shown */
#define AOM_FRAME_IS_INVISIBLE 0x4
/*!\brief this is a fragment of the encoded frame */
#define AOM_FRAME_IS_FRAGMENT 0x8
/*!\brief Error Resilient flags
*
* These flags define which error resilient features to enable in the
* encoder. The flags are specified through the
* aom_codec_enc_cfg::g_error_resilient variable.
*/
typedef uint32_t aom_codec_er_flags_t;
/*!\brief Improve resiliency against losses of whole frames */
#define AOM_ERROR_RESILIENT_DEFAULT 0x1
/*!\brief The frame partitions are independently decodable by the bool decoder,
* meaning that partitions can be decoded even though earlier partitions have
* been lost. Note that intra prediction is still done over the partition
* boundary. */
#define AOM_ERROR_RESILIENT_PARTITIONS 0x2
/*!\brief Encoder output packet variants
*
* This enumeration lists the different kinds of data packets that can be
* returned by calls to aom_codec_get_cx_data(). Algorithms \ref MAY
* extend this list to provide additional functionality.
*/
enum aom_codec_cx_pkt_kind {
  AOM_CODEC_CX_FRAME_PKT = 0,   /**< Compressed video frame */
  AOM_CODEC_STATS_PKT = 1,      /**< Two-pass statistics for this frame */
  AOM_CODEC_FPMB_STATS_PKT = 2, /**< First pass mb statistics for this frame */
  AOM_CODEC_PSNR_PKT = 3,       /**< PSNR statistics for this frame */
  AOM_CODEC_CUSTOM_PKT = 256    /**< Algorithm extensions */
};
/*!\brief Encoder output packet
*
* This structure contains the different kinds of output data the encoder
* may produce while compressing a frame.
*/
typedef struct aom_codec_cx_pkt {
enum aom_codec_cx_pkt_kind kind; /**< packet variant */
union {
struct {
void *buf; /**< compressed data buffer */
size_t sz; /**< length of compressed data */
/*!\brief time stamp to show frame (in timebase units) */
aom_codec_pts_t pts;
/*!\brief duration to show frame (in timebase units) */
unsigned long duration;
aom_codec_frame_flags_t flags; /**< flags for this frame */
/*!\brief the partition id defines the decoding order of the partitions.
* Only applicable when "output partition" mode is enabled. First
* partition has id 0.*/
int partition_id;
} frame; /**< data for compressed frame packet */
aom_fixed_buf_t twopass_stats; /**< data for two-pass packet */
aom_fixed_buf_t firstpass_mb_stats; /**< first pass mb packet */
struct aom_psnr_pkt {
unsigned int samples[4]; /**< Number of samples, total/y/u/v */
uint64_t sse[4]; /**< sum squared error, total/y/u/v */
double psnr[4]; /**< PSNR, total/y/u/v */
} psnr; /**< data for PSNR packet */
aom_fixed_buf_t raw; /**< data for arbitrary packets */
/* This packet size is fixed to allow codecs to extend this
* interface without having to manage storage for raw packets,
* i.e., if it's smaller than 128 bytes, you can store in the
* packet list directly.
*/
char pad[128 - sizeof(enum aom_codec_cx_pkt_kind)]; /**< fixed sz */
} data; /**< packet data */
} aom_codec_cx_pkt_t; /**< alias for struct aom_codec_cx_pkt */
/*!\brief Rational Number
*
* This structure holds a fractional value.
*/
struct aom_rational {
  int num; /**< numerator of the fraction */
  int den; /**< denominator of the fraction */
};
/** alias for struct aom_rational */
typedef struct aom_rational aom_rational_t;
/*!\brief Multi-pass Encoding Pass */
enum aom_enc_pass {
  AOM_RC_ONE_PASS = 0,   /**< Single pass mode */
  AOM_RC_FIRST_PASS = 1, /**< First pass of multi-pass mode */
  AOM_RC_LAST_PASS = 2   /**< Final pass of multi-pass mode */
};
/*!\brief Rate control mode */
enum aom_rc_mode {
  AOM_VBR = 0, /**< Variable Bit Rate (VBR) mode */
  AOM_CBR = 1, /**< Constant Bit Rate (CBR) mode */
  AOM_CQ = 2,  /**< Constrained Quality (CQ) mode */
  AOM_Q = 3    /**< Constant Quality (Q) mode */
};
/*!\brief Keyframe placement mode.
*
* This enumeration determines whether keyframes are placed automatically by
* the encoder or whether this behavior is disabled. Older releases of this
* SDK were implemented such that AOM_KF_FIXED meant keyframes were disabled.
* This name is confusing for this behavior, so the new symbols to be used
* are AOM_KF_AUTO and AOM_KF_DISABLED.
*/
enum aom_kf_mode {
  /* AOM_KF_FIXED and AOM_KF_DISABLED deliberately share the value 0. */
  AOM_KF_FIXED = 0,   /**< deprecated, implies AOM_KF_DISABLED */
  AOM_KF_AUTO = 1,    /**< Encoder determines optimal placement automatically */
  AOM_KF_DISABLED = 0 /**< Encoder does not place keyframes. */
};
/*!\brief Encoded Frame Flags
*
* This type indicates a bitfield to be passed to aom_codec_encode(), defining
* per-frame boolean values. By convention, bits common to all codecs will be
* named AOM_EFLAG_*, and bits specific to an algorithm will be named
* /algo/_eflag_*. The lower order 16 bits are reserved for common use.
*/
typedef long aom_enc_frame_flags_t;
#define AOM_EFLAG_FORCE_KF (1 << 0) /**< Force this frame to be a keyframe */
/*!\brief Encoder configuration structure
*
* This structure contains the encoder settings that have common representations
* across all codecs. This doesn't imply that all codecs support all features,
* however.
*/
typedef struct aom_codec_enc_cfg {
/*
* generic settings (g)
*/
/*!\brief Algorithm specific "usage" value
*
* Algorithms may define multiple values for usage, which may convey the
* intent of how the application intends to use the stream. If this value
* is non-zero, consult the documentation for the codec to determine its
* meaning.
*/
unsigned int g_usage;
/*!\brief Maximum number of threads to use
*
* For multi-threaded implementations, use no more than this number of
* threads. The codec may use fewer threads than allowed. The value
* 0 is equivalent to the value 1.
*/
unsigned int g_threads;
/*!\brief Bitstream profile to use
*
* Some codecs support a notion of multiple bitstream profiles. Typically
* this maps to a set of features that are turned on or off. Often the
* profile to use is determined by the features of the intended decoder.
* Consult the documentation for the codec to determine the valid values
* for this parameter, or set to zero for a sane default.
*/
unsigned int g_profile; /**< profile of bitstream to use */
/*!\brief Width of the frame
*
* This value identifies the presentation resolution of the frame,
* in pixels. Note that the frames passed as input to the encoder must
* have this resolution. Frames will be presented by the decoder in this
* resolution, independent of any spatial resampling the encoder may do.
*/
unsigned int g_w;
/*!\brief Height of the frame
*
* This value identifies the presentation resolution of the frame,
* in pixels. Note that the frames passed as input to the encoder must
* have this resolution. Frames will be presented by the decoder in this
* resolution, independent of any spatial resampling the encoder may do.
*/
unsigned int g_h;
/*!\brief Bit-depth of the codec
*
* This value identifies the bit_depth of the codec,
* Only certain bit-depths are supported as identified in the
* aom_bit_depth_t enum.
*/
aom_bit_depth_t g_bit_depth;
/*!\brief Bit-depth of the input frames
*
* This value identifies the bit_depth of the input frames in bits.
* Note that the frames passed as input to the encoder must have
* this bit-depth.
*/
unsigned int g_input_bit_depth;
/*!\brief Stream timebase units
*
* Indicates the smallest interval of time, in seconds, used by the stream.
* For fixed frame rate material, or variable frame rate material where
* frames are timed at a multiple of a given clock (ex: video capture),
* the \ref RECOMMENDED method is to set the timebase to the reciprocal
* of the frame rate (ex: 1001/30000 for 29.970 Hz NTSC). This allows the
* pts to correspond to the frame number, which can be handy. For
* re-encoding video from containers with absolute time timestamps, the
* \ref RECOMMENDED method is to set the timebase to that of the parent
* container or multimedia framework (ex: 1/1000 for ms, as in FLV).
*/
struct aom_rational g_timebase;
/*!\brief Enable error resilient modes.
*
* The error resilient bitfield indicates to the encoder which features
* it should enable to take measures for streaming over lossy or noisy
* links.
*/
aom_codec_er_flags_t g_error_resilient;
/*!\brief Multi-pass Encoding Mode
*
* This value should be set to the current phase for multi-pass encoding.
* For single pass, set to #AOM_RC_ONE_PASS.
*/
enum aom_enc_pass g_pass;
/*!\brief Allow lagged encoding
*
* If set, this value allows the encoder to consume a number of input
* frames before producing output frames. This allows the encoder to
* base decisions for the current frame on future frames. This does
* increase the latency of the encoding pipeline, so it is not appropriate
* in all situations (ex: realtime encoding).
*
* Note that this is a maximum value -- the encoder may produce frames
* sooner than the given limit. Set this value to 0 to disable this
* feature.
*/
unsigned int g_lag_in_frames;
/*
* rate control settings (rc)
*/
/*!\brief Temporal resampling configuration, if supported by the codec.
*
* Temporal resampling allows the codec to "drop" frames as a strategy to
* meet its target data rate. This can cause temporal discontinuities in
* the encoded video, which may appear as stuttering during playback. This
* trade-off is often acceptable, but for many applications is not. It can
* be disabled in these cases.
*
* Note that not all codecs support this feature. All aom AVx codecs do.
* For other codecs, consult the documentation for that algorithm.
*
* This threshold is described as a percentage of the target data buffer.
* When the data buffer falls below this percentage of fullness, a
* dropped frame is indicated. Set the threshold to zero (0) to disable
* this feature.
*/
unsigned int rc_dropframe_thresh;
/*!\brief Enable/disable spatial resampling, if supported by the codec.
*
* Spatial resampling allows the codec to compress a lower resolution
* version of the frame, which is then upscaled by the decoder to the
* correct presentation resolution. This increases visual quality at
* low data rates, at the expense of CPU time on the encoder/decoder.
*/
unsigned int rc_resize_allowed;
/*!\brief Internal coded frame width.
*
* If spatial resampling is enabled this specifies the width of the
* encoded frame.
*/
unsigned int rc_scaled_width;
/*!\brief Internal coded frame height.
*
* If spatial resampling is enabled this specifies the height of the
* encoded frame.
*/
unsigned int rc_scaled_height;
/*!\brief Spatial resampling up watermark.
*
* This threshold is described as a percentage of the target data buffer.
* When the data buffer rises above this percentage of fullness, the
* encoder will step up to a higher resolution version of the frame.
*/
unsigned int rc_resize_up_thresh;
/*!\brief Spatial resampling down watermark.
*
* This threshold is described as a percentage of the target data buffer.
* When the data buffer falls below this percentage of fullness, the
* encoder will step down to a lower resolution version of the frame.
*/
unsigned int rc_resize_down_thresh;
/*!\brief Rate control algorithm to use.
*
* Indicates whether the end usage of this stream is to be streamed over
* a bandwidth constrained link, indicating that Constant Bit Rate (CBR)
* mode should be used, or whether it will be played back on a high
* bandwidth link, as from a local disk, where higher variations in
* bitrate are acceptable.
*/
enum aom_rc_mode rc_end_usage;
/*!\brief Two-pass stats buffer.
*
* A buffer containing all of the stats packets produced in the first
* pass, concatenated.
*/
aom_fixed_buf_t rc_twopass_stats_in;
/*!\brief first pass mb stats buffer.
*
* A buffer containing all of the first pass mb stats packets produced
* in the first pass, concatenated.
*/
aom_fixed_buf_t rc_firstpass_mb_stats_in;
/*!\brief Target data rate
*
* Target bandwidth to use for this stream, in kilobits per second.
*/
unsigned int rc_target_bitrate;
/*
* quantizer settings
*/
/*!\brief Minimum (Best Quality) Quantizer
*
* The quantizer is the most direct control over the quality of the
* encoded image. The range of valid values for the quantizer is codec
* specific. Consult the documentation for the codec to determine the
* values to use. To determine the range programmatically, call
* aom_codec_enc_config_default() with a usage value of 0.
*/
unsigned int rc_min_quantizer;
/*!\brief Maximum (Worst Quality) Quantizer
*
* The quantizer is the most direct control over the quality of the
* encoded image. The range of valid values for the quantizer is codec
* specific. Consult the documentation for the codec to determine the
* values to use. To determine the range programmatically, call
* aom_codec_enc_config_default() with a usage value of 0.
*/
unsigned int rc_max_quantizer;
/*
* bitrate tolerance
*/
/*!\brief Rate control adaptation undershoot control
*
* This value, expressed as a percentage of the target bitrate,
* controls the maximum allowed adaptation speed of the codec.
* This factor controls the maximum amount of bits that can
* be subtracted from the target bitrate in order to compensate
* for prior overshoot.
*
* Valid values in the range 0-1000.
*/
unsigned int rc_undershoot_pct;
/*!\brief Rate control adaptation overshoot control
*
* This value, expressed as a percentage of the target bitrate,
* controls the maximum allowed adaptation speed of the codec.
* This factor controls the maximum amount of bits that can
* be added to the target bitrate in order to compensate for
* prior undershoot.
*
* Valid values in the range 0-1000.
*/
unsigned int rc_overshoot_pct;
/*
* decoder buffer model parameters
*/
/*!\brief Decoder Buffer Size
*
* This value indicates the amount of data that may be buffered by the
* decoding application. Note that this value is expressed in units of
* time (milliseconds). For example, a value of 5000 indicates that the
* client will buffer (at least) 5000ms worth of encoded data. Use the
* target bitrate (#rc_target_bitrate) to convert to bits/bytes, if
* necessary.
*/
unsigned int rc_buf_sz;
/*!\brief Decoder Buffer Initial Size
*
* This value indicates the amount of data that will be buffered by the
* decoding application prior to beginning playback. This value is
* expressed in units of time (milliseconds). Use the target bitrate
* (#rc_target_bitrate) to convert to bits/bytes, if necessary.
*/
unsigned int rc_buf_initial_sz;
/*!\brief Decoder Buffer Optimal Size
*
* This value indicates the amount of data that the encoder should try
* to maintain in the decoder's buffer. This value is expressed in units
* of time (milliseconds). Use the target bitrate (#rc_target_bitrate)
* to convert to bits/bytes, if necessary.
*/
unsigned int rc_buf_optimal_sz;
/*
* 2 pass rate control parameters
*/
/*!\brief Two-pass mode CBR/VBR bias
*
* Bias, expressed on a scale of 0 to 100, for determining target size
* for the current frame. The value 0 indicates the optimal CBR mode
* value should be used. The value 100 indicates the optimal VBR mode
* value should be used. Values in between indicate which way the
* encoder should "lean."
*/
unsigned int rc_2pass_vbr_bias_pct;
/*!\brief Two-pass mode per-GOP minimum bitrate
*
* This value, expressed as a percentage of the target bitrate, indicates
* the minimum bitrate to be used for a single GOP (aka "section")
*/
unsigned int rc_2pass_vbr_minsection_pct;
/*!\brief Two-pass mode per-GOP maximum bitrate
*
* This value, expressed as a percentage of the target bitrate, indicates
* the maximum bitrate to be used for a single GOP (aka "section")
*/
unsigned int rc_2pass_vbr_maxsection_pct;
/*
* keyframing settings (kf)
*/
/*!\brief Keyframe placement mode
*
* This value indicates whether the encoder should place keyframes at a
* fixed interval, or determine the optimal placement automatically
* (as governed by the #kf_min_dist and #kf_max_dist parameters)
*/
enum aom_kf_mode kf_mode;
/*!\brief Keyframe minimum interval
*
* This value, expressed as a number of frames, prevents the encoder from
* placing a keyframe nearer than kf_min_dist to the previous keyframe. At
* least kf_min_dist non-keyframes will be coded before the next
* keyframe. Set kf_min_dist equal to kf_max_dist for a fixed interval.
*/
unsigned int kf_min_dist;
/*!\brief Keyframe maximum interval
*
* This value, expressed as a number of frames, forces the encoder to code
* a keyframe if one has not been coded in the last kf_max_dist frames.
* A value of 0 implies all frames will be keyframes. Set kf_min_dist
* equal to kf_max_dist for a fixed interval.
*/
unsigned int kf_max_dist;
} aom_codec_enc_cfg_t; /**< alias for struct aom_codec_enc_cfg */
/*!\brief Initialize an encoder instance
*
* Initializes an encoder context using the given interface. Applications
* should call the aom_codec_enc_init convenience macro instead of this
* function directly, to ensure that the ABI version number parameter
* is properly initialized.
*
* If the library was configured with --disable-multithread, this call
* is not thread safe and should be guarded with a lock if being used
* in a multithreaded context.
*
* \param[in] ctx Pointer to this instance's context.
* \param[in] iface Pointer to the algorithm interface to use.
* \param[in] cfg Configuration to use, if known. May be NULL.
* \param[in] flags Bitfield of AOM_CODEC_USE_* flags
* \param[in] ver ABI version number. Must be set to
* AOM_ENCODER_ABI_VERSION
* \retval #AOM_CODEC_OK
* The encoder algorithm initialized.
* \retval #AOM_CODEC_MEM_ERROR
* Memory allocation failed.
*/
aom_codec_err_t aom_codec_enc_init_ver(aom_codec_ctx_t *ctx,
aom_codec_iface_t *iface,
const aom_codec_enc_cfg_t *cfg,
aom_codec_flags_t flags, int ver);
/*!\brief Convenience macro for aom_codec_enc_init_ver()
*
* Forwards ctx, iface, cfg and flags unchanged and supplies
* AOM_ENCODER_ABI_VERSION as the trailing ver argument, which ensures the
* ABI version parameter is properly set.
*/
#define aom_codec_enc_init(ctx, iface, cfg, flags) \
aom_codec_enc_init_ver(ctx, iface, cfg, flags, AOM_ENCODER_ABI_VERSION)
/*!\brief Initialize multi-encoder instance
*
* Initializes multi-encoder context using the given interface.
* Applications should call the aom_codec_enc_init_multi convenience macro
* instead of this function directly, to ensure that the ABI version number
* parameter is properly initialized.
*
* \param[in] ctx Pointer to this instance's context.
* \param[in] iface Pointer to the algorithm interface to use.
* \param[in] cfg Configuration to use, if known. May be NULL.
* \param[in] num_enc Total number of encoders.
* \param[in] flags Bitfield of AOM_CODEC_USE_* flags
* \param[in] dsf Pointer to down-sampling factors.
* \param[in] ver ABI version number. Must be set to
* AOM_ENCODER_ABI_VERSION
* \retval #AOM_CODEC_OK
* The encoder algorithm initialized.
* \retval #AOM_CODEC_MEM_ERROR
* Memory allocation failed.
*/
aom_codec_err_t aom_codec_enc_init_multi_ver(
aom_codec_ctx_t *ctx, aom_codec_iface_t *iface, aom_codec_enc_cfg_t *cfg,
int num_enc, aom_codec_flags_t flags, aom_rational_t *dsf, int ver);
/*!\brief Convenience macro for aom_codec_enc_init_multi_ver()
*
* Forwards ctx, iface, cfg, num_enc, flags and dsf unchanged and supplies
* AOM_ENCODER_ABI_VERSION as the trailing ver argument, which ensures the
* ABI version parameter is properly set.
*/
#define aom_codec_enc_init_multi(ctx, iface, cfg, num_enc, flags, dsf) \
aom_codec_enc_init_multi_ver(ctx, iface, cfg, num_enc, flags, dsf, \
AOM_ENCODER_ABI_VERSION)
/*!\brief Get a default configuration
*
* Initializes an encoder configuration structure with default values. Supports
* the notion of "usages" so that an algorithm may offer different default
* settings depending on the user's intended goal. This function \ref SHOULD
* be called by all applications to initialize the configuration structure
* before specializing the configuration with application specific values.
*
* \param[in] iface Pointer to the algorithm interface to use.
* \param[out] cfg Configuration buffer to populate.
* \param[in] reserved Must be set to 0 for VP8 and AV1.
*
* \retval #AOM_CODEC_OK
* The configuration was populated.
* \retval #AOM_CODEC_INCAPABLE
* Interface is not an encoder interface.
* \retval #AOM_CODEC_INVALID_PARAM
* A parameter was NULL, or the usage value was not recognized.
*/
aom_codec_err_t aom_codec_enc_config_default(aom_codec_iface_t *iface,
aom_codec_enc_cfg_t *cfg,
unsigned int reserved);
/*!\brief Set or change configuration
*
* Reconfigures an encoder instance according to the given configuration.
*
* \param[in] ctx Pointer to this instance's context
* \param[in] cfg Configuration buffer to use
*
* \retval #AOM_CODEC_OK
* The encoder was reconfigured.
* \retval #AOM_CODEC_INCAPABLE
* Interface is not an encoder interface.
* \retval #AOM_CODEC_INVALID_PARAM
* A parameter was NULL, or the usage value was not recognized.
*/
aom_codec_err_t aom_codec_enc_config_set(aom_codec_ctx_t *ctx,
const aom_codec_enc_cfg_t *cfg);
/*!\brief Get global stream headers
*
* Retrieves a stream level global header packet, if supported by the codec.
*
* \note Ownership and lifetime of the returned buffer are not stated in this
* header; confirm against the implementation before freeing or caching
* the pointer.
*
* \param[in] ctx Pointer to this instance's context
*
* \retval NULL
* Encoder does not support global header
* \retval Non-NULL
* Pointer to buffer containing global header packet
*/
aom_fixed_buf_t *aom_codec_get_global_headers(aom_codec_ctx_t *ctx);
/*!\brief deadline parameter analogous to AVx REALTIME mode.
*
* Value for the deadline argument of aom_codec_encode(), which is expressed
* in microseconds.
*/
#define AOM_DL_REALTIME (1)
/*!\brief deadline parameter analogous to AVx GOOD QUALITY mode.
*
* Value for the deadline argument of aom_codec_encode(), which is expressed
* in microseconds.
*/
#define AOM_DL_GOOD_QUALITY (1000000)
/*!\brief deadline parameter analogous to AVx BEST QUALITY mode.
*
* A deadline of 0 gives the encoder unlimited time to produce the frame.
*/
#define AOM_DL_BEST_QUALITY (0)
/*!\brief Encode a frame
*
* Encodes a video frame at the given "presentation time." The presentation
* time stamp (PTS) \ref MUST be strictly increasing.
*
* The encoder supports the notion of a soft real-time deadline. Given a
* non-zero value to the deadline parameter, the encoder will make a "best
* effort" guarantee to return before the given time slice expires. It is
* implicit that limiting the available time to encode will degrade the
* output quality. The encoder can be given an unlimited time to produce the
* best possible frame by specifying a deadline of '0'. This deadline
* supersedes the AVx notion of "best quality, good quality, realtime".
* Applications that wish to map these former settings to the new deadline
* based system can use the symbols #AOM_DL_REALTIME, #AOM_DL_GOOD_QUALITY,
* and #AOM_DL_BEST_QUALITY.
*
* When the last frame has been passed to the encoder, this function should
* continue to be called, with the img parameter set to NULL. This will
* signal the end-of-stream condition to the encoder and allow it to encode
* any held buffers. Encoding is complete when aom_codec_encode() is called
* and aom_codec_get_cx_data() returns no data.
*
* \param[in] ctx Pointer to this instance's context
* \param[in] img Image data to encode, NULL to flush.
* \param[in] pts Presentation time stamp, in timebase units.
* \param[in] duration Duration to show frame, in timebase units.
* \param[in] flags Flags to use for encoding this frame.
* \param[in] deadline Time to spend encoding, in microseconds. (0=infinite)
*
* \retval #AOM_CODEC_OK
* The encode call succeeded.
* \retval #AOM_CODEC_INCAPABLE
* Interface is not an encoder interface.
* \retval #AOM_CODEC_INVALID_PARAM
* A parameter was NULL, the image format is unsupported, etc.
*/
aom_codec_err_t aom_codec_encode(aom_codec_ctx_t *ctx, const aom_image_t *img,
aom_codec_pts_t pts, unsigned long duration,
aom_enc_frame_flags_t flags,
unsigned long deadline);
/*!\brief Set compressed data output buffer
*
* Sets the buffer that the codec should output the compressed data
* into. This call effectively sets the buffer pointer returned in the
* next AOM_CODEC_CX_FRAME_PKT packet. Subsequent packets will be
* appended into this buffer. The buffer is preserved across frames,
* so applications must periodically call this function after flushing
* the accumulated compressed data to disk or to the network to reset
* the pointer to the buffer's head.
*
* `pad_before` bytes will be skipped before writing the compressed
* data, and `pad_after` bytes will be appended to the packet. The size
* of the packet will be the sum of the size of the actual compressed
* data, pad_before, and pad_after. The padding bytes will be preserved
* (not overwritten).
*
* Note that calling this function does not guarantee that the returned
* compressed data will be placed into the specified buffer. In the
* event that the encoded data will not fit into the buffer provided,
* the returned packet \ref MAY point to an internal buffer, as it would
* if this call were never used. In this event, the output packet will
* NOT have any padding, and the application must free space and copy it
* to the proper place. This is of particular note in configurations
* that may output multiple packets for a single encoded frame (e.g., lagged
* encoding) or if the application does not reset the buffer periodically.
*
* Applications may restore the default behavior of the codec providing
* the compressed data buffer by calling this function with a NULL
* buffer.
*
* Applications \ref MUSTNOT call this function during iteration of
* aom_codec_get_cx_data().
*
* \param[in] ctx Pointer to this instance's context
* \param[in] buf Buffer to store compressed data into, or NULL to
* restore the codec-provided buffer
* \param[in] pad_before Bytes to skip before writing compressed data
* \param[in] pad_after Bytes to skip after writing compressed data
*
* \retval #AOM_CODEC_OK
* The buffer was set successfully.
* \retval #AOM_CODEC_INVALID_PARAM
* A parameter was invalid. (A NULL buf is accepted, as described above.)
*/
aom_codec_err_t aom_codec_set_cx_data_buf(aom_codec_ctx_t *ctx,
const aom_fixed_buf_t *buf,
unsigned int pad_before,
unsigned int pad_after);
/*!\brief Encoded data iterator
*
* Iterates over a list of data packets to be passed from the encoder to the
* application. The different kinds of packets available are enumerated in
* #aom_codec_cx_pkt_kind.
*
* #AOM_CODEC_CX_FRAME_PKT packets should be passed to the application's
* muxer. Multiple compressed frames may be in the list.
* #AOM_CODEC_STATS_PKT packets should be appended to a global buffer.
*
* The application \ref MUST silently ignore any packet kinds that it does
* not recognize or support.
*
* The data buffers returned from this function are only guaranteed to be
* valid until the application makes another call to any aom_codec_* function.
*
* Start with the iterator storage set to NULL and keep calling with the same
* storage until NULL is returned.
*
* \param[in] ctx Pointer to this instance's context
* \param[in,out] iter Iterator storage, initialized to NULL
*
* \return Returns a pointer to an output data packet (compressed frame data,
* two-pass statistics, etc.) or NULL to signal end-of-list.
*/
const aom_codec_cx_pkt_t *aom_codec_get_cx_data(aom_codec_ctx_t *ctx,
aom_codec_iter_t *iter);
/*!\brief Get Preview Frame
*
* Returns an image that can be used as a preview. Shows the image as it would
* exist at the decompressor. The application \ref MUSTNOT write into this
* image buffer.
*
* \param[in] ctx Pointer to this instance's context
*
* \return Returns a pointer to a preview image, or NULL if no image is
* available.
*/
const aom_image_t *aom_codec_get_preview_frame(aom_codec_ctx_t *ctx);
/*!@} - end defgroup encoder*/
#ifdef __cplusplus
}
#endif
#endif // AOM_AOM_ENCODER_H_

View File

@@ -1,84 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifndef AOM_AOM_FRAME_BUFFER_H_
#define AOM_AOM_FRAME_BUFFER_H_
/*!\file
* \brief Describes the decoder external frame buffer interface.
*/
#ifdef __cplusplus
extern "C" {
#endif
#include "./aom_integer.h"
/*!\brief The maximum number of work buffers used by libaom.
* Support maximum 4 threads to decode video in parallel.
* Each thread will use one work buffer.
* TODO(hkuang): Add support to set number of worker threads dynamically.
*/
/* NOTE(review): the text above describes 4 threads with one work buffer each,
* but the value below is 8 -- confirm which one is intended.
*/
#define AOM_MAXIMUM_WORK_BUFFERS 8
/*!\brief The maximum number of reference buffers that an AV1 encoder may use.
*/
#define AOM_MAXIMUM_REF_BUFFERS 8
/*!\brief External frame buffer
*
* This structure holds allocated frame buffers used by the decoder.
* The get-frame-buffer callback (aom_get_frame_buffer_cb_fn_t) assigns data
* and size, and may set priv, which is passed back to the release callback.
*/
typedef struct aom_codec_frame_buffer {
uint8_t *data; /**< Pointer to the data buffer */
size_t size; /**< Size of data in bytes */
void *priv; /**< Frame's private data */
} aom_codec_frame_buffer_t;
/*!\brief get frame buffer callback prototype
*
* This callback is invoked by the decoder to retrieve data for the frame
* buffer in order for the decode call to complete. The callback must
* allocate at least min_size in bytes and assign it to fb->data. The callback
* must zero out all the data allocated. Then the callback must set fb->size
* to the allocated size. The application does not need to align the allocated
* data. The callback is triggered when the decoder needs a frame buffer to
* decode a compressed image into. This function may be called more than once
* for every call to aom_codec_decode. The application may set fb->priv to
* some data which will be passed back in the image and the release function
* call. |fb| is guaranteed to not be NULL. On success the callback must
* return 0. On any failure the callback must return a value less than 0.
*
* \param[in] priv Callback's private data
* \param[in] min_size Size in bytes needed by the buffer
* \param[in,out] fb Pointer to aom_codec_frame_buffer_t
*/
typedef int (*aom_get_frame_buffer_cb_fn_t)(void *priv, size_t min_size,
aom_codec_frame_buffer_t *fb);
/*!\brief release frame buffer callback prototype
*
* This callback is invoked by the decoder when the frame buffer is not
* referenced by any other buffers. |fb| is guaranteed to not be NULL. On
* success the callback must return 0. On any failure the callback must
* return a value less than 0.
*
* \param[in] priv Callback's private data
* \param[in] fb Pointer to aom_codec_frame_buffer_t
*/
typedef int (*aom_release_frame_buffer_cb_fn_t)(void *priv,
aom_codec_frame_buffer_t *fb);
#ifdef __cplusplus
} // extern "C"
#endif
#endif // AOM_AOM_FRAME_BUFFER_H_

View File

@@ -1,225 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
/*!\file
* \brief Describes the aom image descriptor and associated operations
*
*/
#ifndef AOM_AOM_IMAGE_H_
#define AOM_AOM_IMAGE_H_
#ifdef __cplusplus
extern "C" {
#endif
/*!\brief Current ABI version number
*
* \internal
* If this file is altered in any way that changes the ABI, this value
* must be bumped. Examples include, but are not limited to, changing
* types, removing or reassigning enums, and adding/removing/rearranging
* fields in structures.
*/
#define AOM_IMAGE_ABI_VERSION (4) /**<\hideinitializer*/
/* Flag bits that are OR-ed with a small format index to form the planar
* aom_img_fmt enumerators.
*/
#define AOM_IMG_FMT_PLANAR 0x100 /**< Image is a planar format. */
#define AOM_IMG_FMT_UV_FLIP 0x200 /**< V plane precedes U in memory. */
#define AOM_IMG_FMT_HAS_ALPHA 0x400 /**< Image has an alpha channel. */
#define AOM_IMG_FMT_HIGHBITDEPTH 0x800 /**< Image uses 16bit framebuffer. */
/*!\brief List of supported image formats
*
* The planar formats are built by OR-ing the AOM_IMG_FMT_* flag bits
* (PLANAR, UV_FLIP, HAS_ALPHA, HIGHBITDEPTH) with a small format index.
*/
typedef enum aom_img_fmt {
AOM_IMG_FMT_NONE,
AOM_IMG_FMT_RGB24, /**< 24 bit per pixel packed RGB */
AOM_IMG_FMT_RGB32, /**< 32 bit per pixel packed 0RGB */
AOM_IMG_FMT_RGB565, /**< 16 bit per pixel, 565 */
AOM_IMG_FMT_RGB555, /**< 16 bit per pixel, 555 */
AOM_IMG_FMT_UYVY, /**< UYVY packed YUV */
AOM_IMG_FMT_YUY2, /**< YUYV packed YUV */
AOM_IMG_FMT_YVYU, /**< YVYU packed YUV */
AOM_IMG_FMT_BGR24, /**< 24 bit per pixel packed BGR */
AOM_IMG_FMT_RGB32_LE, /**< 32 bit packed BGR0 */
AOM_IMG_FMT_ARGB, /**< 32 bit packed ARGB, alpha=255 */
AOM_IMG_FMT_ARGB_LE, /**< 32 bit packed BGRA, alpha=255 */
AOM_IMG_FMT_RGB565_LE, /**< 16 bit per pixel, gggbbbbb rrrrrggg */
AOM_IMG_FMT_RGB555_LE, /**< 16 bit per pixel, gggbbbbb 0rrrrrgg */
AOM_IMG_FMT_YV12 =
AOM_IMG_FMT_PLANAR | AOM_IMG_FMT_UV_FLIP | 1, /**< planar YVU */
AOM_IMG_FMT_I420 = AOM_IMG_FMT_PLANAR | 2,
AOM_IMG_FMT_AOMYV12 = AOM_IMG_FMT_PLANAR | AOM_IMG_FMT_UV_FLIP |
3, /**< planar 4:2:0 format with aom color space */
AOM_IMG_FMT_AOMI420 = AOM_IMG_FMT_PLANAR | 4,
AOM_IMG_FMT_I422 = AOM_IMG_FMT_PLANAR | 5,
AOM_IMG_FMT_I444 = AOM_IMG_FMT_PLANAR | 6,
AOM_IMG_FMT_I440 = AOM_IMG_FMT_PLANAR | 7,
/* Same format index (6) as AOM_IMG_FMT_I444, with the alpha flag added. */
AOM_IMG_FMT_444A = AOM_IMG_FMT_PLANAR | AOM_IMG_FMT_HAS_ALPHA | 6,
/* The *16 variants add the 16-bit framebuffer flag to the base format. */
AOM_IMG_FMT_I42016 = AOM_IMG_FMT_I420 | AOM_IMG_FMT_HIGHBITDEPTH,
AOM_IMG_FMT_I42216 = AOM_IMG_FMT_I422 | AOM_IMG_FMT_HIGHBITDEPTH,
AOM_IMG_FMT_I44416 = AOM_IMG_FMT_I444 | AOM_IMG_FMT_HIGHBITDEPTH,
AOM_IMG_FMT_I44016 = AOM_IMG_FMT_I440 | AOM_IMG_FMT_HIGHBITDEPTH
} aom_img_fmt_t; /**< alias for enum aom_img_fmt */
/*!\brief List of supported color spaces
*
* Every enumerator has an explicitly assigned value (0..7).
*/
typedef enum aom_color_space {
AOM_CS_UNKNOWN = 0, /**< Unknown */
AOM_CS_BT_601 = 1, /**< BT.601 */
AOM_CS_BT_709 = 2, /**< BT.709 */
AOM_CS_SMPTE_170 = 3, /**< SMPTE.170 */
AOM_CS_SMPTE_240 = 4, /**< SMPTE.240 */
AOM_CS_BT_2020 = 5, /**< BT.2020 */
AOM_CS_RESERVED = 6, /**< Reserved */
AOM_CS_SRGB = 7 /**< sRGB */
} aom_color_space_t; /**< alias for enum aom_color_space */
/*!\brief List of supported color ranges */
typedef enum aom_color_range {
AOM_CR_STUDIO_RANGE = 0, /**< Y [16..235], UV [16..240] */
AOM_CR_FULL_RANGE = 1 /**< YUV/RGB [0..255] */
} aom_color_range_t; /**< alias for enum aom_color_range */
/**\brief Image Descriptor
*
* Describes one image: its format, color information, stored/displayed/
* rendering dimensions, chroma subsampling and per-plane data pointers.
*/
typedef struct aom_image {
aom_img_fmt_t fmt; /**< Image Format */
aom_color_space_t cs; /**< Color Space */
aom_color_range_t range; /**< Color Range */
/* Image storage dimensions */
unsigned int w; /**< Stored image width */
unsigned int h; /**< Stored image height */
unsigned int bit_depth; /**< Stored image bit-depth */
/* Image display dimensions */
unsigned int d_w; /**< Displayed image width */
unsigned int d_h; /**< Displayed image height */
/* Image intended rendering dimensions */
unsigned int r_w; /**< Intended rendering image width */
unsigned int r_h; /**< Intended rendering image height */
/* Chroma subsampling info */
unsigned int x_chroma_shift; /**< subsampling order, X */
unsigned int y_chroma_shift; /**< subsampling order, Y */
/* Image data pointers.
* The AOM_PLANE_* constants are presumably the indices into planes[] and
* stride[] below (both have 4 entries, matching values 0..3).
*/
#define AOM_PLANE_PACKED 0 /**< To be used for all packed formats */
#define AOM_PLANE_Y 0 /**< Y (Luminance) plane */
#define AOM_PLANE_U 1 /**< U (Chroma) plane */
#define AOM_PLANE_V 2 /**< V (Chroma) plane */
#define AOM_PLANE_ALPHA 3 /**< A (Transparency) plane */
unsigned char *planes[4]; /**< pointer to the top left pixel for each plane */
int stride[4]; /**< stride between rows for each plane */
int bps; /**< bits per sample (for packed formats) */
/*!\brief The following member may be set by the application to associate
* data with this image.
*/
void *user_priv;
/* The following members should be treated as private. */
unsigned char *img_data; /**< private */
int img_data_owner; /**< private */
int self_allocd; /**< private */
void *fb_priv; /**< Frame buffer data associated with the image. */
} aom_image_t; /**< alias for struct aom_image */
/**\brief Representation of a rectangle on a surface
*
* The field names and meanings mirror the x/y/w/h parameters of
* aom_img_set_rect().
*/
typedef struct aom_image_rect {
unsigned int x; /**< leftmost column */
unsigned int y; /**< topmost row */
unsigned int w; /**< width */
unsigned int h; /**< height */
} aom_image_rect_t; /**< alias for struct aom_image_rect */
/*!\brief Open a descriptor, allocating storage for the underlying image
*
* Returns a descriptor for storing an image of the given format. Storage
* for the underlying image is allocated by this call; the descriptor itself
* is allocated on the heap only when img is NULL.
*
* \param[in] img Pointer to storage for descriptor. If this parameter
* is NULL, the storage for the descriptor will be
* allocated on the heap.
* \param[in] fmt Format for the image
* \param[in] d_w Width of the image
* \param[in] d_h Height of the image
* \param[in] align Alignment, in bytes, of the image buffer and
* each row in the image (stride).
*
* \return Returns a pointer to the initialized image descriptor. If the img
* parameter is non-null, the value of the img parameter will be
* returned.
*/
aom_image_t *aom_img_alloc(aom_image_t *img, aom_img_fmt_t fmt,
unsigned int d_w, unsigned int d_h,
unsigned int align);
/*!\brief Open a descriptor, using existing storage for the underlying image
*
* Returns a descriptor for storing an image of the given format. The
* storage for the image (img_data) has been allocated elsewhere, and a
* descriptor is desired to "wrap" that storage.
*
* \param[in] img Pointer to storage for descriptor. If this parameter
* is NULL, the storage for the descriptor will be
* allocated on the heap.
* \param[in] fmt Format for the image
* \param[in] d_w Width of the image
* \param[in] d_h Height of the image
* \param[in] align Alignment, in bytes, of each row in the image.
* \param[in] img_data Storage to use for the image
*
* \return Returns a pointer to the initialized image descriptor. If the img
* parameter is non-null, the value of the img parameter will be
* returned.
*/
aom_image_t *aom_img_wrap(aom_image_t *img, aom_img_fmt_t fmt, unsigned int d_w,
unsigned int d_h, unsigned int align,
unsigned char *img_data);
/*!\brief Set the rectangle identifying the displayed portion of the image
*
* Updates the displayed rectangle (aka viewport) on the image surface to
* match the specified coordinates and size.
*
* \param[in,out] img Image descriptor to update
* \param[in] x leftmost column
* \param[in] y topmost row
* \param[in] w width
* \param[in] h height
*
* \return 0 if the requested rectangle is valid, nonzero otherwise.
*/
int aom_img_set_rect(aom_image_t *img, unsigned int x, unsigned int y,
unsigned int w, unsigned int h);
/*!\brief Flip the image vertically (top for bottom)
*
* Adjusts the image descriptor's pointers and strides to make the image
* be referenced upside-down.
*
* \param[in,out] img Image descriptor whose pointers and strides are
* adjusted
*/
void aom_img_flip(aom_image_t *img);
/*!\brief Close an image descriptor
*
* Frees all allocated storage associated with an image descriptor.
*
* NOTE(review): presumably the private img_data_owner / self_allocd fields of
* aom_image_t decide which storage is released -- confirm in the
* implementation.
*
* \param[in] img Image descriptor
*/
void aom_img_free(aom_image_t *img);
#ifdef __cplusplus
} // extern "C"
#endif
#endif // AOM_AOM_IMAGE_H_

View File

@@ -1,64 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifndef AOM_AOM_INTEGER_H_
#define AOM_AOM_INTEGER_H_
/* get ptrdiff_t, size_t, wchar_t, NULL */
#include <stddef.h>
/* Inline keywords: AOM_FORCE_INLINE and AOM_INLINE map to the MSVC spellings
* when _MSC_VER is defined, and to the GCC-style spellings otherwise.
*/
#if defined(_MSC_VER)
#define AOM_FORCE_INLINE __forceinline
#define AOM_INLINE __inline
#else
#define AOM_FORCE_INLINE __inline__ __attribute__((always_inline))
// TODO(jbb): Allow a way to force inline off for older compilers.
#define AOM_INLINE inline
#endif
/* Fixed-width integer types: either hand-written typedefs (when
* AOM_EMULATE_INTTYPES is defined) or the ones from <stdint.h>.
*/
#if defined(AOM_EMULATE_INTTYPES)
/* NOTE(review): this branch defines only 8/16/32-bit types and uintptr_t;
* no 64-bit types are provided here -- confirm that is intended.
*/
typedef signed char int8_t;
typedef signed short int16_t;
typedef signed int int32_t;
typedef unsigned char uint8_t;
typedef unsigned short uint16_t;
typedef unsigned int uint32_t;
#ifndef _UINTPTR_T_DEFINED
typedef size_t uintptr_t;
#endif
#else
/* Most platforms have the C99 standard integer types. */
#if defined(__cplusplus)
/* Defined before including <stdint.h>, presumably so that C++ translation
* units also get the format and limit macros.
*/
#if !defined(__STDC_FORMAT_MACROS)
#define __STDC_FORMAT_MACROS
#endif
#if !defined(__STDC_LIMIT_MACROS)
#define __STDC_LIMIT_MACROS
#endif
#endif // __cplusplus
#include <stdint.h>
#endif
/* VS2010 defines stdint.h, but not inttypes.h.
* For MSVC with _MSC_VER below 1800 only PRId64 is defined here; no other
* PRI* format macro is provided on that path. All other compilers get
* <inttypes.h>.
*/
#if defined(_MSC_VER) && _MSC_VER < 1800
#define PRId64 "I64d"
#else
#include <inttypes.h>
#endif
#endif // AOM_AOM_INTEGER_H_

View File

@@ -1,759 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifndef AOM_AOMCX_H_
#define AOM_AOMCX_H_
/*!\defgroup aom_encoder AOMedia AOM/AV1 Encoder
* \ingroup aom
*
* @{
*/
#include "./aom.h"
#include "./aom_encoder.h"
/*!\file
* \brief Provides definitions for using AOM or AV1 encoder algorithm within the
* aom Codec Interface.
*/
#ifdef __cplusplus
extern "C" {
#endif
/*!\name Algorithm interface for AV1
*
* This interface provides the capability to encode raw AV1 streams.
* @{
*/
/* Interface descriptor for the AV1 encoder (defined elsewhere). */
extern aom_codec_iface_t aom_codec_av1_cx_algo;
/* Accessor returning a pointer to an AV1 encoder interface; presumably
* &aom_codec_av1_cx_algo -- confirm against the definition.
*/
extern aom_codec_iface_t *aom_codec_av1_cx(void);
/*!@} - end algorithm interface member group*/
/*
* Algorithm Flags
*
* Each flag below occupies its own bit in the range 16..24; the bit numbers
* do not follow the order in which the flags are listed.
*/
/*!\brief Don't reference the last frame
*
* When this flag is set, the encoder will not use the last frame as a
* predictor. When not set, the encoder will choose whether to use the
* last frame or not automatically.
*/
#define AOM_EFLAG_NO_REF_LAST (1 << 16)
/*!\brief Don't reference the golden frame
*
* When this flag is set, the encoder will not use the golden frame as a
* predictor. When not set, the encoder will choose whether to use the
* golden frame or not automatically.
*/
#define AOM_EFLAG_NO_REF_GF (1 << 17)
/*!\brief Don't reference the alternate reference frame
*
* When this flag is set, the encoder will not use the alt ref frame as a
* predictor. When not set, the encoder will choose whether to use the
* alt ref frame or not automatically.
*/
#define AOM_EFLAG_NO_REF_ARF (1 << 21)
/*!\brief Don't update the last frame
*
* When this flag is set, the encoder will not update the last frame with
* the contents of the current frame.
*/
#define AOM_EFLAG_NO_UPD_LAST (1 << 18)
/*!\brief Don't update the golden frame
*
* When this flag is set, the encoder will not update the golden frame with
* the contents of the current frame.
*/
#define AOM_EFLAG_NO_UPD_GF (1 << 22)
/*!\brief Don't update the alternate reference frame
*
* When this flag is set, the encoder will not update the alt ref frame with
* the contents of the current frame.
*/
#define AOM_EFLAG_NO_UPD_ARF (1 << 23)
/*!\brief Force golden frame update
*
* When this flag is set, the encoder copies the contents of the current frame
* to the golden frame buffer.
*/
#define AOM_EFLAG_FORCE_GF (1 << 19)
/*!\brief Force alternate reference frame update
*
* When this flag is set, the encoder copies the contents of the current frame
* to the alternate reference frame buffer.
*/
#define AOM_EFLAG_FORCE_ARF (1 << 24)
/*!\brief Disable entropy update
*
* When this flag is set, the encoder will not update its internal entropy
* model based on the entropy of this frame.
*/
#define AOM_EFLAG_NO_UPD_ENTROPY (1 << 20)
/*!\brief AVx encoder control functions
*
* This set of macros define the control functions available for AVx
* encoder interface.
*
* \sa #aom_codec_control
*/
enum aome_enc_control_id {
/*!\brief Codec control function to set which reference frame encoder can use.
*
* Supported in codecs: VP8, AV1
*/
AOME_USE_REFERENCE = 7,
/*!\brief Codec control function to pass an ROI map to encoder.
*
* Supported in codecs: VP8, AV1
*/
AOME_SET_ROI_MAP = 8,
/*!\brief Codec control function to pass an Active map to encoder.
*
* Supported in codecs: VP8, AV1
*/
AOME_SET_ACTIVEMAP,
/*!\brief Codec control function to set encoder scaling mode.
*
* Supported in codecs: VP8, AV1
*/
AOME_SET_SCALEMODE = 11,
/*!\brief Codec control function to set encoder internal speed settings.
*
* Changes in this value influences, among others, the encoder's selection
* of motion estimation methods. Values greater than 0 will increase encoder
* speed at the expense of quality.
*
* \note Valid range for VP8: -16..16
* \note Valid range for AV1: -8..8
*
* Supported in codecs: VP8, AV1
*/
AOME_SET_CPUUSED = 13,
/*!\brief Codec control function to enable automatic set and use alf frames.
*
* Supported in codecs: VP8, AV1
*/
AOME_SET_ENABLEAUTOALTREF,
#if CONFIG_EXT_REFS
/*!\brief Codec control function to enable automatic set and use
* bwd-pred frames.
*
* Supported in codecs: AV1
*/
AOME_SET_ENABLEAUTOBWDREF,
#endif // CONFIG_EXT_REFS
/*!\brief control function to set noise sensitivity
*
* 0: off, 1: OnYOnly, 2: OnYUV,
* 3: OnYUVAggressive, 4: Adaptive
*
* Supported in codecs: VP8
*/
AOME_SET_NOISE_SENSITIVITY,
/*!\brief Codec control function to set sharpness.
*
* Supported in codecs: VP8, AV1
*/
AOME_SET_SHARPNESS,
/*!\brief Codec control function to set the threshold for MBs treated static.
*
* Supported in codecs: VP8, AV1
*/
AOME_SET_STATIC_THRESHOLD,
/*!\brief Codec control function to set the number of token partitions.
*
* Supported in codecs: VP8
*/
AOME_SET_TOKEN_PARTITIONS,
/*!\brief Codec control function to get last quantizer chosen by the encoder.
*
* Return value uses internal quantizer scale defined by the codec.
*
* Supported in codecs: VP8, AV1
*/
AOME_GET_LAST_QUANTIZER,
/*!\brief Codec control function to get last quantizer chosen by the encoder.
*
* Return value uses the 0..63 scale as used by the rc_*_quantizer config
* parameters.
*
* Supported in codecs: VP8, AV1
*/
AOME_GET_LAST_QUANTIZER_64,
/*!\brief Codec control function to set the max no of frames to create arf.
*
* Supported in codecs: VP8, AV1
*/
AOME_SET_ARNR_MAXFRAMES,
/*!\brief Codec control function to set the filter strength for the arf.
*
* Supported in codecs: VP8, AV1
*/
AOME_SET_ARNR_STRENGTH,
/*!\deprecated control function to set the filter type to use for the arf. */
AOME_SET_ARNR_TYPE,
/*!\brief Codec control function to set visual tuning.
*
* Supported in codecs: VP8, AV1
*/
AOME_SET_TUNING,
/*!\brief Codec control function to set constrained quality level.
*
* \attention For this value to be used aom_codec_enc_cfg_t::g_usage must be
* set to #AOM_CQ.
* \note Valid range: 0..63
*
* Supported in codecs: VP8, AV1
*/
AOME_SET_CQ_LEVEL,
/*!\brief Codec control function to set Max data rate for Intra frames.
*
* This value controls additional clamping on the maximum size of a
* keyframe. It is expressed as a percentage of the average
* per-frame bitrate, with the special (and default) value 0 meaning
* unlimited, or no additional clamping beyond the codec's built-in
* algorithm.
*
* For example, to allocate no more than 4.5 frames worth of bitrate
* to a keyframe, set this to 450.
*
* Supported in codecs: VP8, AV1
*/
AOME_SET_MAX_INTRA_BITRATE_PCT,
/*!\brief Codec control function to set reference and update frame flags.
*
* Supported in codecs: VP8
*/
AOME_SET_FRAME_FLAGS,
/*!\brief Codec control function to set max data rate for Inter frames.
*
* This value controls additional clamping on the maximum size of an
* inter frame. It is expressed as a percentage of the average
* per-frame bitrate, with the special (and default) value 0 meaning
* unlimited, or no additional clamping beyond the codec's built-in
* algorithm.
*
* For example, to allow no more than 4.5 frames worth of bitrate
* to an inter frame, set this to 450.
*
* Supported in codecs: AV1
*/
AV1E_SET_MAX_INTER_BITRATE_PCT,
/*!\brief Boost percentage for Golden Frame in CBR mode.
*
* This value controls the amount of boost given to Golden Frame in
* CBR mode. It is expressed as a percentage of the average
* per-frame bitrate, with the special (and default) value 0 meaning
* the feature is off, i.e., no golden frame boost in CBR mode and
* average bitrate target is used.
*
* For example, to allow 100% more bits, i.e, 2X, in a golden frame
* than average frame, set this to 100.
*
* Supported in codecs: AV1
*/
AV1E_SET_GF_CBR_BOOST_PCT,
/*!\brief Codec control function to set encoder screen content mode.
*
* 0: off, 1: On, 2: On with more aggressive rate control.
*
* Supported in codecs: VP8
*/
AOME_SET_SCREEN_CONTENT_MODE,
/*!\brief Codec control function to set lossless encoding mode.
*
* AV1 can operate in lossless encoding mode, in which the bitstream
* produced will be able to decode and reconstruct a perfect copy of
* input source. This control function provides a mean to switch encoder
* into lossless coding mode(1) or normal coding mode(0) that may be lossy.
* 0 = lossy coding mode
* 1 = lossless coding mode
*
* By default, encoder operates in normal coding mode (maybe lossy).
*
* Supported in codecs: AV1
*/
AV1E_SET_LOSSLESS,
#if CONFIG_AOM_QM
/*!\brief Codec control function to encode with quantisation matrices.
*
* AOM can operate with default quantisation matrices dependent on
* quantisation level and block type.
* 0 = do not use quantisation matrices
* 1 = use quantisation matrices
*
* By default, the encoder operates without quantisation matrices.
*
* Supported in codecs: AOM
*/
AV1E_SET_ENABLE_QM,
/*!\brief Codec control function to set the min quant matrix flatness.
*
* AOM can operate with different ranges of quantisation matrices.
* As quantisation levels increase, the matrices get flatter. This
* control sets the minimum level of flatness from which the matrices
* are determined.
*
* By default, the encoder sets this minimum at half the available
* range.
*
* Supported in codecs: AOM
*/
AV1E_SET_QM_MIN,
/*!\brief Codec control function to set the max quant matrix flatness.
*
* AOM can operate with different ranges of quantisation matrices.
* As quantisation levels increase, the matrices get flatter. This
* control sets the maximum level of flatness possible.
*
* By default, the encoder sets this maximum at the top of the
* available range.
*
* Supported in codecs: AOM
*/
AV1E_SET_QM_MAX,
#endif
/*!\brief Codec control function to set number of tile columns.
*
* In encoding and decoding, AV1 allows an input image frame be partitioned
* into separated vertical tile columns, which can be encoded or decoded
* independently. This enables easy implementation of parallel encoding and
* decoding. This control requests the encoder to use column tiles in
* encoding an input frame, with number of tile columns (in Log2 unit) as
* the parameter:
* 0 = 1 tile column
* 1 = 2 tile columns
* 2 = 4 tile columns
* .....
* n = 2**n tile columns
* The requested tile columns will be capped by encoder based on image size
* limitation (The minimum width of a tile column is 256 pixel, the maximum
* is 4096).
*
* By default, the value is 0, i.e. one single column tile for entire image.
*
* Supported in codecs: AV1
*/
AV1E_SET_TILE_COLUMNS,
/*!\brief Codec control function to set number of tile rows.
*
* In encoding and decoding, AV1 allows an input image frame be partitioned
* into separated horizontal tile rows. Tile rows are encoded or decoded
* sequentially. Even though encoding/decoding of later tile rows depends on
* earlier ones, this allows the encoder to output data packets for tile rows
* prior to completely processing all tile rows in a frame, thereby reducing
* the latency in processing between input and output. The parameter
* for this control describes the number of tile rows, which has a valid
* range [0, 2]:
* 0 = 1 tile row
* 1 = 2 tile rows
* 2 = 4 tile rows
*
* By default, the value is 0, i.e. one single row tile for entire image.
*
* Supported in codecs: AV1
*/
AV1E_SET_TILE_ROWS,
/*!\brief Codec control function to enable frame parallel decoding feature.
*
* AV1 has a bitstream feature to reduce decoding dependency between frames
* by turning off backward update of probability context used in encoding
* and decoding. This allows staged parallel processing of more than one
* video frames in the decoder. This control function provides a mean to
* turn this feature on or off for bitstreams produced by encoder.
*
* By default, this feature is off.
*
* Supported in codecs: AV1
*/
AV1E_SET_FRAME_PARALLEL_DECODING,
/*!\brief Codec control function to set adaptive quantization mode.
*
* AV1 has a segment based feature that allows encoder to adaptively change
* quantization parameter for each segment within a frame to improve the
* subjective quality. This control makes encoder operate in one of the
* several AQ_modes supported.
*
* By default, encoder operates with AQ_Mode 0(adaptive quantization off).
*
* Supported in codecs: AV1
*/
AV1E_SET_AQ_MODE,
/*!\brief Codec control function to enable/disable periodic Q boost.
*
* One AV1 encoder speed feature is to enable quality boost by lowering
* frame level Q periodically. This control function provides a mean to
* turn on/off this feature.
* 0 = off
* 1 = on
*
* By default, the encoder is allowed to use this feature for appropriate
* encoding modes.
*
* Supported in codecs: AV1
*/
AV1E_SET_FRAME_PERIODIC_BOOST,
/*!\brief Codec control function to set noise sensitivity.
*
* 0: off, 1: On(YOnly)
*
* Supported in codecs: AV1
*/
AV1E_SET_NOISE_SENSITIVITY,
/*!\brief Codec control function to set content type.
* \note Valid parameter range:
* AOM_CONTENT_DEFAULT = Regular video content (Default)
* AOM_CONTENT_SCREEN = Screen capture content
*
* Supported in codecs: AV1
*/
AV1E_SET_TUNE_CONTENT,
/*!\brief Codec control function to set color space info.
* \note Valid ranges: 0..7, default is "UNKNOWN".
* 0 = UNKNOWN,
* 1 = BT_601
* 2 = BT_709
* 3 = SMPTE_170
* 4 = SMPTE_240
* 5 = BT_2020
* 6 = RESERVED
* 7 = SRGB
*
* Supported in codecs: AV1
*/
AV1E_SET_COLOR_SPACE,
/*!\brief Codec control function to set minimum interval between GF/ARF frames
*
* By default the value is set as 4.
*
* Supported in codecs: AV1
*/
AV1E_SET_MIN_GF_INTERVAL,
/*!\brief Codec control function to set maximum interval between GF/ARF frames
*
* By default the value is set as 16.
*
* Supported in codecs: AV1
*/
AV1E_SET_MAX_GF_INTERVAL,
/*!\brief Codec control function to get an Active map back from the encoder.
*
* Supported in codecs: AV1
*/
AV1E_GET_ACTIVEMAP,
/*!\brief Codec control function to set color range bit.
* \note Valid ranges: 0..1, default is 0
* 0 = Limited range (16..235 or HBD equivalent)
* 1 = Full range (0..255 or HBD equivalent)
*
* Supported in codecs: AV1
*/
AV1E_SET_COLOR_RANGE,
/*!\brief Codec control function to set intended rendering image size.
*
* By default, this is identical to the image size in pixels.
*
* Supported in codecs: AV1
*/
AV1E_SET_RENDER_SIZE,
/*!\brief Codec control function to set target level.
*
* 255: off (default); 0: only keep level stats; 10: target for level 1.0;
* 11: target for level 1.1; ... 62: target for level 6.2
*
* Supported in codecs: AV1
*/
AV1E_SET_TARGET_LEVEL,
/*!\brief Codec control function to get bitstream level.
*
* Supported in codecs: AV1
*/
AV1E_GET_LEVEL,
/*!\brief Codec control function to set intended superblock size.
*
* By default, the superblock size is determined separately for each
* frame by the encoder.
*
* Supported in codecs: AV1
*/
AV1E_SET_SUPERBLOCK_SIZE,
};
/*!\brief aom 1-D scaling mode
 *
 * Constants naming the scaling modes that can be selected along a single
 * image dimension. The numeric values are part of the public interface.
 */
enum aom_scaling_mode_1d {
  AOME_NORMAL = 0,
  AOME_FOURFIVE = 1,
  AOME_THREEFIVE = 2,
  AOME_ONETWO = 3
};

/*!\brief Convenience name for ::aom_scaling_mode_1d */
typedef enum aom_scaling_mode_1d AOM_SCALING_MODE;
/*!\brief aom region of interest map
 *
 * Data structure describing a region of interest map: one segment id per
 * 16x16 region plus a set of per-segment adjustments.
 */
struct aom_roi_map {
  /*! An id between 0 and 3 for each 16x16 region within a frame. */
  unsigned char *roi_map;
  /*! Number of rows. */
  unsigned int rows;
  /*! Number of columns. */
  unsigned int cols;

  /* TODO(paulwilkins): broken for AV1 which has 8 segments
   * q and loop filter deltas for each segment
   * (see MAX_MB_SEGMENTS)
   */
  /*! Quantizer deltas. */
  int delta_q[4];
  /*! Loop filter deltas. */
  int delta_lf[4];
  /*! Static breakout threshold for each segment. */
  unsigned int static_threshold[4];
};

/*!\brief Convenience name for struct ::aom_roi_map */
typedef struct aom_roi_map aom_roi_map_t;
/*!\brief aom active region map
 *
 * Data structure describing an active region map.
 */
struct aom_active_map {
  /*!\brief specify an on (1) or off (0) each 16x16 region within a frame */
  unsigned char *active_map;
  /*! number of rows */
  unsigned int rows;
  /*! number of cols */
  unsigned int cols;
};

/*!\brief Convenience name for struct ::aom_active_map */
typedef struct aom_active_map aom_active_map_t;
/*!\brief aom image scaling mode
 *
 * Pairs one 1-D scaling mode for each image dimension.
 */
struct aom_scaling_mode {
  /*! horizontal scaling mode */
  AOM_SCALING_MODE h_scaling_mode;
  /*! vertical scaling mode */
  AOM_SCALING_MODE v_scaling_mode;
};

/*!\brief Convenience name for struct ::aom_scaling_mode */
typedef struct aom_scaling_mode aom_scaling_mode_t;
/*!\brief VP8 token partition mode
 *
 * Selects the VP8 partitioning mode for compressed data, i.e. how many
 * sub-streams the bitstream is split into. Used for parallelized decoding.
 */
typedef enum {
  AOM_ONE_TOKENPARTITION = 0,  /**< value 0 */
  AOM_TWO_TOKENPARTITION = 1,  /**< value 1 */
  AOM_FOUR_TOKENPARTITION = 2, /**< value 2 */
  AOM_EIGHT_TOKENPARTITION = 3 /**< value 3 */
} aome_token_partitions;
/*!\brief AV1 encoder content type
 *
 * Values accepted by the AV1E_SET_TUNE_CONTENT control.
 */
typedef enum {
  AOM_CONTENT_DEFAULT, /**< Regular video content (Default) */
  AOM_CONTENT_SCREEN,  /**< Screen capture content */
  AOM_CONTENT_INVALID  /**< Last enumerator; not listed as a valid parameter */
} aom_tune_content;
/*!\brief VP8 model tuning parameters
 *
 * Selects which kind of input material / metric the encoder tunes for.
 */
typedef enum {
  AOM_TUNE_PSNR,
  AOM_TUNE_SSIM
} aom_tune_metric;
/*!\cond */
/*!\brief AOM encoder control function parameter type
 *
 * Declares, for each AOME_* / AV1E_* encoder control id, the data type that
 * the control takes. Every declaration is paired with an (empty)
 * AOM_CTRL_<id> marker macro. Note that additional common controls are
 * defined in aom.h
 *
 */
AOM_CTRL_USE_TYPE_DEPRECATED(AOME_USE_REFERENCE, int)
#define AOM_CTRL_AOME_USE_REFERENCE
AOM_CTRL_USE_TYPE(AOME_SET_FRAME_FLAGS, int)
#define AOM_CTRL_AOME_SET_FRAME_FLAGS
AOM_CTRL_USE_TYPE(AOME_SET_ROI_MAP, aom_roi_map_t *)
#define AOM_CTRL_AOME_SET_ROI_MAP
AOM_CTRL_USE_TYPE(AOME_SET_ACTIVEMAP, aom_active_map_t *)
#define AOM_CTRL_AOME_SET_ACTIVEMAP
AOM_CTRL_USE_TYPE(AOME_SET_SCALEMODE, aom_scaling_mode_t *)
#define AOM_CTRL_AOME_SET_SCALEMODE
AOM_CTRL_USE_TYPE(AOME_SET_CPUUSED, int)
#define AOM_CTRL_AOME_SET_CPUUSED
AOM_CTRL_USE_TYPE(AOME_SET_ENABLEAUTOALTREF, unsigned int)
#define AOM_CTRL_AOME_SET_ENABLEAUTOALTREF
#if CONFIG_EXT_REFS
AOM_CTRL_USE_TYPE(AOME_SET_ENABLEAUTOBWDREF, unsigned int)
#define AOM_CTRL_AOME_SET_ENABLEAUTOBWDREF
#endif // CONFIG_EXT_REFS
AOM_CTRL_USE_TYPE(AOME_SET_NOISE_SENSITIVITY, unsigned int)
#define AOM_CTRL_AOME_SET_NOISE_SENSITIVITY
AOM_CTRL_USE_TYPE(AOME_SET_SHARPNESS, unsigned int)
#define AOM_CTRL_AOME_SET_SHARPNESS
AOM_CTRL_USE_TYPE(AOME_SET_STATIC_THRESHOLD, unsigned int)
#define AOM_CTRL_AOME_SET_STATIC_THRESHOLD
AOM_CTRL_USE_TYPE(AOME_SET_TOKEN_PARTITIONS, int) /* aome_token_partitions */
#define AOM_CTRL_AOME_SET_TOKEN_PARTITIONS
AOM_CTRL_USE_TYPE(AOME_SET_ARNR_MAXFRAMES, unsigned int)
#define AOM_CTRL_AOME_SET_ARNR_MAXFRAMES
AOM_CTRL_USE_TYPE(AOME_SET_ARNR_STRENGTH, unsigned int)
#define AOM_CTRL_AOME_SET_ARNR_STRENGTH
AOM_CTRL_USE_TYPE_DEPRECATED(AOME_SET_ARNR_TYPE, unsigned int)
#define AOM_CTRL_AOME_SET_ARNR_TYPE
AOM_CTRL_USE_TYPE(AOME_SET_TUNING, int) /* aom_tune_metric */
#define AOM_CTRL_AOME_SET_TUNING
AOM_CTRL_USE_TYPE(AOME_SET_CQ_LEVEL, unsigned int)
#define AOM_CTRL_AOME_SET_CQ_LEVEL
AOM_CTRL_USE_TYPE(AV1E_SET_TILE_COLUMNS, int)
#define AOM_CTRL_AV1E_SET_TILE_COLUMNS
AOM_CTRL_USE_TYPE(AV1E_SET_TILE_ROWS, int)
#define AOM_CTRL_AV1E_SET_TILE_ROWS
AOM_CTRL_USE_TYPE(AOME_GET_LAST_QUANTIZER, int *)
#define AOM_CTRL_AOME_GET_LAST_QUANTIZER
AOM_CTRL_USE_TYPE(AOME_GET_LAST_QUANTIZER_64, int *)
#define AOM_CTRL_AOME_GET_LAST_QUANTIZER_64
AOM_CTRL_USE_TYPE(AOME_SET_MAX_INTRA_BITRATE_PCT, unsigned int)
#define AOM_CTRL_AOME_SET_MAX_INTRA_BITRATE_PCT
AOM_CTRL_USE_TYPE(AOME_SET_MAX_INTER_BITRATE_PCT, unsigned int)
#define AOM_CTRL_AOME_SET_MAX_INTER_BITRATE_PCT
AOM_CTRL_USE_TYPE(AOME_SET_SCREEN_CONTENT_MODE, unsigned int)
#define AOM_CTRL_AOME_SET_SCREEN_CONTENT_MODE
AOM_CTRL_USE_TYPE(AV1E_SET_GF_CBR_BOOST_PCT, unsigned int)
#define AOM_CTRL_AV1E_SET_GF_CBR_BOOST_PCT
AOM_CTRL_USE_TYPE(AV1E_SET_LOSSLESS, unsigned int)
#define AOM_CTRL_AV1E_SET_LOSSLESS
#if CONFIG_AOM_QM
AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_QM, unsigned int)
#define AOM_CTRL_AV1E_SET_ENABLE_QM
AOM_CTRL_USE_TYPE(AV1E_SET_QM_MIN, unsigned int)
#define AOM_CTRL_AV1E_SET_QM_MIN
AOM_CTRL_USE_TYPE(AV1E_SET_QM_MAX, unsigned int)
#define AOM_CTRL_AV1E_SET_QM_MAX
#endif
AOM_CTRL_USE_TYPE(AV1E_SET_FRAME_PARALLEL_DECODING, unsigned int)
#define AOM_CTRL_AV1E_SET_FRAME_PARALLEL_DECODING
AOM_CTRL_USE_TYPE(AV1E_SET_AQ_MODE, unsigned int)
#define AOM_CTRL_AV1E_SET_AQ_MODE
AOM_CTRL_USE_TYPE(AV1E_SET_FRAME_PERIODIC_BOOST, unsigned int)
#define AOM_CTRL_AV1E_SET_FRAME_PERIODIC_BOOST
AOM_CTRL_USE_TYPE(AV1E_SET_NOISE_SENSITIVITY, unsigned int)
#define AOM_CTRL_AV1E_SET_NOISE_SENSITIVITY
AOM_CTRL_USE_TYPE(AV1E_SET_TUNE_CONTENT, int) /* aom_tune_content */
#define AOM_CTRL_AV1E_SET_TUNE_CONTENT
AOM_CTRL_USE_TYPE(AV1E_SET_COLOR_SPACE, int)
#define AOM_CTRL_AV1E_SET_COLOR_SPACE
AOM_CTRL_USE_TYPE(AV1E_SET_MIN_GF_INTERVAL, unsigned int)
#define AOM_CTRL_AV1E_SET_MIN_GF_INTERVAL
AOM_CTRL_USE_TYPE(AV1E_SET_MAX_GF_INTERVAL, unsigned int)
#define AOM_CTRL_AV1E_SET_MAX_GF_INTERVAL
AOM_CTRL_USE_TYPE(AV1E_GET_ACTIVEMAP, aom_active_map_t *)
#define AOM_CTRL_AV1E_GET_ACTIVEMAP
AOM_CTRL_USE_TYPE(AV1E_SET_COLOR_RANGE, int)
#define AOM_CTRL_AV1E_SET_COLOR_RANGE
/* TODO(rbultje) : add support of the control in ffmpeg
 *
 * Unlike every other pair in this list, the marker macro for
 * AV1E_SET_RENDER_SIZE comes before its type declaration.
 */
#define AOM_CTRL_AV1E_SET_RENDER_SIZE
AOM_CTRL_USE_TYPE(AV1E_SET_RENDER_SIZE, int *)
AOM_CTRL_USE_TYPE(AV1E_SET_SUPERBLOCK_SIZE, unsigned int)
#define AOM_CTRL_AV1E_SET_SUPERBLOCK_SIZE
AOM_CTRL_USE_TYPE(AV1E_SET_TARGET_LEVEL, unsigned int)
#define AOM_CTRL_AV1E_SET_TARGET_LEVEL
AOM_CTRL_USE_TYPE(AV1E_GET_LEVEL, int *)
#define AOM_CTRL_AV1E_GET_LEVEL
/*!\endcond */
/*! @} - end defgroup vp8_encoder */
#ifdef __cplusplus
} // extern "C"
#endif
#endif // AOM_AOMCX_H_

View File

@@ -1,191 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
/*!\defgroup aom_decoder AOMedia AOM/AV1 Decoder
* \ingroup aom
*
* @{
*/
/*!\file
* \brief Provides definitions for using AOM or AV1 within the aom Decoder
* interface.
*/
#ifndef AOM_AOMDX_H_
#define AOM_AOMDX_H_
#ifdef __cplusplus
extern "C" {
#endif
/* Include controls common to both the encoder and decoder */
#include "./aom.h"
/*!\name Algorithm interface for AV1
 *
 * This interface provides the capability to decode AV1 streams.
 * @{
 */
/*!\brief The AV1 decoder algorithm interface structure. */
extern aom_codec_iface_t aom_codec_av1_dx_algo;
/*!\brief Getter function for the AV1 decoder interface.
 *
 * \return Pointer to an ::aom_codec_iface_t (presumably
 *         &aom_codec_av1_dx_algo, following the _algo naming convention;
 *         confirm against the definition).
 */
extern aom_codec_iface_t *aom_codec_av1_dx(void);
/*!@} - end algorithm interface member group*/
/** Data structure that stores bit accounting for debug.
 *
 * Only forward-declared here; a pointer to it is retrieved through the
 * AV1_GET_ACCOUNTING control.
 */
typedef struct Accounting Accounting;
/*!\enum aom_dec_control_id
 * \brief AOM decoder control functions
 *
 * This set of constants defines the control functions available for the AOM
 * decoder interface. Values are assigned sequentially starting at
 * AOM_DECODER_CTRL_ID_START, so the order of the enumerators must not be
 * changed.
 *
 * \sa #aom_codec_control
 */
enum aom_dec_control_id {
/** control function to get info on which reference frames were updated
 * by the last decode
 */
AOMD_GET_LAST_REF_UPDATES = AOM_DECODER_CTRL_ID_START,
/** check if the indicated frame is corrupted */
AOMD_GET_FRAME_CORRUPTED,
/** control function to get info on which reference frames were used
 * by the last decode
 */
AOMD_GET_LAST_REF_USED,
/** decryption function to decrypt encoded buffer data immediately
 * before decoding. Takes an aom_decrypt_init, which contains
 * a callback function and opaque context pointer.
 */
AOMD_SET_DECRYPTOR,
/** control function to get the dimensions that the current frame is decoded
 * at. This may be different to the intended display size for the frame as
 * specified in the wrapper or frame header (see AV1D_GET_DISPLAY_SIZE). */
AV1D_GET_FRAME_SIZE,
/** control function to get the current frame's intended display dimensions
 * (as specified in the wrapper or frame header). This may be different to
 * the decoded dimensions of this frame (see AV1D_GET_FRAME_SIZE). */
AV1D_GET_DISPLAY_SIZE,
/** control function to get the bit depth of the stream. */
AV1D_GET_BIT_DEPTH,
/** control function to set the byte alignment of the planes in the reference
 * buffers. Valid values are power of 2, from 32 to 1024. A value of 0 sets
 * legacy alignment. I.e. Y plane is aligned to 32 bytes, U plane directly
 * follows Y plane, and V plane directly follows U plane. Default value is 0.
 */
AV1_SET_BYTE_ALIGNMENT,
/** control function to invert the decoding order to from right to left. The
 * function is used in a test to confirm the decoding independence of tile
 * columns. The function may be used in application where this order
 * of decoding is desired.
 *
 * TODO(yaowu): Rework the unit test that uses this control, and in a future
 * release, this test-only control shall be removed.
 */
AV1_INVERT_TILE_DECODE_ORDER,
/** control function to set the skip loop filter flag. Valid values are
 * integers. The decoder will skip the loop filter when its value is set to
 * nonzero. If the loop filter is skipped the decoder may accumulate decode
 * artifacts. The default value is 0.
 */
AV1_SET_SKIP_LOOP_FILTER,
/** control function to retrieve a pointer to the Accounting struct. When
 * compiled without --enable-accounting, this returns AOM_CODEC_INCAPABLE.
 * If called before a frame has been decoded, this returns AOM_CODEC_ERROR.
 * The caller should ensure that AOM_CODEC_OK is returned before attempting
 * to dereference the Accounting pointer.
 */
AV1_GET_ACCOUNTING,
/** NOTE(review): the two tile controls below are declared after this
 * marker, so their values are greater than AOM_DECODER_CTRL_ID_MAX.
 * Confirm that this is intended before relying on the marker as an upper
 * bound.
 */
AOM_DECODER_CTRL_ID_MAX,
/** control function to set the row of tile decoding. A value that is
 * greater than or equal to zero indicates only the specific row is
 * decoded. A value that is -1 indicates the whole row/column is decoded.
 * A special case is both the row and column values are -1, which means the
 * whole frame is decoded.
 */
AV1_SET_DECODE_TILE_ROW,
/** control function to set the column of tile decoding; same value
 * semantics as AV1_SET_DECODE_TILE_ROW.
 */
AV1_SET_DECODE_TILE_COL
};
/** Decrypt n bytes of data from input -> output, using the decrypt_state
 * passed in AOMD_SET_DECRYPTOR.
 *
 * \param[in]  decrypt_state  Opaque state supplied via aom_decrypt_init
 * \param[in]  input          Data to decrypt
 * \param[out] output         Destination for the decrypted data
 * \param[in]  count          Number of bytes to decrypt
 */
typedef void (*aom_decrypt_cb)(void *decrypt_state, const unsigned char *input,
                               unsigned char *output, int count);

/*!\brief Structure to hold decryption state
 *
 * Defines a structure to hold the decryption state and access function.
 * This is the argument type of the AOMD_SET_DECRYPTOR control.
 *
 * The former "deprecated alias" declaration
 * (typedef aom_decrypt_init aom_decrypt_init;) aliased the type to itself:
 * it added no name and is a typedef redefinition error for compilers that
 * predate C11, so it is no longer emitted.
 */
typedef struct aom_decrypt_init {
  /*! Decrypt callback. */
  aom_decrypt_cb decrypt_cb;
  /*! Decryption state. */
  void *decrypt_state;
} aom_decrypt_init;
/*!\cond */
/*!\brief AOM decoder control function parameter type
 *
 * Defines the data types that AOMD/AV1D/AV1 decoder control functions take.
 * Every control id is paired with an (empty) AOM_CTRL_<id> marker macro.
 * Note that additional common controls are defined in aom.h
 *
 */
AOM_CTRL_USE_TYPE(AOMD_GET_LAST_REF_UPDATES, int *)
#define AOM_CTRL_AOMD_GET_LAST_REF_UPDATES
AOM_CTRL_USE_TYPE(AOMD_GET_FRAME_CORRUPTED, int *)
#define AOM_CTRL_AOMD_GET_FRAME_CORRUPTED
AOM_CTRL_USE_TYPE(AOMD_GET_LAST_REF_USED, int *)
#define AOM_CTRL_AOMD_GET_LAST_REF_USED
AOM_CTRL_USE_TYPE(AOMD_SET_DECRYPTOR, aom_decrypt_init *)
#define AOM_CTRL_AOMD_SET_DECRYPTOR
AOM_CTRL_USE_TYPE(AV1D_GET_DISPLAY_SIZE, int *)
#define AOM_CTRL_AV1D_GET_DISPLAY_SIZE
AOM_CTRL_USE_TYPE(AV1D_GET_BIT_DEPTH, unsigned int *)
#define AOM_CTRL_AV1D_GET_BIT_DEPTH
AOM_CTRL_USE_TYPE(AV1D_GET_FRAME_SIZE, int *)
#define AOM_CTRL_AV1D_GET_FRAME_SIZE
/* AV1_SET_BYTE_ALIGNMENT and AV1_SET_SKIP_LOOP_FILTER are listed in
 * aom_dec_control_id and documented there as taking integer values, but
 * previously had no type declaration in this list.
 * NOTE(review): confirm `int` matches what the decoder implementation reads
 * for these two controls.
 */
AOM_CTRL_USE_TYPE(AV1_SET_BYTE_ALIGNMENT, int)
#define AOM_CTRL_AV1_SET_BYTE_ALIGNMENT
AOM_CTRL_USE_TYPE(AV1_INVERT_TILE_DECODE_ORDER, int)
#define AOM_CTRL_AV1_INVERT_TILE_DECODE_ORDER
AOM_CTRL_USE_TYPE(AV1_SET_SKIP_LOOP_FILTER, int)
#define AOM_CTRL_AV1_SET_SKIP_LOOP_FILTER
AOM_CTRL_USE_TYPE(AV1_GET_ACCOUNTING, Accounting **)
#define AOM_CTRL_AV1_GET_ACCOUNTING
AOM_CTRL_USE_TYPE(AV1_SET_DECODE_TILE_ROW, int)
#define AOM_CTRL_AV1_SET_DECODE_TILE_ROW
AOM_CTRL_USE_TYPE(AV1_SET_DECODE_TILE_COL, int)
#define AOM_CTRL_AV1_SET_DECODE_TILE_COL
/*!\endcond */
/*! @} - end defgroup aom_decoder */
#ifdef __cplusplus
} // extern "C"
#endif
#endif // AOM_AOMDX_H_

View File

@@ -1,16 +0,0 @@
text aom_codec_build_config
text aom_codec_control_
text aom_codec_destroy
text aom_codec_err_to_string
text aom_codec_error
text aom_codec_error_detail
text aom_codec_get_caps
text aom_codec_iface_name
text aom_codec_version
text aom_codec_version_extra_str
text aom_codec_version_str
text aom_img_alloc
text aom_img_flip
text aom_img_free
text aom_img_set_rect
text aom_img_wrap

View File

@@ -1,8 +0,0 @@
text aom_codec_dec_init_ver
text aom_codec_decode
text aom_codec_get_frame
text aom_codec_get_stream_info
text aom_codec_peek_stream_info
text aom_codec_register_put_frame_cb
text aom_codec_register_put_slice_cb
text aom_codec_set_frame_buffer_functions

View File

@@ -1,9 +0,0 @@
text aom_codec_enc_config_default
text aom_codec_enc_config_set
text aom_codec_enc_init_multi_ver
text aom_codec_enc_init_ver
text aom_codec_encode
text aom_codec_get_cx_data
text aom_codec_get_global_headers
text aom_codec_get_preview_frame
text aom_codec_set_cx_data_buf

View File

@@ -1,465 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
/*!\file
* \brief Describes the decoder algorithm interface for algorithm
* implementations.
*
* This file defines the private structures and data types that are only
* relevant to implementing an algorithm, as opposed to using it.
*
* To create a decoder algorithm class, an interface structure is put
* into the global namespace:
* <pre>
* my_codec.c:
* aom_codec_iface_t my_codec = {
* "My Codec v1.0",
* AOM_CODEC_ALG_ABI_VERSION,
* ...
* };
* </pre>
*
* An application instantiates a specific decoder instance by using
* aom_codec_init() and a pointer to the algorithm's interface structure:
* <pre>
* my_app.c:
* extern aom_codec_iface_t my_codec;
* {
* aom_codec_ctx_t algo;
* res = aom_codec_init(&algo, &my_codec);
* }
* </pre>
*
* Once initialized, the instance is managed using other functions from
* the aom_codec_* family.
*/
#ifndef AOM_INTERNAL_AOM_CODEC_INTERNAL_H_
#define AOM_INTERNAL_AOM_CODEC_INTERNAL_H_
#include "./aom_config.h"
#include "../aom_decoder.h"
#include "../aom_encoder.h"
#include <stdarg.h>
#ifdef __cplusplus
extern "C" {
#endif
/*!\brief Current ABI version number
 *
 * \internal
 * If this file is altered in any way that changes the ABI, this value
 * must be bumped. Examples include, but are not limited to, changing
 * types, removing or reassigning enums, adding/removing/rearranging
 * fields to structures
 */
#define AOM_CODEC_INTERNAL_ABI_VERSION (5) /**<\hideinitializer*/
/*!\brief Algorithm private context; only forward-declared in this header. */
typedef struct aom_codec_alg_priv aom_codec_alg_priv_t;
/*!\brief Multi-resolution encoding internal configuration
 * (see struct aom_codec_priv_enc_mr_cfg).
 */
typedef struct aom_codec_priv_enc_mr_cfg aom_codec_priv_enc_mr_cfg_t;
/*!\brief init function pointer prototype
 *
 * Performs algorithm-specific initialization of the decoder context. This
 * function is called by the generic aom_codec_init() wrapper function, so
 * plugins implementing this interface may trust the input parameters to be
 * properly initialized.
 *
 * \param[in] ctx   Pointer to this instance's context
 * \param[in] data  Pointer to a multi-resolution encoding configuration
 *                  (NOTE(review): whether this may be NULL is not stated
 *                  here; confirm against callers)
 * \retval #AOM_CODEC_OK
 *     The input stream was recognized and decoder initialized.
 * \retval #AOM_CODEC_MEM_ERROR
 *     Memory operation failed.
 */
typedef aom_codec_err_t (*aom_codec_init_fn_t)(
aom_codec_ctx_t *ctx, aom_codec_priv_enc_mr_cfg_t *data);
/*!\brief destroy function pointer prototype
 *
 * Performs algorithm-specific destruction of the decoder context. This
 * function is called by the generic aom_codec_destroy() wrapper function,
 * so plugins implementing this interface may trust the input parameters
 * to be properly initialized.
 *
 * \param[in] ctx   Pointer to this instance's context
 * \retval #AOM_CODEC_OK
 *     Success. (NOTE(review): the previous description, "The input stream
 *     was recognized and decoder initialized", was copied from the init
 *     prototype; confirm the exact meaning for destroy.)
 * \retval #AOM_CODEC_MEM_ERROR
 *     Memory operation failed.
 */
typedef aom_codec_err_t (*aom_codec_destroy_fn_t)(aom_codec_alg_priv_t *ctx);
/*!\brief parse stream info function pointer prototype
 *
 * Performs high level parsing of the bitstream. This function is called by the
 * generic aom_codec_peek_stream_info() wrapper function, so plugins
 * implementing this interface may trust the input parameters to be properly
 * initialized. Unlike the other prototypes in this file, it takes no codec
 * context.
 *
 * \param[in]      data    Pointer to a block of data to parse
 * \param[in]      data_sz Size of the data buffer
 * \param[in,out]  si      Pointer to stream info to update. The size member
 *                         \ref MUST be properly initialized, but \ref MAY be
 *                         clobbered by the algorithm. This parameter \ref MAY
 *                         be NULL.
 *
 * \retval #AOM_CODEC_OK
 *     Bitstream is parsable and stream information updated
 */
typedef aom_codec_err_t (*aom_codec_peek_si_fn_t)(const uint8_t *data,
unsigned int data_sz,
aom_codec_stream_info_t *si);
/*!\brief get stream info function pointer prototype
 *
 * Returns information about the stream that has been parsed during decoding.
 *
 * \param[in]      ctx     Pointer to this instance's context
 * \param[in,out]  si      Pointer to stream info to update. The size member
 *                         \ref MUST be properly initialized, but \ref MAY be
 *                         clobbered by the algorithm. This parameter \ref MAY
 *                         be NULL.
 *
 * \retval #AOM_CODEC_OK
 *     Bitstream is parsable and stream information updated
 */
typedef aom_codec_err_t (*aom_codec_get_si_fn_t)(aom_codec_alg_priv_t *ctx,
aom_codec_stream_info_t *si);
/*!\brief control function pointer prototype
 *
 * This function is used to exchange algorithm specific data with the decoder
 * instance. This can be used to implement features specific to a particular
 * algorithm.
 *
 * This function is called by the generic aom_codec_control() wrapper
 * function, so plugins implementing this interface may trust the input
 * parameters to be properly initialized. However, this interface does not
 * provide type safety for the exchanged data or assign meanings to the
 * control codes. Those details should be specified in the algorithm's
 * header file. In particular, the control id that selected this function is
 * guaranteed to exist in the algorithm's control mapping table, and the data
 * carried in the argument list may be NULL.
 *
 * The control id is not passed to the function itself; the function is
 * looked up by id through ::aom_codec_ctrl_fn_map_t.
 *
 * \param[in]     ctx   Pointer to this instance's context
 * \param[in,out] ap    Variable argument list holding the data to exchange
 *                      with the algorithm instance.
 *
 * \retval #AOM_CODEC_OK
 *     The internal state data was deserialized.
 */
typedef aom_codec_err_t (*aom_codec_control_fn_t)(aom_codec_alg_priv_t *ctx,
va_list ap);
/*!\brief control function pointer mapping
 *
 * This structure stores the mapping between control identifiers and
 * implementing functions. Each algorithm provides a list of these
 * mappings. This list is searched by the aom_codec_control() wrapper
 * function to determine which function to invoke. The special
 * value {0, NULL} is used to indicate end-of-list, and must be
 * present. The special value {0, <non-null>} can be used as a catch-all
 * mapping. This implies that ctrl_id values chosen by the algorithm
 * \ref MUST be non-zero.
 *
 * Note that the typedef is const-qualified.
 */
typedef const struct aom_codec_ctrl_fn_map {
/*! Control identifier; 0 is reserved for the special entries above. */
int ctrl_id;
/*! Function implementing the control. */
aom_codec_control_fn_t fn;
} aom_codec_ctrl_fn_map_t;
/*!\brief decode data function pointer prototype
 *
 * Processes a buffer of coded data. If the processing results in a new
 * decoded frame becoming available, #AOM_CODEC_CB_PUT_SLICE and
 * #AOM_CODEC_CB_PUT_FRAME events are generated as appropriate. This
 * function is called by the generic aom_codec_decode() wrapper function,
 * so plugins implementing this interface may trust the input parameters
 * to be properly initialized.
 *
 * \param[in] ctx          Pointer to this instance's context
 * \param[in] data         Pointer to this block of new coded data. If
 *                         NULL, a #AOM_CODEC_CB_PUT_FRAME event is posted
 *                         for the previously decoded frame.
 * \param[in] data_sz      Size of the coded data, in bytes.
 * \param[in] user_priv    NOTE(review): not documented originally;
 *                         presumably application private data to associate
 *                         with this data (confirm against aom_codec_decode()).
 * \param[in] deadline     NOTE(review): not documented originally;
 *                         presumably a decode time budget (confirm units and
 *                         semantics against aom_codec_decode()).
 *
 * \return Returns #AOM_CODEC_OK if the coded data was processed completely
 *         and future pictures can be decoded without error. Otherwise,
 *         see the descriptions of the other error codes in ::aom_codec_err_t
 *         for recoverability capabilities.
 */
typedef aom_codec_err_t (*aom_codec_decode_fn_t)(aom_codec_alg_priv_t *ctx,
const uint8_t *data,
unsigned int data_sz,
void *user_priv,
long deadline);
/*!\brief Decoded frames iterator
 *
 * Iterates over a list of the frames available for display. The iterator
 * storage should be initialized to NULL to start the iteration. Iteration is
 * complete when this function returns NULL.
 *
 * The list of available frames becomes valid upon completion of the
 * aom_codec_decode call, and remains valid until the next call to
 * aom_codec_decode.
 *
 * \param[in]     ctx      Pointer to this instance's context
 * \param[in,out] iter     Iterator storage, initialized to NULL
 *
 * \return Returns a pointer to an image, if one is ready for display. Frames
 *         produced will always be in PTS (presentation time stamp) order.
 */
typedef aom_image_t *(*aom_codec_get_frame_fn_t)(aom_codec_alg_priv_t *ctx,
aom_codec_iter_t *iter);
/*!\brief Pass in external frame buffers for the decoder to use.
 *
 * Registers functions to be called when libaom needs a frame buffer
 * to decode the current frame and a function to be called when libaom does
 * not internally reference the frame buffer. This set function must
 * be called before the first call to decode or libaom will assume the
 * default behavior of allocating frame buffers internally.
 *
 * \param[in] ctx          Pointer to this instance's context
 * \param[in] cb_get       Pointer to the get callback function
 * \param[in] cb_release   Pointer to the release callback function
 * \param[in] cb_priv      Callback's private data
 *
 * \retval #AOM_CODEC_OK
 *     External frame buffers will be used by libaom.
 * \retval #AOM_CODEC_INVALID_PARAM
 *     One or more of the callbacks were NULL.
 * \retval #AOM_CODEC_ERROR
 *     Decoder context not initialized, or algorithm not capable of
 *     using external frame buffers.
 *
 * \note
 * When decoding AV1, the application may be required to pass in at least
 * #AOM_MAXIMUM_WORK_BUFFERS external frame
 * buffers.
 */
typedef aom_codec_err_t (*aom_codec_set_fb_fn_t)(
aom_codec_alg_priv_t *ctx, aom_get_frame_buffer_cb_fn_t cb_get,
aom_release_frame_buffer_cb_fn_t cb_release, void *cb_priv);
/* Encoder function pointer prototypes.
 *
 * Each type below is the type of the like-named member of
 * struct aom_codec_enc_iface (encode, get_cx_data, cfg_set, get_glob_hdrs,
 * get_preview, mr_get_mem_loc). Apart from the multi-resolution helper, they
 * all take the algorithm private context as their first argument.
 */
/*!\brief encode function pointer prototype */
typedef aom_codec_err_t (*aom_codec_encode_fn_t)(aom_codec_alg_priv_t *ctx,
const aom_image_t *img,
aom_codec_pts_t pts,
unsigned long duration,
aom_enc_frame_flags_t flags,
unsigned long deadline);
/*!\brief compressed data packet iterator function pointer prototype */
typedef const aom_codec_cx_pkt_t *(*aom_codec_get_cx_data_fn_t)(
aom_codec_alg_priv_t *ctx, aom_codec_iter_t *iter);
/*!\brief encoder configuration update function pointer prototype */
typedef aom_codec_err_t (*aom_codec_enc_config_set_fn_t)(
aom_codec_alg_priv_t *ctx, const aom_codec_enc_cfg_t *cfg);
/*!\brief global headers getter function pointer prototype */
typedef aom_fixed_buf_t *(*aom_codec_get_global_headers_fn_t)(
aom_codec_alg_priv_t *ctx);
/*!\brief preview frame getter function pointer prototype */
typedef aom_image_t *(*aom_codec_get_preview_frame_fn_t)(
aom_codec_alg_priv_t *ctx);
/*!\brief multi-resolution memory location getter function pointer prototype
 *
 * Takes an encoder configuration instead of a context and returns a location
 * through mem_loc.
 */
typedef aom_codec_err_t (*aom_codec_enc_mr_get_mem_loc_fn_t)(
const aom_codec_enc_cfg_t *cfg, void **mem_loc);
/*!\brief usage configuration mapping
 *
 * This structure stores the mapping between usage identifiers and
 * configuration structures. Each algorithm provides a list of these
 * mappings. This list is searched by the aom_codec_enc_config_default()
 * wrapper function to determine which config to return. The special value
 * {-1, {0}} is used to indicate end-of-list, and must be present. At least
 * one mapping must be present, in addition to the end-of-list.
 *
 * Note that the typedef is const-qualified.
 */
typedef const struct aom_codec_enc_cfg_map {
/*! Usage identifier; -1 marks the end-of-list entry. */
int usage;
/*! Configuration associated with this usage. */
aom_codec_enc_cfg_t cfg;
} aom_codec_enc_cfg_map_t;
/*!\brief Codec algorithm interface
 *
 * Holds the members common to every algorithm followed by a decoder-specific
 * (dec) and an encoder-specific (enc) group of function pointers.
 *
 * All decoders \ref MUST expose a variable of this type.
 */
struct aom_codec_iface {
const char *name; /**< Identification String */
int abi_version; /**< Implemented ABI version */
aom_codec_caps_t caps; /**< Decoder capabilities */
aom_codec_init_fn_t init; /**< \copydoc ::aom_codec_init_fn_t */
aom_codec_destroy_fn_t destroy; /**< \copydoc ::aom_codec_destroy_fn_t */
aom_codec_ctrl_fn_map_t *ctrl_maps; /**< \copydoc ::aom_codec_ctrl_fn_map_t */
/*! Decoder entry points. */
struct aom_codec_dec_iface {
aom_codec_peek_si_fn_t peek_si; /**< \copydoc ::aom_codec_peek_si_fn_t */
aom_codec_get_si_fn_t get_si; /**< \copydoc ::aom_codec_get_si_fn_t */
aom_codec_decode_fn_t decode; /**< \copydoc ::aom_codec_decode_fn_t */
aom_codec_get_frame_fn_t
get_frame; /**< \copydoc ::aom_codec_get_frame_fn_t */
aom_codec_set_fb_fn_t set_fb_fn; /**< \copydoc ::aom_codec_set_fb_fn_t */
} dec;
/*! Encoder entry points. */
struct aom_codec_enc_iface {
/*! Presumably the number of entries in cfg_maps; confirm against users. */
int cfg_map_count;
aom_codec_enc_cfg_map_t
*cfg_maps; /**< \copydoc ::aom_codec_enc_cfg_map_t */
aom_codec_encode_fn_t encode; /**< \copydoc ::aom_codec_encode_fn_t */
aom_codec_get_cx_data_fn_t
get_cx_data; /**< \copydoc ::aom_codec_get_cx_data_fn_t */
aom_codec_enc_config_set_fn_t
cfg_set; /**< \copydoc ::aom_codec_enc_config_set_fn_t */
aom_codec_get_global_headers_fn_t
get_glob_hdrs; /**< \copydoc ::aom_codec_get_global_headers_fn_t */
aom_codec_get_preview_frame_fn_t
get_preview; /**< \copydoc ::aom_codec_get_preview_frame_fn_t */
aom_codec_enc_mr_get_mem_loc_fn_t
mr_get_mem_loc; /**< \copydoc ::aom_codec_enc_mr_get_mem_loc_fn_t */
} enc;
};
/*!\brief Callback function pointer / user data pair storage
 *
 * The callback is stored in a union, so a given pair holds either a
 * put_frame or a put_slice callback, together with one user data pointer.
 */
typedef struct aom_codec_priv_cb_pair {
union {
aom_codec_put_frame_cb_fn_t put_frame; /**< Frame callback */
aom_codec_put_slice_cb_fn_t put_slice; /**< Slice callback */
} u;
void *user_priv; /**< User data stored alongside the callback */
} aom_codec_priv_cb_pair_t;
/*!\brief Instance private storage
 *
 * This structure is allocated by the algorithm's init function. It can be
 * extended in one of two ways. First, a second, algorithm specific structure
 * can be allocated and the priv member pointed to it. Alternatively, this
 * structure can be made the first member of the algorithm specific structure,
 * and the pointer cast to the proper type.
 */
struct aom_codec_priv {
const char *err_detail; /**< Error detail string */
aom_codec_flags_t init_flags; /**< Flags; presumably those given at init */
/*! Decoder-side state: the registered frame and slice callbacks. */
struct {
aom_codec_priv_cb_pair_t put_frame_cb;
aom_codec_priv_cb_pair_t put_slice_cb;
} dec;
/*! Encoder-side state. */
struct {
aom_fixed_buf_t cx_data_dst_buf;
unsigned int cx_data_pad_before;
unsigned int cx_data_pad_after;
aom_codec_cx_pkt_t cx_data_pkt;
unsigned int total_encoders;
} enc;
};
/*
 * Multi-resolution encoding internal configuration
 *
 * One instance is filled in per encoder by aom_codec_enc_init_multi_ver() and
 * passed to the interface's init function.
 */
struct aom_codec_priv_enc_mr_cfg {
unsigned int mr_total_resolutions; /* Number of encoders being initialized */
unsigned int mr_encoder_id; /* num_enc - 1 - index, so the last encoder is 0 */
struct aom_rational mr_down_sampling_factor; /* Copied from the caller's dsf */
void *mr_low_res_mode_info; /* Pointer obtained from mr_get_mem_loc() */
};
/* Redefine AOM_CTRL_USE_TYPE so that each use emits an accessor
 * id##__value() that pulls one argument of type typ out of a va_list. */
#undef AOM_CTRL_USE_TYPE
#define AOM_CTRL_USE_TYPE(id, typ) \
static AOM_INLINE typ id##__value(va_list args) { return va_arg(args, typ); }
/* Deprecated controls get exactly the same accessor here. */
#undef AOM_CTRL_USE_TYPE_DEPRECATED
#define AOM_CTRL_USE_TYPE_DEPRECATED(id, typ) \
static AOM_INLINE typ id##__value(va_list args) { return va_arg(args, typ); }
/* Fetch the argument for control `id` from va_list `arg` through the
 * accessor generated above. */
#define CAST(id, arg) id##__value(arg)
/* CODEC_INTERFACE convenience macro
 *
 * By convention, each codec interface is a struct with extern linkage, where
 * the symbol is suffixed with _algo. A getter function is also defined to
 * return a pointer to the struct, since in some cases it's easier to work
 * with text symbols than data symbols (see issue #169). This function has
 * the same name as the struct, less the _algo suffix. The CODEC_INTERFACE
 * macro is provided to define this getter function automatically.
 *
 * The expansion ends with the declarator `aom_codec_iface_t id##_algo` and no
 * terminator, so the use site completes the definition of the struct.
 */
#define CODEC_INTERFACE(id) \
aom_codec_iface_t *id(void) { return &id##_algo; } \
aom_codec_iface_t id##_algo
/* Internal Utility Functions
*
* The following functions are intended to be used inside algorithms as
* utilities for manipulating aom_codec_* data structures.
*/
/* Header of a fixed-capacity packet list. Storage beyond pkts[0] is provided
 * by the union produced by aom_codec_pkt_list_decl(). */
struct aom_codec_pkt_list {
unsigned int cnt; /* Packets currently stored */
unsigned int max; /* Capacity checked by aom_codec_pkt_list_add() */
struct aom_codec_cx_pkt pkts[1]; /* First slot; the list is indexed from here */
};
/* Declare an anonymous union type holding a packet list: `head` gives access
 * to the list header, `alloc` reserves the header followed by n packet slots.
 * aom_codec_pkt_list_init() sets the capacity of such a list to n. */
#define aom_codec_pkt_list_decl(n) \
union { \
struct aom_codec_pkt_list head; \
struct { \
struct aom_codec_pkt_list head; \
struct aom_codec_cx_pkt pkts[n]; \
} alloc; \
}
/* Reset a list declared with aom_codec_pkt_list_decl(): the packet count is
 * cleared and the capacity is set to the number of slots in alloc.pkts.
 *
 * The expansion is one parenthesized comma expression, so the macro behaves
 * as a single expression wherever it is used (initializer, function argument,
 * operand); its value is the capacity that was stored. */
#define aom_codec_pkt_list_init(m)   \
  ((m)->alloc.head.cnt = 0,          \
   (m)->alloc.head.max =             \
       sizeof((m)->alloc.pkts) / sizeof((m)->alloc.pkts[0]))
/* Append a copy of a packet to the list.
 * Returns 0 when the packet was stored, 1 when the list is already full. */
int aom_codec_pkt_list_add(struct aom_codec_pkt_list *,
const struct aom_codec_cx_pkt *);
/* Iterate over the stored packets. *iter must be NULL on the first call; each
 * call returns the next packet and advances *iter, and NULL is returned once
 * all stored packets have been visited. */
const aom_codec_cx_pkt_t *aom_codec_pkt_list_get(
struct aom_codec_pkt_list *list, aom_codec_iter_t *iter);
#include <stdio.h>
#include <setjmp.h>
/* Error state filled in by aom_internal_error(). */
struct aom_internal_error_info {
aom_codec_err_t error_code; /* Error passed to aom_internal_error() */
int has_detail; /* 1 when a format string was supplied, else 0 */
char detail[80]; /* Formatted, NUL-terminated detail message */
int setjmp; /* When nonzero, aom_internal_error() longjmps to jmp */
jmp_buf jmp; /* Jump target used when setjmp is nonzero */
};
/* CLANG_ANALYZER_NORETURN expands to the analyzer_noreturn attribute when the
 * compiler reports support for it through __has_feature, and to nothing
 * otherwise. */
#define CLANG_ANALYZER_NORETURN
#if defined(__has_feature)
#if __has_feature(attribute_analyzer_noreturn)
#undef CLANG_ANALYZER_NORETURN
#define CLANG_ANALYZER_NORETURN __attribute__((analyzer_noreturn))
#endif
#endif
/* Record an error code and an optional printf-style detail message in info.
 * If info->setjmp is nonzero the function longjmps to info->jmp instead of
 * returning. */
void aom_internal_error(struct aom_internal_error_info *info,
aom_codec_err_t error, const char *fmt,
...) CLANG_ANALYZER_NORETURN;
/* AOM_CHECK_MEM_ERROR(error_info, lval, expr): assign expr to lval and report
 * AOM_CODEC_MEM_ERROR through aom_internal_error() when the result is
 * null/zero. The message names the lval; debug builds also append the source
 * file and line. */
#if CONFIG_DEBUG
#define AOM_CHECK_MEM_ERROR(error_info, lval, expr) \
do { \
lval = (expr); \
if (!lval) \
aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
"Failed to allocate " #lval " at %s:%d", __FILE__, \
__LINE__); \
} while (0)
#else
#define AOM_CHECK_MEM_ERROR(error_info, lval, expr) \
do { \
lval = (expr); \
if (!lval) \
aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
"Failed to allocate " #lval); \
} while (0)
#endif
#ifdef __cplusplus
} // extern "C"
#endif
#endif // AOM_INTERNAL_AOM_CODEC_INTERNAL_H_

View File

@@ -1,134 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
/*!\file
* \brief Provides the high level interface to wrap decoder algorithms.
*
*/
#include <stdarg.h>
#include <stdlib.h>
#include "aom/aom_integer.h"
#include "aom/internal/aom_codec_internal.h"
#include "aom_version.h"
/* Store a status code in ctx->err when ctx is non-NULL and evaluate to that
 * code either way. Both arguments are parenthesized so that arbitrary
 * expressions (e.g. `&local_ctx`) expand correctly. */
#define SAVE_STATUS(ctx, var) ((ctx) ? ((ctx)->err = (var)) : (var))
/* Library version as a single integer (VERSION_PACKED). */
int aom_codec_version(void) {
  return VERSION_PACKED;
}

/* Library version as a string (VERSION_STRING_NOSP). */
const char *aom_codec_version_str(void) {
  return VERSION_STRING_NOSP;
}

/* Extra version text (VERSION_EXTRA). */
const char *aom_codec_version_extra_str(void) {
  return VERSION_EXTRA;
}
/* Name of the given interface, or a placeholder string when iface is NULL. */
const char *aom_codec_iface_name(aom_codec_iface_t *iface) {
  if (!iface) return "<invalid interface>";
  return iface->name;
}
/* Map an error code to a static, human-readable description. Codes that are
 * not listed yield "Unrecognized error code". */
const char *aom_codec_err_to_string(aom_codec_err_t err) {
  const char *text = "Unrecognized error code";

  switch (err) {
    case AOM_CODEC_OK: text = "Success"; break;
    case AOM_CODEC_ERROR: text = "Unspecified internal error"; break;
    case AOM_CODEC_MEM_ERROR: text = "Memory allocation error"; break;
    case AOM_CODEC_ABI_MISMATCH: text = "ABI version mismatch"; break;
    case AOM_CODEC_INCAPABLE:
      text = "Codec does not implement requested capability";
      break;
    case AOM_CODEC_UNSUP_BITSTREAM:
      text = "Bitstream not supported by this decoder";
      break;
    case AOM_CODEC_UNSUP_FEATURE:
      text = "Bitstream required feature not supported by this decoder";
      break;
    case AOM_CODEC_CORRUPT_FRAME: text = "Corrupt frame detected"; break;
    case AOM_CODEC_INVALID_PARAM: text = "Invalid parameter"; break;
    case AOM_CODEC_LIST_END: text = "End of iterated list"; break;
  }
  return text;
}
/* Describe the last error stored in ctx; a NULL ctx is reported as an
 * invalid parameter. */
const char *aom_codec_error(aom_codec_ctx_t *ctx) {
  if (!ctx) return aom_codec_err_to_string(AOM_CODEC_INVALID_PARAM);
  return aom_codec_err_to_string(ctx->err);
}
/* Detail string for the current error, or NULL when ctx is NULL or holds no
 * error. The private state's string is preferred when private state exists. */
const char *aom_codec_error_detail(aom_codec_ctx_t *ctx) {
  if (!ctx || !ctx->err) return NULL;
  if (ctx->priv) return ctx->priv->err_detail;
  return ctx->err_detail;
}
/* Tear down a codec instance through its interface's destroy hook and clear
 * the context's interface, name and private pointers. The resulting status is
 * also stored in ctx->err when ctx is non-NULL. */
aom_codec_err_t aom_codec_destroy(aom_codec_ctx_t *ctx) {
  aom_codec_err_t status = AOM_CODEC_OK;

  if (!ctx) {
    status = AOM_CODEC_INVALID_PARAM;
  } else if (!ctx->iface || !ctx->priv) {
    /* Nothing was (fully) initialized, so there is nothing to destroy. */
    status = AOM_CODEC_ERROR;
  } else {
    ctx->iface->destroy((aom_codec_alg_priv_t *)ctx->priv);
    ctx->iface = NULL;
    ctx->name = NULL;
    ctx->priv = NULL;
  }
  return SAVE_STATUS(ctx, status);
}
/* Capability bits of the interface; 0 for a NULL interface. */
aom_codec_caps_t aom_codec_get_caps(aom_codec_iface_t *iface) {
  if (!iface) return 0;
  return iface->caps;
}
/* Dispatch a control request to the first entry of the interface's control
 * map whose id is 0 or equals ctrl_id, passing it the variadic arguments.
 * AOM_CODEC_ERROR is returned when no entry matches. The status is stored in
 * ctx->err when ctx is non-NULL. */
aom_codec_err_t aom_codec_control_(aom_codec_ctx_t *ctx, int ctrl_id, ...) {
  aom_codec_err_t status;

  if (!ctx || !ctrl_id) {
    status = AOM_CODEC_INVALID_PARAM;
  } else if (!ctx->iface || !ctx->priv || !ctx->iface->ctrl_maps) {
    status = AOM_CODEC_ERROR;
  } else {
    aom_codec_ctrl_fn_map_t *map = ctx->iface->ctrl_maps;

    status = AOM_CODEC_ERROR;
    /* The map is terminated by an entry without a handler. */
    while (map && map->fn) {
      if (map->ctrl_id == 0 || map->ctrl_id == ctrl_id) {
        va_list args;

        va_start(args, ctrl_id);
        status = map->fn((aom_codec_alg_priv_t *)ctx->priv, args);
        va_end(args);
        break;
      }
      ++map;
    }
  }
  return SAVE_STATUS(ctx, status);
}
/* Record an error in info: the code is stored, and when fmt is non-NULL the
 * printf-style message is formatted into info->detail (always
 * NUL-terminated). If info->setjmp is set, control is transferred to
 * info->jmp with the error code as the longjmp value. */
void aom_internal_error(struct aom_internal_error_info *info,
                        aom_codec_err_t error, const char *fmt, ...) {
  info->error_code = error;
  info->has_detail = (fmt != NULL);

  if (fmt != NULL) {
    const size_t capacity = sizeof(info->detail);
    va_list args;

    va_start(args, fmt);
    vsnprintf(info->detail, capacity - 1, fmt, args);
    va_end(args);
    info->detail[capacity - 1] = '\0';
  }

  if (info->setjmp) longjmp(info->jmp, info->error_code);
}

View File

@@ -1,189 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
/*!\file
* \brief Provides the high level interface to wrap decoder algorithms.
*
*/
#include <string.h>
#include "aom/internal/aom_codec_internal.h"
/* Store a status code in ctx->err when ctx is non-NULL and evaluate to that
 * code either way. Both arguments are parenthesized so that arbitrary
 * expressions (e.g. `&local_ctx`) expand correctly. */
#define SAVE_STATUS(ctx, var) ((ctx) ? ((ctx)->err = (var)) : (var))
/* View the context's private pointer as the algorithm-specific type. */
static aom_codec_alg_priv_t *get_alg_priv(aom_codec_ctx_t *ctx) {
  aom_codec_alg_priv_t *const alg_priv = (aom_codec_alg_priv_t *)ctx->priv;
  return alg_priv;
}
/* Initialize a decoder context.
 *
 * Validates the ABI versions and that every requested flag is backed by a
 * capability of the interface, then zeroes ctx, fills it in and calls the
 * interface's init hook. When init fails the error detail is preserved in
 * ctx->err_detail and the context is destroyed again. The status is stored in
 * ctx->err when ctx is non-NULL. */
aom_codec_err_t aom_codec_dec_init_ver(aom_codec_ctx_t *ctx,
                                       aom_codec_iface_t *iface,
                                       const aom_codec_dec_cfg_t *cfg,
                                       aom_codec_flags_t flags, int ver) {
  aom_codec_err_t status;

  if (ver != AOM_DECODER_ABI_VERSION) {
    status = AOM_CODEC_ABI_MISMATCH;
  } else if (!ctx || !iface) {
    status = AOM_CODEC_INVALID_PARAM;
  } else if (iface->abi_version != AOM_CODEC_INTERNAL_ABI_VERSION) {
    status = AOM_CODEC_ABI_MISMATCH;
  } else if (((flags & AOM_CODEC_USE_POSTPROC) &&
              !(iface->caps & AOM_CODEC_CAP_POSTPROC)) ||
             ((flags & AOM_CODEC_USE_ERROR_CONCEALMENT) &&
              !(iface->caps & AOM_CODEC_CAP_ERROR_CONCEALMENT)) ||
             ((flags & AOM_CODEC_USE_INPUT_FRAGMENTS) &&
              !(iface->caps & AOM_CODEC_CAP_INPUT_FRAGMENTS)) ||
             !(iface->caps & AOM_CODEC_CAP_DECODER)) {
    /* A requested feature, or decoding itself, is not offered. */
    status = AOM_CODEC_INCAPABLE;
  } else {
    memset(ctx, 0, sizeof(*ctx));
    ctx->iface = iface;
    ctx->name = iface->name;
    ctx->priv = NULL;
    ctx->init_flags = flags;
    ctx->config.dec = cfg;

    status = ctx->iface->init(ctx, NULL);
    if (status) {
      /* Keep the detail string reachable after the private state is gone. */
      ctx->err_detail = ctx->priv ? ctx->priv->err_detail : NULL;
      aom_codec_destroy(ctx);
    }
  }
  return SAVE_STATUS(ctx, status);
}
/* Parse stream information from a data buffer without a decoder instance.
 * si->w and si->h are reset to 0 before the interface's peek_si hook runs.
 * Returns AOM_CODEC_INVALID_PARAM for NULL/empty arguments or when si->sz is
 * smaller than the stream-info structure. */
aom_codec_err_t aom_codec_peek_stream_info(aom_codec_iface_t *iface,
                                           const uint8_t *data,
                                           unsigned int data_sz,
                                           aom_codec_stream_info_t *si) {
  if (!iface || !data || !data_sz || !si ||
      si->sz < sizeof(aom_codec_stream_info_t)) {
    return AOM_CODEC_INVALID_PARAM;
  }

  /* Set default/unknown values */
  si->w = 0;
  si->h = 0;
  return iface->dec.peek_si(data, data_sz, si);
}
/* Query stream information from an initialized decoder. si->w and si->h are
 * reset to 0 before the interface's get_si hook runs. The status is stored in
 * ctx->err when ctx is non-NULL. */
aom_codec_err_t aom_codec_get_stream_info(aom_codec_ctx_t *ctx,
                                          aom_codec_stream_info_t *si) {
  aom_codec_err_t status;

  if (!ctx || !si || si->sz < sizeof(aom_codec_stream_info_t)) {
    status = AOM_CODEC_INVALID_PARAM;
  } else if (!ctx->iface || !ctx->priv) {
    status = AOM_CODEC_ERROR;
  } else {
    /* Set default/unknown values */
    si->w = 0;
    si->h = 0;
    status = ctx->iface->dec.get_si(get_alg_priv(ctx), si);
  }
  return SAVE_STATUS(ctx, status);
}
/* Feed a buffer of compressed data to the decoder. data and data_sz must
 * either both be set or both be NULL/0. The status is stored in ctx->err when
 * ctx is non-NULL. */
aom_codec_err_t aom_codec_decode(aom_codec_ctx_t *ctx, const uint8_t *data,
                                 unsigned int data_sz, void *user_priv,
                                 long deadline) {
  aom_codec_err_t status;
  /* A pointer without a size, or a size without a pointer, is invalid. */
  const int ptr_size_mismatch = ((data == NULL) != (data_sz == 0));

  if (!ctx || ptr_size_mismatch) {
    status = AOM_CODEC_INVALID_PARAM;
  } else if (!ctx->iface || !ctx->priv) {
    status = AOM_CODEC_ERROR;
  } else {
    status = ctx->iface->dec.decode(get_alg_priv(ctx), data, data_sz,
                                    user_priv, deadline);
  }
  return SAVE_STATUS(ctx, status);
}
/* Fetch the next decoded frame through the interface's get_frame hook.
 * Returns NULL when ctx or iter is NULL or the context is not initialized. */
aom_image_t *aom_codec_get_frame(aom_codec_ctx_t *ctx, aom_codec_iter_t *iter) {
  if (!ctx || !iter || !ctx->iface || !ctx->priv) return NULL;
  return ctx->iface->dec.get_frame(get_alg_priv(ctx), iter);
}
/* Register a frame callback and its user pointer in the decoder's private
 * state. Requires the AOM_CODEC_CAP_PUT_FRAME capability. The status is
 * stored in ctx->err when ctx is non-NULL. */
aom_codec_err_t aom_codec_register_put_frame_cb(aom_codec_ctx_t *ctx,
                                                aom_codec_put_frame_cb_fn_t cb,
                                                void *user_priv) {
  aom_codec_err_t status = AOM_CODEC_OK;

  if (!ctx || !cb) {
    status = AOM_CODEC_INVALID_PARAM;
  } else if (!ctx->iface || !ctx->priv ||
             !(ctx->iface->caps & AOM_CODEC_CAP_PUT_FRAME)) {
    status = AOM_CODEC_ERROR;
  } else {
    ctx->priv->dec.put_frame_cb.u.put_frame = cb;
    ctx->priv->dec.put_frame_cb.user_priv = user_priv;
  }
  return SAVE_STATUS(ctx, status);
}
/* Register a slice callback and its user pointer in the decoder's private
 * state. Requires the AOM_CODEC_CAP_PUT_SLICE capability. The status is
 * stored in ctx->err when ctx is non-NULL. */
aom_codec_err_t aom_codec_register_put_slice_cb(aom_codec_ctx_t *ctx,
                                                aom_codec_put_slice_cb_fn_t cb,
                                                void *user_priv) {
  aom_codec_err_t status = AOM_CODEC_OK;

  if (!ctx || !cb) {
    status = AOM_CODEC_INVALID_PARAM;
  } else if (!ctx->iface || !ctx->priv ||
             !(ctx->iface->caps & AOM_CODEC_CAP_PUT_SLICE)) {
    status = AOM_CODEC_ERROR;
  } else {
    ctx->priv->dec.put_slice_cb.u.put_slice = cb;
    ctx->priv->dec.put_slice_cb.user_priv = user_priv;
  }
  return SAVE_STATUS(ctx, status);
}
/* Hand external frame-buffer get/release callbacks to the decoder through
 * the interface's set_fb_fn hook. Requires the
 * AOM_CODEC_CAP_EXTERNAL_FRAME_BUFFER capability. The status is stored in
 * ctx->err when ctx is non-NULL. */
aom_codec_err_t aom_codec_set_frame_buffer_functions(
    aom_codec_ctx_t *ctx, aom_get_frame_buffer_cb_fn_t cb_get,
    aom_release_frame_buffer_cb_fn_t cb_release, void *cb_priv) {
  aom_codec_err_t status;

  if (!ctx || !cb_get || !cb_release)
    status = AOM_CODEC_INVALID_PARAM;
  else if (!ctx->iface || !ctx->priv ||
           !(ctx->iface->caps & AOM_CODEC_CAP_EXTERNAL_FRAME_BUFFER))
    status = AOM_CODEC_ERROR;
  else
    status = ctx->iface->dec.set_fb_fn(get_alg_priv(ctx), cb_get, cb_release,
                                       cb_priv);

  return SAVE_STATUS(ctx, status);
}

View File

@@ -1,380 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
/*!\file
* \brief Provides the high level interface to wrap encoder algorithms.
*
*/
#include <limits.h>
#include <string.h>
#include "aom_config.h"
#include "aom/internal/aom_codec_internal.h"
/* Store a status code in ctx->err when ctx is non-NULL and evaluate to that
 * code either way. Both arguments are parenthesized so that arbitrary
 * expressions (e.g. `&local_ctx`) expand correctly. */
#define SAVE_STATUS(ctx, var) ((ctx) ? ((ctx)->err = (var)) : (var))
/* View the context's private pointer as the algorithm-specific type. */
static aom_codec_alg_priv_t *get_alg_priv(aom_codec_ctx_t *ctx) {
  aom_codec_alg_priv_t *const alg_priv = (aom_codec_alg_priv_t *)ctx->priv;
  return alg_priv;
}
/* Initialize an encoder context.
 *
 * Validates the ABI versions and that the interface is an encoder offering
 * every requested flag, then fills in ctx and calls the interface's init
 * hook. When init fails the error detail is preserved in ctx->err_detail and
 * the context is destroyed again. The status is stored in ctx->err when ctx
 * is non-NULL. */
aom_codec_err_t aom_codec_enc_init_ver(aom_codec_ctx_t *ctx,
                                       aom_codec_iface_t *iface,
                                       const aom_codec_enc_cfg_t *cfg,
                                       aom_codec_flags_t flags, int ver) {
  aom_codec_err_t status;

  if (ver != AOM_ENCODER_ABI_VERSION) {
    status = AOM_CODEC_ABI_MISMATCH;
  } else if (!ctx || !iface || !cfg) {
    status = AOM_CODEC_INVALID_PARAM;
  } else if (iface->abi_version != AOM_CODEC_INTERNAL_ABI_VERSION) {
    status = AOM_CODEC_ABI_MISMATCH;
  } else if (!(iface->caps & AOM_CODEC_CAP_ENCODER) ||
             ((flags & AOM_CODEC_USE_PSNR) &&
              !(iface->caps & AOM_CODEC_CAP_PSNR)) ||
             ((flags & AOM_CODEC_USE_OUTPUT_PARTITION) &&
              !(iface->caps & AOM_CODEC_CAP_OUTPUT_PARTITION))) {
    /* Encoding itself, or a requested feature, is not offered. */
    status = AOM_CODEC_INCAPABLE;
  } else {
    ctx->iface = iface;
    ctx->name = iface->name;
    ctx->priv = NULL;
    ctx->init_flags = flags;
    ctx->config.enc = cfg;

    status = ctx->iface->init(ctx, NULL);
    if (status) {
      /* Keep the detail string reachable after the private state is gone. */
      ctx->err_detail = ctx->priv ? ctx->priv->err_detail : NULL;
      aom_codec_destroy(ctx);
    }
  }
  return SAVE_STATUS(ctx, status);
}
/* Initialize num_enc encoder contexts for multi-resolution encoding.
 *
 * ctx, cfg and dsf are arrays of num_enc elements (1..16). Encoder i receives
 * mr_encoder_id = num_enc - 1 - i; every encoder with a nonzero id has its
 * kf_mode forced to AOM_KF_DISABLED so that key frames follow the
 * lowest-resolution encoder.
 *
 * On success the status is stored in the last context. On any failure every
 * context initialized so far is destroyed again and the status is stored in
 * ctx[0]. The unwinding never steps the context pointer outside the caller's
 * array.
 *
 * Returns AOM_CODEC_INVALID_PARAM for NULL arrays, an out-of-range num_enc or
 * an invalid down-sampling factor, and AOM_CODEC_INCAPABLE when the interface
 * lacks a required capability or has no mr_get_mem_loc hook. */
aom_codec_err_t aom_codec_enc_init_multi_ver(
    aom_codec_ctx_t *ctx, aom_codec_iface_t *iface, aom_codec_enc_cfg_t *cfg,
    int num_enc, aom_codec_flags_t flags, aom_rational_t *dsf, int ver) {
  aom_codec_err_t res = AOM_CODEC_OK;
  /* Context that receives the final status. */
  aom_codec_ctx_t *status_ctx = ctx;

  if (ver != AOM_ENCODER_ABI_VERSION)
    res = AOM_CODEC_ABI_MISMATCH;
  else if (!ctx || !iface || !cfg || !dsf || (num_enc > 16 || num_enc < 1))
    res = AOM_CODEC_INVALID_PARAM;
  else if (iface->abi_version != AOM_CODEC_INTERNAL_ABI_VERSION)
    res = AOM_CODEC_ABI_MISMATCH;
  else if (!(iface->caps & AOM_CODEC_CAP_ENCODER))
    res = AOM_CODEC_INCAPABLE;
  else if ((flags & AOM_CODEC_USE_PSNR) && !(iface->caps & AOM_CODEC_CAP_PSNR))
    res = AOM_CODEC_INCAPABLE;
  else if ((flags & AOM_CODEC_USE_OUTPUT_PARTITION) &&
           !(iface->caps & AOM_CODEC_CAP_OUTPUT_PARTITION))
    res = AOM_CODEC_INCAPABLE;
  else if (!iface->enc.mr_get_mem_loc)
    res = AOM_CODEC_INCAPABLE;
  else {
    void *mem_loc = NULL;

    /* NOTE(review): ownership of mem_loc on the failure paths below is not
     * visible from this file - confirm with the codec implementation. */
    res = iface->enc.mr_get_mem_loc(cfg, &mem_loc);
    if (!res) {
      int i;

      for (i = 0; i < num_enc; i++) {
        aom_codec_ctx_t *const cur = &ctx[i];
        const char *error_detail = NULL;

        /* Validate down-sampling factor. */
        if (dsf[i].num < 1 || dsf[i].num > 4096 || dsf[i].den < 1 ||
            dsf[i].den > dsf[i].num) {
          /* cur has not been touched yet, so only earlier contexts need
           * unwinding. */
          res = AOM_CODEC_INVALID_PARAM;
        } else {
          aom_codec_priv_enc_mr_cfg_t mr_cfg;

          mr_cfg.mr_low_res_mode_info = mem_loc;
          mr_cfg.mr_total_resolutions = num_enc;
          mr_cfg.mr_encoder_id = num_enc - 1 - i;
          mr_cfg.mr_down_sampling_factor.num = dsf[i].num;
          mr_cfg.mr_down_sampling_factor.den = dsf[i].den;

          /* Force Key-frame synchronization. Namely, encoder at higher
           * resolution always use the same frame_type chosen by the
           * lowest-resolution encoder.
           */
          if (mr_cfg.mr_encoder_id) cfg[i].kf_mode = AOM_KF_DISABLED;

          cur->iface = iface;
          cur->name = iface->name;
          cur->priv = NULL;
          cur->init_flags = flags;
          cur->config.enc = &cfg[i];
          res = cur->iface->init(cur, &mr_cfg);

          if (res) {
            /* Destroy current ctx */
            error_detail = cur->priv ? cur->priv->err_detail : NULL;
            cur->err_detail = error_detail;
            aom_codec_destroy(cur);
          }
        }

        if (res) {
          /* Destroy already allocated high-level ctx */
          while (i > 0) {
            --i;
            ctx[i].err_detail = error_detail;
            aom_codec_destroy(&ctx[i]);
          }
          break;
        }
      }

      /* As before, a fully successful run reports on the last context. */
      if (!res) status_ctx = &ctx[num_enc - 1];
    }
  }

  return SAVE_STATUS(status_ctx, res);
}
/* Copy the interface's default encoder configuration for the given usage
 * into *cfg and set cfg->g_usage. Returns AOM_CODEC_INVALID_PARAM for bad
 * arguments or when no configuration matches the usage, and
 * AOM_CODEC_INCAPABLE when the interface is not an encoder. */
aom_codec_err_t aom_codec_enc_config_default(aom_codec_iface_t *iface,
                                             aom_codec_enc_cfg_t *cfg,
                                             unsigned int usage) {
  int idx;

  if (!iface || !cfg || usage > INT_MAX) return AOM_CODEC_INVALID_PARAM;
  if (!(iface->caps & AOM_CODEC_CAP_ENCODER)) return AOM_CODEC_INCAPABLE;

  for (idx = 0; idx < iface->enc.cfg_map_count; ++idx) {
    aom_codec_enc_cfg_map_t *const entry = iface->enc.cfg_maps + idx;

    if (entry->usage != (int)usage) continue;
    *cfg = entry->cfg;
    cfg->g_usage = usage;
    return AOM_CODEC_OK;
  }
  return AOM_CODEC_INVALID_PARAM;
}
#if ARCH_X86 || ARCH_X86_64
/* On X86, disable the x87 unit's internal 80 bit precision for better
 * consistency with the SSE unit's 64 bit precision.
 */
#include "aom_ports/x86.h"
/* FLOATING_POINT_INIT() opens a `do {` block and saves the value returned by
 * x87_set_double_precision(); FLOATING_POINT_RESTORE() passes that value to
 * x87_set_control_word() and closes the block. The two macros must therefore
 * be used as a pair within the same scope. */
#define FLOATING_POINT_INIT() \
do { \
unsigned short x87_orig_mode = x87_set_double_precision();
#define FLOATING_POINT_RESTORE() \
x87_set_control_word(x87_orig_mode); \
} \
while (0)
#else
/* Other targets: both hooks are empty functions. */
static void FLOATING_POINT_INIT() {}
static void FLOATING_POINT_RESTORE() {}
#endif
/* Encode one frame (or flush when img is NULL).
 *
 * With a single encoder the interface's encode hook is called once. With
 * several encoders (multi-resolution encoding) ctx and img are treated as
 * arrays and the encoders are run from the highest index down to 0, stopping
 * at the first failure.
 *
 * The status is always stored in the context the caller passed in (ctx[0]),
 * including when an encoder fails part-way through. */
aom_codec_err_t aom_codec_encode(aom_codec_ctx_t *ctx, const aom_image_t *img,
                                 aom_codec_pts_t pts, unsigned long duration,
                                 aom_enc_frame_flags_t flags,
                                 unsigned long deadline) {
  aom_codec_err_t res = AOM_CODEC_OK;

  if (!ctx || (img && !duration))
    res = AOM_CODEC_INVALID_PARAM;
  else if (!ctx->iface || !ctx->priv)
    res = AOM_CODEC_ERROR;
  else if (!(ctx->iface->caps & AOM_CODEC_CAP_ENCODER))
    res = AOM_CODEC_INCAPABLE;
  else {
    const unsigned int num_enc = ctx->priv->enc.total_encoders;

    /* Execute in a normalized floating point environment, if the platform
     * requires it.
     */
    FLOATING_POINT_INIT();

    if (num_enc == 1) {
      res = ctx->iface->enc.encode(get_alg_priv(ctx), img, pts, duration, flags,
                                   deadline);
    } else {
      /* Multi-resolution encoding:
       * Encode multi-levels in reverse order. For example,
       * if mr_total_resolutions = 3, first encode level 2,
       * then encode level 1, and finally encode level 0.
       */
      unsigned int i = num_enc;

      while (i-- > 0) {
        /* Index from the base pointers instead of stepping ctx itself, so
         * the status below always lands in the caller's first context. */
        aom_codec_ctx_t *const enc_ctx = ctx + i;
        const aom_image_t *const enc_img = img ? img + i : NULL;

        res = enc_ctx->iface->enc.encode(get_alg_priv(enc_ctx), enc_img, pts,
                                         duration, flags, deadline);
        if (res) break;
      }
    }

    FLOATING_POINT_RESTORE();
  }

  return SAVE_STATUS(ctx, res);
}
/* Return the next output packet from the encoder, or NULL when there is none
 * or the arguments/context are invalid (in which case ctx->err is set when
 * ctx is non-NULL).
 *
 * For frame packets, when the application installed a destination buffer and
 * the codec did not write there itself, the data is copied into that buffer
 * (after pad_before bytes) if it fits together with the padding, and a
 * modified packet describing the padded region is returned. Whenever the
 * returned packet lives in the destination buffer, the buffer is advanced
 * past it. */
const aom_codec_cx_pkt_t *aom_codec_get_cx_data(aom_codec_ctx_t *ctx,
                                                aom_codec_iter_t *iter) {
  const aom_codec_cx_pkt_t *pkt = NULL;
  aom_codec_priv_t *priv;
  char *dst_buf;

  if (ctx) {
    if (!iter)
      ctx->err = AOM_CODEC_INVALID_PARAM;
    else if (!ctx->iface || !ctx->priv)
      ctx->err = AOM_CODEC_ERROR;
    else if (!(ctx->iface->caps & AOM_CODEC_CAP_ENCODER))
      ctx->err = AOM_CODEC_INCAPABLE;
    else
      pkt = ctx->iface->enc.get_cx_data(get_alg_priv(ctx), iter);
  }

  /* Only frame packets are candidates for relocation. */
  if (!pkt || pkt->kind != AOM_CODEC_CX_FRAME_PKT) return pkt;

  priv = ctx->priv;
  dst_buf = (char *)priv->enc.cx_data_dst_buf.buf;

  if (dst_buf && pkt->data.raw.buf != dst_buf &&
      pkt->data.raw.sz + priv->enc.cx_data_pad_before +
              priv->enc.cx_data_pad_after <=
          priv->enc.cx_data_dst_buf.sz) {
    aom_codec_cx_pkt_t *const relocated = &priv->enc.cx_data_pkt;

    memcpy(dst_buf + priv->enc.cx_data_pad_before, pkt->data.raw.buf,
           pkt->data.raw.sz);
    *relocated = *pkt;
    relocated->data.raw.buf = dst_buf;
    relocated->data.raw.sz +=
        priv->enc.cx_data_pad_before + priv->enc.cx_data_pad_after;
    pkt = relocated;
  }

  if (dst_buf == pkt->data.raw.buf) {
    /* Consume the used part of the destination buffer. */
    priv->enc.cx_data_dst_buf.buf = dst_buf + pkt->data.raw.sz;
    priv->enc.cx_data_dst_buf.sz -= pkt->data.raw.sz;
  }

  return pkt;
}
/* Install (buf != NULL) or clear (buf == NULL) the application-supplied
 * destination buffer and its padding amounts. Returns
 * AOM_CODEC_INVALID_PARAM when ctx or its private state is missing. */
aom_codec_err_t aom_codec_set_cx_data_buf(aom_codec_ctx_t *ctx,
                                          const aom_fixed_buf_t *buf,
                                          unsigned int pad_before,
                                          unsigned int pad_after) {
  aom_codec_priv_t *priv;

  if (!ctx || !ctx->priv) return AOM_CODEC_INVALID_PARAM;
  priv = ctx->priv;

  if (buf) {
    priv->enc.cx_data_dst_buf = *buf;
  } else {
    priv->enc.cx_data_dst_buf.buf = NULL;
    priv->enc.cx_data_dst_buf.sz = 0;
  }
  /* Padding only applies while a buffer is installed. */
  priv->enc.cx_data_pad_before = buf ? pad_before : 0;
  priv->enc.cx_data_pad_after = buf ? pad_after : 0;

  return AOM_CODEC_OK;
}
/* Return the encoder's preview frame through the interface's get_preview
 * hook, or NULL when ctx is NULL, uninitialized, not an encoder, or has no
 * such hook (ctx->err is set for a non-NULL ctx in those cases). */
const aom_image_t *aom_codec_get_preview_frame(aom_codec_ctx_t *ctx) {
  if (!ctx) return NULL;

  if (!ctx->iface || !ctx->priv) {
    ctx->err = AOM_CODEC_ERROR;
    return NULL;
  }
  if (!(ctx->iface->caps & AOM_CODEC_CAP_ENCODER) ||
      !ctx->iface->enc.get_preview) {
    ctx->err = AOM_CODEC_INCAPABLE;
    return NULL;
  }
  return ctx->iface->enc.get_preview(get_alg_priv(ctx));
}
/* Return the encoder's global headers through the interface's get_glob_hdrs
 * hook, or NULL when ctx is NULL, uninitialized, not an encoder, or has no
 * such hook (ctx->err is set for a non-NULL ctx in those cases). */
aom_fixed_buf_t *aom_codec_get_global_headers(aom_codec_ctx_t *ctx) {
  if (!ctx) return NULL;

  if (!ctx->iface || !ctx->priv) {
    ctx->err = AOM_CODEC_ERROR;
    return NULL;
  }
  if (!(ctx->iface->caps & AOM_CODEC_CAP_ENCODER) ||
      !ctx->iface->enc.get_glob_hdrs) {
    ctx->err = AOM_CODEC_INCAPABLE;
    return NULL;
  }
  return ctx->iface->enc.get_glob_hdrs(get_alg_priv(ctx));
}
/* Apply a new configuration to an initialized encoder through the
 * interface's cfg_set hook. The status is stored in ctx->err when ctx is
 * non-NULL. */
aom_codec_err_t aom_codec_enc_config_set(aom_codec_ctx_t *ctx,
                                         const aom_codec_enc_cfg_t *cfg) {
  aom_codec_err_t status;

  if (!ctx || !ctx->iface || !ctx->priv || !cfg) {
    status = AOM_CODEC_INVALID_PARAM;
  } else if (!(ctx->iface->caps & AOM_CODEC_CAP_ENCODER)) {
    status = AOM_CODEC_INCAPABLE;
  } else {
    status = ctx->iface->enc.cfg_set(get_alg_priv(ctx), cfg);
  }
  return SAVE_STATUS(ctx, status);
}
/* Append a copy of *pkt to the list. Returns 0 on success and 1 when the
 * list already holds `max` packets. */
int aom_codec_pkt_list_add(struct aom_codec_pkt_list *list,
                           const struct aom_codec_cx_pkt *pkt) {
  if (list->cnt >= list->max) return 1;

  list->pkts[list->cnt] = *pkt;
  list->cnt++;
  return 0;
}
/* Step through the packets stored in the list. A NULL *iter starts at the
 * first packet; each call returns the current packet and advances *iter.
 * NULL is returned once every stored packet has been handed out. */
const aom_codec_cx_pkt_t *aom_codec_pkt_list_get(
    struct aom_codec_pkt_list *list, aom_codec_iter_t *iter) {
  const aom_codec_cx_pkt_t *cur;

  if (*iter == NULL) *iter = list->pkts;

  cur = (const aom_codec_cx_pkt_t *)*iter;
  if ((size_t)(cur - list->pkts) >= list->cnt) return NULL;

  *iter = cur + 1;
  return cur;
}

View File

@@ -1,240 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <limits.h>
#include <stdlib.h>
#include <string.h>
#include "aom/aom_image.h"
#include "aom/aom_integer.h"
#include "aom_mem/aom_mem.h"
/* Shared implementation of aom_img_alloc() and aom_img_wrap().
 *
 * Fills in an image descriptor for a d_w x d_h image of format fmt. When img
 * is NULL a descriptor is allocated; when img_data is NULL the pixel storage
 * is allocated with buf_align alignment, otherwise img_data is used as-is.
 * Alignments of 0 are treated as 1 and must otherwise be powers of two.
 *
 * A caller-supplied descriptor is zeroed up front, so every failure path can
 * pass it to aom_img_free() safely. All size arithmetic is done in 64 bits
 * and rejected when it does not fit the descriptor's fields.
 *
 * Returns the descriptor on success, NULL on failure. */
static aom_image_t *img_alloc_helper(aom_image_t *img, aom_img_fmt_t fmt,
                                     unsigned int d_w, unsigned int d_h,
                                     unsigned int buf_align,
                                     unsigned int stride_align,
                                     unsigned char *img_data) {
  unsigned int xcs, ycs, bps;
  unsigned int x_mask, y_mask;
  uint64_t w, h, s, stride_in_bytes;

  /* Put a caller-provided descriptor into a known state before any
   * `goto fail` can hand it to aom_img_free(). */
  if (img) memset(img, 0, sizeof(aom_image_t));

  /* Treat align==0 like align==1 */
  if (!buf_align) buf_align = 1;

  /* Validate alignment (must be power of 2) */
  if (buf_align & (buf_align - 1)) goto fail;

  /* Treat align==0 like align==1 */
  if (!stride_align) stride_align = 1;

  /* Validate alignment (must be power of 2) */
  if (stride_align & (stride_align - 1)) goto fail;

  /* Get sample size for this format */
  switch (fmt) {
    case AOM_IMG_FMT_RGB32:
    case AOM_IMG_FMT_RGB32_LE:
    case AOM_IMG_FMT_ARGB:
    case AOM_IMG_FMT_ARGB_LE: bps = 32; break;
    case AOM_IMG_FMT_RGB24:
    case AOM_IMG_FMT_BGR24: bps = 24; break;
    case AOM_IMG_FMT_RGB565:
    case AOM_IMG_FMT_RGB565_LE:
    case AOM_IMG_FMT_RGB555:
    case AOM_IMG_FMT_RGB555_LE:
    case AOM_IMG_FMT_UYVY:
    case AOM_IMG_FMT_YUY2:
    case AOM_IMG_FMT_YVYU: bps = 16; break;
    case AOM_IMG_FMT_I420:
    case AOM_IMG_FMT_YV12:
    case AOM_IMG_FMT_AOMI420:
    case AOM_IMG_FMT_AOMYV12: bps = 12; break;
    case AOM_IMG_FMT_I422:
    case AOM_IMG_FMT_I440: bps = 16; break;
    case AOM_IMG_FMT_I444: bps = 24; break;
    case AOM_IMG_FMT_I42016: bps = 24; break;
    case AOM_IMG_FMT_I42216:
    case AOM_IMG_FMT_I44016: bps = 32; break;
    case AOM_IMG_FMT_I44416: bps = 48; break;
    default: bps = 16; break;
  }

  /* Get chroma shift values for this format */
  switch (fmt) {
    case AOM_IMG_FMT_I420:
    case AOM_IMG_FMT_YV12:
    case AOM_IMG_FMT_AOMI420:
    case AOM_IMG_FMT_AOMYV12:
    case AOM_IMG_FMT_I422:
    case AOM_IMG_FMT_I42016:
    case AOM_IMG_FMT_I42216: xcs = 1; break;
    default: xcs = 0; break;
  }

  switch (fmt) {
    case AOM_IMG_FMT_I420:
    case AOM_IMG_FMT_I440:
    case AOM_IMG_FMT_YV12:
    case AOM_IMG_FMT_AOMI420:
    case AOM_IMG_FMT_AOMYV12:
    case AOM_IMG_FMT_I42016:
    case AOM_IMG_FMT_I44016: ycs = 1; break;
    default: ycs = 0; break;
  }

  /* Calculate storage sizes given the chroma subsampling. Widening to 64
   * bits keeps the rounding and the stride math from wrapping. */
  x_mask = (1u << xcs) - 1;
  w = ((uint64_t)d_w + x_mask) & ~(uint64_t)x_mask;
  y_mask = (1u << ycs) - 1;
  h = ((uint64_t)d_h + y_mask) & ~(uint64_t)y_mask;
  s = (fmt & AOM_IMG_FMT_PLANAR) ? w : bps * w / 8;
  s = (s + stride_align - 1) & ~((uint64_t)stride_align - 1);
  stride_in_bytes = (fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? s * 2 : s;

  /* The results are stored in unsigned int (w, h) and - assuming
   * img->stride[] is int, TODO confirm against aom_image.h - int fields. */
  if (w > UINT_MAX || h > UINT_MAX || stride_in_bytes > INT_MAX) goto fail;

  /* Allocate the new image */
  if (!img) {
    img = (aom_image_t *)calloc(1, sizeof(aom_image_t));
    if (!img) goto fail;
    img->self_allocd = 1;
  }

  img->img_data = img_data;

  if (!img_data) {
    uint64_t alloc_size = h * s; /* cannot wrap: h < 2^32 and s < 2^31 */

    if (fmt & AOM_IMG_FMT_PLANAR) {
      if (alloc_size > (uint64_t)-1 / bps) goto fail;
      alloc_size = alloc_size * bps / 8;
    }
    if (alloc_size != (size_t)alloc_size) goto fail;

    img->img_data = (uint8_t *)aom_memalign(buf_align, (size_t)alloc_size);
    img->img_data_owner = 1;
  }

  if (!img->img_data) goto fail;

  img->fmt = fmt;
  img->bit_depth = (fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 16 : 8;
  img->w = (unsigned int)w;
  img->h = (unsigned int)h;
  img->x_chroma_shift = xcs;
  img->y_chroma_shift = ycs;
  img->bps = bps;

  /* Calculate strides */
  img->stride[AOM_PLANE_Y] = img->stride[AOM_PLANE_ALPHA] =
      (int)stride_in_bytes;
  img->stride[AOM_PLANE_U] = img->stride[AOM_PLANE_V] =
      (int)(stride_in_bytes >> xcs);

  /* Default viewport to entire image */
  if (!aom_img_set_rect(img, 0, 0, d_w, d_h)) return img;

fail:
  aom_img_free(img);
  return NULL;
}
/* Allocate an image (and, when img is NULL, its descriptor). The single
 * align value is used for both the buffer and the stride alignment. */
aom_image_t *aom_img_alloc(aom_image_t *img, aom_img_fmt_t fmt,
                           unsigned int d_w, unsigned int d_h,
                           unsigned int align) {
  const unsigned int buf_align = align;
  const unsigned int stride_align = align;

  return img_alloc_helper(img, fmt, d_w, d_h, buf_align, stride_align, NULL);
}
/* Describe caller-owned pixel storage with an image descriptor. */
aom_image_t *aom_img_wrap(aom_image_t *img, aom_img_fmt_t fmt, unsigned int d_w,
                          unsigned int d_h, unsigned int stride_align,
                          unsigned char *img_data) {
  /* A buffer alignment of 1 leaves the caller's buffer alignment alone. */
  const unsigned int buf_align = 1;

  return img_alloc_helper(img, fmt, d_w, d_h, buf_align, stride_align,
                          img_data);
}
/* Set the displayed rectangle of an image and recompute the plane pointers
 * so they address the rectangle's top-left sample.
 *
 * Returns 0 on success, or -1 (leaving the image untouched) when the
 * rectangle does not lie inside the img->w x img->h storage. */
int aom_img_set_rect(aom_image_t *img, unsigned int x, unsigned int y,
                     unsigned int w, unsigned int h) {
  unsigned char *data;

  /* Written with subtractions so that x + w / y + h can never wrap around
   * and slip past the bounds check. */
  if (w > img->w || x > img->w - w || h > img->h || y > img->h - h) return -1;

  img->d_w = w;
  img->d_h = h;

  /* Calculate plane pointers */
  if (!(img->fmt & AOM_IMG_FMT_PLANAR)) {
    img->planes[AOM_PLANE_PACKED] =
        img->img_data + x * img->bps / 8 + y * img->stride[AOM_PLANE_PACKED];
  } else {
    const int bytes_per_sample =
        (img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 2 : 1;
    data = img->img_data;

    /* Planes are laid out back to back: [alpha,] Y, then U/V (or V/U). */
    if (img->fmt & AOM_IMG_FMT_HAS_ALPHA) {
      img->planes[AOM_PLANE_ALPHA] =
          data + x * bytes_per_sample + y * img->stride[AOM_PLANE_ALPHA];
      data += img->h * img->stride[AOM_PLANE_ALPHA];
    }

    img->planes[AOM_PLANE_Y] =
        data + x * bytes_per_sample + y * img->stride[AOM_PLANE_Y];
    data += img->h * img->stride[AOM_PLANE_Y];

    if (!(img->fmt & AOM_IMG_FMT_UV_FLIP)) {
      img->planes[AOM_PLANE_U] =
          data + (x >> img->x_chroma_shift) * bytes_per_sample +
          (y >> img->y_chroma_shift) * img->stride[AOM_PLANE_U];
      data += (img->h >> img->y_chroma_shift) * img->stride[AOM_PLANE_U];
      img->planes[AOM_PLANE_V] =
          data + (x >> img->x_chroma_shift) * bytes_per_sample +
          (y >> img->y_chroma_shift) * img->stride[AOM_PLANE_V];
    } else {
      img->planes[AOM_PLANE_V] =
          data + (x >> img->x_chroma_shift) * bytes_per_sample +
          (y >> img->y_chroma_shift) * img->stride[AOM_PLANE_V];
      data += (img->h >> img->y_chroma_shift) * img->stride[AOM_PLANE_V];
      img->planes[AOM_PLANE_U] =
          data + (x >> img->x_chroma_shift) * bytes_per_sample +
          (y >> img->y_chroma_shift) * img->stride[AOM_PLANE_U];
    }
  }
  return 0;
}
/* Flip an image vertically in place: every plane pointer is moved to the
 * plane's last displayed row and the plane's stride is negated. */
void aom_img_flip(aom_image_t *img) {
  /* Note: the row offsets are deliberately converted to a signed type before
   * being multiplied by the stride. Per section 6.3.1.8 of the ISO C99
   * standard, an unsigned row count would promote the stride to unsigned as
   * well, which gives wrong results when the pointer type is wider than the
   * product.
   */
  const int luma_last_row = (signed)(img->d_h - 1);
  const int chroma_last_row =
      (signed)((img->d_h >> img->y_chroma_shift) - 1);

  img->planes[AOM_PLANE_Y] += luma_last_row * img->stride[AOM_PLANE_Y];
  img->stride[AOM_PLANE_Y] = -img->stride[AOM_PLANE_Y];

  img->planes[AOM_PLANE_U] += chroma_last_row * img->stride[AOM_PLANE_U];
  img->stride[AOM_PLANE_U] = -img->stride[AOM_PLANE_U];

  img->planes[AOM_PLANE_V] += chroma_last_row * img->stride[AOM_PLANE_V];
  img->stride[AOM_PLANE_V] = -img->stride[AOM_PLANE_V];

  img->planes[AOM_PLANE_ALPHA] += luma_last_row * img->stride[AOM_PLANE_ALPHA];
  img->stride[AOM_PLANE_ALPHA] = -img->stride[AOM_PLANE_ALPHA];
}
/* Release an image: the pixel storage is freed only when the image owns it,
 * and the descriptor only when it was allocated by the library. A NULL img is
 * ignored. */
void aom_img_free(aom_image_t *img) {
  if (!img) return;

  if (img->img_data && img->img_data_owner) aom_free(img->img_data);
  if (img->self_allocd) free(img);
}

View File

@@ -1,72 +0,0 @@
/*
* Copyright (c) 2015 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <math.h>
#include <stdlib.h>
#include "./aom_config.h"
#include "./aom_dsp_rtcd.h"
#include "aom/aom_integer.h"
#include "aom_ports/mem.h"
/* Add noise to a width x height plane of 8-bit samples.
 *
 * For every row a start position in `noise` is picked with rand() & 0xff.
 * Each sample is clamped using the first entries of blackclamp, bothclamp and
 * whiteclamp, and the row's noise value is then added. */
void aom_plane_add_noise_c(uint8_t *start, char *noise, char blackclamp[16],
                           char whiteclamp[16], char bothclamp[16],
                           unsigned int width, unsigned int height, int pitch) {
  unsigned int row;

  for (row = 0; row < height; ++row) {
    uint8_t *const line = start + row * pitch;
    char *const row_noise = (char *)(noise + (rand() & 0xff));  // NOLINT
    unsigned int col = 0;

    while (col < width) {
      int sample = line[col];

      sample = clamp(sample - blackclamp[0], 0, 255);
      sample = clamp(sample + bothclamp[0], 0, 255);
      sample = clamp(sample - whiteclamp[0], 0, 255);

      line[col] = sample + row_noise[col];
      ++col;
    }
  }
}
static double gaussian(double sigma, double mu, double x) {
return 1 / (sigma * sqrt(2.0 * 3.14159265)) *
(exp(-(x - mu) * (x - mu) / (2 * sigma * sigma)));
}
int aom_setup_noise(double sigma, int size, char *noise) {
char char_dist[256];
int next = 0, i, j;
// set up a 256 entry lookup that matches gaussian distribution
for (i = -32; i < 32; ++i) {
const int a_i = (int)(0.5 + 256 * gaussian(sigma, 0, i));
if (a_i) {
for (j = 0; j < a_i; ++j) {
char_dist[next + j] = (char)i;
}
next = next + j;
}
}
// Rounding error - might mean we have less than 256.
for (; next < 256; ++next) {
char_dist[next] = 0;
}
for (i = 0; i < size; ++i) {
noise[i] = char_dist[rand() & 0xff]; // NOLINT
}
// Returns the highest non 0 value used in distribution.
return -char_dist[0];
}

View File

@@ -1,64 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <assert.h>
#include "./aom_config.h"
#include "aom/aom_integer.h"
#include "aom_dsp/ans.h"
#include "aom_dsp/prob.h"
/* Index of the largest entry among the first num_syms entries of pdf_tab
 * (the first one on ties), or -1 when num_syms is not positive. */
static int find_largest(const aom_cdf_prob *const pdf_tab, int num_syms) {
  int best_idx = -1;
  int best_val = -1;
  int idx = 0;

  while (idx < num_syms) {
    const int val = pdf_tab[idx];
    if (val > best_val) {
      best_val = val;
      best_idx = idx;
    }
    ++idx;
  }
  return best_idx;
}
/* Build an (in_syms + 1)-entry pdf in out_pdf from an 8-bit node probability
 * and a source pdf.
 *
 * out_pdf[0] is node_prob scaled up to RANS_PROB_BITS; each following entry
 * is the matching source entry scaled by (ANS_P8_PRECISION - node_prob) with
 * rounding, limited to the range [1, RANS_PRECISION - in_syms]. The result is
 * then corrected so that its entries sum to RANS_PRECISION: a shortfall is
 * added to the largest entry, an excess is removed one unit at a time from
 * whichever entry is currently largest. src_pdf and out_pdf must not be the
 * same array. */
void aom_rans_merge_prob8_pdf(aom_cdf_prob *const out_pdf,
                              const AnsP8 node_prob,
                              const aom_cdf_prob *const src_pdf, int in_syms) {
  const int half = ANS_P8_PRECISION >> 1;
  const AnsP8 complement = ANS_P8_PRECISION - node_prob;
  const int total_syms = in_syms + 1;
  int remaining = RANS_PRECISION;
  int sym;

  assert(src_pdf != out_pdf);

  out_pdf[0] = node_prob << (RANS_PROB_BITS - ANS_P8_SHIFT);
  remaining -= out_pdf[0];

  for (sym = 0; sym < in_syms; ++sym) {
    int scaled = (complement * src_pdf[sym] + half) >> ANS_P8_SHIFT;
    scaled = AOMMIN(scaled, (int)RANS_PRECISION - in_syms);
    scaled = AOMMAX(scaled, 1);
    out_pdf[sym + 1] = scaled;
    remaining -= scaled;
  }

  // Adjust probabilities so they sum to the total probability
  if (remaining > 0) {
    const int largest = find_largest(out_pdf, total_syms);
    out_pdf[largest] += remaining;
  } else {
    for (; remaining < 0; ++remaining) {
      const int largest = find_largest(out_pdf, total_syms);
      --out_pdf[largest];
      assert(out_pdf[largest] > 0);
    }
  }
}

View File

@@ -1,44 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifndef AOM_DSP_ANS_H_
#define AOM_DSP_ANS_H_
// Constants, types and utilities for Asymmetric Numeral Systems
// http://arxiv.org/abs/1311.2540v2
#include <assert.h>
#include "./aom_config.h"
#include "aom/aom_integer.h"
#include "aom_dsp/prob.h"
#ifdef __cplusplus
extern "C" {
#endif // __cplusplus
// 8-bit probability used by the binary (uABS) coder.
typedef uint8_t AnsP8;
// Denominator of an AnsP8 probability: 256 == 1 << ANS_P8_SHIFT.
#define ANS_P8_PRECISION 256u
#define ANS_P8_SHIFT 8
// Number of bits of precision for rANS probabilities; RANS_PRECISION is the
// matching denominator (1 << 15).
#define RANS_PROB_BITS 15
#define RANS_PRECISION (1u << RANS_PROB_BITS)
// L_BASE % PRECISION must be 0. Increasing L_BASE beyond 2**15 will cause uabs
// to overflow.
// L_BASE is the lower bound of the normalized coder state.
#define L_BASE (RANS_PRECISION)
// Radix used when moving state to/from the byte buffer (one byte at a time).
#define IO_BASE 256
// Range I = { L_BASE, L_BASE + 1, ..., L_BASE * IO_BASE - 1 }
// Combines an 8-bit node probability with an in_syms-entry pdf into a pdf of
// in_syms + 1 entries whose values sum to RANS_PRECISION. out_pdf[0] carries
// node_prob; the remaining entries are src_pdf scaled by the complement of
// node_prob. out_pdf must have room for in_syms + 1 entries and must not be
// the same array as src_pdf.
void aom_rans_merge_prob8_pdf(aom_cdf_prob *const out_pdf,
const AnsP8 node_prob,
const aom_cdf_prob *const src_pdf, int in_syms);
#ifdef __cplusplus
} // extern "C"
#endif // __cplusplus
#endif // AOM_DSP_ANS_H_

View File

@@ -1,146 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifndef AOM_DSP_ANSREADER_H_
#define AOM_DSP_ANSREADER_H_
// A uABS and rANS decoder implementation of Asymmetric Numeral Systems
// http://arxiv.org/abs/1311.2540v2
#include <assert.h>
#include "./aom_config.h"
#include "aom/aom_integer.h"
#include "aom_dsp/prob.h"
#include "aom_dsp/ans.h"
#include "aom_ports/mem_ops.h"
#if CONFIG_ACCOUNTING
#include "av1/common/accounting.h"
#endif
#ifdef __cplusplus
extern "C" {
#endif // __cplusplus
// State of an ANS decoder reading from a byte buffer.
struct AnsDecoder {
// Start of the coded data.
const uint8_t *buf;
// Number of bytes not yet consumed. The readers fetch buf[--buf_offset], so
// the buffer is consumed from its end towards its start.
int buf_offset;
// Current ANS state; refilled from buf whenever it drops below L_BASE.
uint32_t state;
#if CONFIG_ACCOUNTING
// Set to NULL by ans_read_init(); not otherwise touched in this header.
Accounting *accounting;
#endif
};
// Decodes one binary symbol with the uABS coder. p0 is the probability (out of
// ANS_P8_PRECISION) associated with symbol 0. Returns the decoded bit (0 or 1)
// and stores the updated state back into ans.
static INLINE int uabs_read(struct AnsDecoder *ans, AnsP8 p0) {
  const AnsP8 p1 = ANS_P8_PRECISION - p0;
  unsigned x = ans->state;
  unsigned scaled, quot;
  int bit;
  // Renormalize: pull bytes from the end of the buffer until the state is back
  // at or above L_BASE, or the buffer runs out.
  while (x < L_BASE && ans->buf_offset > 0) {
    x = x * IO_BASE + ans->buf[--ans->buf_offset];
  }
  scaled = x * p1;
  quot = scaled / ANS_P8_PRECISION;
  // The low 8 bits of the product select the symbol.
  bit = (scaled & 0xFF) >= p0;
  ans->state = bit ? quot : x - quot;
  return bit;
}
// Decodes one raw bit: after renormalizing, the bit is the least significant
// bit of the state and the state is halved.
static INLINE int uabs_read_bit(struct AnsDecoder *ans) {
  unsigned x = ans->state;
  while (x < L_BASE && ans->buf_offset > 0) {
    x = x * IO_BASE + ans->buf[--ans->buf_offset];
  }
  ans->state = x >> 1;
  return (int)(x & 1);
}
// Result of looking a remainder up in a cdf table (filled by fetch_sym()).
struct rans_dec_sym {
// Index of the decoded symbol in the cdf table.
uint8_t val;
// Width of the symbol's interval: its cdf entry minus the previous one.
aom_cdf_prob prob;
// Lower bound of the symbol's interval (the previous cdf entry, or 0).
aom_cdf_prob cum_prob; // not-inclusive
};
// Linearly scans cdf for the first entry strictly greater than rem and
// describes that symbol in *out. The scan has no explicit bound: the caller
// must guarantee that some entry of cdf exceeds rem.
static INLINE void fetch_sym(struct rans_dec_sym *out, const aom_cdf_prob *cdf,
                             aom_cdf_prob rem) {
  int idx = 0;
  aom_cdf_prob lower = 0;
  aom_cdf_prob upper = cdf[0];
  // TODO(skal): if critical, could be a binary search.
  // Or, better, an O(1) alias-table.
  while (rem >= upper) {
    lower = upper;
    upper = cdf[++idx];
  }
  out->val = idx;
  out->prob = upper - lower;
  out->cum_prob = lower;
}
// Decodes one multi-symbol value with the rANS coder using the cdf table tab
// and returns the symbol index.
static INLINE int rans_read(struct AnsDecoder *ans, const aom_cdf_prob *tab) {
  struct rans_dec_sym sym;
  unsigned quo, rem;
  uint32_t x = ans->state;
  // Renormalize from the end of the buffer.
  while (x < L_BASE && ans->buf_offset > 0) {
    x = x * IO_BASE + ans->buf[--ans->buf_offset];
  }
  quo = x / RANS_PRECISION;
  rem = x % RANS_PRECISION;
  fetch_sym(&sym, tab, rem);
  ans->state = quo * sym.prob + rem - sym.cum_prob;
  return sym.val;
}
// Initializes ans to decode the offset bytes starting at buf. The initial
// state is stored at the end of the buffer in 1 to 4 bytes; the top bits of
// the last byte select the width:
//   00xxxxxx -> 1 byte  (6 bits of state)
//   01xxxxxx -> 2 bytes (14 bits)
//   10xxxxxx -> 3 bytes (22 bits)
//   111xxxxx -> 4 bytes (29 bits)
// Returns 0 on success and 1 on failure (buffer too short, reserved marker, or
// state out of range).
static INLINE int ans_read_init(struct AnsDecoder *const ans,
                                const uint8_t *const buf, int offset) {
  uint8_t last;
  if (offset < 1) return 1;
  ans->buf = buf;
  last = buf[offset - 1];
  switch (last >> 6) {
    case 0:
      ans->buf_offset = offset - 1;
      ans->state = last & 0x3F;
      break;
    case 1:
      if (offset < 2) return 1;
      ans->buf_offset = offset - 2;
      ans->state = mem_get_le16(buf + offset - 2) & 0x3FFF;
      break;
    case 2:
      if (offset < 3) return 1;
      ans->buf_offset = offset - 3;
      ans->state = mem_get_le24(buf + offset - 3) & 0x3FFFFF;
      break;
    default:
      // 110xxxxx implies this byte is a superframe marker
      if ((last & 0xE0) != 0xE0) return 1;
      if (offset < 4) return 1;
      ans->buf_offset = offset - 4;
      ans->state = mem_get_le32(buf + offset - 4) & 0x1FFFFFFF;
      break;
  }
#if CONFIG_ACCOUNTING
  ans->accounting = NULL;
#endif
  ans->state += L_BASE;
  if (ans->state >= L_BASE * IO_BASE) return 1;
  return 0;
}
// Returns nonzero when the decoder state equals L_BASE, the value the encoder
// starts from in ans_write_init(); any other state yields 0.
static INLINE int ans_read_end(struct AnsDecoder *const ans) {
return ans->state == L_BASE;
}
// Returns nonzero when the state is below L_BASE and no bytes are left to
// renormalize it, i.e. the readers ran out of input.
static INLINE int ans_reader_has_error(const struct AnsDecoder *const ans) {
return ans->state < L_BASE && ans->buf_offset == 0;
}
#ifdef __cplusplus
} // extern "C"
#endif // __cplusplus
#endif // AOM_DSP_ANSREADER_H_

View File

@@ -1,120 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifndef AOM_DSP_ANSWRITER_H_
#define AOM_DSP_ANSWRITER_H_
// A uABS and rANS encoder implementation of Asymmetric Numeral Systems
// http://arxiv.org/abs/1311.2540v2
#include <assert.h>
#include "./aom_config.h"
#include "aom/aom_integer.h"
#include "aom_dsp/ans.h"
#include "aom_dsp/prob.h"
#include "aom_ports/mem_ops.h"
#include "av1/common/odintrin.h"
// ANS_DIVREM(quotient, remainder, dividend, divisor) assigns the quotient and
// remainder of dividend / divisor to the lvalues quotient and remainder.
// When every possible divisor fits OD_DIVU_DMAX the quotient comes from
// OD_DIVU_SMALL (declared in odintrin.h; presumably a division-free lookup,
// confirm there) and the remainder is derived from it; otherwise plain / and %
// are used. quotient and remainder are assignment targets and are expanded
// unparenthesized, so pass simple variables.
#if RANS_PRECISION <= OD_DIVU_DMAX
#define ANS_DIVREM(quotient, remainder, dividend, divisor) \
do { \
quotient = OD_DIVU_SMALL((dividend), (divisor)); \
remainder = (dividend) - (quotient) * (divisor); \
} while (0)
#else
#define ANS_DIVREM(quotient, remainder, dividend, divisor) \
do { \
quotient = (dividend) / (divisor); \
remainder = (dividend) % (divisor); \
} while (0)
#endif
// Division used by uabs_write(), where the divisor is an AnsP8 value.
#define ANS_DIV8(dividend, divisor) OD_DIVU_SMALL((dividend), (divisor))
#ifdef __cplusplus
extern "C" {
#endif // __cplusplus
// State of an ANS encoder writing to a caller-provided byte buffer.
struct AnsCoder {
// Output buffer; no capacity is tracked here, so the caller must size it.
uint8_t *buf;
// Number of bytes written so far (index of the next byte to write).
int buf_offset;
// Current ANS state; starts at L_BASE (see ans_write_init()).
uint32_t state;
};
// Prepares ans to encode into buf: nothing written yet, state at L_BASE.
static INLINE void ans_write_init(struct AnsCoder *const ans,
uint8_t *const buf) {
ans->buf = buf;
ans->buf_offset = 0;
ans->state = L_BASE;
}
// Serializes the final coder state (minus L_BASE) at the current end of the
// buffer and returns the total number of bytes in the buffer. The state is
// stored little-endian in the smallest of four widths, tagged in the top bits
// of the last byte:
//   00 -> 1 byte (6 bits), 01 -> 2 bytes (14 bits),
//   10 -> 3 bytes (22 bits), 111 -> 4 bytes (29 bits).
// If the state does not fit in 29 bits the function asserts and, in builds
// without assertions, returns the byte count without writing the state.
static INLINE int ans_write_end(struct AnsCoder *const ans) {
  uint32_t state;
  assert(ans->state >= L_BASE);
  assert(ans->state < L_BASE * IO_BASE);
  state = ans->state - L_BASE;
  // All tag constants are unsigned: 0x07 << 29 does not fit in a signed int,
  // so shifting the signed literal would be undefined behaviour.
  if (state < (1u << 6)) {
    ans->buf[ans->buf_offset] = (uint8_t)((0x00u << 6) + state);
    return ans->buf_offset + 1;
  } else if (state < (1u << 14)) {
    mem_put_le16(ans->buf + ans->buf_offset, (0x01u << 14) + state);
    return ans->buf_offset + 2;
  } else if (state < (1u << 22)) {
    mem_put_le24(ans->buf + ans->buf_offset, (0x02u << 22) + state);
    return ans->buf_offset + 3;
  } else if (state < (1u << 29)) {
    mem_put_le32(ans->buf + ans->buf_offset, (0x07u << 29) + state);
    return ans->buf_offset + 4;
  } else {
    assert(0 && "State is too large to be serialized");
    return ans->buf_offset;
  }
}
// uABS with normalization
// Encodes the binary symbol val (zero / nonzero), where p0 is the probability
// (out of ANS_P8_PRECISION) associated with a zero symbol. Bytes are flushed
// to the buffer while the state is too large for the symbol being coded.
static INLINE void uabs_write(struct AnsCoder *ans, int val, AnsP8 p0) {
  const AnsP8 p = ANS_P8_PRECISION - p0;
  const unsigned sym_prob = val ? p : p0;
  const unsigned flush_limit = L_BASE / ANS_P8_PRECISION * IO_BASE * sym_prob;
  while (ans->state >= flush_limit) {
    ans->buf[ans->buf_offset++] = ans->state % IO_BASE;
    ans->state /= IO_BASE;
  }
  if (val) {
    ans->state = ANS_DIV8((ans->state + 1) * ANS_P8_PRECISION + p - 1, p) - 1;
  } else {
    ans->state = ANS_DIV8(ans->state * ANS_P8_PRECISION, p0);
  }
}
// Description of one symbol handed to rans_write().
struct rans_sym {
// Probability of the symbol; rans_write() divides the state by it.
aom_cdf_prob prob;
// Cumulative probability of the symbols before this one; added to the state.
aom_cdf_prob cum_prob; // not-inclusive
};
// rANS with normalization
// sym->prob takes the place of l_s from the paper
// RANS_PRECISION is m
// Encodes one symbol: bytes are flushed from the low end of the state while it
// is at or above the threshold for sym->prob, then the state becomes
// (state / prob) * RANS_PRECISION + (state % prob) + cum_prob.
static INLINE void rans_write(struct AnsCoder *ans,
const struct rans_sym *const sym) {
const aom_cdf_prob p = sym->prob;
unsigned quot, rem;
while (ans->state >= L_BASE / RANS_PRECISION * IO_BASE * p) {
ans->buf[ans->buf_offset++] = ans->state % IO_BASE;
ans->state /= IO_BASE;
}
ANS_DIVREM(quot, rem, ans->state, p);
ans->state = quot * RANS_PRECISION + rem + sym->cum_prob;
}
#undef ANS_DIV8
#undef ANS_DIVREM
#ifdef __cplusplus
} // extern "C"
#endif // __cplusplus
#endif // AOM_DSP_ANSWRITER_H_

View File

@@ -1,600 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <assert.h>
#include <string.h>
#include "./aom_config.h"
#include "./aom_dsp_rtcd.h"
#include "aom/aom_integer.h"
#include "aom_dsp/aom_convolve.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/aom_filter.h"
#include "aom_ports/mem.h"
// Horizontal sub-pixel filter. For each of the h rows, walks w output pixels;
// the source position advances by x_step_q4 (1/16 pel units) per output pixel
// starting from x0_q4, and the fractional part selects the kernel from
// x_filters. Each output is the SUBPEL_TAPS-tap dot product, rounded by
// FILTER_BITS and clipped to a pixel.
static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
                           const InterpKernel *x_filters, int x0_q4,
                           int x_step_q4, int w, int h) {
  int row, col;
  // Centre the filter window on the pixel being interpolated.
  src -= SUBPEL_TAPS / 2 - 1;
  for (row = 0; row < h; ++row, src += src_stride, dst += dst_stride) {
    int pos_q4 = x0_q4;
    for (col = 0; col < w; ++col, pos_q4 += x_step_q4) {
      const uint8_t *const window = src + (pos_q4 >> SUBPEL_BITS);
      const int16_t *const kernel = x_filters[pos_q4 & SUBPEL_MASK];
      int acc = 0;
      int tap;
      for (tap = 0; tap < SUBPEL_TAPS; ++tap) {
        acc += window[tap] * kernel[tap];
      }
      dst[col] = clip_pixel(ROUND_POWER_OF_TWO(acc, FILTER_BITS));
    }
  }
}
// Same filtering as convolve_horiz(), but each filtered pixel is averaged
// (with rounding) with the value already present in dst.
static void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride,
                               uint8_t *dst, ptrdiff_t dst_stride,
                               const InterpKernel *x_filters, int x0_q4,
                               int x_step_q4, int w, int h) {
  int row, col;
  src -= SUBPEL_TAPS / 2 - 1;
  for (row = 0; row < h; ++row, src += src_stride, dst += dst_stride) {
    int pos_q4 = x0_q4;
    for (col = 0; col < w; ++col, pos_q4 += x_step_q4) {
      const uint8_t *const window = src + (pos_q4 >> SUBPEL_BITS);
      const int16_t *const kernel = x_filters[pos_q4 & SUBPEL_MASK];
      int acc = 0;
      int tap, filtered;
      for (tap = 0; tap < SUBPEL_TAPS; ++tap) {
        acc += window[tap] * kernel[tap];
      }
      filtered = clip_pixel(ROUND_POWER_OF_TWO(acc, FILTER_BITS));
      dst[col] = ROUND_POWER_OF_TWO(dst[col] + filtered, 1);
    }
  }
}
// Vertical sub-pixel filter. Processes one column at a time; within a column
// the source row position advances by y_step_q4 (1/16 pel units) per output
// row starting from y0_q4, and the fractional part selects the kernel from
// y_filters.
static void convolve_vert(const uint8_t *src, ptrdiff_t src_stride,
                          uint8_t *dst, ptrdiff_t dst_stride,
                          const InterpKernel *y_filters, int y0_q4,
                          int y_step_q4, int w, int h) {
  int col, row;
  // Start the filter window above the row being interpolated.
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
  for (col = 0; col < w; ++col, ++src, ++dst) {
    int pos_q4 = y0_q4;
    for (row = 0; row < h; ++row, pos_q4 += y_step_q4) {
      const unsigned char *window = &src[(pos_q4 >> SUBPEL_BITS) * src_stride];
      const int16_t *const kernel = y_filters[pos_q4 & SUBPEL_MASK];
      int acc = 0;
      int tap;
      for (tap = 0; tap < SUBPEL_TAPS; ++tap) {
        acc += window[tap * src_stride] * kernel[tap];
      }
      dst[row * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(acc, FILTER_BITS));
    }
  }
}
// Same filtering as convolve_vert(), but each filtered pixel is averaged (with
// rounding) with the value already present in dst.
static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
                              const InterpKernel *y_filters, int y0_q4,
                              int y_step_q4, int w, int h) {
  int col, row;
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
  for (col = 0; col < w; ++col, ++src, ++dst) {
    int pos_q4 = y0_q4;
    for (row = 0; row < h; ++row, pos_q4 += y_step_q4) {
      const unsigned char *window = &src[(pos_q4 >> SUBPEL_BITS) * src_stride];
      const int16_t *const kernel = y_filters[pos_q4 & SUBPEL_MASK];
      int acc = 0;
      int tap, filtered;
      for (tap = 0; tap < SUBPEL_TAPS; ++tap) {
        acc += window[tap * src_stride] * kernel[tap];
      }
      filtered = clip_pixel(ROUND_POWER_OF_TWO(acc, FILTER_BITS));
      dst[row * dst_stride] =
          ROUND_POWER_OF_TWO(dst[row * dst_stride] + filtered, 1);
    }
  }
}
// Separable 2-D sub-pixel filter: horizontal pass into a stack buffer, then a
// vertical pass from that buffer into dst.
static void convolve(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
ptrdiff_t dst_stride, const InterpKernel *const x_filters,
int x0_q4, int x_step_q4,
const InterpKernel *const y_filters, int y0_q4,
int y_step_q4, int w, int h) {
// Note: Fixed size intermediate buffer, temp, places limits on parameters.
// 2d filtering proceeds in 2 steps:
// (1) Interpolate horizontally into an intermediate buffer, temp.
// (2) Interpolate temp vertically to derive the sub-pixel result.
// Deriving the maximum number of rows in the temp buffer (135):
// --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
// --Largest block size is 64x64 pixels.
// --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
// original frame (in 1/16th pixel units).
// --Must round-up because block may be located at sub-pixel position.
// --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
// --(((64 - 1) * 32 + 15) >> 4) + 8 = 134 with a truncating shift; 135 also
// covers rounding the shift up.
// NOTE(review): the figures above describe the 64x64 case; the buffer itself
// is sized from MAX_EXT_SIZE and MAX_SB_SIZE, which are defined elsewhere.
uint8_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
// Rows the horizontal pass must produce so the vertical pass has all its taps.
int intermediate_height =
(((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
assert(w <= MAX_SB_SIZE);
assert(h <= MAX_SB_SIZE);
assert(y_step_q4 <= 32);
assert(x_step_q4 <= 32);
// Start SUBPEL_TAPS / 2 - 1 rows above src so temp holds the rows the
// vertical filter reads above the first output row.
convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp,
MAX_SB_SIZE, x_filters, x0_q4, x_step_q4, w,
intermediate_height);
// Skip those leading rows again: convolve_vert() steps back by the same
// amount internally.
convolve_vert(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE, dst,
dst_stride, y_filters, y0_q4, y_step_q4, w, h);
}
// Recovers the start of the filter table that contains filter by clearing the
// low 8 bits of its address.
static const InterpKernel *get_filter_base(const int16_t *filter) {
// NOTE: This assumes that the filter table is 256-byte aligned.
// TODO(agrange) Modify to make independent of table alignment.
return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF));
}
// Returns the index of kernel f within the table starting at base (pointer
// difference in units of InterpKernel). Callers use it as the initial
// sub-pixel phase.
static int get_filter_offset(const int16_t *f, const InterpKernel *base) {
return (int)((const InterpKernel *)(intptr_t)f - base);
}
// Horizontal-only sub-pixel convolution. filter_x points at one kernel inside
// a filter table; the table base and starting phase are derived from its
// address. filter_y and y_step_q4 are ignored.
void aom_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4, int w,
int h) {
const InterpKernel *const filters_x = get_filter_base(filter_x);
const int x0_q4 = get_filter_offset(filter_x, filters_x);
(void)filter_y;
(void)y_step_q4;
convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4,
w, h);
}
// Horizontal-only sub-pixel convolution whose result is averaged with the
// existing contents of dst. filter_y and y_step_q4 are ignored.
void aom_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4, int w,
int h) {
const InterpKernel *const filters_x = get_filter_base(filter_x);
const int x0_q4 = get_filter_offset(filter_x, filters_x);
(void)filter_y;
(void)y_step_q4;
convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4,
x_step_q4, w, h);
}
// Vertical-only sub-pixel convolution. filter_y points at one kernel inside a
// filter table; the table base and starting phase are derived from its
// address. filter_x and x_step_q4 are ignored.
void aom_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4, int w,
int h) {
const InterpKernel *const filters_y = get_filter_base(filter_y);
const int y0_q4 = get_filter_offset(filter_y, filters_y);
(void)filter_x;
(void)x_step_q4;
convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, y_step_q4,
w, h);
}
// Vertical-only sub-pixel convolution whose result is averaged with the
// existing contents of dst. filter_x and x_step_q4 are ignored.
void aom_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4, int w,
int h) {
const InterpKernel *const filters_y = get_filter_base(filter_y);
const int y0_q4 = get_filter_offset(filter_y, filters_y);
(void)filter_x;
(void)x_step_q4;
convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4,
y_step_q4, w, h);
}
// 2-D sub-pixel convolution (horizontal then vertical). Both filter pointers
// address a kernel inside a filter table; base and starting phase are derived
// from each address before delegating to convolve().
void aom_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
ptrdiff_t dst_stride, const int16_t *filter_x,
int x_step_q4, const int16_t *filter_y, int y_step_q4,
int w, int h) {
const InterpKernel *const filters_x = get_filter_base(filter_x);
const int x0_q4 = get_filter_offset(filter_x, filters_x);
const InterpKernel *const filters_y = get_filter_base(filter_y);
const int y0_q4 = get_filter_offset(filter_y, filters_y);
convolve(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4,
filters_y, y0_q4, y_step_q4, w, h);
}
// 2-D sub-pixel convolution whose result is averaged with the existing
// contents of dst. The filtered block is first written to a stack buffer with
// stride MAX_SB_SIZE and then blended into dst by aom_convolve_avg_c().
void aom_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
ptrdiff_t dst_stride, const int16_t *filter_x,
int x_step_q4, const int16_t *filter_y, int y_step_q4,
int w, int h) {
/* Fixed size intermediate buffer places limits on parameters. */
DECLARE_ALIGNED(16, uint8_t, temp[MAX_SB_SIZE * MAX_SB_SIZE]);
assert(w <= MAX_SB_SIZE);
assert(h <= MAX_SB_SIZE);
aom_convolve8_c(src, src_stride, temp, MAX_SB_SIZE, filter_x, x_step_q4,
filter_y, y_step_q4, w, h);
/* The averaging routine ignores its filter arguments, hence NULL / 0. */
aom_convolve_avg_c(temp, MAX_SB_SIZE, dst, dst_stride, NULL, 0, NULL, 0, w,
h);
}
// Copies a w x h block of bytes from src to dst, row by row. The filter
// arguments exist only to match the common convolve signature and are unused.
void aom_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                         ptrdiff_t dst_stride, const int16_t *filter_x,
                         int filter_x_stride, const int16_t *filter_y,
                         int filter_y_stride, int w, int h) {
  int row;
  (void)filter_x;
  (void)filter_x_stride;
  (void)filter_y;
  (void)filter_y_stride;
  for (row = 0; row < h; ++row) {
    memcpy(dst, src, w);
    src += src_stride;
    dst += dst_stride;
  }
}
// Replaces every pixel of the w x h block in dst with the rounded average of
// itself and the matching pixel in src. The filter arguments exist only to
// match the common convolve signature and are unused.
void aom_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                        ptrdiff_t dst_stride, const int16_t *filter_x,
                        int filter_x_stride, const int16_t *filter_y,
                        int filter_y_stride, int w, int h) {
  int row, col;
  (void)filter_x;
  (void)filter_x_stride;
  (void)filter_y;
  (void)filter_y_stride;
  for (row = 0; row < h; ++row, src += src_stride, dst += dst_stride) {
    for (col = 0; col < w; ++col) {
      dst[col] = ROUND_POWER_OF_TWO(dst[col] + src[col], 1);
    }
  }
}
// Scaled horizontal filter entry point; forwards unchanged to
// aom_convolve8_horiz_c().
void aom_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
ptrdiff_t dst_stride, const int16_t *filter_x,
int x_step_q4, const int16_t *filter_y, int y_step_q4,
int w, int h) {
aom_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
filter_y, y_step_q4, w, h);
}
// Scaled vertical filter entry point; forwards unchanged to
// aom_convolve8_vert_c().
void aom_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
ptrdiff_t dst_stride, const int16_t *filter_x,
int x_step_q4, const int16_t *filter_y, int y_step_q4,
int w, int h) {
aom_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
filter_y, y_step_q4, w, h);
}
// Scaled 2-D filter entry point; forwards unchanged to aom_convolve8_c().
void aom_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
ptrdiff_t dst_stride, const int16_t *filter_x,
int x_step_q4, const int16_t *filter_y, int y_step_q4,
int w, int h) {
aom_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
filter_y, y_step_q4, w, h);
}
// Scaled, averaging horizontal filter entry point; forwards unchanged to
// aom_convolve8_avg_horiz_c().
void aom_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4, int w,
int h) {
aom_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x,
x_step_q4, filter_y, y_step_q4, w, h);
}
// Scaled, averaging vertical filter entry point; forwards unchanged to
// aom_convolve8_avg_vert_c().
void aom_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4, int w,
int h) {
aom_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x,
x_step_q4, filter_y, y_step_q4, w, h);
}
// Scaled, averaging 2-D filter entry point; forwards unchanged to
// aom_convolve8_avg_c().
void aom_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
ptrdiff_t dst_stride, const int16_t *filter_x,
int x_step_q4, const int16_t *filter_y, int y_step_q4,
int w, int h) {
aom_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
filter_y, y_step_q4, w, h);
}
#if CONFIG_AOM_HIGHBITDEPTH
// High bit-depth horizontal sub-pixel filter. src8 / dst8 are converted to
// 16-bit sample pointers with CONVERT_TO_SHORTPTR; strides are in samples.
// Each output is the SUBPEL_TAPS-tap dot product, rounded by FILTER_BITS and
// clipped for bit depth bd.
static void highbd_convolve_horiz(const uint8_t *src8, ptrdiff_t src_stride,
                                  uint8_t *dst8, ptrdiff_t dst_stride,
                                  const InterpKernel *x_filters, int x0_q4,
                                  int x_step_q4, int w, int h, int bd) {
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
  int row, col;
  src -= SUBPEL_TAPS / 2 - 1;
  for (row = 0; row < h; ++row, src += src_stride, dst += dst_stride) {
    int pos_q4 = x0_q4;
    for (col = 0; col < w; ++col, pos_q4 += x_step_q4) {
      const uint16_t *const window = src + (pos_q4 >> SUBPEL_BITS);
      const int16_t *const kernel = x_filters[pos_q4 & SUBPEL_MASK];
      int acc = 0;
      int tap;
      for (tap = 0; tap < SUBPEL_TAPS; ++tap) {
        acc += window[tap] * kernel[tap];
      }
      dst[col] = clip_pixel_highbd(ROUND_POWER_OF_TWO(acc, FILTER_BITS), bd);
    }
  }
}
// Same filtering as highbd_convolve_horiz(), but each filtered sample is
// averaged (with rounding) with the value already present in dst.
static void highbd_convolve_avg_horiz(const uint8_t *src8, ptrdiff_t src_stride,
                                      uint8_t *dst8, ptrdiff_t dst_stride,
                                      const InterpKernel *x_filters, int x0_q4,
                                      int x_step_q4, int w, int h, int bd) {
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
  int row, col;
  src -= SUBPEL_TAPS / 2 - 1;
  for (row = 0; row < h; ++row, src += src_stride, dst += dst_stride) {
    int pos_q4 = x0_q4;
    for (col = 0; col < w; ++col, pos_q4 += x_step_q4) {
      const uint16_t *const window = src + (pos_q4 >> SUBPEL_BITS);
      const int16_t *const kernel = x_filters[pos_q4 & SUBPEL_MASK];
      int acc = 0;
      int tap, filtered;
      for (tap = 0; tap < SUBPEL_TAPS; ++tap) {
        acc += window[tap] * kernel[tap];
      }
      filtered = clip_pixel_highbd(ROUND_POWER_OF_TWO(acc, FILTER_BITS), bd);
      dst[col] = ROUND_POWER_OF_TWO(dst[col] + filtered, 1);
    }
  }
}
// High bit-depth vertical sub-pixel filter. Processes one column at a time;
// the source row position advances by y_step_q4 (1/16 pel units) per output
// row starting from y0_q4, and its fractional part selects the kernel.
static void highbd_convolve_vert(const uint8_t *src8, ptrdiff_t src_stride,
                                 uint8_t *dst8, ptrdiff_t dst_stride,
                                 const InterpKernel *y_filters, int y0_q4,
                                 int y_step_q4, int w, int h, int bd) {
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
  int col, row;
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
  for (col = 0; col < w; ++col, ++src, ++dst) {
    int pos_q4 = y0_q4;
    for (row = 0; row < h; ++row, pos_q4 += y_step_q4) {
      const uint16_t *window = &src[(pos_q4 >> SUBPEL_BITS) * src_stride];
      const int16_t *const kernel = y_filters[pos_q4 & SUBPEL_MASK];
      int acc = 0;
      int tap;
      for (tap = 0; tap < SUBPEL_TAPS; ++tap) {
        acc += window[tap * src_stride] * kernel[tap];
      }
      dst[row * dst_stride] =
          clip_pixel_highbd(ROUND_POWER_OF_TWO(acc, FILTER_BITS), bd);
    }
  }
}
// Same filtering as highbd_convolve_vert(), but each filtered sample is
// averaged (with rounding) with the value already present in dst.
static void highbd_convolve_avg_vert(const uint8_t *src8, ptrdiff_t src_stride,
                                     uint8_t *dst8, ptrdiff_t dst_stride,
                                     const InterpKernel *y_filters, int y0_q4,
                                     int y_step_q4, int w, int h, int bd) {
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
  int col, row;
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
  for (col = 0; col < w; ++col, ++src, ++dst) {
    int pos_q4 = y0_q4;
    for (row = 0; row < h; ++row, pos_q4 += y_step_q4) {
      const uint16_t *window = &src[(pos_q4 >> SUBPEL_BITS) * src_stride];
      const int16_t *const kernel = y_filters[pos_q4 & SUBPEL_MASK];
      int acc = 0;
      int tap, filtered;
      for (tap = 0; tap < SUBPEL_TAPS; ++tap) {
        acc += window[tap * src_stride] * kernel[tap];
      }
      filtered = clip_pixel_highbd(ROUND_POWER_OF_TWO(acc, FILTER_BITS), bd);
      dst[row * dst_stride] =
          ROUND_POWER_OF_TWO(dst[row * dst_stride] + filtered, 1);
    }
  }
}
// High bit-depth separable 2-D sub-pixel filter: horizontal pass into a
// 16-bit stack buffer, then a vertical pass from that buffer into dst.
static void highbd_convolve(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const InterpKernel *const x_filters, int x0_q4,
int x_step_q4, const InterpKernel *const y_filters,
int y0_q4, int y_step_q4, int w, int h, int bd) {
// Note: Fixed size intermediate buffer, temp, places limits on parameters.
// 2d filtering proceeds in 2 steps:
// (1) Interpolate horizontally into an intermediate buffer, temp.
// (2) Interpolate temp vertically to derive the sub-pixel result.
// Deriving the maximum number of rows in the temp buffer (135):
// --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
// --Largest block size is 64x64 pixels.
// --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
// original frame (in 1/16th pixel units).
// --Must round-up because block may be located at sub-pixel position.
// --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
// --(((64 - 1) * 32 + 15) >> 4) + 8 = 134 with a truncating shift; 135 also
// covers rounding the shift up.
// NOTE(review): the figures above describe the 64x64 case; the buffer itself
// is sized from MAX_EXT_SIZE and MAX_SB_SIZE, which are defined elsewhere.
uint16_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
// Rows the horizontal pass must produce so the vertical pass has all its taps.
int intermediate_height =
(((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
assert(w <= MAX_SB_SIZE);
assert(h <= MAX_SB_SIZE);
assert(y_step_q4 <= 32);
assert(x_step_q4 <= 32);
// temp is passed through CONVERT_TO_BYTEPTR because the helpers take the
// byte-pointer form and convert back with CONVERT_TO_SHORTPTR.
highbd_convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride,
CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE, x_filters, x0_q4,
x_step_q4, w, intermediate_height, bd);
highbd_convolve_vert(
CONVERT_TO_BYTEPTR(temp) + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4, y_step_q4, w, h, bd);
}
// High bit-depth horizontal-only sub-pixel convolution. The filter table base
// and starting phase are derived from the address of filter_x. filter_y and
// y_step_q4 are ignored.
void aom_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4, int w,
int h, int bd) {
const InterpKernel *const filters_x = get_filter_base(filter_x);
const int x0_q4 = get_filter_offset(filter_x, filters_x);
(void)filter_y;
(void)y_step_q4;
highbd_convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4,
x_step_q4, w, h, bd);
}
// High bit-depth horizontal-only sub-pixel convolution whose result is
// averaged with the existing contents of dst. filter_y and y_step_q4 are
// ignored.
void aom_highbd_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h, int bd) {
const InterpKernel *const filters_x = get_filter_base(filter_x);
const int x0_q4 = get_filter_offset(filter_x, filters_x);
(void)filter_y;
(void)y_step_q4;
highbd_convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4,
x_step_q4, w, h, bd);
}
// High bit-depth vertical-only sub-pixel convolution. The filter table base
// and starting phase are derived from the address of filter_y. filter_x and
// x_step_q4 are ignored.
void aom_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4, int w,
int h, int bd) {
const InterpKernel *const filters_y = get_filter_base(filter_y);
const int y0_q4 = get_filter_offset(filter_y, filters_y);
(void)filter_x;
(void)x_step_q4;
highbd_convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4,
y_step_q4, w, h, bd);
}
// High bit-depth vertical-only sub-pixel convolution whose result is averaged
// with the existing contents of dst. filter_x and x_step_q4 are ignored.
void aom_highbd_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h, int bd) {
const InterpKernel *const filters_y = get_filter_base(filter_y);
const int y0_q4 = get_filter_offset(filter_y, filters_y);
(void)filter_x;
(void)x_step_q4;
highbd_convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4,
y_step_q4, w, h, bd);
}
// High bit-depth 2-D sub-pixel convolution (horizontal then vertical). Table
// bases and starting phases are derived from the filter pointers before
// delegating to highbd_convolve().
void aom_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4, int w,
int h, int bd) {
const InterpKernel *const filters_x = get_filter_base(filter_x);
const int x0_q4 = get_filter_offset(filter_x, filters_x);
const InterpKernel *const filters_y = get_filter_base(filter_y);
const int y0_q4 = get_filter_offset(filter_y, filters_y);
highbd_convolve(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4,
filters_y, y0_q4, y_step_q4, w, h, bd);
}
// High bit-depth 2-D sub-pixel convolution whose result is averaged with the
// existing contents of dst. The filtered block is written to a 16-bit stack
// buffer with stride MAX_SB_SIZE and then blended into dst by
// aom_highbd_convolve_avg_c().
void aom_highbd_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4, int w,
int h, int bd) {
// Fixed size intermediate buffer places limits on parameters.
DECLARE_ALIGNED(16, uint16_t, temp[MAX_SB_SIZE * MAX_SB_SIZE]);
assert(w <= MAX_SB_SIZE);
assert(h <= MAX_SB_SIZE);
aom_highbd_convolve8_c(src, src_stride, CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE,
filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd);
// The averaging routine ignores its filter arguments, hence NULL / 0.
aom_highbd_convolve_avg_c(CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE, dst,
dst_stride, NULL, 0, NULL, 0, w, h, bd);
}
// Copies a w x h block of 16-bit samples from src8 to dst8, row by row. Both
// pointers are converted with CONVERT_TO_SHORTPTR and the strides are in
// samples. The filter arguments and bd are unused.
void aom_highbd_convolve_copy_c(const uint8_t *src8, ptrdiff_t src_stride,
                                uint8_t *dst8, ptrdiff_t dst_stride,
                                const int16_t *filter_x, int filter_x_stride,
                                const int16_t *filter_y, int filter_y_stride,
                                int w, int h, int bd) {
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
  int row;
  (void)filter_x;
  (void)filter_y;
  (void)filter_x_stride;
  (void)filter_y_stride;
  (void)bd;
  for (row = 0; row < h; ++row) {
    memcpy(dst, src, w * sizeof(*dst));
    src += src_stride;
    dst += dst_stride;
  }
}
// Replaces every 16-bit sample of the w x h block in dst8 with the rounded
// average of itself and the matching sample in src8. The filter arguments and
// bd are unused.
void aom_highbd_convolve_avg_c(const uint8_t *src8, ptrdiff_t src_stride,
                               uint8_t *dst8, ptrdiff_t dst_stride,
                               const int16_t *filter_x, int filter_x_stride,
                               const int16_t *filter_y, int filter_y_stride,
                               int w, int h, int bd) {
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
  int row, col;
  (void)filter_x;
  (void)filter_y;
  (void)filter_x_stride;
  (void)filter_y_stride;
  (void)bd;
  for (row = 0; row < h; ++row, src += src_stride, dst += dst_stride) {
    for (col = 0; col < w; ++col) {
      dst[col] = ROUND_POWER_OF_TWO(dst[col] + src[col], 1);
    }
  }
}
#endif

View File

@@ -1,57 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifndef AOM_DSP_AOM_CONVOLVE_H_
#define AOM_DSP_AOM_CONVOLVE_H_
#include "./aom_config.h"
#include "aom/aom_integer.h"
#ifdef __cplusplus
extern "C" {
#endif
// Note: Fixed size intermediate buffers, place limits on parameters
// of some functions. 2d filtering proceeds in 2 steps:
// (1) Interpolate horizontally into an intermediate buffer, temp.
// (2) Interpolate temp vertically to derive the sub-pixel result.
// Deriving the maximum number of rows in the temp buffer (135):
// --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
// --Largest block size is 64x64 pixels.
// --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
// original frame (in 1/16th pixel units).
// --Must round-up because block may be located at sub-pixel position.
// --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
// --(((64 - 1) * 32 + 15) >> 4) + 8 = 134 with a truncating shift; 135 also
// covers rounding the shift up.
// MAX_EXT_SIZE is the resulting row count of the intermediate buffer.
// NOTE(review): 263 presumably follows from the same derivation with 128x128
// blocks ((((128 - 1) * 32 + 15) >> 4) + 8 = 262) -- confirm.
#if CONFIG_AV1 && CONFIG_EXT_PARTITION
#define MAX_EXT_SIZE 263
#else
#define MAX_EXT_SIZE 135
#endif // CONFIG_AV1 && CONFIG_EXT_PARTITION
// Common signature of the 8-bit convolve / copy / average routines: source and
// destination pointers with their strides, a horizontal and a vertical filter
// pointer each followed by its step (in 1/16 pel units), and the block width
// and height.
typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4, int w,
int h);
#if CONFIG_AOM_HIGHBITDEPTH
// High bit-depth counterpart of convolve_fn_t; identical apart from the
// trailing bit-depth argument bd.
typedef void (*highbd_convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h, int bd);
#endif
#ifdef __cplusplus
} // extern "C"
#endif
#endif // AOM_DSP_AOM_CONVOLVE_H_

View File

@@ -1,426 +0,0 @@
##
## Copyright (c) 2016, Alliance for Open Media. All rights reserved
##
## This source code is subject to the terms of the BSD 2 Clause License and
## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
## was not distributed with this source code in the LICENSE file, you can
## obtain it at www.aomedia.org/license/software. If the Alliance for Open
## Media Patent License 1.0 was not distributed with this source code in the
## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
##
# Core entries: this makefile fragment and the common header are added
# unconditionally; the helper headers are added per target feature.
# NOTE(review): $(ARCH_X86)$(ARCH_X86_64) only expands to "yes" if the
# disabled variable is empty rather than "no" -- confirm against the
# generated config.
DSP_SRCS-yes += aom_dsp.mk
DSP_SRCS-yes += aom_dsp_common.h
DSP_SRCS-$(HAVE_MSA) += mips/macros_msa.h
DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64) += x86/synonyms.h
# bit reader / bit writer, probability helpers and (encoder only) the
# PSNR/SSIM quality metrics.
# Writer-side sources are added only when CONFIG_ENCODERS is yes,
# reader-side sources only when CONFIG_DECODERS is yes.
DSP_SRCS-yes += prob.h
DSP_SRCS-yes += prob.c
DSP_SRCS-$(CONFIG_ANS) += ans.h
DSP_SRCS-$(CONFIG_ANS) += ans.c
ifeq ($(CONFIG_ENCODERS),yes)
DSP_SRCS-$(CONFIG_ANS) += answriter.h
DSP_SRCS-yes += bitwriter.h
DSP_SRCS-yes += dkboolwriter.h
DSP_SRCS-yes += dkboolwriter.c
DSP_SRCS-yes += bitwriter_buffer.c
DSP_SRCS-yes += bitwriter_buffer.h
DSP_SRCS-yes += psnr.c
DSP_SRCS-yes += psnr.h
DSP_SRCS-$(CONFIG_ANS) += buf_ans.h
DSP_SRCS-$(CONFIG_ANS) += buf_ans.c
DSP_SRCS-$(CONFIG_INTERNAL_STATS) += ssim.c
DSP_SRCS-$(CONFIG_INTERNAL_STATS) += ssim.h
DSP_SRCS-$(CONFIG_INTERNAL_STATS) += psnrhvs.c
DSP_SRCS-$(CONFIG_INTERNAL_STATS) += fastssim.c
endif
ifeq ($(CONFIG_DECODERS),yes)
DSP_SRCS-$(CONFIG_ANS) += ansreader.h
DSP_SRCS-yes += bitreader.h
DSP_SRCS-yes += dkboolreader.h
DSP_SRCS-yes += dkboolreader.c
DSP_SRCS-yes += bitreader_buffer.c
DSP_SRCS-yes += bitreader_buffer.h
endif
# intra predictions
DSP_SRCS-yes += intrapred.c
# Daala entropy coder sources (entenc/entdec/entcode and the daalabool
# reader/writer), added only when CONFIG_DAALA_EC is yes.
ifeq ($(CONFIG_DAALA_EC),yes)
DSP_SRCS-yes += entenc.c
DSP_SRCS-yes += entenc.h
DSP_SRCS-yes += entdec.c
DSP_SRCS-yes += entdec.h
DSP_SRCS-yes += entcode.c
DSP_SRCS-yes += entcode.h
DSP_SRCS-yes += daalaboolreader.c
DSP_SRCS-yes += daalaboolreader.h
DSP_SRCS-yes += daalaboolwriter.c
DSP_SRCS-yes += daalaboolwriter.h
endif
# Architecture-specific intra prediction implementations.
# NOTE(review): intrapred_sse2.asm is listed under both HAVE_SSE and
# HAVE_SSE2, and aom_subpixel_8t_ssse3.asm is an interpolation-filter source
# that is listed again in the "interpolation filters" section -- confirm the
# duplicate entries are intentional.
DSP_SRCS-$(HAVE_SSE) += x86/intrapred_sse2.asm
DSP_SRCS-$(HAVE_SSE2) += x86/intrapred_sse2.asm
DSP_SRCS-$(HAVE_SSSE3) += x86/intrapred_ssse3.asm
DSP_SRCS-$(HAVE_SSSE3) += x86/aom_subpixel_8t_ssse3.asm
ifeq ($(CONFIG_AOM_HIGHBITDEPTH),yes)
DSP_SRCS-$(HAVE_SSE) += x86/highbd_intrapred_sse2.asm
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_intrapred_sse2.asm
endif # CONFIG_AOM_HIGHBITDEPTH
DSP_SRCS-$(HAVE_NEON_ASM) += arm/intrapred_neon_asm$(ASM)
DSP_SRCS-$(HAVE_NEON) += arm/intrapred_neon.c
DSP_SRCS-$(HAVE_MSA) += mips/intrapred_msa.c
DSP_SRCS-$(HAVE_DSPR2) += mips/intrapred4_dspr2.c
DSP_SRCS-$(HAVE_DSPR2) += mips/intrapred8_dspr2.c
DSP_SRCS-$(HAVE_DSPR2) += mips/intrapred16_dspr2.c
DSP_SRCS-$(HAVE_DSPR2) += mips/common_dspr2.h
DSP_SRCS-$(HAVE_DSPR2) += mips/common_dspr2.c
# inter predictions: the blend_a64 C sources (mask, hmask, vmask) are always
# added; their SSE4.1 counterparts are added when HAVE_SSE4_1 is yes.
DSP_SRCS-yes += blend.h
DSP_SRCS-yes += blend_a64_mask.c
DSP_SRCS-yes += blend_a64_hmask.c
DSP_SRCS-yes += blend_a64_vmask.c
DSP_SRCS-$(HAVE_SSE4_1) += x86/blend_sse4.h
DSP_SRCS-$(HAVE_SSE4_1) += x86/blend_a64_mask_sse4.c
DSP_SRCS-$(HAVE_SSE4_1) += x86/blend_a64_hmask_sse4.c
DSP_SRCS-$(HAVE_SSE4_1) += x86/blend_a64_vmask_sse4.c
# interpolation filters (convolve): C reference sources first, then the
# per-architecture implementations.
DSP_SRCS-yes += aom_convolve.c
DSP_SRCS-yes += aom_convolve.h
DSP_SRCS-yes += aom_filter.h
DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64) += x86/convolve.h
DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64) += x86/aom_asm_stubs.c
DSP_SRCS-$(HAVE_SSE2) += x86/aom_subpixel_8t_sse2.asm
DSP_SRCS-$(HAVE_SSE2) += x86/aom_subpixel_bilinear_sse2.asm
DSP_SRCS-$(HAVE_SSSE3) += x86/aom_subpixel_8t_ssse3.asm
DSP_SRCS-$(HAVE_SSSE3) += x86/aom_subpixel_bilinear_ssse3.asm
DSP_SRCS-$(HAVE_AVX2) += x86/aom_subpixel_8t_intrin_avx2.c
DSP_SRCS-$(HAVE_SSSE3) += x86/aom_subpixel_8t_intrin_ssse3.c
ifeq ($(CONFIG_AOM_HIGHBITDEPTH),yes)
DSP_SRCS-$(HAVE_SSE2) += x86/aom_high_subpixel_8t_sse2.asm
DSP_SRCS-$(HAVE_SSE2) += x86/aom_high_subpixel_bilinear_sse2.asm
endif
DSP_SRCS-$(HAVE_SSE2) += x86/aom_convolve_copy_sse2.asm
# ARM: the assembly versions are used when HAVE_NEON_ASM is yes; otherwise
# the intrinsics (.c) versions are used when HAVE_NEON is yes.
# aom_convolve_neon.c is added in both cases.
ifeq ($(HAVE_NEON_ASM),yes)
DSP_SRCS-yes += arm/aom_convolve_copy_neon_asm$(ASM)
DSP_SRCS-yes += arm/aom_convolve8_avg_neon_asm$(ASM)
DSP_SRCS-yes += arm/aom_convolve8_neon_asm$(ASM)
DSP_SRCS-yes += arm/aom_convolve_avg_neon_asm$(ASM)
DSP_SRCS-yes += arm/aom_convolve_neon.c
else
ifeq ($(HAVE_NEON),yes)
DSP_SRCS-yes += arm/aom_convolve_copy_neon.c
DSP_SRCS-yes += arm/aom_convolve8_avg_neon.c
DSP_SRCS-yes += arm/aom_convolve8_neon.c
DSP_SRCS-yes += arm/aom_convolve_avg_neon.c
DSP_SRCS-yes += arm/aom_convolve_neon.c
endif # HAVE_NEON
endif # HAVE_NEON_ASM
# common (msa)
DSP_SRCS-$(HAVE_MSA) += mips/aom_convolve8_avg_horiz_msa.c
DSP_SRCS-$(HAVE_MSA) += mips/aom_convolve8_avg_msa.c
DSP_SRCS-$(HAVE_MSA) += mips/aom_convolve8_avg_vert_msa.c
DSP_SRCS-$(HAVE_MSA) += mips/aom_convolve8_horiz_msa.c
DSP_SRCS-$(HAVE_MSA) += mips/aom_convolve8_msa.c
DSP_SRCS-$(HAVE_MSA) += mips/aom_convolve8_vert_msa.c
DSP_SRCS-$(HAVE_MSA) += mips/aom_convolve_avg_msa.c
DSP_SRCS-$(HAVE_MSA) += mips/aom_convolve_copy_msa.c
DSP_SRCS-$(HAVE_MSA) += mips/aom_convolve_msa.h
# common (dspr2)
DSP_SRCS-$(HAVE_DSPR2) += mips/convolve_common_dspr2.h
DSP_SRCS-$(HAVE_DSPR2) += mips/convolve2_avg_dspr2.c
DSP_SRCS-$(HAVE_DSPR2) += mips/convolve2_avg_horiz_dspr2.c
DSP_SRCS-$(HAVE_DSPR2) += mips/convolve2_dspr2.c
DSP_SRCS-$(HAVE_DSPR2) += mips/convolve2_horiz_dspr2.c
DSP_SRCS-$(HAVE_DSPR2) += mips/convolve2_vert_dspr2.c
DSP_SRCS-$(HAVE_DSPR2) += mips/convolve8_avg_dspr2.c
DSP_SRCS-$(HAVE_DSPR2) += mips/convolve8_avg_horiz_dspr2.c
DSP_SRCS-$(HAVE_DSPR2) += mips/convolve8_dspr2.c
DSP_SRCS-$(HAVE_DSPR2) += mips/convolve8_horiz_dspr2.c
DSP_SRCS-$(HAVE_DSPR2) += mips/convolve8_vert_dspr2.c
# loop filters: C reference source plus per-architecture implementations.
DSP_SRCS-yes += loopfilter.c
DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64) += x86/loopfilter_sse2.c
DSP_SRCS-$(HAVE_AVX2) += x86/loopfilter_avx2.c
DSP_SRCS-$(HAVE_NEON) += arm/loopfilter_neon.c
# ARM: assembly versions when HAVE_NEON_ASM is yes, otherwise the intrinsics
# versions when HAVE_NEON is yes. Only the assembly list has a
# loopfilter_mb_neon entry.
ifeq ($(HAVE_NEON_ASM),yes)
DSP_SRCS-yes += arm/loopfilter_mb_neon$(ASM)
DSP_SRCS-yes += arm/loopfilter_16_neon$(ASM)
DSP_SRCS-yes += arm/loopfilter_8_neon$(ASM)
DSP_SRCS-yes += arm/loopfilter_4_neon$(ASM)
else
ifeq ($(HAVE_NEON),yes)
DSP_SRCS-yes += arm/loopfilter_16_neon.c
DSP_SRCS-yes += arm/loopfilter_8_neon.c
DSP_SRCS-yes += arm/loopfilter_4_neon.c
endif # HAVE_NEON
endif # HAVE_NEON_ASM
DSP_SRCS-$(HAVE_MSA) += mips/loopfilter_msa.h
DSP_SRCS-$(HAVE_MSA) += mips/loopfilter_16_msa.c
DSP_SRCS-$(HAVE_MSA) += mips/loopfilter_8_msa.c
DSP_SRCS-$(HAVE_MSA) += mips/loopfilter_4_msa.c
DSP_SRCS-$(HAVE_DSPR2) += mips/loopfilter_filters_dspr2.h
DSP_SRCS-$(HAVE_DSPR2) += mips/loopfilter_filters_dspr2.c
DSP_SRCS-$(HAVE_DSPR2) += mips/loopfilter_macros_dspr2.h
DSP_SRCS-$(HAVE_DSPR2) += mips/loopfilter_masks_dspr2.h
DSP_SRCS-$(HAVE_DSPR2) += mips/loopfilter_mb_dspr2.c
DSP_SRCS-$(HAVE_DSPR2) += mips/loopfilter_mb_horiz_dspr2.c
DSP_SRCS-$(HAVE_DSPR2) += mips/loopfilter_mb_vert_dspr2.c
ifeq ($(CONFIG_AOM_HIGHBITDEPTH),yes)
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_loopfilter_sse2.c
endif # CONFIG_AOM_HIGHBITDEPTH
# transform helpers shared by the forward and inverse transforms
DSP_SRCS-yes += txfm_common.h
DSP_SRCS-yes += x86/txfm_common_intrin.h
DSP_SRCS-$(HAVE_SSE2) += x86/txfm_common_sse2.h
DSP_SRCS-$(HAVE_MSA) += mips/txfm_macros_msa.h
# forward transform
# Added when CONFIG_AV1 is yes. The CONFIG_PVQ block further down repeats a
# subset of the same entries.
ifeq ($(CONFIG_AV1),yes)
DSP_SRCS-yes += fwd_txfm.c
DSP_SRCS-yes += fwd_txfm.h
DSP_SRCS-$(HAVE_SSE2) += x86/fwd_txfm_sse2.h
DSP_SRCS-$(HAVE_SSE2) += x86/fwd_txfm_sse2.c
DSP_SRCS-$(HAVE_SSE2) += x86/fwd_dct32_8cols_sse2.c
DSP_SRCS-$(HAVE_SSE2) += x86/fwd_txfm_impl_sse2.h
DSP_SRCS-$(HAVE_SSE2) += x86/fwd_dct32x32_impl_sse2.h
ifeq ($(ARCH_X86_64),yes)
DSP_SRCS-$(HAVE_SSSE3) += x86/fwd_txfm_ssse3_x86_64.asm
endif
DSP_SRCS-$(HAVE_AVX2) += x86/fwd_txfm_avx2.h
DSP_SRCS-$(HAVE_AVX2) += x86/fwd_txfm_avx2.c
DSP_SRCS-$(HAVE_AVX2) += x86/txfm_common_avx2.h
DSP_SRCS-$(HAVE_AVX2) += x86/fwd_dct32x32_impl_avx2.h
DSP_SRCS-$(HAVE_NEON) += arm/fwd_txfm_neon.c
DSP_SRCS-$(HAVE_MSA) += mips/fwd_txfm_msa.h
DSP_SRCS-$(HAVE_MSA) += mips/fwd_txfm_msa.c
DSP_SRCS-$(HAVE_MSA) += mips/fwd_dct32x32_msa.c
endif # CONFIG_AV1
# NOTE(review): unlike the CONFIG_AV1 list above, this list has no entries for
# fwd_dct32_8cols_sse2.c, fwd_txfm_avx2.h or txfm_common_avx2.h -- confirm
# that is intended for PVQ-only builds.
ifeq ($(CONFIG_PVQ),yes)
DSP_SRCS-yes += fwd_txfm.c
DSP_SRCS-yes += fwd_txfm.h
DSP_SRCS-$(HAVE_SSE2) += x86/fwd_txfm_sse2.h
DSP_SRCS-$(HAVE_SSE2) += x86/fwd_txfm_sse2.c
DSP_SRCS-$(HAVE_SSE2) += x86/fwd_txfm_impl_sse2.h
DSP_SRCS-$(HAVE_SSE2) += x86/fwd_dct32x32_impl_sse2.h
ifeq ($(ARCH_X86_64),yes)
DSP_SRCS-$(HAVE_SSSE3) += x86/fwd_txfm_ssse3_x86_64.asm
endif
DSP_SRCS-$(HAVE_AVX2) += x86/fwd_txfm_avx2.c
DSP_SRCS-$(HAVE_AVX2) += x86/fwd_dct32x32_impl_avx2.h
DSP_SRCS-$(HAVE_NEON) += arm/fwd_txfm_neon.c
DSP_SRCS-$(HAVE_MSA) += mips/fwd_txfm_msa.h
DSP_SRCS-$(HAVE_MSA) += mips/fwd_txfm_msa.c
DSP_SRCS-$(HAVE_MSA) += mips/fwd_dct32x32_msa.c
endif # CONFIG_PVQ
# inverse transform
# Added when CONFIG_AV1 is yes.
ifeq ($(CONFIG_AV1), yes)
DSP_SRCS-yes += inv_txfm.h
DSP_SRCS-yes += inv_txfm.c
DSP_SRCS-$(HAVE_SSE2) += x86/inv_txfm_sse2.h
DSP_SRCS-$(HAVE_SSE2) += x86/inv_txfm_sse2.c
DSP_SRCS-$(HAVE_SSE2) += x86/inv_wht_sse2.asm
ifeq ($(ARCH_X86_64),yes)
DSP_SRCS-$(HAVE_SSSE3) += x86/inv_txfm_ssse3_x86_64.asm
endif # ARCH_X86_64
# ARM: assembly versions when HAVE_NEON_ASM is yes, otherwise the intrinsics
# versions when HAVE_NEON is yes.
ifeq ($(HAVE_NEON_ASM),yes)
DSP_SRCS-yes += arm/save_reg_neon$(ASM)
DSP_SRCS-yes += arm/idct4x4_1_add_neon$(ASM)
DSP_SRCS-yes += arm/idct4x4_add_neon$(ASM)
DSP_SRCS-yes += arm/idct8x8_1_add_neon$(ASM)
DSP_SRCS-yes += arm/idct8x8_add_neon$(ASM)
DSP_SRCS-yes += arm/idct16x16_1_add_neon$(ASM)
DSP_SRCS-yes += arm/idct16x16_add_neon$(ASM)
DSP_SRCS-yes += arm/idct32x32_1_add_neon$(ASM)
DSP_SRCS-yes += arm/idct32x32_add_neon$(ASM)
else
ifeq ($(HAVE_NEON),yes)
DSP_SRCS-yes += arm/idct4x4_1_add_neon.c
DSP_SRCS-yes += arm/idct4x4_add_neon.c
DSP_SRCS-yes += arm/idct8x8_1_add_neon.c
DSP_SRCS-yes += arm/idct8x8_add_neon.c
DSP_SRCS-yes += arm/idct16x16_1_add_neon.c
DSP_SRCS-yes += arm/idct16x16_add_neon.c
DSP_SRCS-yes += arm/idct32x32_1_add_neon.c
DSP_SRCS-yes += arm/idct32x32_add_neon.c
endif # HAVE_NEON
endif # HAVE_NEON_ASM
DSP_SRCS-$(HAVE_NEON) += arm/idct16x16_neon.c
DSP_SRCS-$(HAVE_MSA) += mips/inv_txfm_msa.h
DSP_SRCS-$(HAVE_MSA) += mips/idct4x4_msa.c
DSP_SRCS-$(HAVE_MSA) += mips/idct8x8_msa.c
DSP_SRCS-$(HAVE_MSA) += mips/idct16x16_msa.c
DSP_SRCS-$(HAVE_MSA) += mips/idct32x32_msa.c
# The DSPR2 inverse transforms are added only when CONFIG_AOM_HIGHBITDEPTH is
# not yes (note the ifneq).
ifneq ($(CONFIG_AOM_HIGHBITDEPTH),yes)
DSP_SRCS-$(HAVE_DSPR2) += mips/inv_txfm_dspr2.h
DSP_SRCS-$(HAVE_DSPR2) += mips/itrans4_dspr2.c
DSP_SRCS-$(HAVE_DSPR2) += mips/itrans8_dspr2.c
DSP_SRCS-$(HAVE_DSPR2) += mips/itrans16_dspr2.c
DSP_SRCS-$(HAVE_DSPR2) += mips/itrans32_dspr2.c
DSP_SRCS-$(HAVE_DSPR2) += mips/itrans32_cols_dspr2.c
endif # CONFIG_AOM_HIGHBITDEPTH
endif # CONFIG_AV1
# quantization
# Everything up to the matching "endif # CONFIG_AV1_ENCODER" (quantization,
# avg and high bit depth subtract) is added only when CONFIG_AV1_ENCODER
# contains "yes".
ifneq ($(filter yes,$(CONFIG_AV1_ENCODER)),)
DSP_SRCS-yes += quantize.c
DSP_SRCS-yes += quantize.h
DSP_SRCS-$(HAVE_SSE2) += x86/quantize_sse2.c
ifeq ($(CONFIG_AOM_HIGHBITDEPTH),yes)
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_quantize_intrin_sse2.c
endif
ifeq ($(ARCH_X86_64),yes)
DSP_SRCS-$(HAVE_SSSE3) += x86/quantize_ssse3_x86_64.asm
DSP_SRCS-$(HAVE_AVX) += x86/quantize_avx_x86_64.asm
endif
# avg
DSP_SRCS-yes += avg.c
DSP_SRCS-$(HAVE_SSE2) += x86/avg_intrin_sse2.c
DSP_SRCS-$(HAVE_NEON) += arm/avg_neon.c
DSP_SRCS-$(HAVE_MSA) += mips/avg_msa.c
DSP_SRCS-$(HAVE_NEON) += arm/hadamard_neon.c
ifeq ($(ARCH_X86_64),yes)
DSP_SRCS-$(HAVE_SSSE3) += x86/avg_ssse3_x86_64.asm
endif
# high bit depth subtract
ifeq ($(CONFIG_AOM_HIGHBITDEPTH),yes)
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_subtract_sse2.c
endif
endif # CONFIG_AV1_ENCODER
# sum of squares (encoder only)
ifeq ($(CONFIG_AV1_ENCODER),yes)
DSP_SRCS-yes += sum_squares.c
DSP_SRCS-$(HAVE_SSE2) += x86/sum_squares_sse2.c
endif # CONFIG_AV1_ENCODER
# SAD and subtract sources, added only when CONFIG_ENCODERS is yes.
ifeq ($(CONFIG_ENCODERS),yes)
DSP_SRCS-yes += sad.c
DSP_SRCS-yes += subtract.c
DSP_SRCS-$(HAVE_MEDIA) += arm/sad_media$(ASM)
DSP_SRCS-$(HAVE_NEON) += arm/sad4d_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/sad_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/subtract_neon.c
DSP_SRCS-$(HAVE_MSA) += mips/sad_msa.c
DSP_SRCS-$(HAVE_MSA) += mips/subtract_msa.c
DSP_SRCS-$(HAVE_SSE3) += x86/sad_sse3.asm
DSP_SRCS-$(HAVE_SSSE3) += x86/sad_ssse3.asm
DSP_SRCS-$(HAVE_SSE4_1) += x86/sad_sse4.asm
DSP_SRCS-$(HAVE_AVX2) += x86/sad4d_avx2.c
DSP_SRCS-$(HAVE_AVX2) += x86/sad_avx2.c
# Masked (CONFIG_EXT_INTER) and OBMC (CONFIG_MOTION_VAR) SAD/variance sources
# additionally require CONFIG_AV1_ENCODER.
ifeq ($(CONFIG_AV1_ENCODER),yes)
ifeq ($(CONFIG_EXT_INTER),yes)
DSP_SRCS-$(HAVE_SSSE3) += x86/masked_sad_intrin_ssse3.c
DSP_SRCS-$(HAVE_SSSE3) += x86/masked_variance_intrin_ssse3.c
endif #CONFIG_EXT_INTER
ifeq ($(CONFIG_MOTION_VAR),yes)
DSP_SRCS-$(HAVE_SSE4_1) += x86/obmc_sad_sse4.c
DSP_SRCS-$(HAVE_SSE4_1) += x86/obmc_variance_sse4.c
endif #CONFIG_MOTION_VAR
endif #CONFIG_AV1_ENCODER
# The *_sse2.asm SAD files are listed under both HAVE_SSE and HAVE_SSE2.
DSP_SRCS-$(HAVE_SSE) += x86/sad4d_sse2.asm
DSP_SRCS-$(HAVE_SSE) += x86/sad_sse2.asm
DSP_SRCS-$(HAVE_SSE2) += x86/sad4d_sse2.asm
DSP_SRCS-$(HAVE_SSE2) += x86/sad_sse2.asm
DSP_SRCS-$(HAVE_SSE2) += x86/subtract_sse2.asm
ifeq ($(CONFIG_AOM_HIGHBITDEPTH),yes)
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad4d_sse2.asm
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad_sse2.asm
endif # CONFIG_AOM_HIGHBITDEPTH
endif # CONFIG_ENCODERS
# Variance and sub-pixel variance sources, added only when CONFIG_ENCODERS
# contains "yes".
ifneq ($(filter yes,$(CONFIG_ENCODERS)),)
DSP_SRCS-yes += variance.c
DSP_SRCS-yes += variance.h
DSP_SRCS-$(HAVE_MEDIA) += arm/bilinear_filter_media$(ASM)
DSP_SRCS-$(HAVE_MEDIA) += arm/subpel_variance_media.c
DSP_SRCS-$(HAVE_MEDIA) += arm/variance_halfpixvar16x16_h_media$(ASM)
DSP_SRCS-$(HAVE_MEDIA) += arm/variance_halfpixvar16x16_hv_media$(ASM)
DSP_SRCS-$(HAVE_MEDIA) += arm/variance_halfpixvar16x16_v_media$(ASM)
DSP_SRCS-$(HAVE_MEDIA) += arm/variance_media$(ASM)
DSP_SRCS-$(HAVE_NEON) += arm/subpel_variance_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/variance_neon.c
DSP_SRCS-$(HAVE_MSA) += mips/variance_msa.c
DSP_SRCS-$(HAVE_MSA) += mips/sub_pixel_variance_msa.c
DSP_SRCS-$(HAVE_SSE) += x86/variance_sse2.c
DSP_SRCS-$(HAVE_SSE2) += x86/variance_sse2.c # Contains SSE2 and SSSE3
DSP_SRCS-$(HAVE_SSE2) += x86/halfpix_variance_sse2.c
DSP_SRCS-$(HAVE_SSE2) += x86/halfpix_variance_impl_sse2.asm
DSP_SRCS-$(HAVE_AVX2) += x86/variance_avx2.c
DSP_SRCS-$(HAVE_AVX2) += x86/variance_impl_avx2.c
ifeq ($(ARCH_X86_64),yes)
DSP_SRCS-$(HAVE_SSE2) += x86/ssim_opt_x86_64.asm
endif # ARCH_X86_64
DSP_SRCS-$(HAVE_SSE) += x86/subpel_variance_sse2.asm
DSP_SRCS-$(HAVE_SSE2) += x86/subpel_variance_sse2.asm # Contains SSE2 and SSSE3
ifeq ($(CONFIG_AOM_HIGHBITDEPTH),yes)
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_sse2.c
DSP_SRCS-$(HAVE_SSE4_1) += x86/highbd_variance_sse4.c
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_impl_sse2.asm
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_subpel_variance_impl_sse2.asm
endif # CONFIG_AOM_HIGHBITDEPTH
endif # CONFIG_ENCODERS
# Entries collected in DSP_SRCS_REMOVE-yes are appended to the DSP_SRCS-no
# list.
DSP_SRCS-no += $(DSP_SRCS_REMOVE-yes)
# Run-time CPU detection glue and the generic SIMD abstraction headers.
DSP_SRCS-yes += aom_dsp_rtcd.c
DSP_SRCS-yes += aom_dsp_rtcd_defs.pl
DSP_SRCS-yes += aom_simd.c
DSP_SRCS-yes += aom_simd.h
DSP_SRCS-yes += aom_simd_inline.h
DSP_SRCS-yes += simd/v64_intrinsics.h
DSP_SRCS-yes += simd/v64_intrinsics_c.h
DSP_SRCS-yes += simd/v128_intrinsics.h
DSP_SRCS-yes += simd/v128_intrinsics_c.h
DSP_SRCS-yes += simd/v256_intrinsics.h
DSP_SRCS-yes += simd/v256_intrinsics_c.h
DSP_SRCS-$(HAVE_SSE2) += simd/v64_intrinsics_x86.h
DSP_SRCS-$(HAVE_SSE2) += simd/v128_intrinsics_x86.h
DSP_SRCS-$(HAVE_SSE2) += simd/v256_intrinsics_x86.h
DSP_SRCS-$(HAVE_NEON) += simd/v64_intrinsics_arm.h
DSP_SRCS-$(HAVE_NEON) += simd/v128_intrinsics_arm.h
DSP_SRCS-$(HAVE_NEON) += simd/v256_intrinsics_arm.h
# Instantiates rtcd_h_template (defined outside this file) for aom_dsp_rtcd,
# using aom_dsp/aom_dsp_rtcd_defs.pl as its definitions script.
$(eval $(call rtcd_h_template,aom_dsp_rtcd,aom_dsp/aom_dsp_rtcd_defs.pl))

View File

@@ -1,102 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifndef AOM_DSP_AOM_DSP_COMMON_H_
#define AOM_DSP_AOM_DSP_COMMON_H_
#include "./aom_config.h"
#include "aom/aom_integer.h"
#include "aom_ports/mem.h"
#ifdef __cplusplus
extern "C" {
#endif
// MAX_SB_SIZE may be predefined by the includer or the build; otherwise it
// defaults to 128 when both CONFIG_AV1 and CONFIG_EXT_PARTITION are enabled
// and to 64 in every other configuration.
#ifndef MAX_SB_SIZE
#if CONFIG_AV1 && CONFIG_EXT_PARTITION
#define MAX_SB_SIZE 128
#else
#define MAX_SB_SIZE 64
#endif // CONFIG_AV1 && CONFIG_EXT_PARTITION
#endif // ndef MAX_SB_SIZE
// Minimum / maximum of two values. Each argument is expanded twice, so do not
// pass expressions with side effects (e.g. AOMMIN(i++, n)).
#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
#define IMPLIES(a, b) (!(a) || (b)) // Logical 'a implies b' (or 'a -> b')
// True when x has at most one bit set. Note that this also yields true for
// x == 0, and that x is expanded twice.
#define IS_POWER_OF_TWO(x) (((x) & ((x)-1)) == 0)
// These can be used to give a hint about branch outcomes.
// This can have an effect, even if your target processor has a
// good branch predictor, as these hints can affect basic block
// ordering by the compiler.
// With __GNUC__ defined they expand to __builtin_expect; otherwise they
// expand to the plain expression and carry no hint.
#ifdef __GNUC__
#define LIKELY(v) __builtin_expect(v, 1)
#define UNLIKELY(v) __builtin_expect(v, 0)
#else
#define LIKELY(v) (v)
#define UNLIKELY(v) (v)
#endif
// Exchanges the values of the two lvalues a and b, both of type `type`.
// The temporary uses a name callers are unlikely to pass in, so that swapping
// a variable that happens to be called `c` is not shadowed by the macro's own
// local, and the arguments are parenthesized so that expressions such as
// *p or arr[i] expand safely. Each argument is evaluated twice; avoid
// arguments with side effects.
#define AOM_SWAP(type, a, b)  \
  do {                        \
    type aom_swap_tmp_ = (b); \
    (b) = (a);                \
    (a) = aom_swap_tmp_;      \
  } while (0)
// Only available when CONFIG_AOM_QM is enabled.
// NOTE(review): qm_val_t is presumably the element type of the quantization
// matrices and AOM_QM_BITS their fixed-point precision -- confirm with users.
#if CONFIG_AOM_QM
typedef uint16_t qm_val_t;
#define AOM_QM_BITS 6
#endif
// Transform coefficient types. High-bit-depth builds use 64/32-bit types;
// all other builds use 32/16-bit types.
#if CONFIG_AOM_HIGHBITDEPTH
// Note:
// tran_low_t is the datatype used for final transform coefficients.
// tran_high_t is the datatype used for intermediate transform stages.
typedef int64_t tran_high_t;
typedef int32_t tran_low_t;
#else
// Note:
// tran_low_t is the datatype used for final transform coefficients.
// tran_high_t is the datatype used for intermediate transform stages.
typedef int32_t tran_high_t;
typedef int16_t tran_low_t;
#endif // CONFIG_AOM_HIGHBITDEPTH
// Saturates val to the 8-bit pixel range [0, 255].
static INLINE uint8_t clip_pixel(int val) {
  if (val > 255) return 255;
  if (val < 0) return 0;
  return (uint8_t)val;
}
// Limits value to [low, high]. The lower bound is tested first, so low is
// returned whenever value < low, even if low > high.
static INLINE int clamp(int value, int low, int high) {
  if (value < low) return low;
  if (value > high) return high;
  return value;
}
// Floating-point counterpart of clamp(): limits value to [low, high], testing
// the lower bound first. A NaN value fails both comparisons and is returned
// unchanged.
static INLINE double fclamp(double value, double low, double high) {
  if (value < low) return low;
  if (value > high) return high;
  return value;
}
#if CONFIG_AOM_HIGHBITDEPTH
// Saturates val to the pixel range of the given bit depth: [0, 1023] for
// bd == 10, [0, 4095] for bd == 12, and [0, 255] for 8 or any other value.
static INLINE uint16_t clip_pixel_highbd(int val, int bd) {
  int max_pixel = 255;  // bd == 8 and every unrecognised depth
  if (bd == 10) {
    max_pixel = 1023;
  } else if (bd == 12) {
    max_pixel = 4095;
  }
  return (uint16_t)clamp(val, 0, max_pixel);
}
#endif // CONFIG_AOM_HIGHBITDEPTH
#ifdef __cplusplus
} // extern "C"
#endif
#endif // AOM_DSP_AOM_DSP_COMMON_H_

View File

@@ -1,16 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include "./aom_config.h"
#define RTCD_C
#include "./aom_dsp_rtcd.h"
#include "aom_ports/aom_once.h"
// Entry point for the aom_dsp run-time CPU detection setup: hands
// setup_rtcd_internal to once() (aom_ports/aom_once.h), which presumably
// ensures the setup runs a single time -- confirm in aom_once.h.
// Declared with (void) so that the definition is a proper prototype in C.
void aom_dsp_rtcd(void) { once(setup_rtcd_internal); }

File diff suppressed because it is too large Load Diff

View File

@@ -1,43 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifndef AOM_DSP_AOM_FILTER_H_
#define AOM_DSP_AOM_FILTER_H_
#include "aom/aom_integer.h"
#ifdef __cplusplus
extern "C" {
#endif
// Fixed-point constants for the interpolation filters.
// NOTE(review): FILTER_BITS is presumably the coefficient precision (taps
// summing to 1 << FILTER_BITS) -- confirm against the filter tables.
#define FILTER_BITS 7
// Sub-pixel positions use SUBPEL_BITS fractional bits: SUBPEL_SHIFTS (16)
// positions, selected with SUBPEL_MASK (0xf).
#define SUBPEL_BITS 4
#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
#define SUBPEL_SHIFTS (1 << SUBPEL_BITS)
// Number of taps in one interpolation kernel.
#define SUBPEL_TAPS 8
// One kernel: SUBPEL_TAPS signed 16-bit coefficients.
typedef int16_t InterpKernel[SUBPEL_TAPS];
// The bilinear filters use 3 fractional bits, i.e. 8 sub-pixel positions.
#define BIL_SUBPEL_BITS 3
#define BIL_SUBPEL_SHIFTS (1 << BIL_SUBPEL_BITS)
// 2 tap bilinear filters, one { tap0, tap1 } pair per sub-pixel position.
// Every pair sums to 128; tap1 grows by 16 per position.
static const uint8_t bilinear_filters_2t[BIL_SUBPEL_SHIFTS][2] = {
  { 128, 0 },   // position 0
  { 112, 16 },  // position 1
  { 96, 32 },   // position 2
  { 80, 48 },   // position 3
  { 64, 64 },   // position 4
  { 48, 80 },   // position 5
  { 32, 96 },   // position 6
  { 16, 112 },  // position 7
};
#ifdef __cplusplus
} // extern "C"
#endif
#endif // AOM_DSP_AOM_FILTER_H_

View File

@@ -1,13 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
// Set to 1 to add some sanity checks in the fallback C code.
// Defined here with external linkage; currently enabled (1).
const int simd_check = 1;

View File

@@ -1,32 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifndef AOM_DSP_AOM_AOM_SIMD_H_
#define AOM_DSP_AOM_AOM_SIMD_H_
#include <stdint.h>
#if defined(_WIN32)
#include <intrin.h>
#endif
#include "./aom_config.h"
#include "./aom_simd_inline.h"
#if HAVE_NEON
#include "simd/v256_intrinsics_arm.h"
#elif HAVE_SSE2
#include "simd/v256_intrinsics_x86.h"
#else
#include "simd/v256_intrinsics.h"
#endif
#endif // AOM_DSP_AOM_AOM_SIMD_H_

View File

@@ -1,21 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifndef AOM_DSP_AOM_SIMD_INLINE_H_
#define AOM_DSP_AOM_SIMD_INLINE_H_
#include "aom/aom_integer.h"
// Qualifier placed on the SIMD wrapper functions. It can be overridden by
// defining SIMD_INLINE before including this header; the default is a
// static, force-inlined function.
#ifndef SIMD_INLINE
#define SIMD_INLINE static AOM_FORCE_INLINE
#endif
#endif // AOM_DSP_AOM_SIMD_INLINE_H_

View File

@@ -1,364 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <arm_neon.h>
#include <assert.h>
#include "./aom_config.h"
#include "./aom_dsp_rtcd.h"
#include "aom/aom_integer.h"
#include "aom_ports/mem.h"
// Computes, per 32-bit lane, the 8-term dot product
//   dsrc0 * tap[0] + dsrc1 * tap[1] + ... + dsrc7 * tap[7]
// where tap[0..7] are the eight 16-bit lanes of q0s16. The products are
// widened to 32 bits before being accumulated.
static INLINE int32x4_t MULTIPLY_BY_Q0(int16x4_t dsrc0, int16x4_t dsrc1,
                                       int16x4_t dsrc2, int16x4_t dsrc3,
                                       int16x4_t dsrc4, int16x4_t dsrc5,
                                       int16x4_t dsrc6, int16x4_t dsrc7,
                                       int16x8_t q0s16) {
  const int16x4_t taps_lo = vget_low_s16(q0s16);   // tap[0..3]
  const int16x4_t taps_hi = vget_high_s16(q0s16);  // tap[4..7]
  int32x4_t acc = vmull_lane_s16(dsrc0, taps_lo, 0);

  acc = vmlal_lane_s16(acc, dsrc1, taps_lo, 1);
  acc = vmlal_lane_s16(acc, dsrc2, taps_lo, 2);
  acc = vmlal_lane_s16(acc, dsrc3, taps_lo, 3);
  acc = vmlal_lane_s16(acc, dsrc4, taps_hi, 0);
  acc = vmlal_lane_s16(acc, dsrc5, taps_hi, 1);
  acc = vmlal_lane_s16(acc, dsrc6, taps_hi, 2);
  acc = vmlal_lane_s16(acc, dsrc7, taps_hi, 3);
  return acc;
}
// Horizontal 8-tap convolution (NEON) whose filtered result is combined with
// the pixels already in dst using a rounding average (vrhaddq_u8).
//
// src/src_stride and dst/dst_stride describe the source and destination
// buffers. filter_x points at the 8 filter taps, which are loaded with
// vld1q_s16. x_step_q4 must be 16 (asserted). filter_y and y_step_q4 are
// unused. w and h are the block width and height.
//
// The block is processed in 4x4 tiles: the outer loop steps h by 4 and the
// inner loop steps the width by 4, so w and h are assumed to be multiples of
// 4 -- TODO confirm that callers guarantee this.
void aom_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
                                  uint8_t *dst, ptrdiff_t dst_stride,
                                  const int16_t *filter_x, int x_step_q4,
                                  const int16_t *filter_y,  // unused
                                  int y_step_q4,            // unused
                                  int w, int h) {
  int width;
  const uint8_t *s;
  uint8_t *d;
  uint8x8_t d2u8, d3u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8;
  uint32x2_t d2u32, d3u32, d28u32, d29u32, d30u32, d31u32;
  // d6u32/d7u32 are filled one lane at a time with vld1_lane_u32, which takes
  // the previous vector value as an input. Start them from a defined value so
  // that the first lane load does not read an uninitialized variable.
  uint32x2_t d6u32 = vdup_n_u32(0);
  uint32x2_t d7u32 = vdup_n_u32(0);
  uint8x16_t q1u8, q3u8, q12u8, q13u8, q14u8, q15u8;
  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d22s16, d23s16;
  int16x4_t d24s16, d25s16, d26s16, d27s16;
  uint16x4_t d2u16, d3u16, d4u16, d5u16, d16u16, d17u16, d18u16, d19u16;
  int16x8_t q0s16;
  uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
  int32x4_t q1s32, q2s32, q14s32, q15s32;
  uint16x8x2_t q0x2u16;
  uint8x8x2_t d0x2u8, d1x2u8;
  uint32x2x2_t d0x2u32;
  uint16x4x2_t d0x2u16, d1x2u16;
  uint32x4x2_t q0x2u32;

  assert(x_step_q4 == 16);

  (void)x_step_q4;
  (void)y_step_q4;
  (void)filter_y;

  q0s16 = vld1q_s16(filter_x);

  src -= 3;  // adjust for taps
  for (; h > 0; h -= 4) {  // loop_horiz_v
    // Load the first 8 pixels of 4 rows and transpose them so that each
    // vector holds one pixel column of the 4 rows.
    s = src;
    d24u8 = vld1_u8(s);
    s += src_stride;
    d25u8 = vld1_u8(s);
    s += src_stride;
    d26u8 = vld1_u8(s);
    s += src_stride;
    d27u8 = vld1_u8(s);

    q12u8 = vcombine_u8(d24u8, d25u8);
    q13u8 = vcombine_u8(d26u8, d27u8);

    q0x2u16 =
        vtrnq_u16(vreinterpretq_u16_u8(q12u8), vreinterpretq_u16_u8(q13u8));
    d24u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[0]));
    d25u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[0]));
    d26u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[1]));
    d27u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[1]));
    d0x2u8 = vtrn_u8(d24u8, d25u8);
    d1x2u8 = vtrn_u8(d26u8, d27u8);

    __builtin_prefetch(src + src_stride * 4);
    __builtin_prefetch(src + src_stride * 5);

    q8u16 = vmovl_u8(d0x2u8.val[0]);
    q9u16 = vmovl_u8(d0x2u8.val[1]);
    q10u16 = vmovl_u8(d1x2u8.val[0]);
    q11u16 = vmovl_u8(d1x2u8.val[1]);

    src += 7;
    d16u16 = vget_low_u16(q8u16);
    d17u16 = vget_high_u16(q8u16);
    d18u16 = vget_low_u16(q9u16);
    d19u16 = vget_high_u16(q9u16);
    q8u16 = vcombine_u16(d16u16, d18u16);  // vswp 17 18
    q9u16 = vcombine_u16(d17u16, d19u16);

    d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
    d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));  // vmov 23 21
    for (width = w; width > 0; width -= 4, src += 4, dst += 4) {  // loop_horiz
      // Load the next 4 pixels of each of the 4 rows and transpose them.
      s = src;
      d28u32 = vld1_dup_u32((const uint32_t *)s);
      s += src_stride;
      d29u32 = vld1_dup_u32((const uint32_t *)s);
      s += src_stride;
      d31u32 = vld1_dup_u32((const uint32_t *)s);
      s += src_stride;
      d30u32 = vld1_dup_u32((const uint32_t *)s);

      __builtin_prefetch(src + 64);

      d0x2u16 =
          vtrn_u16(vreinterpret_u16_u32(d28u32), vreinterpret_u16_u32(d31u32));
      d1x2u16 =
          vtrn_u16(vreinterpret_u16_u32(d29u32), vreinterpret_u16_u32(d30u32));
      d0x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[0]),   // d28
                       vreinterpret_u8_u16(d1x2u16.val[0]));  // d29
      d1x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[1]),   // d31
                       vreinterpret_u8_u16(d1x2u16.val[1]));  // d30

      __builtin_prefetch(src + 64 + src_stride);

      q14u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]);
      q15u8 = vcombine_u8(d1x2u8.val[1], d1x2u8.val[0]);
      q0x2u32 =
          vtrnq_u32(vreinterpretq_u32_u8(q14u8), vreinterpretq_u32_u8(q15u8));

      d28u8 = vreinterpret_u8_u32(vget_low_u32(q0x2u32.val[0]));
      d29u8 = vreinterpret_u8_u32(vget_high_u32(q0x2u32.val[0]));
      q12u16 = vmovl_u8(d28u8);
      q13u16 = vmovl_u8(d29u8);

      __builtin_prefetch(src + 64 + src_stride * 2);

      // Load the 4x4 destination tile that the result is averaged with.
      // Rows 0 and 2 go to d6u32, rows 1 and 3 to d7u32.
      d = dst;
      d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 0);
      d += dst_stride;
      d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 0);
      d += dst_stride;
      d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 1);
      d += dst_stride;
      d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 1);

      d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
      d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
      d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
      d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
      d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
      d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
      d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
      d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
      d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));

      // One 8-tap dot product per output column.
      q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16, d18s16, d19s16,
                             d23s16, d24s16, q0s16);
      q2s32 = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16, d19s16, d23s16,
                             d24s16, d26s16, q0s16);
      q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16, d23s16, d24s16,
                              d26s16, d27s16, q0s16);
      q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16, d24s16, d26s16,
                              d27s16, d25s16, q0s16);

      __builtin_prefetch(src + 64 + src_stride * 3);

      // Rounding right shift by 7 with unsigned saturation, then narrow to
      // 8 bits.
      d2u16 = vqrshrun_n_s32(q1s32, 7);
      d3u16 = vqrshrun_n_s32(q2s32, 7);
      d4u16 = vqrshrun_n_s32(q14s32, 7);
      d5u16 = vqrshrun_n_s32(q15s32, 7);

      q1u16 = vcombine_u16(d2u16, d3u16);
      q2u16 = vcombine_u16(d4u16, d5u16);

      d2u8 = vqmovn_u16(q1u16);
      d3u8 = vqmovn_u16(q2u16);

      // Transpose the results back to row order.
      d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8), vreinterpret_u16_u8(d3u8));
      d0x2u32 = vtrn_u32(vreinterpret_u32_u16(d0x2u16.val[0]),
                         vreinterpret_u32_u16(d0x2u16.val[1]));
      d0x2u8 = vtrn_u8(vreinterpret_u8_u32(d0x2u32.val[0]),
                       vreinterpret_u8_u32(d0x2u32.val[1]));

      q1u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]);
      q3u8 = vreinterpretq_u8_u32(vcombine_u32(d6u32, d7u32));

      // Rounding average with the existing destination pixels.
      q1u8 = vrhaddq_u8(q1u8, q3u8);

      d2u32 = vreinterpret_u32_u8(vget_low_u8(q1u8));
      d3u32 = vreinterpret_u32_u8(vget_high_u8(q1u8));

      d = dst;
      vst1_lane_u32((uint32_t *)d, d2u32, 0);
      d += dst_stride;
      vst1_lane_u32((uint32_t *)d, d3u32, 0);
      d += dst_stride;
      vst1_lane_u32((uint32_t *)d, d2u32, 1);
      d += dst_stride;
      vst1_lane_u32((uint32_t *)d, d3u32, 1);

      // Slide the window of input columns 4 pixels to the right.
      q8u16 = q9u16;
      d20s16 = d23s16;
      q11u16 = q12u16;
      q9u16 = q13u16;
      d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
    }
    // Rewind the horizontal advance and move down 4 rows.
    src += src_stride * 4 - w - 7;
    dst += dst_stride * 4 - w;
  }
}
// Vertical 8-tap convolution (NEON) whose filtered result is combined with
// the pixels already in dst using a rounding average (vrhaddq_u8).
//
// src/src_stride and dst/dst_stride describe the source and destination
// buffers. filter_y points at the 8 filter taps, which are loaded with
// vld1q_s16. y_step_q4 must be 16 (asserted). filter_x and x_step_q4 are
// unused. w and h are the block width and height.
//
// The block is processed in 4-pixel-wide columns, 4 output rows per inner
// iteration, so w and h are assumed to be multiples of 4 -- TODO confirm that
// callers guarantee this.
void aom_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
                                 uint8_t *dst, ptrdiff_t dst_stride,
                                 const int16_t *filter_x,  // unused
                                 int x_step_q4,            // unused
                                 const int16_t *filter_y, int y_step_q4, int w,
                                 int h) {
  int height;
  const uint8_t *s;
  uint8_t *d;
  uint8x8_t d2u8, d3u8;
  uint32x2_t d2u32, d3u32;
  // The vectors below are filled one lane at a time with vld1_lane_u32, which
  // takes the previous vector value as an input. Start them from a defined
  // value so that the first lane load does not read an uninitialized
  // variable. d22u32 only ever has lane 0 loaded; its other lane is never
  // used in the result.
  uint32x2_t d6u32 = vdup_n_u32(0);
  uint32x2_t d7u32 = vdup_n_u32(0);
  uint32x2_t d16u32 = vdup_n_u32(0);
  uint32x2_t d18u32 = vdup_n_u32(0);
  uint32x2_t d20u32 = vdup_n_u32(0);
  uint32x2_t d22u32 = vdup_n_u32(0);
  uint32x2_t d24u32 = vdup_n_u32(0);
  uint32x2_t d26u32 = vdup_n_u32(0);
  uint8x16_t q1u8, q3u8;
  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16;
  int16x4_t d24s16, d25s16, d26s16, d27s16;
  uint16x4_t d2u16, d3u16, d4u16, d5u16;
  int16x8_t q0s16;
  uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
  int32x4_t q1s32, q2s32, q14s32, q15s32;

  assert(y_step_q4 == 16);

  (void)x_step_q4;
  (void)y_step_q4;
  (void)filter_x;

  src -= src_stride * 3;  // adjust for taps
  q0s16 = vld1q_s16(filter_y);
  for (; w > 0; w -= 4, src += 4, dst += 4) {  // loop_vert_h
    // Prime the window with the first 7 input rows (4 pixels each).
    s = src;
    d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 0);
    s += src_stride;
    d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 1);
    s += src_stride;
    d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 0);
    s += src_stride;
    d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 1);
    s += src_stride;
    d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 0);
    s += src_stride;
    d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 1);
    s += src_stride;
    d22u32 = vld1_lane_u32((const uint32_t *)s, d22u32, 0);
    s += src_stride;

    q8u16 = vmovl_u8(vreinterpret_u8_u32(d16u32));
    q9u16 = vmovl_u8(vreinterpret_u8_u32(d18u32));
    q10u16 = vmovl_u8(vreinterpret_u8_u32(d20u32));
    q11u16 = vmovl_u8(vreinterpret_u8_u32(d22u32));

    d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
    d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
    d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
    d = dst;
    for (height = h; height > 0; height -= 4) {  // loop_vert
      // Load the next 4 input rows.
      d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 0);
      s += src_stride;
      d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 0);
      s += src_stride;
      d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 1);
      s += src_stride;
      d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 1);
      s += src_stride;

      q12u16 = vmovl_u8(vreinterpret_u8_u32(d24u32));
      q13u16 = vmovl_u8(vreinterpret_u8_u32(d26u32));

      // Load the 4 destination rows that the result is averaged with, then
      // rewind d to the first of them.
      d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 0);
      d += dst_stride;
      d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 1);
      d += dst_stride;
      d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 0);
      d += dst_stride;
      d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 1);
      d -= dst_stride * 3;

      d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
      d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
      d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
      d21s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));
      d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
      d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
      d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
      d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));

      // One 8-tap dot product per output row, interleaved with prefetches.
      __builtin_prefetch(s);
      __builtin_prefetch(s + src_stride);
      q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16, d20s16, d21s16,
                             d22s16, d24s16, q0s16);
      __builtin_prefetch(s + src_stride * 2);
      __builtin_prefetch(s + src_stride * 3);
      q2s32 = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16, d21s16, d22s16,
                             d24s16, d26s16, q0s16);
      __builtin_prefetch(d);
      __builtin_prefetch(d + dst_stride);
      q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16, d22s16, d24s16,
                              d26s16, d27s16, q0s16);
      __builtin_prefetch(d + dst_stride * 2);
      __builtin_prefetch(d + dst_stride * 3);
      q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16, d24s16, d26s16,
                              d27s16, d25s16, q0s16);

      // Rounding right shift by 7 with unsigned saturation, then narrow to
      // 8 bits.
      d2u16 = vqrshrun_n_s32(q1s32, 7);
      d3u16 = vqrshrun_n_s32(q2s32, 7);
      d4u16 = vqrshrun_n_s32(q14s32, 7);
      d5u16 = vqrshrun_n_s32(q15s32, 7);

      q1u16 = vcombine_u16(d2u16, d3u16);
      q2u16 = vcombine_u16(d4u16, d5u16);

      d2u8 = vqmovn_u16(q1u16);
      d3u8 = vqmovn_u16(q2u16);

      q1u8 = vcombine_u8(d2u8, d3u8);
      q3u8 = vreinterpretq_u8_u32(vcombine_u32(d6u32, d7u32));

      // Rounding average with the existing destination pixels.
      q1u8 = vrhaddq_u8(q1u8, q3u8);

      d2u32 = vreinterpret_u32_u8(vget_low_u8(q1u8));
      d3u32 = vreinterpret_u32_u8(vget_high_u8(q1u8));

      vst1_lane_u32((uint32_t *)d, d2u32, 0);
      d += dst_stride;
      vst1_lane_u32((uint32_t *)d, d2u32, 1);
      d += dst_stride;
      vst1_lane_u32((uint32_t *)d, d3u32, 0);
      d += dst_stride;
      vst1_lane_u32((uint32_t *)d, d3u32, 1);
      d += dst_stride;

      // Slide the window of input rows down by 4.
      q8u16 = q10u16;
      d18s16 = d22s16;
      d19s16 = d24s16;
      q10u16 = q13u16;
      d22s16 = d25s16;
    }
  }
}

View File

@@ -1,331 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <arm_neon.h>
#include <assert.h>
#include "./aom_config.h"
#include "./aom_dsp_rtcd.h"
#include "aom/aom_integer.h"
#include "aom_ports/mem.h"
/*
 * Eight-tap multiply-accumulate across four lanes.
 *
 * dsrc0..dsrc7 are the eight input vectors; q0s16 holds the eight taps.
 * Lane i of the result is sum over k of dsrcK[i] * q0s16[k], accumulated in
 * 32 bits.
 */
static INLINE int32x4_t MULTIPLY_BY_Q0(int16x4_t dsrc0, int16x4_t dsrc1,
                                       int16x4_t dsrc2, int16x4_t dsrc3,
                                       int16x4_t dsrc4, int16x4_t dsrc5,
                                       int16x4_t dsrc6, int16x4_t dsrc7,
                                       int16x8_t q0s16) {
  const int16x4_t taps_lo = vget_low_s16(q0s16);   // taps 0..3
  const int16x4_t taps_hi = vget_high_s16(q0s16);  // taps 4..7
  int32x4_t acc = vmull_lane_s16(dsrc0, taps_lo, 0);

  acc = vmlal_lane_s16(acc, dsrc1, taps_lo, 1);
  acc = vmlal_lane_s16(acc, dsrc2, taps_lo, 2);
  acc = vmlal_lane_s16(acc, dsrc3, taps_lo, 3);
  acc = vmlal_lane_s16(acc, dsrc4, taps_hi, 0);
  acc = vmlal_lane_s16(acc, dsrc5, taps_hi, 1);
  acc = vmlal_lane_s16(acc, dsrc6, taps_hi, 2);
  acc = vmlal_lane_s16(acc, dsrc7, taps_hi, 3);
  return acc;
}
/*
 * Horizontal 8-tap convolution using NEON intrinsics, unit step only
 * (x_step_q4 == 16 is asserted).
 *
 * src, src_stride: source pixels and row pitch. Reads start 3 pixels to the
 * left of src on every row.
 * dst, dst_stride: destination pixels and row pitch.
 * filter_x: the eight 16-bit taps (loaded with one vld1q_s16).
 * filter_y, y_step_q4: unused.
 * w, h: block size. The block is processed in 4x4 tiles: both loops
 * step by 4 and run while their counter is > 0, so a size that is not
 * a multiple of 4 is rounded up and the extra rows/columns are read
 * and written as well.
 *
 * Each output pixel is the tap-weighted sum of 8 neighbouring source pixels,
 * rounded and shifted right by 7, then saturated to 8 bits.
 */
void aom_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, // unused
int y_step_q4, // unused
int w, int h) {
int width;
const uint8_t *s, *psrc;
uint8_t *d, *pdst;
uint8x8_t d2u8, d3u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8;
uint32x2_t d2u32, d3u32, d28u32, d29u32, d30u32, d31u32;
uint8x16_t q12u8, q13u8, q14u8, q15u8;
int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d22s16, d23s16;
int16x4_t d24s16, d25s16, d26s16, d27s16;
uint16x4_t d2u16, d3u16, d4u16, d5u16, d16u16, d17u16, d18u16, d19u16;
int16x8_t q0s16;
uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
int32x4_t q1s32, q2s32, q14s32, q15s32;
uint16x8x2_t q0x2u16;
uint8x8x2_t d0x2u8, d1x2u8;
uint32x2x2_t d0x2u32;
uint16x4x2_t d0x2u16, d1x2u16;
uint32x4x2_t q0x2u32;
assert(x_step_q4 == 16);
(void)x_step_q4;
(void)y_step_q4;
(void)filter_y;
// All eight filter taps live in one Q register for the whole call.
q0s16 = vld1q_s16(filter_x);
src -= 3; // adjust for taps
// Outer loop: one band of 4 rows per iteration.
for (; h > 0; h -= 4, src += src_stride * 4,
dst += dst_stride * 4) { // loop_horiz_v
// Load the first 8 pixels of each of the 4 rows.
s = src;
d24u8 = vld1_u8(s);
s += src_stride;
d25u8 = vld1_u8(s);
s += src_stride;
d26u8 = vld1_u8(s);
s += src_stride;
d27u8 = vld1_u8(s);
// Transpose the 4x8 block so that each group of 4 lanes holds one source
// column across the 4 rows.
q12u8 = vcombine_u8(d24u8, d25u8);
q13u8 = vcombine_u8(d26u8, d27u8);
q0x2u16 =
vtrnq_u16(vreinterpretq_u16_u8(q12u8), vreinterpretq_u16_u8(q13u8));
d24u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[0]));
d25u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[0]));
d26u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[1]));
d27u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[1]));
d0x2u8 = vtrn_u8(d24u8, d25u8);
d1x2u8 = vtrn_u8(d26u8, d27u8);
// Hint the cache about the next band of rows.
__builtin_prefetch(src + src_stride * 4);
__builtin_prefetch(src + src_stride * 5);
__builtin_prefetch(src + src_stride * 6);
// Widen to 16 bits. After this q8 = columns 0|4, q9 = columns 1|5,
// q10 = columns 2|6 and q11 = columns 3|7 (low half | high half).
q8u16 = vmovl_u8(d0x2u8.val[0]);
q9u16 = vmovl_u8(d0x2u8.val[1]);
q10u16 = vmovl_u8(d1x2u8.val[0]);
q11u16 = vmovl_u8(d1x2u8.val[1]);
d16u16 = vget_low_u16(q8u16);
d17u16 = vget_high_u16(q8u16);
d18u16 = vget_low_u16(q9u16);
d19u16 = vget_high_u16(q9u16);
// Regroup so q8 = columns 0|1 and q9 = columns 4|5.
q8u16 = vcombine_u16(d16u16, d18u16); // vswp 17 18
q9u16 = vcombine_u16(d17u16, d19u16);
d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16)); // vmov 23 21
// Inner loop: one 4x4 output tile per iteration. Columns 0..6 of the filter
// window are carried in registers; 4 new columns are loaded each time.
for (width = w, psrc = src + 7, pdst = dst; width > 0;
width -= 4, psrc += 4, pdst += 4) { // loop_horiz
// Load 4 new pixels from each row; each load is duplicated into both lanes.
// Note the register order: the third row goes to d31, the fourth to d30.
s = psrc;
d28u32 = vld1_dup_u32((const uint32_t *)s);
s += src_stride;
d29u32 = vld1_dup_u32((const uint32_t *)s);
s += src_stride;
d31u32 = vld1_dup_u32((const uint32_t *)s);
s += src_stride;
d30u32 = vld1_dup_u32((const uint32_t *)s);
__builtin_prefetch(psrc + 64);
// Transpose the 4x4 block of new pixels into column order.
d0x2u16 =
vtrn_u16(vreinterpret_u16_u32(d28u32), vreinterpret_u16_u32(d31u32));
d1x2u16 =
vtrn_u16(vreinterpret_u16_u32(d29u32), vreinterpret_u16_u32(d30u32));
d0x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[0]), // d28
vreinterpret_u8_u16(d1x2u16.val[0])); // d29
d1x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[1]), // d31
vreinterpret_u8_u16(d1x2u16.val[1])); // d30
__builtin_prefetch(psrc + 64 + src_stride);
q14u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]);
q15u8 = vcombine_u8(d1x2u8.val[1], d1x2u8.val[0]);
q0x2u32 =
vtrnq_u32(vreinterpretq_u32_u8(q14u8), vreinterpretq_u32_u8(q15u8));
d28u8 = vreinterpret_u8_u32(vget_low_u32(q0x2u32.val[0]));
d29u8 = vreinterpret_u8_u32(vget_high_u32(q0x2u32.val[0]));
// Widen: q12 = new columns 0|3, q13 = new columns 1|2 (low | high).
q12u16 = vmovl_u8(d28u8);
q13u16 = vmovl_u8(d29u8);
__builtin_prefetch(psrc + 64 + src_stride * 2);
// Split everything into 4-lane signed vectors, one per source column.
d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
// Four output columns; each call slides the 8-column window right by one.
q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16, d18s16, d19s16,
d23s16, d24s16, q0s16);
q2s32 = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16, d19s16, d23s16,
d24s16, d26s16, q0s16);
q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16, d23s16, d24s16,
d26s16, d27s16, q0s16);
q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16, d24s16, d26s16,
d27s16, d25s16, q0s16);
// NOTE(review): the offset is 60 here but 64 in the three prefetches above;
// possibly a typo. It is only a prefetch hint, so results are unaffected.
__builtin_prefetch(psrc + 60 + src_stride * 3);
// Round, shift right by 7 and saturate to unsigned 16 bits, then to 8 bits.
d2u16 = vqrshrun_n_s32(q1s32, 7);
d3u16 = vqrshrun_n_s32(q2s32, 7);
d4u16 = vqrshrun_n_s32(q14s32, 7);
d5u16 = vqrshrun_n_s32(q15s32, 7);
q1u16 = vcombine_u16(d2u16, d3u16);
q2u16 = vcombine_u16(d4u16, d5u16);
d2u8 = vqmovn_u16(q1u16);
d3u8 = vqmovn_u16(q2u16);
// Transpose the 4x4 result tile before it is stored one dst row at a time.
d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8), vreinterpret_u16_u8(d3u8));
d0x2u32 = vtrn_u32(vreinterpret_u32_u16(d0x2u16.val[0]),
vreinterpret_u32_u16(d0x2u16.val[1]));
d0x2u8 = vtrn_u8(vreinterpret_u8_u32(d0x2u32.val[0]),
vreinterpret_u8_u32(d0x2u32.val[1]));
d2u32 = vreinterpret_u32_u8(d0x2u8.val[0]);
d3u32 = vreinterpret_u32_u8(d0x2u8.val[1]);
// Store 4 bytes to each of 4 consecutive dst rows.
d = pdst;
vst1_lane_u32((uint32_t *)d, d2u32, 0);
d += dst_stride;
vst1_lane_u32((uint32_t *)d, d3u32, 0);
d += dst_stride;
vst1_lane_u32((uint32_t *)d, d2u32, 1);
d += dst_stride;
vst1_lane_u32((uint32_t *)d, d3u32, 1);
// Slide the window 4 columns right: old columns 4..10 become columns 0..6.
q8u16 = q9u16;
d20s16 = d23s16;
q11u16 = q12u16;
q9u16 = q13u16;
d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
}
}
return;
}
/*
 * Vertical 8-tap convolution using NEON intrinsics, unit step only
 * (y_step_q4 == 16 is asserted).
 *
 * src, src_stride: source pixels and row pitch. Reads start 3 rows above src.
 * dst, dst_stride: destination pixels and row pitch.
 * filter_y:        the eight 16-bit taps (loaded with one vld1q_s16).
 * filter_x, x_step_q4: unused.
 * w, h:            block size. The block is processed in 4x4 tiles: both
 *                  loops step by 4 and run while their counter is > 0, so a
 *                  size that is not a multiple of 4 is rounded up.
 *
 * Each output pixel is the tap-weighted sum of 8 vertically adjacent source
 * pixels, rounded and shifted right by 7, then saturated to 8 bits.
 */
void aom_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const int16_t *filter_x,  // unused
                             int x_step_q4,            // unused
                             const int16_t *filter_y, int y_step_q4, int w,
                             int h) {
  int height;
  const uint8_t *s;
  uint8_t *d;
  uint32x2_t d2u32, d3u32;
  // vld1_lane_u32() takes the previous vector value as an input, so these
  // must be initialised before the first lane load (lane 1 of d22u32 is
  // never loaded at all).
  uint32x2_t d16u32 = vdup_n_u32(0), d18u32 = vdup_n_u32(0);
  uint32x2_t d20u32 = vdup_n_u32(0), d22u32 = vdup_n_u32(0);
  uint32x2_t d24u32 = vdup_n_u32(0), d26u32 = vdup_n_u32(0);
  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16;
  int16x4_t d24s16, d25s16, d26s16, d27s16;
  uint16x4_t d2u16, d3u16, d4u16, d5u16;
  int16x8_t q0s16;
  uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
  int32x4_t q1s32, q2s32, q14s32, q15s32;

  assert(y_step_q4 == 16);
  (void)x_step_q4;
  (void)y_step_q4;
  (void)filter_x;

  src -= src_stride * 3;  // adjust for taps
  q0s16 = vld1q_s16(filter_y);

  // Outer loop: one strip of 4 columns per iteration.
  for (; w > 0; w -= 4, src += 4, dst += 4) {  // loop_vert_h
    // Prime the window with source rows 0..6, two rows per D register.
    s = src;
    d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 0);
    s += src_stride;
    d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 1);
    s += src_stride;
    d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 0);
    s += src_stride;
    d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 1);
    s += src_stride;
    d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 0);
    s += src_stride;
    d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 1);
    s += src_stride;
    d22u32 = vld1_lane_u32((const uint32_t *)s, d22u32, 0);
    s += src_stride;

    // Widen to 16 bits: q8 = rows 0|1, q9 = rows 2|3, q10 = rows 4|5,
    // q11 low half = row 6.
    q8u16 = vmovl_u8(vreinterpret_u8_u32(d16u32));
    q9u16 = vmovl_u8(vreinterpret_u8_u32(d18u32));
    q10u16 = vmovl_u8(vreinterpret_u8_u32(d20u32));
    q11u16 = vmovl_u8(vreinterpret_u8_u32(d22u32));

    d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
    d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
    d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
    d = dst;

    // Inner loop: one 4x4 output tile per iteration.
    for (height = h; height > 0; height -= 4) {  // loop_vert
      // Load 4 new rows: d24 = new rows 0|3, d26 = new rows 1|2.
      d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 0);
      s += src_stride;
      d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 0);
      s += src_stride;
      d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 1);
      s += src_stride;
      d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 1);
      s += src_stride;

      q12u16 = vmovl_u8(vreinterpret_u8_u32(d24u32));
      q13u16 = vmovl_u8(vreinterpret_u8_u32(d26u32));

      d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
      d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
      d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
      d21s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));
      d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
      d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
      d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
      d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));

      // Four output rows; each call slides the 8-row window down by one.
      // Prefetches of the destination and of the next source rows are
      // interleaved with the arithmetic.
      __builtin_prefetch(d);
      __builtin_prefetch(d + dst_stride);
      q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16, d20s16, d21s16,
                             d22s16, d24s16, q0s16);
      __builtin_prefetch(d + dst_stride * 2);
      __builtin_prefetch(d + dst_stride * 3);
      q2s32 = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16, d21s16, d22s16,
                             d24s16, d26s16, q0s16);
      __builtin_prefetch(s);
      __builtin_prefetch(s + src_stride);
      q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16, d22s16, d24s16,
                              d26s16, d27s16, q0s16);
      __builtin_prefetch(s + src_stride * 2);
      __builtin_prefetch(s + src_stride * 3);
      q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16, d24s16, d26s16,
                              d27s16, d25s16, q0s16);

      // Round, shift right by 7, saturate to unsigned 16 bits, then 8 bits.
      d2u16 = vqrshrun_n_s32(q1s32, 7);
      d3u16 = vqrshrun_n_s32(q2s32, 7);
      d4u16 = vqrshrun_n_s32(q14s32, 7);
      d5u16 = vqrshrun_n_s32(q15s32, 7);

      q1u16 = vcombine_u16(d2u16, d3u16);
      q2u16 = vcombine_u16(d4u16, d5u16);

      d2u32 = vreinterpret_u32_u8(vqmovn_u16(q1u16));
      d3u32 = vreinterpret_u32_u8(vqmovn_u16(q2u16));

      // Store 4 bytes to each of 4 consecutive dst rows.
      vst1_lane_u32((uint32_t *)d, d2u32, 0);
      d += dst_stride;
      vst1_lane_u32((uint32_t *)d, d2u32, 1);
      d += dst_stride;
      vst1_lane_u32((uint32_t *)d, d3u32, 0);
      d += dst_stride;
      vst1_lane_u32((uint32_t *)d, d3u32, 1);
      d += dst_stride;

      // Slide the window 4 rows down: old rows 4..10 become rows 0..6.
      q8u16 = q10u16;
      d18s16 = d22s16;
      d19s16 = d24s16;
      q10u16 = q13u16;
      d22s16 = d25s16;
    }
  }
  return;
}

View File

@@ -1,145 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <arm_neon.h>
#include "./aom_dsp_rtcd.h"
#include "aom/aom_integer.h"
/*
 * Rounding average of a source block into the destination:
 * dst = (dst + src + 1) >> 1 per byte (vrhadd).
 *
 * src, src_stride: source pixels and row pitch.
 * dst, dst_stride: destination pixels and row pitch (read and written).
 * filter_x, filter_x_stride, filter_y, filter_y_stride: unused.
 * w: selects the code path: w > 32 handles 64 bytes per row, w == 32
 *    handles 32, 8 < w < 32 handles 16, w == 8 handles 8, anything
 *    smaller handles 4.
 * h: row count. Only the 64-wide path steps one row at a time; all other
 *    paths handle two rows per iteration, so an odd h is rounded up there.
 */
void aom_convolve_avg_neon(const uint8_t *src,    // r0
                           ptrdiff_t src_stride,  // r1
                           uint8_t *dst,          // r2
                           ptrdiff_t dst_stride,  // r3
                           const int16_t *filter_x, int filter_x_stride,
                           const int16_t *filter_y, int filter_y_stride, int w,
                           int h) {
  uint8_t *d;
  uint8x8_t d0u8, d1u8, d2u8, d3u8;
  // vld1_lane_u32() takes the previous vector value as an input, so these
  // must be initialised before the first lane load.
  uint32x2_t d0u32 = vdup_n_u32(0);
  uint32x2_t d2u32 = vdup_n_u32(0);
  uint8x16_t q0u8, q1u8, q2u8, q3u8, q8u8, q9u8, q10u8, q11u8;
  (void)filter_x;
  (void)filter_x_stride;
  (void)filter_y;
  (void)filter_y_stride;

  // 'd' is the read cursor into the destination; 'dst' is the write cursor.
  d = dst;
  if (w > 32) {  // avg64
    for (; h > 0; h -= 1) {
      q0u8 = vld1q_u8(src);
      q1u8 = vld1q_u8(src + 16);
      q2u8 = vld1q_u8(src + 32);
      q3u8 = vld1q_u8(src + 48);
      src += src_stride;
      q8u8 = vld1q_u8(d);
      q9u8 = vld1q_u8(d + 16);
      q10u8 = vld1q_u8(d + 32);
      q11u8 = vld1q_u8(d + 48);
      d += dst_stride;

      q0u8 = vrhaddq_u8(q0u8, q8u8);
      q1u8 = vrhaddq_u8(q1u8, q9u8);
      q2u8 = vrhaddq_u8(q2u8, q10u8);
      q3u8 = vrhaddq_u8(q3u8, q11u8);

      vst1q_u8(dst, q0u8);
      vst1q_u8(dst + 16, q1u8);
      vst1q_u8(dst + 32, q2u8);
      vst1q_u8(dst + 48, q3u8);
      dst += dst_stride;
    }
  } else if (w == 32) {  // avg32
    for (; h > 0; h -= 2) {
      q0u8 = vld1q_u8(src);
      q1u8 = vld1q_u8(src + 16);
      src += src_stride;
      q2u8 = vld1q_u8(src);
      q3u8 = vld1q_u8(src + 16);
      src += src_stride;
      q8u8 = vld1q_u8(d);
      q9u8 = vld1q_u8(d + 16);
      d += dst_stride;
      q10u8 = vld1q_u8(d);
      q11u8 = vld1q_u8(d + 16);
      d += dst_stride;

      q0u8 = vrhaddq_u8(q0u8, q8u8);
      q1u8 = vrhaddq_u8(q1u8, q9u8);
      q2u8 = vrhaddq_u8(q2u8, q10u8);
      q3u8 = vrhaddq_u8(q3u8, q11u8);

      vst1q_u8(dst, q0u8);
      vst1q_u8(dst + 16, q1u8);
      dst += dst_stride;
      vst1q_u8(dst, q2u8);
      vst1q_u8(dst + 16, q3u8);
      dst += dst_stride;
    }
  } else if (w > 8) {  // avg16
    for (; h > 0; h -= 2) {
      q0u8 = vld1q_u8(src);
      src += src_stride;
      q1u8 = vld1q_u8(src);
      src += src_stride;
      q2u8 = vld1q_u8(d);
      d += dst_stride;
      q3u8 = vld1q_u8(d);
      d += dst_stride;

      q0u8 = vrhaddq_u8(q0u8, q2u8);
      q1u8 = vrhaddq_u8(q1u8, q3u8);

      vst1q_u8(dst, q0u8);
      dst += dst_stride;
      vst1q_u8(dst, q1u8);
      dst += dst_stride;
    }
  } else if (w == 8) {  // avg8
    for (; h > 0; h -= 2) {
      d0u8 = vld1_u8(src);
      src += src_stride;
      d1u8 = vld1_u8(src);
      src += src_stride;
      d2u8 = vld1_u8(d);
      d += dst_stride;
      d3u8 = vld1_u8(d);
      d += dst_stride;

      // Two 8-byte rows are packed into one Q register and averaged together.
      q0u8 = vcombine_u8(d0u8, d1u8);
      q1u8 = vcombine_u8(d2u8, d3u8);
      q0u8 = vrhaddq_u8(q0u8, q1u8);

      vst1_u8(dst, vget_low_u8(q0u8));
      dst += dst_stride;
      vst1_u8(dst, vget_high_u8(q0u8));
      dst += dst_stride;
    }
  } else {  // avg4
    for (; h > 0; h -= 2) {
      // Two 4-byte rows share one D register, one row per 32-bit lane.
      d0u32 = vld1_lane_u32((const uint32_t *)src, d0u32, 0);
      src += src_stride;
      d0u32 = vld1_lane_u32((const uint32_t *)src, d0u32, 1);
      src += src_stride;
      d2u32 = vld1_lane_u32((const uint32_t *)d, d2u32, 0);
      d += dst_stride;
      d2u32 = vld1_lane_u32((const uint32_t *)d, d2u32, 1);
      d += dst_stride;

      d0u8 = vrhadd_u8(vreinterpret_u8_u32(d0u32), vreinterpret_u8_u32(d2u32));

      d0u32 = vreinterpret_u32_u8(d0u8);
      vst1_lane_u32((uint32_t *)dst, d0u32, 0);
      dst += dst_stride;
      vst1_lane_u32((uint32_t *)dst, d0u32, 1);
      dst += dst_stride;
    }
  }
}

View File

@@ -1,93 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <arm_neon.h>
#include <string.h>
#include "./aom_dsp_rtcd.h"
#include "aom/aom_integer.h"
/*
 * Copies a block of pixels from src to dst.
 *
 * src, src_stride: source pixels and row pitch.
 * dst, dst_stride: destination pixels and row pitch.
 * filter_x, filter_x_stride, filter_y, filter_y_stride: unused.
 * w: selects the code path: w > 32 copies 64 bytes per row, w == 32
 *    copies 32, 8 < w < 32 copies 16, w == 8 copies 8, anything smaller
 *    copies 4.
 * h: row count. The 32/16/8-wide paths copy two rows per iteration, so an
 *    odd h is rounded up there; the 64- and 4-wide paths copy one row at a
 *    time.
 */
void aom_convolve_copy_neon(const uint8_t *src,    // r0
                            ptrdiff_t src_stride,  // r1
                            uint8_t *dst,          // r2
                            ptrdiff_t dst_stride,  // r3
                            const int16_t *filter_x, int filter_x_stride,
                            const int16_t *filter_y, int filter_y_stride, int w,
                            int h) {
  uint8x8_t d0u8, d2u8;
  uint8x16_t q0u8, q1u8, q2u8, q3u8;
  (void)filter_x;
  (void)filter_x_stride;
  (void)filter_y;
  (void)filter_y_stride;

  if (w > 32) {  // copy64
    for (; h > 0; h--) {
      q0u8 = vld1q_u8(src);
      q1u8 = vld1q_u8(src + 16);
      q2u8 = vld1q_u8(src + 32);
      q3u8 = vld1q_u8(src + 48);
      src += src_stride;

      vst1q_u8(dst, q0u8);
      vst1q_u8(dst + 16, q1u8);
      vst1q_u8(dst + 32, q2u8);
      vst1q_u8(dst + 48, q3u8);
      dst += dst_stride;
    }
  } else if (w == 32) {  // copy32
    for (; h > 0; h -= 2) {
      q0u8 = vld1q_u8(src);
      q1u8 = vld1q_u8(src + 16);
      src += src_stride;
      q2u8 = vld1q_u8(src);
      q3u8 = vld1q_u8(src + 16);
      src += src_stride;

      vst1q_u8(dst, q0u8);
      vst1q_u8(dst + 16, q1u8);
      dst += dst_stride;
      vst1q_u8(dst, q2u8);
      vst1q_u8(dst + 16, q3u8);
      dst += dst_stride;
    }
  } else if (w > 8) {  // copy16
    for (; h > 0; h -= 2) {
      q0u8 = vld1q_u8(src);
      src += src_stride;
      q1u8 = vld1q_u8(src);
      src += src_stride;

      vst1q_u8(dst, q0u8);
      dst += dst_stride;
      vst1q_u8(dst, q1u8);
      dst += dst_stride;
    }
  } else if (w == 8) {  // copy8
    for (; h > 0; h -= 2) {
      d0u8 = vld1_u8(src);
      src += src_stride;
      d2u8 = vld1_u8(src);
      src += src_stride;

      vst1_u8(dst, d0u8);
      dst += dst_stride;
      vst1_u8(dst, d2u8);
      dst += dst_stride;
    }
  } else {  // copy4
    for (; h > 0; h--) {
      // memcpy rather than a uint32_t load/store through a cast pointer:
      // the pixel buffers are byte arrays, so the cast would break the
      // aliasing rules and assume 4-byte alignment.
      memcpy(dst, src, 4);
      src += src_stride;
      dst += dst_stride;
    }
  }
}

View File

@@ -1,66 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <assert.h>
#include "./aom_dsp_rtcd.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_ports/mem.h"
/*
 * Two-pass 8-tap convolution for unit steps (x_step_q4 == y_step_q4 == 16).
 *
 * The horizontal pass writes h + 7 rows into a stack buffer with a pitch of
 * 64; the vertical pass then filters that buffer into dst.
 *
 * src, src_stride: source pixels and row pitch. Rows starting 3 above src
 *                  are read.
 * dst, dst_stride: destination pixels and row pitch.
 * filter_x, filter_y: taps for the horizontal and vertical pass.
 * w, h:            block size; each must be at most 64 so the intermediate
 *                  image fits in the buffer.
 */
void aom_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                        ptrdiff_t dst_stride, const int16_t *filter_x,
                        int x_step_q4, const int16_t *filter_y, int y_step_q4,
                        int w, int h) {
  /* Given our constraints: w <= 64, h <= 64, taps == 8 the intermediate image
   * needs at most 64 + 7 rows of 64 bytes (+ 1 row so that the row count is
   * divisible by 4), i.e. 64 * 72 bytes.
   */
  DECLARE_ALIGNED(8, uint8_t, temp[64 * 72]);

  // Account for the vertical phase needing 3 lines prior and 4 lines post
  int intermediate_height = h + 7;

  assert(y_step_q4 == 16);
  assert(x_step_q4 == 16);
  // The fixed-size intermediate buffer is only large enough for these sizes.
  assert(w <= 64);
  assert(h <= 64);

  /* Filter starting 3 lines back. The neon implementation will ignore the
   * given height and filter a multiple of 4 lines. Since this goes in to
   * the temp buffer which has lots of extra room and is subsequently discarded
   * this is safe if somewhat less than ideal.
   */
  aom_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, 64, filter_x,
                           x_step_q4, filter_y, y_step_q4, w,
                           intermediate_height);

  /* Step into the temp buffer 3 lines to get the actual frame data */
  aom_convolve8_vert_neon(temp + 64 * 3, 64, dst, dst_stride, filter_x,
                          x_step_q4, filter_y, y_step_q4, w, h);
}
/*
 * Two-pass 8-tap convolution for unit steps (x_step_q4 == y_step_q4 == 16)
 * whose second pass is aom_convolve8_avg_vert_neon().
 *
 * The horizontal pass writes h + 7 rows into a stack buffer with a pitch of
 * 64; the averaging vertical pass then consumes that buffer.
 *
 * src, src_stride: source pixels and row pitch. Rows starting 3 above src
 *                  are read.
 * dst, dst_stride: destination pixels and row pitch.
 * filter_x, filter_y: taps for the horizontal and vertical pass.
 * w, h:            block size; each must be at most 64 so the intermediate
 *                  image fits in the buffer.
 */
void aom_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride,
                            const int16_t *filter_x, int x_step_q4,
                            const int16_t *filter_y, int y_step_q4, int w,
                            int h) {
  // Up to 64 + 7 rows of 64 bytes, rounded up to 72 rows because the
  // horizontal pass works on multiples of 4 rows.
  DECLARE_ALIGNED(8, uint8_t, temp[64 * 72]);
  int intermediate_height = h + 7;

  assert(y_step_q4 == 16);
  assert(x_step_q4 == 16);
  // The fixed-size intermediate buffer is only large enough for these sizes.
  assert(w <= 64);
  assert(h <= 64);

  /* The horizontal pass filters a multiple of 4 lines into the temp buffer
   * regardless of the requested height. In addition, we only want to average
   * the values after both passes, so averaging happens in the vertical pass.
   */
  aom_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, 64, filter_x,
                           x_step_q4, filter_y, y_step_q4, w,
                           intermediate_height);
  /* Skip the 3 leading rows of the temp buffer to reach the block itself. */
  aom_convolve8_avg_vert_neon(temp + 64 * 3, 64, dst, dst_stride, filter_x,
                              x_step_q4, filter_y, y_step_q4, w, h);
}

View File

@@ -1,254 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <arm_neon.h>
#include <assert.h>
#include "./aom_dsp_rtcd.h"
#include "./aom_config.h"
#include "aom/aom_integer.h"
// Returns the sum of the eight 16-bit lanes of v_16x8.
static INLINE unsigned int horizontal_add_u16x8(const uint16x8_t v_16x8) {
  // Two widening pairwise adds: 8 x u16 -> 4 x u32 -> 2 x u64.
  const uint64x2_t pair_sums = vpaddlq_u32(vpaddlq_u16(v_16x8));
  const uint32x2_t lo_half = vreinterpret_u32_u64(vget_low_u64(pair_sums));
  const uint32x2_t hi_half = vreinterpret_u32_u64(vget_high_u64(pair_sums));
  // Add the two halves as 32-bit lanes and return lane 0.
  return vget_lane_u32(vadd_u32(lo_half, hi_half), 0);
}
// Rounded average of the 4x4 block at s with row pitch p:
// (sum of the 16 pixels + 8) >> 4.
unsigned int aom_avg_4x4_neon(const uint8_t *s, int p) {
  uint32x2_t rows01 = vdup_n_u32(0);
  uint32x2_t rows23 = vdup_n_u32(0);
  uint16x8_t widened;
  unsigned int total;

  // Each 4-byte row is loaded into one 32-bit lane.
  rows01 = vld1_lane_u32((const uint32_t *)s, rows01, 0);
  rows01 = vld1_lane_u32((const uint32_t *)(s + p), rows01, 1);
  rows23 = vld1_lane_u32((const uint32_t *)(s + 2 * p), rows23, 0);
  rows23 = vld1_lane_u32((const uint32_t *)(s + 3 * p), rows23, 1);

  // Widening add of rows 0/1 with rows 2/3, then reduce across lanes.
  widened = vaddl_u8(vreinterpret_u8_u32(rows01), vreinterpret_u8_u32(rows23));
  total = horizontal_add_u16x8(widened);
  return (total + 8) >> 4;
}
// Rounded average of the 8x8 block at s with row pitch p:
// (sum of the 64 pixels + 32) >> 6.
unsigned int aom_avg_8x8_neon(const uint8_t *s, int p) {
  int row;
  // Rows 0 and 1 seed the 16-bit per-column accumulator.
  uint16x8_t col_sums = vaddl_u8(vld1_u8(s), vld1_u8(s + p));

  // Rows 2..7 are added with a widening accumulate.
  for (row = 2; row < 8; ++row) {
    col_sums = vaddw_u8(col_sums, vld1_u8(s + row * p));
  }
  return (horizontal_add_u16x8(col_sums) + 32) >> 6;
}
// Sum of absolute values of 'length' coefficients.
// coeff: 16 bits, dynamic range [-32640, 32640].
// length: value range {16, 64, 256, 1024}; 16 coefficients are consumed per
// iteration and at least one iteration always runs.
int aom_satd_neon(const int16_t *coeff, int length) {
  const int16x4_t zero = vdup_n_s16(0);
  int32x4_t abs_acc = vdupq_n_s32(0);
  int64x2_t pair_sums;
  int32x2_t folded;

  for (;;) {
    const int16x8_t first8 = vld1q_s16(coeff);
    const int16x8_t next8 = vld1q_s16(coeff + 8);
    // |x - 0| accumulated into 32-bit lanes.
    abs_acc = vabal_s16(abs_acc, vget_low_s16(first8), zero);
    abs_acc = vabal_s16(abs_acc, vget_high_s16(first8), zero);
    abs_acc = vabal_s16(abs_acc, vget_low_s16(next8), zero);
    abs_acc = vabal_s16(abs_acc, vget_high_s16(next8), zero);
    coeff += 16;
    length -= 16;
    if (length == 0) break;
  }

  // satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024]
  // Cascading summation of the four accumulator lanes.
  pair_sums = vpaddlq_s32(abs_acc);
  folded = vadd_s32(vreinterpret_s32_s64(vget_low_s64(pair_sums)),
                    vreinterpret_s32_s64(vget_high_s64(pair_sums)));
  return vget_lane_s32(folded, 0);
}
// Column sums of a 16-pixel-wide strip of 'height' rows, scaled down and
// written to hbuf[0..15].
// Rows are consumed in groups of 8. Each 16-bit column sum is shifted right
// by (height >> 5) + 3 before being stored.
void aom_int_pro_row_neon(int16_t hbuf[16], uint8_t const *ref,
                          const int ref_stride, const int height) {
  int row, k;
  uint16x8_t sum_lo = vdupq_n_u16(0);  // columns 0..7
  uint16x8_t sum_hi = vdupq_n_u16(0);  // columns 8..15
  // vshlq with a negative count shifts right.
  const int16x8_t neg_shift = vdupq_n_s16(-((height >> 5) + 3));

  for (row = 0; row < height; row += 8) {
    for (k = 0; k < 8; ++k) {
      const uint8x16_t pixels = vld1q_u8(ref);
      sum_lo = vaddw_u8(sum_lo, vget_low_u8(pixels));
      sum_hi = vaddw_u8(sum_hi, vget_high_u8(pixels));
      ref += ref_stride;
    }
  }

  vst1q_s16(hbuf, vreinterpretq_s16_u16(vshlq_u16(sum_lo, neg_shift)));
  vst1q_s16(hbuf + 8, vreinterpretq_s16_u16(vshlq_u16(sum_hi, neg_shift)));
}
// Sum of 'width' consecutive pixels starting at ref, read 16 at a time
// (so width is effectively rounded up to a multiple of 16).
int16_t aom_int_pro_col_neon(uint8_t const *ref, const int width) {
  int remaining;
  uint16x8_t acc = vdupq_n_u16(0);

  for (remaining = width; remaining > 0; remaining -= 16) {
    const uint8x16_t pixels = vld1q_u8(ref);
    acc = vaddw_u8(acc, vget_low_u8(pixels));
    acc = vaddw_u8(acc, vget_high_u8(pixels));
    ref += 16;
  }
  return horizontal_add_u16x8(acc);
}
// Returns sum(diff^2) - ((sum(diff))^2 >> (bwl + 2)) over 4 << bwl elements,
// where diff = ref[i] - src[i].
// ref, src = [0, 510] - max diff = 16-bits
// bwl = {2, 3, 4}, width = {16, 32, 64}
int aom_vector_var_neon(int16_t const *ref, int16_t const *src, const int bwl) {
  int width = 4 << bwl;
  int32x4_t sq_acc = vdupq_n_s32(0);    // running sum of squared differences
  int16x8_t diff_acc = vdupq_n_s16(0);  // running sum of differences
  int32x4_t diff_pairs;
  int32x2_t diff_half, diff_all, sq_half;
  int64x2_t sq_pairs;
  int diff_sum, sq_sum;

  assert(width >= 8);
  assert((width % 8) == 0);

  do {
    // [-510, 510], 10 bits.
    const int16x8_t delta = vsubq_s16(vld1q_s16(ref), vld1q_s16(src));
    const int16x4_t delta_lo = vget_low_s16(delta);
    const int16x4_t delta_hi = vget_high_s16(delta);

    sq_acc = vmlal_s16(sq_acc, delta_lo, delta_lo);  // dynamic range 26 bits.
    sq_acc = vmlal_s16(sq_acc, delta_hi, delta_hi);
    diff_acc = vaddq_s16(diff_acc, delta);  // dynamic range 16 bits.

    width -= 8;
    ref += 8;
    src += 8;
  } while (width != 0);

  // Note: the reduction of the difference sum could reuse
  // horizontal_add_u16x8()'s pattern, but one less vpaddl here, paired with
  // the reduction of the squared sum, performed better on a Cortex-A15.
  diff_pairs = vpaddlq_s16(diff_acc);
  diff_half = vadd_s32(vget_low_s32(diff_pairs), vget_high_s32(diff_pairs));
  diff_all = vpadd_s32(diff_half, diff_half);
  diff_sum = vget_lane_s32(diff_all, 0);

  sq_pairs = vpaddlq_s32(sq_acc);
  sq_half = vadd_s32(vreinterpret_s32_s64(vget_low_s64(sq_pairs)),
                     vreinterpret_s32_s64(vget_high_s64(sq_pairs)));
  sq_sum = vget_lane_s32(sq_half, 0);

  return sq_sum - ((diff_sum * diff_sum) >> (bwl + 2));
}
// Finds the smallest and largest absolute difference between two 8x8 blocks.
// a, a_stride / b, b_stride: the two blocks and their row pitches.
// *min and *max are first cleared, then their first byte is overwritten with
// the 8-bit result.
void aom_minmax_8x8_neon(const uint8_t *a, int a_stride, const uint8_t *b,
                         int b_stride, int *min, int *max) {
  uint8x16_t diff_max, diff_min;
  uint8x8_t hi, lo;
  int pair, fold;

  // Rows 0 and 1, concatenated into one Q register per block, seed both
  // running vectors.
  diff_max = vabdq_u8(vcombine_u8(vld1_u8(a), vld1_u8(a + a_stride)),
                      vcombine_u8(vld1_u8(b), vld1_u8(b + b_stride)));
  diff_min = diff_max;

  // Remaining row pairs (2,3), (4,5), (6,7).
  for (pair = 1; pair < 4; ++pair) {
    const uint8_t *const a_row = a + 2 * pair * a_stride;
    const uint8_t *const b_row = b + 2 * pair * b_stride;
    const uint8x16_t diff =
        vabdq_u8(vcombine_u8(vld1_u8(a_row), vld1_u8(a_row + a_stride)),
                 vcombine_u8(vld1_u8(b_row), vld1_u8(b_row + b_stride)));
    diff_max = vmaxq_u8(diff_max, diff);
    diff_min = vminq_u8(diff_min, diff);
  }

  // Fold 16 lanes to 8, then three pairwise steps spread the overall
  // max/min to every lane.
  hi = vmax_u8(vget_high_u8(diff_max), vget_low_u8(diff_max));
  lo = vmin_u8(vget_high_u8(diff_min), vget_low_u8(diff_min));
  for (fold = 0; fold < 3; ++fold) {
    hi = vpmax_u8(hi, hi);
    lo = vpmin_u8(lo, lo);
  }

  *min = *max = 0;  // Clear high bits
  // Store directly to avoid costly neon->gpr transfer.
  vst1_lane_u8((uint8_t *)max, hi, 0);
  vst1_lane_u8((uint8_t *)min, lo, 0);
}

View File

@@ -1,221 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <arm_neon.h>
#include "./aom_config.h"
#include "aom_dsp/txfm_common.h"
void aom_fdct8x8_neon(const int16_t *input, int16_t *final_output, int stride) {
int i;
// stage 1
int16x8_t input_0 = vshlq_n_s16(vld1q_s16(&input[0 * stride]), 2);
int16x8_t input_1 = vshlq_n_s16(vld1q_s16(&input[1 * stride]), 2);
int16x8_t input_2 = vshlq_n_s16(vld1q_s16(&input[2 * stride]), 2);
int16x8_t input_3 = vshlq_n_s16(vld1q_s16(&input[3 * stride]), 2);
int16x8_t input_4 = vshlq_n_s16(vld1q_s16(&input[4 * stride]), 2);
int16x8_t input_5 = vshlq_n_s16(vld1q_s16(&input[5 * stride]), 2);
int16x8_t input_6 = vshlq_n_s16(vld1q_s16(&input[6 * stride]), 2);
int16x8_t input_7 = vshlq_n_s16(vld1q_s16(&input[7 * stride]), 2);
for (i = 0; i < 2; ++i) {
int16x8_t out_0, out_1, out_2, out_3, out_4, out_5, out_6, out_7;
const int16x8_t v_s0 = vaddq_s16(input_0, input_7);
const int16x8_t v_s1 = vaddq_s16(input_1, input_6);
const int16x8_t v_s2 = vaddq_s16(input_2, input_5);
const int16x8_t v_s3 = vaddq_s16(input_3, input_4);
const int16x8_t v_s4 = vsubq_s16(input_3, input_4);
const int16x8_t v_s5 = vsubq_s16(input_2, input_5);
const int16x8_t v_s6 = vsubq_s16(input_1, input_6);
const int16x8_t v_s7 = vsubq_s16(input_0, input_7);
// fdct4(step, step);
int16x8_t v_x0 = vaddq_s16(v_s0, v_s3);
int16x8_t v_x1 = vaddq_s16(v_s1, v_s2);
int16x8_t v_x2 = vsubq_s16(v_s1, v_s2);
int16x8_t v_x3 = vsubq_s16(v_s0, v_s3);
// fdct4(step, step);
int32x4_t v_t0_lo = vaddl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1));
int32x4_t v_t0_hi = vaddl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1));
int32x4_t v_t1_lo = vsubl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1));
int32x4_t v_t1_hi = vsubl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1));
int32x4_t v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), (int16_t)cospi_24_64);
int32x4_t v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), (int16_t)cospi_24_64);
int32x4_t v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_24_64);
int32x4_t v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_24_64);
v_t2_lo = vmlal_n_s16(v_t2_lo, vget_low_s16(v_x3), (int16_t)cospi_8_64);
v_t2_hi = vmlal_n_s16(v_t2_hi, vget_high_s16(v_x3), (int16_t)cospi_8_64);
v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x2), (int16_t)cospi_8_64);
v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x2), (int16_t)cospi_8_64);
v_t0_lo = vmulq_n_s32(v_t0_lo, (int32_t)cospi_16_64);
v_t0_hi = vmulq_n_s32(v_t0_hi, (int32_t)cospi_16_64);
v_t1_lo = vmulq_n_s32(v_t1_lo, (int32_t)cospi_16_64);
v_t1_hi = vmulq_n_s32(v_t1_hi, (int32_t)cospi_16_64);
{
const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS);
const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS);
const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS);
const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS);
out_0 = vcombine_s16(a, c); // 00 01 02 03 40 41 42 43
out_2 = vcombine_s16(e, g); // 20 21 22 23 60 61 62 63
out_4 = vcombine_s16(b, d); // 04 05 06 07 44 45 46 47
out_6 = vcombine_s16(f, h); // 24 25 26 27 64 65 66 67
}
// Stage 2
v_x0 = vsubq_s16(v_s6, v_s5);
v_x1 = vaddq_s16(v_s6, v_s5);
v_t0_lo = vmull_n_s16(vget_low_s16(v_x0), (int16_t)cospi_16_64);
v_t0_hi = vmull_n_s16(vget_high_s16(v_x0), (int16_t)cospi_16_64);
v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), (int16_t)cospi_16_64);
v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), (int16_t)cospi_16_64);
{
const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
const int16x8_t ab = vcombine_s16(a, b);
const int16x8_t cd = vcombine_s16(c, d);
// Stage 3
v_x0 = vaddq_s16(v_s4, ab);
v_x1 = vsubq_s16(v_s4, ab);
v_x2 = vsubq_s16(v_s7, cd);
v_x3 = vaddq_s16(v_s7, cd);
}
// Stage 4
v_t0_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_4_64);
v_t0_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_4_64);
v_t0_lo = vmlal_n_s16(v_t0_lo, vget_low_s16(v_x0), (int16_t)cospi_28_64);
v_t0_hi = vmlal_n_s16(v_t0_hi, vget_high_s16(v_x0), (int16_t)cospi_28_64);
v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), (int16_t)cospi_12_64);
v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), (int16_t)cospi_12_64);
v_t1_lo = vmlal_n_s16(v_t1_lo, vget_low_s16(v_x2), (int16_t)cospi_20_64);
v_t1_hi = vmlal_n_s16(v_t1_hi, vget_high_s16(v_x2), (int16_t)cospi_20_64);
v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), (int16_t)cospi_12_64);
v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), (int16_t)cospi_12_64);
v_t2_lo = vmlsl_n_s16(v_t2_lo, vget_low_s16(v_x1), (int16_t)cospi_20_64);
v_t2_hi = vmlsl_n_s16(v_t2_hi, vget_high_s16(v_x1), (int16_t)cospi_20_64);
v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_28_64);
v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_28_64);
v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x0), (int16_t)cospi_4_64);
v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x0), (int16_t)cospi_4_64);
{
const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS);
const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS);
const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS);
const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS);
out_1 = vcombine_s16(a, c); // 10 11 12 13 50 51 52 53
out_3 = vcombine_s16(e, g); // 30 31 32 33 70 71 72 73
out_5 = vcombine_s16(b, d); // 14 15 16 17 54 55 56 57
out_7 = vcombine_s16(f, h); // 34 35 36 37 74 75 76 77
}
// transpose 8x8
{
// 00 01 02 03 40 41 42 43
// 10 11 12 13 50 51 52 53
// 20 21 22 23 60 61 62 63
// 30 31 32 33 70 71 72 73
// 04 05 06 07 44 45 46 47
// 14 15 16 17 54 55 56 57
// 24 25 26 27 64 65 66 67
// 34 35 36 37 74 75 76 77
const int32x4x2_t r02_s32 =
vtrnq_s32(vreinterpretq_s32_s16(out_0), vreinterpretq_s32_s16(out_2));
const int32x4x2_t r13_s32 =
vtrnq_s32(vreinterpretq_s32_s16(out_1), vreinterpretq_s32_s16(out_3));
const int32x4x2_t r46_s32 =
vtrnq_s32(vreinterpretq_s32_s16(out_4), vreinterpretq_s32_s16(out_6));
const int32x4x2_t r57_s32 =
vtrnq_s32(vreinterpretq_s32_s16(out_5), vreinterpretq_s32_s16(out_7));
const int16x8x2_t r01_s16 =
vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[0]),
vreinterpretq_s16_s32(r13_s32.val[0]));
const int16x8x2_t r23_s16 =
vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[1]),
vreinterpretq_s16_s32(r13_s32.val[1]));
const int16x8x2_t r45_s16 =
vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[0]),
vreinterpretq_s16_s32(r57_s32.val[0]));
const int16x8x2_t r67_s16 =
vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[1]),
vreinterpretq_s16_s32(r57_s32.val[1]));
input_0 = r01_s16.val[0];
input_1 = r01_s16.val[1];
input_2 = r23_s16.val[0];
input_3 = r23_s16.val[1];
input_4 = r45_s16.val[0];
input_5 = r45_s16.val[1];
input_6 = r67_s16.val[0];
input_7 = r67_s16.val[1];
// 00 10 20 30 40 50 60 70
// 01 11 21 31 41 51 61 71
// 02 12 22 32 42 52 62 72
// 03 13 23 33 43 53 63 73
// 04 14 24 34 44 54 64 74
// 05 15 25 35 45 55 65 75
// 06 16 26 36 46 56 66 76
// 07 17 27 37 47 57 67 77
}
} // for
{
// from aom_dct_sse2.c
// Post-condition (division by two)
// division of two 16 bits signed numbers using shifts
// n / 2 = (n - (n >> 15)) >> 1
const int16x8_t sign_in0 = vshrq_n_s16(input_0, 15);
const int16x8_t sign_in1 = vshrq_n_s16(input_1, 15);
const int16x8_t sign_in2 = vshrq_n_s16(input_2, 15);
const int16x8_t sign_in3 = vshrq_n_s16(input_3, 15);
const int16x8_t sign_in4 = vshrq_n_s16(input_4, 15);
const int16x8_t sign_in5 = vshrq_n_s16(input_5, 15);
const int16x8_t sign_in6 = vshrq_n_s16(input_6, 15);
const int16x8_t sign_in7 = vshrq_n_s16(input_7, 15);
input_0 = vhsubq_s16(input_0, sign_in0);
input_1 = vhsubq_s16(input_1, sign_in1);
input_2 = vhsubq_s16(input_2, sign_in2);
input_3 = vhsubq_s16(input_3, sign_in3);
input_4 = vhsubq_s16(input_4, sign_in4);
input_5 = vhsubq_s16(input_5, sign_in5);
input_6 = vhsubq_s16(input_6, sign_in6);
input_7 = vhsubq_s16(input_7, sign_in7);
// store results
vst1q_s16(&final_output[0 * 8], input_0);
vst1q_s16(&final_output[1 * 8], input_1);
vst1q_s16(&final_output[2 * 8], input_2);
vst1q_s16(&final_output[3 * 8], input_3);
vst1q_s16(&final_output[4 * 8], input_4);
vst1q_s16(&final_output[5 * 8], input_5);
vst1q_s16(&final_output[6 * 8], input_6);
vst1q_s16(&final_output[7 * 8], input_7);
}
}
// Sums all 64 samples of an 8x8 block (rows are `stride` elements apart) and
// writes the total to output[0]; output[1] is set to zero. No other output
// element is written.
void aom_fdct8x8_1_neon(const int16_t *input, int16_t *output, int stride) {
  const int16_t *row = input;
  // One running sum per column, kept in 16-bit lanes (non-saturating adds).
  int16x8_t column_sums = vld1q_s16(row);
  int remaining;
  for (remaining = 7; remaining > 0; --remaining) {
    row += stride;
    column_sums = vaddq_s16(column_sums, vld1q_s16(row));
  }
  {
    // Widen while folding the eight column sums: 8 x s16 -> 4 x s32 -> 2 x s64.
    const int32x4_t sums_x4 = vpaddlq_s16(column_sums);
    const int64x2_t sums_x2 = vpaddlq_s32(sums_x4);
    const int32x2_t lo = vreinterpret_s32_s64(vget_low_s64(sums_x2));
    const int32x2_t hi = vreinterpret_s32_s64(vget_high_s64(sums_x2));
    const int32x2_t total = vadd_s32(lo, hi);
    // Only the low 16 bits of the total are stored.
    output[0] = vget_lane_s16(vreinterpret_s16_s32(total), 0);
    output[1] = 0;
  }
}

View File

@@ -1,199 +0,0 @@
/*
* Copyright (c) 2016 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <arm_neon.h>
#include "./aom_dsp_rtcd.h"
// One pass of an 8-point butterfly network applied across the eight vectors
// a0..a7, lane by lane and in place. All inputs are read before any output is
// written. Arithmetic is plain (non-saturating) 16-bit add/subtract.
static void hadamard8x8_one_pass(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2,
                                 int16x8_t *a3, int16x8_t *a4, int16x8_t *a5,
                                 int16x8_t *a6, int16x8_t *a7) {
  // Stage 1: sum and difference of each adjacent pair.
  const int16x8_t sum01 = vaddq_s16(*a0, *a1);
  const int16x8_t dif01 = vsubq_s16(*a0, *a1);
  const int16x8_t sum23 = vaddq_s16(*a2, *a3);
  const int16x8_t dif23 = vsubq_s16(*a2, *a3);
  const int16x8_t sum45 = vaddq_s16(*a4, *a5);
  const int16x8_t dif45 = vsubq_s16(*a4, *a5);
  const int16x8_t sum67 = vaddq_s16(*a6, *a7);
  const int16x8_t dif67 = vsubq_s16(*a6, *a7);

  // Stage 2: combine the pair results within each group of four.
  const int16x8_t lo_ss_add = vaddq_s16(sum01, sum23);
  const int16x8_t lo_dd_add = vaddq_s16(dif01, dif23);
  const int16x8_t lo_ss_sub = vsubq_s16(sum01, sum23);
  const int16x8_t lo_dd_sub = vsubq_s16(dif01, dif23);
  const int16x8_t hi_ss_add = vaddq_s16(sum45, sum67);
  const int16x8_t hi_dd_add = vaddq_s16(dif45, dif67);
  const int16x8_t hi_ss_sub = vsubq_s16(sum45, sum67);
  const int16x8_t hi_dd_sub = vsubq_s16(dif45, dif67);

  // Stage 3: combine the two groups of four; the output order below is the
  // order callers rely on.
  *a0 = vaddq_s16(lo_ss_add, hi_ss_add);
  *a1 = vsubq_s16(lo_ss_sub, hi_ss_sub);
  *a2 = vsubq_s16(lo_ss_add, hi_ss_add);
  *a3 = vaddq_s16(lo_ss_sub, hi_ss_sub);
  *a4 = vaddq_s16(lo_dd_sub, hi_dd_sub);
  *a5 = vsubq_s16(lo_dd_sub, hi_dd_sub);
  *a6 = vsubq_s16(lo_dd_add, hi_dd_add);
  *a7 = vaddq_s16(lo_dd_add, hi_dd_add);
}
// TODO(johannkoenig): Make a transpose library and dedup with idct. Consider
// reversing transpose order which may make it easier for the compiler to
// reconcile the vtrn.64 moves.
// Transposes, in place, the 8x8 matrix of 16-bit elements whose rows are
// a0..a7. Element numbering in the comments is row-major: row r holds
// 8r .. 8r+7.
static void transpose8x8(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2,
                         int16x8_t *a3, int16x8_t *a4, int16x8_t *a5,
                         int16x8_t *a6, int16x8_t *a7) {
  // Step 1 (64-bit granularity): pair row k with row k+4, keeping either the
  // low or the high half of both.
  // left04:  00 01 02 03 32 33 34 35
  // left15:  08 09 10 11 40 41 42 43
  // left26:  16 17 18 19 48 49 50 51
  // left37:  24 25 26 27 56 57 58 59
  // right04: 04 05 06 07 36 37 38 39
  // right15: 12 13 14 15 44 45 46 47
  // right26: 20 21 22 23 52 53 54 55
  // right37: 28 29 30 31 60 61 62 63
  const int16x8_t left04 = vcombine_s16(vget_low_s16(*a0), vget_low_s16(*a4));
  const int16x8_t right04 =
      vcombine_s16(vget_high_s16(*a0), vget_high_s16(*a4));
  const int16x8_t left15 = vcombine_s16(vget_low_s16(*a1), vget_low_s16(*a5));
  const int16x8_t right15 =
      vcombine_s16(vget_high_s16(*a1), vget_high_s16(*a5));
  const int16x8_t left26 = vcombine_s16(vget_low_s16(*a2), vget_low_s16(*a6));
  const int16x8_t right26 =
      vcombine_s16(vget_high_s16(*a2), vget_high_s16(*a6));
  const int16x8_t left37 = vcombine_s16(vget_low_s16(*a3), vget_low_s16(*a7));
  const int16x8_t right37 =
      vcombine_s16(vget_high_s16(*a3), vget_high_s16(*a7));

  // Step 2 (32-bit granularity): interleave even rows with even rows and odd
  // rows with odd rows.
  // left_even:   00 01 16 17 32 33 48 49 / 02 03 18 19 34 35 50 51
  // left_odd:    08 09 24 25 40 41 56 57 / 10 11 26 27 42 43 58 59
  // right_even:  04 05 20 21 36 37 52 53 / 06 07 22 23 38 39 54 55
  // right_odd:   12 13 28 29 44 45 60 61 / 14 15 30 31 46 47 62 63
  const int32x4x2_t left_even =
      vtrnq_s32(vreinterpretq_s32_s16(left04), vreinterpretq_s32_s16(left26));
  const int32x4x2_t left_odd =
      vtrnq_s32(vreinterpretq_s32_s16(left15), vreinterpretq_s32_s16(left37));
  const int32x4x2_t right_even = vtrnq_s32(vreinterpretq_s32_s16(right04),
                                           vreinterpretq_s32_s16(right26));
  const int32x4x2_t right_odd = vtrnq_s32(vreinterpretq_s32_s16(right15),
                                          vreinterpretq_s32_s16(right37));

  // Step 3 (16-bit granularity): interleave the even and odd groups, which
  // yields two finished columns per vtrn.
  // cols01: 00 08 16 24 32 40 48 56 / 01 09 17 25 33 41 49 57
  // cols23: 02 10 18 26 34 42 50 58 / 03 11 19 27 35 43 51 59
  // cols45: 04 12 20 28 36 44 52 60 / 05 13 21 29 37 45 53 61
  // cols67: 06 14 22 30 38 46 54 62 / 07 15 23 31 39 47 55 63
  const int16x8x2_t cols01 =
      vtrnq_s16(vreinterpretq_s16_s32(left_even.val[0]),
                vreinterpretq_s16_s32(left_odd.val[0]));
  const int16x8x2_t cols23 =
      vtrnq_s16(vreinterpretq_s16_s32(left_even.val[1]),
                vreinterpretq_s16_s32(left_odd.val[1]));
  const int16x8x2_t cols45 =
      vtrnq_s16(vreinterpretq_s16_s32(right_even.val[0]),
                vreinterpretq_s16_s32(right_odd.val[0]));
  const int16x8x2_t cols67 =
      vtrnq_s16(vreinterpretq_s16_s32(right_even.val[1]),
                vreinterpretq_s16_s32(right_odd.val[1]));

  *a0 = cols01.val[0];
  *a1 = cols01.val[1];
  *a2 = cols23.val[0];
  *a3 = cols23.val[1];
  *a4 = cols45.val[0];
  *a5 = cols45.val[1];
  *a6 = cols67.val[0];
  *a7 = cols67.val[1];
}
// 8x8 Hadamard transform of the block at src_diff (rows `src_stride` elements
// apart). The 64 results are written contiguously to coeff.
void aom_hadamard_8x8_neon(const int16_t *src_diff, int src_stride,
                           int16_t *coeff) {
  int16x8_t row[8];
  int r;

  for (r = 0; r < 8; ++r) {
    row[r] = vld1q_s16(src_diff + r * src_stride);
  }

  // One butterfly pass, a transpose, then the pass again so both dimensions
  // are covered.
  hadamard8x8_one_pass(&row[0], &row[1], &row[2], &row[3], &row[4], &row[5],
                       &row[6], &row[7]);
  transpose8x8(&row[0], &row[1], &row[2], &row[3], &row[4], &row[5], &row[6],
               &row[7]);
  hadamard8x8_one_pass(&row[0], &row[1], &row[2], &row[3], &row[4], &row[5],
                       &row[6], &row[7]);

  // No transpose back: the result is stored in the order the second pass
  // leaves it.
  for (r = 0; r < 8; ++r) {
    vst1q_s16(coeff + 8 * r, row[r]);
  }
}
// 16x16 Hadamard transform built from four 8x8 transforms. The four 8x8
// results occupy consecutive 64-coefficient quarters of coeff (top-left,
// top-right, bottom-left, bottom-right) and are then combined in place.
void aom_hadamard_16x16_neon(const int16_t *src_diff, int src_stride,
                             int16_t *coeff) {
  int16_t *const quarter0 = coeff;
  int16_t *const quarter1 = coeff + 64;
  int16_t *const quarter2 = coeff + 128;
  int16_t *const quarter3 = coeff + 192;
  const int16_t *const lower_half = src_diff + 8 * src_stride;
  int offset;

  aom_hadamard_8x8_neon(src_diff, src_stride, quarter0);
  aom_hadamard_8x8_neon(src_diff + 8, src_stride, quarter1);
  aom_hadamard_8x8_neon(lower_half, src_stride, quarter2);
  aom_hadamard_8x8_neon(lower_half + 8, src_stride, quarter3);

  // Combine matching positions of the four quarters, eight lanes at a time.
  for (offset = 0; offset < 64; offset += 8) {
    const int16x8_t q0 = vld1q_s16(quarter0 + offset);
    const int16x8_t q1 = vld1q_s16(quarter1 + offset);
    const int16x8_t q2 = vld1q_s16(quarter2 + offset);
    const int16x8_t q3 = vld1q_s16(quarter3 + offset);

    // The first stage uses halving add/sub, i.e. (a +/- b) >> 1.
    const int16x8_t half_sum01 = vhaddq_s16(q0, q1);
    const int16x8_t half_dif01 = vhsubq_s16(q0, q1);
    const int16x8_t half_sum23 = vhaddq_s16(q2, q3);
    const int16x8_t half_dif23 = vhsubq_s16(q2, q3);

    vst1q_s16(quarter0 + offset, vaddq_s16(half_sum01, half_sum23));
    vst1q_s16(quarter1 + offset, vaddq_s16(half_dif01, half_dif23));
    vst1q_s16(quarter2 + offset, vsubq_s16(half_sum01, half_sum23));
    vst1q_s16(quarter3 + offset, vsubq_s16(half_dif01, half_dif23));
  }
}

View File

@@ -1,59 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <arm_neon.h>
#include "aom_dsp/inv_txfm.h"
#include "aom_ports/mem.h"
// DC-only 16x16 inverse transform: derives a single value from input[0] and
// adds it to every pixel of the 16x16 block at dest (rows `dest_stride` bytes
// apart), saturating each result to [0, 255].
//
// Pixels are loaded and stored as bytes (vld1q_u8 / vst1q_u8). Accessing the
// uint8_t buffer through a casted uint64_t pointer would break the
// effective-type rule and would let the compiler assume 8-byte alignment of
// dest, which this interface does not promise.
void aom_idct16x16_1_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
  uint16x8_t dc_u16;
  int16_t a1;
  int row;
  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
  out = dct_const_round_shift(out * cospi_16_64);
  a1 = ROUND_POWER_OF_TWO(out, 6);

  // The (possibly negative) DC value is kept as a two's-complement bit pattern
  // in unsigned lanes so that vaddw_u8 can widen and add the pixels in one
  // step; the sum is reinterpreted as signed again before saturation.
  dc_u16 = vreinterpretq_u16_s16(vdupq_n_s16(a1));

  for (row = 0; row < 16; ++row) {
    const uint8x16_t pixels = vld1q_u8(dest);
    const uint16x8_t left = vaddw_u8(dc_u16, vget_low_u8(pixels));
    const uint16x8_t right = vaddw_u8(dc_u16, vget_high_u8(pixels));
    // vqmovun_s16 narrows with unsigned saturation: <0 -> 0, >255 -> 255.
    const uint8x8_t left_u8 = vqmovun_s16(vreinterpretq_s16_u16(left));
    const uint8x8_t right_u8 = vqmovun_s16(vreinterpretq_s16_u16(right));
    vst1q_u8(dest, vcombine_u8(left_u8, right_u8));
    dest += dest_stride;
  }
}

File diff suppressed because it is too large Load Diff

View File

@@ -1,152 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include "aom_dsp/aom_dsp_common.h"
// Pass helpers implemented outside this file. As used by the callers below:
//  - pass1 handles the even-indexed inputs (0, 2, ..., 14) and writes its
//    stage 6 result to `output`;
//  - pass2 handles the odd-indexed inputs (1, 3, ..., 15), combines them with
//    `pass1Output` and produces the stage 7 result.
// Callers pass skip_adding = 0 when the result is kept in `output` (row pass)
// and skip_adding = 1 when it is added to `dest` (column pass).
// NOTE(review): the parameter name reads as the opposite of that usage --
// confirm against the pass2 implementation.
void aom_idct16x16_256_add_neon_pass1(const int16_t *input, int16_t *output,
int output_stride);
void aom_idct16x16_256_add_neon_pass2(const int16_t *src, int16_t *output,
int16_t *pass1Output, int16_t skip_adding,
uint8_t *dest, int dest_stride);
// Same two-pass split, used by aom_idct16x16_10_add_neon for its row pass.
void aom_idct16x16_10_add_neon_pass1(const int16_t *input, int16_t *output,
int output_stride);
void aom_idct16x16_10_add_neon_pass2(const int16_t *src, int16_t *output,
int16_t *pass1Output, int16_t skip_adding,
uint8_t *dest, int dest_stride);
#if HAVE_NEON_ASM
/* For ARM NEON, d8-d15 are callee-saved registers, and need to be saved. */
// Callers below hand both functions the same 8-element int64_t buffer: push
// before the first pass, pop after the last.
extern void aom_push_neon(int64_t *store);
extern void aom_pop_neon(int64_t *store);
#endif // HAVE_NEON_ASM
// Full 16x16 inverse transform added to dest. It runs in two phases, each
// processed as two halves of eight:
//   1. row phase    - results are kept in an intermediate buffer;
//   2. column phase - results are added to the destination pixels.
// Every half is itself two calls: pass1 (even-indexed inputs, stage 6 result
// into the scratch buffer) and pass2 (odd-indexed inputs combined with the
// scratch buffer to give the stage 7 result).
void aom_idct16x16_256_add_neon(const int16_t *input, uint8_t *dest,
                                int dest_stride) {
#if HAVE_NEON_ASM
  int64_t store_reg[8];
#endif
  int16_t even_scratch[16 * 16] = { 0 };
  int16_t row_result[16 * 16] = { 0 };
  int half;

#if HAVE_NEON_ASM
  // d8-d15 are callee-saved; stash them for the duration of the transform.
  aom_push_neon(store_reg);
#endif

  // Row phase: upper 8 rows (half 0), then lower 8 rows (half 1). pass2 is
  // called with skip_adding = 0, so the output stays in row_result.
  for (half = 0; half < 2; ++half) {
    const int16_t *const rows = input + half * 8 * 16;
    aom_idct16x16_256_add_neon_pass1(rows, even_scratch, 8);
    aom_idct16x16_256_add_neon_pass2(rows + 1, row_result + half * 8,
                                     even_scratch, 0, dest, dest_stride);
  }

  // Column phase: left 8 columns (half 0), then right 8 columns (half 1).
  // pass2 is called with skip_adding = 1 and the matching dest offset, so the
  // result is added to the destination data.
  for (half = 0; half < 2; ++half) {
    const int16_t *const cols = row_result + half * 8 * 16;
    aom_idct16x16_256_add_neon_pass1(cols, even_scratch, 8);
    aom_idct16x16_256_add_neon_pass2(cols + 1, row_result + half * 8,
                                     even_scratch, 1, dest + half * 8,
                                     dest_stride);
  }

#if HAVE_NEON_ASM
  // Put d8-d15 back.
  aom_pop_neon(store_reg);
#endif
}
// 16x16 inverse transform added to dest, reduced variant: the row phase is run
// for the upper 8 rows only (the lower 8 rows are treated as all zero and
// skipped), using the dedicated *_10_* passes. The column phase is the same as
// in the full transform and uses the *_256_* passes.
void aom_idct16x16_10_add_neon(const int16_t *input, uint8_t *dest,
                               int dest_stride) {
#if HAVE_NEON_ASM
  int64_t store_reg[8];
#endif
  int16_t even_scratch[16 * 16] = { 0 };
  // Zero-initialised: the skipped lower-row half is never written by the row
  // phase, yet the column phase reads the whole buffer.
  int16_t row_result[16 * 16] = { 0 };
  int half;

#if HAVE_NEON_ASM
  // d8-d15 are callee-saved; stash them for the duration of the transform.
  aom_push_neon(store_reg);
#endif

  // Row phase, upper 8 rows: pass1 takes the even-indexed inputs (stage 6
  // result into even_scratch), pass2 takes the odd-indexed inputs and leaves
  // the stage 7 result in row_result (skip_adding = 0).
  aom_idct16x16_10_add_neon_pass1(input, even_scratch, 8);
  aom_idct16x16_10_add_neon_pass2(input + 1, row_result, even_scratch, 0, dest,
                                  dest_stride);

  // Column phase: left 8 columns (half 0), then right 8 columns (half 1);
  // pass2 adds its result to the destination data (skip_adding = 1).
  for (half = 0; half < 2; ++half) {
    const int16_t *const cols = row_result + half * 8 * 16;
    aom_idct16x16_256_add_neon_pass1(cols, even_scratch, 8);
    aom_idct16x16_256_add_neon_pass2(cols + 1, row_result + half * 8,
                                     even_scratch, 1, dest + half * 8,
                                     dest_stride);
  }

#if HAVE_NEON_ASM
  // Put d8-d15 back.
  aom_pop_neon(store_reg);
#endif
}

View File

@@ -1,141 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <arm_neon.h>
#include "./aom_config.h"
#include "aom_dsp/inv_txfm.h"
#include "aom_ports/mem.h"
// Loads eight 16-byte rows starting at d, `d_stride` bytes apart, into
// *q8u8 (first row) through *q15u8 (last row).
static INLINE void LD_16x8(uint8_t *d, int d_stride, uint8x16_t *q8u8,
                           uint8x16_t *q9u8, uint8x16_t *q10u8,
                           uint8x16_t *q11u8, uint8x16_t *q12u8,
                           uint8x16_t *q13u8, uint8x16_t *q14u8,
                           uint8x16_t *q15u8) {
  uint8x16_t *const rows[8] = { q8u8,  q9u8,  q10u8, q11u8,
                                q12u8, q13u8, q14u8, q15u8 };
  int r;
  for (r = 0; r < 8; ++r) {
    *rows[r] = vld1q_u8(d + r * d_stride);
  }
}
// Adds qdiffu8 to each of the eight row vectors in place, using unsigned
// saturating addition (lanes clamp at 255).
static INLINE void ADD_DIFF_16x8(uint8x16_t qdiffu8, uint8x16_t *q8u8,
                                 uint8x16_t *q9u8, uint8x16_t *q10u8,
                                 uint8x16_t *q11u8, uint8x16_t *q12u8,
                                 uint8x16_t *q13u8, uint8x16_t *q14u8,
                                 uint8x16_t *q15u8) {
  uint8x16_t *const rows[8] = { q8u8,  q9u8,  q10u8, q11u8,
                                q12u8, q13u8, q14u8, q15u8 };
  int r;
  for (r = 0; r < 8; ++r) {
    *rows[r] = vqaddq_u8(*rows[r], qdiffu8);
  }
}
// Subtracts qdiffu8 from each of the eight row vectors in place, using
// unsigned saturating subtraction (lanes clamp at 0).
static INLINE void SUB_DIFF_16x8(uint8x16_t qdiffu8, uint8x16_t *q8u8,
                                 uint8x16_t *q9u8, uint8x16_t *q10u8,
                                 uint8x16_t *q11u8, uint8x16_t *q12u8,
                                 uint8x16_t *q13u8, uint8x16_t *q14u8,
                                 uint8x16_t *q15u8) {
  uint8x16_t *const rows[8] = { q8u8,  q9u8,  q10u8, q11u8,
                                q12u8, q13u8, q14u8, q15u8 };
  int r;
  for (r = 0; r < 8; ++r) {
    *rows[r] = vqsubq_u8(*rows[r], qdiffu8);
  }
}
// Stores *q8u8 (first row) through *q15u8 (last row) as eight 16-byte rows
// starting at d, `d_stride` bytes apart.
static INLINE void ST_16x8(uint8_t *d, int d_stride, uint8x16_t *q8u8,
                           uint8x16_t *q9u8, uint8x16_t *q10u8,
                           uint8x16_t *q11u8, uint8x16_t *q12u8,
                           uint8x16_t *q13u8, uint8x16_t *q14u8,
                           uint8x16_t *q15u8) {
  uint8x16_t *const rows[8] = { q8u8,  q9u8,  q10u8, q11u8,
                                q12u8, q13u8, q14u8, q15u8 };
  int r;
  for (r = 0; r < 8; ++r) {
    vst1q_u8(d + r * d_stride, *rows[r]);
  }
}
// DC-only 32x32 inverse transform: derives a single value from input[0] and
// applies it to every pixel of the 32x32 block at dest (rows `dest_stride`
// bytes apart). A non-negative value is added and a negative one has its
// magnitude subtracted; both use unsigned saturating byte arithmetic.
void aom_idct32x32_1_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
  uint8x16_t diff, r0, r1, r2, r3, r4, r5, r6, r7;
  uint8_t *band;
  int half, group, subtract;
  int16_t a1;
  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
  out = dct_const_round_shift(out * cospi_16_64);
  a1 = ROUND_POWER_OF_TWO(out, 6);

  // Work with the magnitude and remember the direction.
  subtract = a1 < 0;
  if (subtract) {
    a1 = -a1;
  }
  // Clamp the magnitude to what a byte lane can hold.
  if (a1 < 0) {
    a1 = 0;
  } else if (a1 > 255) {
    a1 = 255;
  }
  diff = vdupq_n_u8(a1);

  // The block is handled as two 16-pixel-wide halves, each as four groups of
  // eight rows.
  for (half = 0; half < 2; ++half) {
    band = dest + 16 * half;
    for (group = 0; group < 4; ++group) {
      LD_16x8(band, dest_stride, &r0, &r1, &r2, &r3, &r4, &r5, &r6, &r7);
      if (subtract) {
        SUB_DIFF_16x8(diff, &r0, &r1, &r2, &r3, &r4, &r5, &r6, &r7);
      } else {
        ADD_DIFF_16x8(diff, &r0, &r1, &r2, &r3, &r4, &r5, &r6, &r7);
      }
      ST_16x8(band, dest_stride, &r0, &r1, &r2, &r3, &r4, &r5, &r6, &r7);
      band += dest_stride * 8;
    }
  }
}

View File

@@ -1,686 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <arm_neon.h>
#include "./aom_config.h"
#include "aom_dsp/txfm_common.h"
// Helper macros for the 32x32 inverse transform. They are not hygienic: each
// one reads or writes variables that must already exist at the expansion site
// (`trans_buf`, `out`, `stride`, and the q*s16 vectors named in the bodies).
// The `prev` argument is accepted but not used by any expansion. Arguments are
// substituted unparenthesized, and each expansion already ends in `;`.
//
// Loads q14s16 from row `first` and q13s16 from row `second` of trans_buf
// (8 elements per row).
#define LOAD_FROM_TRANSPOSED(prev, first, second) \
q14s16 = vld1q_s16(trans_buf + first * 8); \
q13s16 = vld1q_s16(trans_buf + second * 8);
// Loads qA / qB from rows `first` / `second` of `out` (32 elements per row).
#define LOAD_FROM_OUTPUT(prev, first, second, qA, qB) \
qA = vld1q_s16(out + first * 32); \
qB = vld1q_s16(out + second * 32);
// Stores qA / qB to rows `first` / `second` of `out` (32 elements per row).
#define STORE_IN_OUTPUT(prev, first, second, qA, qB) \
vst1q_s16(out + first * 32, qA); \
vst1q_s16(out + second * 32, qB);
// Forwards to __STORE_COMBINE_CENTER_RESULTS with `stride` and q6s16..q9s16
// taken from the expansion site.
#define STORE_COMBINE_CENTER_RESULTS(r10, r9) \
__STORE_COMBINE_CENTER_RESULTS(r10, r9, stride, q6s16, q7s16, q8s16, q9s16);
// Rounds four residual vectors (>> 6 with rounding), adds each to eight
// destination pixels and writes the saturated [0, 255] result back:
//   q6s16 -> row at p1            q7s16 -> row at p1 + stride
//   q9s16 -> row at p2            q8s16 -> row at p2 - stride
// All four rows are read before any row is written.
//
// Pixels are accessed as bytes (vld1_u8 / vst1_u8) rather than through a
// casted int16_t pointer, which would break the effective-type rule and imply
// an alignment the uint8_t destination does not promise.
static INLINE void __STORE_COMBINE_CENTER_RESULTS(uint8_t *p1, uint8_t *p2,
                                                  int stride, int16x8_t q6s16,
                                                  int16x8_t q7s16,
                                                  int16x8_t q8s16,
                                                  int16x8_t q9s16) {
  uint8_t *const p1_next = p1 + stride;
  uint8_t *const p2_prev = p2 - stride;
  const uint8x8_t pix_p1 = vld1_u8(p1);
  const uint8x8_t pix_p2 = vld1_u8(p2);
  const uint8x8_t pix_p1_next = vld1_u8(p1_next);
  const uint8x8_t pix_p2_prev = vld1_u8(p2_prev);

  q7s16 = vrshrq_n_s16(q7s16, 6);
  q8s16 = vrshrq_n_s16(q8s16, 6);
  q9s16 = vrshrq_n_s16(q9s16, 6);
  q6s16 = vrshrq_n_s16(q6s16, 6);

  // vaddw_u8 widens the pixels and adds them to the residual's bit pattern;
  // reinterpreting back to signed gives residual + pixel.
  q7s16 = vreinterpretq_s16_u16(
      vaddw_u8(vreinterpretq_u16_s16(q7s16), pix_p1_next));
  q8s16 = vreinterpretq_s16_u16(
      vaddw_u8(vreinterpretq_u16_s16(q8s16), pix_p2_prev));
  q9s16 =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q9s16), pix_p2));
  q6s16 =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q6s16), pix_p1));

  // vqmovun_s16 narrows with unsigned saturation.
  vst1_u8(p1_next, vqmovun_s16(q7s16));
  vst1_u8(p2_prev, vqmovun_s16(q8s16));
  vst1_u8(p1, vqmovun_s16(q6s16));
  vst1_u8(p2, vqmovun_s16(q9s16));
}
// Forwards to __STORE_COMBINE_EXTREME_RESULTS with `stride` and q4s16..q7s16
// taken from the expansion site.
// NOTE(review): the expansion starts with an empty statement (`;`) and ends
// with `;`, so it is more than one statement -- it must not be used as the
// unbraced body of an if/else/loop. The leading `;` looks like a formatting
// artifact; confirm against the call sites before removing it.
#define STORE_COMBINE_EXTREME_RESULTS(r7, r6) \
; \
__STORE_COMBINE_EXTREME_RESULTS(r7, r6, stride, q4s16, q5s16, q6s16, q7s16);
// Rounds four residual vectors (>> 6 with rounding), adds each to eight
// destination pixels and writes the saturated [0, 255] result back:
//   q4s16 -> row at p1            q5s16 -> row at p1 + stride
//   q7s16 -> row at p2            q6s16 -> row at p2 - stride
// All four rows are read before any row is written.
//
// Pixels are accessed as bytes (vld1_u8 / vst1_u8) rather than through a
// casted int16_t pointer, which would break the effective-type rule and imply
// an alignment the uint8_t destination does not promise.
static INLINE void __STORE_COMBINE_EXTREME_RESULTS(uint8_t *p1, uint8_t *p2,
                                                   int stride, int16x8_t q4s16,
                                                   int16x8_t q5s16,
                                                   int16x8_t q6s16,
                                                   int16x8_t q7s16) {
  uint8_t *const p1_next = p1 + stride;
  uint8_t *const p2_prev = p2 - stride;
  const uint8x8_t pix_p1 = vld1_u8(p1);
  const uint8x8_t pix_p2 = vld1_u8(p2);
  const uint8x8_t pix_p1_next = vld1_u8(p1_next);
  const uint8x8_t pix_p2_prev = vld1_u8(p2_prev);

  q5s16 = vrshrq_n_s16(q5s16, 6);
  q6s16 = vrshrq_n_s16(q6s16, 6);
  q7s16 = vrshrq_n_s16(q7s16, 6);
  q4s16 = vrshrq_n_s16(q4s16, 6);

  // vaddw_u8 widens the pixels and adds them to the residual's bit pattern;
  // reinterpreting back to signed gives residual + pixel.
  q5s16 = vreinterpretq_s16_u16(
      vaddw_u8(vreinterpretq_u16_s16(q5s16), pix_p1_next));
  q6s16 = vreinterpretq_s16_u16(
      vaddw_u8(vreinterpretq_u16_s16(q6s16), pix_p2_prev));
  q7s16 =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q7s16), pix_p2));
  q4s16 =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q4s16), pix_p1));

  // vqmovun_s16 narrows with unsigned saturation.
  vst1_u8(p1_next, vqmovun_s16(q5s16));
  vst1_u8(p2_prev, vqmovun_s16(q6s16));
  vst1_u8(p2, vqmovun_s16(q7s16));
  vst1_u8(p1, vqmovun_s16(q4s16));
}
// Runs DO_BUTTERFLY on the q14s16 / q13s16 vectors that must exist at the
// expansion site. qA and qB are passed through as-is, so they must be pointer
// expressions. The expansion already ends in `;`.
#define DO_BUTTERFLY_STD(const_1, const_2, qA, qB) \
DO_BUTTERFLY(q14s16, q13s16, const_1, const_2, qA, qB);
// Rotation butterfly on eight lanes. With x = q14s16 and y = q13s16:
//   *qAs16 = (x * first_const  - y * second_const) >> 14
//   *qBs16 = (x * second_const + y * first_const)  >> 14
// Products are accumulated in 32 bits; the shift rounds and the narrowing back
// to 16 bits saturates.
static INLINE void DO_BUTTERFLY(int16x8_t q14s16, int16x8_t q13s16,
                                int16_t first_const, int16_t second_const,
                                int16x8_t *qAs16, int16x8_t *qBs16) {
  const int16x4_t x_lo = vget_low_s16(q14s16);
  const int16x4_t x_hi = vget_high_s16(q14s16);
  const int16x4_t y_lo = vget_low_s16(q13s16);
  const int16x4_t y_hi = vget_high_s16(q13s16);

  // x * first_const - y * second_const
  int32x4_t diff_lo = vmull_n_s16(x_lo, first_const);
  int32x4_t diff_hi = vmull_n_s16(x_hi, first_const);
  // x * second_const + y * first_const
  int32x4_t sum_lo = vmull_n_s16(x_lo, second_const);
  int32x4_t sum_hi = vmull_n_s16(x_hi, second_const);

  diff_lo = vmlsl_n_s16(diff_lo, y_lo, second_const);
  diff_hi = vmlsl_n_s16(diff_hi, y_hi, second_const);
  sum_lo = vmlal_n_s16(sum_lo, y_lo, first_const);
  sum_hi = vmlal_n_s16(sum_hi, y_hi, first_const);

  *qAs16 =
      vcombine_s16(vqrshrn_n_s32(diff_lo, 14), vqrshrn_n_s32(diff_hi, 14));
  *qBs16 = vcombine_s16(vqrshrn_n_s32(sum_lo, 14), vqrshrn_n_s32(sum_hi, 14));
}
// Transposes four side-by-side 8x8 tiles. Tile i is read from rows 0..7 of
// `input` (rows are 32 elements apart), columns 8i .. 8i+7, and its transpose
// is written as 64 contiguous elements at t_buf + 64 * i.
static INLINE void idct32_transpose_pair(int16_t *input, int16_t *t_buf) {
  const int row_pitch = 32;
  int tile, k;

  for (tile = 0; tile < 4; ++tile) {
    const int16_t *const src = input + 8 * tile;
    int16_t *const dst = t_buf + 64 * tile;
    int16x8_t row[8], left[4], right[4];
    int32x4x2_t left_even, left_odd, right_even, right_odd;
    int16x8x2_t cols01, cols23, cols45, cols67;

    for (k = 0; k < 8; ++k) {
      row[k] = vld1q_s16(src + k * row_pitch);
    }

    // 64-bit step: pair row k with row k + 4, low halves and high halves
    // separately.
    for (k = 0; k < 4; ++k) {
      left[k] = vcombine_s16(vget_low_s16(row[k]), vget_low_s16(row[k + 4]));
      right[k] =
          vcombine_s16(vget_high_s16(row[k]), vget_high_s16(row[k + 4]));
    }

    // 32-bit step: interleave rows 0/2 and rows 1/3 of each half.
    left_even = vtrnq_s32(vreinterpretq_s32_s16(left[0]),
                          vreinterpretq_s32_s16(left[2]));
    left_odd = vtrnq_s32(vreinterpretq_s32_s16(left[1]),
                         vreinterpretq_s32_s16(left[3]));
    right_even = vtrnq_s32(vreinterpretq_s32_s16(right[0]),
                           vreinterpretq_s32_s16(right[2]));
    right_odd = vtrnq_s32(vreinterpretq_s32_s16(right[1]),
                          vreinterpretq_s32_s16(right[3]));

    // 16-bit step: each vtrn yields two finished output rows.
    cols01 = vtrnq_s16(vreinterpretq_s16_s32(left_even.val[0]),
                       vreinterpretq_s16_s32(left_odd.val[0]));
    cols23 = vtrnq_s16(vreinterpretq_s16_s32(left_even.val[1]),
                       vreinterpretq_s16_s32(left_odd.val[1]));
    cols45 = vtrnq_s16(vreinterpretq_s16_s32(right_even.val[0]),
                       vreinterpretq_s16_s32(right_odd.val[0]));
    cols67 = vtrnq_s16(vreinterpretq_s16_s32(right_even.val[1]),
                       vreinterpretq_s16_s32(right_odd.val[1]));

    vst1q_s16(dst + 0 * 8, cols01.val[0]);
    vst1q_s16(dst + 1 * 8, cols01.val[1]);
    vst1q_s16(dst + 2 * 8, cols23.val[0]);
    vst1q_s16(dst + 3 * 8, cols23.val[1]);
    vst1q_s16(dst + 4 * 8, cols45.val[0]);
    vst1q_s16(dst + 5 * 8, cols45.val[1]);
    vst1q_s16(dst + 6 * 8, cols67.val[0]);
    vst1q_s16(dst + 7 * 8, cols67.val[1]);
  }
}
// Closing add/subtract stage for one band of the first 32x32 IDCT pass.
//
// Starting from the twelve vectors handed in by the caller, the function
// alternates between LOAD_FROM_OUTPUT, a group of vaddq_s16/vsubq_s16
// operations, and STORE_IN_OUTPUT. Every sum/difference pair is stored before
// the registers holding it are reused.
//
// NOTE(review): LOAD_FROM_OUTPUT and STORE_IN_OUTPUT are defined outside this
// view. Presumably their 2nd and 3rd arguments select the two entries of `out`
// that are read or written and the 1st argument is only a bookkeeping hint --
// confirm against the macro definitions. The "index" wording in the comments
// below refers to those 2nd/3rd macro arguments.
static INLINE void idct32_bands_end_1st_pass(int16_t *out, int16x8_t q2s16,
int16x8_t q3s16, int16x8_t q6s16,
int16x8_t q7s16, int16x8_t q8s16,
int16x8_t q9s16, int16x8_t q10s16,
int16x8_t q11s16, int16x8_t q12s16,
int16x8_t q13s16, int16x8_t q14s16,
int16x8_t q15s16) {
int16x8_t q0s16, q1s16, q4s16, q5s16;
// The caller already finished these four vectors; store them unchanged
// (indices 16,17 and 14,15).
STORE_IN_OUTPUT(17, 16, 17, q6s16, q7s16);
STORE_IN_OUTPUT(17, 14, 15, q8s16, q9s16);
// q2s16/q3s16 against indices 30,31: differences are stored under 30,31,
// sums under 0,1.
LOAD_FROM_OUTPUT(15, 30, 31, q0s16, q1s16);
q4s16 = vaddq_s16(q2s16, q1s16);
q5s16 = vaddq_s16(q3s16, q0s16);
q6s16 = vsubq_s16(q3s16, q0s16);
q7s16 = vsubq_s16(q2s16, q1s16);
STORE_IN_OUTPUT(31, 30, 31, q6s16, q7s16);
STORE_IN_OUTPUT(31, 0, 1, q4s16, q5s16);
// q10s16/q11s16 against indices 12,13. The sums (q2s16/q3s16) are kept for
// the 28,29 step; the differences (q4s16/q5s16) feed the 18,19 step.
LOAD_FROM_OUTPUT(1, 12, 13, q0s16, q1s16);
q2s16 = vaddq_s16(q10s16, q1s16);
q3s16 = vaddq_s16(q11s16, q0s16);
q4s16 = vsubq_s16(q11s16, q0s16);
q5s16 = vsubq_s16(q10s16, q1s16);
// Differences against indices 18,19: stored under 18,19 and 12,13.
LOAD_FROM_OUTPUT(13, 18, 19, q0s16, q1s16);
q8s16 = vaddq_s16(q4s16, q1s16);
q9s16 = vaddq_s16(q5s16, q0s16);
q6s16 = vsubq_s16(q5s16, q0s16);
q7s16 = vsubq_s16(q4s16, q1s16);
STORE_IN_OUTPUT(19, 18, 19, q6s16, q7s16);
STORE_IN_OUTPUT(19, 12, 13, q8s16, q9s16);
// Sums against indices 28,29: stored under 28,29 and 2,3.
LOAD_FROM_OUTPUT(13, 28, 29, q0s16, q1s16);
q4s16 = vaddq_s16(q2s16, q1s16);
q5s16 = vaddq_s16(q3s16, q0s16);
q6s16 = vsubq_s16(q3s16, q0s16);
q7s16 = vsubq_s16(q2s16, q1s16);
STORE_IN_OUTPUT(29, 28, 29, q6s16, q7s16);
STORE_IN_OUTPUT(29, 2, 3, q4s16, q5s16);
// Same three-step pattern for q12s16/q13s16: indices 10,11, then 20,21,
// then 26,27 (paired with 4,5).
LOAD_FROM_OUTPUT(3, 10, 11, q0s16, q1s16);
q2s16 = vaddq_s16(q12s16, q1s16);
q3s16 = vaddq_s16(q13s16, q0s16);
q4s16 = vsubq_s16(q13s16, q0s16);
q5s16 = vsubq_s16(q12s16, q1s16);
LOAD_FROM_OUTPUT(11, 20, 21, q0s16, q1s16);
q8s16 = vaddq_s16(q4s16, q1s16);
q9s16 = vaddq_s16(q5s16, q0s16);
q6s16 = vsubq_s16(q5s16, q0s16);
q7s16 = vsubq_s16(q4s16, q1s16);
STORE_IN_OUTPUT(21, 20, 21, q6s16, q7s16);
STORE_IN_OUTPUT(21, 10, 11, q8s16, q9s16);
LOAD_FROM_OUTPUT(11, 26, 27, q0s16, q1s16);
q4s16 = vaddq_s16(q2s16, q1s16);
q5s16 = vaddq_s16(q3s16, q0s16);
q6s16 = vsubq_s16(q3s16, q0s16);
q7s16 = vsubq_s16(q2s16, q1s16);
STORE_IN_OUTPUT(27, 26, 27, q6s16, q7s16);
STORE_IN_OUTPUT(27, 4, 5, q4s16, q5s16);
// Same three-step pattern for q14s16/q15s16: indices 8,9, then 22,23,
// then 24,25 (paired with 6,7).
LOAD_FROM_OUTPUT(5, 8, 9, q0s16, q1s16);
q2s16 = vaddq_s16(q14s16, q1s16);
q3s16 = vaddq_s16(q15s16, q0s16);
q4s16 = vsubq_s16(q15s16, q0s16);
q5s16 = vsubq_s16(q14s16, q1s16);
LOAD_FROM_OUTPUT(9, 22, 23, q0s16, q1s16);
q8s16 = vaddq_s16(q4s16, q1s16);
q9s16 = vaddq_s16(q5s16, q0s16);
q6s16 = vsubq_s16(q5s16, q0s16);
q7s16 = vsubq_s16(q4s16, q1s16);
STORE_IN_OUTPUT(23, 22, 23, q6s16, q7s16);
STORE_IN_OUTPUT(23, 8, 9, q8s16, q9s16);
LOAD_FROM_OUTPUT(9, 24, 25, q0s16, q1s16);
q4s16 = vaddq_s16(q2s16, q1s16);
q5s16 = vaddq_s16(q3s16, q0s16);
q6s16 = vsubq_s16(q3s16, q0s16);
q7s16 = vsubq_s16(q2s16, q1s16);
STORE_IN_OUTPUT(25, 24, 25, q6s16, q7s16);
STORE_IN_OUTPUT(25, 6, 7, q4s16, q5s16);
return;
}
// Closing add/subtract stage for one band of the second 32x32 IDCT pass.
//
// The arithmetic mirrors idct32_bands_end_1st_pass, but each finished group is
// handed to STORE_COMBINE_CENTER_RESULTS / STORE_COMBINE_EXTREME_RESULTS
// together with two row pointers into `dest` instead of being stored via
// STORE_IN_OUTPUT.
//
// Four row pointers walk `dest` two rows at a time:
//   r7  starts at row 0  and moves forward,
//   r6  starts at row 31 and moves backward,
//   r9  starts at row 15 and moves backward,
//   r10 starts at row 16 and moves forward.
//
// NOTE(review): LOAD_FROM_OUTPUT and both STORE_COMBINE_* macros are defined
// outside this view. The STORE_COMBINE_* macros receive only the two row
// pointers, so they presumably pick up `stride` and the q4s16..q9s16 locals by
// name -- confirm against the macro definitions before renaming or reordering
// anything in this function.
static INLINE void idct32_bands_end_2nd_pass(
int16_t *out, uint8_t *dest, int stride, int16x8_t q2s16, int16x8_t q3s16,
int16x8_t q6s16, int16x8_t q7s16, int16x8_t q8s16, int16x8_t q9s16,
int16x8_t q10s16, int16x8_t q11s16, int16x8_t q12s16, int16x8_t q13s16,
int16x8_t q14s16, int16x8_t q15s16) {
uint8_t *r6 = dest + 31 * stride;
uint8_t *r7 = dest /* + 0 * stride*/;
uint8_t *r9 = dest + 15 * stride;
uint8_t *r10 = dest + 16 * stride;
// Each STORE_COMBINE_* step is followed by a two-row pointer adjustment.
int str2 = stride << 1;
int16x8_t q0s16, q1s16, q4s16, q5s16;
// Center group using the q6s16..q9s16 values supplied by the caller.
STORE_COMBINE_CENTER_RESULTS(r10, r9);
r10 += str2;
r9 -= str2;
// q2s16/q3s16 against indices 30,31 -> extreme group.
LOAD_FROM_OUTPUT(17, 30, 31, q0s16, q1s16)
q4s16 = vaddq_s16(q2s16, q1s16);
q5s16 = vaddq_s16(q3s16, q0s16);
q6s16 = vsubq_s16(q3s16, q0s16);
q7s16 = vsubq_s16(q2s16, q1s16);
STORE_COMBINE_EXTREME_RESULTS(r7, r6);
r7 += str2;
r6 -= str2;
// q10s16/q11s16 against indices 12,13; sums are kept in q2s16/q3s16 for the
// following extreme group, differences feed the center group with 18,19.
LOAD_FROM_OUTPUT(31, 12, 13, q0s16, q1s16)
q2s16 = vaddq_s16(q10s16, q1s16);
q3s16 = vaddq_s16(q11s16, q0s16);
q4s16 = vsubq_s16(q11s16, q0s16);
q5s16 = vsubq_s16(q10s16, q1s16);
LOAD_FROM_OUTPUT(13, 18, 19, q0s16, q1s16)
q8s16 = vaddq_s16(q4s16, q1s16);
q9s16 = vaddq_s16(q5s16, q0s16);
q6s16 = vsubq_s16(q5s16, q0s16);
q7s16 = vsubq_s16(q4s16, q1s16);
STORE_COMBINE_CENTER_RESULTS(r10, r9);
r10 += str2;
r9 -= str2;
LOAD_FROM_OUTPUT(19, 28, 29, q0s16, q1s16)
q4s16 = vaddq_s16(q2s16, q1s16);
q5s16 = vaddq_s16(q3s16, q0s16);
q6s16 = vsubq_s16(q3s16, q0s16);
q7s16 = vsubq_s16(q2s16, q1s16);
STORE_COMBINE_EXTREME_RESULTS(r7, r6);
r7 += str2;
r6 -= str2;
// Same pattern for q12s16/q13s16: indices 10,11, then 20,21 (center group),
// then 26,27 (extreme group).
LOAD_FROM_OUTPUT(29, 10, 11, q0s16, q1s16)
q2s16 = vaddq_s16(q12s16, q1s16);
q3s16 = vaddq_s16(q13s16, q0s16);
q4s16 = vsubq_s16(q13s16, q0s16);
q5s16 = vsubq_s16(q12s16, q1s16);
LOAD_FROM_OUTPUT(11, 20, 21, q0s16, q1s16)
q8s16 = vaddq_s16(q4s16, q1s16);
q9s16 = vaddq_s16(q5s16, q0s16);
q6s16 = vsubq_s16(q5s16, q0s16);
q7s16 = vsubq_s16(q4s16, q1s16);
STORE_COMBINE_CENTER_RESULTS(r10, r9);
r10 += str2;
r9 -= str2;
LOAD_FROM_OUTPUT(21, 26, 27, q0s16, q1s16)
q4s16 = vaddq_s16(q2s16, q1s16);
q5s16 = vaddq_s16(q3s16, q0s16);
q6s16 = vsubq_s16(q3s16, q0s16);
q7s16 = vsubq_s16(q2s16, q1s16);
STORE_COMBINE_EXTREME_RESULTS(r7, r6);
r7 += str2;
r6 -= str2;
// Same pattern for q14s16/q15s16: indices 8,9, then 22,23 (center group),
// then 24,25 (extreme group). These are the last uses of the row pointers,
// so they are not advanced afterwards.
LOAD_FROM_OUTPUT(27, 8, 9, q0s16, q1s16)
q2s16 = vaddq_s16(q14s16, q1s16);
q3s16 = vaddq_s16(q15s16, q0s16);
q4s16 = vsubq_s16(q15s16, q0s16);
q5s16 = vsubq_s16(q14s16, q1s16);
LOAD_FROM_OUTPUT(9, 22, 23, q0s16, q1s16)
q8s16 = vaddq_s16(q4s16, q1s16);
q9s16 = vaddq_s16(q5s16, q0s16);
q6s16 = vsubq_s16(q5s16, q0s16);
q7s16 = vsubq_s16(q4s16, q1s16);
STORE_COMBINE_CENTER_RESULTS(r10, r9);
LOAD_FROM_OUTPUT(23, 24, 25, q0s16, q1s16)
q4s16 = vaddq_s16(q2s16, q1s16);
q5s16 = vaddq_s16(q3s16, q0s16);
q6s16 = vsubq_s16(q3s16, q0s16);
q7s16 = vsubq_s16(q2s16, q1s16);
STORE_COMBINE_EXTREME_RESULTS(r7, r6);
return;
}
// Full 32x32 inverse DCT driver (NEON).
//
// Parameters:
//   input  - coefficient buffer; read in the first pass, after which the local
//            pointer is redirected to the pass1 scratch buffer.
//   dest   - destination pixel buffer, forwarded to idct32_bands_end_2nd_pass
//            and advanced by 8 after each band of the second pass.
//   stride - forwarded to idct32_bands_end_2nd_pass together with dest.
//
// Structure: the outer loop runs two passes. Pass 0 reads `input` and writes
// the pass1 buffer; pass 1 reads pass1 and uses pass2 as its `out` buffer.
// Each pass processes four bands; for every band the input advances by 32 * 8
// elements and `out` by 8. Per band, idct32_transpose_pair fills trans_buf,
// blocks A-D compute the butterfly stages, and the matching
// idct32_bands_end_*_pass helper performs the final add/subtract stage.
//
// NOTE(review): LOAD_FROM_TRANSPOSED, DO_BUTTERFLY_STD, DO_BUTTERFLY,
// LOAD_FROM_OUTPUT and STORE_IN_OUTPUT are defined outside this view. Several
// of them take fewer arguments than the values they appear to consume
// (q13s16/q14s16 are assigned right before DO_BUTTERFLY_STD but never passed),
// so they presumably reference trans_buf, out, q13s16 and q14s16 by name --
// confirm against the macro definitions before renaming or reordering.
void aom_idct32x32_1024_add_neon(int16_t *input, uint8_t *dest, int stride) {
int i, idct32_pass_loop;
int16_t trans_buf[32 * 8];
int16_t pass1[32 * 32];
int16_t pass2[32 * 32];
int16_t *out;
int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
for (idct32_pass_loop = 0, out = pass1; idct32_pass_loop < 2;
idct32_pass_loop++,
input = pass1, // the input of pass2 is the result of pass1
out = pass2) {
for (i = 0; i < 4; i++, input += 32 * 8, out += 8) { // idct32_bands_loop
idct32_transpose_pair(input, trans_buf);
// -----------------------------------------
// BLOCK A: 16-19,28-31
// -----------------------------------------
// generate 16,17,30,31
// part of stage 1
LOAD_FROM_TRANSPOSED(0, 1, 31)
DO_BUTTERFLY_STD(cospi_31_64, cospi_1_64, &q0s16, &q2s16)
LOAD_FROM_TRANSPOSED(31, 17, 15)
DO_BUTTERFLY_STD(cospi_15_64, cospi_17_64, &q1s16, &q3s16)
// part of stage 2
q4s16 = vaddq_s16(q0s16, q1s16);
q13s16 = vsubq_s16(q0s16, q1s16);
q6s16 = vaddq_s16(q2s16, q3s16);
q14s16 = vsubq_s16(q2s16, q3s16);
// part of stage 3
DO_BUTTERFLY_STD(cospi_28_64, cospi_4_64, &q5s16, &q7s16)
// generate 18,19,28,29
// part of stage 1
LOAD_FROM_TRANSPOSED(15, 9, 23)
DO_BUTTERFLY_STD(cospi_23_64, cospi_9_64, &q0s16, &q2s16)
LOAD_FROM_TRANSPOSED(23, 25, 7)
DO_BUTTERFLY_STD(cospi_7_64, cospi_25_64, &q1s16, &q3s16)
// part of stage 2
q13s16 = vsubq_s16(q3s16, q2s16);
q3s16 = vaddq_s16(q3s16, q2s16);
q14s16 = vsubq_s16(q1s16, q0s16);
q2s16 = vaddq_s16(q1s16, q0s16);
// part of stage 3
DO_BUTTERFLY_STD(-cospi_4_64, -cospi_28_64, &q1s16, &q0s16)
// part of stage 4
q8s16 = vaddq_s16(q4s16, q2s16);
q9s16 = vaddq_s16(q5s16, q0s16);
q10s16 = vaddq_s16(q7s16, q1s16);
q15s16 = vaddq_s16(q6s16, q3s16);
q13s16 = vsubq_s16(q5s16, q0s16);
q14s16 = vsubq_s16(q7s16, q1s16);
STORE_IN_OUTPUT(0, 16, 31, q8s16, q15s16)
STORE_IN_OUTPUT(31, 17, 30, q9s16, q10s16)
// part of stage 5
DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q0s16, &q1s16)
STORE_IN_OUTPUT(30, 29, 18, q1s16, q0s16)
// part of stage 4
q13s16 = vsubq_s16(q4s16, q2s16);
q14s16 = vsubq_s16(q6s16, q3s16);
// part of stage 5
DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q4s16, &q6s16)
STORE_IN_OUTPUT(18, 19, 28, q4s16, q6s16)
// -----------------------------------------
// BLOCK B: 20-23,24-27
// -----------------------------------------
// generate 20,21,26,27
// part of stage 1
LOAD_FROM_TRANSPOSED(7, 5, 27)
DO_BUTTERFLY_STD(cospi_27_64, cospi_5_64, &q0s16, &q2s16)
LOAD_FROM_TRANSPOSED(27, 21, 11)
DO_BUTTERFLY_STD(cospi_11_64, cospi_21_64, &q1s16, &q3s16)
// part of stage 2
q13s16 = vsubq_s16(q0s16, q1s16);
q0s16 = vaddq_s16(q0s16, q1s16);
q14s16 = vsubq_s16(q2s16, q3s16);
q2s16 = vaddq_s16(q2s16, q3s16);
// part of stage 3
DO_BUTTERFLY_STD(cospi_12_64, cospi_20_64, &q1s16, &q3s16)
// generate 22,23,24,25
// part of stage 1
LOAD_FROM_TRANSPOSED(11, 13, 19)
DO_BUTTERFLY_STD(cospi_19_64, cospi_13_64, &q5s16, &q7s16)
LOAD_FROM_TRANSPOSED(19, 29, 3)
DO_BUTTERFLY_STD(cospi_3_64, cospi_29_64, &q4s16, &q6s16)
// part of stage 2
q14s16 = vsubq_s16(q4s16, q5s16);
q5s16 = vaddq_s16(q4s16, q5s16);
q13s16 = vsubq_s16(q6s16, q7s16);
q6s16 = vaddq_s16(q6s16, q7s16);
// part of stage 3
DO_BUTTERFLY_STD(-cospi_20_64, -cospi_12_64, &q4s16, &q7s16)
// part of stage 4
q10s16 = vaddq_s16(q7s16, q1s16);
q11s16 = vaddq_s16(q5s16, q0s16);
q12s16 = vaddq_s16(q6s16, q2s16);
q15s16 = vaddq_s16(q4s16, q3s16);
// part of stage 6
LOAD_FROM_OUTPUT(28, 16, 17, q14s16, q13s16)
q8s16 = vaddq_s16(q14s16, q11s16);
q9s16 = vaddq_s16(q13s16, q10s16);
q13s16 = vsubq_s16(q13s16, q10s16);
q11s16 = vsubq_s16(q14s16, q11s16);
STORE_IN_OUTPUT(17, 17, 16, q9s16, q8s16)
LOAD_FROM_OUTPUT(16, 30, 31, q14s16, q9s16)
q8s16 = vsubq_s16(q9s16, q12s16);
q10s16 = vaddq_s16(q14s16, q15s16);
q14s16 = vsubq_s16(q14s16, q15s16);
q12s16 = vaddq_s16(q9s16, q12s16);
STORE_IN_OUTPUT(31, 30, 31, q10s16, q12s16)
// part of stage 7
DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16)
STORE_IN_OUTPUT(31, 25, 22, q14s16, q13s16)
// Move the saved differences into q13s16/q14s16 before the next
// DO_BUTTERFLY_STD.
q13s16 = q11s16;
q14s16 = q8s16;
DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16)
STORE_IN_OUTPUT(22, 24, 23, q14s16, q13s16)
// part of stage 4
q14s16 = vsubq_s16(q5s16, q0s16);
q13s16 = vsubq_s16(q6s16, q2s16);
DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q5s16, &q6s16);
q14s16 = vsubq_s16(q7s16, q1s16);
q13s16 = vsubq_s16(q4s16, q3s16);
DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q0s16, &q1s16);
// part of stage 6
LOAD_FROM_OUTPUT(23, 18, 19, q14s16, q13s16)
q8s16 = vaddq_s16(q14s16, q1s16);
q9s16 = vaddq_s16(q13s16, q6s16);
q13s16 = vsubq_s16(q13s16, q6s16);
q1s16 = vsubq_s16(q14s16, q1s16);
STORE_IN_OUTPUT(19, 18, 19, q8s16, q9s16)
LOAD_FROM_OUTPUT(19, 28, 29, q8s16, q9s16)
q14s16 = vsubq_s16(q8s16, q5s16);
q10s16 = vaddq_s16(q8s16, q5s16);
q11s16 = vaddq_s16(q9s16, q0s16);
q0s16 = vsubq_s16(q9s16, q0s16);
STORE_IN_OUTPUT(29, 28, 29, q10s16, q11s16)
// part of stage 7
DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16)
STORE_IN_OUTPUT(29, 20, 27, q13s16, q14s16)
// This is the only butterfly that names its two source vectors explicitly.
DO_BUTTERFLY(q0s16, q1s16, cospi_16_64, cospi_16_64, &q1s16, &q0s16);
STORE_IN_OUTPUT(27, 21, 26, q1s16, q0s16)
// -----------------------------------------
// BLOCK C: 8-10,11-15
// -----------------------------------------
// generate 8,9,14,15
// part of stage 2
LOAD_FROM_TRANSPOSED(3, 2, 30)
DO_BUTTERFLY_STD(cospi_30_64, cospi_2_64, &q0s16, &q2s16)
LOAD_FROM_TRANSPOSED(30, 18, 14)
DO_BUTTERFLY_STD(cospi_14_64, cospi_18_64, &q1s16, &q3s16)
// part of stage 3
q13s16 = vsubq_s16(q0s16, q1s16);
q0s16 = vaddq_s16(q0s16, q1s16);
q14s16 = vsubq_s16(q2s16, q3s16);
q2s16 = vaddq_s16(q2s16, q3s16);
// part of stage 4
DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q1s16, &q3s16)
// generate 10,11,12,13
// part of stage 2
LOAD_FROM_TRANSPOSED(14, 10, 22)
DO_BUTTERFLY_STD(cospi_22_64, cospi_10_64, &q5s16, &q7s16)
LOAD_FROM_TRANSPOSED(22, 26, 6)
DO_BUTTERFLY_STD(cospi_6_64, cospi_26_64, &q4s16, &q6s16)
// part of stage 3
q14s16 = vsubq_s16(q4s16, q5s16);
q5s16 = vaddq_s16(q4s16, q5s16);
q13s16 = vsubq_s16(q6s16, q7s16);
q6s16 = vaddq_s16(q6s16, q7s16);
// part of stage 4
DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q4s16, &q7s16)
// part of stage 5
q8s16 = vaddq_s16(q0s16, q5s16);
q9s16 = vaddq_s16(q1s16, q7s16);
q13s16 = vsubq_s16(q1s16, q7s16);
q14s16 = vsubq_s16(q3s16, q4s16);
q10s16 = vaddq_s16(q3s16, q4s16);
q15s16 = vaddq_s16(q2s16, q6s16);
STORE_IN_OUTPUT(26, 8, 15, q8s16, q15s16)
STORE_IN_OUTPUT(15, 9, 14, q9s16, q10s16)
// part of stage 6
DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16)
STORE_IN_OUTPUT(14, 13, 10, q3s16, q1s16)
q13s16 = vsubq_s16(q0s16, q5s16);
q14s16 = vsubq_s16(q2s16, q6s16);
DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16)
STORE_IN_OUTPUT(10, 11, 12, q1s16, q3s16)
// -----------------------------------------
// BLOCK D: 0-3,4-7
// -----------------------------------------
// generate 4,5,6,7
// part of stage 3
LOAD_FROM_TRANSPOSED(6, 4, 28)
DO_BUTTERFLY_STD(cospi_28_64, cospi_4_64, &q0s16, &q2s16)
LOAD_FROM_TRANSPOSED(28, 20, 12)
DO_BUTTERFLY_STD(cospi_12_64, cospi_20_64, &q1s16, &q3s16)
// part of stage 4
q13s16 = vsubq_s16(q0s16, q1s16);
q0s16 = vaddq_s16(q0s16, q1s16);
q14s16 = vsubq_s16(q2s16, q3s16);
q2s16 = vaddq_s16(q2s16, q3s16);
// part of stage 5
DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16)
// generate 0,1,2,3
// part of stage 4
LOAD_FROM_TRANSPOSED(12, 0, 16)
DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q5s16, &q7s16)
LOAD_FROM_TRANSPOSED(16, 8, 24)
DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q14s16, &q6s16)
// part of stage 5
q4s16 = vaddq_s16(q7s16, q6s16);
q7s16 = vsubq_s16(q7s16, q6s16);
q6s16 = vsubq_s16(q5s16, q14s16);
q5s16 = vaddq_s16(q5s16, q14s16);
// part of stage 6
q8s16 = vaddq_s16(q4s16, q2s16);
q9s16 = vaddq_s16(q5s16, q3s16);
q10s16 = vaddq_s16(q6s16, q1s16);
q11s16 = vaddq_s16(q7s16, q0s16);
q12s16 = vsubq_s16(q7s16, q0s16);
q13s16 = vsubq_s16(q6s16, q1s16);
q14s16 = vsubq_s16(q5s16, q3s16);
q15s16 = vsubq_s16(q4s16, q2s16);
// part of stage 7
LOAD_FROM_OUTPUT(12, 14, 15, q0s16, q1s16)
q2s16 = vaddq_s16(q8s16, q1s16);
q3s16 = vaddq_s16(q9s16, q0s16);
q4s16 = vsubq_s16(q9s16, q0s16);
q5s16 = vsubq_s16(q8s16, q1s16);
LOAD_FROM_OUTPUT(15, 16, 17, q0s16, q1s16)
q8s16 = vaddq_s16(q4s16, q1s16);
q9s16 = vaddq_s16(q5s16, q0s16);
q6s16 = vsubq_s16(q5s16, q0s16);
q7s16 = vsubq_s16(q4s16, q1s16);
// Finish the band: the first pass stores through `out`, the second pass also
// receives dest/stride and then steps dest to the next 8 columns.
if (idct32_pass_loop == 0) {
idct32_bands_end_1st_pass(out, q2s16, q3s16, q6s16, q7s16, q8s16, q9s16,
q10s16, q11s16, q12s16, q13s16, q14s16,
q15s16);
} else {
idct32_bands_end_2nd_pass(out, dest, stride, q2s16, q3s16, q6s16, q7s16,
q8s16, q9s16, q10s16, q11s16, q12s16, q13s16,
q14s16, q15s16);
dest += 8;
}
}
}
return;
}

View File

@@ -1,47 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <arm_neon.h>
#include "aom_dsp/inv_txfm.h"
#include "aom_ports/mem.h"
// DC-only 4x4 inverse transform: derives a single offset from input[0] and
// adds it, with unsigned saturation, to every pixel of the 4x4 block at dest.
//
//   input       - coefficient block; only input[0] is read.
//   dest        - top-left pixel of the 4x4 destination block.
//   dest_stride - distance in bytes between consecutive destination rows.
//
// NOTE(review): each row is accessed through a uint32_t* cast of dest, which
// assumes such an access is acceptable for the caller's buffers -- confirm.
void aom_idct4x4_1_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
  int16_t dc = dct_const_round_shift(input[0] * cospi_16_64);
  int16_t offset;
  uint16x8_t offset_u16;
  uint32x2_t pixels = vdup_n_u32(0);
  uint8_t *row = dest;
  int pair;

  dc = dct_const_round_shift(dc * cospi_16_64);
  offset = ROUND_POWER_OF_TWO(dc, 4);
  offset_u16 = vreinterpretq_u16_s16(vdupq_n_s16(offset));

  // Two rows (2 x 4 pixels) fit in one 64-bit vector, so the block is handled
  // as two row pairs.
  for (pair = 0; pair < 2; ++pair) {
    uint8_t *const upper = row;
    uint8_t *const lower = upper + dest_stride;
    uint16x8_t sum;
    uint8x8_t clamped;

    pixels = vld1_lane_u32((const uint32_t *)upper, pixels, 0);
    pixels = vld1_lane_u32((const uint32_t *)lower, pixels, 1);

    // Widen the pixels to 16 bits while adding the offset, then narrow back
    // with saturation to [0, 255].
    sum = vaddw_u8(offset_u16, vreinterpret_u8_u32(pixels));
    clamped = vqmovun_s16(vreinterpretq_s16_u16(sum));

    vst1_lane_u32((uint32_t *)upper, vreinterpret_u32_u8(clamped), 0);
    vst1_lane_u32((uint32_t *)lower, vreinterpret_u32_u8(clamped), 1);

    row = lower + dest_stride;
  }
}

View File

@@ -1,146 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <arm_neon.h>
#include "aom_dsp/txfm_common.h"
// Full 4x4 inverse DCT: transforms the 16 coefficients in `input` with two
// transpose + 1-D passes, rounds the result by 4 bits and adds it, with
// unsigned saturation, to the 4x4 pixel block at `dest`.
//
//   input       - 16 coefficients, read as two 8-lane vectors.
//   dest        - top-left pixel of the 4x4 destination block.
//   dest_stride - distance in bytes between consecutive destination rows.
//
// NOTE(review): each row is accessed through a uint32_t* cast of dest, which
// assumes such an access is acceptable for the caller's buffers -- confirm.
void aom_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
  const int16x4_t cos8 = vdup_n_s16((int16_t)cospi_8_64);
  const int16x4_t cos16 = vdup_n_s16((int16_t)cospi_16_64);
  const int16x4_t cos24 = vdup_n_s16((int16_t)cospi_24_64);
  int16x8_t rows01 = vld1q_s16(input);
  int16x8_t rows23 = vld1q_s16(input + 8);
  uint8_t *const row0 = dest;
  uint8_t *const row1 = row0 + dest_stride;
  uint8_t *const row2 = row1 + dest_stride;
  uint8_t *const row3 = row2 + dest_stride;
  uint32x2_t px01 = vdup_n_u32(0);
  uint32x2_t px32 = vdup_n_u32(0);
  uint8x8_t out01, out32;
  int pass;

  for (pass = 0; pass < 2; ++pass) {
    // 4x4 transpose: 16-bit interleave inside each vector, then a 32-bit
    // interleave across the two vectors.
    const int16x4x2_t t01 =
        vtrn_s16(vget_low_s16(rows01), vget_high_s16(rows01));
    const int16x4x2_t t23 =
        vtrn_s16(vget_low_s16(rows23), vget_high_s16(rows23));
    const int32x4x2_t t = vtrnq_s32(
        vreinterpretq_s32_s16(vcombine_s16(t01.val[0], t01.val[1])),
        vreinterpretq_s32_s16(vcombine_s16(t23.val[0], t23.val[1])));
    const int16x4_t x0 = vget_low_s16(vreinterpretq_s16_s32(t.val[0]));
    const int16x4_t x1 = vget_high_s16(vreinterpretq_s16_s32(t.val[0]));
    const int16x4_t x2 = vget_low_s16(vreinterpretq_s16_s32(t.val[1]));
    const int16x4_t x3 = vget_high_s16(vreinterpretq_s16_s32(t.val[1]));

    // stage 1: 32-bit products, narrowed with a rounding shift of 14 bits.
    const int16x4_t even_sum =
        vqrshrn_n_s32(vmull_s16(vadd_s16(x0, x2), cos16), 14);
    const int16x4_t even_diff =
        vqrshrn_n_s32(vmull_s16(vsub_s16(x0, x2), cos16), 14);
    const int16x4_t odd_a =
        vqrshrn_n_s32(vmlal_s16(vmull_s16(x1, cos8), x3, cos24), 14);
    const int16x4_t odd_b =
        vqrshrn_n_s32(vmlsl_s16(vmull_s16(x1, cos24), x3, cos8), 14);
    const int16x8_t even = vcombine_s16(even_sum, even_diff);
    const int16x8_t odd = vcombine_s16(odd_a, odd_b);

    // stage 2
    rows01 = vaddq_s16(even, odd);
    rows23 = vsubq_s16(even, odd);

    // Only between the two passes are the halves of the difference vector
    // exchanged; after the final pass the swapped order is absorbed by the
    // lane assignment of the pixel load/store below.
    if (pass == 0) {
      rows23 = vcombine_s16(vget_high_s16(rows23), vget_low_s16(rows23));
    }
  }

  rows01 = vrshrq_n_s16(rows01, 4);
  rows23 = vrshrq_n_s16(rows23, 4);

  // Rows 0/1 fill lanes 0/1 of px01; rows 2/3 fill lanes 1/0 of px32 so the
  // pixels line up with the lane order of rows23.
  px01 = vld1_lane_u32((const uint32_t *)row0, px01, 0);
  px01 = vld1_lane_u32((const uint32_t *)row1, px01, 1);
  px32 = vld1_lane_u32((const uint32_t *)row2, px32, 1);
  px32 = vld1_lane_u32((const uint32_t *)row3, px32, 0);

  // Widen the pixels while adding the residual, then narrow with saturation.
  out01 = vqmovun_s16(vreinterpretq_s16_u16(
      vaddw_u8(vreinterpretq_u16_s16(rows01), vreinterpret_u8_u32(px01))));
  out32 = vqmovun_s16(vreinterpretq_s16_u16(
      vaddw_u8(vreinterpretq_u16_s16(rows23), vreinterpret_u8_u32(px32))));

  vst1_lane_u32((uint32_t *)row0, vreinterpret_u32_u8(out01), 0);
  vst1_lane_u32((uint32_t *)row1, vreinterpret_u32_u8(out01), 1);
  vst1_lane_u32((uint32_t *)row2, vreinterpret_u32_u8(out32), 1);
  vst1_lane_u32((uint32_t *)row3, vreinterpret_u32_u8(out32), 0);
}

View File

@@ -1,62 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <arm_neon.h>
#include "aom_dsp/inv_txfm.h"
#include "aom_ports/mem.h"
// DC-only 8x8 inverse transform: derives a single offset from input[0] and
// adds it, with unsigned saturation, to every pixel of the 8x8 block at dest.
//
//   input       - coefficient block; only input[0] is read.
//   dest        - top-left pixel of the 8x8 destination block.
//   dest_stride - distance in bytes between consecutive destination rows.
//
// Rows are loaded and stored with the byte-typed vld1_u8/vst1_u8 intrinsics.
// The previous uint64_t* casts of the byte pointer implied an alignment the
// destination buffer is not shown to guarantee.
void aom_idct8x8_1_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
  uint8x8_t px0, px1, px2, px3;
  uint16x8_t dc_u16;
  uint8_t *src, *dst;
  int i;
  int16_t a1;
  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);

  out = dct_const_round_shift(out * cospi_16_64);
  a1 = ROUND_POWER_OF_TWO(out, 5);
  dc_u16 = vreinterpretq_u16_s16(vdupq_n_s16(a1));

  src = dst = dest;
  // Two groups of four rows; all four rows of a group are loaded before any
  // of them is written back.
  for (i = 0; i < 2; i++) {
    px0 = vld1_u8(src);
    src += dest_stride;
    px1 = vld1_u8(src);
    src += dest_stride;
    px2 = vld1_u8(src);
    src += dest_stride;
    px3 = vld1_u8(src);
    src += dest_stride;

    // Widen to 16 bits while adding the offset, then narrow back with
    // saturation to [0, 255].
    px0 = vqmovun_s16(vreinterpretq_s16_u16(vaddw_u8(dc_u16, px0)));
    px1 = vqmovun_s16(vreinterpretq_s16_u16(vaddw_u8(dc_u16, px1)));
    px2 = vqmovun_s16(vreinterpretq_s16_u16(vaddw_u8(dc_u16, px2)));
    px3 = vqmovun_s16(vreinterpretq_s16_u16(vaddw_u8(dc_u16, px3)));

    vst1_u8(dst, px0);
    dst += dest_stride;
    vst1_u8(dst, px1);
    dst += dest_stride;
    vst1_u8(dst, px2);
    dst += dest_stride;
    vst1_u8(dst, px3);
    dst += dest_stride;
  }
}

View File

@@ -1,509 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <arm_neon.h>
#include "./aom_config.h"
#include "aom_dsp/txfm_common.h"
// In-place transpose of an 8x8 block of int16 values held in eight vectors
// (*q8s16 is row 0 ... *q15s16 is row 7). On return vector k holds what was
// lane k of each of the eight inputs. The eight pointers are expected to
// refer to distinct vectors.
static INLINE void TRANSPOSE8X8(int16x8_t *q8s16, int16x8_t *q9s16,
                                int16x8_t *q10s16, int16x8_t *q11s16,
                                int16x8_t *q12s16, int16x8_t *q13s16,
                                int16x8_t *q14s16, int16x8_t *q15s16) {
  // Step 1: exchange 64-bit halves so that vector k pairs the low (k < 4) or
  // high (k >= 4) half of row k with the same half of row k + 4.
  const int16x8_t a0 =
      vcombine_s16(vget_low_s16(*q8s16), vget_low_s16(*q12s16));
  const int16x8_t a1 =
      vcombine_s16(vget_low_s16(*q9s16), vget_low_s16(*q13s16));
  const int16x8_t a2 =
      vcombine_s16(vget_low_s16(*q10s16), vget_low_s16(*q14s16));
  const int16x8_t a3 =
      vcombine_s16(vget_low_s16(*q11s16), vget_low_s16(*q15s16));
  const int16x8_t a4 =
      vcombine_s16(vget_high_s16(*q8s16), vget_high_s16(*q12s16));
  const int16x8_t a5 =
      vcombine_s16(vget_high_s16(*q9s16), vget_high_s16(*q13s16));
  const int16x8_t a6 =
      vcombine_s16(vget_high_s16(*q10s16), vget_high_s16(*q14s16));
  const int16x8_t a7 =
      vcombine_s16(vget_high_s16(*q11s16), vget_high_s16(*q15s16));

  // Step 2: interleave 32-bit pairs between vectors two apart.
  const int32x4x2_t b02 =
      vtrnq_s32(vreinterpretq_s32_s16(a0), vreinterpretq_s32_s16(a2));
  const int32x4x2_t b13 =
      vtrnq_s32(vreinterpretq_s32_s16(a1), vreinterpretq_s32_s16(a3));
  const int32x4x2_t b46 =
      vtrnq_s32(vreinterpretq_s32_s16(a4), vreinterpretq_s32_s16(a6));
  const int32x4x2_t b57 =
      vtrnq_s32(vreinterpretq_s32_s16(a5), vreinterpretq_s32_s16(a7));

  // Step 3: interleave single 16-bit lanes between neighbouring vectors.
  const int16x8x2_t c01 = vtrnq_s16(vreinterpretq_s16_s32(b02.val[0]),
                                    vreinterpretq_s16_s32(b13.val[0]));
  const int16x8x2_t c23 = vtrnq_s16(vreinterpretq_s16_s32(b02.val[1]),
                                    vreinterpretq_s16_s32(b13.val[1]));
  const int16x8x2_t c45 = vtrnq_s16(vreinterpretq_s16_s32(b46.val[0]),
                                    vreinterpretq_s16_s32(b57.val[0]));
  const int16x8x2_t c67 = vtrnq_s16(vreinterpretq_s16_s32(b46.val[1]),
                                    vreinterpretq_s16_s32(b57.val[1]));

  *q8s16 = c01.val[0];
  *q9s16 = c01.val[1];
  *q10s16 = c23.val[0];
  *q11s16 = c23.val[1];
  *q12s16 = c45.val[0];
  *q13s16 = c45.val[1];
  *q14s16 = c67.val[0];
  *q15s16 = c67.val[1];
}
// Returns round_shift14(a * ca - b * cb) for all eight lanes: the products are
// formed in 32 bits, then narrowed back to 16 bits with a rounding, saturating
// right shift by 14.
static INLINE int16x8_t idct8_mul_sub(int16x8_t a, int16x4_t ca, int16x8_t b,
                                      int16x4_t cb) {
  int32x4_t lo = vmull_s16(vget_low_s16(a), ca);
  int32x4_t hi = vmull_s16(vget_high_s16(a), ca);
  lo = vmlsl_s16(lo, vget_low_s16(b), cb);
  hi = vmlsl_s16(hi, vget_high_s16(b), cb);
  return vcombine_s16(vqrshrn_n_s32(lo, 14), vqrshrn_n_s32(hi, 14));
}

// Returns round_shift14(a * ca + b * cb) for all eight lanes; see
// idct8_mul_sub for the widening/narrowing scheme.
static INLINE int16x8_t idct8_mul_add(int16x8_t a, int16x4_t ca, int16x8_t b,
                                      int16x4_t cb) {
  int32x4_t lo = vmull_s16(vget_low_s16(a), ca);
  int32x4_t hi = vmull_s16(vget_high_s16(a), ca);
  lo = vmlal_s16(lo, vget_low_s16(b), cb);
  hi = vmlal_s16(hi, vget_high_s16(b), cb);
  return vcombine_s16(vqrshrn_n_s32(lo, 14), vqrshrn_n_s32(hi, 14));
}

// One 8-point inverse DCT applied lane-wise to eight vectors, in place.
// *q8s16 .. *q15s16 carry inputs 0..7 on entry and outputs 0..7 on return.
// The eight pointers are expected to refer to distinct vectors.
static INLINE void IDCT8x8_1D(int16x8_t *q8s16, int16x8_t *q9s16,
                              int16x8_t *q10s16, int16x8_t *q11s16,
                              int16x8_t *q12s16, int16x8_t *q13s16,
                              int16x8_t *q14s16, int16x8_t *q15s16) {
  const int16x4_t cos4 = vdup_n_s16((int16_t)cospi_4_64);
  const int16x4_t cos8 = vdup_n_s16((int16_t)cospi_8_64);
  const int16x4_t cos12 = vdup_n_s16((int16_t)cospi_12_64);
  const int16x4_t cos16 = vdup_n_s16((int16_t)cospi_16_64);
  const int16x4_t cos20 = vdup_n_s16((int16_t)cospi_20_64);
  const int16x4_t cos24 = vdup_n_s16((int16_t)cospi_24_64);
  const int16x4_t cos28 = vdup_n_s16((int16_t)cospi_28_64);

  const int16x8_t in0 = *q8s16;
  const int16x8_t in1 = *q9s16;
  const int16x8_t in2 = *q10s16;
  const int16x8_t in3 = *q11s16;
  const int16x8_t in4 = *q12s16;
  const int16x8_t in5 = *q13s16;
  const int16x8_t in6 = *q14s16;
  const int16x8_t in7 = *q15s16;

  // Odd half: rotations of the (1,7) and (5,3) input pairs.
  const int16x8_t rot4 = idct8_mul_sub(in1, cos28, in7, cos4);
  const int16x8_t rot5 = idct8_mul_sub(in5, cos12, in3, cos20);
  const int16x8_t rot6 = idct8_mul_add(in5, cos20, in3, cos12);
  const int16x8_t rot7 = idct8_mul_add(in1, cos4, in7, cos28);

  // Even half: rotations of the (0,4) and (2,6) input pairs, followed by
  // their add/sub combination.
  const int16x8_t even0 = idct8_mul_add(in0, cos16, in4, cos16);
  const int16x8_t even1 = idct8_mul_sub(in0, cos16, in4, cos16);
  const int16x8_t even2 = idct8_mul_sub(in2, cos24, in6, cos8);
  const int16x8_t even3 = idct8_mul_add(in2, cos8, in6, cos24);
  const int16x8_t e0 = vaddq_s16(even0, even3);
  const int16x8_t e1 = vaddq_s16(even1, even2);
  const int16x8_t e2 = vsubq_s16(even1, even2);
  const int16x8_t e3 = vsubq_s16(even0, even3);

  // Odd half: add/sub, then a cospi_16_64 rotation of the two differences.
  const int16x8_t o4 = vaddq_s16(rot4, rot5);
  const int16x8_t diff45 = vsubq_s16(rot4, rot5);
  const int16x8_t diff76 = vsubq_s16(rot7, rot6);
  const int16x8_t o7 = vaddq_s16(rot7, rot6);
  const int16x8_t o5 = idct8_mul_sub(diff76, cos16, diff45, cos16);
  const int16x8_t o6 = idct8_mul_add(diff76, cos16, diff45, cos16);

  // Final butterfly: output k = even k + odd (7 - k), output 7 - k mirrors it
  // with a subtraction.
  *q8s16 = vaddq_s16(e0, o7);
  *q9s16 = vaddq_s16(e1, o6);
  *q10s16 = vaddq_s16(e2, o5);
  *q11s16 = vaddq_s16(e3, o4);
  *q12s16 = vsubq_s16(e3, o4);
  *q13s16 = vsubq_s16(e2, o5);
  *q14s16 = vsubq_s16(e1, o6);
  *q15s16 = vsubq_s16(e0, o7);
}
// Adds four residual rows to four consecutive 8-pixel rows starting at dest,
// saturating each result to [0, 255]. All four pixel rows are loaded before
// any is written back. Returns the pointer to the row following the last one
// written.
static INLINE uint8_t *idct8x8_add_4rows(uint8_t *dest, int dest_stride,
                                         int16x8_t r0, int16x8_t r1,
                                         int16x8_t r2, int16x8_t r3) {
  uint8_t *const p0 = dest;
  uint8_t *const p1 = p0 + dest_stride;
  uint8_t *const p2 = p1 + dest_stride;
  uint8_t *const p3 = p2 + dest_stride;
  // Byte-typed loads: no uint64_t* cast, so no alignment beyond that of
  // uint8_t is implied for the destination rows.
  const uint8x8_t px0 = vld1_u8(p0);
  const uint8x8_t px1 = vld1_u8(p1);
  const uint8x8_t px2 = vld1_u8(p2);
  const uint8x8_t px3 = vld1_u8(p3);
  // Widen the pixels to 16 bits while adding the residual.
  const uint16x8_t s0 = vaddw_u8(vreinterpretq_u16_s16(r0), px0);
  const uint16x8_t s1 = vaddw_u8(vreinterpretq_u16_s16(r1), px1);
  const uint16x8_t s2 = vaddw_u8(vreinterpretq_u16_s16(r2), px2);
  const uint16x8_t s3 = vaddw_u8(vreinterpretq_u16_s16(r3), px3);

  vst1_u8(p0, vqmovun_s16(vreinterpretq_s16_u16(s0)));
  vst1_u8(p1, vqmovun_s16(vreinterpretq_s16_u16(s1)));
  vst1_u8(p2, vqmovun_s16(vreinterpretq_s16_u16(s2)));
  vst1_u8(p3, vqmovun_s16(vreinterpretq_s16_u16(s3)));
  return p3 + dest_stride;
}

// Full 8x8 inverse DCT: transforms the 64 coefficients in `input` with two
// transpose + 1-D passes, rounds the result by 5 bits and adds it, with
// unsigned saturation, to the 8x8 pixel block at `dest`.
//
//   input       - 64 coefficients, read as eight 8-lane vectors.
//   dest        - top-left pixel of the 8x8 destination block.
//   dest_stride - distance in bytes between consecutive destination rows.
void aom_idct8x8_64_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
  int16x8_t q8s16 = vld1q_s16(input);
  int16x8_t q9s16 = vld1q_s16(input + 8);
  int16x8_t q10s16 = vld1q_s16(input + 16);
  int16x8_t q11s16 = vld1q_s16(input + 24);
  int16x8_t q12s16 = vld1q_s16(input + 32);
  int16x8_t q13s16 = vld1q_s16(input + 40);
  int16x8_t q14s16 = vld1q_s16(input + 48);
  int16x8_t q15s16 = vld1q_s16(input + 56);

  // First pass.
  TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
               &q15s16);
  IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
             &q15s16);
  // Second pass.
  TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
               &q15s16);
  IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
             &q15s16);

  // Final rounding shift by 5 bits, then reconstruction of rows 0-3 followed
  // by rows 4-7.
  dest = idct8x8_add_4rows(dest, dest_stride, vrshrq_n_s16(q8s16, 5),
                           vrshrq_n_s16(q9s16, 5), vrshrq_n_s16(q10s16, 5),
                           vrshrq_n_s16(q11s16, 5));
  idct8x8_add_4rows(dest, dest_stride, vrshrq_n_s16(q12s16, 5),
                    vrshrq_n_s16(q13s16, 5), vrshrq_n_s16(q14s16, 5),
                    vrshrq_n_s16(q15s16, 5));
}
// Inverse 8x8 DCT of the coefficients at 'input', added to the 8x8 block of
// pixels at 'dest' (rows are 'dest_stride' bytes apart). Each sum is saturated
// to 8 bits before it is written back.
// After the initial transpose the first pass reads only q8..q11; q12..q15 are
// overwritten before they are read again.
// NOTE(review): the "_12" suffix presumably means the caller guarantees at
// most 12 non-zero coefficients, all in the top-left 4x4 -- confirm.
void aom_idct8x8_12_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
// d1 walks the rows being read from 'dest', d2 the rows being written.
uint8_t *d1, *d2;
uint8x8_t d0u8, d1u8, d2u8, d3u8;
int16x4_t d10s16, d11s16, d12s16, d13s16, d16s16;
int16x4_t d26s16, d27s16, d28s16, d29s16;
uint64x1_t d0u64, d1u64, d2u64, d3u64;
int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
uint16x8_t q8u16, q9u16, q10u16, q11u16;
int32x4_t q9s32, q10s32, q11s32, q12s32;
// Load all 64 coefficients, eight per vector.
q8s16 = vld1q_s16(input);
q9s16 = vld1q_s16(input + 8);
q10s16 = vld1q_s16(input + 16);
q11s16 = vld1q_s16(input + 24);
q12s16 = vld1q_s16(input + 32);
q13s16 = vld1q_s16(input + 40);
q14s16 = vld1q_s16(input + 48);
q15s16 = vld1q_s16(input + 56);
TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
&q15s16);
// First transform rows
// q0s16/q1s16 are reused as constant registers throughout this pass, so the
// statement order matters.
// NOTE(review): vqrdmulhq_s16 by a doubled cospi constant is presumably a
// rounded multiply with 14 fractional bits, matching the explicit shift by 14
// further down -- confirm.
// stage 1
q0s16 = vdupq_n_s16((int16_t)cospi_28_64 * 2);
q1s16 = vdupq_n_s16((int16_t)cospi_4_64 * 2);
q4s16 = vqrdmulhq_s16(q9s16, q0s16);
q0s16 = vdupq_n_s16(-(int16_t)cospi_20_64 * 2);
q7s16 = vqrdmulhq_s16(q9s16, q1s16);
q1s16 = vdupq_n_s16((int16_t)cospi_12_64 * 2);
q5s16 = vqrdmulhq_s16(q11s16, q0s16);
q0s16 = vdupq_n_s16((int16_t)cospi_16_64 * 2);
q6s16 = vqrdmulhq_s16(q11s16, q1s16);
// stage 2 & stage 3 - even half
q1s16 = vdupq_n_s16((int16_t)cospi_24_64 * 2);
q9s16 = vqrdmulhq_s16(q8s16, q0s16);
q0s16 = vdupq_n_s16((int16_t)cospi_8_64 * 2);
q13s16 = vqrdmulhq_s16(q10s16, q1s16);
q15s16 = vqrdmulhq_s16(q10s16, q0s16);
// stage 3 -odd half
// NOTE(review): the four adds/subs below combine q9, q13 and q15, which were
// produced under the "even half" heading above; the "odd half" label looks
// misplaced -- confirm against the C reference.
q0s16 = vaddq_s16(q9s16, q15s16);
q1s16 = vaddq_s16(q9s16, q13s16);
q2s16 = vsubq_s16(q9s16, q13s16);
q3s16 = vsubq_s16(q9s16, q15s16);
// stage 2 - odd half
q13s16 = vsubq_s16(q4s16, q5s16);
q4s16 = vaddq_s16(q4s16, q5s16);
q14s16 = vsubq_s16(q7s16, q6s16);
q7s16 = vaddq_s16(q7s16, q6s16);
d26s16 = vget_low_s16(q13s16);
d27s16 = vget_high_s16(q13s16);
d28s16 = vget_low_s16(q14s16);
d29s16 = vget_high_s16(q14s16);
// q5 = (q14 - q13) * cospi_16_64 and q6 = (q14 + q13) * cospi_16_64, computed
// in 32 bits and narrowed with a rounding, saturating shift right by 14.
d16s16 = vdup_n_s16((int16_t)cospi_16_64);
q9s32 = vmull_s16(d28s16, d16s16);
q10s32 = vmull_s16(d29s16, d16s16);
q11s32 = vmull_s16(d28s16, d16s16);
q12s32 = vmull_s16(d29s16, d16s16);
q9s32 = vmlsl_s16(q9s32, d26s16, d16s16);
q10s32 = vmlsl_s16(q10s32, d27s16, d16s16);
q11s32 = vmlal_s16(q11s32, d26s16, d16s16);
q12s32 = vmlal_s16(q12s32, d27s16, d16s16);
d10s16 = vqrshrn_n_s32(q9s32, 14);
d11s16 = vqrshrn_n_s32(q10s32, 14);
d12s16 = vqrshrn_n_s32(q11s32, 14);
d13s16 = vqrshrn_n_s32(q12s32, 14);
q5s16 = vcombine_s16(d10s16, d11s16);
q6s16 = vcombine_s16(d12s16, d13s16);
// stage 4
q8s16 = vaddq_s16(q0s16, q7s16);
q9s16 = vaddq_s16(q1s16, q6s16);
q10s16 = vaddq_s16(q2s16, q5s16);
q11s16 = vaddq_s16(q3s16, q4s16);
q12s16 = vsubq_s16(q3s16, q4s16);
q13s16 = vsubq_s16(q2s16, q5s16);
q14s16 = vsubq_s16(q1s16, q6s16);
q15s16 = vsubq_s16(q0s16, q7s16);
// Second pass: transpose again and run the 1-D transform helper on all eight
// vectors.
TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
&q15s16);
IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
&q15s16);
// Final rounding shift right by 5 on every output vector.
q8s16 = vrshrq_n_s16(q8s16, 5);
q9s16 = vrshrq_n_s16(q9s16, 5);
q10s16 = vrshrq_n_s16(q10s16, 5);
q11s16 = vrshrq_n_s16(q11s16, 5);
q12s16 = vrshrq_n_s16(q12s16, 5);
q13s16 = vrshrq_n_s16(q13s16, 5);
q14s16 = vrshrq_n_s16(q14s16, 5);
q15s16 = vrshrq_n_s16(q15s16, 5);
// Reconstruction, rows 0-3: load four destination rows, widen-add the
// residual (q8..q11), saturate to u8 and store back.
d1 = d2 = dest;
d0u64 = vld1_u64((uint64_t *)d1);
d1 += dest_stride;
d1u64 = vld1_u64((uint64_t *)d1);
d1 += dest_stride;
d2u64 = vld1_u64((uint64_t *)d1);
d1 += dest_stride;
d3u64 = vld1_u64((uint64_t *)d1);
d1 += dest_stride;
q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64));
q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64));
q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64));
q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64));
d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
d2 += dest_stride;
vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
d2 += dest_stride;
vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
d2 += dest_stride;
vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
d2 += dest_stride;
// Reconstruction, rows 4-7: move q12..q15 into q8..q11 and repeat the same
// load / add / saturate / store sequence.
q8s16 = q12s16;
q9s16 = q13s16;
q10s16 = q14s16;
q11s16 = q15s16;
d0u64 = vld1_u64((uint64_t *)d1);
d1 += dest_stride;
d1u64 = vld1_u64((uint64_t *)d1);
d1 += dest_stride;
d2u64 = vld1_u64((uint64_t *)d1);
d1 += dest_stride;
d3u64 = vld1_u64((uint64_t *)d1);
d1 += dest_stride;
q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64));
q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64));
q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64));
q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64));
d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
d2 += dest_stride;
vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
d2 += dest_stride;
vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
d2 += dest_stride;
vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
d2 += dest_stride;
return;
}

View File

@@ -1,819 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <arm_neon.h>
#include "./aom_config.h"
#include "./aom_dsp_rtcd.h"
#include "aom/aom_integer.h"
//------------------------------------------------------------------------------
// DC 4x4
// 'do_above' and 'do_left' facilitate branch removal when inlined.
static INLINE void dc_4x4(uint8_t *dst, ptrdiff_t stride, const uint8_t *above,
const uint8_t *left, int do_above, int do_left) {
uint16x8_t sum_top;
uint16x8_t sum_left;
uint8x8_t dc0;
if (do_above) {
const uint8x8_t A = vld1_u8(above); // top row
const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top
const uint16x4_t p1 = vpadd_u16(p0, p0);
sum_top = vcombine_u16(p1, p1);
}
if (do_left) {
const uint8x8_t L = vld1_u8(left); // left border
const uint16x4_t p0 = vpaddl_u8(L); // cascading summation of the left
const uint16x4_t p1 = vpadd_u16(p0, p0);
sum_left = vcombine_u16(p1, p1);
}
if (do_above && do_left) {
const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
dc0 = vrshrn_n_u16(sum, 3);
} else if (do_above) {
dc0 = vrshrn_n_u16(sum_top, 2);
} else if (do_left) {
dc0 = vrshrn_n_u16(sum_left, 2);
} else {
dc0 = vdup_n_u8(0x80);
}
{
const uint8x8_t dc = vdup_lane_u8(dc0, 0);
int i;
for (i = 0; i < 4; ++i) {
vst1_lane_u32((uint32_t *)(dst + i * stride), vreinterpret_u32_u8(dc), 0);
}
}
}
// 4x4 DC prediction using both the above row and the left column.
void aom_dc_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const int use_above = 1;
  const int use_left = 1;
  dc_4x4(dst, stride, above, left, use_above, use_left);
}
// 4x4 DC prediction from the left column only; 'above' is ignored.
void aom_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  const uint8_t *const no_above = NULL;
  (void)above;
  dc_4x4(dst, stride, no_above, left, 0, 1);
}
// 4x4 DC prediction from the above row only; 'left' is ignored.
void aom_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  const uint8_t *const no_left = NULL;
  (void)left;
  dc_4x4(dst, stride, above, no_left, 1, 0);
}
// 4x4 DC prediction with no edges: both edge pointers are ignored and the
// helper's no-edge fill value is used.
void aom_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  const uint8_t *const no_edge = NULL;
  (void)left;
  (void)above;
  dc_4x4(dst, stride, no_edge, no_edge, 0, 0);
}
//------------------------------------------------------------------------------
// DC 8x8
// 'do_above' and 'do_left' facilitate branch removal when inlined.
// Fills the 8x8 block at 'dst' (rows 'stride' bytes apart) with one DC value.
// The value is the rounded average of the eight pixels of each edge selected
// by 'do_above' / 'do_left'; with neither edge it is 0x80.
static INLINE void dc_8x8(uint8_t *dst, ptrdiff_t stride, const uint8_t *above,
                          const uint8_t *left, int do_above, int do_left) {
  uint8x8_t avg = vdup_n_u8(0x80);
  int i;

  if (do_above || do_left) {
    // Starts at zero so each enabled edge can simply be added in.
    uint16x8_t total = vdupq_n_u16(0);
    if (do_above) {
      const uint16x4_t p0 = vpaddl_u8(vld1_u8(above));
      const uint16x4_t p1 = vpadd_u16(p0, p0);
      const uint16x4_t p2 = vpadd_u16(p1, p1);
      total = vaddq_u16(total, vcombine_u16(p2, p2));
    }
    if (do_left) {
      const uint16x4_t p0 = vpaddl_u8(vld1_u8(left));
      const uint16x4_t p1 = vpadd_u16(p0, p0);
      const uint16x4_t p2 = vpadd_u16(p1, p1);
      total = vaddq_u16(total, vcombine_u16(p2, p2));
    }
    // Sixteen samples when both edges are used, eight otherwise.
    if (do_above && do_left) {
      avg = vrshrn_n_u16(total, 4);
    } else {
      avg = vrshrn_n_u16(total, 3);
    }
  }
  {
    const uint8x8_t dc = vdup_lane_u8(avg, 0);
    for (i = 0; i < 8; ++i) {
      // Byte-typed store: writes the same eight bytes as before, without
      // casting the uint8_t row pointer to uint32_t* (which asserted an
      // alignment the buffer does not promise).
      vst1_u8(dst + i * stride, dc);
    }
  }
}
// 8x8 DC prediction using both the above row and the left column.
void aom_dc_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const int use_above = 1;
  const int use_left = 1;
  dc_8x8(dst, stride, above, left, use_above, use_left);
}
// 8x8 DC prediction from the left column only; 'above' is ignored.
void aom_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  const uint8_t *const no_above = NULL;
  (void)above;
  dc_8x8(dst, stride, no_above, left, 0, 1);
}
// 8x8 DC prediction from the above row only; 'left' is ignored.
void aom_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  const uint8_t *const no_left = NULL;
  (void)left;
  dc_8x8(dst, stride, above, no_left, 1, 0);
}
// 8x8 DC prediction with no edges: both edge pointers are ignored and the
// helper's no-edge fill value is used.
void aom_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  const uint8_t *const no_edge = NULL;
  (void)left;
  (void)above;
  dc_8x8(dst, stride, no_edge, no_edge, 0, 0);
}
//------------------------------------------------------------------------------
// DC 16x16
// 'do_above' and 'do_left' facilitate branch removal when inlined.
// Fills the 16x16 block at 'dst' (rows 'stride' bytes apart) with one DC
// value: the rounded average of the sixteen pixels of each edge selected by
// 'do_above' / 'do_left', or 0x80 when neither edge is selected.
static INLINE void dc_16x16(uint8_t *dst, ptrdiff_t stride,
                            const uint8_t *above, const uint8_t *left,
                            int do_above, int do_left) {
  uint8x8_t avg = vdup_n_u8(0x80);
  uint8_t *out = dst;
  int rows = 16;

  if (do_above || do_left) {
    // Starts at zero so each enabled edge can simply be added in.
    uint16x8_t total = vdupq_n_u16(0);
    if (do_above) {
      const uint16x8_t pairs = vpaddlq_u8(vld1q_u8(above));
      const uint16x4_t s4 = vadd_u16(vget_low_u16(pairs), vget_high_u16(pairs));
      const uint16x4_t s2 = vpadd_u16(s4, s4);
      const uint16x4_t s1 = vpadd_u16(s2, s2);
      total = vaddq_u16(total, vcombine_u16(s1, s1));
    }
    if (do_left) {
      const uint16x8_t pairs = vpaddlq_u8(vld1q_u8(left));
      const uint16x4_t s4 = vadd_u16(vget_low_u16(pairs), vget_high_u16(pairs));
      const uint16x4_t s2 = vpadd_u16(s4, s4);
      const uint16x4_t s1 = vpadd_u16(s2, s2);
      total = vaddq_u16(total, vcombine_u16(s1, s1));
    }
    // Thirty-two samples when both edges are used, sixteen otherwise.
    if (do_above && do_left) {
      avg = vrshrn_n_u16(total, 5);
    } else {
      avg = vrshrn_n_u16(total, 4);
    }
  }
  {
    const uint8x16_t fill = vdupq_lane_u8(avg, 0);
    while (rows-- > 0) {
      vst1q_u8(out, fill);
      out += stride;
    }
  }
}
// 16x16 DC prediction using both the above row and the left column.
void aom_dc_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  const int use_above = 1;
  const int use_left = 1;
  dc_16x16(dst, stride, above, left, use_above, use_left);
}
// 16x16 DC prediction from the left column only; 'above' is ignored.
void aom_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  const uint8_t *const no_above = NULL;
  (void)above;
  dc_16x16(dst, stride, no_above, left, 0, 1);
}
// 16x16 DC prediction from the above row only; 'left' is ignored.
void aom_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const uint8_t *const no_left = NULL;
  (void)left;
  dc_16x16(dst, stride, above, no_left, 1, 0);
}
// 16x16 DC prediction with no edges: both edge pointers are ignored and the
// helper's no-edge fill value is used.
void aom_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const uint8_t *const no_edge = NULL;
  (void)left;
  (void)above;
  dc_16x16(dst, stride, no_edge, no_edge, 0, 0);
}
//------------------------------------------------------------------------------
// DC 32x32
// 'do_above' and 'do_left' facilitate branch removal when inlined.
// Fills the 32x32 block at 'dst' (rows 'stride' bytes apart) with one DC
// value: the rounded average of the thirty-two pixels of each edge selected
// by 'do_above' / 'do_left', or 0x80 when neither edge is selected.
static INLINE void dc_32x32(uint8_t *dst, ptrdiff_t stride,
                            const uint8_t *above, const uint8_t *left,
                            int do_above, int do_left) {
  uint8x8_t avg = vdup_n_u8(0x80);
  uint8_t *out = dst;
  int rows = 32;

  if (do_above || do_left) {
    // Starts at zero so each enabled edge can simply be added in.
    uint16x8_t total = vdupq_n_u16(0);
    if (do_above) {
      const uint16x8_t lo = vpaddlq_u8(vld1q_u8(above));
      const uint16x8_t hi = vpaddlq_u8(vld1q_u8(above + 16));
      const uint16x8_t s8 = vaddq_u16(lo, hi);
      const uint16x4_t s4 = vadd_u16(vget_low_u16(s8), vget_high_u16(s8));
      const uint16x4_t s2 = vpadd_u16(s4, s4);
      const uint16x4_t s1 = vpadd_u16(s2, s2);
      total = vaddq_u16(total, vcombine_u16(s1, s1));
    }
    if (do_left) {
      const uint16x8_t lo = vpaddlq_u8(vld1q_u8(left));
      const uint16x8_t hi = vpaddlq_u8(vld1q_u8(left + 16));
      const uint16x8_t s8 = vaddq_u16(lo, hi);
      const uint16x4_t s4 = vadd_u16(vget_low_u16(s8), vget_high_u16(s8));
      const uint16x4_t s2 = vpadd_u16(s4, s4);
      const uint16x4_t s1 = vpadd_u16(s2, s2);
      total = vaddq_u16(total, vcombine_u16(s1, s1));
    }
    // Sixty-four samples when both edges are used, thirty-two otherwise.
    if (do_above && do_left) {
      avg = vrshrn_n_u16(total, 6);
    } else {
      avg = vrshrn_n_u16(total, 5);
    }
  }
  {
    const uint8x16_t fill = vdupq_lane_u8(avg, 0);
    while (rows-- > 0) {
      vst1q_u8(out, fill);
      vst1q_u8(out + 16, fill);
      out += stride;
    }
  }
}
// 32x32 DC prediction using both the above row and the left column.
void aom_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  const int use_above = 1;
  const int use_left = 1;
  dc_32x32(dst, stride, above, left, use_above, use_left);
}
// 32x32 DC prediction from the left column only; 'above' is ignored.
void aom_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  const uint8_t *const no_above = NULL;
  (void)above;
  dc_32x32(dst, stride, no_above, left, 0, 1);
}
// 32x32 DC prediction from the above row only; 'left' is ignored.
void aom_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const uint8_t *const no_left = NULL;
  (void)left;
  dc_32x32(dst, stride, above, no_left, 1, 0);
}
// 32x32 DC prediction with no edges: both edge pointers are ignored and the
// helper's no-edge fill value is used.
void aom_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const uint8_t *const no_edge = NULL;
  (void)left;
  (void)above;
  dc_32x32(dst, stride, no_edge, no_edge, 0, 0);
}
// -----------------------------------------------------------------------------
// 4x4 D45 prediction. Reads eight pixels of 'above', smooths each position
// with its two right-hand neighbours (halving add of the outer pair, then a
// rounding halving add with the middle pixel), and writes row r as the four
// smoothed pixels starting at position r. The bottom-right pixel is then
// overwritten with above[7]. 'left' is unused.
void aom_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const uint8x8_t top = vld1_u8(above);
  const uint64x1_t top64 = vreinterpret_u64_u8(top);
  // Shifting the 64-bit view right by 8/16 bits moves the pixels down by one
  // or two lanes and fills the vacated top lanes with zero.
  const uint8x8_t top_p1 = vreinterpret_u8_u64(vshr_n_u64(top64, 8));
  const uint8x8_t top_p2 = vreinterpret_u8_u64(vshr_n_u64(top64, 16));
  const uint8x8_t smooth = vrhadd_u8(vhadd_u8(top, top_p2), top_p1);
  const uint64x1_t smooth64 = vreinterpret_u64_u8(smooth);
  const uint32x2_t row0 = vreinterpret_u32_u8(smooth);
  const uint32x2_t row1 = vreinterpret_u32_u64(vshr_n_u64(smooth64, 8));
  const uint32x2_t row2 = vreinterpret_u32_u64(vshr_n_u64(smooth64, 16));
  const uint32x2_t row3 = vreinterpret_u32_u64(vshr_n_u64(smooth64, 24));
  uint8_t *out = dst;
  (void)left;

  vst1_lane_u32((uint32_t *)out, row0, 0);
  out += stride;
  vst1_lane_u32((uint32_t *)out, row1, 0);
  out += stride;
  vst1_lane_u32((uint32_t *)out, row2, 0);
  out += stride;
  vst1_lane_u32((uint32_t *)out, row3, 0);
  // Last pixel of the last row is a plain copy of the final 'above' pixel.
  out[3] = above[7];
}
// 8x8 D45 prediction. Smooths the eight 'above' pixels with their two
// right-hand neighbours (the last pixel is repeated past the end), then
// writes eight rows, sliding the smoothed row one pixel left per row while
// repeating its last pixel. 'left' is unused.
void aom_d45_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  // Table-lookup indices: "advance by one / two lanes, clamp at lane 7".
  static const uint8_t kStep1[8] = { 1, 2, 3, 4, 5, 6, 7, 7 };
  static const uint8_t kStep2[8] = { 2, 3, 4, 5, 6, 7, 7, 7 };
  const uint8x8_t step1 = vld1_u8(kStep1);
  const uint8x8_t step2 = vld1_u8(kStep2);
  const uint8x8_t top = vld1_u8(above);
  const uint8x8_t top_p1 = vtbl1_u8(top, step1);
  const uint8x8_t top_p2 = vtbl1_u8(top, step2);
  uint8x8_t row = vrhadd_u8(vhadd_u8(top, top_p2), top_p1);
  uint8_t *out = dst;
  int remaining;
  (void)left;

  for (remaining = 7; remaining > 0; --remaining) {
    vst1_u8(out, row);
    out += stride;
    row = vtbl1_u8(row, step1);
  }
  // Eighth row: store only, no further shift needed.
  vst1_u8(out, row);
}
// 16x16 D45 prediction. Smooths the sixteen 'above' pixels with their two
// right-hand neighbours (above[15] is repeated past the end), then writes
// sixteen rows, sliding the smoothed row one pixel left per row and feeding
// above[15] in at the right. 'left' is unused.
void aom_d45_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                  const uint8_t *above, const uint8_t *left) {
  const uint8x16_t top = vld1q_u8(above);
  const uint8x16_t last = vld1q_dup_u8(above + 15);
  const uint8x16_t top_p1 = vextq_u8(top, last, 1);
  const uint8x16_t top_p2 = vextq_u8(top, last, 2);
  uint8x16_t row = vrhaddq_u8(vhaddq_u8(top, top_p2), top_p1);
  uint8_t *out = dst;
  int remaining;
  (void)left;

  for (remaining = 15; remaining > 0; --remaining) {
    vst1q_u8(out, row);
    out += stride;
    row = vextq_u8(row, last, 1);
  }
  // Sixteenth row: store only, no further shift needed.
  vst1q_u8(out, row);
}
// -----------------------------------------------------------------------------
// 4x4 D135 prediction. Builds one 8-pixel edge vector from the four 'left'
// pixels in reverse order followed by above[-1..2], smooths it with a
// three-tap filter (halving add of the outer pair, then a rounding halving
// add with the middle pixel), and writes row r as the four smoothed pixels
// starting at position 3 - r.
void aom_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  // Lanes 0..4 hold above[-1], above[0..3].
  const uint8x8_t top = vld1_u8(above - 1);
  // Move above[-1..2] into the upper four lanes.
  const uint64x1_t top_hi = vshl_n_u64(vreinterpret_u64_u8(top), 32);
  // left[0..3] in the lower four lanes, then byte-reversed within that word.
  const uint32x2_t left4 =
      vld1_lane_u32((const uint32_t *)left, vdup_n_u32(0), 0);
  const uint64x1_t left_rev =
      vreinterpret_u64_u8(vrev32_u8(vreinterpret_u8_u32(left4)));
  const uint64x1_t edge64 = vorr_u64(left_rev, top_hi);
  const uint8x8_t edge = vreinterpret_u8_u64(edge64);
  const uint8x8_t edge_p1 = vreinterpret_u8_u64(vshr_n_u64(edge64, 8));
  // Two-lane advance; lane 6 is patched with above[3], which the shift
  // cannot supply.
  const uint8x8_t edge_p2 = vset_lane_u8(
      vget_lane_u8(top, 4), vreinterpret_u8_u64(vshr_n_u64(edge64, 16)), 6);
  const uint8x8_t smooth = vrhadd_u8(vhadd_u8(edge_p2, edge), edge_p1);
  const uint64x1_t smooth64 = vreinterpret_u64_u8(smooth);
  const uint32x2_t row3 = vreinterpret_u32_u8(smooth);
  const uint32x2_t row2 = vreinterpret_u32_u64(vshr_n_u64(smooth64, 8));
  const uint32x2_t row1 = vreinterpret_u32_u64(vshr_n_u64(smooth64, 16));
  const uint32x2_t row0 = vreinterpret_u32_u64(vshr_n_u64(smooth64, 24));
  uint8_t *out = dst;

  vst1_lane_u32((uint32_t *)out, row0, 0);
  out += stride;
  vst1_lane_u32((uint32_t *)out, row1, 0);
  out += stride;
  vst1_lane_u32((uint32_t *)out, row2, 0);
  out += stride;
  vst1_lane_u32((uint32_t *)out, row3, 0);
}
#if !HAVE_NEON_ASM
void aom_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
int i;
uint32x2_t d0u32 = vdup_n_u32(0);
(void)left;
d0u32 = vld1_lane_u32((const uint32_t *)above, d0u32, 0);
for (i = 0; i < 4; i++, dst += stride)
vst1_lane_u32((uint32_t *)dst, d0u32, 0);
}
void aom_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
int i;
uint8x8_t d0u8 = vdup_n_u8(0);
(void)left;
d0u8 = vld1_u8(above);
for (i = 0; i < 8; i++, dst += stride) vst1_u8(dst, d0u8);
}
void aom_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
int i;
uint8x16_t q0u8 = vdupq_n_u8(0);
(void)left;
q0u8 = vld1q_u8(above);
for (i = 0; i < 16; i++, dst += stride) vst1q_u8(dst, q0u8);
}
void aom_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
int i;
uint8x16_t q0u8 = vdupq_n_u8(0);
uint8x16_t q1u8 = vdupq_n_u8(0);
(void)left;
q0u8 = vld1q_u8(above);
q1u8 = vld1q_u8(above + 16);
for (i = 0; i < 32; i++, dst += stride) {
vst1q_u8(dst, q0u8);
vst1q_u8(dst + 16, q1u8);
}
}
void aom_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
uint8x8_t d0u8 = vdup_n_u8(0);
uint32x2_t d1u32 = vdup_n_u32(0);
(void)above;
d1u32 = vld1_lane_u32((const uint32_t *)left, d1u32, 0);
d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 0);
vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
dst += stride;
d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 1);
vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
dst += stride;
d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 2);
vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
dst += stride;
d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 3);
vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
}
void aom_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
uint8x8_t d0u8 = vdup_n_u8(0);
uint64x1_t d1u64 = vdup_n_u64(0);
(void)above;
d1u64 = vld1_u64((const uint64_t *)left);
d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 0);
vst1_u8(dst, d0u8);
dst += stride;
d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 1);
vst1_u8(dst, d0u8);
dst += stride;
d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 2);
vst1_u8(dst, d0u8);
dst += stride;
d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 3);
vst1_u8(dst, d0u8);
dst += stride;
d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 4);
vst1_u8(dst, d0u8);
dst += stride;
d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 5);
vst1_u8(dst, d0u8);
dst += stride;
d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 6);
vst1_u8(dst, d0u8);
dst += stride;
d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 7);
vst1_u8(dst, d0u8);
}
void aom_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
int j;
uint8x8_t d2u8 = vdup_n_u8(0);
uint8x16_t q0u8 = vdupq_n_u8(0);
uint8x16_t q1u8 = vdupq_n_u8(0);
(void)above;
q1u8 = vld1q_u8(left);
d2u8 = vget_low_u8(q1u8);
for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) {
q0u8 = vdupq_lane_u8(d2u8, 0);
vst1q_u8(dst, q0u8);
dst += stride;
q0u8 = vdupq_lane_u8(d2u8, 1);
vst1q_u8(dst, q0u8);
dst += stride;
q0u8 = vdupq_lane_u8(d2u8, 2);
vst1q_u8(dst, q0u8);
dst += stride;
q0u8 = vdupq_lane_u8(d2u8, 3);
vst1q_u8(dst, q0u8);
dst += stride;
q0u8 = vdupq_lane_u8(d2u8, 4);
vst1q_u8(dst, q0u8);
dst += stride;
q0u8 = vdupq_lane_u8(d2u8, 5);
vst1q_u8(dst, q0u8);
dst += stride;
q0u8 = vdupq_lane_u8(d2u8, 6);
vst1q_u8(dst, q0u8);
dst += stride;
q0u8 = vdupq_lane_u8(d2u8, 7);
vst1q_u8(dst, q0u8);
dst += stride;
}
}
void aom_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
int j, k;
uint8x8_t d2u8 = vdup_n_u8(0);
uint8x16_t q0u8 = vdupq_n_u8(0);
uint8x16_t q1u8 = vdupq_n_u8(0);
(void)above;
for (k = 0; k < 2; k++, left += 16) {
q1u8 = vld1q_u8(left);
d2u8 = vget_low_u8(q1u8);
for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) {
q0u8 = vdupq_lane_u8(d2u8, 0);
vst1q_u8(dst, q0u8);
vst1q_u8(dst + 16, q0u8);
dst += stride;
q0u8 = vdupq_lane_u8(d2u8, 1);
vst1q_u8(dst, q0u8);
vst1q_u8(dst + 16, q0u8);
dst += stride;
q0u8 = vdupq_lane_u8(d2u8, 2);
vst1q_u8(dst, q0u8);
vst1q_u8(dst + 16, q0u8);
dst += stride;
q0u8 = vdupq_lane_u8(d2u8, 3);
vst1q_u8(dst, q0u8);
vst1q_u8(dst + 16, q0u8);
dst += stride;
q0u8 = vdupq_lane_u8(d2u8, 4);
vst1q_u8(dst, q0u8);
vst1q_u8(dst + 16, q0u8);
dst += stride;
q0u8 = vdupq_lane_u8(d2u8, 5);
vst1q_u8(dst, q0u8);
vst1q_u8(dst + 16, q0u8);
dst += stride;
q0u8 = vdupq_lane_u8(d2u8, 6);
vst1q_u8(dst, q0u8);
vst1q_u8(dst + 16, q0u8);
dst += stride;
q0u8 = vdupq_lane_u8(d2u8, 7);
vst1q_u8(dst, q0u8);
vst1q_u8(dst + 16, q0u8);
dst += stride;
}
}
}
void aom_tm_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
int i;
uint16x8_t q1u16, q3u16;
int16x8_t q1s16;
uint8x8_t d0u8 = vdup_n_u8(0);
uint32x2_t d2u32 = vdup_n_u32(0);
d0u8 = vld1_dup_u8(above - 1);
d2u32 = vld1_lane_u32((const uint32_t *)above, d2u32, 0);
q3u16 = vsubl_u8(vreinterpret_u8_u32(d2u32), d0u8);
for (i = 0; i < 4; i++, dst += stride) {
q1u16 = vdupq_n_u16((uint16_t)left[i]);
q1s16 =
vaddq_s16(vreinterpretq_s16_u16(q1u16), vreinterpretq_s16_u16(q3u16));
d0u8 = vqmovun_s16(q1s16);
vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
}
}
void aom_tm_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
int j;
uint16x8_t q0u16, q3u16, q10u16;
int16x8_t q0s16;
uint16x4_t d20u16;
uint8x8_t d0u8, d2u8, d30u8;
d0u8 = vld1_dup_u8(above - 1);
d30u8 = vld1_u8(left);
d2u8 = vld1_u8(above);
q10u16 = vmovl_u8(d30u8);
q3u16 = vsubl_u8(d2u8, d0u8);
d20u16 = vget_low_u16(q10u16);
for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) {
q0u16 = vdupq_lane_u16(d20u16, 0);
q0s16 =
vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16));
d0u8 = vqmovun_s16(q0s16);
vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
dst += stride;
q0u16 = vdupq_lane_u16(d20u16, 1);
q0s16 =
vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16));
d0u8 = vqmovun_s16(q0s16);
vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
dst += stride;
q0u16 = vdupq_lane_u16(d20u16, 2);
q0s16 =
vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16));
d0u8 = vqmovun_s16(q0s16);
vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
dst += stride;
q0u16 = vdupq_lane_u16(d20u16, 3);
q0s16 =
vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16));
d0u8 = vqmovun_s16(q0s16);
vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
dst += stride;
}
}
void aom_tm_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
int j, k;
uint16x8_t q0u16, q2u16, q3u16, q8u16, q10u16;
uint8x16_t q0u8, q1u8;
int16x8_t q0s16, q1s16, q8s16, q11s16;
uint16x4_t d20u16;
uint8x8_t d2u8, d3u8, d18u8, d22u8, d23u8;
q0u8 = vld1q_dup_u8(above - 1);
q1u8 = vld1q_u8(above);
q2u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8));
q3u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8));
for (k = 0; k < 2; k++, left += 8) {
d18u8 = vld1_u8(left);
q10u16 = vmovl_u8(d18u8);
d20u16 = vget_low_u16(q10u16);
for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) {
q0u16 = vdupq_lane_u16(d20u16, 0);
q8u16 = vdupq_lane_u16(d20u16, 1);
q1s16 =
vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q2u16));
q0s16 =
vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q3u16));
q11s16 =
vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q2u16));
q8s16 =
vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q3u16));
d2u8 = vqmovun_s16(q1s16);
d3u8 = vqmovun_s16(q0s16);
d22u8 = vqmovun_s16(q11s16);
d23u8 = vqmovun_s16(q8s16);
vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8));
vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8));
dst += stride;
vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8));
vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8));
dst += stride;
q0u16 = vdupq_lane_u16(d20u16, 2);
q8u16 = vdupq_lane_u16(d20u16, 3);
q1s16 =
vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q2u16));
q0s16 =
vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q3u16));
q11s16 =
vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q2u16));
q8s16 =
vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q3u16));
d2u8 = vqmovun_s16(q1s16);
d3u8 = vqmovun_s16(q0s16);
d22u8 = vqmovun_s16(q11s16);
d23u8 = vqmovun_s16(q8s16);
vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8));
vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8));
dst += stride;
vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8));
vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8));
dst += stride;
}
}
}
void aom_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
int j, k;
uint16x8_t q0u16, q3u16, q8u16, q9u16, q10u16, q11u16;
uint8x16_t q0u8, q1u8, q2u8;
int16x8_t q12s16, q13s16, q14s16, q15s16;
uint16x4_t d6u16;
uint8x8_t d0u8, d1u8, d2u8, d3u8, d26u8;
q0u8 = vld1q_dup_u8(above - 1);
q1u8 = vld1q_u8(above);
q2u8 = vld1q_u8(above + 16);
q8u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8));
q9u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8));
q10u16 = vsubl_u8(vget_low_u8(q2u8), vget_low_u8(q0u8));
q11u16 = vsubl_u8(vget_high_u8(q2u8), vget_high_u8(q0u8));
for (k = 0; k < 4; k++, left += 8) {
d26u8 = vld1_u8(left);
q3u16 = vmovl_u8(d26u8);
d6u16 = vget_low_u16(q3u16);
for (j = 0; j < 2; j++, d6u16 = vget_high_u16(q3u16)) {
q0u16 = vdupq_lane_u16(d6u16, 0);
q12s16 =
vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16));
q13s16 =
vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16));
q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
vreinterpretq_s16_u16(q10u16));
q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
vreinterpretq_s16_u16(q11u16));
d0u8 = vqmovun_s16(q12s16);
d1u8 = vqmovun_s16(q13s16);
d2u8 = vqmovun_s16(q14s16);
d3u8 = vqmovun_s16(q15s16);
q0u8 = vcombine_u8(d0u8, d1u8);
q1u8 = vcombine_u8(d2u8, d3u8);
vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
dst += stride;
q0u16 = vdupq_lane_u16(d6u16, 1);
q12s16 =
vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16));
q13s16 =
vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16));
q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
vreinterpretq_s16_u16(q10u16));
q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
vreinterpretq_s16_u16(q11u16));
d0u8 = vqmovun_s16(q12s16);
d1u8 = vqmovun_s16(q13s16);
d2u8 = vqmovun_s16(q14s16);
d3u8 = vqmovun_s16(q15s16);
q0u8 = vcombine_u8(d0u8, d1u8);
q1u8 = vcombine_u8(d2u8, d3u8);
vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
dst += stride;
q0u16 = vdupq_lane_u16(d6u16, 2);
q12s16 =
vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16));
q13s16 =
vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16));
q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
vreinterpretq_s16_u16(q10u16));
q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
vreinterpretq_s16_u16(q11u16));
d0u8 = vqmovun_s16(q12s16);
d1u8 = vqmovun_s16(q13s16);
d2u8 = vqmovun_s16(q14s16);
d3u8 = vqmovun_s16(q15s16);
q0u8 = vcombine_u8(d0u8, d1u8);
q1u8 = vcombine_u8(d2u8, d3u8);
vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
dst += stride;
q0u16 = vdupq_lane_u16(d6u16, 3);
q12s16 =
vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16));
q13s16 =
vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16));
q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
vreinterpretq_s16_u16(q10u16));
q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
vreinterpretq_s16_u16(q11u16));
d0u8 = vqmovun_s16(q12s16);
d1u8 = vqmovun_s16(q13s16);
d2u8 = vqmovun_s16(q14s16);
d3u8 = vqmovun_s16(q15s16);
q0u8 = vcombine_u8(d0u8, d1u8);
q1u8 = vcombine_u8(d2u8, d3u8);
vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
dst += stride;
}
}
}
#endif // !HAVE_NEON_ASM

View File

@@ -1,633 +0,0 @@
;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;
;
EXPORT |aom_v_predictor_4x4_neon|
EXPORT |aom_v_predictor_8x8_neon|
EXPORT |aom_v_predictor_16x16_neon|
EXPORT |aom_v_predictor_32x32_neon|
EXPORT |aom_h_predictor_4x4_neon|
EXPORT |aom_h_predictor_8x8_neon|
EXPORT |aom_h_predictor_16x16_neon|
EXPORT |aom_h_predictor_32x32_neon|
EXPORT |aom_tm_predictor_4x4_neon|
EXPORT |aom_tm_predictor_8x8_neon|
EXPORT |aom_tm_predictor_16x16_neon|
EXPORT |aom_tm_predictor_32x32_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; Vertical intra predictor, 4x4 block: the 4 bytes at above[0..3] are loaded
; once and written to each of the 4 rows of dst. left (r3) is not read.
;
;void aom_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride,
; const uint8_t *above,
; const uint8_t *left)
; r0 uint8_t *dst
; r1 ptrdiff_t y_stride
; r2 const uint8_t *above
; r3 const uint8_t *left
|aom_v_predictor_4x4_neon| PROC
vld1.32 {d0[0]}, [r2] ; d0[0] = above[0..3]
vst1.32 {d0[0]}, [r0], r1 ; store one row, then dst += y_stride
vst1.32 {d0[0]}, [r0], r1
vst1.32 {d0[0]}, [r0], r1
vst1.32 {d0[0]}, [r0], r1
bx lr
ENDP ; |aom_v_predictor_4x4_neon|
; Vertical intra predictor, 8x8 block: the 8 bytes at above[0..7] are loaded
; once and written to each of the 8 rows of dst. left (r3) is not read.
;
;void aom_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride,
; const uint8_t *above,
; const uint8_t *left)
; r0 uint8_t *dst
; r1 ptrdiff_t y_stride
; r2 const uint8_t *above
; r3 const uint8_t *left
|aom_v_predictor_8x8_neon| PROC
vld1.8 {d0}, [r2] ; d0 = above[0..7]
vst1.8 {d0}, [r0], r1 ; store one row, then dst += y_stride
vst1.8 {d0}, [r0], r1
vst1.8 {d0}, [r0], r1
vst1.8 {d0}, [r0], r1
vst1.8 {d0}, [r0], r1
vst1.8 {d0}, [r0], r1
vst1.8 {d0}, [r0], r1
vst1.8 {d0}, [r0], r1
bx lr
ENDP ; |aom_v_predictor_8x8_neon|
; Vertical intra predictor, 16x16 block: the 16 bytes at above[0..15] are
; loaded once and written to each of the 16 rows of dst. left (r3) is not read.
;
;void aom_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride,
; const uint8_t *above,
; const uint8_t *left)
; r0 uint8_t *dst
; r1 ptrdiff_t y_stride
; r2 const uint8_t *above
; r3 const uint8_t *left
|aom_v_predictor_16x16_neon| PROC
vld1.8 {q0}, [r2] ; q0 = above[0..15]
vst1.8 {q0}, [r0], r1 ; store one row, then dst += y_stride
vst1.8 {q0}, [r0], r1
vst1.8 {q0}, [r0], r1
vst1.8 {q0}, [r0], r1
vst1.8 {q0}, [r0], r1
vst1.8 {q0}, [r0], r1
vst1.8 {q0}, [r0], r1
vst1.8 {q0}, [r0], r1
vst1.8 {q0}, [r0], r1
vst1.8 {q0}, [r0], r1
vst1.8 {q0}, [r0], r1
vst1.8 {q0}, [r0], r1
vst1.8 {q0}, [r0], r1
vst1.8 {q0}, [r0], r1
vst1.8 {q0}, [r0], r1
vst1.8 {q0}, [r0], r1
bx lr
ENDP ; |aom_v_predictor_16x16_neon|
; Vertical intra predictor, 32x32 block: the 32 bytes at above[0..31] are
; loaded once and written to each of the 32 rows of dst. The store sequence is
; unrolled 16 times and executed twice. left (r3) is not read.
;
;void aom_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride,
; const uint8_t *above,
; const uint8_t *left)
; r0 uint8_t *dst
; r1 ptrdiff_t y_stride
; r2 const uint8_t *above
; r3 const uint8_t *left
|aom_v_predictor_32x32_neon| PROC
vld1.8 {q0, q1}, [r2] ; q0:q1 = above[0..31]
mov r2, #2 ; r2 is reused as the loop counter (2 x 16 rows)
loop_v
vst1.8 {q0, q1}, [r0], r1 ; store one row, then dst += y_stride
vst1.8 {q0, q1}, [r0], r1
vst1.8 {q0, q1}, [r0], r1
vst1.8 {q0, q1}, [r0], r1
vst1.8 {q0, q1}, [r0], r1
vst1.8 {q0, q1}, [r0], r1
vst1.8 {q0, q1}, [r0], r1
vst1.8 {q0, q1}, [r0], r1
vst1.8 {q0, q1}, [r0], r1
vst1.8 {q0, q1}, [r0], r1
vst1.8 {q0, q1}, [r0], r1
vst1.8 {q0, q1}, [r0], r1
vst1.8 {q0, q1}, [r0], r1
vst1.8 {q0, q1}, [r0], r1
vst1.8 {q0, q1}, [r0], r1
vst1.8 {q0, q1}, [r0], r1
subs r2, r2, #1
bgt loop_v
bx lr
ENDP ; |aom_v_predictor_32x32_neon|
; Horizontal intra predictor, 4x4 block: row r of dst is filled with the
; single byte left[r]. above (r2) is not read.
;
;void aom_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride,
; const uint8_t *above,
; const uint8_t *left)
; r0 uint8_t *dst
; r1 ptrdiff_t y_stride
; r2 const uint8_t *above
; r3 const uint8_t *left
|aom_h_predictor_4x4_neon| PROC
vld1.32 {d1[0]}, [r3] ; d1[0..3] = left[0..3]
vdup.8 d0, d1[0] ; broadcast left[0] to every byte of d0
vst1.32 {d0[0]}, [r0], r1 ; store 4 bytes, then dst += y_stride
vdup.8 d0, d1[1]
vst1.32 {d0[0]}, [r0], r1
vdup.8 d0, d1[2]
vst1.32 {d0[0]}, [r0], r1
vdup.8 d0, d1[3]
vst1.32 {d0[0]}, [r0], r1
bx lr
ENDP ; |aom_h_predictor_4x4_neon|
; Horizontal intra predictor, 8x8 block: row r of dst is filled with the
; single byte left[r]. above (r2) is not read.
;
;void aom_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride,
; const uint8_t *above,
; const uint8_t *left)
; r0 uint8_t *dst
; r1 ptrdiff_t y_stride
; r2 const uint8_t *above
; r3 const uint8_t *left
|aom_h_predictor_8x8_neon| PROC
vld1.64 {d1}, [r3] ; d1 = left[0..7]
vdup.8 d0, d1[0] ; broadcast left[0] to every byte of d0
vst1.64 {d0}, [r0], r1 ; store 8 bytes, then dst += y_stride
vdup.8 d0, d1[1]
vst1.64 {d0}, [r0], r1
vdup.8 d0, d1[2]
vst1.64 {d0}, [r0], r1
vdup.8 d0, d1[3]
vst1.64 {d0}, [r0], r1
vdup.8 d0, d1[4]
vst1.64 {d0}, [r0], r1
vdup.8 d0, d1[5]
vst1.64 {d0}, [r0], r1
vdup.8 d0, d1[6]
vst1.64 {d0}, [r0], r1
vdup.8 d0, d1[7]
vst1.64 {d0}, [r0], r1
bx lr
ENDP ; |aom_h_predictor_8x8_neon|
; Horizontal intra predictor, 16x16 block: row r of dst is filled with the
; single byte left[r]. above (r2) is not read.
;
;void aom_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride,
; const uint8_t *above,
; const uint8_t *left)
; r0 uint8_t *dst
; r1 ptrdiff_t y_stride
; r2 const uint8_t *above
; r3 const uint8_t *left
|aom_h_predictor_16x16_neon| PROC
vld1.8 {q1}, [r3] ; q1 (d2:d3) = left[0..15]
vdup.8 q0, d2[0] ; broadcast left[0] to every byte of q0
vst1.8 {q0}, [r0], r1 ; store 16 bytes, then dst += y_stride
vdup.8 q0, d2[1]
vst1.8 {q0}, [r0], r1
vdup.8 q0, d2[2]
vst1.8 {q0}, [r0], r1
vdup.8 q0, d2[3]
vst1.8 {q0}, [r0], r1
vdup.8 q0, d2[4]
vst1.8 {q0}, [r0], r1
vdup.8 q0, d2[5]
vst1.8 {q0}, [r0], r1
vdup.8 q0, d2[6]
vst1.8 {q0}, [r0], r1
vdup.8 q0, d2[7]
vst1.8 {q0}, [r0], r1
vdup.8 q0, d3[0] ; d3 holds left[8..15]
vst1.8 {q0}, [r0], r1
vdup.8 q0, d3[1]
vst1.8 {q0}, [r0], r1
vdup.8 q0, d3[2]
vst1.8 {q0}, [r0], r1
vdup.8 q0, d3[3]
vst1.8 {q0}, [r0], r1
vdup.8 q0, d3[4]
vst1.8 {q0}, [r0], r1
vdup.8 q0, d3[5]
vst1.8 {q0}, [r0], r1
vdup.8 q0, d3[6]
vst1.8 {q0}, [r0], r1
vdup.8 q0, d3[7]
vst1.8 {q0}, [r0], r1
bx lr
ENDP ; |aom_h_predictor_16x16_neon|
; Horizontal intra predictor, 32x32 block: row r of dst is filled with the
; single byte left[r]. Each row is written as two 16-byte stores; the loop
; body handles 16 rows and runs twice. above is not read (r2 is reused as the
; loop counter).
;
;void aom_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride,
; const uint8_t *above,
; const uint8_t *left)
; r0 uint8_t *dst
; r1 ptrdiff_t y_stride
; r2 const uint8_t *above
; r3 const uint8_t *left
|aom_h_predictor_32x32_neon| PROC
sub r1, r1, #16 ; first store of each row advances dst by 16, so the second adds stride - 16
mov r2, #2 ; loop counter: 2 x 16 rows
loop_h
vld1.8 {q1}, [r3]! ; q1 (d2:d3) = next 16 left pixels, left += 16
vdup.8 q0, d2[0] ; broadcast one left pixel to every byte of q0
vst1.8 {q0}, [r0]! ; columns 0..15, dst += 16
vst1.8 {q0}, [r0], r1 ; columns 16..31, dst += y_stride - 16
vdup.8 q0, d2[1]
vst1.8 {q0}, [r0]!
vst1.8 {q0}, [r0], r1
vdup.8 q0, d2[2]
vst1.8 {q0}, [r0]!
vst1.8 {q0}, [r0], r1
vdup.8 q0, d2[3]
vst1.8 {q0}, [r0]!
vst1.8 {q0}, [r0], r1
vdup.8 q0, d2[4]
vst1.8 {q0}, [r0]!
vst1.8 {q0}, [r0], r1
vdup.8 q0, d2[5]
vst1.8 {q0}, [r0]!
vst1.8 {q0}, [r0], r1
vdup.8 q0, d2[6]
vst1.8 {q0}, [r0]!
vst1.8 {q0}, [r0], r1
vdup.8 q0, d2[7]
vst1.8 {q0}, [r0]!
vst1.8 {q0}, [r0], r1
vdup.8 q0, d3[0]
vst1.8 {q0}, [r0]!
vst1.8 {q0}, [r0], r1
vdup.8 q0, d3[1]
vst1.8 {q0}, [r0]!
vst1.8 {q0}, [r0], r1
vdup.8 q0, d3[2]
vst1.8 {q0}, [r0]!
vst1.8 {q0}, [r0], r1
vdup.8 q0, d3[3]
vst1.8 {q0}, [r0]!
vst1.8 {q0}, [r0], r1
vdup.8 q0, d3[4]
vst1.8 {q0}, [r0]!
vst1.8 {q0}, [r0], r1
vdup.8 q0, d3[5]
vst1.8 {q0}, [r0]!
vst1.8 {q0}, [r0], r1
vdup.8 q0, d3[6]
vst1.8 {q0}, [r0]!
vst1.8 {q0}, [r0], r1
vdup.8 q0, d3[7]
vst1.8 {q0}, [r0]!
vst1.8 {q0}, [r0], r1
subs r2, r2, #1
bgt loop_h
bx lr
ENDP ; |aom_h_predictor_32x32_neon|
; TM intra predictor, 4x4 block. For every row r and column c it stores
; left[r] + (above[c] - above[-1]), computed in 16 bits and saturated to
; 0..255 by vqmovun.
;
;void aom_tm_predictor_4x4_neon (uint8_t *dst, ptrdiff_t y_stride,
; const uint8_t *above,
; const uint8_t *left)
; r0 uint8_t *dst
; r1 ptrdiff_t y_stride
; r2 const uint8_t *above
; r3 const uint8_t *left
|aom_tm_predictor_4x4_neon| PROC
; Load ytop_left = above[-1];
sub r12, r2, #1
vld1.u8 {d0[]}, [r12] ; broadcast above[-1] to all lanes of d0
; Load above 4 pixels
vld1.32 {d2[0]}, [r2]
; Compute above - ytop_left
vsubl.u8 q3, d2, d0 ; widening subtract: 16-bit differences in q3
; Load left row by row and compute left + (above - ytop_left)
; 1st row and 2nd row
vld1.u8 {d2[]}, [r3]! ; broadcast left[0], left++
vld1.u8 {d4[]}, [r3]! ; broadcast left[1], left++
vmovl.u8 q1, d2 ; widen to 16 bits
vmovl.u8 q2, d4
vadd.s16 q1, q1, q3
vadd.s16 q2, q2, q3
vqmovun.s16 d0, q1 ; saturate to unsigned 8 bits
vqmovun.s16 d1, q2
vst1.32 {d0[0]}, [r0], r1 ; store 4 pixels, then dst += y_stride
vst1.32 {d1[0]}, [r0], r1
; 3rd row and 4th row
vld1.u8 {d2[]}, [r3]!
vld1.u8 {d4[]}, [r3]
vmovl.u8 q1, d2
vmovl.u8 q2, d4
vadd.s16 q1, q1, q3
vadd.s16 q2, q2, q3
vqmovun.s16 d0, q1
vqmovun.s16 d1, q2
vst1.32 {d0[0]}, [r0], r1
vst1.32 {d1[0]}, [r0], r1
bx lr
ENDP ; |aom_tm_predictor_4x4_neon|
; TM intra predictor, 8x8 block. For every row r and column c it stores
; left[r] + (above[c] - above[-1]), computed in 16 bits and saturated to
; 0..255 by vqmovun. All 8 left pixels are loaded and widened up front.
;
;void aom_tm_predictor_8x8_neon (uint8_t *dst, ptrdiff_t y_stride,
; const uint8_t *above,
; const uint8_t *left)
; r0 uint8_t *dst
; r1 ptrdiff_t y_stride
; r2 const uint8_t *above
; r3 const uint8_t *left
|aom_tm_predictor_8x8_neon| PROC
; Load ytop_left = above[-1];
sub r12, r2, #1
vld1.8 {d0[]}, [r12] ; broadcast above[-1] to all lanes of d0
; preload 8 left
vld1.8 {d30}, [r3]
; Load above 8 pixels
vld1.64 {d2}, [r2]
vmovl.u8 q10, d30 ; q10 (d20:d21) = left[0..7] widened to 16 bits
; Compute above - ytop_left
vsubl.u8 q3, d2, d0
; Broadcast each left pixel and compute left + (above - ytop_left)
; 1st row and 2nd row
vdup.16 q0, d20[0]
vdup.16 q1, d20[1]
vadd.s16 q0, q3, q0
vadd.s16 q1, q3, q1
; 3rd row and 4th row
vdup.16 q8, d20[2]
vdup.16 q9, d20[3]
vadd.s16 q8, q3, q8
vadd.s16 q9, q3, q9
vqmovun.s16 d0, q0 ; saturate to unsigned 8 bits
vqmovun.s16 d1, q1
vqmovun.s16 d2, q8
vqmovun.s16 d3, q9
vst1.64 {d0}, [r0], r1 ; store 8 pixels, then dst += y_stride
vst1.64 {d1}, [r0], r1
vst1.64 {d2}, [r0], r1
vst1.64 {d3}, [r0], r1
; 5th row and 6th row
vdup.16 q0, d21[0]
vdup.16 q1, d21[1]
vadd.s16 q0, q3, q0
vadd.s16 q1, q3, q1
; 7th row and 8th row
vdup.16 q8, d21[2]
vdup.16 q9, d21[3]
vadd.s16 q8, q3, q8
vadd.s16 q9, q3, q9
vqmovun.s16 d0, q0
vqmovun.s16 d1, q1
vqmovun.s16 d2, q8
vqmovun.s16 d3, q9
vst1.64 {d0}, [r0], r1
vst1.64 {d1}, [r0], r1
vst1.64 {d2}, [r0], r1
vst1.64 {d3}, [r0], r1
bx lr
ENDP ; |aom_tm_predictor_8x8_neon|
; TM intra predictor, 16x16 block. For every row r and column c it stores
; left[r] + (above[c] - above[-1]), computed in 16 bits and saturated to
; 0..255 by vqmovun. The loop body produces 8 rows and runs twice.
;
; NOTE(review): every loop iteration ends by loading the next 8 left pixels,
; so the final iteration reads 8 bytes starting at left[16]. The loaded value
; is never used, but the read happens - confirm callers provide that slack.
;
;void aom_tm_predictor_16x16_neon (uint8_t *dst, ptrdiff_t y_stride,
; const uint8_t *above,
; const uint8_t *left)
; r0 uint8_t *dst
; r1 ptrdiff_t y_stride
; r2 const uint8_t *above
; r3 const uint8_t *left
|aom_tm_predictor_16x16_neon| PROC
; Load ytop_left = above[-1];
sub r12, r2, #1
vld1.8 {d0[]}, [r12] ; broadcast above[-1] to all lanes of d0
; Load above 16 pixels
vld1.8 {q1}, [r2]
; preload 8 left pixels into d18
vld1.8 {d18}, [r3]!
; Compute above - ytop_left
vsubl.u8 q2, d2, d0 ; columns 0..7
vsubl.u8 q3, d3, d0 ; columns 8..15
vmovl.u8 q10, d18 ; q10 (d20:d21) = 8 left pixels widened to 16 bits
; Load left row by row and compute left + (above - ytop_left)
; Process 8 rows in each single loop and loop 2 times to process 16 rows.
mov r2, #2 ; r2 is reused as the loop counter
loop_16x16_neon
; Process two rows.
vdup.16 q0, d20[0]
vdup.16 q8, d20[1]
vadd.s16 q1, q0, q2
vadd.s16 q0, q0, q3
vadd.s16 q11, q8, q2
vadd.s16 q8, q8, q3
vqmovun.s16 d2, q1
vqmovun.s16 d3, q0
vqmovun.s16 d22, q11
vqmovun.s16 d23, q8
vdup.16 q0, d20[2] ; preload next 2 rows data
vdup.16 q8, d20[3]
vst1.64 {d2,d3}, [r0], r1
vst1.64 {d22,d23}, [r0], r1
; Process two rows.
vadd.s16 q1, q0, q2
vadd.s16 q0, q0, q3
vadd.s16 q11, q8, q2
vadd.s16 q8, q8, q3
vqmovun.s16 d2, q1
vqmovun.s16 d3, q0
vqmovun.s16 d22, q11
vqmovun.s16 d23, q8
vdup.16 q0, d21[0] ; preload next 2 rows data
vdup.16 q8, d21[1]
vst1.64 {d2,d3}, [r0], r1
vst1.64 {d22,d23}, [r0], r1
; Process two rows.
vadd.s16 q1, q0, q2
vadd.s16 q0, q0, q3
vadd.s16 q11, q8, q2
vadd.s16 q8, q8, q3
vqmovun.s16 d2, q1
vqmovun.s16 d3, q0
vqmovun.s16 d22, q11
vqmovun.s16 d23, q8
vdup.16 q0, d21[2] ; preload next 2 rows data
vdup.16 q8, d21[3]
vst1.64 {d2,d3}, [r0], r1
vst1.64 {d22,d23}, [r0], r1
; Process two rows.
vadd.s16 q1, q0, q2
vadd.s16 q0, q0, q3
vadd.s16 q11, q8, q2
vadd.s16 q8, q8, q3
vqmovun.s16 d2, q1
vqmovun.s16 d3, q0
vqmovun.s16 d22, q11
vqmovun.s16 d23, q8
vld1.8 {d18}, [r3]! ; preload the next 8 left pixels into d18
vmovl.u8 q10, d18
vst1.64 {d2,d3}, [r0], r1
vst1.64 {d22,d23}, [r0], r1
subs r2, r2, #1
bgt loop_16x16_neon
bx lr
ENDP ; |aom_tm_predictor_16x16_neon|
; TM intra predictor, 32x32 block. For every row r and column c it stores
; left[r] + (above[c] - above[-1]), computed in 16 bits and saturated to
; 0..255 by vqmovun. The loop body produces 8 rows and runs four times.
;
; NOTE(review): every loop iteration ends by loading the next 8 left pixels,
; so the final iteration reads 8 bytes starting at left[32]. The loaded value
; is never used, but the read happens - confirm callers provide that slack.
;
;void aom_tm_predictor_32x32_neon (uint8_t *dst, ptrdiff_t y_stride,
; const uint8_t *above,
; const uint8_t *left)
; r0 uint8_t *dst
; r1 ptrdiff_t y_stride
; r2 const uint8_t *above
; r3 const uint8_t *left
|aom_tm_predictor_32x32_neon| PROC
; Load ytop_left = above[-1];
sub r12, r2, #1
vld1.8 {d0[]}, [r12] ; broadcast above[-1] to all lanes of d0
; Load above 32 pixels
vld1.8 {q1}, [r2]!
vld1.8 {q2}, [r2]
; preload 8 left pixels
vld1.8 {d26}, [r3]!
; Compute above - ytop_left
vsubl.u8 q8, d2, d0 ; columns 0..7
vsubl.u8 q9, d3, d0 ; columns 8..15
vsubl.u8 q10, d4, d0 ; columns 16..23
vsubl.u8 q11, d5, d0 ; columns 24..31
vmovl.u8 q3, d26 ; q3 (d6:d7) = 8 left pixels widened to 16 bits
; Load left row by row and compute left + (above - ytop_left)
; Process 8 rows in each single loop and loop 4 times to process 32 rows.
mov r2, #4 ; r2 is reused as the loop counter
loop_32x32_neon
; Process two rows.
vdup.16 q0, d6[0]
vdup.16 q2, d6[1]
vadd.s16 q12, q0, q8
vadd.s16 q13, q0, q9
vadd.s16 q14, q0, q10
vadd.s16 q15, q0, q11
vqmovun.s16 d0, q12
vqmovun.s16 d1, q13
vadd.s16 q12, q2, q8
vadd.s16 q13, q2, q9
vqmovun.s16 d2, q14
vqmovun.s16 d3, q15
vadd.s16 q14, q2, q10
vadd.s16 q15, q2, q11
vst1.64 {d0-d3}, [r0], r1 ; store 32 pixels, then dst += y_stride
vqmovun.s16 d24, q12
vqmovun.s16 d25, q13
vqmovun.s16 d26, q14
vqmovun.s16 d27, q15
vdup.16 q1, d6[2] ; broadcast the left pixels for the next two rows
vdup.16 q2, d6[3]
vst1.64 {d24-d27}, [r0], r1
; Process two rows.
vadd.s16 q12, q1, q8
vadd.s16 q13, q1, q9
vadd.s16 q14, q1, q10
vadd.s16 q15, q1, q11
vqmovun.s16 d0, q12
vqmovun.s16 d1, q13
vadd.s16 q12, q2, q8
vadd.s16 q13, q2, q9
vqmovun.s16 d2, q14
vqmovun.s16 d3, q15
vadd.s16 q14, q2, q10
vadd.s16 q15, q2, q11
vst1.64 {d0-d3}, [r0], r1
vqmovun.s16 d24, q12
vqmovun.s16 d25, q13
vqmovun.s16 d26, q14
vqmovun.s16 d27, q15
vdup.16 q0, d7[0] ; broadcast the left pixels for the next two rows
vdup.16 q2, d7[1]
vst1.64 {d24-d27}, [r0], r1
; Process two rows.
vadd.s16 q12, q0, q8
vadd.s16 q13, q0, q9
vadd.s16 q14, q0, q10
vadd.s16 q15, q0, q11
vqmovun.s16 d0, q12
vqmovun.s16 d1, q13
vadd.s16 q12, q2, q8
vadd.s16 q13, q2, q9
vqmovun.s16 d2, q14
vqmovun.s16 d3, q15
vadd.s16 q14, q2, q10
vadd.s16 q15, q2, q11
vst1.64 {d0-d3}, [r0], r1
vqmovun.s16 d24, q12
vqmovun.s16 d25, q13
vqmovun.s16 d26, q14
vqmovun.s16 d27, q15
vdup.16 q0, d7[2] ; broadcast the left pixels for the next two rows
vdup.16 q2, d7[3]
vst1.64 {d24-d27}, [r0], r1
; Process two rows.
vadd.s16 q12, q0, q8
vadd.s16 q13, q0, q9
vadd.s16 q14, q0, q10
vadd.s16 q15, q0, q11
vqmovun.s16 d0, q12
vqmovun.s16 d1, q13
vadd.s16 q12, q2, q8
vadd.s16 q13, q2, q9
vqmovun.s16 d2, q14
vqmovun.s16 d3, q15
vadd.s16 q14, q2, q10
vadd.s16 q15, q2, q11
vst1.64 {d0-d3}, [r0], r1
vqmovun.s16 d24, q12
vqmovun.s16 d25, q13
vld1.8 {d0}, [r3]! ; preload 8 left pixels
vqmovun.s16 d26, q14
vqmovun.s16 d27, q15
vmovl.u8 q3, d0 ; q3 = next 8 left pixels widened to 16 bits
vst1.64 {d24-d27}, [r0], r1
subs r2, r2, #1
bgt loop_32x32_neon
bx lr
ENDP ; |aom_tm_predictor_32x32_neon|
END

View File

@@ -1,174 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <arm_neon.h>
#include "./aom_dsp_rtcd.h"
#include "./aom_config.h"
#include "aom/aom_integer.h"
// 4-tap loop filter applied to 16 pixel positions at once (one per byte lane).
//
// Inputs are the eight samples straddling the edge (p3..p0 on one side,
// q0..q3 on the other) plus the three thresholds, each already replicated
// across the lanes it applies to. The filtered p1, p0, q0 and q1 samples are
// written through q5r, q6r, q7r and q8r; p3, p2, q2 and q3 only feed the mask.
//
// Per lane:
//   mask = every neighbouring difference <= limit, and
//          2 * |p0 - q0| + |p1 - q1| / 2 <= blimit
//   hev  = |p1 - p0| > thresh or |q1 - q0| > thresh
// Lanes with mask clear get a zero filter value.
static INLINE void loop_filter_neon_16(uint8x16_t qblimit,  // blimit
                                       uint8x16_t qlimit,   // limit
                                       uint8x16_t qthresh,  // thresh
                                       uint8x16_t q3,       // p3
                                       uint8x16_t q4,       // p2
                                       uint8x16_t q5,       // p1
                                       uint8x16_t q6,       // p0
                                       uint8x16_t q7,       // q0
                                       uint8x16_t q8,       // q1
                                       uint8x16_t q9,       // q2
                                       uint8x16_t q10,      // q3
                                       uint8x16_t *q5r,     // p1
                                       uint8x16_t *q6r,     // p0
                                       uint8x16_t *q7r,     // q0
                                       uint8x16_t *q8r) {   // q1
  const uint8x16_t sign_bit = vdupq_n_u8(0x80);
  const int8x16_t three_s8 = vdupq_n_s8(3);
  const int8x16_t four_s8 = vdupq_n_s8(4);
  const int16x8_t three_s16 = vdupq_n_s16(3);

  // Absolute differences between neighbouring samples.
  const uint8x16_t abs_p3p2 = vabdq_u8(q3, q4);
  const uint8x16_t abs_p2p1 = vabdq_u8(q4, q5);
  const uint8x16_t abs_p1p0 = vabdq_u8(q5, q6);
  const uint8x16_t abs_q1q0 = vabdq_u8(q8, q7);
  const uint8x16_t abs_q2q1 = vabdq_u8(q9, q8);
  const uint8x16_t abs_q3q2 = vabdq_u8(q10, q9);

  // Only the largest neighbouring difference is compared against limit.
  const uint8x16_t max_p = vmaxq_u8(abs_p3p2, abs_p2p1);
  const uint8x16_t max_inner = vmaxq_u8(abs_p1p0, abs_q1q0);
  const uint8x16_t max_q = vmaxq_u8(abs_q2q1, abs_q3q2);
  const uint8x16_t max_all = vmaxq_u8(vmaxq_u8(max_p, max_inner), max_q);
  const uint8x16_t within_limit = vcgeq_u8(qlimit, max_all);

  // Edge activity: 2 * |p0 - q0| + |p1 - q1| / 2, with saturating adds.
  const uint8x16_t abs_p0q0 = vabdq_u8(q6, q7);
  const uint8x16_t half_abs_p1q1 = vshrq_n_u8(vabdq_u8(q5, q8), 1);
  const uint8x16_t edge =
      vqaddq_u8(vqaddq_u8(abs_p0q0, abs_p0q0), half_abs_p1q1);
  const uint8x16_t within_blimit = vcgeq_u8(qblimit, edge);
  const uint8x16_t mask = vandq_u8(within_limit, within_blimit);

  // High edge variance flag.
  const uint8x16_t hev =
      vorrq_u8(vcgtq_u8(abs_p1p0, qthresh), vcgtq_u8(abs_q1q0, qthresh));

  // Flip the top bit so the samples can be treated as signed 8-bit values.
  const int8x16_t ps1 = vreinterpretq_s8_u8(veorq_u8(q5, sign_bit));
  const int8x16_t ps0 = vreinterpretq_s8_u8(veorq_u8(q6, sign_bit));
  const int8x16_t qs0 = vreinterpretq_s8_u8(veorq_u8(q7, sign_bit));
  const int8x16_t qs1 = vreinterpretq_s8_u8(veorq_u8(q8, sign_bit));

  // clamp(ps1 - qs1), kept only where hev is set.
  const int8x16_t outer_diff =
      vandq_s8(vqsubq_s8(ps1, qs1), vreinterpretq_s8_u8(hev));

  // 3 * (qs0 - ps0) + outer_diff, evaluated in 16 bits per half.
  const int16x8_t diff_lo = vsubl_s8(vget_low_s8(qs0), vget_low_s8(ps0));
  const int16x8_t diff_hi = vsubl_s8(vget_high_s8(qs0), vget_high_s8(ps0));
  const int16x8_t sum_lo =
      vaddw_s8(vmulq_s16(diff_lo, three_s16), vget_low_s8(outer_diff));
  const int16x8_t sum_hi =
      vaddw_s8(vmulq_s16(diff_hi, three_s16), vget_high_s8(outer_diff));

  // aom_filter = clamp(aom_filter + 3 * (qs0 - ps0)) & mask
  const int8x16_t filter =
      vandq_s8(vcombine_s8(vqmovn_s16(sum_lo), vqmovn_s16(sum_hi)),
               vreinterpretq_s8_u8(mask));

  const int8x16_t filter1 = vshrq_n_s8(vqaddq_s8(filter, four_s8), 3);
  const int8x16_t filter2 = vshrq_n_s8(vqaddq_s8(filter, three_s8), 3);

  // Outer taps get the rounded half of filter1, and only where hev is clear.
  const int8x16_t outer_adj =
      vbicq_s8(vrshrq_n_s8(filter1, 1), vreinterpretq_s8_u8(hev));

  const int8x16_t new_ps0 = vqaddq_s8(ps0, filter2);
  const int8x16_t new_qs0 = vqsubq_s8(qs0, filter1);
  const int8x16_t new_ps1 = vqaddq_s8(ps1, outer_adj);
  const int8x16_t new_qs1 = vqsubq_s8(qs1, outer_adj);

  // Flip the top bit again to return to unsigned samples.
  *q8r = veorq_u8(vreinterpretq_u8_s8(new_qs1), sign_bit);
  *q7r = veorq_u8(vreinterpretq_u8_s8(new_qs0), sign_bit);
  *q6r = veorq_u8(vreinterpretq_u8_s8(new_ps0), sign_bit);
  *q5r = veorq_u8(vreinterpretq_u8_s8(new_ps1), sign_bit);
}
// Applies the 4-tap loop filter across a horizontal edge, 16 pixels wide, as
// two adjacent 8-pixel halves with independent thresholds.
//
//   s        points at the first pixel of the q0 row (the row below the edge).
//   p        is the row pitch in bytes.
//   blimit0, limit0, thresh0  thresholds for pixels 0..7.
//   blimit1, limit1, thresh1  thresholds for pixels 8..15.
//
// Rows s - 4p .. s + 3p are read; rows s - 2p .. s + p (p1, p0, q0, q1) are
// rewritten.
//
// Each threshold is read as a single byte and broadcast, the same way the
// assembly version of the 4-tap filter loads it (vld1.8 {dN[]}). The earlier
// vld1_u8 form read 8 bytes from every threshold pointer.
void aom_lpf_horizontal_4_dual_neon(
    uint8_t *s, int p /* pitch */, const uint8_t *blimit0,
    const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1,
    const uint8_t *limit1, const uint8_t *thresh1) {
  const uint8x16_t qblimit =
      vcombine_u8(vld1_dup_u8(blimit0), vld1_dup_u8(blimit1));
  const uint8x16_t qlimit =
      vcombine_u8(vld1_dup_u8(limit0), vld1_dup_u8(limit1));
  const uint8x16_t qthresh =
      vcombine_u8(vld1_dup_u8(thresh0), vld1_dup_u8(thresh1));
  // Multiply rather than shift: left-shifting a negative pitch is undefined.
  uint8_t *const top = s - 4 * p;  // p3 row
  uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8, q10u8;

  q3u8 = vld1q_u8(top);           // p3
  q4u8 = vld1q_u8(top + p);       // p2
  q5u8 = vld1q_u8(top + 2 * p);   // p1
  q6u8 = vld1q_u8(top + 3 * p);   // p0
  q7u8 = vld1q_u8(top + 4 * p);   // q0
  q8u8 = vld1q_u8(top + 5 * p);   // q1
  q9u8 = vld1q_u8(top + 6 * p);   // q2
  q10u8 = vld1q_u8(top + 7 * p);  // q3

  loop_filter_neon_16(qblimit, qlimit, qthresh, q3u8, q4u8, q5u8, q6u8, q7u8,
                      q8u8, q9u8, q10u8, &q5u8, &q6u8, &q7u8, &q8u8);

  // Only the four rows nearest the edge are modified.
  vst1q_u8(s - 2 * p, q5u8);  // p1
  vst1q_u8(s - p, q6u8);      // p0
  vst1q_u8(s, q7u8);          // q0
  vst1q_u8(s + p, q8u8);      // q1
}

View File

@@ -1,252 +0,0 @@
;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;
;
EXPORT |aom_lpf_horizontal_4_neon|
EXPORT |aom_lpf_vertical_4_neon|
ARM
AREA ||.text||, CODE, READONLY, ALIGN=2
; Filters 8 pixel positions per call: every row of the edge is held in one
; 64-bit D register.
;
; Loads the 8 rows p3..q3 around a horizontal edge, calls
; aom_loop_filter_neon, and stores the four filtered rows p1, p0, q0, q1.
; The row loads and stores use the @64 alignment qualifier, so every row
; address must be 8-byte aligned.
;
; void aom_lpf_horizontal_4_neon(uint8_t *s,
; int p /* pitch */,
; const uint8_t *blimit,
; const uint8_t *limit,
; const uint8_t *thresh)
;
; r0 uint8_t *s,
; r1 int p, /* pitch */
; r2 const uint8_t *blimit,
; r3 const uint8_t *limit,
; sp const uint8_t *thresh,
|aom_lpf_horizontal_4_neon| PROC
push {lr}
vld1.8 {d0[]}, [r2] ; duplicate *blimit
ldr r2, [sp, #4] ; load thresh
add r1, r1, r1 ; double pitch
vld1.8 {d1[]}, [r3] ; duplicate *limit
vld1.8 {d2[]}, [r2] ; duplicate *thresh
sub r2, r0, r1, lsl #1 ; r2 = s - 4 * pitch (p3 row)
; NOTE(review): lsr is a logical shift, so this only recovers the original
; pitch when it is non-negative - confirm no caller passes a negative pitch.
add r3, r2, r1, lsr #1 ; r3 = r2 + pitch (p2 row)
vld1.u8 {d3}, [r2@64], r1 ; p3
vld1.u8 {d4}, [r3@64], r1 ; p2
vld1.u8 {d5}, [r2@64], r1 ; p1
vld1.u8 {d6}, [r3@64], r1 ; p0
vld1.u8 {d7}, [r2@64], r1 ; q0
vld1.u8 {d16}, [r3@64], r1 ; q1
vld1.u8 {d17}, [r2@64] ; q2
vld1.u8 {d18}, [r3@64] ; q3
sub r2, r2, r1, lsl #1 ; rewind r2 from the q2 row to the p1 row
sub r3, r3, r1, lsl #1 ; rewind r3 from the q3 row to the p0 row
bl aom_loop_filter_neon
vst1.u8 {d4}, [r2@64], r1 ; store op1
vst1.u8 {d5}, [r3@64], r1 ; store op0
vst1.u8 {d6}, [r2@64], r1 ; store oq0
vst1.u8 {d7}, [r3@64], r1 ; store oq1
pop {pc}
ENDP ; |aom_lpf_horizontal_4_neon|
; Filters 8 pixel positions (8 rows) per call across a vertical edge.
;
; Loads an 8x8 byte block starting 4 columns left of s, transposes it so each
; D register holds one column (p3..q3), calls aom_loop_filter_neon, and writes
; the four filtered columns p1, p0, q0, q1 back with interleaving lane stores.
;
; void aom_lpf_vertical_4_neon(uint8_t *s,
; int p /* pitch */,
; const uint8_t *blimit,
; const uint8_t *limit,
; const uint8_t *thresh)
;
; r0 uint8_t *s,
; r1 int p, /* pitch */
; r2 const uint8_t *blimit,
; r3 const uint8_t *limit,
; sp const uint8_t *thresh,
|aom_lpf_vertical_4_neon| PROC
push {lr}
vld1.8 {d0[]}, [r2] ; duplicate *blimit
vld1.8 {d1[]}, [r3] ; duplicate *limit
ldr r3, [sp, #4] ; load thresh
sub r2, r0, #4 ; r2 = s - 4 (start of the p3 column)
vld1.8 {d2[]}, [r3] ; duplicate *thresh
vld1.u8 {d3}, [r2], r1 ; load s data
vld1.u8 {d4}, [r2], r1
vld1.u8 {d5}, [r2], r1
vld1.u8 {d6}, [r2], r1
vld1.u8 {d7}, [r2], r1
vld1.u8 {d16}, [r2], r1
vld1.u8 {d17}, [r2], r1
vld1.u8 {d18}, [r2]
; transpose the 8x8 block: rows in d3..d18 become columns p3..q3
vtrn.32 d3, d7
vtrn.32 d4, d16
vtrn.32 d5, d17
vtrn.32 d6, d18
vtrn.16 d3, d5
vtrn.16 d4, d6
vtrn.16 d7, d17
vtrn.16 d16, d18
vtrn.8 d3, d4
vtrn.8 d5, d6
vtrn.8 d7, d16
vtrn.8 d17, d18
bl aom_loop_filter_neon
sub r0, r0, #2 ; r0 = s - 2 (position of p1 in each row)
;store op1, op0, oq0, oq1
vst4.8 {d4[0], d5[0], d6[0], d7[0]}, [r0], r1
vst4.8 {d4[1], d5[1], d6[1], d7[1]}, [r0], r1
vst4.8 {d4[2], d5[2], d6[2], d7[2]}, [r0], r1
vst4.8 {d4[3], d5[3], d6[3], d7[3]}, [r0], r1
vst4.8 {d4[4], d5[4], d6[4], d7[4]}, [r0], r1
vst4.8 {d4[5], d5[5], d6[5], d7[5]}, [r0], r1
vst4.8 {d4[6], d5[6], d6[6], d7[6]}, [r0], r1
vst4.8 {d4[7], d5[7], d6[7], d7[7]}, [r0]
pop {pc}
ENDP ; |aom_lpf_vertical_4_neon|
; void aom_loop_filter_neon();
; This is a helper function for the loopfilters. The individual functions do
; the necessary load, transpose (if necessary) and store. The function does not
; use registers d8-d15.
;
; It filters 8 pixel positions, one per byte lane. Lanes where the filter
; mask is clear get a zero filter value, so their samples come back unchanged.
;
; Inputs:
; r0-r3, r12 PRESERVE
; d0 blimit
; d1 limit
; d2 thresh
; d3 p3
; d4 p2
; d5 p1
; d6 p0
; d7 q0
; d16 q1
; d17 q2
; d18 q3
;
; Outputs:
; d4 op1
; d5 op0
; d6 oq0
; d7 oq1
|aom_loop_filter_neon| PROC
; filter_mask
vabd.u8 d19, d3, d4 ; m1 = abs(p3 - p2)
vabd.u8 d20, d4, d5 ; m2 = abs(p2 - p1)
vabd.u8 d21, d5, d6 ; m3 = abs(p1 - p0)
vabd.u8 d22, d16, d7 ; m4 = abs(q1 - q0)
vabd.u8 d3, d17, d16 ; m5 = abs(q2 - q1)
vabd.u8 d4, d18, d17 ; m6 = abs(q3 - q2)
; only compare the largest value to limit
vmax.u8 d19, d19, d20 ; m1 = max(m1, m2)
vmax.u8 d20, d21, d22 ; m2 = max(m3, m4)
vabd.u8 d17, d6, d7 ; abs(p0 - q0)
vmax.u8 d3, d3, d4 ; m3 = max(m5, m6)
vmov.u8 d18, #0x80 ; sign-bit constant used for the u8 <-> s8 flips
vmax.u8 d23, d19, d20 ; m1 = max(m1, m2)
; hevmask
vcgt.u8 d21, d21, d2 ; (abs(p1 - p0) > thresh)*-1
vcgt.u8 d22, d22, d2 ; (abs(q1 - q0) > thresh)*-1
vmax.u8 d23, d23, d3 ; m1 = max(m1, m3)
vabd.u8 d28, d5, d16 ; a = abs(p1 - q1)
vqadd.u8 d17, d17, d17 ; b = abs(p0 - q0) * 2
veor d7, d7, d18 ; qs0
vcge.u8 d23, d1, d23 ; (limit >= m1)*-1
; filter() function
; convert to signed
vshr.u8 d28, d28, #1 ; a = a / 2
veor d6, d6, d18 ; ps0
veor d5, d5, d18 ; ps1
vqadd.u8 d17, d17, d28 ; a = b + a
veor d16, d16, d18 ; qs1
vmov.u8 d19, #3
vsub.s8 d28, d7, d6 ; ( qs0 - ps0)
vcge.u8 d17, d0, d17 ; (blimit >= a)*-1
vqsub.s8 d27, d5, d16 ; filter = clamp(ps1-qs1)
vorr d22, d21, d22 ; hevmask
vmull.s8 q12, d28, d19 ; 3 * ( qs0 - ps0)
vand d27, d27, d22 ; filter &= hev
vand d23, d23, d17 ; filter_mask
vaddw.s8 q12, q12, d27 ; filter + 3 * (qs0 - ps0)
vmov.u8 d17, #4
; filter = clamp(filter + 3 * ( qs0 - ps0))
vqmovn.s16 d27, q12
vand d27, d27, d23 ; filter &= mask
vqadd.s8 d28, d27, d19 ; filter2 = clamp(filter+3)
vqadd.s8 d27, d27, d17 ; filter1 = clamp(filter+4)
vshr.s8 d28, d28, #3 ; filter2 >>= 3
vshr.s8 d27, d27, #3 ; filter1 >>= 3
vqadd.s8 d19, d6, d28 ; u = clamp(ps0 + filter2)
vqsub.s8 d26, d7, d27 ; u = clamp(qs0 - filter1)
; outer tap adjustments
vrshr.s8 d27, d27, #1 ; filter = ++filter1 >> 1
veor d6, d26, d18 ; *oq0 = u^0x80
vbic d27, d27, d22 ; filter &= ~hev
vqadd.s8 d21, d5, d27 ; u = clamp(ps1 + filter)
vqsub.s8 d20, d16, d27 ; u = clamp(qs1 - filter)
veor d5, d19, d18 ; *op0 = u^0x80
veor d4, d21, d18 ; *op1 = u^0x80
veor d7, d20, d18 ; *oq1 = u^0x80
bx lr
ENDP ; |aom_loop_filter_neon|
END

View File

@@ -1,250 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <arm_neon.h>
#include "./aom_dsp_rtcd.h"
// 4-tap loop filter applied to 8 pixel positions at once (one per byte lane).
//
// Inputs are the eight samples straddling the edge (p3..p0 on one side,
// q0..q3 on the other) plus the three thresholds, each replicated across all
// lanes. The filtered p1, p0, q0 and q1 samples are written through d4ru8,
// d5ru8, d6ru8 and d7ru8; p3, p2, q2 and q3 only feed the mask.
//
// Per lane:
//   mask = every neighbouring difference <= limit, and
//          2 * |p0 - q0| + |p1 - q1| / 2 <= blimit
//   hev  = |p1 - p0| > thresh or |q1 - q0| > thresh
// Lanes with mask clear get a zero filter value.
static INLINE void loop_filter_neon(uint8x8_t dblimit,   // flimit
                                    uint8x8_t dlimit,    // limit
                                    uint8x8_t dthresh,   // thresh
                                    uint8x8_t d3u8,      // p3
                                    uint8x8_t d4u8,      // p2
                                    uint8x8_t d5u8,      // p1
                                    uint8x8_t d6u8,      // p0
                                    uint8x8_t d7u8,      // q0
                                    uint8x8_t d16u8,     // q1
                                    uint8x8_t d17u8,     // q2
                                    uint8x8_t d18u8,     // q3
                                    uint8x8_t *d4ru8,    // p1
                                    uint8x8_t *d5ru8,    // p0
                                    uint8x8_t *d6ru8,    // q0
                                    uint8x8_t *d7ru8) {  // q1
  const uint8x8_t sign_bit = vdup_n_u8(0x80);
  const int8x8_t three = vdup_n_s8(3);
  const int8x8_t four = vdup_n_s8(4);

  // Absolute differences between neighbouring samples.
  const uint8x8_t abs_p3p2 = vabd_u8(d3u8, d4u8);
  const uint8x8_t abs_p2p1 = vabd_u8(d4u8, d5u8);
  const uint8x8_t abs_p1p0 = vabd_u8(d5u8, d6u8);
  const uint8x8_t abs_q1q0 = vabd_u8(d16u8, d7u8);
  const uint8x8_t abs_q2q1 = vabd_u8(d17u8, d16u8);
  const uint8x8_t abs_q3q2 = vabd_u8(d18u8, d17u8);

  // Only the largest neighbouring difference is compared against limit.
  const uint8x8_t max_p = vmax_u8(abs_p3p2, abs_p2p1);
  const uint8x8_t max_inner = vmax_u8(abs_p1p0, abs_q1q0);
  const uint8x8_t max_q = vmax_u8(abs_q2q1, abs_q3q2);
  const uint8x8_t max_all = vmax_u8(vmax_u8(max_p, max_inner), max_q);
  const uint8x8_t within_limit = vcge_u8(dlimit, max_all);

  // Edge activity: 2 * |p0 - q0| + |p1 - q1| / 2, with saturating adds.
  const uint8x8_t abs_p0q0 = vabd_u8(d6u8, d7u8);
  const uint8x8_t half_abs_p1q1 = vshr_n_u8(vabd_u8(d5u8, d16u8), 1);
  const uint8x8_t edge = vqadd_u8(vqadd_u8(abs_p0q0, abs_p0q0), half_abs_p1q1);
  const uint8x8_t within_blimit = vcge_u8(dblimit, edge);
  const uint8x8_t mask = vand_u8(within_limit, within_blimit);

  // High edge variance flag.
  const uint8x8_t hev =
      vorr_u8(vcgt_u8(abs_p1p0, dthresh), vcgt_u8(abs_q1q0, dthresh));

  // Flip the top bit so the samples can be treated as signed 8-bit values.
  const int8x8_t ps1 = vreinterpret_s8_u8(veor_u8(d5u8, sign_bit));
  const int8x8_t ps0 = vreinterpret_s8_u8(veor_u8(d6u8, sign_bit));
  const int8x8_t qs0 = vreinterpret_s8_u8(veor_u8(d7u8, sign_bit));
  const int8x8_t qs1 = vreinterpret_s8_u8(veor_u8(d16u8, sign_bit));

  // clamp(ps1 - qs1), kept only where hev is set.
  const int8x8_t outer_diff =
      vand_s8(vqsub_s8(ps1, qs1), vreinterpret_s8_u8(hev));

  // 3 * (qs0 - ps0) + outer_diff: the difference is taken in 8 bits, then
  // widened by the multiply so the sum is evaluated in 16 bits.
  const int16x8_t triple_diff = vmull_s8(vsub_s8(qs0, ps0), three);
  const int16x8_t wide_sum = vaddw_s8(triple_diff, outer_diff);

  // filter = clamp(filter + 3 * (qs0 - ps0)) & mask
  const int8x8_t filter =
      vand_s8(vqmovn_s16(wide_sum), vreinterpret_s8_u8(mask));

  const int8x8_t filter1 = vshr_n_s8(vqadd_s8(filter, four), 3);
  const int8x8_t filter2 = vshr_n_s8(vqadd_s8(filter, three), 3);

  // Outer taps get the rounded half of filter1, and only where hev is clear.
  const int8x8_t outer_adj =
      vbic_s8(vrshr_n_s8(filter1, 1), vreinterpret_s8_u8(hev));

  const int8x8_t new_ps0 = vqadd_s8(ps0, filter2);
  const int8x8_t new_qs0 = vqsub_s8(qs0, filter1);
  const int8x8_t new_ps1 = vqadd_s8(ps1, outer_adj);
  const int8x8_t new_qs1 = vqsub_s8(qs1, outer_adj);

  // Flip the top bit again to return to unsigned samples.
  *d4ru8 = veor_u8(vreinterpret_u8_s8(new_ps1), sign_bit);
  *d5ru8 = veor_u8(vreinterpret_u8_s8(new_ps0), sign_bit);
  *d6ru8 = veor_u8(vreinterpret_u8_s8(new_qs0), sign_bit);
  *d7ru8 = veor_u8(vreinterpret_u8_s8(new_qs1), sign_bit);
}
// Applies the 4-tap loop filter across a horizontal edge, 8 pixels wide.
//
//   src     points at the first pixel of the q0 row (the row below the edge).
//   pitch   is the row pitch in bytes.
//   blimit, limit, thresh  each point at a single threshold byte.
//
// Rows src - 4 * pitch .. src + 3 * pitch are read; rows src - 2 * pitch ..
// src + pitch (p1, p0, q0, q1) are rewritten.
//
// Each threshold is read as a single byte and broadcast, the same way the
// assembly version of this function loads it (vld1.8 {dN[]}). The earlier
// vld1_u8 form read 8 bytes from every threshold pointer.
void aom_lpf_horizontal_4_neon(uint8_t *src, int pitch, const uint8_t *blimit,
                               const uint8_t *limit, const uint8_t *thresh) {
  const uint8x8_t dblimit = vld1_dup_u8(blimit);
  const uint8x8_t dlimit = vld1_dup_u8(limit);
  const uint8x8_t dthresh = vld1_dup_u8(thresh);
  // Multiply rather than shift: left-shifting a negative pitch is undefined.
  uint8_t *const top = src - 4 * pitch;  // p3 row
  uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8;

  d3u8 = vld1_u8(top);               // p3
  d4u8 = vld1_u8(top + pitch);       // p2
  d5u8 = vld1_u8(top + 2 * pitch);   // p1
  d6u8 = vld1_u8(top + 3 * pitch);   // p0
  d7u8 = vld1_u8(top + 4 * pitch);   // q0
  d16u8 = vld1_u8(top + 5 * pitch);  // q1
  d17u8 = vld1_u8(top + 6 * pitch);  // q2
  d18u8 = vld1_u8(top + 7 * pitch);  // q3

  loop_filter_neon(dblimit, dlimit, dthresh, d3u8, d4u8, d5u8, d6u8, d7u8,
                   d16u8, d17u8, d18u8, &d4u8, &d5u8, &d6u8, &d7u8);

  // Only the four rows nearest the edge are modified.
  vst1_u8(src - 2 * pitch, d4u8);  // p1
  vst1_u8(src - pitch, d5u8);      // p0
  vst1_u8(src, d6u8);              // q0
  vst1_u8(src + pitch, d7u8);      // q1
}
// Applies the 4-tap loop filter across a vertical edge, 8 rows tall.
//
//   src     points at the q0 pixel of the first row (the first pixel to the
//           right of the edge).
//   pitch   is the row pitch in bytes.
//   blimit, limit, thresh  each point at a single threshold byte.
//
// For each of the 8 rows, bytes src - 4 .. src + 3 are read and bytes
// src - 2 .. src + 1 (p1, p0, q0, q1) are rewritten.
//
// Each threshold is read as a single byte and broadcast, the same way the
// assembly version of this function loads it (vld1.8 {dN[]}). The earlier
// vld1_u8 form read 8 bytes from every threshold pointer.
void aom_lpf_vertical_4_neon(uint8_t *src, int pitch, const uint8_t *blimit,
                             const uint8_t *limit, const uint8_t *thresh) {
  const uint8x8_t dblimit = vld1_dup_u8(blimit);
  const uint8x8_t dlimit = vld1_dup_u8(limit);
  const uint8x8_t dthresh = vld1_dup_u8(thresh);
  const uint8_t *in = src - 4;  // start of the p3 column
  uint8_t *out = src - 2;       // start of the p1 column
  uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8;
  uint32x2x2_t d2tmp0, d2tmp1, d2tmp2, d2tmp3;
  uint16x4x2_t d2tmp4, d2tmp5, d2tmp6, d2tmp7;
  uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11;
  uint8x8x4_t d4Result;

  // Load 8 rows of 8 pixels straddling the edge.
  d3u8 = vld1_u8(in);
  in += pitch;
  d4u8 = vld1_u8(in);
  in += pitch;
  d5u8 = vld1_u8(in);
  in += pitch;
  d6u8 = vld1_u8(in);
  in += pitch;
  d7u8 = vld1_u8(in);
  in += pitch;
  d16u8 = vld1_u8(in);
  in += pitch;
  d17u8 = vld1_u8(in);
  in += pitch;
  d18u8 = vld1_u8(in);

  // Transpose the 8x8 block (32-bit, then 16-bit, then 8-bit swaps) so that
  // each vector holds one column: p3, p2, p1, p0, q0, q1, q2, q3.
  d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8), vreinterpret_u32_u8(d7u8));
  d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8), vreinterpret_u32_u8(d16u8));
  d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8), vreinterpret_u32_u8(d17u8));
  d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8), vreinterpret_u32_u8(d18u8));
  d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]),
                    vreinterpret_u16_u32(d2tmp2.val[0]));
  d2tmp5 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[0]),
                    vreinterpret_u16_u32(d2tmp3.val[0]));
  d2tmp6 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[1]),
                    vreinterpret_u16_u32(d2tmp2.val[1]));
  d2tmp7 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[1]),
                    vreinterpret_u16_u32(d2tmp3.val[1]));
  d2tmp8 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[0]),
                   vreinterpret_u8_u16(d2tmp5.val[0]));
  d2tmp9 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[1]),
                   vreinterpret_u8_u16(d2tmp5.val[1]));
  d2tmp10 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[0]),
                    vreinterpret_u8_u16(d2tmp7.val[0]));
  d2tmp11 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[1]),
                    vreinterpret_u8_u16(d2tmp7.val[1]));
  d3u8 = d2tmp8.val[0];
  d4u8 = d2tmp8.val[1];
  d5u8 = d2tmp9.val[0];
  d6u8 = d2tmp9.val[1];
  d7u8 = d2tmp10.val[0];
  d16u8 = d2tmp10.val[1];
  d17u8 = d2tmp11.val[0];
  d18u8 = d2tmp11.val[1];

  loop_filter_neon(dblimit, dlimit, dthresh, d3u8, d4u8, d5u8, d6u8, d7u8,
                   d16u8, d17u8, d18u8, &d4u8, &d5u8, &d6u8, &d7u8);

  // vst4_lane interleaves lane k of the four vectors, writing the 4 bytes
  // p1, p0, q0, q1 of row k.
  d4Result.val[0] = d4u8;
  d4Result.val[1] = d5u8;
  d4Result.val[2] = d6u8;
  d4Result.val[3] = d7u8;
  vst4_lane_u8(out, d4Result, 0);
  out += pitch;
  vst4_lane_u8(out, d4Result, 1);
  out += pitch;
  vst4_lane_u8(out, d4Result, 2);
  out += pitch;
  vst4_lane_u8(out, d4Result, 3);
  out += pitch;
  vst4_lane_u8(out, d4Result, 4);
  out += pitch;
  vst4_lane_u8(out, d4Result, 5);
  out += pitch;
  vst4_lane_u8(out, d4Result, 6);
  out += pitch;
  vst4_lane_u8(out, d4Result, 7);
}

View File

@@ -1,430 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <arm_neon.h>
#include "./aom_dsp_rtcd.h"
// Core of the 8-tap loop filter, operating on eight pixel positions at once
// (one per vector lane).
//
// Inputs are four pixels on each side of the edge (p3..p0, q0..q3) plus the
// three threshold vectors. Three filtered pixels per side (p2..p0, q0..q2)
// are written through the output pointers.
//
// Three paths are taken depending on the per-lane "flat" decision:
//  - every lane flat:  only the wide smoothing filter is evaluated;
//  - no lane flat:     only the narrow filter is evaluated, p2 and q2 are
//                      passed through unchanged;
//  - mixed:            both are evaluated and merged per lane with vbsl.
static INLINE void mbloop_filter_neon(uint8x8_t dblimit, // mblimit
uint8x8_t dlimit, // limit
uint8x8_t dthresh, // thresh
uint8x8_t d3u8, // p3
uint8x8_t d4u8, // p2
uint8x8_t d5u8, // p1
uint8x8_t d6u8, // p0
uint8x8_t d7u8, // q0
uint8x8_t d16u8, // q1
uint8x8_t d17u8, // q2
uint8x8_t d18u8, // q3
uint8x8_t *d0ru8, // p2
uint8x8_t *d1ru8, // p1
uint8x8_t *d2ru8, // p0
uint8x8_t *d3ru8, // q0
uint8x8_t *d4ru8, // q1
uint8x8_t *d5ru8) { // q2
uint32_t flat;
uint8x8_t d0u8, d1u8, d2u8, d19u8, d20u8, d21u8, d22u8, d23u8, d24u8;
uint8x8_t d25u8, d26u8, d27u8, d28u8, d29u8, d30u8, d31u8;
int16x8_t q15s16;
uint16x8_t q10u16, q14u16;
int8x8_t d21s8, d24s8, d25s8, d26s8, d28s8, d29s8, d30s8;
// Absolute differences between neighbouring pixels on each side of the edge:
// |p3-p2|, |p2-p1|, |p1-p0|, |q1-q0|, |q2-q1|, |q3-q2|.
d19u8 = vabd_u8(d3u8, d4u8);
d20u8 = vabd_u8(d4u8, d5u8);
d21u8 = vabd_u8(d5u8, d6u8);
d22u8 = vabd_u8(d16u8, d7u8);
d23u8 = vabd_u8(d17u8, d16u8);
d24u8 = vabd_u8(d18u8, d17u8);
// The max reductions below are interleaved with the differences needed for
// the flat test (|p2-p0|, |q2-q0|, |p3-p0|, |q3-q0|) and the edge test
// (|p0-q0|, |p1-q1|).
d19u8 = vmax_u8(d19u8, d20u8);
d20u8 = vmax_u8(d21u8, d22u8);
d25u8 = vabd_u8(d6u8, d4u8);
d23u8 = vmax_u8(d23u8, d24u8);
d26u8 = vabd_u8(d7u8, d17u8);
d19u8 = vmax_u8(d19u8, d20u8);
d24u8 = vabd_u8(d6u8, d7u8);
d27u8 = vabd_u8(d3u8, d6u8);
d28u8 = vabd_u8(d18u8, d7u8);
// d19u8 now holds the largest of the six neighbour differences.
d19u8 = vmax_u8(d19u8, d23u8);
d23u8 = vabd_u8(d5u8, d16u8);
d24u8 = vqadd_u8(d24u8, d24u8);
// Lanes where every neighbour difference is <= limit.
d19u8 = vcge_u8(dlimit, d19u8);
d25u8 = vmax_u8(d25u8, d26u8);
d26u8 = vmax_u8(d27u8, d28u8);
d23u8 = vshr_n_u8(d23u8, 1);
d25u8 = vmax_u8(d25u8, d26u8);
// d24u8 = |p0-q0| * 2 + |p1-q1| / 2 (saturating).
d24u8 = vqadd_u8(d24u8, d23u8);
d20u8 = vmax_u8(d20u8, d25u8);
d23u8 = vdup_n_u8(1);
d24u8 = vcge_u8(dblimit, d24u8);
// High edge variance is decided per side: |p1-p0| > thresh here,
// |q1-q0| > thresh a few lines below; the two are OR-ed into d23u8.
d21u8 = vcgt_u8(d21u8, dthresh);
// Flat lanes: all of |p1-p0|, |q1-q0|, |p2-p0|, |q2-q0|, |p3-p0|, |q3-q0|
// are <= 1.
d20u8 = vcge_u8(d23u8, d20u8);
// d19u8 becomes the filter mask (limit test AND blimit test).
d19u8 = vand_u8(d19u8, d24u8);
d23u8 = vcgt_u8(d22u8, dthresh);
// Flat only counts where the filter mask is set.
d20u8 = vand_u8(d20u8, d19u8);
// 0x80 is XOR-ed onto pixels to move them between unsigned and signed form.
d22u8 = vdup_n_u8(0x80);
d23u8 = vorr_u8(d21u8, d23u8);
// Collapse the flat mask into a 32-bit scalar: every 16-bit pair of mask
// bytes is shifted right by 4 and narrowed to one byte, and only the low
// four result bytes (those derived from d20u8) are read. flat is therefore
// 0xffffffff only if all eight lanes are flat and 0 only if none is.
q10u16 = vcombine_u16(vreinterpret_u16_u8(d20u8), vreinterpret_u16_u8(d21u8));
d30u8 = vshrn_n_u16(q10u16, 4);
flat = vget_lane_u32(vreinterpret_u32_u8(d30u8), 0);
if (flat == 0xffffffff) { // Check for all 1's, power_branch_only
// Wide filter only. The running sum starts as
// 3*p3 + 2*p2 + p1 + p0 + q0; each output is that sum rounded and divided
// by 8, and the window is then slid by subtracting the taps that leave and
// adding the ones that enter.
d27u8 = vdup_n_u8(3);
d21u8 = vdup_n_u8(2);
q14u16 = vaddl_u8(d6u8, d7u8);
q14u16 = vmlal_u8(q14u16, d3u8, d27u8);
q14u16 = vmlal_u8(q14u16, d4u8, d21u8);
q14u16 = vaddw_u8(q14u16, d5u8);
*d0ru8 = vqrshrn_n_u16(q14u16, 3);
q14u16 = vsubw_u8(q14u16, d3u8);
q14u16 = vsubw_u8(q14u16, d4u8);
q14u16 = vaddw_u8(q14u16, d5u8);
q14u16 = vaddw_u8(q14u16, d16u8);
*d1ru8 = vqrshrn_n_u16(q14u16, 3);
q14u16 = vsubw_u8(q14u16, d3u8);
q14u16 = vsubw_u8(q14u16, d5u8);
q14u16 = vaddw_u8(q14u16, d6u8);
q14u16 = vaddw_u8(q14u16, d17u8);
*d2ru8 = vqrshrn_n_u16(q14u16, 3);
q14u16 = vsubw_u8(q14u16, d3u8);
q14u16 = vsubw_u8(q14u16, d6u8);
q14u16 = vaddw_u8(q14u16, d7u8);
q14u16 = vaddw_u8(q14u16, d18u8);
*d3ru8 = vqrshrn_n_u16(q14u16, 3);
q14u16 = vsubw_u8(q14u16, d4u8);
q14u16 = vsubw_u8(q14u16, d7u8);
q14u16 = vaddw_u8(q14u16, d16u8);
q14u16 = vaddw_u8(q14u16, d18u8);
*d4ru8 = vqrshrn_n_u16(q14u16, 3);
q14u16 = vsubw_u8(q14u16, d5u8);
q14u16 = vsubw_u8(q14u16, d16u8);
q14u16 = vaddw_u8(q14u16, d17u8);
q14u16 = vaddw_u8(q14u16, d18u8);
*d5ru8 = vqrshrn_n_u16(q14u16, 3);
} else {
// Narrow filter. q0, p0, p1 and q1 are converted to signed form first.
d21u8 = veor_u8(d7u8, d22u8);
d24u8 = veor_u8(d6u8, d22u8);
d25u8 = veor_u8(d5u8, d22u8);
d26u8 = veor_u8(d16u8, d22u8);
d27u8 = vdup_n_u8(3);
// filter = clamp(3 * (q0 - p0) + ((p1 - q1) & hev)) & mask
d28s8 = vsub_s8(vreinterpret_s8_u8(d21u8), vreinterpret_s8_u8(d24u8));
d29s8 = vqsub_s8(vreinterpret_s8_u8(d25u8), vreinterpret_s8_u8(d26u8));
q15s16 = vmull_s8(d28s8, vreinterpret_s8_u8(d27u8));
d29s8 = vand_s8(d29s8, vreinterpret_s8_u8(d23u8));
q15s16 = vaddw_s8(q15s16, d29s8);
d29u8 = vdup_n_u8(4);
d28s8 = vqmovn_s16(q15s16);
d28s8 = vand_s8(d28s8, vreinterpret_s8_u8(d19u8));
// (filter + 3) >> 3 is added to p0, (filter + 4) >> 3 is subtracted from q0.
d30s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d27u8));
d29s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d29u8));
d30s8 = vshr_n_s8(d30s8, 3);
d29s8 = vshr_n_s8(d29s8, 3);
d24s8 = vqadd_s8(vreinterpret_s8_u8(d24u8), d30s8);
d21s8 = vqsub_s8(vreinterpret_s8_u8(d21u8), d29s8);
// The outer taps use the q0 adjustment halved with rounding, and only in
// lanes without high edge variance (vbic clears the hev lanes).
d29s8 = vrshr_n_s8(d29s8, 1);
d29s8 = vbic_s8(d29s8, vreinterpret_s8_u8(d23u8));
d25s8 = vqadd_s8(vreinterpret_s8_u8(d25u8), d29s8);
d26s8 = vqsub_s8(vreinterpret_s8_u8(d26u8), d29s8);
if (flat == 0) { // filter_branch_only
// No flat lane: p2 and q2 are returned unchanged, the inner four pixels
// are converted back to unsigned form.
*d0ru8 = d4u8;
*d1ru8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8);
*d2ru8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8);
*d3ru8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8);
*d4ru8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8);
*d5ru8 = d17u8;
return;
}
// Mixed case: narrow-filter results back in unsigned form ...
d21u8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8);
d24u8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8);
d25u8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8);
d26u8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8);
// ... then the same sliding-window wide filter as above, interleaved with
// vbsl selects keyed on the flat mask (d20u8). The first select per output
// (into d0u8..d2u8 / d3u8..d5u8) supplies the non-flat value; whatever it
// picks for flat lanes is replaced by the final select into *dNru8.
d23u8 = vdup_n_u8(2);
q14u16 = vaddl_u8(d6u8, d7u8);
q14u16 = vmlal_u8(q14u16, d3u8, d27u8);
q14u16 = vmlal_u8(q14u16, d4u8, d23u8);
d0u8 = vbsl_u8(d20u8, dblimit, d4u8);
q14u16 = vaddw_u8(q14u16, d5u8);
d1u8 = vbsl_u8(d20u8, dlimit, d25u8);
d30u8 = vqrshrn_n_u16(q14u16, 3);
q14u16 = vsubw_u8(q14u16, d3u8);
q14u16 = vsubw_u8(q14u16, d4u8);
q14u16 = vaddw_u8(q14u16, d5u8);
q14u16 = vaddw_u8(q14u16, d16u8);
d2u8 = vbsl_u8(d20u8, dthresh, d24u8);
d31u8 = vqrshrn_n_u16(q14u16, 3);
q14u16 = vsubw_u8(q14u16, d3u8);
q14u16 = vsubw_u8(q14u16, d5u8);
q14u16 = vaddw_u8(q14u16, d6u8);
q14u16 = vaddw_u8(q14u16, d17u8);
*d0ru8 = vbsl_u8(d20u8, d30u8, d0u8);
d23u8 = vqrshrn_n_u16(q14u16, 3);
q14u16 = vsubw_u8(q14u16, d3u8);
q14u16 = vsubw_u8(q14u16, d6u8);
q14u16 = vaddw_u8(q14u16, d7u8);
*d1ru8 = vbsl_u8(d20u8, d31u8, d1u8);
q14u16 = vaddw_u8(q14u16, d18u8);
*d2ru8 = vbsl_u8(d20u8, d23u8, d2u8);
d22u8 = vqrshrn_n_u16(q14u16, 3);
q14u16 = vsubw_u8(q14u16, d4u8);
q14u16 = vsubw_u8(q14u16, d7u8);
q14u16 = vaddw_u8(q14u16, d16u8);
d3u8 = vbsl_u8(d20u8, d3u8, d21u8);
q14u16 = vaddw_u8(q14u16, d18u8);
d4u8 = vbsl_u8(d20u8, d4u8, d26u8);
d6u8 = vqrshrn_n_u16(q14u16, 3);
q14u16 = vsubw_u8(q14u16, d5u8);
q14u16 = vsubw_u8(q14u16, d16u8);
q14u16 = vaddw_u8(q14u16, d17u8);
q14u16 = vaddw_u8(q14u16, d18u8);
d5u8 = vbsl_u8(d20u8, d5u8, d17u8);
d7u8 = vqrshrn_n_u16(q14u16, 3);
*d3ru8 = vbsl_u8(d20u8, d22u8, d3u8);
*d4ru8 = vbsl_u8(d20u8, d6u8, d4u8);
*d5ru8 = vbsl_u8(d20u8, d7u8, d5u8);
}
return;
}
// Filters one 8-pixel-wide segment of a horizontal edge.
//
// Eight rows are read, starting four rows above `src`; all of them feed
// mbloop_filter_neon(), and the six rows closest to the edge (rows 1..6 of
// the eight) are overwritten with its output. `blimit`, `limit` and `thresh`
// each supply eight bytes.
void aom_lpf_horizontal_8_neon(uint8_t *src, int pitch, const uint8_t *blimit,
                               const uint8_t *limit, const uint8_t *thresh) {
  const uint8x8_t dblimit = vld1_u8(blimit);
  const uint8x8_t dlimit = vld1_u8(limit);
  const uint8x8_t dthresh = vld1_u8(thresh);
  uint8_t *const top = src - (pitch << 2);
  uint8x8_t in[8];   // p3, p2, p1, p0, q0, q1, q2, q3
  uint8x8_t out[6];  // p2, p1, p0, q0, q1, q2
  int row;

  for (row = 0; row < 8; ++row) {
    in[row] = vld1_u8(top + row * pitch);
  }

  mbloop_filter_neon(dblimit, dlimit, dthresh, in[0], in[1], in[2], in[3],
                     in[4], in[5], in[6], in[7], &out[0], &out[1], &out[2],
                     &out[3], &out[4], &out[5]);

  // The outermost rows (p3 and q3) are inputs only and are not written back.
  for (row = 0; row < 6; ++row) {
    vst1_u8(top + (row + 1) * pitch, out[row]);
  }
}
// Filters one 8-row-tall segment of a vertical edge.
//
// Eight rows of eight bytes are read starting at src - 4, transposed so that
// each vector holds one pixel column, run through mbloop_filter_neon(), and
// the six filtered columns are scattered back: four bytes per row at
// src - 3 and two bytes per row at src + 1.
void aom_lpf_vertical_8_neon(uint8_t *src, int pitch, const uint8_t *blimit,
                             const uint8_t *limit, const uint8_t *thresh) {
  const uint8x8_t dblimit = vld1_u8(blimit);
  const uint8x8_t dlimit = vld1_u8(limit);
  const uint8x8_t dthresh = vld1_u8(thresh);
  uint8x8_t row[8];
  uint32x2x2_t w04, w15, w26, w37;
  uint16x4x2_t h_a, h_b, h_c, h_d;
  uint8x8x2_t col01, col23, col45, col67;
  uint8x8x4_t left;   // filtered p2, p1, p0, q0
  uint8x8x2_t right;  // filtered q1, q2
  uint8_t *dst;
  int r;

  for (r = 0; r < 8; ++r) {
    row[r] = vld1_u8(src - 4 + r * pitch);
  }

  // 8x8 byte transpose in three stages: 32-bit, 16-bit, then 8-bit lanes.
  w04 = vtrn_u32(vreinterpret_u32_u8(row[0]), vreinterpret_u32_u8(row[4]));
  w15 = vtrn_u32(vreinterpret_u32_u8(row[1]), vreinterpret_u32_u8(row[5]));
  w26 = vtrn_u32(vreinterpret_u32_u8(row[2]), vreinterpret_u32_u8(row[6]));
  w37 = vtrn_u32(vreinterpret_u32_u8(row[3]), vreinterpret_u32_u8(row[7]));

  h_a = vtrn_u16(vreinterpret_u16_u32(w04.val[0]),
                 vreinterpret_u16_u32(w26.val[0]));
  h_b = vtrn_u16(vreinterpret_u16_u32(w15.val[0]),
                 vreinterpret_u16_u32(w37.val[0]));
  h_c = vtrn_u16(vreinterpret_u16_u32(w04.val[1]),
                 vreinterpret_u16_u32(w26.val[1]));
  h_d = vtrn_u16(vreinterpret_u16_u32(w15.val[1]),
                 vreinterpret_u16_u32(w37.val[1]));

  col01 = vtrn_u8(vreinterpret_u8_u16(h_a.val[0]),
                  vreinterpret_u8_u16(h_b.val[0]));
  col23 = vtrn_u8(vreinterpret_u8_u16(h_a.val[1]),
                  vreinterpret_u8_u16(h_b.val[1]));
  col45 = vtrn_u8(vreinterpret_u8_u16(h_c.val[0]),
                  vreinterpret_u8_u16(h_d.val[0]));
  col67 = vtrn_u8(vreinterpret_u8_u16(h_c.val[1]),
                  vreinterpret_u8_u16(h_d.val[1]));

  // Columns in order are p3, p2, p1, p0, q0, q1, q2, q3.
  mbloop_filter_neon(dblimit, dlimit, dthresh, col01.val[0], col01.val[1],
                     col23.val[0], col23.val[1], col45.val[0], col45.val[1],
                     col67.val[0], col67.val[1], &left.val[0], &left.val[1],
                     &left.val[2], &left.val[3], &right.val[0],
                     &right.val[1]);

  // Lane k of every result vector belongs to row k. The lane argument of
  // vstN_lane must be a constant, hence the unrolled stores.
  dst = src - 3;
  vst4_lane_u8(dst, left, 0);
  dst += pitch;
  vst4_lane_u8(dst, left, 1);
  dst += pitch;
  vst4_lane_u8(dst, left, 2);
  dst += pitch;
  vst4_lane_u8(dst, left, 3);
  dst += pitch;
  vst4_lane_u8(dst, left, 4);
  dst += pitch;
  vst4_lane_u8(dst, left, 5);
  dst += pitch;
  vst4_lane_u8(dst, left, 6);
  dst += pitch;
  vst4_lane_u8(dst, left, 7);

  dst = src + 1;
  vst2_lane_u8(dst, right, 0);
  dst += pitch;
  vst2_lane_u8(dst, right, 1);
  dst += pitch;
  vst2_lane_u8(dst, right, 2);
  dst += pitch;
  vst2_lane_u8(dst, right, 3);
  dst += pitch;
  vst2_lane_u8(dst, right, 4);
  dst += pitch;
  vst2_lane_u8(dst, right, 5);
  dst += pitch;
  vst2_lane_u8(dst, right, 6);
  dst += pitch;
  vst2_lane_u8(dst, right, 7);
}

View File

@@ -1,49 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <arm_neon.h>
#include "./aom_dsp_rtcd.h"
#include "./aom_config.h"
#include "aom/aom_integer.h"
// Runs the single-segment vertical 4-tap filter twice: on the segment at `s`
// with the first threshold set, then on the segment eight rows further down
// with the second set.
void aom_lpf_vertical_4_dual_neon(uint8_t *s, int p, const uint8_t *blimit0,
                                  const uint8_t *limit0, const uint8_t *thresh0,
                                  const uint8_t *blimit1, const uint8_t *limit1,
                                  const uint8_t *thresh1) {
  uint8_t *const lower = s + 8 * p;

  aom_lpf_vertical_4_neon(s, p, blimit0, limit0, thresh0);
  aom_lpf_vertical_4_neon(lower, p, blimit1, limit1, thresh1);
}
#if HAVE_NEON_ASM
// Runs the single-segment horizontal 8-tap filter twice: on the segment at
// `s` with the first threshold set, then on the segment eight pixels to the
// right with the second set. `p` is the row pitch.
void aom_lpf_horizontal_8_dual_neon(
    uint8_t *s, int p /* pitch */, const uint8_t *blimit0,
    const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1,
    const uint8_t *limit1, const uint8_t *thresh1) {
  uint8_t *const right = s + 8;

  aom_lpf_horizontal_8_neon(s, p, blimit0, limit0, thresh0);
  aom_lpf_horizontal_8_neon(right, p, blimit1, limit1, thresh1);
}
// Runs the single-segment vertical 8-tap filter twice: on the segment at `s`
// with the first threshold set, then on the segment eight rows further down
// with the second set.
void aom_lpf_vertical_8_dual_neon(uint8_t *s, int p, const uint8_t *blimit0,
                                  const uint8_t *limit0, const uint8_t *thresh0,
                                  const uint8_t *blimit1, const uint8_t *limit1,
                                  const uint8_t *thresh1) {
  uint8_t *const lower = s + 8 * p;

  aom_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0);
  aom_lpf_vertical_8_neon(lower, p, blimit1, limit1, thresh1);
}
// Runs the single-segment vertical 16-tap filter on the segment at `s` and
// on the segment eight rows further down, using the same thresholds for both.
void aom_lpf_vertical_16_dual_neon(uint8_t *s, int p, const uint8_t *blimit,
                                   const uint8_t *limit,
                                   const uint8_t *thresh) {
  uint8_t *const lower = s + 8 * p;

  aom_lpf_vertical_16_neon(s, p, blimit, limit, thresh);
  aom_lpf_vertical_16_neon(lower, p, blimit, limit, thresh);
}
#endif // HAVE_NEON_ASM

View File

@@ -1,225 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <arm_neon.h>
#include "./aom_config.h"
#include "./aom_dsp_rtcd.h"
#include "aom/aom_integer.h"
// Adds up all sixteen 16-bit lanes of vec_lo and vec_hi. The lanes are
// widened to 32 bits before the two vectors are combined, so the individual
// 16-bit lanes may each be close to full without the total wrapping.
static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo,
                                                    const uint16x8_t vec_hi) {
  // Fold each vector's upper half onto its lower half while widening.
  const uint32x4_t wide_lo =
      vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo));
  const uint32x4_t wide_hi =
      vaddl_u16(vget_low_u16(vec_hi), vget_high_u16(vec_hi));
  // Four 32-bit partial sums -> two 64-bit pair sums -> one 32-bit total.
  const uint64x2_t pair_sums = vpaddlq_u32(vaddq_u32(wide_lo, wide_hi));
  const uint32x2_t total =
      vadd_u32(vreinterpret_u32_u64(vget_low_u64(pair_sums)),
               vreinterpret_u32_u64(vget_high_u64(pair_sums)));
  return vget_lane_u32(total, 0);
}
// Accumulates the absolute differences between 64 source bytes (passed as
// four 16-byte vectors) and the 64 bytes at `ref`. For every 16-byte chunk
// the differences of the low eight bytes are added to *vec_sum_ref_lo and
// those of the high eight bytes to *vec_sum_ref_hi.
static void sad_neon_64(const uint8x16_t vec_src_00,
                        const uint8x16_t vec_src_16,
                        const uint8x16_t vec_src_32,
                        const uint8x16_t vec_src_48, const uint8_t *ref,
                        uint16x8_t *vec_sum_ref_lo,
                        uint16x8_t *vec_sum_ref_hi) {
  const uint8x16_t src_chunk[4] = { vec_src_00, vec_src_16, vec_src_32,
                                    vec_src_48 };
  // All reference data is fetched before any accumulator is updated.
  const uint8x16_t ref_chunk[4] = { vld1q_u8(ref), vld1q_u8(ref + 16),
                                    vld1q_u8(ref + 32), vld1q_u8(ref + 48) };
  int k;

  for (k = 0; k < 4; ++k) {
    *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(src_chunk[k]),
                               vget_low_u8(ref_chunk[k]));
    *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(src_chunk[k]),
                               vget_high_u8(ref_chunk[k]));
  }
}
// Accumulates the absolute differences between 32 source bytes (passed as
// two 16-byte vectors) and the 32 bytes at `ref`. Differences of the low
// eight bytes of each chunk go to *vec_sum_ref_lo, those of the high eight
// bytes to *vec_sum_ref_hi.
static void sad_neon_32(const uint8x16_t vec_src_00,
                        const uint8x16_t vec_src_16, const uint8_t *ref,
                        uint16x8_t *vec_sum_ref_lo,
                        uint16x8_t *vec_sum_ref_hi) {
  const uint8x16_t ref_first = vld1q_u8(ref);
  const uint8x16_t ref_second = vld1q_u8(ref + 16);

  *vec_sum_ref_lo = vabal_u8(
      vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_00),
               vget_low_u8(ref_first)),
      vget_low_u8(vec_src_16), vget_low_u8(ref_second));
  *vec_sum_ref_hi = vabal_u8(
      vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_00),
               vget_high_u8(ref_first)),
      vget_high_u8(vec_src_16), vget_high_u8(ref_second));
}
// Computes the sum of absolute differences between one 64x64 source block
// and four 64x64 reference blocks. res[k] receives the SAD against ref[k].
void aom_sad64x64x4d_neon(const uint8_t *src, int src_stride,
                          const uint8_t *const ref[4], int ref_stride,
                          uint32_t *res) {
  const uint8_t *ref_row[4];
  uint16x8_t sum_lo[4];
  uint16x8_t sum_hi[4];
  int row, k;

  for (k = 0; k < 4; ++k) {
    ref_row[k] = ref[k];
    sum_lo[k] = vdupq_n_u16(0);
    sum_hi[k] = vdupq_n_u16(0);
  }

  for (row = 0; row < 64; ++row) {
    // The source row is loaded once and compared against all four references.
    const uint8x16_t s0 = vld1q_u8(src);
    const uint8x16_t s1 = vld1q_u8(src + 16);
    const uint8x16_t s2 = vld1q_u8(src + 32);
    const uint8x16_t s3 = vld1q_u8(src + 48);

    for (k = 0; k < 4; ++k) {
      sad_neon_64(s0, s1, s2, s3, ref_row[k], &sum_lo[k], &sum_hi[k]);
      ref_row[k] += ref_stride;
    }
    src += src_stride;
  }

  for (k = 0; k < 4; ++k) {
    res[k] = horizontal_long_add_16x8(sum_lo[k], sum_hi[k]);
  }
}
// Computes the sum of absolute differences between one 32x32 source block
// and four 32x32 reference blocks. res[k] receives the SAD against ref[k].
void aom_sad32x32x4d_neon(const uint8_t *src, int src_stride,
                          const uint8_t *const ref[4], int ref_stride,
                          uint32_t *res) {
  const uint8_t *ref_row[4];
  uint16x8_t sum_lo[4];
  uint16x8_t sum_hi[4];
  int row, k;

  for (k = 0; k < 4; ++k) {
    ref_row[k] = ref[k];
    sum_lo[k] = vdupq_n_u16(0);
    sum_hi[k] = vdupq_n_u16(0);
  }

  for (row = 0; row < 32; ++row) {
    // The source row is loaded once and compared against all four references.
    const uint8x16_t s0 = vld1q_u8(src);
    const uint8x16_t s1 = vld1q_u8(src + 16);

    for (k = 0; k < 4; ++k) {
      sad_neon_32(s0, s1, ref_row[k], &sum_lo[k], &sum_hi[k]);
      ref_row[k] += ref_stride;
    }
    src += src_stride;
  }

  for (k = 0; k < 4; ++k) {
    res[k] = horizontal_long_add_16x8(sum_lo[k], sum_hi[k]);
  }
}
// Computes the sum of absolute differences between one 16x16 source block
// and four 16x16 reference blocks. res[k] receives the SAD against ref[k].
void aom_sad16x16x4d_neon(const uint8_t *src, int src_stride,
                          const uint8_t *const ref[4], int ref_stride,
                          uint32_t *res) {
  const uint8_t *ref_row[4];
  uint16x8_t sum_lo[4];
  uint16x8_t sum_hi[4];
  int row, k;

  for (k = 0; k < 4; ++k) {
    ref_row[k] = ref[k];
    sum_lo[k] = vdupq_n_u16(0);
    sum_hi[k] = vdupq_n_u16(0);
  }

  for (row = 0; row < 16; ++row) {
    const uint8x16_t src_vec = vld1q_u8(src);
    const uint8x8_t src_lo = vget_low_u8(src_vec);
    const uint8x8_t src_hi = vget_high_u8(src_vec);

    for (k = 0; k < 4; ++k) {
      const uint8x16_t ref_vec = vld1q_u8(ref_row[k]);
      sum_lo[k] = vabal_u8(sum_lo[k], src_lo, vget_low_u8(ref_vec));
      sum_hi[k] = vabal_u8(sum_hi[k], src_hi, vget_high_u8(ref_vec));
      ref_row[k] += ref_stride;
    }
    src += src_stride;
  }

  for (k = 0; k < 4; ++k) {
    res[k] = horizontal_long_add_16x8(sum_lo[k], sum_hi[k]);
  }
}

View File

@@ -1,224 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <arm_neon.h>
#include "./aom_config.h"
#include "aom/aom_integer.h"
// Returns the sum of absolute differences between an 8-wide, 16-tall source
// block and a reference block of the same size.
unsigned int aom_sad8x16_neon(unsigned char *src_ptr, int src_stride,
                              unsigned char *ref_ptr, int ref_stride) {
  uint16x8_t abs_sum = vdupq_n_u16(0);
  uint32x4_t sum32;
  uint64x2_t sum64;
  uint32x2_t total;
  int row;

  // One 8-byte row per iteration, accumulated per column in 16-bit lanes.
  for (row = 0; row < 16; ++row) {
    const uint8x8_t src_row = vld1_u8(src_ptr);
    const uint8x8_t ref_row = vld1_u8(ref_ptr);
    abs_sum = vabal_u8(abs_sum, src_row, ref_row);
    src_ptr += src_stride;
    ref_ptr += ref_stride;
  }

  // Reduce the eight column sums to a single value.
  sum32 = vpaddlq_u16(abs_sum);
  sum64 = vpaddlq_u32(sum32);
  total = vadd_u32(vreinterpret_u32_u64(vget_low_u64(sum64)),
                   vreinterpret_u32_u64(vget_high_u64(sum64)));
  return vget_lane_u32(total, 0);
}
// Returns the sum of absolute differences between a 4x4 source block and a
// 4x4 reference block.
//
// Each row is fetched with an 8-byte load; only the sums of the first four
// columns are reduced, so the four trailing bytes of every load do not
// affect the result.
unsigned int aom_sad4x4_neon(unsigned char *src_ptr, int src_stride,
                             unsigned char *ref_ptr, int ref_stride) {
  uint16x8_t abs_sum = vdupq_n_u16(0);
  uint32x2_t pair_sum;
  uint64x1_t total;
  int row;

  for (row = 0; row < 4; ++row) {
    const uint8x8_t src_row = vld1_u8(src_ptr);
    const uint8x8_t ref_row = vld1_u8(ref_ptr);
    abs_sum = vabal_u8(abs_sum, src_row, ref_row);
    src_ptr += src_stride;
    ref_ptr += ref_stride;
  }

  // Keep columns 0..3 only and fold them into one value.
  pair_sum = vpaddl_u16(vget_low_u16(abs_sum));
  total = vpaddl_u32(pair_sum);
  return vget_lane_u32(vreinterpret_u32_u64(total), 0);
}
// Returns the sum of absolute differences between a 16-wide, 8-tall source
// block and a reference block of the same size.
unsigned int aom_sad16x8_neon(unsigned char *src_ptr, int src_stride,
                              unsigned char *ref_ptr, int ref_stride) {
  uint16x8_t sum_lo = vdupq_n_u16(0);  // columns 0..7
  uint16x8_t sum_hi = vdupq_n_u16(0);  // columns 8..15
  uint32x4_t sum32;
  uint64x2_t sum64;
  uint32x2_t total;
  int row;

  for (row = 0; row < 8; ++row) {
    const uint8x16_t src_row = vld1q_u8(src_ptr);
    const uint8x16_t ref_row = vld1q_u8(ref_ptr);
    sum_lo = vabal_u8(sum_lo, vget_low_u8(src_row), vget_low_u8(ref_row));
    sum_hi = vabal_u8(sum_hi, vget_high_u8(src_row), vget_high_u8(ref_row));
    src_ptr += src_stride;
    ref_ptr += ref_stride;
  }

  // Merge the two halves, then reduce the eight lanes to a single value.
  sum32 = vpaddlq_u16(vaddq_u16(sum_lo, sum_hi));
  sum64 = vpaddlq_u32(sum32);
  total = vadd_u32(vreinterpret_u32_u64(vget_low_u64(sum64)),
                   vreinterpret_u32_u64(vget_high_u64(sum64)));
  return vget_lane_u32(total, 0);
}
// Adds up all sixteen 16-bit lanes of vec_lo and vec_hi. The lanes are
// widened to 32 bits before the two vectors are combined, so the individual
// 16-bit lanes may each be close to full without the total wrapping.
static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo,
                                                    const uint16x8_t vec_hi) {
  // Fold each vector's upper half onto its lower half while widening.
  const uint32x4_t wide_lo =
      vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo));
  const uint32x4_t wide_hi =
      vaddl_u16(vget_low_u16(vec_hi), vget_high_u16(vec_hi));
  // Four 32-bit partial sums -> two 64-bit pair sums -> one 32-bit total.
  const uint64x2_t pair_sums = vpaddlq_u32(vaddq_u32(wide_lo, wide_hi));
  const uint32x2_t total =
      vadd_u32(vreinterpret_u32_u64(vget_low_u64(pair_sums)),
               vreinterpret_u32_u64(vget_high_u64(pair_sums)));
  return vget_lane_u32(total, 0);
}
// Adds up the eight 16-bit lanes of vec_16x8, widening as it goes.
static INLINE unsigned int horizontal_add_16x8(const uint16x8_t vec_16x8) {
  // Eight 16-bit lanes -> four 32-bit sums -> two 64-bit sums.
  const uint64x2_t pair_sums = vpaddlq_u32(vpaddlq_u16(vec_16x8));
  const uint32x2_t total =
      vadd_u32(vreinterpret_u32_u64(vget_low_u64(pair_sums)),
               vreinterpret_u32_u64(vget_high_u64(pair_sums)));
  return vget_lane_u32(total, 0);
}
// Returns the sum of absolute differences between a 64x64 source block and
// a 64x64 reference block.
unsigned int aom_sad64x64_neon(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride) {
  uint16x8_t sum_lo = vdupq_n_u16(0);
  uint16x8_t sum_hi = vdupq_n_u16(0);
  int row, col;

  for (row = 0; row < 64; ++row) {
    // Four 16-byte chunks per row; low and high halves of each chunk feed
    // separate 16-bit accumulators.
    for (col = 0; col < 64; col += 16) {
      const uint8x16_t src_vec = vld1q_u8(src + col);
      const uint8x16_t ref_vec = vld1q_u8(ref + col);
      sum_lo = vabal_u8(sum_lo, vget_low_u8(src_vec), vget_low_u8(ref_vec));
      sum_hi = vabal_u8(sum_hi, vget_high_u8(src_vec), vget_high_u8(ref_vec));
    }
    src += src_stride;
    ref += ref_stride;
  }

  // The accumulators are widened before being combined.
  return horizontal_long_add_16x8(sum_lo, sum_hi);
}
// Returns the sum of absolute differences between a 32x32 source block and
// a 32x32 reference block.
unsigned int aom_sad32x32_neon(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride) {
  uint16x8_t sum_lo = vdupq_n_u16(0);
  uint16x8_t sum_hi = vdupq_n_u16(0);
  int row, col;

  for (row = 0; row < 32; ++row) {
    // Two 16-byte chunks per row; low and high halves of each chunk feed
    // separate 16-bit accumulators.
    for (col = 0; col < 32; col += 16) {
      const uint8x16_t src_vec = vld1q_u8(src + col);
      const uint8x16_t ref_vec = vld1q_u8(ref + col);
      sum_lo = vabal_u8(sum_lo, vget_low_u8(src_vec), vget_low_u8(ref_vec));
      sum_hi = vabal_u8(sum_hi, vget_high_u8(src_vec), vget_high_u8(ref_vec));
    }
    src += src_stride;
    ref += ref_stride;
  }

  // The accumulators are combined in 16 bits, then reduced.
  return horizontal_add_16x8(vaddq_u16(sum_lo, sum_hi));
}
// Returns the sum of absolute differences between a 16x16 source block and
// a 16x16 reference block.
unsigned int aom_sad16x16_neon(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride) {
  uint16x8_t sum_lo = vdupq_n_u16(0);  // columns 0..7
  uint16x8_t sum_hi = vdupq_n_u16(0);  // columns 8..15
  int rows_left;

  for (rows_left = 16; rows_left > 0; --rows_left) {
    const uint8x16_t src_row = vld1q_u8(src);
    const uint8x16_t ref_row = vld1q_u8(ref);
    sum_lo = vabal_u8(sum_lo, vget_low_u8(src_row), vget_low_u8(ref_row));
    sum_hi = vabal_u8(sum_hi, vget_high_u8(src_row), vget_high_u8(ref_row));
    src += src_stride;
    ref += ref_stride;
  }

  return horizontal_add_16x8(vaddq_u16(sum_lo, sum_hi));
}
// Returns the sum of absolute differences between an 8x8 source block and
// an 8x8 reference block.
unsigned int aom_sad8x8_neon(const uint8_t *src, int src_stride,
                             const uint8_t *ref, int ref_stride) {
  uint16x8_t column_sums = vdupq_n_u16(0);
  int rows_left;

  for (rows_left = 8; rows_left > 0; --rows_left) {
    column_sums = vabal_u8(column_sums, vld1_u8(src), vld1_u8(ref));
    src += src_stride;
    ref += ref_stride;
  }

  return horizontal_add_16x8(column_sums);
}

View File

@@ -1,39 +0,0 @@
;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;
;
EXPORT |aom_push_neon|
EXPORT |aom_pop_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; aom_push_neon: stores NEON registers d8-d15 (64 bytes in total) to the
; buffer whose address is passed in r0. r0 is advanced past the stored data
; by the post-increment addressing.
; NOTE(review): d8-d15 are presumably saved because the ARM procedure call
; standard treats them as callee-saved - confirm against the callers.
|aom_push_neon| PROC
vst1.i64 {d8, d9, d10, d11}, [r0]!
vst1.i64 {d12, d13, d14, d15}, [r0]!
bx lr
ENDP
; aom_pop_neon: reloads NEON registers d8-d15 (64 bytes in total) from the
; buffer whose address is passed in r0, in the same order aom_push_neon
; writes them. r0 is advanced past the data by the post-increment addressing.
|aom_pop_neon| PROC
vld1.i64 {d8, d9, d10, d11}, [r0]!
vld1.i64 {d12, d13, d14, d15}, [r0]!
bx lr
ENDP
END

View File

@@ -1,81 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include "./aom_config.h"
#include "./aom_dsp_rtcd.h"
#include "aom/aom_integer.h"
#include "aom_ports/mem.h"
#if HAVE_MEDIA
// Two-tap bilinear filter coefficients, one pair per row. The table is
// indexed with the xoffset / yoffset arguments of the functions below
// (valid indices 0..7); every pair sums to 128, and index 4 is the
// equal-weight { 64, 64 } pair.
static const int16_t bilinear_filters_media[8][2] = { { 128, 0 }, { 112, 16 },
{ 96, 32 }, { 80, 48 },
{ 64, 64 }, { 48, 80 },
{ 32, 96 }, { 16, 112 } };
// The two bilinear filter passes are defined outside this file. The first
// pass reads 8-bit pixels and writes 16-bit intermediates; the second pass
// reads those intermediates and writes 8-bit pixels. `filter` points at one
// coefficient pair.
extern void aom_filter_block2d_bil_first_pass_media(
const uint8_t *src_ptr, uint16_t *dst_ptr, uint32_t src_pitch,
uint32_t height, uint32_t width, const int16_t *filter);
extern void aom_filter_block2d_bil_second_pass_media(
const uint16_t *src_ptr, uint8_t *dst_ptr, int32_t src_pitch,
uint32_t height, uint32_t width, const int16_t *filter);
// Sub-pixel variance of an 8x8 block. The source is run through the first
// bilinear pass (9 rows of 8, coefficients chosen by xoffset) and the second
// pass (8 rows of 8, coefficients chosen by yoffset); the filtered block is
// then handed to aom_variance8x8_media() together with dst_ptr. xoffset and
// yoffset index the 8-entry coefficient table directly.
unsigned int aom_sub_pixel_variance8x8_media(
    const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset,
    const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse) {
  uint16_t first_pass[10 * 8];
  uint8_t second_pass[8 * 8];
  const int16_t *const h_taps = bilinear_filters_media[xoffset];
  const int16_t *const v_taps = bilinear_filters_media[yoffset];

  // One extra row is produced by the first pass for the second pass to use.
  aom_filter_block2d_bil_first_pass_media(src_ptr, first_pass,
                                          src_pixels_per_line, 9, 8, h_taps);
  aom_filter_block2d_bil_second_pass_media(first_pass, second_pass, 8, 8, 8,
                                           v_taps);

  return aom_variance8x8_media(second_pass, 8, dst_ptr, dst_pixels_per_line,
                               sse);
}
// Sub-pixel variance of a 16x16 block.
//
// The three offset combinations (4, 0), (0, 4) and (4, 4) are dispatched to
// the dedicated half-pixel variance routines. Every other combination goes
// through the two bilinear passes (17 rows of 16, then 16 rows of 16)
// followed by aom_variance16x16_media(). xoffset and yoffset index the
// 8-entry coefficient table directly.
unsigned int aom_sub_pixel_variance16x16_media(
    const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset,
    const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse) {
  uint16_t first_pass[36 * 16];
  uint8_t second_pass[20 * 16];

  if (xoffset == 4 && yoffset == 0) {
    return aom_variance_halfpixvar16x16_h_media(
        src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse);
  }
  if (xoffset == 0 && yoffset == 4) {
    return aom_variance_halfpixvar16x16_v_media(
        src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse);
  }
  if (xoffset == 4 && yoffset == 4) {
    return aom_variance_halfpixvar16x16_hv_media(
        src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse);
  }

  // General case: horizontal pass, vertical pass, then plain variance.
  aom_filter_block2d_bil_first_pass_media(src_ptr, first_pass,
                                          src_pixels_per_line, 17, 16,
                                          bilinear_filters_media[xoffset]);
  aom_filter_block2d_bil_second_pass_media(first_pass, second_pass, 16, 16, 16,
                                           bilinear_filters_media[yoffset]);
  return aom_variance16x16_media(second_pass, 16, dst_ptr, dst_pixels_per_line,
                                 sse);
}
#endif // HAVE_MEDIA

View File

@@ -1,134 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <arm_neon.h>
#include "./aom_dsp_rtcd.h"
#include "./aom_config.h"
#include "aom_ports/mem.h"
#include "aom/aom_integer.h"
#include "aom_dsp/variance.h"
// Two-tap bilinear filter coefficients, one pair per row. The table is
// indexed with the xoffset / yoffset arguments of the sub-pixel variance
// functions in this file (valid indices 0..7); every pair sums to 128.
static const uint8_t bilinear_filters[8][2] = {
{ 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
{ 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 },
};
// Two-tap bilinear filter for 8-pixel-wide blocks.
//
// For each of output_height rows, eight output bytes are computed as
//   round((src[x] * filter[0] + src[x + pixel_step] * filter[1])
//         >> FILTER_BITS)
// src_ptr advances by src_pixels_per_line per row, output_ptr by
// output_width. A pixel_step of 1 filters horizontally; a pixel_step equal
// to the source row length filters vertically.
static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,
                                      uint8_t *output_ptr,
                                      unsigned int src_pixels_per_line,
                                      int pixel_step,
                                      unsigned int output_height,
                                      unsigned int output_width,
                                      const uint8_t *filter) {
  const uint8x8_t tap0 = vmov_n_u8(filter[0]);
  const uint8x8_t tap1 = vmov_n_u8(filter[1]);
  unsigned int row;

  for (row = 0; row < output_height; ++row) {
    const uint8x8_t cur = vld1_u8(src_ptr);
    const uint8x8_t nxt = vld1_u8(src_ptr + pixel_step);
    // Weighted sum in 16 bits, then rounded narrowing back to 8 bits.
    const uint16x8_t weighted = vmlal_u8(vmull_u8(cur, tap0), nxt, tap1);

    vst1_u8(output_ptr, vrshrn_n_u16(weighted, FILTER_BITS));

    src_ptr += src_pixels_per_line;
    output_ptr += output_width;
  }
}
// Two-tap bilinear filter over a block whose width is walked in steps of 16.
//
// Each step loads 16 samples at column `col` and 16 samples at column
// `col + pixel_step`, forms src * filter[0] + neighbor * filter[1] in 16-bit
// precision (low and high halves separately), then rounds and narrows by
// FILTER_BITS before storing 16 output bytes. `output_width` is both the
// number of columns processed per row and the output stride.
static void var_filter_block2d_bil_w16(const uint8_t *src_ptr,
                                       uint8_t *output_ptr,
                                       unsigned int src_pixels_per_line,
                                       int pixel_step,
                                       unsigned int output_height,
                                       unsigned int output_width,
                                       const uint8_t *filter) {
  const uint8x8_t tap0 = vmov_n_u8(filter[0]);
  const uint8x8_t tap1 = vmov_n_u8(filter[1]);
  unsigned int row, col;

  for (row = 0; row < output_height; ++row) {
    for (col = 0; col < output_width; col += 16) {
      const uint8x16_t cur = vld1q_u8(&src_ptr[col]);
      const uint8x16_t nxt = vld1q_u8(&src_ptr[col + pixel_step]);

      const uint16x8_t sum_lo =
          vmlal_u8(vmull_u8(vget_low_u8(cur), tap0), vget_low_u8(nxt), tap1);
      const uint16x8_t sum_hi =
          vmlal_u8(vmull_u8(vget_high_u8(cur), tap0), vget_high_u8(nxt), tap1);

      vst1q_u8(&output_ptr[col],
               vcombine_u8(vrshrn_n_u16(sum_lo, FILTER_BITS),
                           vrshrn_n_u16(sum_hi, FILTER_BITS)));
    }
    // Advance both pointers to the next row.
    src_ptr += src_pixels_per_line;
    output_ptr += output_width;
  }
}
// Sub-pixel variance of an 8x8 block.
//
// `src` is bilinearly filtered first along rows (taps chosen by xoffset,
// neighbor one byte away, 9 rows produced) and then along columns (taps
// chosen by yoffset, neighbor one 8-byte row away, 8 rows produced). The
// filtered block is compared against `dst` with aom_variance8x8_neon, which
// also fills in *sse.
unsigned int aom_sub_pixel_variance8x8_neon(const uint8_t *src, int src_stride,
                                            int xoffset, int yoffset,
                                            const uint8_t *dst, int dst_stride,
                                            unsigned int *sse) {
  DECLARE_ALIGNED(16, uint8_t, horiz_pass[9 * 8]);
  DECLARE_ALIGNED(16, uint8_t, filtered[8 * 8]);
  const uint8_t *const h_taps = bilinear_filters[xoffset];
  const uint8_t *const v_taps = bilinear_filters[yoffset];

  // First pass: 9 rows so the second pass has a row below each output row.
  var_filter_block2d_bil_w8(src, horiz_pass, src_stride, 1, 9, 8, h_taps);
  // Second pass: pixel_step equals the intermediate row width.
  var_filter_block2d_bil_w8(horiz_pass, filtered, 8, 8, 8, 8, v_taps);

  return aom_variance8x8_neon(filtered, 8, dst, dst_stride, sse);
}
// Sub-pixel variance of a 16x16 block.
//
// `src` is bilinearly filtered along rows (taps from xoffset, 17 rows
// produced) and then along columns (taps from yoffset, 16 rows produced);
// the result is compared against `dst` with aom_variance16x16_neon, which
// also fills in *sse.
unsigned int aom_sub_pixel_variance16x16_neon(const uint8_t *src,
                                              int src_stride, int xoffset,
                                              int yoffset, const uint8_t *dst,
                                              int dst_stride,
                                              unsigned int *sse) {
  DECLARE_ALIGNED(16, uint8_t, horiz_pass[17 * 16]);
  DECLARE_ALIGNED(16, uint8_t, filtered[16 * 16]);
  const uint8_t *const h_taps = bilinear_filters[xoffset];
  const uint8_t *const v_taps = bilinear_filters[yoffset];

  // First pass: one extra row feeds the vertical tap of the last output row.
  var_filter_block2d_bil_w16(src, horiz_pass, src_stride, 1, 17, 16, h_taps);
  // Second pass: pixel_step equals the intermediate row width.
  var_filter_block2d_bil_w16(horiz_pass, filtered, 16, 16, 16, 16, v_taps);

  return aom_variance16x16_neon(filtered, 16, dst, dst_stride, sse);
}
// Sub-pixel variance of a 32x32 block.
//
// `src` is bilinearly filtered along rows (taps from xoffset, 33 rows
// produced) and then along columns (taps from yoffset, 32 rows produced);
// the result is compared against `dst` with aom_variance32x32_neon, which
// also fills in *sse.
unsigned int aom_sub_pixel_variance32x32_neon(const uint8_t *src,
                                              int src_stride, int xoffset,
                                              int yoffset, const uint8_t *dst,
                                              int dst_stride,
                                              unsigned int *sse) {
  DECLARE_ALIGNED(16, uint8_t, horiz_pass[33 * 32]);
  DECLARE_ALIGNED(16, uint8_t, filtered[32 * 32]);
  const uint8_t *const h_taps = bilinear_filters[xoffset];
  const uint8_t *const v_taps = bilinear_filters[yoffset];

  // First pass: one extra row feeds the vertical tap of the last output row.
  var_filter_block2d_bil_w16(src, horiz_pass, src_stride, 1, 33, 32, h_taps);
  // Second pass: pixel_step equals the intermediate row width.
  var_filter_block2d_bil_w16(horiz_pass, filtered, 32, 32, 32, 32, v_taps);

  return aom_variance32x32_neon(filtered, 32, dst, dst_stride, sse);
}
// Sub-pixel variance of a 64x64 block.
//
// `src` is bilinearly filtered along rows (taps from xoffset, 65 rows
// produced) and then along columns (taps from yoffset, 64 rows produced);
// the result is compared against `dst` with aom_variance64x64_neon, which
// also fills in *sse.
unsigned int aom_sub_pixel_variance64x64_neon(const uint8_t *src,
                                              int src_stride, int xoffset,
                                              int yoffset, const uint8_t *dst,
                                              int dst_stride,
                                              unsigned int *sse) {
  DECLARE_ALIGNED(16, uint8_t, horiz_pass[65 * 64]);
  DECLARE_ALIGNED(16, uint8_t, filtered[64 * 64]);
  const uint8_t *const h_taps = bilinear_filters[xoffset];
  const uint8_t *const v_taps = bilinear_filters[yoffset];

  // First pass: one extra row feeds the vertical tap of the last output row.
  var_filter_block2d_bil_w16(src, horiz_pass, src_stride, 1, 65, 64, h_taps);
  // Second pass: pixel_step equals the intermediate row width.
  var_filter_block2d_bil_w16(horiz_pass, filtered, 64, 64, 64, 64, v_taps);

  return aom_variance64x64_neon(filtered, 64, dst, dst_stride, sse);
}

View File

@@ -1,80 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <arm_neon.h>
#include "./aom_config.h"
#include "aom/aom_integer.h"
// Writes diff = src - pred for a rows x cols block as signed 16-bit values.
//
// The column count selects the code path:
//   cols > 16 : 32 pixels per step, repeated across the row;
//   cols > 8  : one 16-pixel vector per row;
//   cols > 4  : one 8-pixel vector per row;
//   otherwise : plain scalar loop over `cols` pixels.
// The vector paths always process whole vectors, so they touch the full
// 32 / 16 / 8 columns regardless of the exact value of `cols`.
void aom_subtract_block_neon(int rows, int cols, int16_t *diff,
                             ptrdiff_t diff_stride, const uint8_t *src,
                             ptrdiff_t src_stride, const uint8_t *pred,
                             ptrdiff_t pred_stride) {
  int r, c;

  if (cols > 16) {
    for (r = 0; r < rows; ++r) {
      for (c = 0; c < cols; c += 32) {
        const uint8x16_t s0 = vld1q_u8(&src[c + 0]);
        const uint8x16_t s1 = vld1q_u8(&src[c + 16]);
        const uint8x16_t p0 = vld1q_u8(&pred[c + 0]);
        const uint8x16_t p1 = vld1q_u8(&pred[c + 16]);
        // The widening subtract wraps in uint16; reinterpreting as int16
        // yields the signed difference.
        const int16x8_t d0 = vreinterpretq_s16_u16(
            vsubl_u8(vget_low_u8(s0), vget_low_u8(p0)));
        const int16x8_t d1 = vreinterpretq_s16_u16(
            vsubl_u8(vget_high_u8(s0), vget_high_u8(p0)));
        const int16x8_t d2 = vreinterpretq_s16_u16(
            vsubl_u8(vget_low_u8(s1), vget_low_u8(p1)));
        const int16x8_t d3 = vreinterpretq_s16_u16(
            vsubl_u8(vget_high_u8(s1), vget_high_u8(p1)));
        vst1q_s16(&diff[c + 0], d0);
        vst1q_s16(&diff[c + 8], d1);
        vst1q_s16(&diff[c + 16], d2);
        vst1q_s16(&diff[c + 24], d3);
      }
      diff += diff_stride;
      pred += pred_stride;
      src += src_stride;
    }
    return;
  }

  if (cols > 8) {
    for (r = 0; r < rows; ++r) {
      const uint8x16_t s = vld1q_u8(src);
      const uint8x16_t p = vld1q_u8(pred);
      const int16x8_t d_lo =
          vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(s), vget_low_u8(p)));
      const int16x8_t d_hi =
          vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(s), vget_high_u8(p)));
      vst1q_s16(&diff[0], d_lo);
      vst1q_s16(&diff[8], d_hi);
      diff += diff_stride;
      pred += pred_stride;
      src += src_stride;
    }
    return;
  }

  if (cols > 4) {
    for (r = 0; r < rows; ++r) {
      const uint8x8_t s = vld1_u8(src);
      const uint8x8_t p = vld1_u8(pred);
      vst1q_s16(diff, vreinterpretq_s16_u16(vsubl_u8(s, p)));
      diff += diff_stride;
      pred += pred_stride;
      src += src_stride;
    }
    return;
  }

  // Narrow blocks: scalar fallback.
  for (r = 0; r < rows; ++r) {
    for (c = 0; c < cols; ++c) {
      diff[c] = src[c] - pred[c];
    }
    diff += diff_stride;
    pred += pred_stride;
    src += src_stride;
  }
}

View File

@@ -1,361 +0,0 @@
;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;
;
EXPORT |aom_variance16x16_media|
EXPORT |aom_variance8x8_media|
EXPORT |aom_mse16x16_media|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; aom_variance16x16_media
;
; Walks 16 rows of 16 pixels, four pixels (one word) at a time. For every
; word the per-byte differences src - ref and ref - src are formed with usub8
; and the non-negative ones are picked with sel, giving the positive and the
; negative part of the difference. Their byte sums (usad8 against zero) update
; the running signed sum in r8, and the combined absolute differences are
; squared and accumulated into r11 with smlad.
;
; On exit the accumulated sse (r11) is stored through the sse pointer and
; r0 = sse - ((sum * sum) >> 8) is returned. The sse pointer is the fifth
; argument; it is read from [sp, #40] because ten registers (r4-r12, lr)
; were pushed on entry.
;
; r0 unsigned char *src_ptr
; r1 int source_stride
; r2 unsigned char *ref_ptr
; r3 int recon_stride
; stack unsigned int *sse
|aom_variance16x16_media| PROC
stmfd sp!, {r4-r12, lr}
pld [r0, r1, lsl #0]
pld [r2, r3, lsl #0]
mov r8, #0 ; initialize sum = 0
mov r11, #0 ; initialize sse = 0
mov r12, #16 ; set loop counter to 16 (=block height)
loop16x16
; 1st 4 pixels
ldr r4, [r0, #0] ; load 4 src pixels
ldr r5, [r2, #0] ; load 4 ref pixels
mov lr, #0 ; constant zero
usub8 r6, r4, r5 ; calculate difference
pld [r0, r1, lsl #1]
sel r7, r6, lr ; select bytes with positive difference
usub8 r9, r5, r4 ; calculate difference with reversed operands
pld [r2, r3, lsl #1]
sel r6, r9, lr ; select bytes with negative difference
; calculate partial sums
usad8 r4, r7, lr ; calculate sum of positive differences
usad8 r5, r6, lr ; calculate sum of negative differences
orr r6, r6, r7 ; differences of all 4 pixels
; calculate total sum
adds r8, r8, r4 ; add positive differences to sum
subs r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
uxtb16 r10, r6, ror #8 ; another two pixels to halfwords
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
; 2nd 4 pixels
ldr r4, [r0, #4] ; load 4 src pixels
ldr r5, [r2, #4] ; load 4 ref pixels
smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
usub8 r6, r4, r5 ; calculate difference
sel r7, r6, lr ; select bytes with positive difference
usub8 r9, r5, r4 ; calculate difference with reversed operands
sel r6, r9, lr ; select bytes with negative difference
; calculate partial sums
usad8 r4, r7, lr ; calculate sum of positive differences
usad8 r5, r6, lr ; calculate sum of negative differences
orr r6, r6, r7 ; differences of all 4 pixels
; calculate total sum
add r8, r8, r4 ; add positive differences to sum
sub r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
uxtb16 r10, r6, ror #8 ; another two pixels to halfwords
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
; 3rd 4 pixels
ldr r4, [r0, #8] ; load 4 src pixels
ldr r5, [r2, #8] ; load 4 ref pixels
smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
usub8 r6, r4, r5 ; calculate difference
sel r7, r6, lr ; select bytes with positive difference
usub8 r9, r5, r4 ; calculate difference with reversed operands
sel r6, r9, lr ; select bytes with negative difference
; calculate partial sums
usad8 r4, r7, lr ; calculate sum of positive differences
usad8 r5, r6, lr ; calculate sum of negative differences
orr r6, r6, r7 ; differences of all 4 pixels
; calculate total sum
add r8, r8, r4 ; add positive differences to sum
sub r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
uxtb16 r10, r6, ror #8 ; another two pixels to halfwords
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
; 4th 4 pixels
ldr r4, [r0, #12] ; load 4 src pixels
ldr r5, [r2, #12] ; load 4 ref pixels
smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
usub8 r6, r4, r5 ; calculate difference
add r0, r0, r1 ; set src_ptr to next row
sel r7, r6, lr ; select bytes with positive difference
usub8 r9, r5, r4 ; calculate difference with reversed operands
add r2, r2, r3 ; set dst_ptr to next row
sel r6, r9, lr ; select bytes with negative difference
; calculate partial sums
usad8 r4, r7, lr ; calculate sum of positive differences
usad8 r5, r6, lr ; calculate sum of negative differences
orr r6, r6, r7 ; differences of all 4 pixels
; calculate total sum
add r8, r8, r4 ; add positive differences to sum
sub r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
uxtb16 r10, r6, ror #8 ; another two pixels to halfwords
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
subs r12, r12, #1
bne loop16x16
; return stuff
ldr r6, [sp, #40] ; get address of sse
mul r0, r8, r8 ; sum * sum
str r11, [r6] ; store sse
sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))
ldmfd sp!, {r4-r12, pc}
ENDP
; aom_variance8x8_media
;
; Walks 8 rows of 8 pixels, four pixels (one word) at a time. The positive
; and negative parts of the per-byte difference are separated with
; usub8 / sel, their byte sums update the running signed sum in r4, and the
; combined absolute differences are squared and accumulated into r5.
;
; On exit the accumulated sse (r5) is stored through the sse pointer and
; r0 = sse - ((sum * sum) >> 6) is returned. The sse pointer is the fifth
; argument; it is read from [sp, #32] because eight registers (r4-r10, lr)
; were pushed on entry.
;
; r0 unsigned char *src_ptr
; r1 int source_stride
; r2 unsigned char *ref_ptr
; r3 int recon_stride
; stack unsigned int *sse
|aom_variance8x8_media| PROC
push {r4-r10, lr}
pld [r0, r1, lsl #0]
pld [r2, r3, lsl #0]
mov r12, #8 ; set loop counter to 8 (=block height)
mov r4, #0 ; initialize sum = 0
mov r5, #0 ; initialize sse = 0
loop8x8
; 1st 4 pixels
ldr r6, [r0, #0x0] ; load 4 src pixels
ldr r7, [r2, #0x0] ; load 4 ref pixels
mov lr, #0 ; constant zero
usub8 r8, r6, r7 ; calculate difference
pld [r0, r1, lsl #1]
sel r10, r8, lr ; select bytes with positive difference
usub8 r9, r7, r6 ; calculate difference with reversed operands
pld [r2, r3, lsl #1]
sel r8, r9, lr ; select bytes with negative difference
; calculate partial sums
usad8 r6, r10, lr ; calculate sum of positive differences
usad8 r7, r8, lr ; calculate sum of negative differences
orr r8, r8, r10 ; differences of all 4 pixels
; calculate total sum
add r4, r4, r6 ; add positive differences to sum
sub r4, r4, r7 ; subtract negative differences from sum
; calculate sse
uxtb16 r7, r8 ; byte (two pixels) to halfwords
uxtb16 r10, r8, ror #8 ; another two pixels to halfwords
smlad r5, r7, r7, r5 ; dual signed multiply, add and accumulate (1)
; 2nd 4 pixels
ldr r6, [r0, #0x4] ; load 4 src pixels
ldr r7, [r2, #0x4] ; load 4 ref pixels
smlad r5, r10, r10, r5 ; dual signed multiply, add and accumulate (2)
usub8 r8, r6, r7 ; calculate difference
add r0, r0, r1 ; set src_ptr to next row
sel r10, r8, lr ; select bytes with positive difference
usub8 r9, r7, r6 ; calculate difference with reversed operands
add r2, r2, r3 ; set dst_ptr to next row
sel r8, r9, lr ; select bytes with negative difference
; calculate partial sums
usad8 r6, r10, lr ; calculate sum of positive differences
usad8 r7, r8, lr ; calculate sum of negative differences
orr r8, r8, r10 ; differences of all 4 pixels
; calculate total sum
add r4, r4, r6 ; add positive differences to sum
sub r4, r4, r7 ; subtract negative differences from sum
; calculate sse
uxtb16 r7, r8 ; byte (two pixels) to halfwords
uxtb16 r10, r8, ror #8 ; another two pixels to halfwords
smlad r5, r7, r7, r5 ; dual signed multiply, add and accumulate (1)
subs r12, r12, #1 ; next row
smlad r5, r10, r10, r5 ; dual signed multiply, add and accumulate (2)
bne loop8x8
; return stuff
ldr r8, [sp, #32] ; get address of sse
mul r1, r4, r4 ; sum * sum
str r5, [r8] ; store sse
sub r0, r5, r1, ASR #6 ; return (sse - ((sum * sum) >> 6))
pop {r4-r10, pc}
ENDP
; aom_mse16x16_media
;
; Accumulates the sum of squared differences between two 16x16 blocks, four
; pixels (one word) at a time, in r4. The result is stored through the sse
; pointer and also returned in r0. The sse pointer is the fifth argument; it
; is read from [sp, #28] because seven registers (r4-r9, lr) were pushed on
; entry.
;
; NOTE(review): every "calculate partial sums" pair of usad8 instructions
; writes r5 / r6, and both registers are overwritten (by the following ldr
; and uxtb16) before they are read. They look like leftovers from the
; variance routine and appear removable -- confirm before deleting.
;
; r0 unsigned char *src_ptr
; r1 int source_stride
; r2 unsigned char *ref_ptr
; r3 int recon_stride
; stack unsigned int *sse
;
;note: Based on aom_variance16x16_media. In this function, sum is never used.
; So, we can remove this part of calculation.
|aom_mse16x16_media| PROC
push {r4-r9, lr}
pld [r0, r1, lsl #0]
pld [r2, r3, lsl #0]
mov r12, #16 ; set loop counter to 16 (=block height)
mov r4, #0 ; initialize sse = 0
loopmse
; 1st 4 pixels
ldr r5, [r0, #0x0] ; load 4 src pixels
ldr r6, [r2, #0x0] ; load 4 ref pixels
mov lr, #0 ; constant zero
usub8 r8, r5, r6 ; calculate difference
pld [r0, r1, lsl #1]
sel r7, r8, lr ; select bytes with positive difference
usub8 r9, r6, r5 ; calculate difference with reversed operands
pld [r2, r3, lsl #1]
sel r8, r9, lr ; select bytes with negative difference
; calculate partial sums
usad8 r5, r7, lr ; calculate sum of positive differences
usad8 r6, r8, lr ; calculate sum of negative differences
orr r8, r8, r7 ; differences of all 4 pixels
ldr r5, [r0, #0x4] ; load 4 src pixels
; calculate sse
uxtb16 r6, r8 ; byte (two pixels) to halfwords
uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
; 2nd 4 pixels
ldr r6, [r2, #0x4] ; load 4 ref pixels
smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
usub8 r8, r5, r6 ; calculate difference
sel r7, r8, lr ; select bytes with positive difference
usub8 r9, r6, r5 ; calculate difference with reversed operands
sel r8, r9, lr ; select bytes with negative difference
; calculate partial sums
usad8 r5, r7, lr ; calculate sum of positive differences
usad8 r6, r8, lr ; calculate sum of negative differences
orr r8, r8, r7 ; differences of all 4 pixels
ldr r5, [r0, #0x8] ; load 4 src pixels
; calculate sse
uxtb16 r6, r8 ; byte (two pixels) to halfwords
uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
; 3rd 4 pixels
ldr r6, [r2, #0x8] ; load 4 ref pixels
smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
usub8 r8, r5, r6 ; calculate difference
sel r7, r8, lr ; select bytes with positive difference
usub8 r9, r6, r5 ; calculate difference with reversed operands
sel r8, r9, lr ; select bytes with negative difference
; calculate partial sums
usad8 r5, r7, lr ; calculate sum of positive differences
usad8 r6, r8, lr ; calculate sum of negative differences
orr r8, r8, r7 ; differences of all 4 pixels
ldr r5, [r0, #0xc] ; load 4 src pixels
; calculate sse
uxtb16 r6, r8 ; byte (two pixels) to halfwords
uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
; 4th 4 pixels
ldr r6, [r2, #0xc] ; load 4 ref pixels
smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
usub8 r8, r5, r6 ; calculate difference
add r0, r0, r1 ; set src_ptr to next row
sel r7, r8, lr ; select bytes with positive difference
usub8 r9, r6, r5 ; calculate difference with reversed operands
add r2, r2, r3 ; set dst_ptr to next row
sel r8, r9, lr ; select bytes with negative difference
; calculate partial sums
usad8 r5, r7, lr ; calculate sum of positive differences
usad8 r6, r8, lr ; calculate sum of negative differences
orr r8, r8, r7 ; differences of all 4 pixels
subs r12, r12, #1 ; next row
; calculate sse
uxtb16 r6, r8 ; byte (two pixels) to halfwords
uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
bne loopmse
; return stuff
ldr r1, [sp, #28] ; get address of sse
mov r0, r4 ; return sse
str r4, [r1] ; store sse
pop {r4-r9, pc}
ENDP
END

View File

@@ -1,400 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <arm_neon.h>
#include "./aom_dsp_rtcd.h"
#include "./aom_config.h"
#include "aom/aom_integer.h"
#include "aom_ports/mem.h"
// Returns the sum of the eight signed 16-bit lanes of v_16x8.
//
// The lanes are widened pairwise (16 -> 32 -> 64 bits); the two 64-bit
// partial sums are then added as 32-bit lanes and lane 0 (the low word on a
// little-endian layout) is returned.
static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) {
  const int64x2_t partial = vpaddlq_s32(vpaddlq_s16(v_16x8));
  const int32x2_t lo_words = vreinterpret_s32_s64(vget_low_s64(partial));
  const int32x2_t hi_words = vreinterpret_s32_s64(vget_high_s64(partial));
  return vget_lane_s32(vadd_s32(lo_words, hi_words), 0);
}
// Returns the sum of the four signed 32-bit lanes of v_32x4.
//
// The lanes are widened pairwise to two 64-bit partial sums, which are then
// added as 32-bit lanes; lane 0 (the low word on a little-endian layout) is
// returned.
static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) {
  const int64x2_t partial = vpaddlq_s32(v_32x4);
  const int32x2_t lo_words = vreinterpret_s32_s64(vget_low_s64(partial));
  const int32x2_t hi_words = vreinterpret_s32_s64(vget_high_s64(partial));
  return vget_lane_s32(vadd_s32(lo_words, hi_words), 0);
}
// Accumulates, over a w x h block, the sum of the differences (a - b) into
// *sum and the sum of the squared differences into *sse. Pixels are consumed
// eight at a time, so w is walked in steps of 8.
//
// Overflow constraint: the running sum lives in eight int16 lanes, and each
// lane receives w * h / 8 differences, each in [-255, 255]. With
// w * h == 1024 a lane peaks at 128 * 255 = 32640, which still fits in
// int16; anything larger can wrap. Callers must therefore keep w * h <= 1024
// and split bigger blocks into bands (as the 64-wide and 64-high variance
// functions in this file do).
static void variance_neon_w8(const uint8_t *a, int a_stride, const uint8_t *b,
                             int b_stride, int w, int h, uint32_t *sse,
                             int *sum) {
  int i, j;
  int16x8_t v_sum = vdupq_n_s16(0);
  int32x4_t v_sse_lo = vdupq_n_s32(0);
  int32x4_t v_sse_hi = vdupq_n_s32(0);

  for (i = 0; i < h; ++i) {
    for (j = 0; j < w; j += 8) {
      const uint8x8_t v_a = vld1_u8(&a[j]);
      const uint8x8_t v_b = vld1_u8(&b[j]);
      // The widening subtract wraps in uint16; viewed as int16 it is the
      // signed difference a - b.
      const uint16x8_t v_diff = vsubl_u8(v_a, v_b);
      const int16x8_t sv_diff = vreinterpretq_s16_u16(v_diff);
      v_sum = vaddq_s16(v_sum, sv_diff);
      // Squares are accumulated in 32 bits, low and high halves separately.
      v_sse_lo =
          vmlal_s16(v_sse_lo, vget_low_s16(sv_diff), vget_low_s16(sv_diff));
      v_sse_hi =
          vmlal_s16(v_sse_hi, vget_high_s16(sv_diff), vget_high_s16(sv_diff));
    }
    a += a_stride;
    b += b_stride;
  }

  *sum = horizontal_add_s16x8(v_sum);
  *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi));
}
// Reports, for an 8x8 block, the sum of squared differences between a and b
// in *sse and the sum of differences in *sum.
void aom_get8x8var_neon(const uint8_t *a, int a_stride, const uint8_t *b,
                        int b_stride, unsigned int *sse, int *sum) {
  enum { kWidth = 8, kHeight = 8 };
  variance_neon_w8(a, a_stride, b, b_stride, kWidth, kHeight, sse, sum);
}
// Reports, for a 16x16 block, the sum of squared differences between a and b
// in *sse and the sum of differences in *sum.
void aom_get16x16var_neon(const uint8_t *a, int a_stride, const uint8_t *b,
                          int b_stride, unsigned int *sse, int *sum) {
  enum { kWidth = 16, kHeight = 16 };
  variance_neon_w8(a, a_stride, b, b_stride, kWidth, kHeight, sse, sum);
}
// Variance of an 8x8 block: stores the sum of squared differences in *sse
// and returns sse - sum^2 / 64.
unsigned int aom_variance8x8_neon(const uint8_t *a, int a_stride,
                                  const uint8_t *b, int b_stride,
                                  unsigned int *sse) {
  int diff_sum;
  int64_t mean_term;

  variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, &diff_sum);
  // 8 * 8 = 64 pixels, hence the shift by 6.
  mean_term = ((int64_t)diff_sum * diff_sum) >> 6;
  return (unsigned int)(*sse - mean_term);
}
// Variance of a 16x16 block: stores the sum of squared differences in *sse
// and returns sse - sum^2 / 256.
unsigned int aom_variance16x16_neon(const uint8_t *a, int a_stride,
                                    const uint8_t *b, int b_stride,
                                    unsigned int *sse) {
  int diff_sum;
  int64_t mean_term;

  variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, &diff_sum);
  // 16 * 16 = 256 pixels, hence the shift by 8.
  mean_term = ((int64_t)diff_sum * diff_sum) >> 8;
  return (unsigned int)(*sse - mean_term);
}
// Variance of a 32x32 block: stores the sum of squared differences in *sse
// and returns sse - sum^2 / 1024.
unsigned int aom_variance32x32_neon(const uint8_t *a, int a_stride,
                                    const uint8_t *b, int b_stride,
                                    unsigned int *sse) {
  int diff_sum;
  int64_t mean_term;

  variance_neon_w8(a, a_stride, b, b_stride, 32, 32, sse, &diff_sum);
  // 32 * 32 = 1024 pixels, hence the shift by 10.
  mean_term = ((int64_t)diff_sum * diff_sum) >> 10;
  return (unsigned int)(*sse - mean_term);
}
// Variance of a block 32 pixels wide and 64 rows high. The block is handled
// as two 32x32 halves (rows 0-31 and 32-63) so each call to the helper stays
// at 1024 pixels; the partial sums are combined afterwards. Stores the total
// sum of squared differences in *sse and returns sse - sum^2 / 2048.
unsigned int aom_variance32x64_neon(const uint8_t *a, int a_stride,
                                    const uint8_t *b, int b_stride,
                                    unsigned int *sse) {
  enum { kBandRows = 32, kNumBands = 2 };
  uint32_t total_sse = 0;
  int total_sum = 0;
  int band;

  for (band = 0; band < kNumBands; ++band) {
    uint32_t band_sse;
    int band_sum;
    variance_neon_w8(a + (kBandRows * band * a_stride), a_stride,
                     b + (kBandRows * band * b_stride), b_stride, 32,
                     kBandRows, &band_sse, &band_sum);
    total_sse += band_sse;
    total_sum += band_sum;
  }

  *sse = total_sse;
  // 32 * 64 = 2048 pixels, hence the shift by 11.
  return (unsigned int)(*sse - (((int64_t)total_sum * total_sum) >> 11));
}
// Variance of a block 64 pixels wide and 32 rows high. The block is handled
// as two bands of 64x16 (rows 0-15 and 16-31) so each call to the helper
// stays at 1024 pixels; the partial sums are combined afterwards. Stores the
// total sum of squared differences in *sse and returns sse - sum^2 / 2048.
unsigned int aom_variance64x32_neon(const uint8_t *a, int a_stride,
                                    const uint8_t *b, int b_stride,
                                    unsigned int *sse) {
  enum { kBandRows = 16, kNumBands = 2 };
  uint32_t total_sse = 0;
  int total_sum = 0;
  int band;

  for (band = 0; band < kNumBands; ++band) {
    uint32_t band_sse;
    int band_sum;
    variance_neon_w8(a + (kBandRows * band * a_stride), a_stride,
                     b + (kBandRows * band * b_stride), b_stride, 64,
                     kBandRows, &band_sse, &band_sum);
    total_sse += band_sse;
    total_sum += band_sum;
  }

  *sse = total_sse;
  // 64 * 32 = 2048 pixels, hence the shift by 11.
  return (unsigned int)(*sse - (((int64_t)total_sum * total_sum) >> 11));
}
// Variance of a 64x64 block. The block is handled as four bands of 64x16 so
// each call to the helper stays at 1024 pixels; the partial sums are
// combined afterwards. Stores the total sum of squared differences in *sse
// and returns sse - sum^2 / 4096.
unsigned int aom_variance64x64_neon(const uint8_t *a, int a_stride,
                                    const uint8_t *b, int b_stride,
                                    unsigned int *sse) {
  enum { kBandRows = 16, kNumBands = 4 };
  uint32_t total_sse = 0;
  int total_sum = 0;
  int band;

  for (band = 0; band < kNumBands; ++band) {
    uint32_t band_sse;
    int band_sum;
    variance_neon_w8(a + (kBandRows * band * a_stride), a_stride,
                     b + (kBandRows * band * b_stride), b_stride, 64,
                     kBandRows, &band_sse, &band_sum);
    total_sse += band_sse;
    total_sum += band_sum;
  }

  *sse = total_sse;
  // 64 * 64 = 4096 pixels, hence the shift by 12.
  return (unsigned int)(*sse - (((int64_t)total_sum * total_sum) >> 12));
}
// Variance of a block 16 pixels wide and 8 rows high.
//
// Each of the 4 loop iterations loads two 16-pixel rows from both inputs.
// q8s32 accumulates the sum of the differences (pairwise add-accumulate of
// the 16-bit differences), while q9s32 and q10s32 accumulate the squared
// differences of the low and high halves of each 8-lane difference vector.
//
// The sum of squared differences is stored through `sse`; the return value
// is sse - ((sum * sum) >> 7), where only the low 32 bits of the 64-bit
// product sum * sum are shifted.
unsigned int aom_variance16x8_neon(const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride, unsigned int *sse) {
int i;
int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
uint32x2_t d0u32, d10u32;
int64x1_t d0s64, d1s64;
uint8x16_t q0u8, q1u8, q2u8, q3u8;
uint16x8_t q11u16, q12u16, q13u16, q14u16;
int32x4_t q8s32, q9s32, q10s32;
int64x2_t q0s64, q1s64, q5s64;
q8s32 = vdupq_n_s32(0);
q9s32 = vdupq_n_s32(0);
q10s32 = vdupq_n_s32(0);
for (i = 0; i < 4; i++) {
// Two source rows, then two reference rows; prefetch the row that follows.
q0u8 = vld1q_u8(src_ptr);
src_ptr += source_stride;
q1u8 = vld1q_u8(src_ptr);
src_ptr += source_stride;
__builtin_prefetch(src_ptr);
q2u8 = vld1q_u8(ref_ptr);
ref_ptr += recon_stride;
q3u8 = vld1q_u8(ref_ptr);
ref_ptr += recon_stride;
__builtin_prefetch(ref_ptr);
// Widening subtracts: four vectors of eight 16-bit differences.
q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));
d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
q10s32 = vmlal_s16(q10s32, d27s16, d27s16);
d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
}
// Horizontal reduction: d0s64 holds the sum, d1s64 the sum of squares.
q10s32 = vaddq_s32(q10s32, q9s32);
q0s64 = vpaddlq_s32(q8s32);
q1s64 = vpaddlq_s32(q10s32);
d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
// sum * sum as a 64-bit product; its low word is shifted by 7 below.
q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), vreinterpret_s32_s64(d0s64));
vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
return vget_lane_u32(d0u32, 0);
}
// Variance of a block 8 pixels wide and 16 rows high.
//
// Each of the 8 loop iterations loads two 8-pixel rows from both inputs.
// q8s32 accumulates the sum of the differences (pairwise add-accumulate of
// the 16-bit differences), while q9s32 and q10s32 accumulate the squared
// differences of the low and high halves of each difference vector.
//
// The sum of squared differences is stored through `sse`; the return value
// is sse - ((sum * sum) >> 7), where only the low 32 bits of the 64-bit
// product sum * sum are shifted.
unsigned int aom_variance8x16_neon(const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride, unsigned int *sse) {
int i;
uint8x8_t d0u8, d2u8, d4u8, d6u8;
int16x4_t d22s16, d23s16, d24s16, d25s16;
uint32x2_t d0u32, d10u32;
int64x1_t d0s64, d1s64;
uint16x8_t q11u16, q12u16;
int32x4_t q8s32, q9s32, q10s32;
int64x2_t q0s64, q1s64, q5s64;
q8s32 = vdupq_n_s32(0);
q9s32 = vdupq_n_s32(0);
q10s32 = vdupq_n_s32(0);
for (i = 0; i < 8; i++) {
// Two source rows, then two reference rows; prefetch the row that follows.
d0u8 = vld1_u8(src_ptr);
src_ptr += source_stride;
d2u8 = vld1_u8(src_ptr);
src_ptr += source_stride;
__builtin_prefetch(src_ptr);
d4u8 = vld1_u8(ref_ptr);
ref_ptr += recon_stride;
d6u8 = vld1_u8(ref_ptr);
ref_ptr += recon_stride;
__builtin_prefetch(ref_ptr);
// Widening subtracts: two vectors of eight 16-bit differences.
q11u16 = vsubl_u8(d0u8, d4u8);
q12u16 = vsubl_u8(d2u8, d6u8);
d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
}
// Horizontal reduction: d0s64 holds the sum, d1s64 the sum of squares.
q10s32 = vaddq_s32(q10s32, q9s32);
q0s64 = vpaddlq_s32(q8s32);
q1s64 = vpaddlq_s32(q10s32);
d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
// sum * sum as a 64-bit product; its low word is shifted by 7 below.
q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), vreinterpret_s32_s64(d0s64));
vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
return vget_lane_u32(d0u32, 0);
}
// Sum of squared differences between two 16x16 blocks.
//
// Each of the 8 loop iterations loads two 16-pixel rows from both inputs and
// accumulates the squared 16-bit differences into four 32-bit accumulators
// (q7s32..q10s32), which are reduced to a single value after the loop. The
// result is stored through `sse` and is also the return value.
unsigned int aom_mse16x16_neon(const unsigned char *src_ptr, int source_stride,
const unsigned char *ref_ptr, int recon_stride,
unsigned int *sse) {
int i;
int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
int64x1_t d0s64;
uint8x16_t q0u8, q1u8, q2u8, q3u8;
int32x4_t q7s32, q8s32, q9s32, q10s32;
uint16x8_t q11u16, q12u16, q13u16, q14u16;
int64x2_t q1s64;
q7s32 = vdupq_n_s32(0);
q8s32 = vdupq_n_s32(0);
q9s32 = vdupq_n_s32(0);
q10s32 = vdupq_n_s32(0);
for (i = 0; i < 8; i++) { // mse16x16_neon_loop
q0u8 = vld1q_u8(src_ptr);
src_ptr += source_stride;
q1u8 = vld1q_u8(src_ptr);
src_ptr += source_stride;
q2u8 = vld1q_u8(ref_ptr);
ref_ptr += recon_stride;
q3u8 = vld1q_u8(ref_ptr);
ref_ptr += recon_stride;
// Widening subtracts: four vectors of eight 16-bit differences.
q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));
d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
q7s32 = vmlal_s16(q7s32, d22s16, d22s16);
q8s32 = vmlal_s16(q8s32, d23s16, d23s16);
d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
q7s32 = vmlal_s16(q7s32, d26s16, d26s16);
q8s32 = vmlal_s16(q8s32, d27s16, d27s16);
d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
}
// Fold the four accumulators together, then add across lanes.
q7s32 = vaddq_s32(q7s32, q8s32);
q9s32 = vaddq_s32(q9s32, q10s32);
q10s32 = vaddq_s32(q7s32, q9s32);
q1s64 = vpaddlq_s32(q10s32);
d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d0s64), 0);
return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0);
}
// Sum of squared differences over four rows of four pixels.
//
// Four rows are loaded from each input, the rows are subtracted with
// widening, and only the low four lanes of each difference vector are
// squared and summed; the total is returned.
//
// NOTE(review): every vld1_u8 reads 8 bytes per row although only the first
// 4 contribute to the result, so both buffers are presumably required to be
// readable 4 bytes past each 4-pixel row -- confirm against the callers.
unsigned int aom_get4x4sse_cs_neon(const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride) {
int16x4_t d22s16, d24s16, d26s16, d28s16;
int64x1_t d0s64;
uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
int32x4_t q7s32, q8s32, q9s32, q10s32;
uint16x8_t q11u16, q12u16, q13u16, q14u16;
int64x2_t q1s64;
d0u8 = vld1_u8(src_ptr);
src_ptr += source_stride;
d4u8 = vld1_u8(ref_ptr);
ref_ptr += recon_stride;
d1u8 = vld1_u8(src_ptr);
src_ptr += source_stride;
d5u8 = vld1_u8(ref_ptr);
ref_ptr += recon_stride;
d2u8 = vld1_u8(src_ptr);
src_ptr += source_stride;
d6u8 = vld1_u8(ref_ptr);
ref_ptr += recon_stride;
d3u8 = vld1_u8(src_ptr);
src_ptr += source_stride;
d7u8 = vld1_u8(ref_ptr);
ref_ptr += recon_stride;
q11u16 = vsubl_u8(d0u8, d4u8);
q12u16 = vsubl_u8(d1u8, d5u8);
q13u16 = vsubl_u8(d2u8, d6u8);
q14u16 = vsubl_u8(d3u8, d7u8);
// Keep only the low four lanes (the 4 pixels of each row).
d22s16 = vget_low_s16(vreinterpretq_s16_u16(q11u16));
d24s16 = vget_low_s16(vreinterpretq_s16_u16(q12u16));
d26s16 = vget_low_s16(vreinterpretq_s16_u16(q13u16));
d28s16 = vget_low_s16(vreinterpretq_s16_u16(q14u16));
q7s32 = vmull_s16(d22s16, d22s16);
q8s32 = vmull_s16(d24s16, d24s16);
q9s32 = vmull_s16(d26s16, d26s16);
q10s32 = vmull_s16(d28s16, d28s16);
// Add the four rows of squares, then add across lanes.
q7s32 = vaddq_s32(q7s32, q8s32);
q9s32 = vaddq_s32(q9s32, q10s32);
q9s32 = vaddq_s32(q7s32, q9s32);
q1s64 = vpaddlq_s32(q9s32);
d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0);
}

View File

@@ -1,232 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <stdlib.h>
#include "./aom_dsp_rtcd.h"
#include "aom_ports/mem.h"
// Average of the 64 pixels of an 8x8 block: the pixel total is passed to
// ROUND_POWER_OF_TWO with a shift of 6.
unsigned int aom_avg_8x8_c(const uint8_t *src, int stride) {
  const uint8_t *line = src;
  int total = 0;
  int row;

  for (row = 0; row < 8; ++row) {
    int col;
    for (col = 0; col < 8; ++col) {
      total += line[col];
    }
    line += stride;
  }
  return ROUND_POWER_OF_TWO(total, 6);
}
// Average of the 16 pixels of a 4x4 block: the pixel total is passed to
// ROUND_POWER_OF_TWO with a shift of 4.
unsigned int aom_avg_4x4_c(const uint8_t *src, int stride) {
  const uint8_t *line = src;
  int total = 0;
  int row;

  for (row = 0; row < 4; ++row) {
    int col;
    for (col = 0; col < 4; ++col) {
      total += line[col];
    }
    line += stride;
  }
  return ROUND_POWER_OF_TWO(total, 4);
}
// One 8-point Hadamard butterfly over the samples src_diff[k * src_stride],
// k = 0..7, written to coeff[0..7] in a permuted order.
//
// Input range per the two uses in this file:
//   first pass  -  9 bit, [-255, 255]
//   second pass - 12 bit, [-2040, 2040]
// Every intermediate is stored as int16_t, exactly as wide as the inputs.
static void hadamard_col8(const int16_t *src_diff, int src_stride,
                          int16_t *coeff) {
  int16_t stage1[8];
  int16_t stage2[8];
  int k;

  // Stage 1: sum and difference of each adjacent pair of samples.
  for (k = 0; k < 4; ++k) {
    const int16_t even = src_diff[(2 * k) * src_stride];
    const int16_t odd = src_diff[(2 * k + 1) * src_stride];
    stage1[2 * k] = even + odd;
    stage1[2 * k + 1] = even - odd;
  }

  // Stage 2: the same butterfly at distance 2, within each group of four.
  for (k = 0; k < 8; k += 4) {
    stage2[k + 0] = stage1[k + 0] + stage1[k + 2];
    stage2[k + 1] = stage1[k + 1] + stage1[k + 3];
    stage2[k + 2] = stage1[k + 0] - stage1[k + 2];
    stage2[k + 3] = stage1[k + 1] - stage1[k + 3];
  }

  // Stage 3: butterfly at distance 4, scattered to the output positions.
  coeff[0] = stage2[0] + stage2[4];
  coeff[2] = stage2[0] - stage2[4];
  coeff[7] = stage2[1] + stage2[5];
  coeff[6] = stage2[1] - stage2[5];
  coeff[3] = stage2[2] + stage2[6];
  coeff[1] = stage2[2] - stage2[6];
  coeff[4] = stage2[3] + stage2[7];
  coeff[5] = stage2[3] - stage2[7];
}
// 8x8 2D Hadamard transform built from two passes of hadamard_col8.
//
// Pass 1 transforms each of the 8 input columns (input: 9 bit,
// [-255, 255]) and stores the result of column i as 8 consecutive entries
// of a scratch buffer. Pass 2 runs the same butterfly down each column of
// that scratch buffer (12 bit, [-2040, 2040]) and writes 8 consecutive
// coefficients per column (15 bit, [-16320, 16320]).
//
// The order of the output coefficients is not important, so no final
// transpose is performed.
void aom_hadamard_8x8_c(const int16_t *src_diff, int src_stride,
                        int16_t *coeff) {
  int16_t scratch[64];
  int i;

  for (i = 0; i < 8; ++i) {
    hadamard_col8(src_diff + i, src_stride, scratch + 8 * i);
  }
  for (i = 0; i < 8; ++i) {
    hadamard_col8(scratch + i, 8, coeff + 8 * i);
  }
}
// 16x16 2-D Hadamard transform: four 8x8 transforms of src_diff are written
// to consecutive 64-entry quarters of coeff, then combined in place.
void aom_hadamard_16x16_c(const int16_t *src_diff, int src_stride,
                          int16_t *coeff) {
  int blk;
  int k;

  // blk 0..3 selects the top-left, top-right, bottom-left and bottom-right
  // 8x8 quadrant. src_diff: 9 bit, dynamic range [-255, 255].
  for (blk = 0; blk < 4; ++blk) {
    const int row_off = (blk >> 1) * 8 * src_stride;
    const int col_off = (blk & 0x01) * 8;
    aom_hadamard_8x8_c(src_diff + row_off + col_off, src_stride,
                       coeff + blk * 64);
  }

  // Cross-quadrant butterfly on matching positions of the four quarters.
  // coeff: 15 bit, dynamic range [-16320, 16320]
  for (k = 0; k < 64; ++k) {
    int16_t *const c = coeff + k;
    const int16_t a0 = c[0];
    const int16_t a1 = c[64];
    const int16_t a2 = c[128];
    const int16_t a3 = c[192];

    // (a0 + a1): 16 bit, [-32640, 32640]; halving brings b0..b3 back to
    // 15 bit, [-16320, 16320].
    const int16_t b0 = (a0 + a1) >> 1;
    const int16_t b1 = (a0 - a1) >> 1;
    const int16_t b2 = (a2 + a3) >> 1;
    const int16_t b3 = (a2 - a3) >> 1;

    c[0] = b0 + b2;  // 16 bit, [-32640, 32640]
    c[64] = b1 + b3;
    c[128] = b0 - b2;
    c[192] = b1 - b3;
  }
}
// Sum of absolute values of the first `length` entries of coeff.
// coeff: 16 bits, dynamic range [-32640, 32640].
// length: value range {16, 64, 256, 1024}.
// Result: non-negative, at most 32640 * 1024 (26 bits).
int aom_satd_c(const int16_t *coeff, int length) {
  int total = 0;
  int k = 0;
  while (k < length) {
    const int v = coeff[k];
    total += (v < 0) ? -v : v;
    ++k;
  }
  return total;
}
// Integer projection onto row vectors: for each of 16 adjacent columns of
// ref, sums `height` samples (rows are ref_stride apart) and stores the sum
// divided by (height >> 1) in hbuf.
// height: value range {16, 32, 64}.
void aom_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref,
                       const int ref_stride, const int height) {
  const int norm_factor = height >> 1;
  int col;
  for (col = 0; col < 16; ++col) {
    const uint8_t *const column = ref + col;
    int16_t *const acc = &hbuf[col];
    int row = 0;
    *acc = 0;
    // Running sum: 14 bit, dynamic range [0, 16320].
    while (row < height) {
      *acc += column[row * ref_stride];
      ++row;
    }
    // After normalisation: 9 bit, dynamic range [0, 510].
    *acc /= norm_factor;
  }
}
// Returns the sum of the first `width` samples of ref.
// width: value range {16, 32, 64}.
// Result: 14 bit, dynamic range [0, 16320].
int16_t aom_int_pro_col_c(const uint8_t *ref, const int width) {
  int16_t total = 0;
  int remaining;
  for (remaining = width; remaining > 0; --remaining) total += *ref++;
  return total;
}
// Variance-style measure of the element-wise difference ref - src over
// (4 << bwl) entries: sum(diff^2) - (sum(diff)^2 >> (bwl + 2)).
// ref: [0 - 510]
// src: [0 - 510]
// bwl: {2, 3, 4}
int aom_vector_var_c(const int16_t *ref, const int16_t *src, const int bwl) {
  const int width = 4 << bwl;
  const int log2_width = bwl + 2;  // width == 1 << log2_width
  int sum_diff = 0;                // dynamic range 16 bits
  int sum_sq = 0;                  // dynamic range 26 bits
  int k;
  for (k = 0; k < width; ++k) {
    const int d = ref[k] - src[k];  // [-510, 510], 10 bits
    sum_diff += d;
    sum_sq += d * d;
  }
  // (sum_diff * sum_diff): dynamic range 31 bits.
  return sum_sq - ((sum_diff * sum_diff) >> log2_width);
}
// Scans an 8x8 block and stores in *min and *max the smallest and largest
// absolute difference between corresponding 8-bit samples of src and ref.
void aom_minmax_8x8_c(const uint8_t *src, int src_stride, const uint8_t *ref,
                      int ref_stride, int *min, int *max) {
  int r;
  // Start from the extremes of the 8-bit difference range.
  *min = 255;
  *max = 0;
  for (r = 0; r < 8; ++r) {
    const uint8_t *const s = src + r * src_stride;
    const uint8_t *const d = ref + r * ref_stride;
    int c;
    for (c = 0; c < 8; ++c) {
      const int diff = abs(s[c] - d[c]);
      if (diff < *min) *min = diff;
      if (diff > *max) *max = diff;
    }
  }
}
#if CONFIG_AOM_HIGHBITDEPTH
// High-bitdepth variant of the 8x8 average: src is converted with
// CONVERT_TO_SHORTPTR and read as 16-bit samples (stride counted in samples).
// Returns ROUND_POWER_OF_TWO(sum of the 64 samples, 6).
unsigned int aom_highbd_avg_8x8_c(const uint8_t *src, int stride) {
  const uint16_t *row = CONVERT_TO_SHORTPTR(src);
  int total = 0;
  int r;
  for (r = 0; r < 8; ++r) {
    int c;
    for (c = 0; c < 8; ++c) total += row[c];
    row += stride;
  }
  return ROUND_POWER_OF_TWO(total, 6);
}
// High-bitdepth variant of the 4x4 average: src is converted with
// CONVERT_TO_SHORTPTR and read as 16-bit samples (stride counted in samples).
// Returns ROUND_POWER_OF_TWO(sum of the 16 samples, 4).
unsigned int aom_highbd_avg_4x4_c(const uint8_t *src, int stride) {
  const uint16_t *row = CONVERT_TO_SHORTPTR(src);
  int total = 0;
  int r;
  for (r = 0; r < 4; ++r) {
    int c;
    for (c = 0; c < 4; ++c) total += row[c];
    row += stride;
  }
  return ROUND_POWER_OF_TWO(total, 4);
}
// High-bitdepth min/max of absolute differences over an 8x8 block.
// s8 and d8 are converted with CONVERT_TO_SHORTPTR and read as 16-bit
// samples; p and dp are the row pitches in samples. On return *min and *max
// hold the smallest and largest |s - d| found in the block.
void aom_highbd_minmax_8x8_c(const uint8_t *s8, int p, const uint8_t *d8,
                             int dp, int *min, int *max) {
  int i, j;
  const uint16_t *s = CONVERT_TO_SHORTPTR(s8);
  const uint16_t *d = CONVERT_TO_SHORTPTR(d8);
  // Two uint16_t samples can differ by up to 65535. Seeding the minimum with
  // the 8-bit limit (255) would clamp the reported minimum to 255 whenever
  // every difference in the block is larger than that.
  *min = 65535;
  *max = 0;
  for (i = 0; i < 8; ++i, s += p, d += dp) {
    for (j = 0; j < 8; ++j) {
      const int diff = abs(s[j] - d[j]);
      *min = diff < *min ? diff : *min;
      *max = diff > *max ? diff : *max;
    }
  }
}
#endif // CONFIG_AOM_HIGHBITDEPTH

View File

@@ -1,240 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifndef AOM_DSP_BITREADER_H_
#define AOM_DSP_BITREADER_H_
#include <assert.h>
#include <limits.h>
#include "./aom_config.h"
#if CONFIG_EC_ADAPT && !CONFIG_EC_MULTISYMBOL
#error "CONFIG_EC_ADAPT is enabled without enabling CONFIG_EC_MULTISYMBOL."
#endif
#include "aom/aomdx.h"
#include "aom/aom_integer.h"
#if CONFIG_ANS
#include "aom_dsp/ansreader.h"
#elif CONFIG_DAALA_EC
#include "aom_dsp/daalaboolreader.h"
#else
#include "aom_dsp/dkboolreader.h"
#endif
#include "aom_dsp/prob.h"
#include "av1/common/odintrin.h"
// Accounting plumbing. With CONFIG_ACCOUNTING enabled, ACCT_STR_PARAM adds a
// trailing `const char *acct_str` parameter to a function signature and
// ACCT_STR_ARG(s) forwards a label as the matching trailing argument. With
// accounting disabled both expand to nothing, so the label written at a call
// site is dropped by the preprocessor.
#if CONFIG_ACCOUNTING
#include "av1/common/accounting.h"
#define ACCT_STR_NAME acct_str
#define ACCT_STR_PARAM , const char *ACCT_STR_NAME
#define ACCT_STR_ARG(s) , s
#else
#define ACCT_STR_PARAM
#define ACCT_STR_ARG(s)
#endif
// Caller-facing entry points. Each macro forwards to the underscore-suffixed
// inline function of the same name, passing its last argument through
// ACCT_STR_ARG() so that it only reaches the callee when accounting is on.
#define aom_read(r, prob, ACCT_STR_NAME) \
aom_read_(r, prob ACCT_STR_ARG(ACCT_STR_NAME))
#define aom_read_bit(r, ACCT_STR_NAME) \
aom_read_bit_(r ACCT_STR_ARG(ACCT_STR_NAME))
#define aom_read_tree(r, tree, probs, ACCT_STR_NAME) \
aom_read_tree_(r, tree, probs ACCT_STR_ARG(ACCT_STR_NAME))
#define aom_read_literal(r, bits, ACCT_STR_NAME) \
aom_read_literal_(r, bits ACCT_STR_ARG(ACCT_STR_NAME))
#define aom_read_tree_bits(r, tree, probs, ACCT_STR_NAME) \
aom_read_tree_bits_(r, tree, probs ACCT_STR_ARG(ACCT_STR_NAME))
#define aom_read_symbol(r, cdf, nsymbs, ACCT_STR_NAME) \
aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
#ifdef __cplusplus
extern "C" {
#endif
// aom_reader names the state struct of whichever entropy decoder was chosen
// at configure time: ANS, the Daala entropy coder, or otherwise the dk bool
// reader.
#if CONFIG_ANS
typedef struct AnsDecoder aom_reader;
#elif CONFIG_DAALA_EC
typedef struct daala_reader aom_reader;
#else
typedef struct aom_dk_reader aom_reader;
#endif
// Initialises reader r over `size` bytes at `buffer` by delegating to the
// configured backend and returns that backend's result.
// decrypt_cb / decrypt_state are forwarded only to the dk bool reader; the
// ANS and Daala paths ignore them.
static INLINE int aom_reader_init(aom_reader *r, const uint8_t *buffer,
size_t size, aom_decrypt_cb decrypt_cb,
void *decrypt_state) {
#if CONFIG_ANS
(void)decrypt_cb;
(void)decrypt_state;
// Debug-only guard on the size before it is handed to ans_read_init().
assert(size <= INT_MAX);
return ans_read_init(r, buffer, size);
#elif CONFIG_DAALA_EC
(void)decrypt_cb;
(void)decrypt_state;
return aom_daala_reader_init(r, buffer, size);
#else
return aom_dk_reader_init(r, buffer, size, decrypt_cb, decrypt_state);
#endif
}
// Returns the end-of-data pointer reported by the Daala or dk backend.
// Not supported with ANS: that path asserts in debug builds and returns NULL.
static INLINE const uint8_t *aom_reader_find_end(aom_reader *r) {
#if CONFIG_ANS
(void)r;
assert(0 && "Use the raw buffer size with ANS");
return NULL;
#elif CONFIG_DAALA_EC
return aom_daala_reader_find_end(r);
#else
return aom_dk_reader_find_end(r);
#endif
}
// Returns the error status reported by the configured backend for reader r.
static INLINE int aom_reader_has_error(aom_reader *r) {
#if CONFIG_ANS
return ans_reader_has_error(r);
#elif CONFIG_DAALA_EC
return aom_daala_reader_has_error(r);
#else
return aom_dk_reader_has_error(r);
#endif
}
// Returns the position in the bit reader in bits.
// Unimplemented for ANS: that path asserts in debug builds and returns 0.
static INLINE uint32_t aom_reader_tell(const aom_reader *r) {
#if CONFIG_ANS
(void)r;
assert(0 && "aom_reader_tell() is unimplemented for ANS");
return 0;
#elif CONFIG_DAALA_EC
return aom_daala_reader_tell(r);
#else
return aom_dk_reader_tell(r);
#endif
}
// Returns the position in the bit reader in 1/8th bits.
// Unimplemented for ANS: that path asserts in debug builds and returns 0.
static INLINE uint32_t aom_reader_tell_frac(const aom_reader *r) {
#if CONFIG_ANS
(void)r;
assert(0 && "aom_reader_tell_frac() is unimplemented for ANS");
return 0;
#elif CONFIG_DAALA_EC
return aom_daala_reader_tell_frac(r);
#else
return aom_dk_reader_tell_frac(r);
#endif
}
#if CONFIG_ACCOUNTING
// If the reader has an accounting object attached, records under the given
// label the number of 1/8th bits consumed since the previous record, then
// remembers the current position for the next call. Does nothing when
// r->accounting is NULL.
static INLINE void aom_process_accounting(const aom_reader *r ACCT_STR_PARAM) {
if (r->accounting != NULL) {
uint32_t tell_frac;
tell_frac = aom_reader_tell_frac(r);
aom_accounting_record(r->accounting, ACCT_STR_NAME,
tell_frac - r->accounting->last_tell_frac);
r->accounting->last_tell_frac = tell_frac;
}
}
#endif
// Decodes one binary symbol with probability `prob` through the configured
// backend (uabs_read, aom_daala_read or aom_dk_read) and returns it.
// With CONFIG_ACCOUNTING, a non-NULL label triggers aom_process_accounting().
static INLINE int aom_read_(aom_reader *r, int prob ACCT_STR_PARAM) {
int ret;
#if CONFIG_ANS
ret = uabs_read(r, prob);
#elif CONFIG_DAALA_EC
ret = aom_daala_read(r, prob);
#else
ret = aom_dk_read(r, prob);
#endif
#if CONFIG_ACCOUNTING
if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
#endif
return ret;
}
// Decodes one equiprobable bit. ANS uses its dedicated uabs_read_bit(); the
// other backends go through aom_read() with probability 128.
// The inner aom_read() is given a NULL label so that accounting, when
// enabled, is recorded once here rather than also in the nested call.
static INLINE int aom_read_bit_(aom_reader *r ACCT_STR_PARAM) {
int ret;
#if CONFIG_ANS
ret = uabs_read_bit(r); // Non trivial optimization at half probability
#else
ret = aom_read(r, 128, NULL); // aom_prob_half
#endif
#if CONFIG_ACCOUNTING
if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
#endif
return ret;
}
// Reads `bits` equiprobable bits, most significant first, and returns them
// assembled into an integer. Returns 0 when bits <= 0.
// The per-bit reads pass a NULL label; accounting (if enabled) is recorded
// once for the whole literal.
static INLINE int aom_read_literal_(aom_reader *r, int bits ACCT_STR_PARAM) {
  int value = 0;
  int shift = bits;
  while (--shift >= 0) {
    const int b = aom_read_bit(r, NULL);
    value |= b << shift;
  }
#if CONFIG_ACCOUNTING
  if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
#endif
  return value;
}
// Walks `tree` from index 0, reading one binary decision per step with
// probability probs[node >> 1], until an entry that is <= 0 is reached.
// Returns the negation of that entry.
static INLINE int aom_read_tree_bits_(aom_reader *r, const aom_tree_index *tree,
                                      const aom_prob *probs ACCT_STR_PARAM) {
  aom_tree_index node = 0;
  do {
    // NULL label: accounting is recorded once below, not per decision.
    const int branch = aom_read(r, probs[node >> 1], NULL);
    node = tree[node + branch];
  } while (node > 0);
#if CONFIG_ACCOUNTING
  if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
#endif
  return -node;
}
// Decodes a tree-coded value: through daala_read_tree_bits() when the Daala
// entropy coder is configured, otherwise through aom_read_tree_bits().
// The nested aom_read_tree_bits() call gets a NULL label so accounting, when
// enabled, is recorded once at this level.
static INLINE int aom_read_tree_(aom_reader *r, const aom_tree_index *tree,
const aom_prob *probs ACCT_STR_PARAM) {
int ret;
#if CONFIG_DAALA_EC
ret = daala_read_tree_bits(r, tree, probs);
#else
ret = aom_read_tree_bits(r, tree, probs, NULL);
#endif
#if CONFIG_ACCOUNTING
if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
#endif
return ret;
}
#if CONFIG_EC_MULTISYMBOL
// Decodes one multi-symbol value against `cdf` using rANS or the Daala
// entropy coder; any other configuration is a compile-time error.
// With CONFIG_EC_ADAPT the cdf is updated with the decoded symbol, which is
// why cdf is taken as a non-const pointer. nsymbs is unused on the rANS path.
static INLINE int aom_read_symbol_(aom_reader *r, aom_cdf_prob *cdf,
int nsymbs ACCT_STR_PARAM) {
int ret;
#if CONFIG_RANS
(void)nsymbs;
ret = rans_read(r, cdf);
#elif CONFIG_DAALA_EC
ret = daala_read_symbol(r, cdf, nsymbs);
#else
#error \
"CONFIG_EC_MULTISYMBOL is selected without a valid backing entropy " \
"coder. Enable daala_ec or ans for a valid configuration."
#endif
#if CONFIG_EC_ADAPT
update_cdf(cdf, ret, nsymbs);
#endif
#if CONFIG_ACCOUNTING
if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
#endif
return ret;
}
#endif // CONFIG_EC_MULTISYMBOL
#ifdef __cplusplus
} // extern "C"
#endif
#endif // AOM_DSP_BITREADER_H_

View File

@@ -1,47 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include "./aom_config.h"
#include "./bitreader_buffer.h"
// Returns how many bytes have been touched so far: the current bit offset
// rounded up to a whole number of bytes.
size_t aom_rb_bytes_read(struct aom_read_bit_buffer *rb) {
  const size_t bits_consumed = rb->bit_offset;
  return (bits_consumed + 7) / 8;
}
// Reads the next bit (most significant bit of each byte first) and advances
// the bit offset by one.
// When the read position is at or past bit_buffer_end, the offset is left
// unchanged, the error handler is invoked if one is installed, and 0 is
// returned.
int aom_rb_read_bit(struct aom_read_bit_buffer *rb) {
  const size_t off = rb->bit_offset;
  const size_t p = off >> 3;               // byte index
  const int q = 7 - (int)(off & 0x7);      // shift of the wanted bit
  if (rb->bit_buffer + p < rb->bit_buffer_end) {
    const int bit = (rb->bit_buffer[p] >> q) & 1;
    rb->bit_offset = off + 1;
    return bit;
  }
  // Out of data. The handler is optional: a buffer set up without one must
  // not crash here through a NULL function pointer.
  if (rb->error_handler) rb->error_handler(rb->error_handler_data);
  return 0;
}
// Reads `bits` bits, most significant first, and returns them as an integer.
// Returns 0 when bits <= 0.
int aom_rb_read_literal(struct aom_read_bit_buffer *rb, int bits) {
  int value = 0;
  int shift = bits;
  while (--shift >= 0) {
    value |= aom_rb_read_bit(rb) << shift;
  }
  return value;
}
// Reads a sign-magnitude value: a `bits`-wide magnitude followed by one sign
// bit, where a set sign bit negates the magnitude.
int aom_rb_read_signed_literal(struct aom_read_bit_buffer *rb, int bits) {
  const int magnitude = aom_rb_read_literal(rb, bits);
  const int negative = aom_rb_read_bit(rb);
  if (negative) return -magnitude;
  return magnitude;
}
// Reads bits + 1 bits and returns them interpreted as a signed value whose
// sign is the first bit read.
int aom_rb_read_inv_signed_literal(struct aom_read_bit_buffer *rb, int bits) {
// Number of unused high bits in an `unsigned` once bits + 1 bits are held.
const int nbits = sizeof(unsigned) * 8 - bits - 1;
// Move the field up so that its top bit occupies the top bit of `unsigned`.
const unsigned value = (unsigned)aom_rb_read_literal(rb, bits + 1) << nbits;
// Shift back down as a signed int to replicate that top bit.
// NOTE(review): relies on >> of a negative int being an arithmetic shift,
// which is implementation-defined in C.
return ((int)value) >> nbits;
}

View File

@@ -1,48 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifndef AOM_DSP_BITREADER_BUFFER_H_
#define AOM_DSP_BITREADER_BUFFER_H_
#include <limits.h>
#include "aom/aom_integer.h"
#ifdef __cplusplus
extern "C" {
#endif
// Callback invoked by the bit reader when a read runs past the end of the
// buffer; `data` is the reader's error_handler_data.
typedef void (*aom_rb_error_handler)(void *data);
// State of a raw (non entropy-coded) bit reader.
struct aom_read_bit_buffer {
const uint8_t *bit_buffer;      // first byte of the data
const uint8_t *bit_buffer_end;  // one past the last readable byte
size_t bit_offset;              // position of the next bit, counted from bit_buffer
void *error_handler_data;       // passed verbatim to error_handler
aom_rb_error_handler error_handler;  // called on an out-of-data read
};
// Bit offset rounded up to whole bytes.
size_t aom_rb_bytes_read(struct aom_read_bit_buffer *rb);
// Reads one bit; on an out-of-data read calls the error handler and returns 0.
int aom_rb_read_bit(struct aom_read_bit_buffer *rb);
// Reads `bits` bits, most significant first.
int aom_rb_read_literal(struct aom_read_bit_buffer *rb, int bits);
// Reads a `bits`-wide magnitude followed by a sign bit.
int aom_rb_read_signed_literal(struct aom_read_bit_buffer *rb, int bits);
// Reads bits + 1 bits as a signed value whose sign is the first bit read.
int aom_rb_read_inv_signed_literal(struct aom_read_bit_buffer *rb, int bits);
#ifdef __cplusplus
} // extern "C"
#endif
#endif // AOM_DSP_BITREADER_BUFFER_H_

View File

@@ -1,179 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifndef AOM_DSP_BITWRITER_H_
#define AOM_DSP_BITWRITER_H_
#include <assert.h>
#include "./aom_config.h"
#if CONFIG_EC_ADAPT && !CONFIG_EC_MULTISYMBOL
#error "CONFIG_EC_ADAPT is enabled without enabling CONFIG_EC_MULTISYMBOL"
#endif
#if CONFIG_ANS
#include "aom_dsp/buf_ans.h"
#elif CONFIG_DAALA_EC
#include "aom_dsp/daalaboolwriter.h"
#else
#include "aom_dsp/dkboolwriter.h"
#endif
#include "aom_dsp/prob.h"
#if CONFIG_RD_DEBUG
#include "av1/encoder/cost.h"
#endif
#ifdef __cplusplus
extern "C" {
#endif
// aom_writer names the state struct of whichever entropy encoder was chosen
// at configure time: buffered ANS, the Daala entropy coder, or otherwise the
// dk bool writer.
#if CONFIG_ANS
typedef struct BufAnsCoder aom_writer;
#elif CONFIG_DAALA_EC
typedef struct daala_writer aom_writer;
#else
typedef struct aom_dk_writer aom_writer;
#endif
// Accumulator filled by the *_record() writers below when CONFIG_RD_DEBUG is
// enabled.
typedef struct TOKEN_STATS { int64_t cost; } TOKEN_STATS;
// Starts encoding into `buffer` using the Daala or dk backend.
// Not usable with ANS: that path only asserts, because buf_ans needs a
// different start-up sequence.
static INLINE void aom_start_encode(aom_writer *bc, uint8_t *buffer) {
#if CONFIG_ANS
(void)bc;
(void)buffer;
assert(0 && "buf_ans requires a more complicated startup procedure");
#elif CONFIG_DAALA_EC
aom_daala_start_encode(bc, buffer);
#else
aom_dk_start_encode(bc, buffer);
#endif
}
// Finishes encoding using the Daala or dk backend.
// Not usable with ANS: that path only asserts, because buf_ans needs a
// different shutdown sequence.
static INLINE void aom_stop_encode(aom_writer *bc) {
#if CONFIG_ANS
(void)bc;
assert(0 && "buf_ans requires a more complicated shutdown procedure");
#elif CONFIG_DAALA_EC
aom_daala_stop_encode(bc);
#else
aom_dk_stop_encode(bc);
#endif
}
// Encodes one binary symbol `bit` with the given probability through the
// configured backend (buf_uabs_write, aom_daala_write or aom_dk_write).
static INLINE void aom_write(aom_writer *br, int bit, int probability) {
#if CONFIG_ANS
buf_uabs_write(br, bit, probability);
#elif CONFIG_DAALA_EC
aom_daala_write(br, bit, probability);
#else
aom_dk_write(br, bit, probability);
#endif
}
// Same as aom_write(), and with CONFIG_RD_DEBUG additionally adds
// av1_cost_bit(probability, bit) to token_stats->cost. Without RD debug,
// token_stats is unused.
static INLINE void aom_write_record(aom_writer *br, int bit, int probability,
TOKEN_STATS *token_stats) {
aom_write(br, bit, probability);
#if CONFIG_RD_DEBUG
token_stats->cost += av1_cost_bit(probability, bit);
#else
(void)token_stats;
#endif
}
// Encodes one equiprobable bit (probability 128).
static INLINE void aom_write_bit(aom_writer *w, int bit) {
aom_write(w, bit, 128); // aom_prob_half
}
// Encodes one equiprobable bit (probability 128) through aom_write_record(),
// so its cost is accumulated in token_stats when CONFIG_RD_DEBUG is enabled.
static INLINE void aom_write_bit_record(aom_writer *w, int bit,
TOKEN_STATS *token_stats) {
aom_write_record(w, bit, 128, token_stats); // aom_prob_half
}
// Writes the low `bits` bits of `data` as equiprobable bits, most
// significant first. Writes nothing when bits <= 0.
static INLINE void aom_write_literal(aom_writer *w, int data, int bits) {
  int shift = bits;
  while (--shift >= 0) {
    aom_write_bit(w, (data >> shift) & 1);
  }
}
// Encodes the `len` low bits of `bits` (most significant first) as a path
// through tree `tr` starting at index i. Each decision is coded with
// probs[node >> 1] and the walk continues at tr[node + bit].
// At least one decision is always written, so len must be >= 1.
static INLINE void aom_write_tree_bits(aom_writer *w, const aom_tree_index *tr,
                                       const aom_prob *probs, int bits, int len,
                                       aom_tree_index i) {
  aom_tree_index node = i;
  int remaining = len;
  do {
    int branch;
    --remaining;
    branch = (bits >> remaining) & 1;
    aom_write(w, branch, probs[node >> 1]);
    node = tr[node + branch];
  } while (remaining);
}
// Same tree walk as aom_write_tree_bits(), but every decision goes through
// aom_write_record() so that its cost can be accumulated in token_stats.
// At least one decision is always written, so len must be >= 1.
static INLINE void aom_write_tree_bits_record(aom_writer *w,
                                              const aom_tree_index *tr,
                                              const aom_prob *probs, int bits,
                                              int len, aom_tree_index i,
                                              TOKEN_STATS *token_stats) {
  aom_tree_index node = i;
  int remaining = len;
  do {
    int branch;
    --remaining;
    branch = (bits >> remaining) & 1;
    aom_write_record(w, branch, probs[node >> 1], token_stats);
    node = tr[node + branch];
  } while (remaining);
}
// Encodes a tree-coded value: through daala_write_tree_bits() when the Daala
// entropy coder is configured, otherwise through aom_write_tree_bits().
static INLINE void aom_write_tree(aom_writer *w, const aom_tree_index *tree,
const aom_prob *probs, int bits, int len,
aom_tree_index i) {
#if CONFIG_DAALA_EC
daala_write_tree_bits(w, tree, probs, bits, len, i);
#else
aom_write_tree_bits(w, tree, probs, bits, len, i);
#endif
}
// Recording variant of aom_write_tree().
// NOTE(review): on the Daala path token_stats is ignored, so no cost is
// accumulated there; only the non-Daala path records through
// aom_write_tree_bits_record().
static INLINE void aom_write_tree_record(aom_writer *w,
const aom_tree_index *tree,
const aom_prob *probs, int bits,
int len, aom_tree_index i,
TOKEN_STATS *token_stats) {
#if CONFIG_DAALA_EC
(void)token_stats;
daala_write_tree_bits(w, tree, probs, bits, len, i);
#else
aom_write_tree_bits_record(w, tree, probs, bits, len, i, token_stats);
#endif
}
#if CONFIG_EC_MULTISYMBOL
// Encodes symbol `symb` against `cdf` using rANS or the Daala entropy coder;
// any other configuration is a compile-time error.
// With CONFIG_EC_ADAPT the cdf is updated with the written symbol, which is
// why cdf is taken as a non-const pointer. nsymbs is unused on the rANS path.
static INLINE void aom_write_symbol(aom_writer *w, int symb, aom_cdf_prob *cdf,
int nsymbs) {
#if CONFIG_RANS
struct rans_sym s;
(void)nsymbs;
assert(cdf);
// cdf is cumulative: the symbol's start is the previous entry (0 for the
// first symbol) and its probability is the step up to its own entry.
s.cum_prob = symb > 0 ? cdf[symb - 1] : 0;
s.prob = cdf[symb] - s.cum_prob;
buf_rans_write(w, &s);
#elif CONFIG_DAALA_EC
daala_write_symbol(w, symb, cdf, nsymbs);
#else
#error \
"CONFIG_EC_MULTISYMBOL is selected without a valid backing entropy " \
"coder. Enable daala_ec or ans for a valid configuration."
#endif
#if CONFIG_EC_ADAPT
update_cdf(cdf, symb, nsymbs);
#endif
}
#endif // CONFIG_EC_MULTISYMBOL
#ifdef __cplusplus
} // extern "C"
#endif
#endif // AOM_DSP_BITWRITER_H_

View File

@@ -1,43 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <limits.h>
#include <stdlib.h>
#include "./aom_config.h"
#include "./bitwriter_buffer.h"
// Returns the number of bytes touched so far: the bit offset rounded up to a
// whole number of bytes.
size_t aom_wb_bytes_written(const struct aom_write_bit_buffer *wb) {
  const size_t whole_bytes = wb->bit_offset / CHAR_BIT;
  const int has_partial = (wb->bit_offset % CHAR_BIT) > 0;
  return whole_bytes + has_partial;
}
// Writes one bit at the current offset (most significant bit of each byte
// first) and advances the offset by one.
void aom_wb_write_bit(struct aom_write_bit_buffer *wb, int bit) {
  const int off = (int)wb->bit_offset;
  const int byte_idx = off / CHAR_BIT;
  const int shift = CHAR_BIT - 1 - off % CHAR_BIT;
  uint8_t *const byte = &wb->bit_buffer[byte_idx];
  if (shift != CHAR_BIT - 1) {
    // Inside a byte that already has bits: replace only this bit.
    *byte &= ~(1 << shift);
    *byte |= bit << shift;
  } else {
    // First bit of a byte: overwrite the whole byte, which also clears the
    // seven bits that have not been written yet.
    *byte = bit << shift;
  }
  wb->bit_offset = off + 1;
}
// Writes the low `bits` bits of `data`, most significant first. Writes
// nothing when bits <= 0.
void aom_wb_write_literal(struct aom_write_bit_buffer *wb, int data, int bits) {
  int shift = bits;
  while (--shift >= 0) {
    aom_wb_write_bit(wb, (data >> shift) & 1);
  }
}
// Writes `data` as a bits + 1 wide literal, i.e. the low bits + 1 bits of its
// in-memory representation, so the extra leading bit carries the sign for
// values that fit in that width.
void aom_wb_write_inv_signed_literal(struct aom_write_bit_buffer *wb, int data,
int bits) {
aom_wb_write_literal(wb, data, bits + 1);
}

View File

@@ -1,39 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifndef AOM_DSP_BITWRITER_BUFFER_H_
#define AOM_DSP_BITWRITER_BUFFER_H_
#include "aom/aom_integer.h"
#ifdef __cplusplus
extern "C" {
#endif
// State of a raw (non entropy-coded) bit writer. The struct carries no
// capacity field; nothing here bounds how far bit_offset may advance.
struct aom_write_bit_buffer {
uint8_t *bit_buffer;  // destination bytes
size_t bit_offset;    // position of the next bit, counted from bit_buffer
};
// Bit offset rounded up to whole bytes.
size_t aom_wb_bytes_written(const struct aom_write_bit_buffer *wb);
// Writes one bit and advances the offset.
void aom_wb_write_bit(struct aom_write_bit_buffer *wb, int bit);
// Writes the low `bits` bits of `data`, most significant first.
void aom_wb_write_literal(struct aom_write_bit_buffer *wb, int data, int bits);
// Writes `data` as a bits + 1 wide literal.
void aom_wb_write_inv_signed_literal(struct aom_write_bit_buffer *wb, int data,
int bits);
#ifdef __cplusplus
} // extern "C"
#endif
#endif // AOM_DSP_BITWRITER_BUFFER_H_

View File

@@ -1,42 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifndef AOM_DSP_BLEND_H_
#define AOM_DSP_BLEND_H_
#include "aom_ports/mem.h"
// Various blending functions and macros.
// See also the aom_blend_* functions in aom_dsp_rtcd.h
// Note for all macros below: the alpha argument `a` appears twice in the
// expansion, so an argument with side effects would be evaluated twice.
// Alpha blending with alpha values from the range [0, 64], where 64
// means use the first input and 0 means use the second input.
#define AOM_BLEND_A64_ROUND_BITS 6
#define AOM_BLEND_A64_MAX_ALPHA (1 << AOM_BLEND_A64_ROUND_BITS) // 64
// Result: ROUND_POWER_OF_TWO(a * v0 + (64 - a) * v1, 6).
#define AOM_BLEND_A64(a, v0, v1) \
ROUND_POWER_OF_TWO((a) * (v0) + (AOM_BLEND_A64_MAX_ALPHA - (a)) * (v1), \
AOM_BLEND_A64_ROUND_BITS)
// Alpha blending with alpha values from the range [0, 256], where 256
// means use the first input and 0 means use the second input.
#define AOM_BLEND_A256_ROUND_BITS 8
#define AOM_BLEND_A256_MAX_ALPHA (1 << AOM_BLEND_A256_ROUND_BITS) // 256
// Result: ROUND_POWER_OF_TWO(a * v0 + (256 - a) * v1, 8).
#define AOM_BLEND_A256(a, v0, v1) \
ROUND_POWER_OF_TWO((a) * (v0) + (AOM_BLEND_A256_MAX_ALPHA - (a)) * (v1), \
AOM_BLEND_A256_ROUND_BITS)
// Blending by averaging.
#define AOM_BLEND_AVG(v0, v1) ROUND_POWER_OF_TWO((v0) + (v1), 1)
#endif // AOM_DSP_BLEND_H_

View File

@@ -1,71 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <assert.h>
#include "aom/aom_integer.h"
#include "aom_ports/mem.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/blend.h"
#include "./aom_dsp_rtcd.h"
// Blends src0 and src1 into dst with a horizontal mask: the alpha for every
// pixel in column j is mask[j] (see AOM_BLEND_A64), shared by all rows.
// src0 or src1 may be the same buffer as dst provided the strides match.
void aom_blend_a64_hmask_c(uint8_t *dst, uint32_t dst_stride,
                           const uint8_t *src0, uint32_t src0_stride,
                           const uint8_t *src1, uint32_t src1_stride,
                           const uint8_t *mask, int h, int w) {
  int row, col;

  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));

  assert(h >= 1);
  assert(w >= 1);
  assert(IS_POWER_OF_TWO(h));
  assert(IS_POWER_OF_TWO(w));

  for (row = 0; row < h; ++row) {
    for (col = 0; col < w; ++col) {
      const int alpha = mask[col];
      const int p0 = src0[row * src0_stride + col];
      const int p1 = src1[row * src1_stride + col];
      dst[row * dst_stride + col] = AOM_BLEND_A64(alpha, p0, p1);
    }
  }
}
#if CONFIG_AOM_HIGHBITDEPTH
// High-bitdepth horizontal-mask blend. dst_8, src0_8 and src1_8 are
// converted with CONVERT_TO_SHORTPTR and accessed as 16-bit samples; the
// alpha for every pixel in column j is mask[j]. bd is only checked by an
// assert.
void aom_highbd_blend_a64_hmask_c(uint8_t *dst_8, uint32_t dst_stride,
                                  const uint8_t *src0_8, uint32_t src0_stride,
                                  const uint8_t *src1_8, uint32_t src1_stride,
                                  const uint8_t *mask, int h, int w, int bd) {
  int row, col;
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
  const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8);
  const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8);
  (void)bd;

  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));

  assert(h >= 1);
  assert(w >= 1);
  assert(IS_POWER_OF_TWO(h));
  assert(IS_POWER_OF_TWO(w));

  assert(bd == 8 || bd == 10 || bd == 12);

  for (row = 0; row < h; ++row) {
    for (col = 0; col < w; ++col) {
      const int alpha = mask[col];
      const int p0 = src0[row * src0_stride + col];
      const int p1 = src1[row * src1_stride + col];
      dst[row * dst_stride + col] = AOM_BLEND_A64(alpha, p0, p1);
    }
  }
}
#endif // CONFIG_AOM_HIGHBITDEPTH

View File

@@ -1,145 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <assert.h>
#include "aom/aom_integer.h"
#include "aom_ports/mem.h"
#include "aom_dsp/blend.h"
#include "aom_dsp/aom_dsp_common.h"
#include "./aom_dsp_rtcd.h"
// Blending with a 2-D alpha mask. Mask values come from the range [0, 64],
// as described for AOM_BLEND_A64 in aom_dsp/blend.h. src0 or src1 can be the
// same as dst, or dst can be different from both sources.
// subw / subh select how the mask is sampled: (0, 0) uses one mask sample
// per pixel, (1, 1) averages a 2x2 group, (1, 0) averages a horizontal pair,
// and every other combination averages a vertical pair.
void aom_blend_a64_mask_c(uint8_t *dst, uint32_t dst_stride,
                          const uint8_t *src0, uint32_t src0_stride,
                          const uint8_t *src1, uint32_t src1_stride,
                          const uint8_t *mask, uint32_t mask_stride, int h,
                          int w, int subh, int subw) {
  int row, col;

  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));

  assert(h >= 1);
  assert(w >= 1);
  assert(IS_POWER_OF_TWO(h));
  assert(IS_POWER_OF_TWO(w));

  if (subw == 0 && subh == 0) {
    // Mask at output resolution.
    for (row = 0; row < h; ++row) {
      for (col = 0; col < w; ++col) {
        const int alpha = mask[row * mask_stride + col];
        const int p0 = src0[row * src0_stride + col];
        const int p1 = src1[row * src1_stride + col];
        dst[row * dst_stride + col] = AOM_BLEND_A64(alpha, p0, p1);
      }
    }
  } else if (subw == 1 && subh == 1) {
    // Mask at twice the resolution in both directions: rounded 2x2 mean.
    for (row = 0; row < h; ++row) {
      for (col = 0; col < w; ++col) {
        const int sum4 = mask[(2 * row) * mask_stride + (2 * col)] +
                         mask[(2 * row + 1) * mask_stride + (2 * col)] +
                         mask[(2 * row) * mask_stride + (2 * col + 1)] +
                         mask[(2 * row + 1) * mask_stride + (2 * col + 1)];
        const int alpha = ROUND_POWER_OF_TWO(sum4, 2);
        const int p0 = src0[row * src0_stride + col];
        const int p1 = src1[row * src1_stride + col];
        dst[row * dst_stride + col] = AOM_BLEND_A64(alpha, p0, p1);
      }
    }
  } else if (subw == 1 && subh == 0) {
    // Mask at twice the horizontal resolution: mean of a horizontal pair.
    for (row = 0; row < h; ++row) {
      for (col = 0; col < w; ++col) {
        const int alpha =
            AOM_BLEND_AVG(mask[row * mask_stride + (2 * col)],
                          mask[row * mask_stride + (2 * col + 1)]);
        const int p0 = src0[row * src0_stride + col];
        const int p1 = src1[row * src1_stride + col];
        dst[row * dst_stride + col] = AOM_BLEND_A64(alpha, p0, p1);
      }
    }
  } else {
    // Remaining case: mean of a vertical pair of mask samples.
    for (row = 0; row < h; ++row) {
      for (col = 0; col < w; ++col) {
        const int alpha =
            AOM_BLEND_AVG(mask[(2 * row) * mask_stride + col],
                          mask[(2 * row + 1) * mask_stride + col]);
        const int p0 = src0[row * src0_stride + col];
        const int p1 = src1[row * src1_stride + col];
        dst[row * dst_stride + col] = AOM_BLEND_A64(alpha, p0, p1);
      }
    }
  }
}
#if CONFIG_AOM_HIGHBITDEPTH
// High-bitdepth blend with a 2-D alpha mask. dst_8, src0_8 and src1_8 are
// converted with CONVERT_TO_SHORTPTR and accessed as 16-bit samples; the
// mask stays 8-bit. subw / subh select how the mask is sampled: (0, 0) one
// sample per pixel, (1, 1) a 2x2 mean, (1, 0) a horizontal-pair mean, and
// every other combination a vertical-pair mean. bd is only checked by an
// assert.
void aom_highbd_blend_a64_mask_c(uint8_t *dst_8, uint32_t dst_stride,
                                 const uint8_t *src0_8, uint32_t src0_stride,
                                 const uint8_t *src1_8, uint32_t src1_stride,
                                 const uint8_t *mask, uint32_t mask_stride,
                                 int h, int w, int subh, int subw, int bd) {
  int row, col;
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
  const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8);
  const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8);
  (void)bd;

  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));

  assert(h >= 1);
  assert(w >= 1);
  assert(IS_POWER_OF_TWO(h));
  assert(IS_POWER_OF_TWO(w));

  assert(bd == 8 || bd == 10 || bd == 12);

  if (subw == 0 && subh == 0) {
    // Mask at output resolution.
    for (row = 0; row < h; ++row) {
      for (col = 0; col < w; ++col) {
        const int alpha = mask[row * mask_stride + col];
        const int p0 = src0[row * src0_stride + col];
        const int p1 = src1[row * src1_stride + col];
        dst[row * dst_stride + col] = AOM_BLEND_A64(alpha, p0, p1);
      }
    }
  } else if (subw == 1 && subh == 1) {
    // Mask at twice the resolution in both directions: rounded 2x2 mean.
    for (row = 0; row < h; ++row) {
      for (col = 0; col < w; ++col) {
        const int sum4 = mask[(2 * row) * mask_stride + (2 * col)] +
                         mask[(2 * row + 1) * mask_stride + (2 * col)] +
                         mask[(2 * row) * mask_stride + (2 * col + 1)] +
                         mask[(2 * row + 1) * mask_stride + (2 * col + 1)];
        const int alpha = ROUND_POWER_OF_TWO(sum4, 2);
        const int p0 = src0[row * src0_stride + col];
        const int p1 = src1[row * src1_stride + col];
        dst[row * dst_stride + col] = AOM_BLEND_A64(alpha, p0, p1);
      }
    }
  } else if (subw == 1 && subh == 0) {
    // Mask at twice the horizontal resolution: mean of a horizontal pair.
    for (row = 0; row < h; ++row) {
      for (col = 0; col < w; ++col) {
        const int alpha =
            AOM_BLEND_AVG(mask[row * mask_stride + (2 * col)],
                          mask[row * mask_stride + (2 * col + 1)]);
        const int p0 = src0[row * src0_stride + col];
        const int p1 = src1[row * src1_stride + col];
        dst[row * dst_stride + col] = AOM_BLEND_A64(alpha, p0, p1);
      }
    }
  } else {
    // Remaining case: mean of a vertical pair of mask samples.
    for (row = 0; row < h; ++row) {
      for (col = 0; col < w; ++col) {
        const int alpha =
            AOM_BLEND_AVG(mask[(2 * row) * mask_stride + col],
                          mask[(2 * row + 1) * mask_stride + col]);
        const int p0 = src0[row * src0_stride + col];
        const int p1 = src1[row * src1_stride + col];
        dst[row * dst_stride + col] = AOM_BLEND_A64(alpha, p0, p1);
      }
    }
  }
}
#endif // CONFIG_AOM_HIGHBITDEPTH

View File

@@ -1,73 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <assert.h>
#include "aom/aom_integer.h"
#include "aom_ports/mem.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/blend.h"
#include "./aom_dsp_rtcd.h"
// Blends src0 and src1 into dst with a vertical mask: the alpha for every
// pixel in row i is mask[i] (see AOM_BLEND_A64), shared by all columns.
// src0 or src1 may be the same buffer as dst provided the strides match.
void aom_blend_a64_vmask_c(uint8_t *dst, uint32_t dst_stride,
                           const uint8_t *src0, uint32_t src0_stride,
                           const uint8_t *src1, uint32_t src1_stride,
                           const uint8_t *mask, int h, int w) {
  int row, col;

  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));

  assert(h >= 1);
  assert(w >= 1);
  assert(IS_POWER_OF_TWO(h));
  assert(IS_POWER_OF_TWO(w));

  for (row = 0; row < h; ++row) {
    const int alpha = mask[row];
    for (col = 0; col < w; ++col) {
      const int p0 = src0[row * src0_stride + col];
      const int p1 = src1[row * src1_stride + col];
      dst[row * dst_stride + col] = AOM_BLEND_A64(alpha, p0, p1);
    }
  }
}
#if CONFIG_AOM_HIGHBITDEPTH
// High-bitdepth vertical-mask blend. dst_8, src0_8 and src1_8 are converted
// with CONVERT_TO_SHORTPTR and accessed as 16-bit samples; the alpha for
// every pixel in row i is mask[i]. bd is only checked by an assert.
void aom_highbd_blend_a64_vmask_c(uint8_t *dst_8, uint32_t dst_stride,
                                  const uint8_t *src0_8, uint32_t src0_stride,
                                  const uint8_t *src1_8, uint32_t src1_stride,
                                  const uint8_t *mask, int h, int w, int bd) {
  int row, col;
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
  const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8);
  const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8);
  (void)bd;

  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));

  assert(h >= 1);
  assert(w >= 1);
  assert(IS_POWER_OF_TWO(h));
  assert(IS_POWER_OF_TWO(w));

  assert(bd == 8 || bd == 10 || bd == 12);

  for (row = 0; row < h; ++row) {
    const int alpha = mask[row];
    for (col = 0; col < w; ++col) {
      const int p0 = src0[row * src0_stride + col];
      const int p1 = src1[row * src1_stride + col];
      dst[row * dst_stride + col] = AOM_BLEND_A64(alpha, p0, p1);
    }
  }
}
#endif // CONFIG_AOM_HIGHBITDEPTH

View File

@@ -1,42 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <string.h>
#include "aom_dsp/buf_ans.h"
#include "aom_mem/aom_mem.h"
#include "aom/internal/aom_codec_internal.h"
// Sets up coder c with room for size_hint buffered symbols and remembers
// `error` for later allocation-failure reporting.
// NOTE(review): AOM_CHECK_MEM_ERROR is defined elsewhere; presumably it
// assigns the aom_malloc() result to c->buf and reports failure through
// `error` - confirm against its definition.
void aom_buf_ans_alloc(struct BufAnsCoder *c,
struct aom_internal_error_info *error, int size_hint) {
c->error = error;
c->size = size_hint;
AOM_CHECK_MEM_ERROR(error, c->buf, aom_malloc(c->size * sizeof(*c->buf)));
// Initialize to overfull to trigger the assert in write.
c->offset = c->size + 1;
}
// Releases the symbol buffer and resets buf and size. c->offset and c->error
// are left untouched.
void aom_buf_ans_free(struct BufAnsCoder *c) {
aom_free(c->buf);
c->buf = NULL;
c->size = 0;
}
// Doubles the capacity of the symbol buffer: allocates a buffer of twice the
// current size, copies all c->size existing entries into it, frees the old
// buffer and installs the new one. c->offset is not changed.
// NOTE(review): if c->size is 0 the new size is also 0, so the buffer would
// not actually grow - confirm callers never start from an empty coder.
void aom_buf_ans_grow(struct BufAnsCoder *c) {
struct buffered_ans_symbol *new_buf = NULL;
int new_size = c->size * 2;
AOM_CHECK_MEM_ERROR(c->error, new_buf,
aom_malloc(new_size * sizeof(*new_buf)));
memcpy(new_buf, c->buf, c->size * sizeof(*c->buf));
aom_free(c->buf);
c->buf = new_buf;
c->size = new_size;
}

View File

@@ -1,112 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifndef AOM_DSP_BUF_ANS_H_
#define AOM_DSP_BUF_ANS_H_
// Buffered forward ANS writer.
// Symbols are written to the writer in forward (decode) order and serialized
// backwards due to ANS's stack like behavior.
#include <assert.h>
#include "./aom_config.h"
#include "aom/aom_integer.h"
#include "aom_dsp/ans.h"
#include "aom_dsp/answriter.h"
#ifdef __cplusplus
extern "C" {
#endif // __cplusplus
#define ANS_METHOD_UABS 0
#define ANS_METHOD_RANS 1
// One queued symbol, packed into bit-fields. The writers below fill these in
// forward order; buf_ans_flush() replays them last-to-first.
struct buffered_ans_symbol {
unsigned int method : 1; // one of ANS_METHOD_UABS or ANS_METHOD_RANS
// TODO(aconverse): Should be possible to write this in terms of start for ABS
unsigned int val_start : RANS_PROB_BITS; // Boolean value for ABS
// start in symbol cycle for Rans
// (buf_rans_write stores sym->cum_prob here)
unsigned int prob : RANS_PROB_BITS; // Probability of this symbol
};
// State of the buffered ANS writer.
struct BufAnsCoder {
// Error sink handed to AOM_CHECK_MEM_ERROR when (re)allocating buf.
struct aom_internal_error_info *error;
// Heap array of queued symbols; `size` entries are allocated.
struct buffered_ans_symbol *buf;
// Capacity of buf, in symbols.
int size;
// Index of the next free entry. The write helpers assert offset <= size and
// grow the buffer when offset == size.
int offset;
};
// Allocates storage for size_hint symbols and records `error` as the sink
// for allocation failures.
void aom_buf_ans_alloc(struct BufAnsCoder *c,
struct aom_internal_error_info *error, int size_hint);
// Frees the symbol storage of `c`.
void aom_buf_ans_free(struct BufAnsCoder *c);
// Enlarges the symbol storage of `c`; called by the write helpers below when
// the buffer is full.
void aom_buf_ans_grow(struct BufAnsCoder *c);
// Rewinds the write cursor to the first slot so the buffer can be refilled.
// The allocated storage and its capacity are left as they are.
static INLINE void buf_ans_write_reset(struct BufAnsCoder *const c) {
  c->offset = 0;
}
// Queues one binary (uABS) symbol: `val` is the bit and `prob` its
// probability. The buffer is grown first when it is exactly full.
static INLINE void buf_uabs_write(struct BufAnsCoder *const c, uint8_t val,
                                  AnsP8 prob) {
  struct buffered_ans_symbol *slot;
  assert(c->offset <= c->size);
  if (c->offset == c->size) aom_buf_ans_grow(c);
  // Take the slot only after a possible grow: c->buf may have moved.
  slot = &c->buf[c->offset++];
  slot->method = ANS_METHOD_UABS;
  slot->val_start = val;
  slot->prob = prob;
}
static INLINE void buf_rans_write(struct BufAnsCoder *const c,
const struct rans_sym *const sym) {
assert(c->offset <= c->size);
if (c->offset == c->size) {
aom_buf_ans_grow(c);
}
c->buf[c->offset].method = ANS_METHOD_RANS;
c->buf[c->offset].val_start = sym->cum_prob;
c->buf[c->offset].prob = sym->prob;
++c->offset;
}
static INLINE void buf_ans_flush(const struct BufAnsCoder *const c,
struct AnsCoder *ans) {
int offset;
for (offset = c->offset - 1; offset >= 0; --offset) {
if (c->buf[offset].method == ANS_METHOD_RANS) {
struct rans_sym sym;
sym.prob = c->buf[offset].prob;
sym.cum_prob = c->buf[offset].val_start;
rans_write(ans, &sym);
} else {
uabs_write(ans, (uint8_t)c->buf[offset].val_start,
(AnsP8)c->buf[offset].prob);
}
}
}
static INLINE void buf_uabs_write_bit(struct BufAnsCoder *c, int bit) {
buf_uabs_write(c, bit, 128);
}
// Queues the low `bits` bits of `literal`, most significant bit first, one
// buf_uabs_write_bit() call per bit. `bits` must be below 31 (asserted).
static INLINE void buf_uabs_write_literal(struct BufAnsCoder *c, int literal,
                                          int bits) {
  assert(bits < 31);
  while (bits-- > 0) {
    buf_uabs_write_bit(c, 1 & (literal >> bits));
  }
}
#ifdef __cplusplus
} // extern "C"
#endif // __cplusplus
#endif // AOM_DSP_BUF_ANS_H_

View File

@@ -1,37 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include "aom_dsp/daalaboolreader.h"
/* Binds `r` to the `size` bytes at `buffer` and initialises the entropy
 * decoder.
 *
 * The decoder is given one byte less than `size`.
 * NOTE(review): presumably this skips the trailing zero byte that
 * aom_daala_stop_encode() appends - confirm against the writer.
 *
 * An empty (or negative) size is passed to the decoder as length 0. Without
 * the clamp, size == 0 - which the NULL check below explicitly lets
 * through - would hand `-1` to od_ec_dec_init as the storage length.
 *
 * Returns 1 when `size` is non-zero but `buffer` is NULL, 0 otherwise. */
int aom_daala_reader_init(daala_reader *r, const uint8_t *buffer, int size) {
  int payload_size;
  if (size && !buffer) {
    return 1;
  }
  payload_size = size > 0 ? size - 1 : 0;
  r->buffer_end = buffer + size;
  r->buffer = buffer;
  od_ec_dec_init(&r->ec, buffer, payload_size);
#if CONFIG_ACCOUNTING
  r->accounting = NULL;
#endif
  return 0;
}
/* Returns the end pointer recorded by aom_daala_reader_init()
 * (buffer + size); the decoder state is not consulted. */
const uint8_t *aom_daala_reader_find_end(daala_reader *r) {
  const uint8_t *const end = r->buffer_end;
  return end;
}
/* Forwards to od_ec_dec_tell() on the embedded entropy decoder and returns
 * its result. */
uint32_t aom_daala_reader_tell(const daala_reader *r) {
  const uint32_t bits_used = od_ec_dec_tell(&r->ec);
  return bits_used;
}
/* Forwards to od_ec_dec_tell_frac() on the embedded entropy decoder and
 * returns its result. */
uint32_t aom_daala_reader_tell_frac(const daala_reader *r) {
  const uint32_t frac_bits_used = od_ec_dec_tell_frac(&r->ec);
  return frac_bits_used;
}

View File

@@ -1,87 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifndef AOM_DSP_DAALABOOLREADER_H_
#define AOM_DSP_DAALABOOLREADER_H_
#include "aom/aom_integer.h"
#include "aom_dsp/entdec.h"
#include "aom_dsp/prob.h"
#if CONFIG_ACCOUNTING
#include "av1/common/accounting.h"
#endif
#ifdef __cplusplus
extern "C" {
#endif
/* Reader state wrapping the Daala entropy decoder. */
struct daala_reader {
/* Start of the input, as passed to aom_daala_reader_init(). */
const uint8_t *buffer;
/* buffer + size, as computed by aom_daala_reader_init(). */
const uint8_t *buffer_end;
/* Entropy decoder state used by all read helpers below. */
od_ec_dec ec;
#if CONFIG_ACCOUNTING
/* Set to NULL by aom_daala_reader_init(). */
Accounting *accounting;
#endif
};
typedef struct daala_reader daala_reader;
/* Initialises `r` over `size` bytes at `buffer`. Returns 1 if size is
 * non-zero and buffer is NULL, otherwise 0. */
int aom_daala_reader_init(daala_reader *r, const uint8_t *buffer, int size);
/* Returns the end-of-input pointer recorded at init time. */
const uint8_t *aom_daala_reader_find_end(daala_reader *r);
/* Bit-position queries, forwarded to od_ec_dec_tell() and
 * od_ec_dec_tell_frac() respectively. */
uint32_t aom_daala_reader_tell(const daala_reader *r);
uint32_t aom_daala_reader_tell_frac(const daala_reader *r);
/* Decodes one boolean. `prob` is on a 1/256 scale: exactly 128 is read as a
 * single raw bit, any other value is rescaled to Q15 and decoded with
 * od_ec_decode_bool_q15(). */
static INLINE int aom_daala_read(daala_reader *r, int prob) {
  int p;
  if (prob == 128) return od_ec_dec_bits(&r->ec, 1, "aom_bits");
  p = ((prob << 15) + (256 - prob)) >> 8;
  return od_ec_decode_bool_q15(&r->ec, p);
}
/* Reads one bit by calling aom_daala_read() with probability 128. */
static INLINE int aom_daala_read_bit(daala_reader *r) {
  const int half = 128;
  return aom_daala_read(r, half);
}
/* Returns the error field of the embedded entropy decoder. */
static INLINE int aom_daala_reader_has_error(daala_reader *r) {
  const int decoder_error = r->ec.error;
  return decoder_error;
}
/* Decodes one leaf of the binary `tree` using multi-symbol CDF decoding.
 *
 * Starting from node 0, each pass asks tree_to_cdf() for a CDF (up to 16
 * entries) over the nodes reachable from the current node, decodes one
 * symbol from it and moves to the node that symbol maps to. The walk stops
 * at the first non-positive node index, which is returned negated. */
static INLINE int daala_read_tree_bits(daala_reader *r,
                                       const aom_tree_index *tree,
                                       const aom_prob *probs) {
  aom_tree_index node = 0;
  for (;;) {
    aom_cdf_prob cdf[16];
    aom_tree_index index[16];
    int path[16];
    int dist[16];
    const int nsymbs = tree_to_cdf(tree, probs, node, cdf, index, path, dist);
    const int symb = od_ec_decode_cdf_q15(&r->ec, cdf, nsymbs);
    OD_ASSERT(symb >= 0 && symb < nsymbs);
    node = index[symb];
    if (node <= 0) break;
  }
  return -node;
}
/* Decodes one symbol out of `nsymbs` using the Q15 cumulative distribution
 * `cdf`; a direct forward to od_ec_decode_cdf_q15(). */
static INLINE int daala_read_symbol(daala_reader *r, const aom_cdf_prob *cdf,
                                    int nsymbs) {
  const int symb = od_ec_decode_cdf_q15(&r->ec, cdf, nsymbs);
  return symb;
}
#ifdef __cplusplus
} // extern "C"
#endif
#endif

View File

@@ -1,32 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <string.h>
#include "aom_dsp/daalaboolwriter.h"
/* Starts a new encode into `source`: the output position is reset to 0 and
 * the entropy encoder is initialised with a size argument of 62025. */
void aom_daala_start_encode(daala_writer *br, uint8_t *source) {
  br->pos = 0;
  br->buffer = source;
  od_ec_enc_init(&br->ec, 62025);
}
/* Finishes the encode: the bytes produced by od_ec_enc_done() are copied to
 * br->buffer, one zero byte is appended, br->pos is set to the total length
 * (payload + 1) and the entropy encoder is cleared. */
void aom_daala_stop_encode(daala_writer *br) {
  uint32_t daala_bytes;
  unsigned char *const daala_data = od_ec_enc_done(&br->ec, &daala_bytes);
  memcpy(br->buffer, daala_data, daala_bytes);
  /* Prevent ec bitstream from being detected as a superframe marker.
     Must always be added, so that rawbits knows the exact length of the
     bitstream. */
  br->buffer[daala_bytes] = 0;
  br->pos = daala_bytes + 1;
  od_ec_enc_clear(&br->ec);
}

View File

@@ -1,90 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifndef AOM_DSP_DAALABOOLWRITER_H_
#define AOM_DSP_DAALABOOLWRITER_H_
#include "aom_dsp/entenc.h"
#include "aom_dsp/prob.h"
#ifdef __cplusplus
extern "C" {
#endif
/* Writer state wrapping the Daala entropy encoder. */
struct daala_writer {
/* Number of bytes in `buffer`; set by aom_daala_stop_encode(), reset to 0
   by aom_daala_start_encode(). */
unsigned int pos;
/* Caller-provided output buffer. */
uint8_t *buffer;
/* Entropy encoder state used by all write helpers below. */
od_ec_enc ec;
};
typedef struct daala_writer daala_writer;
/* Begins encoding into `buffer`. */
void aom_daala_start_encode(daala_writer *w, uint8_t *buffer);
/* Finishes encoding; afterwards w->pos holds the number of bytes written to
   w->buffer. */
void aom_daala_stop_encode(daala_writer *w);
/* Encodes one boolean. `prob` is on a 1/256 scale: exactly 128 is written
 * as a single raw bit, any other value is rescaled to Q15 and encoded with
 * od_ec_encode_bool_q15(). */
static INLINE void aom_daala_write(daala_writer *w, int bit, int prob) {
  int p;
  if (prob == 128) {
    od_ec_enc_bits(&w->ec, bit, 1);
    return;
  }
  p = ((prob << 15) + (256 - prob)) >> 8;
  od_ec_encode_bool_q15(&w->ec, bit, p);
}
/* Encodes the `len`-bit path `bits` through the binary `tree`, starting at
 * node `i`, using multi-symbol CDF coding.
 *
 * Each pass asks tree_to_cdf() for a CDF (up to 16 entries) over the nodes
 * reachable from the current root, picks the entry whose path is a prefix
 * of (or equal to) the remaining bits, encodes it, and strips the consumed
 * bits. When bits remain, the walk continues from the node the coded symbol
 * maps to - the same descent daala_read_tree_bits() performs with
 * `i = index[symb]`. The previous code kept restarting from the original
 * root, which would desynchronise encoder and decoder for any tree that
 * needs more than one pass. */
static INLINE void daala_write_tree_bits(daala_writer *w,
                                         const aom_tree_index *tree,
                                         const aom_prob *probs, int bits,
                                         int len, aom_tree_index i) {
  aom_tree_index root;
  root = i;
  do {
    aom_cdf_prob cdf[16];
    aom_tree_index index[16];
    int path[16];
    int dist[16];
    int nsymbs;
    int symb;
    int j;
    /* Compute the CDF of the binary tree using the given probabilities. */
    nsymbs = tree_to_cdf(tree, probs, root, cdf, index, path, dist);
    /* Find the symbol to code. */
    symb = -1;
    for (j = 0; j < nsymbs; j++) {
      /* If this symbol codes a leaf node, */
      if (index[j] <= 0) {
        /* ...it must account for all remaining bits. */
        if (len == dist[j] && path[j] == bits) {
          symb = j;
          break;
        }
      } else {
        /* An internal node matches on the leading dist[j] bits only. */
        if (len > dist[j] && path[j] == bits >> (len - dist[j])) {
          symb = j;
          break;
        }
      }
    }
    OD_ASSERT(symb != -1);
    od_ec_encode_cdf_q15(&w->ec, symb, cdf, nsymbs);
    /* Drop the bits just coded and descend into the selected subtree. */
    bits &= (1 << (len - dist[symb])) - 1;
    len -= dist[symb];
    root = index[symb];
  } while (len);
}
/* Encodes symbol `symb` out of `nsymbs` using the Q15 cumulative
 * distribution `cdf`; a direct forward to od_ec_encode_cdf_q15(). */
static INLINE void daala_write_symbol(daala_writer *w, int symb,
                                      const aom_cdf_prob *cdf, int nsymbs) {
  od_ec_enc *const enc = &w->ec;
  od_ec_encode_cdf_q15(enc, symb, cdf, nsymbs);
}
#ifdef __cplusplus
} // extern "C"
#endif
#endif

View File

@@ -1,195 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*
*/
#include <stdlib.h>
#include "aom/aom_integer.h"
/* Table of small rounding offsets read by aom_mbpost_proc_down_c(), which
 * adds an entry to the window sum before the final >> 4. That function
 * indexes it at (63 & rand()) + ((c * 17) & 127) + (r & 127), i.e. at most
 * 63 + 127 + 127 = 317, so the table must keep at least 318 entries. */
const int16_t aom_rv[] = {
8, 5, 2, 2, 8, 12, 4, 9, 8, 3, 0, 3, 9, 0, 0, 0, 8, 3, 14,
4, 10, 1, 11, 14, 1, 14, 9, 6, 12, 11, 8, 6, 10, 0, 0, 8, 9, 0,
3, 14, 8, 11, 13, 4, 2, 9, 0, 3, 9, 6, 1, 2, 3, 14, 13, 1, 8,
2, 9, 7, 3, 3, 1, 13, 13, 6, 6, 5, 2, 7, 11, 9, 11, 8, 7, 3,
2, 0, 13, 13, 14, 4, 12, 5, 12, 10, 8, 10, 13, 10, 4, 14, 4, 10, 0,
8, 11, 1, 13, 7, 7, 14, 6, 14, 13, 2, 13, 5, 4, 4, 0, 10, 0, 5,
13, 2, 12, 7, 11, 13, 8, 0, 4, 10, 7, 2, 7, 2, 2, 5, 3, 4, 7,
3, 3, 14, 14, 5, 9, 13, 3, 14, 3, 6, 3, 0, 11, 8, 13, 1, 13, 1,
12, 0, 10, 9, 7, 6, 2, 8, 5, 2, 13, 7, 1, 13, 14, 7, 6, 7, 9,
6, 10, 11, 7, 8, 7, 5, 14, 8, 4, 4, 0, 8, 7, 10, 0, 8, 14, 11,
3, 12, 5, 7, 14, 3, 14, 5, 2, 6, 11, 12, 12, 8, 0, 11, 13, 1, 2,
0, 5, 10, 14, 7, 8, 0, 4, 11, 0, 8, 0, 3, 10, 5, 8, 0, 11, 6,
7, 8, 10, 7, 13, 9, 2, 5, 1, 5, 10, 2, 4, 3, 5, 6, 10, 8, 9,
4, 11, 14, 0, 10, 0, 5, 13, 2, 12, 7, 11, 13, 8, 0, 4, 10, 7, 2,
7, 2, 2, 5, 3, 4, 7, 3, 3, 14, 14, 5, 9, 13, 3, 14, 3, 6, 3,
0, 11, 8, 13, 1, 13, 1, 12, 0, 10, 9, 7, 6, 2, 8, 5, 2, 13, 7,
1, 13, 14, 7, 6, 7, 9, 6, 10, 11, 7, 8, 7, 5, 14, 8, 4, 4, 0,
8, 7, 10, 0, 8, 14, 11, 3, 12, 5, 7, 14, 3, 14, 5, 2, 6, 11, 12,
12, 8, 0, 11, 13, 1, 2, 0, 5, 10, 14, 7, 8, 0, 4, 11, 0, 8, 0,
3, 10, 5, 8, 0, 11, 6, 7, 8, 10, 7, 13, 9, 2, 5, 1, 5, 10, 2,
4, 3, 5, 6, 10, 8, 9, 4, 11, 14, 3, 8, 3, 7, 8, 5, 11, 4, 12,
3, 11, 9, 14, 8, 14, 13, 4, 3, 1, 2, 14, 6, 5, 4, 4, 11, 4, 6,
2, 1, 5, 8, 8, 12, 13, 5, 14, 10, 12, 13, 0, 9, 5, 5, 11, 10, 13,
9, 10, 13,
};
/* Two-pass smoothing of `size` rows of `cols` pixels.
 *
 * Pass 1 (vertical) reads from src_ptr and writes to dst_ptr: a pixel is
 * replaced by a rounded blend of itself and its two neighbours above and
 * below when all four differ from it by less than f[col]; otherwise it is
 * copied unchanged.
 *
 * Pass 2 (horizontal) runs in place on the destination row with the same
 * rule applied to the two neighbours on each side. Results are held back in
 * a 4-entry ring and stored two columns late so that later columns still
 * read unfiltered neighbours.
 *
 * The caller must provide two readable source rows above and below the
 * block, and two writable destination pixels on each side of every row
 * (they are overwritten with replicated edge pixels). */
void aom_post_proc_down_and_across_mb_row_c(unsigned char *src_ptr,
                                            unsigned char *dst_ptr,
                                            int src_pixels_per_line,
                                            int dst_pixels_per_line, int cols,
                                            unsigned char *f, int size) {
  unsigned char held[4]; /* delayed results of the horizontal pass */
  int y;

  for (y = 0; y < size; ++y) {
    unsigned char *const in = src_ptr;
    unsigned char *const out = dst_ptr;
    int x;

    /* Vertical pass: src row -> dst row. */
    for (x = 0; x < cols; ++x) {
      const int limit = f[x];
      const unsigned char up2 = in[x - 2 * src_pixels_per_line];
      const unsigned char up1 = in[x - src_pixels_per_line];
      const unsigned char dn1 = in[x + src_pixels_per_line];
      const unsigned char dn2 = in[x + 2 * src_pixels_per_line];
      unsigned char pix = in[x];

      if (abs(pix - up2) < limit && abs(pix - up1) < limit &&
          abs(pix - dn1) < limit && abs(pix - dn2) < limit) {
        const unsigned char above = (up2 + up1 + 1) >> 1;
        const unsigned char below = (dn2 + dn1 + 1) >> 1;
        const unsigned char around = (above + below + 1) >> 1;
        pix = (around + pix + 1) >> 1;
      }
      out[x] = pix;
    }

    /* Horizontal pass, in place. Replicate the edge pixels into the two
     * border positions on each side first. */
    out[-1] = out[0];
    out[-2] = out[-1];
    out[cols + 1] = out[cols - 1];
    out[cols] = out[cols + 1];

    for (x = 0; x < cols; ++x) {
      const int limit = f[x];
      const unsigned char lf2 = out[x - 2];
      const unsigned char lf1 = out[x - 1];
      const unsigned char rt1 = out[x + 1];
      const unsigned char rt2 = out[x + 2];
      unsigned char pix = out[x];

      if (abs(pix - lf2) < limit && abs(pix - lf1) < limit &&
          abs(pix - rt1) < limit && abs(pix - rt2) < limit) {
        const unsigned char left = (lf2 + lf1 + 1) >> 1;
        const unsigned char right = (rt2 + rt1 + 1) >> 1;
        const unsigned char around = (left + right + 1) >> 1;
        pix = (around + pix + 1) >> 1;
      }

      held[x & 3] = pix;
      /* Store two columns late: out[x-2] is no longer needed as input. */
      if (x >= 2) out[x - 2] = held[(x - 2) & 3];
    }

    /* Flush the last two held pixels (x == cols here). */
    out[x - 2] = held[(x - 2) & 3];
    out[x - 1] = held[(x - 1) & 3];

    src_ptr += src_pixels_per_line;
    dst_ptr += dst_pixels_per_line;
  }
}
/* In-place horizontal smoothing of `rows` rows of `cols` pixels.
 *
 * For every pixel a running sum and sum of squares over the 15-pixel window
 * centred on it are maintained. When 15 * sumsq - sum * sum is below
 * `flimit` the pixel is replaced by (8 + sum + pixel) >> 4, otherwise it is
 * kept. Results are parked in a 16-entry delay line and stored 8 columns
 * late so the window always reads unfiltered pixels.
 *
 * The caller must provide 8 writable pixels before and 17 after each row;
 * they are overwritten (edge replication first, delay-line contents later).
 *
 * The whole delay line is cleared at the start of every row. Previously only
 * d[0..14] were cleared, so the store at c == 7 copied an uninitialised (or
 * stale, from the previous row) d[15] into s[-1]. */
void aom_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows,
                                 int cols, int flimit) {
  int r, c, i;
  unsigned char *s = src;
  unsigned char d[16];

  for (r = 0; r < rows; r++) {
    int sumsq = 0;
    int sum = 0;

    /* Replicate the first pixel into the left border. */
    for (i = -8; i < 0; i++) s[i] = s[0];

    /* 17 avoids valgrind warning - we buffer values in c in d
     * and only write them when we've read 8 ahead...
     */
    for (i = 0; i < 17; i++) s[i + cols] = s[cols - 1];

    /* Every slot is read (8 columns late) before or as it is refilled. */
    for (i = 0; i < 16; i++) d[i] = 0;

    /* Prime the window with s[-8..6]; the main loop slides it by one. */
    for (i = -8; i <= 6; i++) {
      sumsq += s[i] * s[i];
      sum += s[i];
    }

    for (c = 0; c < cols + 8; c++) {
      /* Slide the window: add s[c + 7], drop s[c - 8]. */
      int x = s[c + 7] - s[c - 8];
      int y = s[c + 7] + s[c - 8];

      sum += x;
      sumsq += x * y; /* == s[c+7]^2 - s[c-8]^2 */

      d[c & 15] = s[c];

      if (sumsq * 15 - sum * sum < flimit) {
        d[c & 15] = (8 + sum + s[c]) >> 4;
      }

      s[c - 8] = d[(c - 8) & 15];
    }

    s += pitch;
  }
}
/* In-place vertical smoothing of a `rows` x `cols` block, one column at a
 * time.
 *
 * For every pixel a running sum and sum of squares over the 15-pixel
 * vertical window centred on it are maintained. When
 * 15 * sumsq - sum * sum is below `flimit` the pixel is replaced by
 * (offset + sum + pixel) >> 4, where `offset` comes from aom_rv; otherwise
 * it is kept. Results are stored 8 rows late through a 16-entry delay line.
 *
 * A single rand() call per invocation selects the starting position in
 * aom_rv; each column then applies its own additional offset.
 *
 * The caller must provide 8 writable rows above and 17 below the block;
 * they are overwritten with replicated edge pixels. */
void aom_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,
                            int flimit) {
  const int16_t *const offsets_base = &aom_rv[63 & rand()];
  int col;

  for (col = 0; col < cols; ++col) {
    unsigned char *px = &dst[col];
    const int16_t *const offsets = offsets_base + ((col * 17) & 127);
    unsigned char delayed[16];
    int sum = 0;
    int sumsq = 0;
    int k;
    int row;

    /* Replicate the top pixel upwards. */
    for (k = -8; k < 0; ++k) px[k * pitch] = px[0];

    /* 17 avoids valgrind warning - we buffer values in c in d
     * and only write them when we've read 8 ahead...
     */
    for (k = 0; k < 17; ++k) px[(k + rows) * pitch] = px[(rows - 1) * pitch];

    /* Prime the window with rows -8..6; the main loop slides it by one. */
    for (k = -8; k <= 6; ++k) {
      sumsq += px[k * pitch] * px[k * pitch];
      sum += px[k * pitch];
    }

    for (row = 0; row < rows + 8; ++row) {
      /* Slide the window: add the row 7 below, drop the row 8 above. */
      sumsq += px[7 * pitch] * px[7 * pitch] - px[-8 * pitch] * px[-8 * pitch];
      sum += px[7 * pitch] - px[-8 * pitch];

      delayed[row & 15] = px[0];
      if (sumsq * 15 - sum * sum < flimit) {
        delayed[row & 15] = (offsets[row & 127] + sum + px[0]) >> 4;
      }

      /* The pixel 8 rows up has just left the window; store its result. */
      if (row >= 8) px[-8 * pitch] = delayed[(row - 8) & 15];

      px += pitch;
    }
  }
}

View File

@@ -1,108 +0,0 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <stdlib.h>
#include "./aom_config.h"
#include "aom_dsp/dkboolreader.h"
#include "aom_dsp/prob.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_ports/mem.h"
#include "aom_mem/aom_mem.h"
#include "aom_util/endian_inl.h"
// Reads one bit by calling aom_dk_read() with probability 128.
static INLINE int aom_dk_read_bit(struct aom_dk_reader *r) {
  const int half = 128;  // aom_prob_half
  return aom_dk_read(r, half);
}
// Binds `r` to the `size` bytes at `buffer`, resets the decoder state
// (value 0, count -8, range 255), stores the optional decrypt callback and
// its state, pre-fills the value window and consumes the leading marker bit.
//
// Returns 1 when `size` is non-zero but `buffer` is NULL, or when the marker
// bit reads as 1; returns 0 otherwise.
int aom_dk_reader_init(struct aom_dk_reader *r, const uint8_t *buffer,
                       size_t size, aom_decrypt_cb decrypt_cb,
                       void *decrypt_state) {
  if (size && !buffer) return 1;

  r->buffer = buffer;
  r->buffer_start = buffer;
  r->buffer_end = buffer + size;
  r->value = 0;
  r->count = -8;
  r->range = 255;
  r->decrypt_cb = decrypt_cb;
  r->decrypt_state = decrypt_state;
  aom_dk_reader_fill(r);
#if CONFIG_ACCOUNTING
  r->accounting = NULL;
#endif
  return aom_dk_read_bit(r) != 0;  // marker bit
}
// Tops up r->value with input bytes and raises r->count by the number of
// bits added. r->buffer is advanced by the number of bytes consumed.
//
// Two paths:
//  - more than BD_VALUE_SIZE bits of input remain: one bulk load of
//    sizeof(BD_VALUE) bytes, byte-swapped via HToBE64/HToBE32, of which the
//    top `bits` bits are merged into the value;
//  - otherwise: byte-at-a-time loading; when the request cannot be fully
//    satisfied (bits_over >= 0), LOTS_OF_BITS is added to count as the
//    end-of-data marker that aom_dk_reader_has_error() and
//    aom_dk_reader_tell() look for.
//
// When a decrypt callback is set, up to sizeof(r->clear_buffer) bytes are
// decrypted into r->clear_buffer first and the load reads from there.
void aom_dk_reader_fill(struct aom_dk_reader *r) {
const uint8_t *const buffer_end = r->buffer_end;
const uint8_t *buffer = r->buffer;
const uint8_t *buffer_start = buffer;
BD_VALUE value = r->value;
int count = r->count;
const size_t bytes_left = buffer_end - buffer;
const size_t bits_left = bytes_left * CHAR_BIT;
// Bit position (from the LSB) at which the next input byte is inserted.
int shift = BD_VALUE_SIZE - CHAR_BIT - (count + CHAR_BIT);
if (r->decrypt_cb) {
size_t n = AOMMIN(sizeof(r->clear_buffer), bytes_left);
r->decrypt_cb(r->decrypt_state, buffer, r->clear_buffer, (int)n);
// From here on read the decrypted copy instead of the caller's buffer.
buffer = r->clear_buffer;
buffer_start = r->clear_buffer;
}
if (bits_left > BD_VALUE_SIZE) {
// Number of whole-byte bits that fit: shift rounded down to a multiple of
// 8, plus one more byte.
const int bits = (shift & 0xfffffff8) + CHAR_BIT;
BD_VALUE nv;
BD_VALUE big_endian_values;
// memcpy rather than a pointer cast for the word load.
memcpy(&big_endian_values, buffer, sizeof(BD_VALUE));
#if SIZE_MAX == 0xffffffffffffffffULL
big_endian_values = HToBE64(big_endian_values);
#else
big_endian_values = HToBE32(big_endian_values);
#endif
// Keep only the top `bits` bits of the loaded word.
nv = big_endian_values >> (BD_VALUE_SIZE - bits);
count += bits;
buffer += (bits >> 3);
value = r->value | (nv << (shift & 0x7));
} else {
// Bits requested beyond what the input still holds (negative: enough left).
const int bits_over = (int)(shift + CHAR_BIT - (int)bits_left);
int loop_end = 0;
if (bits_over >= 0) {
// Input will be exhausted by this fill: flag it in count.
count += LOTS_OF_BITS;
loop_end = bits_over;
}
if (bits_over < 0 || bits_left) {
// Insert one byte per iteration, most significant position first.
while (shift >= loop_end) {
count += CHAR_BIT;
value |= (BD_VALUE)*buffer++ << shift;
shift -= CHAR_BIT;
}
}
}
// NOTE: Variable 'buffer' may not relate to 'r->buffer' after decryption,
// so we increase 'r->buffer' by the amount that 'buffer' moved, rather than
// assign 'buffer' to 'r->buffer'.
r->buffer += buffer - buffer_start;
r->value = value;
r->count = count;
}
// Returns the position just past the coded data that was actually consumed.
// While count is above CHAR_BIT (and below BD_VALUE_SIZE), whole bytes are
// handed back: count drops by CHAR_BIT and the read pointer steps back one
// byte per iteration. Both fields of `r` are updated.
const uint8_t *aom_dk_reader_find_end(struct aom_dk_reader *r) {
  int pending = r->count;
  const uint8_t *cursor = r->buffer;
  while (pending > CHAR_BIT && pending < BD_VALUE_SIZE) {
    pending -= CHAR_BIT;
    --cursor;
  }
  r->count = pending;
  r->buffer = cursor;
  return cursor;
}

View File

@@ -1,180 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifndef AOM_DSP_DKBOOLREADER_H_
#define AOM_DSP_DKBOOLREADER_H_
#include <assert.h>
#include <stddef.h>
#include <limits.h>
#include "./aom_config.h"
#if CONFIG_BITSTREAM_DEBUG
#include <assert.h>
#include <stdio.h>
#include "aom_util/debug_util.h"
#endif // CONFIG_BITSTREAM_DEBUG
#include "aom_ports/mem.h"
#include "aom/aomdx.h"
#include "aom/aom_integer.h"
#include "aom_dsp/prob.h"
#if CONFIG_ACCOUNTING
#include "av1/common/accounting.h"
#endif
#ifdef __cplusplus
extern "C" {
#endif
typedef size_t BD_VALUE;
#define BD_VALUE_SIZE ((int)sizeof(BD_VALUE) * CHAR_BIT)
// This is meant to be a large, positive constant that can still be efficiently
// loaded as an immediate (on platforms like ARM, for example).
// Even relatively modest values like 100 would work fine.
#define LOTS_OF_BITS 0x40000000
// State of the boolean (range) decoder.
struct aom_dk_reader {
// Be careful when reordering this struct, it may impact the cache negatively.
// Window of buffered input bits; decoding compares against its top byte.
BD_VALUE value;
// Current range; initialised to 255.
unsigned int range;
// Number of buffered bits in `value` minus 8; LOTS_OF_BITS is added once the
// input has been exhausted.
int count;
// First byte of the input (set once at init).
const uint8_t *buffer_start;
// One past the last input byte.
const uint8_t *buffer_end;
// Next input byte to load.
const uint8_t *buffer;
// Optional decrypt callback and its opaque state; when set, fills read from
// clear_buffer.
aom_decrypt_cb decrypt_cb;
void *decrypt_state;
// Scratch space receiving decrypted bytes during a fill.
uint8_t clear_buffer[sizeof(BD_VALUE) + 1];
#if CONFIG_ACCOUNTING
// Set to NULL by aom_dk_reader_init().
Accounting *accounting;
#endif
};
// Initialises `r` over `size` bytes at `buffer` and reads the marker bit.
// Returns non-zero if size is non-zero with a NULL buffer, or if the marker
// bit is set.
int aom_dk_reader_init(struct aom_dk_reader *r, const uint8_t *buffer,
size_t size, aom_decrypt_cb decrypt_cb,
void *decrypt_state);
// Loads more input into r->value; called by aom_dk_read() when count < 0.
void aom_dk_reader_fill(struct aom_dk_reader *r);
// Hands back unconsumed whole bytes and returns the resulting read pointer.
const uint8_t *aom_dk_reader_find_end(struct aom_dk_reader *r);
// Returns the number of bits consumed so far: the bits loaded from the input
// minus those still buffered (count + CHAR_BIT). The LOTS_OF_BITS
// end-of-data marker is removed from count before the subtraction.
static INLINE uint32_t aom_dk_reader_tell(const struct aom_dk_reader *r) {
  uint32_t loaded_bits;
  int pending;
  assert(r->buffer >= r->buffer_start);
  loaded_bits = (r->buffer - r->buffer_start) * CHAR_BIT;
  pending = r->count;
  if (!(pending < LOTS_OF_BITS)) pending -= LOTS_OF_BITS;
  return loaded_bits - (pending + CHAR_BIT);
}
/*The resolution of fractional-precision bit usage measurements, i.e.,
3 => 1/8th bits.*/
#define DK_BITRES (3)
// Returns the bit position in units of 1/(1 << DK_BITRES) bits: the whole-bit
// count from aom_dk_reader_tell() scaled up, minus a DK_BITRES-bit fraction
// derived from the current range by repeated squaring (one fraction bit per
// iteration).
static INLINE uint32_t aom_dk_reader_tell_frac(const struct aom_dk_reader *r) {
  const uint32_t scaled_bits = aom_dk_reader_tell(r) << DK_BITRES;
  uint32_t range = r->range;
  int frac = 0;
  int step;
  for (step = 0; step < DK_BITRES; ++step) {
    int carry;
    range = range * range >> 7;
    carry = (int)(range >> 8);
    frac = frac << 1 | carry;
    range >>= carry;
  }
  return scaled_bits - frac;
}
// Reports whether decoding has run past the end of the input.
//
// 'count' holds the number of bits buffered in 'value' minus 8: the top byte
// belongs to the algorithm and the rest is waiting to be shifted into it.
// Each byte loaded adds 8 to count; when the input runs out, LOTS_OF_BITS is
// added as well. A count strictly between BD_VALUE_SIZE and LOTS_OF_BITS can
// therefore only arise once bits beyond the end of the stream have been
// consumed.
//
// Returns 1 in that case, 0 otherwise.
static INLINE int aom_dk_reader_has_error(struct aom_dk_reader *r) {
  const int pending = r->count;
  return pending > BD_VALUE_SIZE && pending < LOTS_OF_BITS;
}
// Decodes one boolean with `prob` on a 1/256 scale and returns the bit read
// (0 or 1).
//
// The current range is divided at `split`; the decoded bit is 1 when the
// value window is at or above the split (aligned to the window's top byte),
// in which case the split is subtracted from both range and value. Range and
// value are then shifted left by aom_norm[range] and count is reduced by the
// same amount.
//
// With CONFIG_BITSTREAM_DEBUG the decoded bit and probability are checked
// against the next entry popped from the bitstream debug queue; a mismatch
// is printed to stderr and asserts.
static INLINE int aom_dk_read(struct aom_dk_reader *r, int prob) {
unsigned int bit = 0;
BD_VALUE value;
BD_VALUE bigsplit;
int count;
unsigned int range;
unsigned int split = (r->range * prob + (256 - prob)) >> CHAR_BIT;
// Refill before decoding when the buffered bits have run out.
if (r->count < 0) aom_dk_reader_fill(r);
value = r->value;
count = r->count;
// Move the split up to the top byte so it compares directly against value.
bigsplit = (BD_VALUE)split << (BD_VALUE_SIZE - CHAR_BIT);
range = split;
if (value >= bigsplit) {
range = r->range - split;
value = value - bigsplit;
bit = 1;
}
{
// Renormalise: shift amount is looked up in aom_norm by the new range.
register int shift = aom_norm[range];
range <<= shift;
value <<= shift;
count -= shift;
}
r->value = value;
r->count = count;
r->range = range;
#if CONFIG_BITSTREAM_DEBUG
{
int ref_bit, ref_prob;
const int queue_r = bitstream_queue_get_read();
const int frame_idx = bitstream_queue_get_frame_read();
bitstream_queue_pop(&ref_bit, &ref_prob);
if (prob != ref_prob) {
fprintf(
stderr,
"\n *** prob error, frame_idx_r %d prob %d ref_prob %d queue_r %d\n",
frame_idx, prob, ref_prob, queue_r);
assert(0);
}
if ((int)bit != ref_bit) {
fprintf(stderr, "\n *** bit error, frame_idx_r %d bit %d ref_bit %d\n",
frame_idx, bit, ref_bit);
assert(0);
}
}
#endif // CONFIG_BITSTREAM_DEBUG
return bit;
}
#ifdef __cplusplus
} // extern "C"
#endif
#endif // AOM_DSP_DKBOOLREADER_H_

View File

@@ -1,44 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#include <assert.h>
#include "./dkboolwriter.h"
/* Writes one bit by calling aom_dk_write() with probability 128. */
static INLINE void aom_dk_write_bit(aom_dk_writer *w, int bit) {
  const int half = 128;  // aom_prob_half
  aom_dk_write(w, bit, half);
}
/* Starts a new encode into `source`: resets the coder state (lowvalue 0,
 * range 255, count -24, position 0) and writes a single 0 bit at the start
 * of the stream. */
void aom_dk_start_encode(aom_dk_writer *br, uint8_t *source) {
  br->buffer = source;
  br->pos = 0;
  br->lowvalue = 0;
  br->range = 255;
  br->count = -24;
  aom_dk_write_bit(br, 0);
}
/* Finishes the encode by writing 32 zero bits. With CONFIG_BITSTREAM_DEBUG
 * these padding bits are kept out of the debug queue. If the last byte
 * written has its top three bits equal to 110, one extra zero byte is
 * appended. */
void aom_dk_stop_encode(aom_dk_writer *br) {
  int remaining = 32;

#if CONFIG_BITSTREAM_DEBUG
  bitstream_queue_set_skip_write(1);
#endif  // CONFIG_BITSTREAM_DEBUG

  while (remaining-- > 0) aom_dk_write_bit(br, 0);

#if CONFIG_BITSTREAM_DEBUG
  bitstream_queue_set_skip_write(0);
#endif  // CONFIG_BITSTREAM_DEBUG

  // Ensure there's no ambiguous collision with any index marker bytes
  if ((br->buffer[br->pos - 1] & 0xe0) == 0xc0) {
    br->buffer[br->pos] = 0;
    br->pos++;
  }
}

View File

@@ -1,104 +0,0 @@
/*
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifndef AOM_DSP_DKBOOLWRITER_H_
#define AOM_DSP_DKBOOLWRITER_H_
#include "./aom_config.h"
#if CONFIG_BITSTREAM_DEBUG
#include <stdio.h>
#include "aom_util/debug_util.h"
#endif // CONFIG_BITSTREAM_DEBUG
#include "aom_dsp/prob.h"
#include "aom_ports/mem.h"
#ifdef __cplusplus
extern "C" {
#endif
/* State of the boolean (range) encoder. */
typedef struct aom_dk_writer {
/* Low end of the current coding interval; kept to 24 bits after each byte
   is emitted. */
unsigned int lowvalue;
/* Current range; initialised to 255. */
unsigned int range;
/* Bit counter; starts at -24, a byte is emitted whenever it reaches >= 0. */
int count;
/* Number of bytes written to `buffer` so far. */
unsigned int pos;
/* Caller-provided output buffer. */
uint8_t *buffer;
} aom_dk_writer;
/* Resets `bc` to write into `buffer` and emits the initial 0 bit. */
void aom_dk_start_encode(aom_dk_writer *bc, uint8_t *buffer);
/* Flushes the coder by writing 32 zero bits; bc->pos then holds the number
   of bytes produced. */
void aom_dk_stop_encode(aom_dk_writer *bc);
/* Encodes one boolean `bit` with `probability` on a 1/256 scale.
 *
 * The range is divided at `split`; a 0 keeps the lower part, a 1 moves
 * lowvalue up by `split` and keeps the rest. Range is then renormalised by
 * aom_norm[range] bits, and whenever `count` reaches zero or above one
 * output byte is appended to br->buffer. A carry out of lowvalue is
 * propagated backwards through any trailing 0xff bytes already written.
 *
 * With CONFIG_BITSTREAM_DEBUG every (bit, probability) pair is also pushed
 * onto the bitstream debug queue. */
static INLINE void aom_dk_write(aom_dk_writer *br, int bit, int probability) {
unsigned int split;
int count = br->count;
unsigned int range = br->range;
unsigned int lowvalue = br->lowvalue;
register int shift;
#if CONFIG_BITSTREAM_DEBUG
// int queue_r = 0;
// int frame_idx_r = 0;
// int queue_w = bitstream_queue_get_write();
// int frame_idx_w = bitstream_queue_get_frame_write();
// if (frame_idx_w == frame_idx_r && queue_w == queue_r) {
// fprintf(stderr, "\n *** bitstream queue at frame_idx_w %d queue_w %d\n",
// frame_idx_w, queue_w);
// }
bitstream_queue_push(bit, probability);
#endif // CONFIG_BITSTREAM_DEBUG
split = 1 + (((range - 1) * probability) >> 8);
range = split;
if (bit) {
lowvalue += split;
range = br->range - split;
}
/* Renormalise: shift amount is looked up in aom_norm by the new range. */
shift = aom_norm[range];
range <<= shift;
count += shift;
if (count >= 0) {
/* Number of bits to shift so that a full byte sits at bits 24..31. */
int offset = shift - count;
/* Carry out of the top bit: add 1 to the bytes already written. */
if ((lowvalue << (offset - 1)) & 0x80000000) {
int x = br->pos - 1;
while (x >= 0 && br->buffer[x] == 0xff) {
br->buffer[x] = 0;
x--;
}
/* NOTE(review): if every byte written so far were 0xff, x would be -1
   here; presumably the leading 0 bit written by aom_dk_start_encode()
   rules that out - confirm. */
br->buffer[x] += 1;
}
br->buffer[br->pos++] = (lowvalue >> (24 - offset));
lowvalue <<= offset;
shift = count;
/* Keep only the 24 bits that have not been emitted yet. */
lowvalue &= 0xffffff;
count -= 8;
}
lowvalue <<= shift;
br->count = count;
br->lowvalue = lowvalue;
br->range = range;
}
#ifdef __cplusplus
} // extern "C"
#endif
#endif // AOM_DSP_DKBOOLWRITER_H_

View File

@@ -1,80 +0,0 @@
/*
* Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifdef HAVE_CONFIG_H
#include "./config.h"
#endif
#include "aom_dsp/entcode.h"
/*CDFs for uniform probability distributions of small sizes (2 through 16,
inclusive).*/
// clang-format off
/*Layout: the CDFs are stored back to back, the one for n symbols holding n
   cumulative values that end at 32768.
  The CDF for n symbols therefore starts at index n*(n - 1)/2 - 1
   (0 for n = 2, 2 for n = 3, 5 for n = 4, ...), for 2 + 3 + ... + 16 = 135
   entries in total.*/
const uint16_t OD_UNIFORM_CDFS_Q15[135] = {
16384, 32768,
10923, 21845, 32768,
8192, 16384, 24576, 32768,
6554, 13107, 19661, 26214, 32768,
5461, 10923, 16384, 21845, 27307, 32768,
4681, 9362, 14043, 18725, 23406, 28087, 32768,
4096, 8192, 12288, 16384, 20480, 24576, 28672, 32768,
3641, 7282, 10923, 14564, 18204, 21845, 25486, 29127, 32768,
3277, 6554, 9830, 13107, 16384, 19661, 22938, 26214, 29491, 32768,
2979, 5958, 8937, 11916, 14895, 17873, 20852, 23831, 26810, 29789, 32768,
2731, 5461, 8192, 10923, 13653, 16384, 19115, 21845, 24576, 27307, 30037,
32768,
2521, 5041, 7562, 10082, 12603, 15124, 17644, 20165, 22686, 25206, 27727,
30247, 32768,
2341, 4681, 7022, 9362, 11703, 14043, 16384, 18725, 21065, 23406, 25746,
28087, 30427, 32768,
2185, 4369, 6554, 8738, 10923, 13107, 15292, 17476, 19661, 21845, 24030,
26214, 28399, 30583, 32768,
2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432, 20480, 22528,
24576, 26624, 28672, 30720, 32768
};
// clang-format on
/*Given the current total integer number of bits used and the current value of
rng, computes the fraction number of bits used to OD_BITRES precision.
This is used by od_ec_enc_tell_frac() and od_ec_dec_tell_frac().
nbits_total: The number of whole bits currently used, i.e., the value
returned by od_ec_enc_tell() or od_ec_dec_tell().
rng: The current value of rng from either the encoder or decoder state.
Return: The number of bits scaled by 2**OD_BITRES.
This will always be slightly larger than the exact value (e.g., all
rounding error is in the positive direction).*/
uint32_t od_ec_tell_frac(uint32_t nbits_total, uint32_t rng) {
  /*The fractional part accounts for the non-integral number of bits still
     held in the coder state: it is the worst-case number of bits of val that
     must be coded so the value stays inside the range whatever follows.
    It depends on rng only, never on val (which the decoder does not even
     track), although the real size after od_ec_enc_done() may be 1 bit
     smaller when rng is a power of two and the matching trailing bits of val
     are all zero.
    Tracking that special case would let a symbol of probability 1/(1 << n)
     occasionally appear to cost more than n bits.
    This is also why a freshly initialized encoder or decoder reports 1 bit
     used.*/
  const uint32_t scaled_bits = nbits_total << OD_BITRES;
  int frac = 0;
  int step;
  /*One fraction bit per iteration: square rng and note whether it reached
     the next power of two.*/
  for (step = 0; step < OD_BITRES; ++step) {
    int carry;
    rng = rng * rng >> 15;
    carry = (int)(rng >> 16);
    frac = frac << 1 | carry;
    rng >>= carry;
  }
  return scaled_bits - frac;
}

View File

@@ -1,105 +0,0 @@
/*
* Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#if !defined(_entcode_H)
#define _entcode_H (1)
#include <limits.h>
#include <stddef.h>
#include "av1/common/odintrin.h"
/*Set this flag 1 to enable a "reduced overhead" version of the entropy coder.
This uses a partition function that more accurately follows the input
probability estimates at the expense of some additional CPU cost (though
still an order of magnitude less than a full division).
In classic arithmetic coding, the partition function maps a value x in the
range [0, ft] to a value y in [0, r] with 0 < ft <= r via
y = x*r/ft.
Any deviation from this value increases coding inefficiency.
To avoid divisions, we require ft <= r < 2*ft (enforcing it by shifting up
ft if necessary), and replace that function with
y = x + OD_MINI(x, r - ft).
This counts values of x smaller than r - ft double compared to values larger
than r - ft, which over-estimates the probability of symbols at the start of
the alphabet, and under-estimates the probability of symbols at the end of
the alphabet.
The overall coding inefficiency assuming accurate probability models and
independent symbols is in the 1% range, which is similar to that of CABAC.
To reduce overhead even further, we split this into two cases:
1) r - ft > ft - (r - ft).
That is, we have more values of x that are double-counted than
single-counted.
In this case, we still double-count the first 2*r - 3*ft values of x, but
after that we alternate between single-counting and double-counting for
the rest.
2) r - ft < ft - (r - ft).
That is, we have more values of x that are single-counted than
double-counted.
In this case, we alternate between single-counting and double-counting for
the first 2*(r - ft) values of x, and single-count the rest.
For two equiprobable symbols in different places in the alphabet, this
reduces the maximum ratio of over-estimation to under-estimation from 2:1
for the previous partition function to either 4:3 or 3:2 (for each of the
two cases above, respectively), assuming symbol probabilities significantly
greater than 1/32768.
That reduces the worst-case per-symbol overhead from 1 bit to 0.58 bits.
The resulting function is
e = OD_MAXI(2*r - 3*ft, 0);
y = x + OD_MINI(x, e) + OD_MINI(OD_MAXI(x - e, 0) >> 1, r - ft).
Here, e is a value that is greater than 0 in case 1, and 0 in case 2.
This function is about 3 times as expensive to evaluate as the high-overhead
version, but still an order of magnitude cheaper than a division, since it
is composed only of very simple operations.
Because we want to fit in 16-bit registers and must use unsigned values to do
so, we use saturating subtraction to enforce the maximums with 0.
Enabling this reduces the measured overhead in ectest from 0.805% to 0.621%
(vs. 0.022% for the division-based partition function with r much greater
than ft).
It improves performance on ntt-short-1 by about 0.3%.*/
#define OD_EC_REDUCED_OVERHEAD (1)
/*OPT: od_ec_window must be at least 32 bits, but if you have fast arithmetic
on a larger type, you can speed up the decoder by using it here.*/
typedef uint32_t od_ec_window;
/*The width of od_ec_window in bits.*/
#define OD_EC_WINDOW_SIZE ((int)sizeof(od_ec_window) * CHAR_BIT)
/*Unsigned subtraction with unsigned saturation.
This implementation of the macro is intentionally chosen to increase the
number of common subexpressions in the reduced-overhead partition function.
This matters for C code, but it would not for hardware with a saturating
subtraction instruction.
Note that the argument a appears twice in the expansion, so it must not have
side effects.*/
#define OD_SUBSATU(a, b) ((a)-OD_MINI(a, b))
/*The number of bits to use for the range-coded part of unsigned integers.*/
#define OD_EC_UINT_BITS (4)
/*The resolution of fractional-precision bit usage measurements, i.e.,
3 => 1/8th bits.*/
#define OD_BITRES (3)
/*Backing storage for OD_UNIFORM_CDF_Q15(); the table is defined elsewhere.
135 == 2 + 3 + ... + 16, which is consistent with one CDF of n entries for
each n from 2 to 16 stored back to back (TODO confirm against the
definition).*/
extern const uint16_t OD_UNIFORM_CDFS_Q15[135];
/*Returns a Q15 CDF for a uniform probability distribution of the given size.
n: The size of the distribution.
This must be at least 2, and no more than 16.
The offset n*(n - 1)/2 - 1 equals 2 + 3 + ... + (n - 1), so n == 2 maps to
the start of the table.
n is expanded twice, so it must not have side effects.*/
#define OD_UNIFORM_CDF_Q15(n) (OD_UNIFORM_CDFS_Q15 + ((n) * ((n)-1) >> 1) - 1)
/*Converts a whole-bit count and the current range into a bit count scaled by
2**OD_BITRES.
See entcode.c for further documentation.*/
OD_WARN_UNUSED_RESULT uint32_t od_ec_tell_frac(uint32_t nbits_total,
uint32_t rng);
#endif

View File

@@ -1,494 +0,0 @@
/*
* Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifdef HAVE_CONFIG_H
#include "./config.h"
#endif
#include "aom_dsp/entdec.h"
/*A range decoder.
This is an entropy decoder based upon \cite{Mar79}, which is itself a
rediscovery of the FIFO arithmetic code introduced by \cite{Pas76}.
It is very similar to arithmetic encoding, except that encoding is done with
digits in any base, instead of with bits, and so it is faster when using
larger bases (i.e.: a byte).
The author claims an average waste of $\frac{1}{2}\log_b(2b)$ bits, where $b$
is the base, longer than the theoretical optimum, but to my knowledge there
is no published justification for this claim.
This only seems true when using near-infinite precision arithmetic so that
the process is carried out with no rounding errors.
An excellent description of implementation details is available at
http://www.arturocampos.com/ac_range.html
A recent work \cite{MNW98} which proposes several changes to arithmetic
encoding for efficiency actually re-discovers many of the principles
behind range encoding, and presents a good theoretical analysis of them.
End of stream is handled by writing out the smallest number of bits that
ensures that the stream will be correctly decoded regardless of the value of
any subsequent bits.
od_ec_dec_tell() can be used to determine how many bits were needed to decode
all the symbols thus far; other data can be packed in the remaining bits of
the input buffer.
@PHDTHESIS{Pas76,
author="Richard Clark Pasco",
title="Source coding algorithms for fast data compression",
school="Dept. of Electrical Engineering, Stanford University",
address="Stanford, CA",
month=May,
year=1976,
URL="http://www.richpasco.org/scaffdc.pdf"
}
@INPROCEEDINGS{Mar79,
author="Martin, G.N.N.",
title="Range encoding: an algorithm for removing redundancy from a digitised
message",
booktitle="Video & Data Recording Conference",
year=1979,
address="Southampton",
month=Jul,
URL="http://www.compressconsult.com/rangecoder/rngcod.pdf.gz"
}
@ARTICLE{MNW98,
author="Alistair Moffat and Radford Neal and Ian H. Witten",
title="Arithmetic Coding Revisited",
journal="{ACM} Transactions on Information Systems",
year=1998,
volume=16,
number=3,
pages="256--294",
month=Jul,
URL="http://researchcommons.waikato.ac.nz/bitstream/handle/10289/78/content.pdf"
}*/
/*This is meant to be a large, positive constant that can still be efficiently
loaded as an immediate (on platforms like ARM, for example).
Even relatively modest values like 100 would work fine.
It is stored into the bit counters once the input is exhausted (see
od_ec_dec_refill() and od_ec_dec_bits_()); od_ec_dec_refill() keeps its
counter in an int16_t, so the value must fit in that type.*/
#define OD_EC_LOTS_OF_BITS (0x4000)
/*Tops up the decoding window with bytes read from the front of the buffer.
  Bytes are OR-ed into dif below the bits already present, 8 bits at a time,
   until either the window has no room for another whole byte or the input
   runs out.
  If the end of the input has been reached, cnt is forced to
   OD_EC_LOTS_OF_BITS and the difference is folded into tell_offs.
  dec: The decoder state to refill.*/
static void od_ec_dec_refill(od_ec_dec *dec) {
  const unsigned char *src = dec->bptr;
  const unsigned char *limit = dec->end;
  od_ec_window window = dec->dif;
  int16_t nbits = dec->cnt;
  /*Left shift that places the next byte directly below the bits already held
     in the window.*/
  int shift = OD_EC_WINDOW_SIZE - 9 - (nbits + 15);
  while (shift >= 0 && src < limit) {
    OD_ASSERT(shift <= OD_EC_WINDOW_SIZE - 8);
    window |= (od_ec_window)src[0] << shift;
    src++;
    nbits += 8;
    shift -= 8;
  }
  if (src >= limit) {
    /*Out of data: pretend there are plenty of bits left so that callers stop
       asking for refills, and record the adjustment in tell_offs.*/
    dec->tell_offs += OD_EC_LOTS_OF_BITS - nbits;
    nbits = OD_EC_LOTS_OF_BITS;
  }
  dec->bptr = src;
  dec->cnt = nbits;
  dec->dif = window;
}
/*Renormalizes the decoder after a symbol has been decoded.
  The new range is shifted left until 32768 <= rng < 65536, dif is shifted by
   the same amount, and both are written back to the decoder context.
  More input is pulled into dif when the bit counter drops below zero.
  dec: The decoder state to update.
  dif: The new value of dif.
  rng: The new value of the range.
       This must be no more than 65535.
  ret: The value to return.
  Return: ret, unchanged.
          Passing it through lets callers finish with a tail-call to this
           function.*/
static int od_ec_dec_normalize(od_ec_dec *dec, od_ec_window dif, unsigned rng,
                               int ret) {
  int shift;
  OD_ASSERT(rng <= 65535U);
  /*Number of leading bit positions to shift out to bring rng up to 16 bits.*/
  shift = 16 - OD_ILOG_NZ(rng);
  dec->rng = rng << shift;
  dec->dif = dif << shift;
  dec->cnt -= shift;
  if (dec->cnt < 0) {
    od_ec_dec_refill(dec);
  }
  return ret;
}
/*Initializes the decoder.
  dec: The decoder state to initialize.
  buf: The input buffer to use.
  storage: The size of the input buffer; the end pointers are set to
            buf + storage.
  This function does not return a value; the error flag in dec is cleared and
   the window is pre-loaded from the start of the buffer.*/
void od_ec_dec_init(od_ec_dec *dec, const unsigned char *buf,
                    uint32_t storage) {
  const unsigned char *buf_end = buf + storage;
  /*Buffer bounds and the forward read position (range-coded data).*/
  dec->buf = buf;
  dec->bptr = buf;
  dec->end = buf_end;
  /*Backward read position and window (raw bits).*/
  dec->eptr = buf_end;
  dec->end_window = 0;
  dec->nend_bits = 0;
  /*Range coder state.*/
  dec->rng = 0x8000;
  dec->dif = 0;
  dec->cnt = -15;
  dec->error = 0;
  dec->tell_offs = 10 - (OD_EC_WINDOW_SIZE - 8);
  /*Everything above must be in place before the first refill.*/
  od_ec_dec_refill(dec);
}
/*Decodes a single bit whose probability of being zero is fz/ft.
  dec: The decoder state.
  fz: The probability that the bit is zero, scaled by ft.
      This must be greater than 0 and less than ft.
  ft: The total probability.
      This must be at least 16384 and no more than 32768.
  Return: The value decoded (0 or 1).*/
int od_ec_decode_bool(od_ec_dec *dec, unsigned fz, unsigned ft) {
  od_ec_window dif;
  od_ec_window split_window;
  unsigned rng;
  unsigned split;
  OD_ASSERT(0 < fz);
  OD_ASSERT(fz < ft);
  OD_ASSERT(16384 <= ft);
  OD_ASSERT(ft <= 32768U);
  dif = dec->dif;
  rng = dec->rng;
  OD_ASSERT(dif >> (OD_EC_WINDOW_SIZE - 16) < rng);
  OD_ASSERT(ft <= rng);
  /*Double the frequencies when needed so that ft <= rng < 2*ft.*/
  if (rng - ft >= ft) {
    ft <<= 1;
    fz <<= 1;
  }
  OD_ASSERT(rng - ft < ft);
  /*Map fz through the partition function to find where the range splits.*/
#if OD_EC_REDUCED_OVERHEAD
  {
    unsigned excess;
    unsigned dbl;
    excess = rng - ft;
    dbl = OD_SUBSATU(2 * excess, ft);
    split = fz + OD_MINI(fz, dbl) + OD_MINI(OD_SUBSATU(fz, dbl) >> 1, excess);
  }
#else
  split = fz + OD_MINI(fz, rng - ft);
#endif
  split_window = (od_ec_window)split << (OD_EC_WINDOW_SIZE - 16);
  if (dif >= split_window) {
    /*A one: keep the upper part of the range.*/
    return od_ec_dec_normalize(dec, dif - split_window, rng - split, 1);
  }
  /*A zero: keep the lower part of the range.*/
  return od_ec_dec_normalize(dec, dif, split, 0);
}
/*Decodes a single bit whose probability of being zero is fz/32768.
  This is a simpler, lower overhead version of od_ec_decode_bool() for use when
   ft == 32768.
  To be decoded properly by this function, symbols cannot have been encoded by
   od_ec_encode(), but must have been encoded with one of the equivalent _q15()
   or _dyadic() functions instead.
  dec: The decoder state.
  fz: The probability that the bit is zero, scaled by 32768.
      This must be greater than 0 and less than 32768.
  Return: The value decoded (0 or 1).*/
int od_ec_decode_bool_q15(od_ec_dec *dec, unsigned fz) {
  od_ec_window dif;
  od_ec_window split_window;
  unsigned rng;
  unsigned split;
  int bit;
  OD_ASSERT(0 < fz);
  OD_ASSERT(fz < 32768U);
  dif = dec->dif;
  rng = dec->rng;
  OD_ASSERT(dif >> (OD_EC_WINDOW_SIZE - 16) < rng);
  OD_ASSERT(32768U <= rng);
  /*Scale the Q15 probability by the current range to find the split point.*/
  split = fz * (uint32_t)rng >> 15;
  split_window = (od_ec_window)split << (OD_EC_WINDOW_SIZE - 16);
  bit = dif >= split_window;
  if (bit) {
    /*A one: keep the upper part of the range.*/
    dif -= split_window;
    split = rng - split;
  }
  /*At this point split holds the size of whichever part was kept.*/
  return od_ec_dec_normalize(dec, dif, split, bit);
}
/*Decodes a symbol given a cumulative distribution function (CDF) table.
dec: The decoder state.
cdf: The CDF, such that symbol s falls in the range
[s > 0 ? cdf[s - 1] : 0, cdf[s]).
The values must be monotonically non-decreasing, and cdf[nsyms - 1]
(the total, ft) must be at least 16384, and no more than 32768.
nsyms: The number of symbols in the alphabet.
This should be at most 16.
Return: The decoded symbol s.*/
int od_ec_decode_cdf(od_ec_dec *dec, const uint16_t *cdf, int nsyms) {
od_ec_window dif;
unsigned r;
unsigned c;
unsigned d;
#if OD_EC_REDUCED_OVERHEAD
unsigned e;
#endif
int s;
unsigned u;
unsigned v;
unsigned q;
unsigned fl;
unsigned fh;
unsigned ft;
int ret;
dif = dec->dif;
r = dec->rng;
OD_ASSERT(dif >> (OD_EC_WINDOW_SIZE - 16) < r);
OD_ASSERT(nsyms > 0);
ft = cdf[nsyms - 1];
OD_ASSERT(16384 <= ft);
OD_ASSERT(ft <= 32768U);
OD_ASSERT(ft <= r);
/*Double ft when needed so that ft <= r < 2*ft; s records the shift (0 or 1)
so that it can be applied to the table entries later.*/
s = r - ft >= ft;
ft <<= s;
d = r - ft;
OD_ASSERT(d < ft);
/*c is the top 16 bits of dif, i.e., the position inside the current range.*/
c = (unsigned)(dif >> (OD_EC_WINDOW_SIZE - 16));
/*Invert the simple partition function y = x + OD_MINI(x, d): the result is
c/2 in the double-counted region and c - d past it, whichever is larger.
The casts to int make a wrapped c - d compare as negative.*/
q = OD_MAXI((int)(c >> 1), (int)(c - d));
#if OD_EC_REDUCED_OVERHEAD
e = OD_SUBSATU(2 * d, ft);
/*The correctness of this inverse partition function is not obvious, but it
was checked exhaustively for all possible values of r, ft, and c.
TODO: It should be possible to optimize this better than the compiler,
given that we do not care about the accuracy of negative results (as we
will not use them).
It would also be nice to get rid of the 32-bit dividend, as it requires a
32x32->64 bit multiply to invert.*/
q = OD_MAXI((int)q, (int)((2 * (int32_t)c + 1 - (int32_t)e) / 3));
#endif
/*Bring q back to the scale of the (unshifted) table entries.*/
q >>= s;
OD_ASSERT(q<ft>> s);
fl = 0;
ret = 0;
/*Linear search for the first entry greater than q; fl trails one entry
behind (0 for the first symbol).
The loop is not bounded by nsyms; it terminates because q is below the final
table entry (see the assertion above).*/
for (fh = cdf[ret]; fh <= q; fh = cdf[++ret]) fl = fh;
OD_ASSERT(fh <= ft >> s);
fl <<= s;
fh <<= s;
/*Map both ends of the symbol's interval through the forward partition
function to get its bounds inside the current range.*/
#if OD_EC_REDUCED_OVERHEAD
u = fl + OD_MINI(fl, e) + OD_MINI(OD_SUBSATU(fl, e) >> 1, d);
v = fh + OD_MINI(fh, e) + OD_MINI(OD_SUBSATU(fh, e) >> 1, d);
#else
u = fl + OD_MINI(fl, d);
v = fh + OD_MINI(fh, d);
#endif
/*The new range is the width of the symbol's interval, and dif is made
relative to the interval's lower bound.*/
r = v - u;
dif -= (od_ec_window)u << (OD_EC_WINDOW_SIZE - 16);
return od_ec_dec_normalize(dec, dif, r, ret);
}
/*Decodes a symbol given a cumulative distribution function (CDF) table whose
total does not need to be pre-scaled.
dec: The decoder state.
cdf: The CDF, such that symbol s falls in the range
[s > 0 ? cdf[s - 1] : 0, cdf[s]).
The values must be monotonically non-decreasing, and cdf[nsyms - 1]
(the total, ft) must be at least 2, and no more than 32768.
nsyms: The number of symbols in the alphabet.
This should be at most 16.
Return: The decoded symbol s.*/
int od_ec_decode_cdf_unscaled(od_ec_dec *dec, const uint16_t *cdf, int nsyms) {
od_ec_window dif;
unsigned r;
unsigned c;
unsigned d;
#if OD_EC_REDUCED_OVERHEAD
unsigned e;
#endif
int s;
unsigned u;
unsigned v;
unsigned q;
unsigned fl;
unsigned fh;
unsigned ft;
int ret;
dif = dec->dif;
r = dec->rng;
OD_ASSERT(dif >> (OD_EC_WINDOW_SIZE - 16) < r);
OD_ASSERT(nsyms > 0);
ft = cdf[nsyms - 1];
OD_ASSERT(2 <= ft);
OD_ASSERT(ft <= 32768U);
/*Scale ft up by a power of two; presumably this brings it into the range
(16384, 32768], assuming OD_ILOG_NZ() returns the bit length of its argument
(it is defined elsewhere; TODO confirm).*/
s = 15 - OD_ILOG_NZ(ft - 1);
ft <<= s;
OD_ASSERT(ft <= r);
/*Double ft once more when needed so that ft <= r < 2*ft.*/
if (r - ft >= ft) {
ft <<= 1;
s++;
}
d = r - ft;
OD_ASSERT(d < ft);
/*c is the top 16 bits of dif, i.e., the position inside the current range.*/
c = (unsigned)(dif >> (OD_EC_WINDOW_SIZE - 16));
/*Invert the simple partition function y = x + OD_MINI(x, d): the result is
c/2 in the double-counted region and c - d past it, whichever is larger.
The casts to int make a wrapped c - d compare as negative.*/
q = OD_MAXI((int)(c >> 1), (int)(c - d));
#if OD_EC_REDUCED_OVERHEAD
e = OD_SUBSATU(2 * d, ft);
/*TODO: The same inverse partition function is used in od_ec_decode_cdf();
the optimization notes there apply here as well.*/
q = OD_MAXI((int)q, (int)((2 * (int32_t)c + 1 - (int32_t)e) / 3));
#endif
/*Bring q back to the scale of the (unshifted) table entries.*/
q >>= s;
OD_ASSERT(q<ft>> s);
fl = 0;
ret = 0;
/*Linear search for the first entry greater than q; fl trails one entry
behind (0 for the first symbol).
The loop is not bounded by nsyms; it terminates because q is below the final
table entry (see the assertion above).*/
for (fh = cdf[ret]; fh <= q; fh = cdf[++ret]) fl = fh;
OD_ASSERT(fh <= ft >> s);
fl <<= s;
fh <<= s;
/*Map both ends of the symbol's interval through the forward partition
function to get its bounds inside the current range.*/
#if OD_EC_REDUCED_OVERHEAD
u = fl + OD_MINI(fl, e) + OD_MINI(OD_SUBSATU(fl, e) >> 1, d);
v = fh + OD_MINI(fh, e) + OD_MINI(OD_SUBSATU(fh, e) >> 1, d);
#else
u = fl + OD_MINI(fl, d);
v = fh + OD_MINI(fh, d);
#endif
/*The new range is the width of the symbol's interval, and dif is made
relative to the interval's lower bound.*/
r = v - u;
dif -= (od_ec_window)u << (OD_EC_WINDOW_SIZE - 16);
return od_ec_dec_normalize(dec, dif, r, ret);
}
/*Decodes a symbol given a cumulative distribution function (CDF) table whose
   total is a power of two.
  This is a simpler, lower overhead version of od_ec_decode_cdf() for use when
   cdf[nsyms - 1] is a power of two.
  To be decoded properly by this function, symbols cannot have been encoded by
   od_ec_encode(), but must have been encoded with one of the equivalent _q15()
   functions instead.
  dec: The decoder state.
  cdf: The CDF, such that symbol s falls in the range
        [s > 0 ? cdf[s - 1] : 0, cdf[s]).
       The values must be monotonically non-decreasing, and cdf[nsyms - 1]
        must be exactly 1 << ftb.
  nsyms: The number of symbols in the alphabet.
         This should be at most 16.
         It is only used for checking the table in assertions.
  ftb: The number of bits of precision in the cumulative distribution.
       This must be no more than 15.
  Return: The decoded symbol s.*/
int od_ec_decode_cdf_unscaled_dyadic(od_ec_dec *dec, const uint16_t *cdf,
                                     int nsyms, unsigned ftb) {
  od_ec_window dif;
  unsigned rng;
  unsigned target;
  unsigned lo;
  unsigned hi;
  int sym;
  (void)nsyms;
  dif = dec->dif;
  rng = dec->rng;
  OD_ASSERT(dif >> (OD_EC_WINDOW_SIZE - 16) < rng);
  OD_ASSERT(ftb <= 15);
  OD_ASSERT(cdf[nsyms - 1] == 1U << ftb);
  OD_ASSERT(32768U <= rng);
  /*The top 16 bits of dif give the position inside the current range.*/
  target = (unsigned)(dif >> (OD_EC_WINDOW_SIZE - 16));
  /*Scale each table entry by the range in turn until one lands above the
     target; lo and hi then bracket the decoded symbol's interval.*/
  sym = 0;
  lo = 0;
  hi = cdf[0] * (uint32_t)rng >> ftb;
  while (hi <= target) {
    lo = hi;
    sym++;
    hi = cdf[sym] * (uint32_t)rng >> ftb;
  }
  OD_ASSERT(hi <= rng);
  dif -= (od_ec_window)lo << (OD_EC_WINDOW_SIZE - 16);
  return od_ec_dec_normalize(dec, dif, hi - lo, sym);
}
/*Decodes a symbol given a cumulative distribution function (CDF) table in Q15.
  This is a simpler, lower overhead version of od_ec_decode_cdf() for use when
   cdf[nsyms - 1] == 32768.
  To be decoded properly by this function, symbols cannot have been encoded by
   od_ec_encode(), but must have been encoded with one of the equivalent _q15()
   or _dyadic() functions instead.
  dec: The decoder state.
  cdf: The CDF, such that symbol s falls in the range
        [s > 0 ? cdf[s - 1] : 0, cdf[s]).
       The values must be monotonically non-decreasing, and cdf[nsyms - 1]
        must be 32768.
  nsyms: The number of symbols in the alphabet.
         This should be at most 16.
  Return: The decoded symbol s.*/
int od_ec_decode_cdf_q15(od_ec_dec *dec, const uint16_t *cdf, int nsyms) {
  /*A Q15 table is a dyadic table with 15 bits of precision.*/
  unsigned q15_bits;
  int sym;
  q15_bits = 15;
  sym = od_ec_decode_cdf_unscaled_dyadic(dec, cdf, nsyms, q15_bits);
  return sym;
}
/*Extracts a raw unsigned integer with a non-power-of-2 range from the stream.
  The integer must have been encoded with od_ec_enc_uint().
  Small ranges (up to 1 << OD_EC_UINT_BITS values) are decoded with a single
   uniform CDF; larger ranges decode the top bits that way and read the
   remaining low bits raw.
  dec: The decoder state.
  ft: The number of integers that can be decoded (one more than the max).
      This must be at least 2, and no more than 2**29.
  Return: The decoded value.
          If the stream yields a value above ft - 1, the decoder's error flag
           is set and ft - 1 is returned instead.*/
uint32_t od_ec_dec_uint(od_ec_dec *dec, uint32_t ft) {
  uint32_t max_val;
  uint32_t val;
  int top_syms;
  int low_bits;
  OD_ASSERT(ft >= 2);
  OD_ASSERT(ft <= (uint32_t)1 << (25 + OD_EC_UINT_BITS));
  if (ft <= 1U << OD_EC_UINT_BITS) {
    /*The whole value fits in one range-coded symbol.*/
    return od_ec_decode_cdf_q15(dec, OD_UNIFORM_CDF_Q15(ft), (int)ft);
  }
  max_val = ft - 1;
  /*Split the value into OD_EC_UINT_BITS range-coded top bits and low_bits
     raw bits.*/
  low_bits = OD_ILOG_NZ(max_val) - OD_EC_UINT_BITS;
  top_syms = (int)(max_val >> low_bits) + 1;
  val = od_ec_decode_cdf_q15(dec, OD_UNIFORM_CDF_Q15(top_syms), top_syms);
  val = val << low_bits | od_ec_dec_bits(dec, low_bits, "");
  if (val > max_val) {
    /*Out of range: flag the stream as corrupt and clamp.*/
    dec->error = 1;
    return max_val;
  }
  return val;
}
/*Extracts a sequence of raw bits from the stream.
  The bits must have been encoded with od_ec_enc_bits().
  Raw bits are read from the back of the buffer, moving towards the front.
  dec: The decoder state.
  ftb: The number of bits to extract.
       This must be between 0 and 25, inclusive.
  Return: The decoded bits.*/
uint32_t od_ec_dec_bits_(od_ec_dec *dec, unsigned ftb) {
  od_ec_window window;
  int available;
  uint32_t bits;
  OD_ASSERT(ftb <= 25);
  window = dec->end_window;
  available = dec->nend_bits;
  if ((unsigned)available < ftb) {
    const unsigned char *first;
    const unsigned char *tail;
    first = dec->buf;
    tail = dec->eptr;
    OD_ASSERT(available <= OD_EC_WINDOW_SIZE - 8);
    for (;;) {
      if (tail <= first) {
        /*Nothing left to read: pretend there are plenty of bits available
           and record the adjustment in tell_offs.*/
        dec->tell_offs += OD_EC_LOTS_OF_BITS - available;
        available = OD_EC_LOTS_OF_BITS;
        break;
      }
      /*Pull in the previous byte above the bits already in the window.*/
      tail--;
      window |= (od_ec_window)tail[0] << available;
      available += 8;
      /*Stop once another whole byte would no longer fit.*/
      if (available > OD_EC_WINDOW_SIZE - 8) break;
    }
    dec->eptr = tail;
  }
  /*The requested bits are the low ftb bits of the window.*/
  bits = (uint32_t)window & (((uint32_t)1 << ftb) - 1);
  available -= ftb;
  dec->end_window = window >> ftb;
  dec->nend_bits = available;
  return bits;
}
/*Returns the number of bits "used" by the decoded symbols so far.
This same number can be computed in either the encoder or the decoder, and is
suitable for making coding decisions.
dec: The decoder state; it is not modified.
Return: The number of bits.
This will always be slightly larger than the exact value (i.e., all
rounding error is in the positive direction).*/
int od_ec_dec_tell(const od_ec_dec *dec) {
/*Bytes taken from the back of the buffer (end - eptr) plus bytes taken from
the front (bptr - buf), converted to bits, less the two window bit counters
(cnt and nend_bits), plus the tell_offs correction that od_ec_dec_init(),
od_ec_dec_refill() and od_ec_dec_bits_() maintain.*/
return ((dec->end - dec->eptr) + (dec->bptr - dec->buf)) * 8 - dec->cnt -
dec->nend_bits + dec->tell_offs;
}
/*Returns the number of bits "used" by the decoded symbols so far, with
   fractional precision.
  This same number can be computed in either the encoder or the decoder, and is
   suitable for making coding decisions.
  dec: The decoder state; it is not modified.
  Return: The number of bits scaled by 2**OD_BITRES.
          This will always be slightly larger than the exact value (i.e., all
           rounding error is in the positive direction).*/
uint32_t od_ec_dec_tell_frac(const od_ec_dec *dec) {
  int whole_bits;
  /*Whole bits come from the buffer positions; the fraction is derived from
     the current range.*/
  whole_bits = od_ec_dec_tell(dec);
  return od_ec_tell_frac(whole_bits, dec->rng);
}

Some files were not shown because too many files have changed in this diff Show More