Compare commits
365 Commits
| SHA1 |
|---|
| 8b4210940c |
| 82ea742237 |
| f5bd76f5c1 |
| de075a95e0 |
| 37169c0bd4 |
| fe488cceff |
| 149d082377 |
| 8b4c31584e |
| 52db2b1690 |
| 33dedd0628 |
| 8be7e572a7 |
| 1c07abca18 |
| 37a39ac138 |
| 7eb7d6b227 |
| 1b982cc64f |
| af3b0de732 |
| ab7cd6d068 |
| c2fe9acced |
| f9efbad392 |
| 5d881770e5 |
| de3b769524 |
| fe4dd4f43f |
| fafec95702 |
| dfcefe06fa |
| bd7cfb46fb |
| f09b5a3328 |
| a413dbe594 |
| f1e12c1bf3 |
| 656f4a88cf |
| 343b6b09a1 |
| 4916a87bfc |
| 941fe20336 |
| 475e9d26e0 |
| c37d012ada |
| 9e9722bc79 |
| 7a730d5901 |
| cfd92dab18 |
| 6554333b59 |
| 6a8d4631a8 |
| 2c17d54681 |
| 9c9d92ae3a |
| 2d1e63d0c5 |
| c12f2f3187 |
| 19d2e73dea |
| ba42ce64b7 |
| 2fb826c4d5 |
| 7104833085 |
| d772d55704 |
| c79665d0ad |
| 1b1e40c0b2 |
| 958ae5af9c |
| 57f49db81f |
| 17720b60bb |
| 7f7c888c14 |
| 0325b95938 |
| f4e4ce7549 |
| 7103b5307d |
| 8619203ddc |
| b757d89ff9 |
| 4db9bd324d |
| 70a7885a65 |
| caac87b05b |
| d55724fae9 |
| 476e8fc855 |
| 36608af524 |
| 377cfa31f0 |
| 374f0ff4a0 |
| 3a4002b94d |
| df69c751a7 |
| bbf4c91f79 |
| 9fdeeaf411 |
| 7a79fa1362 |
| b51d127c82 |
| 15f29ef092 |
| 77f5c3d2e8 |
| 5ea8712b82 |
| 068281751c |
| a412c004e4 |
| a7a8e07a44 |
| ff0a87ce38 |
| 08131055e4 |
| 85e111b3ba |
| 8ff40f8bec |
| e0cc52db3f |
| f7032713af |
| d089ac4dda |
| 3a04c9c9c4 |
| 039f9e08f0 |
| dc5618f3bb |
| 004eebed31 |
| 2c3807b89f |
| e446ffda45 |
| d4ab234869 |
| 407c2e2974 |
| 6fbb4c3061 |
| 462a7c9f0a |
| c0241664aa |
| 4508eb3123 |
| 956af1d478 |
| 6b374abc86 |
| 335cf67d8b |
| 341919d038 |
| 33e40cb5db |
| 47cc64cdf8 |
| e4290800b2 |
| f4be884466 |
| fbf256da41 |
| 34201e50c1 |
| 1a3c4f91f6 |
| 9f9a8d2aaa |
| 1c85230344 |
| 7987686397 |
| 6565c17f24 |
| f8c27d164c |
| 8ce67d714a |
| d6c5ef4557 |
| 580f14b68b |
| d7a3b781d3 |
| 099bd7f07e |
| 82070ae939 |
| 5c0f5cdda8 |
| 91369fd9b7 |
| 889ed5b158 |
| 7aa0c748b3 |
| 511bf49b7e |
| ad5fea03e6 |
| 54b2071bf4 |
| f368f86df6 |
| c42d54c3a3 |
| 18e53642b7 |
| 5cf8eda308 |
| 256a4af4d1 |
| 7381e3330f |
| 0fff2fb34c |
| b2542417cd |
| 3d3f51262c |
| 5e2791b54d |
| 9a62ecbd35 |
| 53db633349 |
| 325bdddc38 |
| c06a4b9df2 |
| 830fa866a5 |
| 063e4a2914 |
| e3e9fee419 |
| 4b073bc39a |
| 930773a1ed |
| 87c6c5224d |
| c969b2b02b |
| 18c7f46c12 |
| 92e91bd3a1 |
| 16e069b8bb |
| a6cc74b987 |
| 042572177b |
| 71f9cbcfc8 |
| 297b2a12d6 |
| b19f8b1607 |
| 77a31eb3c5 |
| 7bb35db872 |
| 690fcd793b |
| fd85664ae6 |
| 1b048e966a |
| b5164f55a0 |
| 96797e43b4 |
| 033dab9ca0 |
| 6e336f6e5f |
| 7f3e07f1c8 |
| 5b55bcc564 |
| e3f7991f99 |
| e4ac882007 |
| e90d4f0a03 |
| 451211cb01 |
| ea3d324f13 |
| 7e4740156f |
| ef45540927 |
| c69cc4ce1f |
| 25085a6ac2 |
| 23d0f73838 |
| 7d72ebaa5c |
| 05fe0f20a6 |
| 1afbd88e81 |
| bdfdd7d993 |
| 3e04114f3d |
| 106a8a1536 |
| 3d791194f8 |
| 090aa88b5a |
| f44db1487d |
| 8d681b36c7 |
| 5319e83843 |
| 22c36dd464 |
| 037a50ed36 |
| 3fc29ae3ee |
| 81a6739533 |
| 65daa41378 |
| ce6678fdc9 |
| cb957c302a |
| 0dfede2e79 |
| da1bda0fb2 |
| 302e425453 |
| 0dc69c70f7 |
| 60ada7edb4 |
| f993ed5c86 |
| 6d2b79e3a2 |
| a07bb84215 |
| 3386ca7496 |
| 7eec1f31b5 |
| 8dd3bef7ef |
| 8c7751e1c2 |
| d6197b621d |
| e736691a6d |
| ce050afaf3 |
| 248f6ad771 |
| fed14a3e94 |
| e93f2fdb83 |
| 2ca24b0075 |
| b24373fec2 |
| 6f424a768e |
| 6a3ff0b617 |
| 98431cde07 |
| 19c157afe2 |
| 110a2ddc9b |
| efccbc9fb5 |
| 88e6951465 |
| 45ed7effed |
| 2e93fcf893 |
| 3a986eac57 |
| ed7786869a |
| 5adb43b8be |
| 20946cdd3b |
| dc008cc17d |
| cc431ad50a |
| a75965fa94 |
| 2f28f9072e |
| 44354ee7bf |
| 62aa642d71 |
| 541eb78994 |
| 7c1fdf02cd |
| f451b404ea |
| 9976ff8c79 |
| 9ef37860cd |
| 2580e7d63e |
| f037cf80c9 |
| e357b9efe0 |
| 884c2ddc48 |
| adbad6092f |
| aa6108382e |
| 4b6e4e1813 |
| a921444fdb |
| 2678aefc48 |
| ff19cdafdb |
| fba5c354ad |
| 51aad61c8c |
| 825bb86044 |
| 14011f037d |
| 5afa3b9150 |
| 3197172405 |
| 3007081a87 |
| 7954e67bb8 |
| a48d42a804 |
| 8cc525e82b |
| d4d6c58e37 |
| d2b40894be |
| 0a075cb39c |
| 3ef9c0ba03 |
| f5a15f270a |
| c5372cf077 |
| e616012d69 |
| fbbd3f0d8d |
| ee78c541a4 |
| 892ebd9760 |
| 571f00cb95 |
| 0266e70c52 |
| 3c41d7358c |
| 89771f2c2c |
| 1d3f1983b2 |
| f7c2d2a3de |
| aa81375d73 |
| e25d6252a4 |
| f9a3d08f1b |
| e85607410e |
| f5a6079141 |
| 1b833d63d9 |
| fe96dbda15 |
| d11c97e8e2 |
| 0462263765 |
| 5fc2d6cb9f |
| c8f6ed77b9 |
| 291033032e |
| 3a6a81fc9a |
| b458f42966 |
| 0a64929f19 |
| c02a4beed8 |
| 6b350766bd |
| 63a37d16f3 |
| f14c323b4c |
| b8f83282f8 |
| be013eb396 |
| 74bb78df82 |
| c125f4a594 |
| 078dff72ca |
| 6b4463dc1f |
| feb7e9a372 |
| d004c64013 |
| 4736e5f9d1 |
| 43ae6c1e22 |
| 0afe5e405d |
| 91038e0eb6 |
| b2d690187e |
| d34b49d7b9 |
| 10b4753179 |
| f51f67602e |
| 32ac7cabdf |
| 7f2628152a |
| 8cbd4f8701 |
| 71aacf39c7 |
| 7676defca9 |
| b2fb48cfcf |
| 7c184c6e1f |
| b9ec759bc2 |
| 913081ab02 |
| ca88d22f39 |
| 1c0a9f36f1 |
| cfd5e0221c |
| 168eea5d60 |
| 922751e059 |
| 723e357ead |
| 0c6caf187c |
| b34705f64f |
| efad6feb9a |
| 9e5f355daf |
| 003a9d20ad |
| dd07443f72 |
| 7991241a50 |
| b582cf0ea9 |
| 67611119b5 |
| 3b2e2f2f77 |
| 79436fadfb |
| 7ed1d54ab4 |
| 26daa30da4 |
| 7738bcb350 |
| 73b11ec876 |
| d4596485be |
| 57adf3d573 |
| 74a61b5ab9 |
| 692fe74deb |
| bdeb5febe4 |
| 9eeb1f2fc3 |
| 424982bc41 |
| 5e9c5dfdf0 |
| aa1c813c43 |
| 6e4b73125b |
| 310073868e |
| cc1524aa90 |
| 6c6eb16bb9 |
| 3b2c3cb366 |
| 55f3740d76 |
| 83db21b2fd |
| d9c417cb49 |
| b2597527a5 |
| 8496390e73 |
| ac27b062b0 |
| 527a9fea76 |
| 1517fb74fd |
| 5c29ee726e |
| 50d3629c61 |
| 67edc5e83b |
.clang-format

@@ -1,11 +1,10 @@
---
Language: Cpp
# BasedOnStyle: Google
# Generated with clang-format 3.8.1
# Generated with clang-format 3.7.1
AccessModifierOffset: -1
AlignAfterOpenBracket: Align
AlignAfterOpenBracket: true
AlignConsecutiveAssignments: false
AlignConsecutiveDeclarations: false
AlignEscapedNewlinesLeft: true
AlignOperands: true
AlignTrailingComments: true
@@ -16,23 +15,10 @@ AllowShortFunctionsOnASingleLine: All
AllowShortIfStatementsOnASingleLine: true
AllowShortLoopsOnASingleLine: true
AlwaysBreakAfterDefinitionReturnType: None
AlwaysBreakAfterReturnType: None
AlwaysBreakBeforeMultilineStrings: true
AlwaysBreakTemplateDeclarations: true
BinPackArguments: true
BinPackParameters: true
BraceWrapping:
  AfterClass: false
  AfterControlStatement: false
  AfterEnum: false
  AfterFunction: false
  AfterNamespace: false
  AfterObjCDeclaration: false
  AfterStruct: false
  AfterUnion: false
  BeforeCatch: false
  BeforeElse: false
  IndentBraces: false
BreakBeforeBinaryOperators: None
BreakBeforeBraces: Attach
BreakBeforeTernaryOperators: true
@@ -47,13 +33,6 @@ DerivePointerAlignment: false
DisableFormat: false
ExperimentalAutoDetectBinPacking: false
ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ]
IncludeCategories:
  - Regex: '^<.*\.h>'
    Priority: 1
  - Regex: '^<.*'
    Priority: 2
  - Regex: '.*'
    Priority: 3
IndentCaseLabels: true
IndentWidth: 2
IndentWrappedFunctionNames: false
@@ -72,8 +51,6 @@ PenaltyBreakString: 1000
PenaltyExcessCharacter: 1000000
PenaltyReturnTypeOnItsOwnLine: 200
PointerAlignment: Right
ReflowComments: true
SortIncludes: false
SpaceAfterCStyleCast: false
SpaceBeforeAssignmentOperators: true
SpaceBeforeParens: ControlStatements
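For reference, an illustrative C fragment (not from the repository) formatted under the options above — 2-space indents (IndentWidth: 2), attached braces (BreakBeforeBraces: Attach), and right-aligned pointer declarators (PointerAlignment: Right):

```c
/* Illustrative only: how clang-format would emit this fragment under the
 * configuration above. The pointer binds to the identifier (Right), braces
 * attach to the preceding line, and the body indents by two spaces. */
static int sum(const int *values, int count) {
  int total = 0;
  for (int i = 0; i < count; ++i) {
    total += values[i];
  }
  return total;
}
```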
.gitignore (vendored, 41 changed lines)

@@ -29,36 +29,37 @@
/examples/decode_with_drops
/examples/decode_with_partial_drops
/examples/example_xma
/examples/lossless_encoder
/examples/postproc
/examples/resize_util
/examples/set_maps
/examples/simple_decoder
/examples/simple_encoder
/examples/twopass_encoder
/examples/aom_cx_set_ref
/examples/av1_spatial_scalable_encoder
/examples/aom_temporal_scalable_patterns
/examples/aom_temporal_svc_encoder
/examples/vp8_multi_resolution_encoder
/examples/vp8cx_set_ref
/examples/vp9_lossless_encoder
/examples/vp9_spatial_scalable_encoder
/examples/vpx_temporal_scalable_patterns
/examples/vpx_temporal_svc_encoder
/ivfdec
/ivfdec.dox
/ivfenc
/ivfenc.dox
/libaom.so*
/libaom.ver
/libvpx.so*
/libvpx.ver
/samples.dox
/test_intra_pred_speed
/test_libaom
/aom_api1_migration.dox
/av1_rtcd.h
/aom.pc
/aom_config.c
/aom_config.h
/aom_dsp_rtcd.h
/aom_scale_rtcd.h
/aom_version.h
/aomdec
/aomdec.dox
/aomenc
/aomenc.dox
/test_libvpx
/vp8_api1_migration.dox
/vp[89x]_rtcd.h
/vpx.pc
/vpx_config.c
/vpx_config.h
/vpx_dsp_rtcd.h
/vpx_scale_rtcd.h
/vpx_version.h
/vpxdec
/vpxdec.dox
/vpxenc
/vpxenc.dox
TAGS
.mailmap (7 changed lines)

@@ -3,6 +3,7 @@ Aℓex Converse <aconverse@google.com>
Aℓex Converse <aconverse@google.com> <alex.converse@gmail.com>
Alexis Ballier <aballier@gentoo.org> <alexis.ballier@gmail.com>
Alpha Lam <hclam@google.com> <hclam@chromium.org>
Daniele Castagna <dcastagna@chromium.org> <dcastagna@google.com>
Deb Mukherjee <debargha@google.com>
Erik Niemeyer <erik.a.niemeyer@intel.com> <erik.a.niemeyer@gmail.com>
Guillaume Martres <gmartres@google.com> <smarter3@gmail.com>
@@ -13,12 +14,15 @@ Jim Bankoski <jimbankoski@google.com>
Johann Koenig <johannkoenig@google.com>
Johann Koenig <johannkoenig@google.com> <johann.koenig@duck.com>
Johann Koenig <johannkoenig@google.com> <johann.koenig@gmail.com>
Johann Koenig <johannkoenig@google.com> <johannkoenig@chromium.org>
John Koleszar <jkoleszar@google.com>
Joshua Litt <joshualitt@google.com> <joshualitt@chromium.org>
Marco Paniconi <marpan@google.com>
Marco Paniconi <marpan@google.com> <marpan@chromium.org>
Pascal Massimino <pascal.massimino@gmail.com>
Paul Wilkins <paulwilkins@google.com>
Peter de Rivaz <peter.derivaz@gmail.com>
Peter de Rivaz <peter.derivaz@gmail.com> <peter.derivaz@argondesign.com>
Ralph Giles <giles@xiph.org> <giles@entropywave.com>
Ralph Giles <giles@xiph.org> <giles@mozilla.com>
Ronald S. Bultje <rsbultje@gmail.com> <rbultje@google.com>
@@ -26,7 +30,8 @@ Sami Pietilä <samipietila@google.com>
Tamar Levy <tamar.levy@intel.com>
Tamar Levy <tamar.levy@intel.com> <levytamar82@gmail.com>
Tero Rintaluoma <teror@google.com> <tero.rintaluoma@on2.com>
Timothy B. Terriberry <tterribe@xiph.org> Tim Terriberry <tterriberry@mozilla.com>
Timothy B. Terriberry <tterribe@xiph.org> <tterriberry@mozilla.com>
Tom Finegan <tomfinegan@google.com>
Tom Finegan <tomfinegan@google.com> <tomfinegan@chromium.org>
Yaowu Xu <yaowu@google.com> <yaowu@xuyaowu.com>
Yaowu Xu <yaowu@google.com> <Yaowu Xu>
AUTHORS (18 changed lines)

@@ -24,6 +24,7 @@ changjun.yang <changjun.yang@intel.com>
Charles 'Buck' Krasic <ckrasic@google.com>
chm <chm@rock-chips.com>
Christian Duvivier <cduvivier@google.com>
Daniele Castagna <dcastagna@chromium.org>
Daniel Kang <ddkang@google.com>
Deb Mukherjee <debargha@google.com>
Dim Temp <dimtemp0@gmail.com>
@@ -56,7 +57,7 @@ James Zern <jzern@google.com>
Jan Gerber <j@mailb.org>
Jan Kratochvil <jan.kratochvil@redhat.com>
Janne Salonen <jsalonen@google.com>
Jean-Marc Valin <jmvalin@jmvalin.ca>
Jean-Yves Avenard <jyavenard@mozilla.com>
Jeff Faust <jfaust@google.com>
Jeff Muizelaar <jmuizelaar@mozilla.com>
Jeff Petkau <jpet@chromium.org>
@@ -65,7 +66,6 @@ Jian Zhou <zhoujian@google.com>
Jim Bankoski <jimbankoski@google.com>
Jingning Han <jingning@google.com>
Joey Parrish <joeyparrish@google.com>
Johann Koenig <johannkoenig@chromium.org>
Johann Koenig <johannkoenig@google.com>
John Koleszar <jkoleszar@google.com>
Johnny Klonaris <google@jawknee.com>
@@ -77,6 +77,7 @@ Justin Clift <justin@salasaga.org>
Justin Lebar <justin.lebar@gmail.com>
KO Myung-Hun <komh@chollian.net>
Lawrence Velázquez <larryv@macports.org>
Linfeng Zhang <linfengz@google.com>
Lou Quillio <louquillio@google.com>
Luca Barbato <lu_zero@gentoo.org>
Makoto Kato <makoto.kt@gmail.com>
@@ -92,7 +93,6 @@ Mike Hommey <mhommey@mozilla.com>
Mikhal Shemer <mikhal@google.com>
Minghai Shang <minghai@google.com>
Morton Jonuschat <yabawock@gmail.com>
Nathan E. Egge <negge@dgql.org>
Nico Weber <thakis@chromium.org>
Parag Salasakar <img.mips1@gmail.com>
Pascal Massimino <pascal.massimino@gmail.com>
@@ -101,7 +101,6 @@ Paul Wilkins <paulwilkins@google.com>
Pavol Rusnak <stick@gk2.sk>
Paweł Hajdan <phajdan@google.com>
Pengchong Jin <pengchong@google.com>
Peter de Rivaz <peter.derivaz@argondesign.com>
Peter de Rivaz <peter.derivaz@gmail.com>
Philip Jägenstedt <philipj@opera.com>
Priit Laes <plaes@plaes.org>
@@ -121,7 +120,6 @@ Sergey Ulanov <sergeyu@chromium.org>
Shimon Doodkin <helpmepro1@gmail.com>
Shunyao Li <shunyaoli@google.com>
Stefan Holmer <holmer@google.com>
Steinar Midtskogen <stemidts@cisco.com>
Suman Sunkara <sunkaras@google.com>
Taekhyun Kim <takim@nvidia.com>
Takanori MATSUURA <t.matsuu@gmail.com>
@@ -129,16 +127,16 @@ Tamar Levy <tamar.levy@intel.com>
Tao Bai <michaelbai@chromium.org>
Tero Rintaluoma <teror@google.com>
Thijs Vermeir <thijsvermeir@gmail.com>
Thomas Daede <tdaede@mozilla.com>
Thomas Davies <thdavies@cisco.com>
Thomas <thdavies@cisco.com>
Tim Kopp <tkopp@google.com>
Timothy B. Terriberry <tterribe@xiph.org>
Tom Finegan <tomfinegan@google.com>
Tristan Matthews <le.businessman@gmail.com>
Tristan Matthews <tmatth@videolan.org>
Vignesh Venkatasubramanian <vigneshv@google.com>
Yaowu Xu <yaowu@google.com>
Yi Luo <luoyi@google.com>
Yongzhe Wang <yongzhe@google.com>
Yunqing Wang <yunqingwang@google.com>
Yury Gitman <yuryg@google.com>
Zoe Liu <zoeliu@google.com>
Google Inc.
The Mozilla Foundation
The Xiph.Org Foundation
CHANGELOG (34 changed lines)

@@ -1,9 +1,33 @@
Next Release
  - Incompatible changes:
    The AV1 encoder's default keyframe interval changed to 128 from 9999.
2016-07-20 v1.6.0 "Khaki Campbell Duck"
  This release improves upon the VP9 encoder and speeds up the encoding and
  decoding processes.

  - Upgrading:
    This release is ABI incompatible with 1.5.0 due to a new 'color_range' enum
    in vpx_image and some minor changes to the VP8_COMP structure.

    The default key frame interval for VP9 has changed from 128 to 9999.

  - Enhancement:
    A core focus has been performance for low end Intel processors. SSSE3
    instructions such as 'pshufb' have been avoided and instructions have been
    reordered to better accommodate the more constrained pipelines.

    As a result, devices based on Celeron processors have seen substantial
    decoding improvements. From Indian Runner Duck to Javan Whistling Duck,
    decoding speed improved between 10 and 30%. Between Javan Whistling Duck
    and Khaki Campbell Duck, it improved another 10 to 15%.

    While Celeron benefited most, Core-i5 also improved 5% and 10% between the
    respective releases.

    Realtime performance for WebRTC for both speed and quality has received a
    lot of attention.

  - Bug Fixes:
    A number of fuzzing issues, found variously by Mozilla, Chromium and others,
    have been fixed and we strongly recommend updating.

2016-04-07 v0.1.0 "AOMedia Codec 1"
  This release is the first Alliance for Open Media codec.
2015-11-09 v1.5.0 "Javan Whistling Duck"
  This release improves upon the VP9 encoder and speeds up the encoding and
  decoding processes.
CMakeLists.txt (270 changed lines)

@@ -1,270 +0,0 @@
##
## Copyright (c) 2016, Alliance for Open Media. All rights reserved
##
## This source code is subject to the terms of the BSD 2 Clause License and
## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
## was not distributed with this source code in the LICENSE file, you can
## obtain it at www.aomedia.org/license/software. If the Alliance for Open
## Media Patent License 1.0 was not distributed with this source code in the
## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
##
cmake_minimum_required(VERSION 3.2)
project(AOM C CXX)

set(AOM_ROOT "${CMAKE_CURRENT_SOURCE_DIR}")
set(AOM_CONFIG_DIR "${CMAKE_CURRENT_BINARY_DIR}")
include("${AOM_ROOT}/build/cmake/aom_configure.cmake")

set(AOM_SRCS
    "${AOM_CONFIG_DIR}/aom_config.c"
    "${AOM_CONFIG_DIR}/aom_config.h"
    "${AOM_ROOT}/aom/aom.h"
    "${AOM_ROOT}/aom/aom_codec.h"
    "${AOM_ROOT}/aom/aom_decoder.h"
    "${AOM_ROOT}/aom/aom_encoder.h"
    "${AOM_ROOT}/aom/aom_frame_buffer.h"
    "${AOM_ROOT}/aom/aom_image.h"
    "${AOM_ROOT}/aom/aom_integer.h"
    "${AOM_ROOT}/aom/aomcx.h"
    "${AOM_ROOT}/aom/aomdx.h"
    "${AOM_ROOT}/aom/internal/aom_codec_internal.h"
    "${AOM_ROOT}/aom/src/aom_codec.c"
    "${AOM_ROOT}/aom/src/aom_decoder.c"
    "${AOM_ROOT}/aom/src/aom_encoder.c"
    "${AOM_ROOT}/aom/src/aom_image.c")

set(AOM_DSP_SRCS
    "${AOM_ROOT}/aom_dsp/aom_convolve.c"
    "${AOM_ROOT}/aom_dsp/aom_convolve.h"
    "${AOM_ROOT}/aom_dsp/aom_dsp_common.h"
    "${AOM_ROOT}/aom_dsp/aom_dsp_rtcd.c"
    "${AOM_ROOT}/aom_dsp/aom_filter.h"
    "${AOM_ROOT}/aom_dsp/aom_simd.c"
    "${AOM_ROOT}/aom_dsp/aom_simd.h"
    "${AOM_ROOT}/aom_dsp/aom_simd_inline.h"
    "${AOM_ROOT}/aom_dsp/avg.c"
    "${AOM_ROOT}/aom_dsp/bitreader.h"
    "${AOM_ROOT}/aom_dsp/bitreader_buffer.c"
    "${AOM_ROOT}/aom_dsp/bitreader_buffer.h"
    "${AOM_ROOT}/aom_dsp/bitwriter.h"
    "${AOM_ROOT}/aom_dsp/bitwriter_buffer.c"
    "${AOM_ROOT}/aom_dsp/bitwriter_buffer.h"
    "${AOM_ROOT}/aom_dsp/blend.h"
    "${AOM_ROOT}/aom_dsp/blend_a64_hmask.c"
    "${AOM_ROOT}/aom_dsp/blend_a64_mask.c"
    "${AOM_ROOT}/aom_dsp/blend_a64_vmask.c"
    "${AOM_ROOT}/aom_dsp/dkboolreader.c"
    "${AOM_ROOT}/aom_dsp/dkboolreader.h"
    "${AOM_ROOT}/aom_dsp/dkboolwriter.c"
    "${AOM_ROOT}/aom_dsp/dkboolwriter.h"
    "${AOM_ROOT}/aom_dsp/fwd_txfm.c"
    "${AOM_ROOT}/aom_dsp/fwd_txfm.h"
    "${AOM_ROOT}/aom_dsp/intrapred.c"
    "${AOM_ROOT}/aom_dsp/inv_txfm.c"
    "${AOM_ROOT}/aom_dsp/inv_txfm.h"
    "${AOM_ROOT}/aom_dsp/loopfilter.c"
    "${AOM_ROOT}/aom_dsp/prob.c"
    "${AOM_ROOT}/aom_dsp/prob.h"
    "${AOM_ROOT}/aom_dsp/psnr.c"
    "${AOM_ROOT}/aom_dsp/psnr.h"
    "${AOM_ROOT}/aom_dsp/quantize.c"
    "${AOM_ROOT}/aom_dsp/quantize.h"
    "${AOM_ROOT}/aom_dsp/sad.c"
    "${AOM_ROOT}/aom_dsp/simd/v128_intrinsics.h"
    "${AOM_ROOT}/aom_dsp/simd/v128_intrinsics_c.h"
    "${AOM_ROOT}/aom_dsp/simd/v256_intrinsics.h"
    "${AOM_ROOT}/aom_dsp/simd/v256_intrinsics_c.h"
    "${AOM_ROOT}/aom_dsp/simd/v64_intrinsics.h"
    "${AOM_ROOT}/aom_dsp/simd/v64_intrinsics_c.h"
    "${AOM_ROOT}/aom_dsp/subtract.c"
    "${AOM_ROOT}/aom_dsp/txfm_common.h"
    "${AOM_ROOT}/aom_dsp/variance.c"
    "${AOM_ROOT}/aom_dsp/variance.h")

set(AOM_MEM_SRCS
    "${AOM_ROOT}/aom_mem/aom_mem.c"
    "${AOM_ROOT}/aom_mem/aom_mem.h"
    "${AOM_ROOT}/aom_mem/include/aom_mem_intrnl.h")

set(AOM_SCALE_SRCS
    "${AOM_ROOT}/aom_scale/aom_scale.h"
    "${AOM_ROOT}/aom_scale/aom_scale_rtcd.c"
    "${AOM_ROOT}/aom_scale/generic/aom_scale.c"
    "${AOM_ROOT}/aom_scale/generic/gen_scalers.c"
    "${AOM_ROOT}/aom_scale/generic/yv12config.c"
    "${AOM_ROOT}/aom_scale/generic/yv12extend.c"
    "${AOM_ROOT}/aom_scale/yv12config.h")

# TODO(tomfinegan): Extract aom_ports from aom_util if possible.
set(AOM_UTIL_SRCS
    "${AOM_ROOT}/aom_ports/aom_once.h"
    "${AOM_ROOT}/aom_ports/aom_timer.h"
    "${AOM_ROOT}/aom_ports/bitops.h"
    "${AOM_ROOT}/aom_ports/emmintrin_compat.h"
    "${AOM_ROOT}/aom_ports/mem.h"
    "${AOM_ROOT}/aom_ports/mem_ops.h"
    "${AOM_ROOT}/aom_ports/mem_ops_aligned.h"
    "${AOM_ROOT}/aom_ports/msvc.h"
    "${AOM_ROOT}/aom_ports/system_state.h"
    "${AOM_ROOT}/aom_util/aom_thread.c"
    "${AOM_ROOT}/aom_util/aom_thread.h"
    "${AOM_ROOT}/aom_util/endian_inl.h")

set(AOM_AV1_COMMON_SRCS
    "${AOM_ROOT}/av1/av1_iface_common.h"
    "${AOM_ROOT}/av1/common/alloccommon.c"
    "${AOM_ROOT}/av1/common/alloccommon.h"
    "${AOM_ROOT}/av1/common/av1_fwd_txfm.c"
    "${AOM_ROOT}/av1/common/av1_fwd_txfm.h"
    "${AOM_ROOT}/av1/common/av1_inv_txfm.c"
    "${AOM_ROOT}/av1/common/av1_inv_txfm.h"
    "${AOM_ROOT}/av1/common/av1_rtcd.c"
    "${AOM_ROOT}/av1/common/blockd.c"
    "${AOM_ROOT}/av1/common/blockd.h"
    "${AOM_ROOT}/av1/common/common.h"
    "${AOM_ROOT}/av1/common/common_data.h"
    "${AOM_ROOT}/av1/common/convolve.c"
    "${AOM_ROOT}/av1/common/convolve.h"
    "${AOM_ROOT}/av1/common/debugmodes.c"
    "${AOM_ROOT}/av1/common/entropy.c"
    "${AOM_ROOT}/av1/common/entropy.h"
    "${AOM_ROOT}/av1/common/entropymode.c"
    "${AOM_ROOT}/av1/common/entropymode.h"
    "${AOM_ROOT}/av1/common/entropymv.c"
    "${AOM_ROOT}/av1/common/entropymv.h"
    "${AOM_ROOT}/av1/common/enums.h"
    "${AOM_ROOT}/av1/common/filter.c"
    "${AOM_ROOT}/av1/common/filter.h"
    "${AOM_ROOT}/av1/common/frame_buffers.c"
    "${AOM_ROOT}/av1/common/frame_buffers.h"
    "${AOM_ROOT}/av1/common/idct.c"
    "${AOM_ROOT}/av1/common/idct.h"
    "${AOM_ROOT}/av1/common/loopfilter.c"
    "${AOM_ROOT}/av1/common/loopfilter.h"
    "${AOM_ROOT}/av1/common/mv.h"
    "${AOM_ROOT}/av1/common/mvref_common.c"
    "${AOM_ROOT}/av1/common/mvref_common.h"
    "${AOM_ROOT}/av1/common/odintrin.c"
    "${AOM_ROOT}/av1/common/odintrin.h"
    "${AOM_ROOT}/av1/common/onyxc_int.h"
    "${AOM_ROOT}/av1/common/pred_common.c"
    "${AOM_ROOT}/av1/common/pred_common.h"
    "${AOM_ROOT}/av1/common/quant_common.c"
    "${AOM_ROOT}/av1/common/quant_common.h"
    "${AOM_ROOT}/av1/common/reconinter.c"
    "${AOM_ROOT}/av1/common/reconinter.h"
    "${AOM_ROOT}/av1/common/reconintra.c"
    "${AOM_ROOT}/av1/common/reconintra.h"
    "${AOM_ROOT}/av1/common/scale.c"
    "${AOM_ROOT}/av1/common/scale.h"
    "${AOM_ROOT}/av1/common/scan.c"
    "${AOM_ROOT}/av1/common/scan.h"
    "${AOM_ROOT}/av1/common/seg_common.c"
    "${AOM_ROOT}/av1/common/seg_common.h"
    "${AOM_ROOT}/av1/common/thread_common.c"
    "${AOM_ROOT}/av1/common/thread_common.h"
    "${AOM_ROOT}/av1/common/tile_common.c"
    "${AOM_ROOT}/av1/common/tile_common.h")

set(AOM_AV1_DECODER_SRCS
    "${AOM_ROOT}/av1/av1_dx_iface.c"
    "${AOM_ROOT}/av1/decoder/decodeframe.c"
    "${AOM_ROOT}/av1/decoder/decodeframe.h"
    "${AOM_ROOT}/av1/decoder/decodemv.c"
    "${AOM_ROOT}/av1/decoder/decodemv.h"
    "${AOM_ROOT}/av1/decoder/decoder.c"
    "${AOM_ROOT}/av1/decoder/decoder.h"
    "${AOM_ROOT}/av1/decoder/detokenize.c"
    "${AOM_ROOT}/av1/decoder/detokenize.h"
    "${AOM_ROOT}/av1/decoder/dsubexp.c"
    "${AOM_ROOT}/av1/decoder/dsubexp.h"
    "${AOM_ROOT}/av1/decoder/dthread.c"
    "${AOM_ROOT}/av1/decoder/dthread.h")

set(AOM_AV1_ENCODER_SRCS
    "${AOM_ROOT}/av1/av1_cx_iface.c"
    "${AOM_ROOT}/av1/encoder/aq_complexity.c"
    "${AOM_ROOT}/av1/encoder/aq_complexity.h"
    "${AOM_ROOT}/av1/encoder/aq_cyclicrefresh.c"
    "${AOM_ROOT}/av1/encoder/aq_cyclicrefresh.h"
    "${AOM_ROOT}/av1/encoder/aq_variance.c"
    "${AOM_ROOT}/av1/encoder/aq_variance.h"
    "${AOM_ROOT}/av1/encoder/bitstream.c"
    "${AOM_ROOT}/av1/encoder/bitstream.h"
    "${AOM_ROOT}/av1/encoder/block.h"
    "${AOM_ROOT}/av1/encoder/context_tree.c"
    "${AOM_ROOT}/av1/encoder/context_tree.h"
    "${AOM_ROOT}/av1/encoder/cost.c"
    "${AOM_ROOT}/av1/encoder/cost.h"
    "${AOM_ROOT}/av1/encoder/dct.c"
    "${AOM_ROOT}/av1/encoder/encodeframe.c"
    "${AOM_ROOT}/av1/encoder/encodeframe.h"
    "${AOM_ROOT}/av1/encoder/encodemb.c"
    "${AOM_ROOT}/av1/encoder/encodemb.h"
    "${AOM_ROOT}/av1/encoder/encodemv.c"
    "${AOM_ROOT}/av1/encoder/encodemv.h"
    "${AOM_ROOT}/av1/encoder/encoder.c"
    "${AOM_ROOT}/av1/encoder/encoder.h"
    "${AOM_ROOT}/av1/encoder/ethread.c"
    "${AOM_ROOT}/av1/encoder/ethread.h"
    "${AOM_ROOT}/av1/encoder/extend.c"
    "${AOM_ROOT}/av1/encoder/extend.h"
    "${AOM_ROOT}/av1/encoder/firstpass.c"
    "${AOM_ROOT}/av1/encoder/firstpass.h"
    "${AOM_ROOT}/av1/encoder/hybrid_fwd_txfm.c"
    "${AOM_ROOT}/av1/encoder/hybrid_fwd_txfm.h"
    "${AOM_ROOT}/av1/encoder/lookahead.c"
    "${AOM_ROOT}/av1/encoder/lookahead.h"
    "${AOM_ROOT}/av1/encoder/mbgraph.c"
    "${AOM_ROOT}/av1/encoder/mbgraph.h"
    "${AOM_ROOT}/av1/encoder/mcomp.c"
    "${AOM_ROOT}/av1/encoder/mcomp.h"
    "${AOM_ROOT}/av1/encoder/picklpf.c"
    "${AOM_ROOT}/av1/encoder/picklpf.h"
    "${AOM_ROOT}/av1/encoder/quantize.c"
    "${AOM_ROOT}/av1/encoder/quantize.h"
    "${AOM_ROOT}/av1/encoder/ratectrl.c"
    "${AOM_ROOT}/av1/encoder/ratectrl.h"
    "${AOM_ROOT}/av1/encoder/rd.c"
    "${AOM_ROOT}/av1/encoder/rd.h"
    "${AOM_ROOT}/av1/encoder/rdopt.c"
    "${AOM_ROOT}/av1/encoder/rdopt.h"
    "${AOM_ROOT}/av1/encoder/resize.c"
    "${AOM_ROOT}/av1/encoder/resize.h"
    "${AOM_ROOT}/av1/encoder/segmentation.c"
    "${AOM_ROOT}/av1/encoder/segmentation.h"
    "${AOM_ROOT}/av1/encoder/speed_features.c"
    "${AOM_ROOT}/av1/encoder/speed_features.h"
    "${AOM_ROOT}/av1/encoder/subexp.c"
    "${AOM_ROOT}/av1/encoder/subexp.h"
    "${AOM_ROOT}/av1/encoder/temporal_filter.c"
    "${AOM_ROOT}/av1/encoder/temporal_filter.h"
    "${AOM_ROOT}/av1/encoder/tokenize.c"
    "${AOM_ROOT}/av1/encoder/tokenize.h"
    "${AOM_ROOT}/av1/encoder/treewriter.c"
    "${AOM_ROOT}/av1/encoder/treewriter.h")

# Targets
add_library(aom_dsp ${AOM_DSP_SRCS})
include_directories(${AOM_ROOT} ${AOM_CONFIG_DIR})
add_library(aom_mem ${AOM_MEM_SRCS})
add_library(aom_scale ${AOM_SCALE_SRCS})
include_directories(${AOM_ROOT} ${AOM_CONFIG_DIR})
add_library(aom_util ${AOM_UTIL_SRCS})
add_library(aom_av1_decoder ${AOM_AV1_DECODER_SRCS})
add_library(aom_av1_encoder ${AOM_AV1_ENCODER_SRCS})
add_library(aom ${AOM_SRCS})
target_link_libraries(aom LINK_PUBLIC
                      aom_dsp
                      aom_mem
                      aom_scale
                      aom_util
                      aom_av1_decoder
                      aom_av1_encoder)
add_executable(simple_decoder examples/simple_decoder.c)
include_directories(${AOM_ROOT})
target_link_libraries(simple_decoder LINK_PUBLIC aom)
add_executable(simple_encoder examples/simple_encoder.c)
include_directories(${AOM_ROOT})
target_link_libraries(simple_encoder LINK_PUBLIC aom)
LICENSE (42 changed lines)

@@ -1,27 +1,31 @@
Copyright (c) 2016, Alliance for Open Media. All rights reserved.
Copyright (c) 2010, The WebM Project authors. All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
modification, are permitted provided that the following conditions are
met:

1. Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.
* Redistributions of source code must retain the above copyright
  notice, this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in
   the documentation and/or other materials provided with the
   distribution.
* Redistributions in binary form must reproduce the above copyright
  notice, this list of conditions and the following disclaimer in
  the documentation and/or other materials provided with the
  distribution.

* Neither the name of Google, nor the WebM Project, nor the names
  of its contributors may be used to endorse or promote products
  derived from this software without specific prior written
  permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PATENTS (127 changed lines)

@@ -1,108 +1,23 @@
Alliance for Open Media Patent License 1.0
Additional IP Rights Grant (Patents)
------------------------------------

1. License Terms.

1.1. Patent License. Subject to the terms and conditions of this License, each
     Licensor, on behalf of itself and successors in interest and assigns,
     grants Licensee a non-sublicensable, perpetual, worldwide, non-exclusive,
     no-charge, royalty-free, irrevocable (except as expressly stated in this
     License) patent license to its Necessary Claims to make, use, sell, offer
     for sale, import or distribute any Implementation.

1.2. Conditions.

1.2.1. Availability. As a condition to the grant of rights to Licensee to make,
       sell, offer for sale, import or distribute an Implementation under
       Section 1.1, Licensee must make its Necessary Claims available under
       this License, and must reproduce this License with any Implementation
       as follows:

       a. For distribution in source code, by including this License in the
          root directory of the source code with its Implementation.

       b. For distribution in any other form (including binary, object form,
          and/or hardware description code (e.g., HDL, RTL, Gate Level Netlist,
          GDSII, etc.)), by including this License in the documentation, legal
          notices, and/or other written materials provided with the
          Implementation.

1.2.2. Additional Conditions. This license is directly from Licensor to
       Licensee. Licensee acknowledges as a condition of benefiting from it
       that no rights from Licensor are received from suppliers, distributors,
       or otherwise in connection with this License.

1.3. Defensive Termination. If any Licensee, its Affiliates, or its agents
     initiates patent litigation or files, maintains, or voluntarily
     participates in a lawsuit against another entity or any person asserting
     that any Implementation infringes Necessary Claims, any patent licenses
     granted under this License directly to the Licensee are immediately
     terminated as of the date of the initiation of action unless 1) that suit
     was in response to a corresponding suit regarding an Implementation first
     brought against an initiating entity, or 2) that suit was brought to
     enforce the terms of this License (including intervention in a third-party
     action by a Licensee).

1.4. Disclaimers. The Reference Implementation and Specification are provided
     "AS IS" and without warranty. The entire risk as to implementing or
     otherwise using the Reference Implementation or Specification is assumed
     by the implementer and user. Licensor expressly disclaims any warranties
     (express, implied, or otherwise), including implied warranties of
     merchantability, non-infringement, fitness for a particular purpose, or
     title, related to the material. IN NO EVENT WILL LICENSOR BE LIABLE TO
     ANY OTHER PARTY FOR LOST PROFITS OR ANY FORM OF INDIRECT, SPECIAL,
     INCIDENTAL, OR CONSEQUENTIAL DAMAGES OF ANY CHARACTER FROM ANY CAUSES OF
     ACTION OF ANY KIND WITH RESPECT TO THIS LICENSE, WHETHER BASED ON BREACH
     OF CONTRACT, TORT (INCLUDING NEGLIGENCE), OR OTHERWISE, AND WHETHER OR
     NOT THE OTHER PARTRY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

2. Definitions.

2.1. Affiliate. "Affiliate" means an entity that directly or indirectly
     Controls, is Controlled by, or is under common Control of that party.

2.2. Control. "Control" means direct or indirect control of more than 50% of
     the voting power to elect directors of that corporation, or for any other
     entity, the power to direct management of such entity.

2.3. Decoder. "Decoder" means any decoder that conforms fully with all
     non-optional portions of the Specification.

2.4. Encoder. "Encoder" means any encoder that produces a bitstream that can
     be decoded by a Decoder only to the extent it produces such a bitstream.

2.5. Final Deliverable. "Final Deliverable" means the final version of a
     deliverable approved by the Alliance for Open Media as a Final
     Deliverable.

2.6. Implementation. "Implementation" means any implementation, including the
     Reference Implementation, that is an Encoder and/or a Decoder. An
     Implementation also includes components of an Implementation only to the
     extent they are used as part of an Implementation.

2.7. License. "License" means this license.

2.8. Licensee. "Licensee" means any person or entity who exercises patent
     rights granted under this License.

2.9. Licensor. "Licensor" means (i) any Licensee that makes, sells, offers
     for sale, imports or distributes any Implementation, or (ii) a person
     or entity that has a licensing obligation to the Implementation as a
     result of its membership and/or participation in the Alliance for Open
     Media working group that developed the Specification.

2.10. Necessary Claims. "Necessary Claims" means all claims of patents or
      patent applications, (a) that currently or at any time in the future,
      are owned or controlled by the Licensor, and (b) (i) would be an
      Essential Claim as defined by the W3C Policy as of February 5, 2004
      (https://www.w3.org/Consortium/Patent-Policy-20040205/#def-essential)
      as if the Specification was a W3C Recommendation; or (ii) are infringed
      by the Reference Implementation.

2.11. Reference Implementation. "Reference Implementation" means an Encoder
      and/or Decoder released by the Alliance for Open Media as a Final
      Deliverable.

2.12. Specification. "Specification" means the specification designated by
      the Alliance for Open Media as a Final Deliverable for which this
      License was issued.
"These implementations" means the copyrightable works that implement the WebM
codecs distributed by Google as part of the WebM Project.

Google hereby grants to you a perpetual, worldwide, non-exclusive, no-charge,
royalty-free, irrevocable (except as stated in this section) patent license to
make, have made, use, offer to sell, sell, import, transfer, and otherwise
run, modify and propagate the contents of these implementations of WebM, where
such license applies only to those patent claims, both currently owned by
Google and acquired in the future, licensable by Google that are necessarily
infringed by these implementations of WebM. This grant does not include claims
that would be infringed only as a consequence of further modification of these
implementations. If you or your agent or exclusive licensee institute or order
or agree to the institution of patent litigation or any other patent
enforcement activity against any entity (including a cross-claim or
counterclaim in a lawsuit) alleging that any of these implementations of WebM
or any code incorporated within any of these implementations of WebM
constitute direct or contributory patent infringement, or inducement of
patent infringement, then any patent rights granted to you under this License
for these implementations of WebM shall terminate as of the date such
litigation is filed.
README (25 changed lines)

@@ -1,6 +1,6 @@
README - 23 March 2015
README - 20 July 2016

Welcome to the WebM VP8/AV1 Codec SDK!
Welcome to the WebM VP8/VP9 Codec SDK!

COMPILING THE APPLICATIONS/LIBRARIES:
  The build system used is similar to autotools. Building generally consists of
@@ -33,13 +33,13 @@ COMPILING THE APPLICATIONS/LIBRARIES:

    $ mkdir build
    $ cd build
    $ ../libaom/configure <options>
    $ ../libvpx/configure <options>
    $ make

  3. Configuration options
    The 'configure' script supports a number of options. The --help option can be
    used to get a list of supported options:
    $ ../libaom/configure --help
    $ ../libvpx/configure --help

  4. Cross development
    For cross development, the most notable option is the --target option. The
@@ -47,10 +47,8 @@ COMPILING THE APPLICATIONS/LIBRARIES:
    --help output of the configure script. As of this writing, the list of
    available targets is:

    armv6-linux-rvct
    armv6-linux-gcc
    armv6-none-rvct
    arm64-darwin-gcc
    arm64-linux-gcc
    armv7-android-gcc
    armv7-darwin-gcc
    armv7-linux-rvct
@@ -60,6 +58,7 @@ COMPILING THE APPLICATIONS/LIBRARIES:
    armv7-win32-vs12
    armv7-win32-vs14
    armv7s-darwin-gcc
    armv8-linux-gcc
    mips32-linux-gcc
    mips64-linux-gcc
    sparc-solaris-gcc
@@ -73,6 +72,7 @@ COMPILING THE APPLICATIONS/LIBRARIES:
    x86-darwin12-gcc
    x86-darwin13-gcc
    x86-darwin14-gcc
    x86-darwin15-gcc
    x86-iphonesimulator-gcc
    x86-linux-gcc
    x86-linux-icc
@@ -90,6 +90,7 @@ COMPILING THE APPLICATIONS/LIBRARIES:
    x86_64-darwin12-gcc
    x86_64-darwin13-gcc
    x86_64-darwin14-gcc
    x86_64-darwin15-gcc
    x86_64-iphonesimulator-gcc
    x86_64-linux-gcc
    x86_64-linux-icc
@@ -108,7 +109,7 @@ COMPILING THE APPLICATIONS/LIBRARIES:
    toolchain, the following command could be used (note, POSIX SH syntax, adapt
    to your shell as necessary):

    $ CROSS=mipsel-linux-uclibc- ../libaom/configure
    $ CROSS=mipsel-linux-uclibc- ../libvpx/configure

    In addition, the executables to be invoked can be overridden by specifying the
    environment variables: CC, AR, LD, AS, STRIP, NM. Additional flags can be
@@ -119,13 +120,13 @@ COMPILING THE APPLICATIONS/LIBRARIES:
    This defaults to config.log. This should give a good indication of what went
    wrong. If not, contact us for support.

VP8/AV1 TEST VECTORS:
VP8/VP9 TEST VECTORS:
  The test vectors can be downloaded and verified using the build system after
  running configure. To specify an alternate directory the
  LIBAOM_TEST_DATA_PATH environment variable can be used.
  LIBVPX_TEST_DATA_PATH environment variable can be used.

    $ ./configure --enable-unit-tests
    $ LIBAOM_TEST_DATA_PATH=../-test-data make testdata
    $ LIBVPX_TEST_DATA_PATH=../libvpx-test-data make testdata

CODE STYLE:
  The coding style used by this project is enforced with clang-format using the
@@ -144,5 +145,5 @@ CODE STYLE:

SUPPORT
  This library is an open source project supported by its community. Please
  please email webm-discuss@webmproject.org for help.
  email webm-discuss@webmproject.org for help.
aom/aom.h (160 changed lines)

@@ -1,160 +0,0 @@
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

/*!\defgroup aom AOM
 * \ingroup codecs
 * AOM is aom's newest video compression algorithm that uses motion
 * compensated prediction, Discrete Cosine Transform (DCT) coding of the
 * prediction error signal and context dependent entropy coding techniques
 * based on arithmetic principles. It features:
 * - YUV 4:2:0 image format
 * - Macro-block based coding (16x16 luma plus two 8x8 chroma)
 * - 1/4 (1/8) pixel accuracy motion compensated prediction
 * - 4x4 DCT transform
 * - 128 level linear quantizer
 * - In loop deblocking filter
 * - Context-based entropy coding
 *
 * @{
 */
/*!\file
 * \brief Provides controls common to both the AOM encoder and decoder.
 */
#ifndef AOM_AOM_H_
#define AOM_AOM_H_

#include "./aom_codec.h"
#include "./aom_image.h"

#ifdef __cplusplus
extern "C" {
#endif

/*!\brief Control functions
 *
 * The set of macros define the control functions of AOM interface
 */
enum aom_com_control_id {
  /*!\brief pass in an external frame into decoder to be used as reference frame
   */
  AOM_SET_REFERENCE = 1,
  AOM_COPY_REFERENCE = 2, /**< get a copy of reference frame from the decoder */
  AOM_SET_POSTPROC = 3,   /**< set the decoder's post processing settings */
  AOM_SET_DBG_COLOR_REF_FRAME =
      4, /**< set the reference frames to color for each macroblock */
  AOM_SET_DBG_COLOR_MB_MODES = 5, /**< set which macro block modes to color */
  AOM_SET_DBG_COLOR_B_MODES = 6,  /**< set which blocks modes to color */
  AOM_SET_DBG_DISPLAY_MV = 7,     /**< set which motion vector modes to draw */

  /* TODO(jkoleszar): The encoder incorrectly reuses some of these values (5+)
   * for its control ids. These should be migrated to something like the
   * AOM_DECODER_CTRL_ID_START range next time we're ready to break the ABI.
   */
  AV1_GET_REFERENCE = 128, /**< get a pointer to a reference frame */
  AOM_COMMON_CTRL_ID_MAX,

  AV1_GET_NEW_FRAME_IMAGE = 192, /**< get a pointer to the new frame */

  AOM_DECODER_CTRL_ID_START = 256
};

/*!\brief post process flags
 *
 * The set of macros define AOM decoder post processing flags
 */
enum aom_postproc_level {
  AOM_NOFILTERING = 0,
  AOM_DEBLOCK = 1 << 0,
  AOM_DEMACROBLOCK = 1 << 1,
  AOM_ADDNOISE = 1 << 2,
  AOM_DEBUG_TXT_FRAME_INFO = 1 << 3, /**< print frame information */
  AOM_DEBUG_TXT_MBLK_MODES =
      1 << 4, /**< print macro block modes over each macro block */
  AOM_DEBUG_TXT_DC_DIFF = 1 << 5,   /**< print dc diff for each macro block */
  AOM_DEBUG_TXT_RATE_INFO = 1 << 6, /**< print video rate info (encoder only) */
  AOM_MFQE = 1 << 10
};

/*!\brief post process flags
 *
 * This define a structure that describe the post processing settings. For
 * the best objective measure (using the PSNR metric) set post_proc_flag
 * to AOM_DEBLOCK and deblocking_level to 1.
 */

typedef struct aom_postproc_cfg {
  /*!\brief the types of post processing to be done, should be combination of
   * "aom_postproc_level" */
  int post_proc_flag;
  int deblocking_level; /**< the strength of deblocking, valid range [0, 16] */
  int noise_level; /**< the strength of additive noise, valid range [0, 16] */
} aom_postproc_cfg_t;

/*!\brief reference frame type
 *
 * The set of macros define the type of AOM reference frames
 */
typedef enum aom_ref_frame_type {
  AOM_LAST_FRAME = 1,
  AOM_GOLD_FRAME = 2,
  AOM_ALTR_FRAME = 4
} aom_ref_frame_type_t;

/*!\brief reference frame data struct
 *
 * Define the data struct to access aom reference frames.
 */
typedef struct aom_ref_frame {
  aom_ref_frame_type_t frame_type; /**< which reference frame */
  aom_image_t img;                 /**< reference frame data in image format */
} aom_ref_frame_t;

/*!\brief AV1 specific reference frame data struct
 *
 * Define the data struct to access av1 reference frames.
 */
typedef struct av1_ref_frame {
  int idx;         /**< frame index to get (input) */
  aom_image_t img; /**< img structure to populate (output) */
} av1_ref_frame_t;

/*!\cond */
/*!\brief aom decoder control function parameter type
 *
 * defines the data type for each of AOM decoder control function requires
 */
AOM_CTRL_USE_TYPE(AOM_SET_REFERENCE, aom_ref_frame_t *)
#define AOM_CTRL_AOM_SET_REFERENCE
AOM_CTRL_USE_TYPE(AOM_COPY_REFERENCE, aom_ref_frame_t *)
#define AOM_CTRL_AOM_COPY_REFERENCE
AOM_CTRL_USE_TYPE(AOM_SET_POSTPROC, aom_postproc_cfg_t *)
#define AOM_CTRL_AOM_SET_POSTPROC
AOM_CTRL_USE_TYPE(AOM_SET_DBG_COLOR_REF_FRAME, int)
#define AOM_CTRL_AOM_SET_DBG_COLOR_REF_FRAME
AOM_CTRL_USE_TYPE(AOM_SET_DBG_COLOR_MB_MODES, int)
#define AOM_CTRL_AOM_SET_DBG_COLOR_MB_MODES
AOM_CTRL_USE_TYPE(AOM_SET_DBG_COLOR_B_MODES, int)
#define AOM_CTRL_AOM_SET_DBG_COLOR_B_MODES
AOM_CTRL_USE_TYPE(AOM_SET_DBG_DISPLAY_MV, int)
#define AOM_CTRL_AOM_SET_DBG_DISPLAY_MV
AOM_CTRL_USE_TYPE(AV1_GET_REFERENCE, av1_ref_frame_t *)
#define AOM_CTRL_AV1_GET_REFERENCE
AOM_CTRL_USE_TYPE(AV1_GET_NEW_FRAME_IMAGE, aom_image_t *)
#define AOM_CTRL_AV1_GET_NEW_FRAME_IMAGE

/*!\endcond */
/*! @} - end defgroup aom */

#ifdef __cplusplus
}  // extern "C"
#endif

#endif  // AOM_AOM_H_
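A usage note on the header above: its control IDs are issued through aom_codec_control(), which the header cross-references. A minimal sketch, assuming a decoder context already initialized via aom_codec_dec_init() that has decoded at least one frame (error handling reduced to a message):

```c
#include <stdio.h>

#include "aom/aom.h"          /* AOM_COPY_REFERENCE, aom_ref_frame_t */
#include "aom/aom_decoder.h"  /* aom_codec_ctx_t, aom_codec_control() */

/* Sketch: fetch a copy of the decoder's last reference frame into `ref`. */
static int copy_last_ref(aom_codec_ctx_t *codec, aom_ref_frame_t *ref) {
  ref->frame_type = AOM_LAST_FRAME;  /* aom_ref_frame_type_t, declared above */
  if (aom_codec_control(codec, AOM_COPY_REFERENCE, ref) != AOM_CODEC_OK) {
    fprintf(stderr, "AOM_COPY_REFERENCE failed: %s\n",
            aom_codec_error(codec));
    return -1;
  }
  return 0;  /* ref->img now holds the frame as an aom_image_t. */
}
```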
@@ -1,42 +0,0 @@
|
||||
##
|
||||
## Copyright (c) 2016, Alliance for Open Media. All rights reserved
|
||||
##
|
||||
## This source code is subject to the terms of the BSD 2 Clause License and
|
||||
## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
|
||||
## was not distributed with this source code in the LICENSE file, you can
|
||||
## obtain it at www.aomedia.org/license/software. If the Alliance for Open
|
||||
## Media Patent License 1.0 was not distributed with this source code in the
|
||||
## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
|
||||
##
|
||||
|
||||
|
||||
API_EXPORTS += exports
|
||||
|
||||
API_SRCS-$(CONFIG_AV1_ENCODER) += aom.h
|
||||
API_SRCS-$(CONFIG_AV1_ENCODER) += aomcx.h
|
||||
API_DOC_SRCS-$(CONFIG_AV1_ENCODER) += aom.h
|
||||
API_DOC_SRCS-$(CONFIG_AV1_ENCODER) += aomcx.h
|
||||
|
||||
API_SRCS-$(CONFIG_AV1_DECODER) += aom.h
|
||||
API_SRCS-$(CONFIG_AV1_DECODER) += aomdx.h
|
||||
API_DOC_SRCS-$(CONFIG_AV1_DECODER) += aom.h
|
||||
API_DOC_SRCS-$(CONFIG_AV1_DECODER) += aomdx.h
|
||||
|
||||
API_DOC_SRCS-yes += aom_codec.h
|
||||
API_DOC_SRCS-yes += aom_decoder.h
|
||||
API_DOC_SRCS-yes += aom_encoder.h
|
||||
API_DOC_SRCS-yes += aom_frame_buffer.h
|
||||
API_DOC_SRCS-yes += aom_image.h
|
||||
|
||||
API_SRCS-yes += src/aom_decoder.c
|
||||
API_SRCS-yes += aom_decoder.h
|
||||
API_SRCS-yes += src/aom_encoder.c
|
||||
API_SRCS-yes += aom_encoder.h
|
||||
API_SRCS-yes += internal/aom_codec_internal.h
|
||||
API_SRCS-yes += src/aom_codec.c
|
||||
API_SRCS-yes += src/aom_image.c
|
||||
API_SRCS-yes += aom_codec.h
|
||||
API_SRCS-yes += aom_codec.mk
|
||||
API_SRCS-yes += aom_frame_buffer.h
|
||||
API_SRCS-yes += aom_image.h
|
||||
API_SRCS-yes += aom_integer.h
|
||||
759
aom/aomcx.h
759
aom/aomcx.h
@@ -1,759 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
|
||||
*
|
||||
* This source code is subject to the terms of the BSD 2 Clause License and
|
||||
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
|
||||
* was not distributed with this source code in the LICENSE file, you can
|
||||
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
|
||||
* Media Patent License 1.0 was not distributed with this source code in the
|
||||
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
|
||||
*/
|
||||
#ifndef AOM_AOMCX_H_
|
||||
#define AOM_AOMCX_H_
|
||||
|
||||
/*!\defgroup aom_encoder AOMedia AOM/AV1 Encoder
|
||||
* \ingroup aom
|
||||
*
|
||||
* @{
|
||||
*/
|
||||
#include "./aom.h"
|
||||
#include "./aom_encoder.h"
|
||||
|
||||
/*!\file
|
||||
* \brief Provides definitions for using AOM or AV1 encoder algorithm within the
|
||||
* aom Codec Interface.
|
||||
*/
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/*!\name Algorithm interface for AV1
|
||||
*
|
||||
* This interface provides the capability to encode raw AV1 streams.
|
||||
* @{
|
||||
*/
|
||||
extern aom_codec_iface_t aom_codec_av1_cx_algo;
|
||||
extern aom_codec_iface_t *aom_codec_av1_cx(void);
|
||||
/*!@} - end algorithm interface member group*/
|
||||
|
||||
/*
|
||||
* Algorithm Flags
|
||||
*/
|
||||
|
||||
/*!\brief Don't reference the last frame
|
||||
*
|
||||
* When this flag is set, the encoder will not use the last frame as a
|
||||
* predictor. When not set, the encoder will choose whether to use the
|
||||
* last frame or not automatically.
|
||||
*/
|
||||
#define AOM_EFLAG_NO_REF_LAST (1 << 16)
|
||||
|
||||
/*!\brief Don't reference the golden frame
|
||||
*
|
||||
* When this flag is set, the encoder will not use the golden frame as a
|
||||
* predictor. When not set, the encoder will choose whether to use the
|
||||
* golden frame or not automatically.
|
||||
*/
|
||||
#define AOM_EFLAG_NO_REF_GF (1 << 17)
|
||||
|
||||
/*!\brief Don't reference the alternate reference frame
|
||||
*
|
||||
* When this flag is set, the encoder will not use the alt ref frame as a
|
||||
* predictor. When not set, the encoder will choose whether to use the
|
||||
* alt ref frame or not automatically.
|
||||
*/
|
||||
#define AOM_EFLAG_NO_REF_ARF (1 << 21)
|
||||
|
||||
/*!\brief Don't update the last frame
|
||||
*
|
||||
* When this flag is set, the encoder will not update the last frame with
|
||||
* the contents of the current frame.
|
||||
*/
|
||||
#define AOM_EFLAG_NO_UPD_LAST (1 << 18)
|
||||
|
||||
/*!\brief Don't update the golden frame
|
||||
*
|
||||
* When this flag is set, the encoder will not update the golden frame with
|
||||
* the contents of the current frame.
|
||||
*/
|
||||
#define AOM_EFLAG_NO_UPD_GF (1 << 22)
|
||||
|
||||
/*!\brief Don't update the alternate reference frame
|
||||
*
|
||||
* When this flag is set, the encoder will not update the alt ref frame with
|
||||
* the contents of the current frame.
|
||||
*/
|
||||
#define AOM_EFLAG_NO_UPD_ARF (1 << 23)
|
||||
|
||||
/*!\brief Force golden frame update
|
||||
*
|
||||
* When this flag is set, the encoder copy the contents of the current frame
|
||||
* to the golden frame buffer.
|
||||
*/
|
||||
#define AOM_EFLAG_FORCE_GF (1 << 19)
|
||||
|
||||
/*!\brief Force alternate reference frame update
|
||||
*
|
||||
* When this flag is set, the encoder copy the contents of the current frame
|
||||
* to the alternate reference frame buffer.
|
||||
*/
|
||||
#define AOM_EFLAG_FORCE_ARF (1 << 24)
|
||||
|
||||
/*!\brief Disable entropy update
|
||||
*
|
||||
* When this flag is set, the encoder will not update its internal entropy
|
||||
* model based on the entropy of this frame.
|
||||
*/
|
||||
#define AOM_EFLAG_NO_UPD_ENTROPY (1 << 20)
|
||||
|
||||
/*!\brief AVx encoder control functions
 *
 * This set of macros defines the control functions available for the AVx
 * encoder interface.
 *
 * \sa #aom_codec_control
 */
enum aome_enc_control_id {
  /*!\brief Codec control function to set which reference frame the encoder
   * can use.
   *
   * Supported in codecs: VP8, AV1
   */
  AOME_USE_REFERENCE = 7,

  /*!\brief Codec control function to pass an ROI map to the encoder.
   *
   * Supported in codecs: VP8, AV1
   */
  AOME_SET_ROI_MAP = 8,

  /*!\brief Codec control function to pass an active map to the encoder.
   *
   * Supported in codecs: VP8, AV1
   */
  AOME_SET_ACTIVEMAP,

  /*!\brief Codec control function to set encoder scaling mode.
   *
   * Supported in codecs: VP8, AV1
   */
  AOME_SET_SCALEMODE = 11,

  /*!\brief Codec control function to set encoder internal speed settings.
   *
   * Changes in this value influence, among others, the encoder's selection
   * of motion estimation methods. Values greater than 0 will increase encoder
   * speed at the expense of quality.
   *
   * \note Valid range for VP8: -16..16
   * \note Valid range for AV1: -8..8
   *
   * Supported in codecs: VP8, AV1
   */
  AOME_SET_CPUUSED = 13,

  /*!\brief Codec control function to enable automatic setting and use of
   * alt-ref frames.
   *
   * Supported in codecs: VP8, AV1
   */
  AOME_SET_ENABLEAUTOALTREF,

#if CONFIG_EXT_REFS
  /*!\brief Codec control function to enable automatic setting and use of
   * bwd-pred frames.
   *
   * Supported in codecs: AV1
   */
  AOME_SET_ENABLEAUTOBWDREF,
#endif  // CONFIG_EXT_REFS

  /*!\brief Codec control function to set noise sensitivity.
   *
   * 0: off, 1: OnYOnly, 2: OnYUV,
   * 3: OnYUVAggressive, 4: Adaptive
   *
   * Supported in codecs: VP8
   */
  AOME_SET_NOISE_SENSITIVITY,

  /*!\brief Codec control function to set sharpness.
   *
   * Supported in codecs: VP8, AV1
   */
  AOME_SET_SHARPNESS,

  /*!\brief Codec control function to set the threshold for MBs treated as
   * static.
   *
   * Supported in codecs: VP8, AV1
   */
  AOME_SET_STATIC_THRESHOLD,

  /*!\brief Codec control function to set the number of token partitions.
   *
   * Supported in codecs: VP8
   */
  AOME_SET_TOKEN_PARTITIONS,

  /*!\brief Codec control function to get the last quantizer chosen by the
   * encoder.
   *
   * Return value uses the internal quantizer scale defined by the codec.
   *
   * Supported in codecs: VP8, AV1
   */
  AOME_GET_LAST_QUANTIZER,

  /*!\brief Codec control function to get the last quantizer chosen by the
   * encoder.
   *
   * Return value uses the 0..63 scale as used by the rc_*_quantizer config
   * parameters.
   *
   * Supported in codecs: VP8, AV1
   */
  AOME_GET_LAST_QUANTIZER_64,

  /*!\brief Codec control function to set the max number of frames used to
   * create an arf.
   *
   * Supported in codecs: VP8, AV1
   */
  AOME_SET_ARNR_MAXFRAMES,

  /*!\brief Codec control function to set the filter strength for the arf.
   *
   * Supported in codecs: VP8, AV1
   */
  AOME_SET_ARNR_STRENGTH,

  /*!\deprecated control function to set the filter type to use for the arf. */
  AOME_SET_ARNR_TYPE,

  /*!\brief Codec control function to set visual tuning.
   *
   * Supported in codecs: VP8, AV1
   */
  AOME_SET_TUNING,

  /*!\brief Codec control function to set constrained quality level.
   *
   * \attention For this value to be used, aom_codec_enc_cfg_t::g_usage must
   *            be set to #AOM_CQ.
   * \note Valid range: 0..63
   *
   * Supported in codecs: VP8, AV1
   */
  AOME_SET_CQ_LEVEL,

  /*!\brief Codec control function to set max data rate for intra frames.
   *
   * This value controls additional clamping on the maximum size of a
   * keyframe. It is expressed as a percentage of the average
   * per-frame bitrate, with the special (and default) value 0 meaning
   * unlimited, or no additional clamping beyond the codec's built-in
   * algorithm.
   *
   * For example, to allocate no more than 4.5 frames' worth of bitrate
   * to a keyframe, set this to 450.
   *
   * Supported in codecs: VP8, AV1
   */
  AOME_SET_MAX_INTRA_BITRATE_PCT,

  /*!\brief Codec control function to set reference and update frame flags.
   *
   * Supported in codecs: VP8
   */
  AOME_SET_FRAME_FLAGS,

  /*!\brief Codec control function to set max data rate for inter frames.
   *
   * This value controls additional clamping on the maximum size of an
   * inter frame. It is expressed as a percentage of the average
   * per-frame bitrate, with the special (and default) value 0 meaning
   * unlimited, or no additional clamping beyond the codec's built-in
   * algorithm.
   *
   * For example, to allow no more than 4.5 frames' worth of bitrate
   * to an inter frame, set this to 450.
   *
   * Supported in codecs: AV1
   */
  AV1E_SET_MAX_INTER_BITRATE_PCT,

  /*!\brief Boost percentage for Golden Frame in CBR mode.
   *
   * This value controls the amount of boost given to the Golden Frame in
   * CBR mode. It is expressed as a percentage of the average
   * per-frame bitrate, with the special (and default) value 0 meaning
   * the feature is off, i.e., no golden frame boost in CBR mode and the
   * average bitrate target is used.
   *
   * For example, to allow 100% more bits, i.e., 2X, in a golden frame
   * than the average frame, set this to 100.
   *
   * Supported in codecs: AV1
   */
  AV1E_SET_GF_CBR_BOOST_PCT,

  /*!\brief Codec control function to set encoder screen content mode.
   *
   * 0: off, 1: on, 2: on with more aggressive rate control.
   *
   * Supported in codecs: VP8
   */
  AOME_SET_SCREEN_CONTENT_MODE,

  /*!\brief Codec control function to set lossless encoding mode.
   *
   * AV1 can operate in lossless encoding mode, in which the bitstream
   * produced will be able to decode and reconstruct a perfect copy of the
   * input source. This control function provides a means to switch the
   * encoder into lossless coding mode (1) or normal coding mode (0), which
   * may be lossy.
   * 0 = lossy coding mode
   * 1 = lossless coding mode
   *
   * By default, the encoder operates in normal (possibly lossy) coding mode.
   *
   * Supported in codecs: AV1
   */
  AV1E_SET_LOSSLESS,
#if CONFIG_AOM_QM
  /*!\brief Codec control function to encode with quantisation matrices.
   *
   * AOM can operate with default quantisation matrices dependent on the
   * quantisation level and block type.
   * 0 = do not use quantisation matrices
   * 1 = use quantisation matrices
   *
   * By default, the encoder operates without quantisation matrices.
   *
   * Supported in codecs: AOM
   */
  AV1E_SET_ENABLE_QM,

  /*!\brief Codec control function to set the min quant matrix flatness.
   *
   * AOM can operate with different ranges of quantisation matrices.
   * As quantisation levels increase, the matrices get flatter. This
   * control sets the minimum level of flatness from which the matrices
   * are determined.
   *
   * By default, the encoder sets this minimum at half the available
   * range.
   *
   * Supported in codecs: AOM
   */
  AV1E_SET_QM_MIN,

  /*!\brief Codec control function to set the max quant matrix flatness.
   *
   * AOM can operate with different ranges of quantisation matrices.
   * As quantisation levels increase, the matrices get flatter. This
   * control sets the maximum level of flatness possible.
   *
   * By default, the encoder sets this maximum at the top of the
   * available range.
   *
   * Supported in codecs: AOM
   */
  AV1E_SET_QM_MAX,
#endif

  /*!\brief Codec control function to set the number of tile columns.
   *
   * In encoding and decoding, AV1 allows an input image frame to be
   * partitioned into separate vertical tile columns, which can be encoded
   * or decoded independently. This enables easy implementation of parallel
   * encoding and decoding. This control requests the encoder to use column
   * tiles in encoding an input frame, with the number of tile columns (in
   * log2 units) as the parameter:
   *   0 = 1 tile column
   *   1 = 2 tile columns
   *   2 = 4 tile columns
   *   .....
   *   n = 2**n tile columns
   * The requested tile columns will be capped by the encoder based on image
   * size limitations (the minimum width of a tile column is 256 pixels, the
   * maximum is 4096).
   *
   * By default, the value is 0, i.e. one single column tile for the entire
   * image.
   *
   * Supported in codecs: AV1
   */
  AV1E_SET_TILE_COLUMNS,

  /*!\brief Codec control function to set the number of tile rows.
   *
   * In encoding and decoding, AV1 allows an input image frame to be
   * partitioned into separate horizontal tile rows. Tile rows are encoded
   * or decoded sequentially. Even though encoding/decoding of later tile
   * rows depends on earlier ones, this allows the encoder to output data
   * packets for tile rows prior to completely processing all tile rows in
   * a frame, thereby reducing the latency in processing between input and
   * output. The parameter for this control describes the number of tile
   * rows, which has a valid range of [0, 2]:
   *   0 = 1 tile row
   *   1 = 2 tile rows
   *   2 = 4 tile rows
   *
   * By default, the value is 0, i.e. one single row tile for the entire
   * image.
   *
   * Supported in codecs: AV1
   */
  AV1E_SET_TILE_ROWS,

  /*!\brief Codec control function to enable the frame parallel decoding
   * feature.
   *
   * AV1 has a bitstream feature to reduce decoding dependency between frames
   * by turning off backward update of the probability context used in
   * encoding and decoding. This allows staged parallel processing of more
   * than one video frame in the decoder. This control function provides a
   * means to turn this feature on or off for bitstreams produced by the
   * encoder.
   *
   * By default, this feature is off.
   *
   * Supported in codecs: AV1
   */
  AV1E_SET_FRAME_PARALLEL_DECODING,

  /*!\brief Codec control function to set adaptive quantization mode.
   *
   * AV1 has a segment-based feature that allows the encoder to adaptively
   * change the quantization parameter for each segment within a frame to
   * improve the subjective quality. This control makes the encoder operate
   * in one of several supported AQ modes.
   *
   * By default, the encoder operates with AQ mode 0 (adaptive quantization
   * off).
   *
   * Supported in codecs: AV1
   */
  AV1E_SET_AQ_MODE,

  /*!\brief Codec control function to enable/disable periodic Q boost.
   *
   * One AV1 encoder speed feature is to enable a quality boost by lowering
   * the frame level Q periodically. This control function provides a means
   * to turn this feature on or off.
   * 0 = off
   * 1 = on
   *
   * By default, the encoder is allowed to use this feature for appropriate
   * encoding modes.
   *
   * Supported in codecs: AV1
   */
  AV1E_SET_FRAME_PERIODIC_BOOST,

  /*!\brief Codec control function to set noise sensitivity.
   *
   * 0: off, 1: on (Y only)
   *
   * Supported in codecs: AV1
   */
  AV1E_SET_NOISE_SENSITIVITY,

  /*!\brief Codec control function to set content type.
   * \note Valid parameter range:
   *   AOM_CONTENT_DEFAULT = Regular video content (default)
   *   AOM_CONTENT_SCREEN  = Screen capture content
   *
   * Supported in codecs: AV1
   */
  AV1E_SET_TUNE_CONTENT,

  /*!\brief Codec control function to set color space info.
   * \note Valid range: 0..7, default is "UNKNOWN".
   *   0 = UNKNOWN
   *   1 = BT_601
   *   2 = BT_709
   *   3 = SMPTE_170
   *   4 = SMPTE_240
   *   5 = BT_2020
   *   6 = RESERVED
   *   7 = SRGB
   *
   * Supported in codecs: AV1
   */
  AV1E_SET_COLOR_SPACE,

  /*!\brief Codec control function to set the minimum interval between GF/ARF
   * frames.
   *
   * By default the value is set to 4.
   *
   * Supported in codecs: AV1
   */
  AV1E_SET_MIN_GF_INTERVAL,

  /*!\brief Codec control function to set the maximum interval between GF/ARF
   * frames.
   *
   * By default the value is set to 16.
   *
   * Supported in codecs: AV1
   */
  AV1E_SET_MAX_GF_INTERVAL,

  /*!\brief Codec control function to get an active map back from the encoder.
   *
   * Supported in codecs: AV1
   */
  AV1E_GET_ACTIVEMAP,

  /*!\brief Codec control function to set the color range bit.
   * \note Valid range: 0..1, default is 0
   *   0 = Limited range (16..235 or HBD equivalent)
   *   1 = Full range (0..255 or HBD equivalent)
   *
   * Supported in codecs: AV1
   */
  AV1E_SET_COLOR_RANGE,

  /*!\brief Codec control function to set the intended rendering image size.
   *
   * By default, this is identical to the image size in pixels.
   *
   * Supported in codecs: AV1
   */
  AV1E_SET_RENDER_SIZE,

  /*!\brief Codec control function to set the target level.
   *
   * 255: off (default); 0: only keep level stats; 10: target for level 1.0;
   * 11: target for level 1.1; ... 62: target for level 6.2
   *
   * Supported in codecs: AV1
   */
  AV1E_SET_TARGET_LEVEL,

  /*!\brief Codec control function to get the bitstream level.
   *
   * Supported in codecs: AV1
   */
  AV1E_GET_LEVEL,

  /*!\brief Codec control function to set the intended superblock size.
   *
   * By default, the superblock size is determined separately for each
   * frame by the encoder.
   *
   * Supported in codecs: AV1
   */
  AV1E_SET_SUPERBLOCK_SIZE,
};

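/* Usage sketch for the controls above (illustrative; assumes an encoder
 * context |ctx| initialized against aom_codec_av1_cx(), with error returns
 * checked by the caller):
 *
 *   aom_codec_control(&ctx, AOME_SET_CPUUSED, 4);   // trade quality for speed
 *   aom_codec_control(&ctx, AV1E_SET_LOSSLESS, 0);  // normal (lossy) mode
 *   // Tile columns are given in log2 units: 2 requests 4 columns.
 *   aom_codec_control(&ctx, AV1E_SET_TILE_COLUMNS, 2);
 *   int last_q;
 *   aom_codec_control(&ctx, AOME_GET_LAST_QUANTIZER, &last_q);
 */
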
/*!\brief aom 1-D scaling mode
 *
 * This set of constants defines 1-D aom scaling modes.
 */
typedef enum aom_scaling_mode_1d {
  AOME_NORMAL = 0,
  AOME_FOURFIVE = 1,
  AOME_THREEFIVE = 2,
  AOME_ONETWO = 3
} AOM_SCALING_MODE;

/*!\brief aom region of interest map
 *
 * This defines the data structure for the region of interest map.
 */
typedef struct aom_roi_map {
  /*! An id between 0 and 3 for each 16x16 region within a frame. */
  unsigned char *roi_map;
  unsigned int rows; /**< Number of rows. */
  unsigned int cols; /**< Number of columns. */
  // TODO(paulwilkins): broken for AV1 which has 8 segments
  // q and loop filter deltas for each segment
  // (see MAX_MB_SEGMENTS)
  int delta_q[4];  /**< Quantizer deltas. */
  int delta_lf[4]; /**< Loop filter deltas. */
  /*! Static breakout threshold for each segment. */
  unsigned int static_threshold[4];
} aom_roi_map_t;

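/* Example of building a 4-segment ROI map for a 640x480 frame and handing
 * it to the encoder (a sketch; one map byte covers one 16x16 region, the
 * caller owns the map memory, and |ctx| is an initialized encoder context):
 *
 *   aom_roi_map_t roi;
 *   memset(&roi, 0, sizeof(roi));
 *   roi.rows = 480 / 16;
 *   roi.cols = 640 / 16;
 *   roi.roi_map = (unsigned char *)calloc(roi.rows * roi.cols, 1);
 *   roi.delta_q[1] = -10;  // segment 1 gets a lower quantizer
 *   aom_codec_control(&ctx, AOME_SET_ROI_MAP, &roi);
 *   free(roi.roi_map);
 */
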
/*!\brief aom active region map
 *
 * This defines the data structure for the active region map.
 */
typedef struct aom_active_map {
  /*!\brief Specifies on (1) or off (0) for each 16x16 region within a frame. */
  unsigned char *active_map;
  unsigned int rows; /**< number of rows */
  unsigned int cols; /**< number of cols */
} aom_active_map_t;

/*!\brief aom image scaling mode
 *
 * This defines the data structure for the image scaling mode.
 */
typedef struct aom_scaling_mode {
  AOM_SCALING_MODE h_scaling_mode; /**< horizontal scaling mode */
  AOM_SCALING_MODE v_scaling_mode; /**< vertical scaling mode */
} aom_scaling_mode_t;

/*!\brief VP8 token partition mode
 *
 * This defines the VP8 partitioning mode for compressed data, i.e., the
 * number of sub-streams in the bitstream. Used for parallelized decoding.
 */
typedef enum {
  AOM_ONE_TOKENPARTITION = 0,
  AOM_TWO_TOKENPARTITION = 1,
  AOM_FOUR_TOKENPARTITION = 2,
  AOM_EIGHT_TOKENPARTITION = 3
} aome_token_partitions;

/*!\brief AV1 encoder content type */
typedef enum {
  AOM_CONTENT_DEFAULT,
  AOM_CONTENT_SCREEN,
  AOM_CONTENT_INVALID
} aom_tune_content;

/*!\brief Model tuning parameters
 *
 * Changes the encoder to tune for certain types of input material.
 */
typedef enum { AOM_TUNE_PSNR, AOM_TUNE_SSIM } aom_tune_metric;

/*!\cond */
/*!\brief Encoder control function parameter type
 *
 * Defines the data types that AOME/AV1E control functions take. Note that
 * additional common controls are defined in aom.h.
 */

AOM_CTRL_USE_TYPE_DEPRECATED(AOME_USE_REFERENCE, int)
#define AOM_CTRL_AOME_USE_REFERENCE
AOM_CTRL_USE_TYPE(AOME_SET_FRAME_FLAGS, int)
#define AOM_CTRL_AOME_SET_FRAME_FLAGS
AOM_CTRL_USE_TYPE(AOME_SET_ROI_MAP, aom_roi_map_t *)
#define AOM_CTRL_AOME_SET_ROI_MAP
AOM_CTRL_USE_TYPE(AOME_SET_ACTIVEMAP, aom_active_map_t *)
#define AOM_CTRL_AOME_SET_ACTIVEMAP
AOM_CTRL_USE_TYPE(AOME_SET_SCALEMODE, aom_scaling_mode_t *)
#define AOM_CTRL_AOME_SET_SCALEMODE

AOM_CTRL_USE_TYPE(AOME_SET_CPUUSED, int)
#define AOM_CTRL_AOME_SET_CPUUSED
AOM_CTRL_USE_TYPE(AOME_SET_ENABLEAUTOALTREF, unsigned int)
#define AOM_CTRL_AOME_SET_ENABLEAUTOALTREF

#if CONFIG_EXT_REFS
AOM_CTRL_USE_TYPE(AOME_SET_ENABLEAUTOBWDREF, unsigned int)
#define AOM_CTRL_AOME_SET_ENABLEAUTOBWDREF
#endif  // CONFIG_EXT_REFS

AOM_CTRL_USE_TYPE(AOME_SET_NOISE_SENSITIVITY, unsigned int)
#define AOM_CTRL_AOME_SET_NOISE_SENSITIVITY
AOM_CTRL_USE_TYPE(AOME_SET_SHARPNESS, unsigned int)
#define AOM_CTRL_AOME_SET_SHARPNESS
AOM_CTRL_USE_TYPE(AOME_SET_STATIC_THRESHOLD, unsigned int)
#define AOM_CTRL_AOME_SET_STATIC_THRESHOLD
AOM_CTRL_USE_TYPE(AOME_SET_TOKEN_PARTITIONS, int) /* aome_token_partitions */
#define AOM_CTRL_AOME_SET_TOKEN_PARTITIONS

AOM_CTRL_USE_TYPE(AOME_SET_ARNR_MAXFRAMES, unsigned int)
#define AOM_CTRL_AOME_SET_ARNR_MAXFRAMES
AOM_CTRL_USE_TYPE(AOME_SET_ARNR_STRENGTH, unsigned int)
#define AOM_CTRL_AOME_SET_ARNR_STRENGTH
AOM_CTRL_USE_TYPE_DEPRECATED(AOME_SET_ARNR_TYPE, unsigned int)
#define AOM_CTRL_AOME_SET_ARNR_TYPE
AOM_CTRL_USE_TYPE(AOME_SET_TUNING, int) /* aom_tune_metric */
#define AOM_CTRL_AOME_SET_TUNING
AOM_CTRL_USE_TYPE(AOME_SET_CQ_LEVEL, unsigned int)
#define AOM_CTRL_AOME_SET_CQ_LEVEL

AOM_CTRL_USE_TYPE(AV1E_SET_TILE_COLUMNS, int)
#define AOM_CTRL_AV1E_SET_TILE_COLUMNS
AOM_CTRL_USE_TYPE(AV1E_SET_TILE_ROWS, int)
#define AOM_CTRL_AV1E_SET_TILE_ROWS

AOM_CTRL_USE_TYPE(AOME_GET_LAST_QUANTIZER, int *)
#define AOM_CTRL_AOME_GET_LAST_QUANTIZER
AOM_CTRL_USE_TYPE(AOME_GET_LAST_QUANTIZER_64, int *)
#define AOM_CTRL_AOME_GET_LAST_QUANTIZER_64

AOM_CTRL_USE_TYPE(AOME_SET_MAX_INTRA_BITRATE_PCT, unsigned int)
#define AOM_CTRL_AOME_SET_MAX_INTRA_BITRATE_PCT
AOM_CTRL_USE_TYPE(AV1E_SET_MAX_INTER_BITRATE_PCT, unsigned int)
#define AOM_CTRL_AV1E_SET_MAX_INTER_BITRATE_PCT

AOM_CTRL_USE_TYPE(AOME_SET_SCREEN_CONTENT_MODE, unsigned int)
#define AOM_CTRL_AOME_SET_SCREEN_CONTENT_MODE

AOM_CTRL_USE_TYPE(AV1E_SET_GF_CBR_BOOST_PCT, unsigned int)
#define AOM_CTRL_AV1E_SET_GF_CBR_BOOST_PCT

AOM_CTRL_USE_TYPE(AV1E_SET_LOSSLESS, unsigned int)
#define AOM_CTRL_AV1E_SET_LOSSLESS

#if CONFIG_AOM_QM
AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_QM, unsigned int)
#define AOM_CTRL_AV1E_SET_ENABLE_QM

AOM_CTRL_USE_TYPE(AV1E_SET_QM_MIN, unsigned int)
#define AOM_CTRL_AV1E_SET_QM_MIN

AOM_CTRL_USE_TYPE(AV1E_SET_QM_MAX, unsigned int)
#define AOM_CTRL_AV1E_SET_QM_MAX
#endif

AOM_CTRL_USE_TYPE(AV1E_SET_FRAME_PARALLEL_DECODING, unsigned int)
#define AOM_CTRL_AV1E_SET_FRAME_PARALLEL_DECODING

AOM_CTRL_USE_TYPE(AV1E_SET_AQ_MODE, unsigned int)
#define AOM_CTRL_AV1E_SET_AQ_MODE

AOM_CTRL_USE_TYPE(AV1E_SET_FRAME_PERIODIC_BOOST, unsigned int)
#define AOM_CTRL_AV1E_SET_FRAME_PERIODIC_BOOST

AOM_CTRL_USE_TYPE(AV1E_SET_NOISE_SENSITIVITY, unsigned int)
#define AOM_CTRL_AV1E_SET_NOISE_SENSITIVITY

AOM_CTRL_USE_TYPE(AV1E_SET_TUNE_CONTENT, int) /* aom_tune_content */
#define AOM_CTRL_AV1E_SET_TUNE_CONTENT

AOM_CTRL_USE_TYPE(AV1E_SET_COLOR_SPACE, int)
#define AOM_CTRL_AV1E_SET_COLOR_SPACE

AOM_CTRL_USE_TYPE(AV1E_SET_MIN_GF_INTERVAL, unsigned int)
#define AOM_CTRL_AV1E_SET_MIN_GF_INTERVAL

AOM_CTRL_USE_TYPE(AV1E_SET_MAX_GF_INTERVAL, unsigned int)
#define AOM_CTRL_AV1E_SET_MAX_GF_INTERVAL

AOM_CTRL_USE_TYPE(AV1E_GET_ACTIVEMAP, aom_active_map_t *)
#define AOM_CTRL_AV1E_GET_ACTIVEMAP

AOM_CTRL_USE_TYPE(AV1E_SET_COLOR_RANGE, int)
#define AOM_CTRL_AV1E_SET_COLOR_RANGE

/*!\brief TODO(rbultje): add support for this control in ffmpeg. */
AOM_CTRL_USE_TYPE(AV1E_SET_RENDER_SIZE, int *)
#define AOM_CTRL_AV1E_SET_RENDER_SIZE

AOM_CTRL_USE_TYPE(AV1E_SET_SUPERBLOCK_SIZE, unsigned int)
#define AOM_CTRL_AV1E_SET_SUPERBLOCK_SIZE

AOM_CTRL_USE_TYPE(AV1E_SET_TARGET_LEVEL, unsigned int)
#define AOM_CTRL_AV1E_SET_TARGET_LEVEL

AOM_CTRL_USE_TYPE(AV1E_GET_LEVEL, int *)
#define AOM_CTRL_AV1E_GET_LEVEL
/*!\endcond */
/*! @} - end defgroup aom_encoder */
#ifdef __cplusplus
}  // extern "C"
#endif

#endif  // AOM_AOMCX_H_
aom/aomdx.h
@@ -1,191 +0,0 @@
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

/*!\defgroup aom_decoder AOMedia AOM/AV1 Decoder
 * \ingroup aom
 *
 * @{
 */
/*!\file
 * \brief Provides definitions for using AOM or AV1 within the aom Decoder
 *        interface.
 */
#ifndef AOM_AOMDX_H_
#define AOM_AOMDX_H_

#ifdef __cplusplus
extern "C" {
#endif

/* Include controls common to both the encoder and decoder */
#include "./aom.h"

/*!\name Algorithm interface for AV1
 *
 * This interface provides the capability to decode AV1 streams.
 * @{
 */
extern aom_codec_iface_t aom_codec_av1_dx_algo;
extern aom_codec_iface_t *aom_codec_av1_dx(void);
/*!@} - end algorithm interface member group*/

/** Data structure that stores bit accounting for debug purposes.
 */
typedef struct Accounting Accounting;

/*!\enum aom_dec_control_id
 * \brief AOM decoder control functions
 *
 * This set of macros defines the control functions available for the AOM
 * decoder interface.
 *
 * \sa #aom_codec_control
 */
enum aom_dec_control_id {
  /** control function to get info on which reference frames were updated
   * by the last decode
   */
  AOMD_GET_LAST_REF_UPDATES = AOM_DECODER_CTRL_ID_START,

  /** check if the indicated frame is corrupted */
  AOMD_GET_FRAME_CORRUPTED,

  /** control function to get info on which reference frames were used
   * by the last decode
   */
  AOMD_GET_LAST_REF_USED,

  /** decryption function to decrypt encoded buffer data immediately
   * before decoding. Takes an aom_decrypt_init, which contains
   * a callback function and opaque context pointer.
   */
  AOMD_SET_DECRYPTOR,
  // AOMD_SET_DECRYPTOR = AOMD_SET_DECRYPTOR,

  /** control function to get the dimensions that the current frame is decoded
   * at. This may be different from the intended display size for the frame as
   * specified in the wrapper or frame header (see AV1D_GET_DISPLAY_SIZE). */
  AV1D_GET_FRAME_SIZE,

  /** control function to get the current frame's intended display dimensions
   * (as specified in the wrapper or frame header). This may be different from
   * the decoded dimensions of this frame (see AV1D_GET_FRAME_SIZE). */
  AV1D_GET_DISPLAY_SIZE,

  /** control function to get the bit depth of the stream. */
  AV1D_GET_BIT_DEPTH,

  /** control function to set the byte alignment of the planes in the reference
   * buffers. Valid values are powers of 2, from 32 to 1024. A value of 0 sets
   * legacy alignment, i.e. the Y plane is aligned to 32 bytes, the U plane
   * directly follows the Y plane, and the V plane directly follows the U
   * plane. The default value is 0.
   */
  AV1_SET_BYTE_ALIGNMENT,

  /** control function to invert the decoding order to be from right to left.
   * The function is used in a test to confirm the decoding independence of
   * tile columns. The function may be used in applications where this order
   * of decoding is desired.
   *
   * TODO(yaowu): Rework the unit test that uses this control, and in a future
   * release, this test-only control shall be removed.
   */
  AV1_INVERT_TILE_DECODE_ORDER,

  /** control function to set the skip loop filter flag. Valid values are
   * integers. The decoder will skip the loop filter when its value is set to
   * nonzero. If the loop filter is skipped the decoder may accumulate decode
   * artifacts. The default value is 0.
   */
  AV1_SET_SKIP_LOOP_FILTER,

  /** control function to retrieve a pointer to the Accounting struct. When
   * compiled without --enable-accounting, this returns AOM_CODEC_INCAPABLE.
   * If called before a frame has been decoded, this returns AOM_CODEC_ERROR.
   * The caller should ensure that AOM_CODEC_OK is returned before attempting
   * to dereference the Accounting pointer.
   */
  AV1_GET_ACCOUNTING,

  AOM_DECODER_CTRL_ID_MAX,

  /** control function to set the range of tile decoding. A value that is
   * greater than or equal to zero indicates that only the specific row/column
   * is decoded. A value of -1 indicates that the whole row/column is decoded.
   * As a special case, if both values are -1, the whole frame is decoded.
   */
  AV1_SET_DECODE_TILE_ROW,
  AV1_SET_DECODE_TILE_COL
};

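/* Query sketch for the size controls above (assumes a decoder context |ctx|
 * that has decoded at least one frame; both controls fill an int[2] with
 * width and height):
 *
 *   int frame_size[2], display_size[2];
 *   aom_codec_control(&ctx, AV1D_GET_FRAME_SIZE, frame_size);
 *   aom_codec_control(&ctx, AV1D_GET_DISPLAY_SIZE, display_size);
 */
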
/** Decrypt n bytes of data from input -> output, using the decrypt_state
 * passed in AOMD_SET_DECRYPTOR.
 */
typedef void (*aom_decrypt_cb)(void *decrypt_state, const unsigned char *input,
                               unsigned char *output, int count);

/*!\brief Structure to hold decryption state
 *
 * Defines a structure to hold the decryption state and access function.
 */
typedef struct aom_decrypt_init {
  /*! Decrypt callback. */
  aom_decrypt_cb decrypt_cb;

  /*! Decryption state. */
  void *decrypt_state;
} aom_decrypt_init;

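/* Callback sketch: a toy XOR "decryptor" wired up via AOMD_SET_DECRYPTOR
 * (illustrative only; |ctx| is an initialized decoder context and the
 * single-byte key is a placeholder, not a real cipher):
 *
 *   static void xor_decrypt(void *state, const unsigned char *in,
 *                           unsigned char *out, int count) {
 *     const unsigned char key = *(const unsigned char *)state;
 *     for (int i = 0; i < count; ++i) out[i] = in[i] ^ key;
 *   }
 *
 *   unsigned char key = 0x5A;
 *   aom_decrypt_init init = { xor_decrypt, &key };
 *   aom_codec_control(&ctx, AOMD_SET_DECRYPTOR, &init);
 */
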
/*!\brief A deprecated alias for aom_decrypt_init.
 */
typedef aom_decrypt_init aom_decrypt_init;

/*!\cond */
/*!\brief AOM decoder control function parameter type
 *
 * Defines the data types that AOMD control functions take. Note that
 * additional common controls are defined in aom.h.
 */

AOM_CTRL_USE_TYPE(AOMD_GET_LAST_REF_UPDATES, int *)
#define AOM_CTRL_AOMD_GET_LAST_REF_UPDATES
AOM_CTRL_USE_TYPE(AOMD_GET_FRAME_CORRUPTED, int *)
#define AOM_CTRL_AOMD_GET_FRAME_CORRUPTED
AOM_CTRL_USE_TYPE(AOMD_GET_LAST_REF_USED, int *)
#define AOM_CTRL_AOMD_GET_LAST_REF_USED
AOM_CTRL_USE_TYPE(AOMD_SET_DECRYPTOR, aom_decrypt_init *)
#define AOM_CTRL_AOMD_SET_DECRYPTOR
// AOM_CTRL_USE_TYPE(AOMD_SET_DECRYPTOR, aom_decrypt_init *)
// #define AOM_CTRL_AOMD_SET_DECRYPTOR
AOM_CTRL_USE_TYPE(AV1D_GET_DISPLAY_SIZE, int *)
#define AOM_CTRL_AV1D_GET_DISPLAY_SIZE
AOM_CTRL_USE_TYPE(AV1D_GET_BIT_DEPTH, unsigned int *)
#define AOM_CTRL_AV1D_GET_BIT_DEPTH
AOM_CTRL_USE_TYPE(AV1D_GET_FRAME_SIZE, int *)
#define AOM_CTRL_AV1D_GET_FRAME_SIZE
AOM_CTRL_USE_TYPE(AV1_INVERT_TILE_DECODE_ORDER, int)
#define AOM_CTRL_AV1_INVERT_TILE_DECODE_ORDER
AOM_CTRL_USE_TYPE(AV1_GET_ACCOUNTING, Accounting **)
#define AOM_CTRL_AV1_GET_ACCOUNTING
AOM_CTRL_USE_TYPE(AV1_SET_DECODE_TILE_ROW, int)
#define AOM_CTRL_AV1_SET_DECODE_TILE_ROW
AOM_CTRL_USE_TYPE(AV1_SET_DECODE_TILE_COL, int)
#define AOM_CTRL_AV1_SET_DECODE_TILE_COL
/*!\endcond */
/*! @} - end defgroup aom_decoder */

#ifdef __cplusplus
}  // extern "C"
#endif

#endif  // AOM_AOMDX_H_
@@ -1,16 +0,0 @@
text aom_codec_build_config
text aom_codec_control_
text aom_codec_destroy
text aom_codec_err_to_string
text aom_codec_error
text aom_codec_error_detail
text aom_codec_get_caps
text aom_codec_iface_name
text aom_codec_version
text aom_codec_version_extra_str
text aom_codec_version_str
text aom_img_alloc
text aom_img_flip
text aom_img_free
text aom_img_set_rect
text aom_img_wrap
@@ -1,8 +0,0 @@
text aom_codec_dec_init_ver
text aom_codec_decode
text aom_codec_get_frame
text aom_codec_get_stream_info
text aom_codec_peek_stream_info
text aom_codec_register_put_frame_cb
text aom_codec_register_put_slice_cb
text aom_codec_set_frame_buffer_functions
@@ -1,9 +0,0 @@
text aom_codec_enc_config_default
text aom_codec_enc_config_set
text aom_codec_enc_init_multi_ver
text aom_codec_enc_init_ver
text aom_codec_encode
text aom_codec_get_cx_data
text aom_codec_get_global_headers
text aom_codec_get_preview_frame
text aom_codec_set_cx_data_buf
@@ -1,134 +0,0 @@
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

/*!\file
 * \brief Provides the high level interface to wrap decoder algorithms.
 *
 */
#include <stdarg.h>
#include <stdio.h> /* for vsnprintf() */
#include <stdlib.h>
#include "aom/aom_integer.h"
#include "aom/internal/aom_codec_internal.h"
#include "aom_version.h"

#define SAVE_STATUS(ctx, var) (ctx ? (ctx->err = var) : var)

int aom_codec_version(void) { return VERSION_PACKED; }

const char *aom_codec_version_str(void) { return VERSION_STRING_NOSP; }

const char *aom_codec_version_extra_str(void) { return VERSION_EXTRA; }

const char *aom_codec_iface_name(aom_codec_iface_t *iface) {
  return iface ? iface->name : "<invalid interface>";
}

const char *aom_codec_err_to_string(aom_codec_err_t err) {
  switch (err) {
    case AOM_CODEC_OK: return "Success";
    case AOM_CODEC_ERROR: return "Unspecified internal error";
    case AOM_CODEC_MEM_ERROR: return "Memory allocation error";
    case AOM_CODEC_ABI_MISMATCH: return "ABI version mismatch";
    case AOM_CODEC_INCAPABLE:
      return "Codec does not implement requested capability";
    case AOM_CODEC_UNSUP_BITSTREAM:
      return "Bitstream not supported by this decoder";
    case AOM_CODEC_UNSUP_FEATURE:
      return "Bitstream required feature not supported by this decoder";
    case AOM_CODEC_CORRUPT_FRAME: return "Corrupt frame detected";
    case AOM_CODEC_INVALID_PARAM: return "Invalid parameter";
    case AOM_CODEC_LIST_END: return "End of iterated list";
  }

  return "Unrecognized error code";
}

const char *aom_codec_error(aom_codec_ctx_t *ctx) {
  return (ctx) ? aom_codec_err_to_string(ctx->err)
               : aom_codec_err_to_string(AOM_CODEC_INVALID_PARAM);
}

const char *aom_codec_error_detail(aom_codec_ctx_t *ctx) {
  if (ctx && ctx->err)
    return ctx->priv ? ctx->priv->err_detail : ctx->err_detail;

  return NULL;
}

aom_codec_err_t aom_codec_destroy(aom_codec_ctx_t *ctx) {
  aom_codec_err_t res;

  if (!ctx)
    res = AOM_CODEC_INVALID_PARAM;
  else if (!ctx->iface || !ctx->priv)
    res = AOM_CODEC_ERROR;
  else {
    ctx->iface->destroy((aom_codec_alg_priv_t *)ctx->priv);

    ctx->iface = NULL;
    ctx->name = NULL;
    ctx->priv = NULL;
    res = AOM_CODEC_OK;
  }

  return SAVE_STATUS(ctx, res);
}

aom_codec_caps_t aom_codec_get_caps(aom_codec_iface_t *iface) {
  return (iface) ? iface->caps : 0;
}

aom_codec_err_t aom_codec_control_(aom_codec_ctx_t *ctx, int ctrl_id, ...) {
  aom_codec_err_t res;

  if (!ctx || !ctrl_id)
    res = AOM_CODEC_INVALID_PARAM;
  else if (!ctx->iface || !ctx->priv || !ctx->iface->ctrl_maps)
    res = AOM_CODEC_ERROR;
  else {
    aom_codec_ctrl_fn_map_t *entry;

    res = AOM_CODEC_ERROR;

    for (entry = ctx->iface->ctrl_maps; entry && entry->fn; entry++) {
      if (!entry->ctrl_id || entry->ctrl_id == ctrl_id) {
        va_list ap;

        va_start(ap, ctrl_id);
        res = entry->fn((aom_codec_alg_priv_t *)ctx->priv, ap);
        va_end(ap);
        break;
      }
    }
  }

  return SAVE_STATUS(ctx, res);
}

void aom_internal_error(struct aom_internal_error_info *info,
                        aom_codec_err_t error, const char *fmt, ...) {
  va_list ap;

  info->error_code = error;
  info->has_detail = 0;

  if (fmt) {
    size_t sz = sizeof(info->detail);

    info->has_detail = 1;
    va_start(ap, fmt);
    vsnprintf(info->detail, sz - 1, fmt, ap);
    va_end(ap);
    info->detail[sz - 1] = '\0';
  }

  if (info->setjmp) longjmp(info->jmp, info->error_code);
}

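/* Error-reporting sketch using the helpers above (assumes a codec context
 * |ctx| on which some call just failed):
 *
 *   if (aom_codec_control(&ctx, AOME_SET_CPUUSED, 4) != AOM_CODEC_OK) {
 *     const char *detail = aom_codec_error_detail(&ctx);
 *     fprintf(stderr, "control failed: %s%s%s\n", aom_codec_error(&ctx),
 *             detail ? " - " : "", detail ? detail : "");
 *   }
 */
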
@@ -1,189 +0,0 @@
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

/*!\file
 * \brief Provides the high level interface to wrap decoder algorithms.
 *
 */
#include <string.h>
#include "aom/internal/aom_codec_internal.h"

#define SAVE_STATUS(ctx, var) (ctx ? (ctx->err = var) : var)

static aom_codec_alg_priv_t *get_alg_priv(aom_codec_ctx_t *ctx) {
  return (aom_codec_alg_priv_t *)ctx->priv;
}

aom_codec_err_t aom_codec_dec_init_ver(aom_codec_ctx_t *ctx,
                                       aom_codec_iface_t *iface,
                                       const aom_codec_dec_cfg_t *cfg,
                                       aom_codec_flags_t flags, int ver) {
  aom_codec_err_t res;

  if (ver != AOM_DECODER_ABI_VERSION)
    res = AOM_CODEC_ABI_MISMATCH;
  else if (!ctx || !iface)
    res = AOM_CODEC_INVALID_PARAM;
  else if (iface->abi_version != AOM_CODEC_INTERNAL_ABI_VERSION)
    res = AOM_CODEC_ABI_MISMATCH;
  else if ((flags & AOM_CODEC_USE_POSTPROC) &&
           !(iface->caps & AOM_CODEC_CAP_POSTPROC))
    res = AOM_CODEC_INCAPABLE;
  else if ((flags & AOM_CODEC_USE_ERROR_CONCEALMENT) &&
           !(iface->caps & AOM_CODEC_CAP_ERROR_CONCEALMENT))
    res = AOM_CODEC_INCAPABLE;
  else if ((flags & AOM_CODEC_USE_INPUT_FRAGMENTS) &&
           !(iface->caps & AOM_CODEC_CAP_INPUT_FRAGMENTS))
    res = AOM_CODEC_INCAPABLE;
  else if (!(iface->caps & AOM_CODEC_CAP_DECODER))
    res = AOM_CODEC_INCAPABLE;
  else {
    memset(ctx, 0, sizeof(*ctx));
    ctx->iface = iface;
    ctx->name = iface->name;
    ctx->priv = NULL;
    ctx->init_flags = flags;
    ctx->config.dec = cfg;

    res = ctx->iface->init(ctx, NULL);
    if (res) {
      ctx->err_detail = ctx->priv ? ctx->priv->err_detail : NULL;
      aom_codec_destroy(ctx);
    }
  }

  return SAVE_STATUS(ctx, res);
}

aom_codec_err_t aom_codec_peek_stream_info(aom_codec_iface_t *iface,
                                           const uint8_t *data,
                                           unsigned int data_sz,
                                           aom_codec_stream_info_t *si) {
  aom_codec_err_t res;

  if (!iface || !data || !data_sz || !si ||
      si->sz < sizeof(aom_codec_stream_info_t))
    res = AOM_CODEC_INVALID_PARAM;
  else {
    /* Set default/unknown values */
    si->w = 0;
    si->h = 0;

    res = iface->dec.peek_si(data, data_sz, si);
  }

  return res;
}

aom_codec_err_t aom_codec_get_stream_info(aom_codec_ctx_t *ctx,
                                          aom_codec_stream_info_t *si) {
  aom_codec_err_t res;

  if (!ctx || !si || si->sz < sizeof(aom_codec_stream_info_t))
    res = AOM_CODEC_INVALID_PARAM;
  else if (!ctx->iface || !ctx->priv)
    res = AOM_CODEC_ERROR;
  else {
    /* Set default/unknown values */
    si->w = 0;
    si->h = 0;

    res = ctx->iface->dec.get_si(get_alg_priv(ctx), si);
  }

  return SAVE_STATUS(ctx, res);
}

aom_codec_err_t aom_codec_decode(aom_codec_ctx_t *ctx, const uint8_t *data,
                                 unsigned int data_sz, void *user_priv,
                                 long deadline) {
  aom_codec_err_t res;

  /* Sanity checks */
  /* NULL data ptr allowed if data_sz is 0 too */
  if (!ctx || (!data && data_sz) || (data && !data_sz))
    res = AOM_CODEC_INVALID_PARAM;
  else if (!ctx->iface || !ctx->priv)
    res = AOM_CODEC_ERROR;
  else {
    res = ctx->iface->dec.decode(get_alg_priv(ctx), data, data_sz, user_priv,
                                 deadline);
  }

  return SAVE_STATUS(ctx, res);
}

aom_image_t *aom_codec_get_frame(aom_codec_ctx_t *ctx, aom_codec_iter_t *iter) {
  aom_image_t *img;

  if (!ctx || !iter || !ctx->iface || !ctx->priv)
    img = NULL;
  else
    img = ctx->iface->dec.get_frame(get_alg_priv(ctx), iter);

  return img;
}

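/* Decode-loop sketch built on the functions above (assumes a decoder
 * context |ctx| initialized against aom_codec_av1_dx() and one compressed
 * frame in |buf|/|buf_sz|; error handling abbreviated):
 *
 *   if (aom_codec_decode(&ctx, buf, buf_sz, NULL, 0) == AOM_CODEC_OK) {
 *     aom_codec_iter_t iter = NULL;
 *     aom_image_t *img;
 *     while ((img = aom_codec_get_frame(&ctx, &iter)) != NULL) {
 *       // consume img->planes[] / img->stride[] here
 *     }
 *   }
 */
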
aom_codec_err_t aom_codec_register_put_frame_cb(aom_codec_ctx_t *ctx,
                                                aom_codec_put_frame_cb_fn_t cb,
                                                void *user_priv) {
  aom_codec_err_t res;

  if (!ctx || !cb)
    res = AOM_CODEC_INVALID_PARAM;
  else if (!ctx->iface || !ctx->priv ||
           !(ctx->iface->caps & AOM_CODEC_CAP_PUT_FRAME))
    res = AOM_CODEC_ERROR;
  else {
    ctx->priv->dec.put_frame_cb.u.put_frame = cb;
    ctx->priv->dec.put_frame_cb.user_priv = user_priv;
    res = AOM_CODEC_OK;
  }

  return SAVE_STATUS(ctx, res);
}

aom_codec_err_t aom_codec_register_put_slice_cb(aom_codec_ctx_t *ctx,
                                                aom_codec_put_slice_cb_fn_t cb,
                                                void *user_priv) {
  aom_codec_err_t res;

  if (!ctx || !cb)
    res = AOM_CODEC_INVALID_PARAM;
  else if (!ctx->iface || !ctx->priv ||
           !(ctx->iface->caps & AOM_CODEC_CAP_PUT_SLICE))
    res = AOM_CODEC_ERROR;
  else {
    ctx->priv->dec.put_slice_cb.u.put_slice = cb;
    ctx->priv->dec.put_slice_cb.user_priv = user_priv;
    res = AOM_CODEC_OK;
  }

  return SAVE_STATUS(ctx, res);
}

aom_codec_err_t aom_codec_set_frame_buffer_functions(
    aom_codec_ctx_t *ctx, aom_get_frame_buffer_cb_fn_t cb_get,
    aom_release_frame_buffer_cb_fn_t cb_release, void *cb_priv) {
  aom_codec_err_t res;

  if (!ctx || !cb_get || !cb_release) {
    res = AOM_CODEC_INVALID_PARAM;
  } else if (!ctx->iface || !ctx->priv ||
             !(ctx->iface->caps & AOM_CODEC_CAP_EXTERNAL_FRAME_BUFFER)) {
    res = AOM_CODEC_ERROR;
  } else {
    res = ctx->iface->dec.set_fb_fn(get_alg_priv(ctx), cb_get, cb_release,
                                    cb_priv);
  }

  return SAVE_STATUS(ctx, res);
}
@@ -1,240 +0,0 @@
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <stdlib.h>
#include <string.h>

#include "aom/aom_image.h"
#include "aom/aom_integer.h"
#include "aom_mem/aom_mem.h"

static aom_image_t *img_alloc_helper(aom_image_t *img, aom_img_fmt_t fmt,
                                     unsigned int d_w, unsigned int d_h,
                                     unsigned int buf_align,
                                     unsigned int stride_align,
                                     unsigned char *img_data) {
  unsigned int h, w, s, xcs, ycs, bps;
  unsigned int stride_in_bytes;
  int align;

  /* Treat align==0 like align==1 */
  if (!buf_align) buf_align = 1;

  /* Validate alignment (must be power of 2) */
  if (buf_align & (buf_align - 1)) goto fail;

  /* Treat align==0 like align==1 */
  if (!stride_align) stride_align = 1;

  /* Validate alignment (must be power of 2) */
  if (stride_align & (stride_align - 1)) goto fail;

  /* Get sample size for this format */
  switch (fmt) {
    case AOM_IMG_FMT_RGB32:
    case AOM_IMG_FMT_RGB32_LE:
    case AOM_IMG_FMT_ARGB:
    case AOM_IMG_FMT_ARGB_LE: bps = 32; break;
    case AOM_IMG_FMT_RGB24:
    case AOM_IMG_FMT_BGR24: bps = 24; break;
    case AOM_IMG_FMT_RGB565:
    case AOM_IMG_FMT_RGB565_LE:
    case AOM_IMG_FMT_RGB555:
    case AOM_IMG_FMT_RGB555_LE:
    case AOM_IMG_FMT_UYVY:
    case AOM_IMG_FMT_YUY2:
    case AOM_IMG_FMT_YVYU: bps = 16; break;
    case AOM_IMG_FMT_I420:
    case AOM_IMG_FMT_YV12:
    case AOM_IMG_FMT_AOMI420:
    case AOM_IMG_FMT_AOMYV12: bps = 12; break;
    case AOM_IMG_FMT_I422:
    case AOM_IMG_FMT_I440: bps = 16; break;
    case AOM_IMG_FMT_I444: bps = 24; break;
    case AOM_IMG_FMT_I42016: bps = 24; break;
    case AOM_IMG_FMT_I42216:
    case AOM_IMG_FMT_I44016: bps = 32; break;
    case AOM_IMG_FMT_I44416: bps = 48; break;
    default: bps = 16; break;
  }

  /* Get chroma shift values for this format */
  switch (fmt) {
    case AOM_IMG_FMT_I420:
    case AOM_IMG_FMT_YV12:
    case AOM_IMG_FMT_AOMI420:
    case AOM_IMG_FMT_AOMYV12:
    case AOM_IMG_FMT_I422:
    case AOM_IMG_FMT_I42016:
    case AOM_IMG_FMT_I42216: xcs = 1; break;
    default: xcs = 0; break;
  }

  switch (fmt) {
    case AOM_IMG_FMT_I420:
    case AOM_IMG_FMT_I440:
    case AOM_IMG_FMT_YV12:
    case AOM_IMG_FMT_AOMI420:
    case AOM_IMG_FMT_AOMYV12:
    case AOM_IMG_FMT_I42016:
    case AOM_IMG_FMT_I44016: ycs = 1; break;
    default: ycs = 0; break;
  }

  /* Calculate storage sizes given the chroma subsampling */
  align = (1 << xcs) - 1;
  w = (d_w + align) & ~align;
  align = (1 << ycs) - 1;
  h = (d_h + align) & ~align;
  s = (fmt & AOM_IMG_FMT_PLANAR) ? w : bps * w / 8;
  s = (s + stride_align - 1) & ~(stride_align - 1);
  stride_in_bytes = (fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? s * 2 : s;

  /* Allocate the new image */
  if (!img) {
    img = (aom_image_t *)calloc(1, sizeof(aom_image_t));

    if (!img) goto fail;

    img->self_allocd = 1;
  } else {
    memset(img, 0, sizeof(aom_image_t));
  }

  img->img_data = img_data;

  if (!img_data) {
    const uint64_t alloc_size = (fmt & AOM_IMG_FMT_PLANAR)
                                    ? (uint64_t)h * s * bps / 8
                                    : (uint64_t)h * s;

    if (alloc_size != (size_t)alloc_size) goto fail;

    img->img_data = (uint8_t *)aom_memalign(buf_align, (size_t)alloc_size);
    img->img_data_owner = 1;
  }

  if (!img->img_data) goto fail;

  img->fmt = fmt;
  img->bit_depth = (fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 16 : 8;
  img->w = w;
  img->h = h;
  img->x_chroma_shift = xcs;
  img->y_chroma_shift = ycs;
  img->bps = bps;

  /* Calculate strides */
  img->stride[AOM_PLANE_Y] = img->stride[AOM_PLANE_ALPHA] = stride_in_bytes;
  img->stride[AOM_PLANE_U] = img->stride[AOM_PLANE_V] = stride_in_bytes >> xcs;

  /* Default viewport to entire image */
  if (!aom_img_set_rect(img, 0, 0, d_w, d_h)) return img;

fail:
  aom_img_free(img);
  return NULL;
}

aom_image_t *aom_img_alloc(aom_image_t *img, aom_img_fmt_t fmt,
                           unsigned int d_w, unsigned int d_h,
                           unsigned int align) {
  return img_alloc_helper(img, fmt, d_w, d_h, align, align, NULL);
}

aom_image_t *aom_img_wrap(aom_image_t *img, aom_img_fmt_t fmt, unsigned int d_w,
                          unsigned int d_h, unsigned int stride_align,
                          unsigned char *img_data) {
  /* By setting buf_align = 1, we don't change buffer alignment in this
   * function. */
  return img_alloc_helper(img, fmt, d_w, d_h, 1, stride_align, img_data);
}

int aom_img_set_rect(aom_image_t *img, unsigned int x, unsigned int y,
                     unsigned int w, unsigned int h) {
  unsigned char *data;

  if (x + w <= img->w && y + h <= img->h) {
    img->d_w = w;
    img->d_h = h;

    /* Calculate plane pointers */
    if (!(img->fmt & AOM_IMG_FMT_PLANAR)) {
      img->planes[AOM_PLANE_PACKED] =
          img->img_data + x * img->bps / 8 + y * img->stride[AOM_PLANE_PACKED];
    } else {
      const int bytes_per_sample =
          (img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 2 : 1;
      data = img->img_data;

      if (img->fmt & AOM_IMG_FMT_HAS_ALPHA) {
        img->planes[AOM_PLANE_ALPHA] =
            data + x * bytes_per_sample + y * img->stride[AOM_PLANE_ALPHA];
        data += img->h * img->stride[AOM_PLANE_ALPHA];
      }

      img->planes[AOM_PLANE_Y] =
          data + x * bytes_per_sample + y * img->stride[AOM_PLANE_Y];
      data += img->h * img->stride[AOM_PLANE_Y];

      if (!(img->fmt & AOM_IMG_FMT_UV_FLIP)) {
        img->planes[AOM_PLANE_U] =
            data + (x >> img->x_chroma_shift) * bytes_per_sample +
            (y >> img->y_chroma_shift) * img->stride[AOM_PLANE_U];
        data += (img->h >> img->y_chroma_shift) * img->stride[AOM_PLANE_U];
        img->planes[AOM_PLANE_V] =
            data + (x >> img->x_chroma_shift) * bytes_per_sample +
            (y >> img->y_chroma_shift) * img->stride[AOM_PLANE_V];
      } else {
        img->planes[AOM_PLANE_V] =
            data + (x >> img->x_chroma_shift) * bytes_per_sample +
            (y >> img->y_chroma_shift) * img->stride[AOM_PLANE_V];
        data += (img->h >> img->y_chroma_shift) * img->stride[AOM_PLANE_V];
        img->planes[AOM_PLANE_U] =
            data + (x >> img->x_chroma_shift) * bytes_per_sample +
            (y >> img->y_chroma_shift) * img->stride[AOM_PLANE_U];
      }
    }
    return 0;
  }
  return -1;
}

void aom_img_flip(aom_image_t *img) {
  /* Note: in the pointer adjustment calculation, we want the rhs to be
   * promoted to a signed type. Section 6.3.1.8 of the ISO C99 standard
   * indicates that if the adjustment parameter is unsigned, the stride
   * parameter will be promoted to unsigned, causing errors when the lhs
   * is a larger type than the rhs.
   */
  img->planes[AOM_PLANE_Y] += (signed)(img->d_h - 1) * img->stride[AOM_PLANE_Y];
  img->stride[AOM_PLANE_Y] = -img->stride[AOM_PLANE_Y];

  img->planes[AOM_PLANE_U] += (signed)((img->d_h >> img->y_chroma_shift) - 1) *
                              img->stride[AOM_PLANE_U];
  img->stride[AOM_PLANE_U] = -img->stride[AOM_PLANE_U];

  img->planes[AOM_PLANE_V] += (signed)((img->d_h >> img->y_chroma_shift) - 1) *
                              img->stride[AOM_PLANE_V];
  img->stride[AOM_PLANE_V] = -img->stride[AOM_PLANE_V];

  img->planes[AOM_PLANE_ALPHA] +=
      (signed)(img->d_h - 1) * img->stride[AOM_PLANE_ALPHA];
  img->stride[AOM_PLANE_ALPHA] = -img->stride[AOM_PLANE_ALPHA];
}

void aom_img_free(aom_image_t *img) {
  if (img) {
    if (img->img_data && img->img_data_owner) aom_free(img->img_data);

    if (img->self_allocd) free(img);
  }
}

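/* Allocation sketch using the helpers above (the 640x480 size and 32-byte
 * alignment are illustrative values):
 *
 *   aom_image_t *img = aom_img_alloc(NULL, AOM_IMG_FMT_I420, 640, 480, 32);
 *   if (img) {
 *     // img->planes[AOM_PLANE_Y..V] and img->stride[] are ready to fill.
 *     aom_img_free(img);
 *   }
 */
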
@@ -1,64 +0,0 @@
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <assert.h>
#include "./aom_config.h"
#include "aom/aom_integer.h"
#include "aom_dsp/ans.h"
#include "aom_dsp/prob.h"

static int find_largest(const aom_cdf_prob *const pdf_tab, int num_syms) {
  int largest_idx = -1;
  int largest_p = -1;
  int i;
  for (i = 0; i < num_syms; ++i) {
    int p = pdf_tab[i];
    if (p > largest_p) {
      largest_p = p;
      largest_idx = i;
    }
  }
  return largest_idx;
}

void aom_rans_merge_prob8_pdf(aom_cdf_prob *const out_pdf,
                              const AnsP8 node_prob,
                              const aom_cdf_prob *const src_pdf, int in_syms) {
  int i;
  int adjustment = RANS_PRECISION;
  const int round_fact = ANS_P8_PRECISION >> 1;
  const AnsP8 p1 = ANS_P8_PRECISION - node_prob;
  const int out_syms = in_syms + 1;
  assert(src_pdf != out_pdf);

  out_pdf[0] = node_prob << (RANS_PROB_BITS - ANS_P8_SHIFT);
  adjustment -= out_pdf[0];
  for (i = 0; i < in_syms; ++i) {
    int p = (p1 * src_pdf[i] + round_fact) >> ANS_P8_SHIFT;
    p = AOMMIN(p, (int)RANS_PRECISION - in_syms);
    p = AOMMAX(p, 1);
    out_pdf[i + 1] = p;
    adjustment -= p;
  }

  // Adjust probabilities so they sum to the total probability
  if (adjustment > 0) {
    i = find_largest(out_pdf, out_syms);
    out_pdf[i] += adjustment;
  } else {
    while (adjustment < 0) {
      i = find_largest(out_pdf, out_syms);
      --out_pdf[i];
      assert(out_pdf[i] > 0);
      adjustment++;
    }
  }
}

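/* Worked example for aom_rans_merge_prob8_pdf (arithmetic only, using the
 * constants from aom_dsp/ans.h: ANS_P8_PRECISION = 256, RANS_PRECISION =
 * 32768): merging node_prob = 128 (p = 1/2) with a uniform two-symbol pdf
 * src = {16384, 16384} gives out_pdf[0] = 128 << 7 = 16384 and scales each
 * source entry by p1/256 = 1/2, so out = {16384, 8192, 8192}, which again
 * sums to RANS_PRECISION with no residual adjustment needed.
 */
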
@@ -1,44 +0,0 @@
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef AOM_DSP_ANS_H_
#define AOM_DSP_ANS_H_
// Constants, types and utilities for Asymmetric Numeral Systems
// http://arxiv.org/abs/1311.2540v2

#include <assert.h>
#include "./aom_config.h"
#include "aom/aom_integer.h"
#include "aom_dsp/prob.h"

#ifdef __cplusplus
extern "C" {
#endif  // __cplusplus

typedef uint8_t AnsP8;
#define ANS_P8_PRECISION 256u
#define ANS_P8_SHIFT 8
#define RANS_PROB_BITS 15
#define RANS_PRECISION (1u << RANS_PROB_BITS)

// L_BASE % PRECISION must be 0. Increasing L_BASE beyond 2**15 will cause uabs
// to overflow.
#define L_BASE (RANS_PRECISION)
#define IO_BASE 256
// Range I = { L_BASE, L_BASE + 1, ..., L_BASE * IO_BASE - 1 }

void aom_rans_merge_prob8_pdf(aom_cdf_prob *const out_pdf,
                              const AnsP8 node_prob,
                              const aom_cdf_prob *const src_pdf, int in_syms);
#ifdef __cplusplus
}  // extern "C"
#endif  // __cplusplus
#endif  // AOM_DSP_ANS_H_
@@ -1,146 +0,0 @@
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef AOM_DSP_ANSREADER_H_
#define AOM_DSP_ANSREADER_H_
// A uABS and rANS decoder implementation of Asymmetric Numeral Systems
// http://arxiv.org/abs/1311.2540v2

#include <assert.h>
#include "./aom_config.h"
#include "aom/aom_integer.h"
#include "aom_dsp/prob.h"
#include "aom_dsp/ans.h"
#include "aom_ports/mem_ops.h"
#if CONFIG_ACCOUNTING
#include "av1/common/accounting.h"
#endif

#ifdef __cplusplus
extern "C" {
#endif  // __cplusplus

struct AnsDecoder {
  const uint8_t *buf;
  int buf_offset;
  uint32_t state;
#if CONFIG_ACCOUNTING
  Accounting *accounting;
#endif
};

static INLINE int uabs_read(struct AnsDecoder *ans, AnsP8 p0) {
  AnsP8 p = ANS_P8_PRECISION - p0;
  int s;
  unsigned xp, sp;
  unsigned state = ans->state;
  while (state < L_BASE && ans->buf_offset > 0) {
    state = state * IO_BASE + ans->buf[--ans->buf_offset];
  }
  sp = state * p;
  xp = sp / ANS_P8_PRECISION;
  s = (sp & 0xFF) >= p0;
  if (s)
    ans->state = xp;
  else
    ans->state = state - xp;
  return s;
}

static INLINE int uabs_read_bit(struct AnsDecoder *ans) {
  int s;
  unsigned state = ans->state;
  while (state < L_BASE && ans->buf_offset > 0) {
    state = state * IO_BASE + ans->buf[--ans->buf_offset];
  }
  s = (int)(state & 1);
  ans->state = state >> 1;
  return s;
}

struct rans_dec_sym {
  uint8_t val;
  aom_cdf_prob prob;
  aom_cdf_prob cum_prob;  // not-inclusive
};

static INLINE void fetch_sym(struct rans_dec_sym *out, const aom_cdf_prob *cdf,
                             aom_cdf_prob rem) {
  int i;
  aom_cdf_prob cum_prob = 0, top_prob;
  // TODO(skal): if critical, could be a binary search.
  // Or, better, an O(1) alias-table.
  for (i = 0; rem >= (top_prob = cdf[i]); ++i) {
    cum_prob = top_prob;
  }
  out->val = i;
  out->prob = top_prob - cum_prob;
  out->cum_prob = cum_prob;
}

static INLINE int rans_read(struct AnsDecoder *ans, const aom_cdf_prob *tab) {
  unsigned rem;
  unsigned quo;
  struct rans_dec_sym sym;
  while (ans->state < L_BASE && ans->buf_offset > 0) {
    ans->state = ans->state * IO_BASE + ans->buf[--ans->buf_offset];
  }
  quo = ans->state / RANS_PRECISION;
  rem = ans->state % RANS_PRECISION;
  fetch_sym(&sym, tab, rem);
  ans->state = quo * sym.prob + rem - sym.cum_prob;
  return sym.val;
}

static INLINE int ans_read_init(struct AnsDecoder *const ans,
                                const uint8_t *const buf, int offset) {
  unsigned x;
  if (offset < 1) return 1;
  ans->buf = buf;
  x = buf[offset - 1] >> 6;
  if (x == 0) {
    ans->buf_offset = offset - 1;
    ans->state = buf[offset - 1] & 0x3F;
  } else if (x == 1) {
    if (offset < 2) return 1;
    ans->buf_offset = offset - 2;
    ans->state = mem_get_le16(buf + offset - 2) & 0x3FFF;
  } else if (x == 2) {
    if (offset < 3) return 1;
    ans->buf_offset = offset - 3;
    ans->state = mem_get_le24(buf + offset - 3) & 0x3FFFFF;
  } else if ((buf[offset - 1] & 0xE0) == 0xE0) {
    if (offset < 4) return 1;
    ans->buf_offset = offset - 4;
    ans->state = mem_get_le32(buf + offset - 4) & 0x1FFFFFFF;
  } else {
    // 110xxxxx implies this byte is a superframe marker
    return 1;
  }
#if CONFIG_ACCOUNTING
  ans->accounting = NULL;
#endif
  ans->state += L_BASE;
  if (ans->state >= L_BASE * IO_BASE) return 1;
  return 0;
}

static INLINE int ans_read_end(struct AnsDecoder *const ans) {
  return ans->state == L_BASE;
}

static INLINE int ans_reader_has_error(const struct AnsDecoder *const ans) {
  return ans->state < L_BASE && ans->buf_offset == 0;
}

#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
#endif // __cplusplus
|
||||
#endif // AOM_DSP_ANSREADER_H_
|
||||
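For orientation, a decoder built on this header is initialized from the tail of a buffer and then queried symbol by symbol. A hypothetical driver (the buffer, its length, and the probability kP0 are made up for illustration; only the functions above are real):

// Hypothetical driver for the reader above. `data`/`len` would come from
// a previously encoded ANS buffer; kP0 is an illustrative uABS probability
// of a zero bit, expressed out of ANS_P8_PRECISION (= 256).
static int decode_three_bools(const uint8_t *data, int len, int *out) {
  struct AnsDecoder dec;
  const AnsP8 kP0 = 192;
  int i;
  if (ans_read_init(&dec, data, len)) return -1;  // malformed stream
  for (i = 0; i < 3; ++i) out[i] = uabs_read(&dec, kP0);
  return ans_read_end(&dec) ? 0 : -1;  // state must drain back to L_BASE
}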
@@ -1,120 +0,0 @@
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef AOM_DSP_ANSWRITER_H_
#define AOM_DSP_ANSWRITER_H_
// A uABS and rANS encoder implementation of Asymmetric Numeral Systems
// http://arxiv.org/abs/1311.2540v2

#include <assert.h>
#include "./aom_config.h"
#include "aom/aom_integer.h"
#include "aom_dsp/ans.h"
#include "aom_dsp/prob.h"
#include "aom_ports/mem_ops.h"
#include "av1/common/odintrin.h"

#if RANS_PRECISION <= OD_DIVU_DMAX
#define ANS_DIVREM(quotient, remainder, dividend, divisor) \
  do {                                                     \
    quotient = OD_DIVU_SMALL((dividend), (divisor));       \
    remainder = (dividend) - (quotient) * (divisor);       \
  } while (0)
#else
#define ANS_DIVREM(quotient, remainder, dividend, divisor) \
  do {                                                     \
    quotient = (dividend) / (divisor);                     \
    remainder = (dividend) % (divisor);                    \
  } while (0)
#endif

#define ANS_DIV8(dividend, divisor) OD_DIVU_SMALL((dividend), (divisor))

#ifdef __cplusplus
extern "C" {
#endif  // __cplusplus

struct AnsCoder {
  uint8_t *buf;
  int buf_offset;
  uint32_t state;
};

static INLINE void ans_write_init(struct AnsCoder *const ans,
                                  uint8_t *const buf) {
  ans->buf = buf;
  ans->buf_offset = 0;
  ans->state = L_BASE;
}

static INLINE int ans_write_end(struct AnsCoder *const ans) {
  uint32_t state;
  assert(ans->state >= L_BASE);
  assert(ans->state < L_BASE * IO_BASE);
  state = ans->state - L_BASE;
  if (state < (1 << 6)) {
    ans->buf[ans->buf_offset] = (0x00 << 6) + state;
    return ans->buf_offset + 1;
  } else if (state < (1 << 14)) {
    mem_put_le16(ans->buf + ans->buf_offset, (0x01 << 14) + state);
    return ans->buf_offset + 2;
  } else if (state < (1 << 22)) {
    mem_put_le24(ans->buf + ans->buf_offset, (0x02 << 22) + state);
    return ans->buf_offset + 3;
  } else if (state < (1 << 29)) {
    mem_put_le32(ans->buf + ans->buf_offset, (0x07 << 29) + state);
    return ans->buf_offset + 4;
  } else {
    assert(0 && "State is too large to be serialized");
    return ans->buf_offset;
  }
}

// uABS with normalization
static INLINE void uabs_write(struct AnsCoder *ans, int val, AnsP8 p0) {
  AnsP8 p = ANS_P8_PRECISION - p0;
  const unsigned l_s = val ? p : p0;
  while (ans->state >= L_BASE / ANS_P8_PRECISION * IO_BASE * l_s) {
    ans->buf[ans->buf_offset++] = ans->state % IO_BASE;
    ans->state /= IO_BASE;
  }
  if (!val)
    ans->state = ANS_DIV8(ans->state * ANS_P8_PRECISION, p0);
  else
    ans->state = ANS_DIV8((ans->state + 1) * ANS_P8_PRECISION + p - 1, p) - 1;
}

struct rans_sym {
  aom_cdf_prob prob;
  aom_cdf_prob cum_prob;  // not-inclusive
};

// rANS with normalization
// sym->prob takes the place of l_s from the paper
// RANS_PRECISION is m
static INLINE void rans_write(struct AnsCoder *ans,
                              const struct rans_sym *const sym) {
  const aom_cdf_prob p = sym->prob;
  unsigned quot, rem;
  while (ans->state >= L_BASE / RANS_PRECISION * IO_BASE * p) {
    ans->buf[ans->buf_offset++] = ans->state % IO_BASE;
    ans->state /= IO_BASE;
  }
  ANS_DIVREM(quot, rem, ans->state, p);
  ans->state = quot * RANS_PRECISION + rem + sym->cum_prob;
}

#undef ANS_DIV8
#undef ANS_DIVREM
#ifdef __cplusplus
}  // extern "C"
#endif  // __cplusplus
#endif  // AOM_DSP_ANSWRITER_H_
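One property worth calling out: ANS is stack-like, so the last symbol written is the first one decoded. A minimal round-trip sketch under that assumption (it also assumes ansreader.h is in scope; the buffer size is illustrative):

// Minimal encode/decode round trip. Symbols are written in reverse so
// that the stack-ordered decoder returns them in forward order.
static int uabs_roundtrip(const int *vals, int n, AnsP8 p0) {
  uint8_t buf[1024];  // illustrative scratch buffer, assumed large enough
  struct AnsCoder enc;
  struct AnsDecoder dec;
  int i, len;
  ans_write_init(&enc, buf);
  for (i = n - 1; i >= 0; --i) uabs_write(&enc, vals[i], p0);
  len = ans_write_end(&enc);  // returns the number of bytes produced
  if (ans_read_init(&dec, buf, len)) return 0;
  for (i = 0; i < n; ++i)
    if (uabs_read(&dec, p0) != vals[i]) return 0;
  return ans_read_end(&dec);  // 1 iff the state drained back to L_BASE
}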
@@ -1,57 +0,0 @@
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
#ifndef AOM_DSP_AOM_CONVOLVE_H_
#define AOM_DSP_AOM_CONVOLVE_H_

#include "./aom_config.h"
#include "aom/aom_integer.h"

#ifdef __cplusplus
extern "C" {
#endif

// Note: Fixed size intermediate buffers place limits on parameters
// of some functions. 2d filtering proceeds in 2 steps:
//   (1) Interpolate horizontally into an intermediate buffer, temp.
//   (2) Interpolate temp vertically to derive the sub-pixel result.
// Deriving the maximum number of rows in the temp buffer (135):
// --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
// --Largest block size is 64x64 pixels.
// --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
//   original frame (in 1/16th pixel units).
// --Must round-up because block may be located at sub-pixel position.
// --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
// --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
#if CONFIG_AV1 && CONFIG_EXT_PARTITION
#define MAX_EXT_SIZE 263
#else
#define MAX_EXT_SIZE 135
#endif  // CONFIG_AV1 && CONFIG_EXT_PARTITION

typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
                              const int16_t *filter_x, int x_step_q4,
                              const int16_t *filter_y, int y_step_q4, int w,
                              int h);

#if CONFIG_AOM_HIGHBITDEPTH
typedef void (*highbd_convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride,
                                     uint8_t *dst, ptrdiff_t dst_stride,
                                     const int16_t *filter_x, int x_step_q4,
                                     const int16_t *filter_y, int y_step_q4,
                                     int w, int h, int bd);
#endif

#ifdef __cplusplus
}  // extern "C"
#endif

#endif  // AOM_DSP_AOM_CONVOLVE_H_
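A hedged scalar sketch of the two-step scheme the comment above describes; it ignores sub-pixel stepping (x/y_step_q4) and assumes the source has enough margin for the filter taps, so it models the data flow rather than any real convolve function:

#include <stdint.h>
#include <stddef.h>

#define SKETCH_TAPS 8  // SUBPEL_TAPS in aom_filter.h

static uint8_t clip_u8_sketch(int v) {
  return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
}

// Step (1): filter rows horizontally into temp; step (2): filter temp
// vertically into dst. Assumes w <= 64 and taps summing to 128
// (FILTER_BITS = 7), with SKETCH_TAPS/2 - 1 pixels of margin around src.
static void convolve8_2d_sketch(const uint8_t *src, ptrdiff_t src_stride,
                                uint8_t *dst, ptrdiff_t dst_stride,
                                const int16_t *filter_x,
                                const int16_t *filter_y, int w, int h) {
  uint8_t temp[135 * 64];  // MAX_EXT_SIZE rows of intermediate samples
  const uint8_t *s = src - (SKETCH_TAPS / 2 - 1) * (src_stride + 1);
  const int rows = h + SKETCH_TAPS - 1;
  int x, y, k;
  for (y = 0; y < rows; ++y)  // (1) horizontal pass
    for (x = 0; x < w; ++x) {
      int sum = 1 << 6;  // round to nearest before >> 7
      for (k = 0; k < SKETCH_TAPS; ++k)
        sum += filter_x[k] * s[y * src_stride + x + k];
      temp[y * 64 + x] = clip_u8_sketch(sum >> 7);
    }
  for (y = 0; y < h; ++y)  // (2) vertical pass
    for (x = 0; x < w; ++x) {
      int sum = 1 << 6;
      for (k = 0; k < SKETCH_TAPS; ++k)
        sum += filter_y[k] * temp[(y + k) * 64 + x];
      dst[y * dst_stride + x] = clip_u8_sketch(sum >> 7);
    }
}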
@@ -1,102 +0,0 @@
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef AOM_DSP_AOM_DSP_COMMON_H_
#define AOM_DSP_AOM_DSP_COMMON_H_

#include "./aom_config.h"
#include "aom/aom_integer.h"
#include "aom_ports/mem.h"

#ifdef __cplusplus
extern "C" {
#endif

#ifndef MAX_SB_SIZE
#if CONFIG_AV1 && CONFIG_EXT_PARTITION
#define MAX_SB_SIZE 128
#else
#define MAX_SB_SIZE 64
#endif  // CONFIG_AV1 && CONFIG_EXT_PARTITION
#endif  // ndef MAX_SB_SIZE

#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))

#define IMPLIES(a, b) (!(a) || (b))  // Logical 'a implies b' (or 'a -> b')

#define IS_POWER_OF_TWO(x) (((x) & ((x)-1)) == 0)

// These can be used to give a hint about branch outcomes.
// This can have an effect, even if your target processor has a
// good branch predictor, as these hints can affect basic block
// ordering by the compiler.
#ifdef __GNUC__
#define LIKELY(v) __builtin_expect(v, 1)
#define UNLIKELY(v) __builtin_expect(v, 0)
#else
#define LIKELY(v) (v)
#define UNLIKELY(v) (v)
#endif

#define AOM_SWAP(type, a, b) \
  do {                       \
    type c = (b);            \
    b = a;                   \
    a = c;                   \
  } while (0)

#if CONFIG_AOM_QM
typedef uint16_t qm_val_t;
#define AOM_QM_BITS 6
#endif
#if CONFIG_AOM_HIGHBITDEPTH
// Note:
// tran_low_t is the datatype used for final transform coefficients.
// tran_high_t is the datatype used for intermediate transform stages.
typedef int64_t tran_high_t;
typedef int32_t tran_low_t;
#else
// Note:
// tran_low_t is the datatype used for final transform coefficients.
// tran_high_t is the datatype used for intermediate transform stages.
typedef int32_t tran_high_t;
typedef int16_t tran_low_t;
#endif  // CONFIG_AOM_HIGHBITDEPTH

static INLINE uint8_t clip_pixel(int val) {
  return (val > 255) ? 255 : (val < 0) ? 0 : val;
}

static INLINE int clamp(int value, int low, int high) {
  return value < low ? low : (value > high ? high : value);
}

static INLINE double fclamp(double value, double low, double high) {
  return value < low ? low : (value > high ? high : value);
}

#if CONFIG_AOM_HIGHBITDEPTH
static INLINE uint16_t clip_pixel_highbd(int val, int bd) {
  switch (bd) {
    case 8:
    default: return (uint16_t)clamp(val, 0, 255);
    case 10: return (uint16_t)clamp(val, 0, 1023);
    case 12: return (uint16_t)clamp(val, 0, 4095);
  }
}
#endif  // CONFIG_AOM_HIGHBITDEPTH

#ifdef __cplusplus
}  // extern "C"
#endif

#endif  // AOM_DSP_AOM_DSP_COMMON_H_
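A quick worked note on clip_pixel_highbd above: for bd bits the legal pixel range is [0, (1 << bd) - 1], which is where 255, 1023 and 4095 come from. An equivalent generic form (illustrative only; the switch in the real header lets the compiler specialize the common depths):

// Illustrative generic equivalent of clip_pixel_highbd: clamp to the
// [0, 2^bd - 1] range implied by the bit depth.
static INLINE uint16_t clip_pixel_bd_sketch(int val, int bd) {
  return (uint16_t)clamp(val, 0, (1 << bd) - 1);
}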
@@ -1,16 +0,0 @@
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
#include "./aom_config.h"
#define RTCD_C
#include "./aom_dsp_rtcd.h"
#include "aom_ports/aom_once.h"

void aom_dsp_rtcd() { once(setup_rtcd_internal); }
File diff suppressed because it is too large
@@ -1,43 +0,0 @@
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef AOM_DSP_AOM_FILTER_H_
#define AOM_DSP_AOM_FILTER_H_

#include "aom/aom_integer.h"

#ifdef __cplusplus
extern "C" {
#endif

#define FILTER_BITS 7

#define SUBPEL_BITS 4
#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
#define SUBPEL_SHIFTS (1 << SUBPEL_BITS)
#define SUBPEL_TAPS 8

typedef int16_t InterpKernel[SUBPEL_TAPS];

#define BIL_SUBPEL_BITS 3
#define BIL_SUBPEL_SHIFTS (1 << BIL_SUBPEL_BITS)

// 2 tap bilinear filters
static const uint8_t bilinear_filters_2t[BIL_SUBPEL_SHIFTS][2] = {
  { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
  { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 },
};

#ifdef __cplusplus
}  // extern "C"
#endif

#endif  // AOM_DSP_AOM_FILTER_H_
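The 2-tap kernels above sum to 128 = 1 << FILTER_BITS, so applying one is a weighted average with round-to-nearest; this is exactly the add #0x40 / asr #7 pattern in the ARM assembly further below. A minimal sketch:

// Apply a 2-tap bilinear kernel to horizontally adjacent pixels a and b.
// Taps sum to 1 << FILTER_BITS, and 64 = 1 << (FILTER_BITS - 1) rounds
// to nearest. bilinear_filters_2t[4] = {64, 64} gives (a + b + 1) >> 1.
static uint8_t bilinear_2t_sketch(uint8_t a, uint8_t b,
                                  const uint8_t *filter) {
  return (uint8_t)((filter[0] * a + filter[1] * b + 64) >> FILTER_BITS);
}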
@@ -1,13 +0,0 @@
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

// Set to 1 to add some sanity checks in the fallback C code
const int simd_check = 1;
@@ -1,32 +0,0 @@
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef AOM_DSP_AOM_AOM_SIMD_H_
#define AOM_DSP_AOM_AOM_SIMD_H_

#include <stdint.h>

#if defined(_WIN32)
#include <intrin.h>
#endif

#include "./aom_config.h"
#include "./aom_simd_inline.h"

#if HAVE_NEON
#include "simd/v256_intrinsics_arm.h"
#elif HAVE_SSE2
#include "simd/v256_intrinsics_x86.h"
#else
#include "simd/v256_intrinsics.h"
#endif

#endif  // AOM_DSP_AOM_AOM_SIMD_H_
@@ -1,21 +0,0 @@
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef AOM_DSP_AOM_SIMD_INLINE_H_
#define AOM_DSP_AOM_SIMD_INLINE_H_

#include "aom/aom_integer.h"

#ifndef SIMD_INLINE
#define SIMD_INLINE static AOM_FORCE_INLINE
#endif

#endif  // AOM_DSP_AOM_SIMD_INLINE_H_
@@ -1,240 +0,0 @@
;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;


    EXPORT  |aom_filter_block2d_bil_first_pass_media|
    EXPORT  |aom_filter_block2d_bil_second_pass_media|

    AREA    |.text|, CODE, READONLY  ; name this block of code

;-------------------------------------
; r0    unsigned char  *src_ptr,
; r1    unsigned short *dst_ptr,
; r2    unsigned int    src_pitch,
; r3    unsigned int    height,
; stack unsigned int    width,
; stack const short    *aom_filter
;-------------------------------------
; The output is transposed and stored in the output array, to make the
; second pass filtering easy.
|aom_filter_block2d_bil_first_pass_media| PROC
    stmdb   sp!, {r4 - r11, lr}

    ldr     r11, [sp, #40]          ; aom_filter address
    ldr     r4, [sp, #36]           ; width

    mov     r12, r3                 ; outer-loop counter

    add     r7, r2, r4              ; preload next row
    pld     [r0, r7]

    sub     r2, r2, r4              ; src increment for height loop

    ldr     r5, [r11]               ; load up filter coefficients

    mov     r3, r3, lsl #1          ; height*2
    add     r3, r3, #2              ; plus 2 to make output buffer 4-bit aligned since height is actually (height+1)

    mov     r11, r1                 ; save dst_ptr for each row

    cmp     r5, #128                ; if filter coef = 128, then skip the filter
    beq     bil_null_1st_filter

|bil_height_loop_1st_v6|
    ldrb    r6, [r0]                ; load source data
    ldrb    r7, [r0, #1]
    ldrb    r8, [r0, #2]
    mov     lr, r4, lsr #2          ; 4-in-parallel loop counter

|bil_width_loop_1st_v6|
    ldrb    r9, [r0, #3]
    ldrb    r10, [r0, #4]

    pkhbt   r6, r6, r7, lsl #16     ; src[1] | src[0]
    pkhbt   r7, r7, r8, lsl #16     ; src[2] | src[1]

    smuad   r6, r6, r5              ; apply the filter
    pkhbt   r8, r8, r9, lsl #16     ; src[3] | src[2]
    smuad   r7, r7, r5
    pkhbt   r9, r9, r10, lsl #16    ; src[4] | src[3]

    smuad   r8, r8, r5
    smuad   r9, r9, r5

    add     r0, r0, #4
    subs    lr, lr, #1

    add     r6, r6, #0x40           ; round_shift_and_clamp
    add     r7, r7, #0x40
    usat    r6, #16, r6, asr #7
    usat    r7, #16, r7, asr #7

    strh    r6, [r1], r3            ; result is transposed and stored

    add     r8, r8, #0x40           ; round_shift_and_clamp
    strh    r7, [r1], r3
    add     r9, r9, #0x40
    usat    r8, #16, r8, asr #7
    usat    r9, #16, r9, asr #7

    strh    r8, [r1], r3            ; result is transposed and stored

    ldrneb  r6, [r0]                ; load source data
    strh    r9, [r1], r3

    ldrneb  r7, [r0, #1]
    ldrneb  r8, [r0, #2]

    bne     bil_width_loop_1st_v6

    add     r0, r0, r2              ; move to next input row
    subs    r12, r12, #1

    add     r9, r2, r4, lsl #1      ; adding back block width
    pld     [r0, r9]                ; preload next row

    add     r11, r11, #2            ; move over to next column
    mov     r1, r11

    bne     bil_height_loop_1st_v6

    ldmia   sp!, {r4 - r11, pc}

|bil_null_1st_filter|
|bil_height_loop_null_1st|
    mov     lr, r4, lsr #2          ; loop counter

|bil_width_loop_null_1st|
    ldrb    r6, [r0]                ; load data
    ldrb    r7, [r0, #1]
    ldrb    r8, [r0, #2]
    ldrb    r9, [r0, #3]

    strh    r6, [r1], r3            ; store it to immediate buffer
    add     r0, r0, #4
    strh    r7, [r1], r3
    subs    lr, lr, #1
    strh    r8, [r1], r3
    strh    r9, [r1], r3

    bne     bil_width_loop_null_1st

    subs    r12, r12, #1
    add     r0, r0, r2              ; move to next input line
    add     r11, r11, #2            ; move over to next column
    mov     r1, r11

    bne     bil_height_loop_null_1st

    ldmia   sp!, {r4 - r11, pc}

    ENDP  ; |aom_filter_block2d_bil_first_pass_media|


;---------------------------------
; r0    unsigned short *src_ptr,
; r1    unsigned char  *dst_ptr,
; r2    int             dst_pitch,
; r3    unsigned int    height,
; stack unsigned int    width,
; stack const short    *aom_filter
;---------------------------------
|aom_filter_block2d_bil_second_pass_media| PROC
    stmdb   sp!, {r4 - r11, lr}

    ldr     r11, [sp, #40]          ; aom_filter address
    ldr     r4, [sp, #36]           ; width

    ldr     r5, [r11]               ; load up filter coefficients
    mov     r12, r4                 ; outer-loop counter = width, since we work on transposed data matrix
    mov     r11, r1

    cmp     r5, #128                ; if filter coef = 128, then skip the filter
    beq     bil_null_2nd_filter

|bil_height_loop_2nd|
    ldr     r6, [r0]                ; load the data
    ldr     r8, [r0, #4]
    ldrh    r10, [r0, #8]
    mov     lr, r3, lsr #2          ; loop counter

|bil_width_loop_2nd|
    pkhtb   r7, r6, r8              ; src[1] | src[2]
    pkhtb   r9, r8, r10             ; src[3] | src[4]

    smuad   r6, r6, r5              ; apply filter
    smuad   r8, r8, r5              ; apply filter

    subs    lr, lr, #1

    smuadx  r7, r7, r5              ; apply filter
    smuadx  r9, r9, r5              ; apply filter

    add     r0, r0, #8

    add     r6, r6, #0x40           ; round_shift_and_clamp
    add     r7, r7, #0x40
    usat    r6, #8, r6, asr #7
    usat    r7, #8, r7, asr #7
    strb    r6, [r1], r2            ; the result is transposed back and stored

    add     r8, r8, #0x40           ; round_shift_and_clamp
    strb    r7, [r1], r2
    add     r9, r9, #0x40
    usat    r8, #8, r8, asr #7
    usat    r9, #8, r9, asr #7
    strb    r8, [r1], r2            ; the result is transposed back and stored

    ldrne   r6, [r0]                ; load data
    strb    r9, [r1], r2
    ldrne   r8, [r0, #4]
    ldrneh  r10, [r0, #8]

    bne     bil_width_loop_2nd

    subs    r12, r12, #1
    add     r0, r0, #4              ; update src for next row
    add     r11, r11, #1
    mov     r1, r11

    bne     bil_height_loop_2nd
    ldmia   sp!, {r4 - r11, pc}

|bil_null_2nd_filter|
|bil_height_loop_null_2nd|
    mov     lr, r3, lsr #2

|bil_width_loop_null_2nd|
    ldr     r6, [r0], #4            ; load data
    subs    lr, lr, #1
    ldr     r8, [r0], #4

    strb    r6, [r1], r2            ; store data
    mov     r7, r6, lsr #16
    strb    r7, [r1], r2
    mov     r9, r8, lsr #16
    strb    r8, [r1], r2
    strb    r9, [r1], r2

    bne     bil_width_loop_null_2nd

    subs    r12, r12, #1
    add     r0, r0, #4
    add     r11, r11, #1
    mov     r1, r11

    bne     bil_height_loop_null_2nd

    ldmia   sp!, {r4 - r11, pc}
    ENDP  ; |aom_filter_block2d_bil_second_pass_media|

    END
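The first pass above stores its results transposed so that the second pass can reuse the same unit-stride inner loop. A hedged C model of that layout (names and the exact stride bookkeeping are illustrative; the assembly also pads the column stride for alignment):

// C model of the transposed intermediate used by the two passes above.
// Pass one writes the sample for (row y, column x) at temp[x * height + y],
// so pass two filters vertically by walking consecutive elements.
static void bil_first_pass_sketch(const uint8_t *src, uint16_t *temp,
                                  int src_pitch, int height, int width,
                                  const int16_t *filter) {
  int x, y;
  // `height` already includes the extra row the vertical pass needs
  // (e.g. 9 rows are requested for an 8x8 block by the caller further on).
  for (y = 0; y < height; ++y)
    for (x = 0; x < width; ++x) {
      const int sum = filter[0] * src[y * src_pitch + x] +
                      filter[1] * src[y * src_pitch + x + 1] + 64;
      temp[x * height + y] = (uint16_t)(sum >> 7);  // transposed store
    }
}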
@@ -1,199 +0,0 @@
/*
 *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>

#include "./aom_dsp_rtcd.h"

static void hadamard8x8_one_pass(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2,
                                 int16x8_t *a3, int16x8_t *a4, int16x8_t *a5,
                                 int16x8_t *a6, int16x8_t *a7) {
  const int16x8_t b0 = vaddq_s16(*a0, *a1);
  const int16x8_t b1 = vsubq_s16(*a0, *a1);
  const int16x8_t b2 = vaddq_s16(*a2, *a3);
  const int16x8_t b3 = vsubq_s16(*a2, *a3);
  const int16x8_t b4 = vaddq_s16(*a4, *a5);
  const int16x8_t b5 = vsubq_s16(*a4, *a5);
  const int16x8_t b6 = vaddq_s16(*a6, *a7);
  const int16x8_t b7 = vsubq_s16(*a6, *a7);

  const int16x8_t c0 = vaddq_s16(b0, b2);
  const int16x8_t c1 = vaddq_s16(b1, b3);
  const int16x8_t c2 = vsubq_s16(b0, b2);
  const int16x8_t c3 = vsubq_s16(b1, b3);
  const int16x8_t c4 = vaddq_s16(b4, b6);
  const int16x8_t c5 = vaddq_s16(b5, b7);
  const int16x8_t c6 = vsubq_s16(b4, b6);
  const int16x8_t c7 = vsubq_s16(b5, b7);

  *a0 = vaddq_s16(c0, c4);
  *a1 = vsubq_s16(c2, c6);
  *a2 = vsubq_s16(c0, c4);
  *a3 = vaddq_s16(c2, c6);
  *a4 = vaddq_s16(c3, c7);
  *a5 = vsubq_s16(c3, c7);
  *a6 = vsubq_s16(c1, c5);
  *a7 = vaddq_s16(c1, c5);
}

// TODO(johannkoenig): Make a transpose library and dedup with idct. Consider
// reversing transpose order which may make it easier for the compiler to
// reconcile the vtrn.64 moves.
static void transpose8x8(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2,
                         int16x8_t *a3, int16x8_t *a4, int16x8_t *a5,
                         int16x8_t *a6, int16x8_t *a7) {
  // Swap 64 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 08 09 10 11 12 13 14 15
  // a2: 16 17 18 19 20 21 22 23
  // a3: 24 25 26 27 28 29 30 31
  // a4: 32 33 34 35 36 37 38 39
  // a5: 40 41 42 43 44 45 46 47
  // a6: 48 49 50 51 52 53 54 55
  // a7: 56 57 58 59 60 61 62 63
  // to:
  // a04_lo: 00 01 02 03 32 33 34 35
  // a15_lo: 08 09 10 11 40 41 42 43
  // a26_lo: 16 17 18 19 48 49 50 51
  // a37_lo: 24 25 26 27 56 57 58 59
  // a04_hi: 04 05 06 07 36 37 38 39
  // a15_hi: 12 13 14 15 44 45 46 47
  // a26_hi: 20 21 22 23 52 53 54 55
  // a37_hi: 28 29 30 31 60 61 62 63
  const int16x8_t a04_lo = vcombine_s16(vget_low_s16(*a0), vget_low_s16(*a4));
  const int16x8_t a15_lo = vcombine_s16(vget_low_s16(*a1), vget_low_s16(*a5));
  const int16x8_t a26_lo = vcombine_s16(vget_low_s16(*a2), vget_low_s16(*a6));
  const int16x8_t a37_lo = vcombine_s16(vget_low_s16(*a3), vget_low_s16(*a7));
  const int16x8_t a04_hi = vcombine_s16(vget_high_s16(*a0), vget_high_s16(*a4));
  const int16x8_t a15_hi = vcombine_s16(vget_high_s16(*a1), vget_high_s16(*a5));
  const int16x8_t a26_hi = vcombine_s16(vget_high_s16(*a2), vget_high_s16(*a6));
  const int16x8_t a37_hi = vcombine_s16(vget_high_s16(*a3), vget_high_s16(*a7));

  // Swap 32 bit elements resulting in:
  // a0246_lo:
  // 00 01 16 17 32 33 48 49
  // 02 03 18 19 34 35 50 51
  // a1357_lo:
  // 08 09 24 25 40 41 56 57
  // 10 11 26 27 42 43 58 59
  // a0246_hi:
  // 04 05 20 21 36 37 52 53
  // 06 07 22 23 38 39 54 55
  // a1357_hi:
  // 12 13 28 29 44 45 60 61
  // 14 15 30 31 46 47 62 63
  const int32x4x2_t a0246_lo =
      vtrnq_s32(vreinterpretq_s32_s16(a04_lo), vreinterpretq_s32_s16(a26_lo));
  const int32x4x2_t a1357_lo =
      vtrnq_s32(vreinterpretq_s32_s16(a15_lo), vreinterpretq_s32_s16(a37_lo));
  const int32x4x2_t a0246_hi =
      vtrnq_s32(vreinterpretq_s32_s16(a04_hi), vreinterpretq_s32_s16(a26_hi));
  const int32x4x2_t a1357_hi =
      vtrnq_s32(vreinterpretq_s32_s16(a15_hi), vreinterpretq_s32_s16(a37_hi));

  // Swap 16 bit elements resulting in:
  // b0:
  // 00 08 16 24 32 40 48 56
  // 01 09 17 25 33 41 49 57
  // b1:
  // 02 10 18 26 34 42 50 58
  // 03 11 19 27 35 43 51 59
  // b2:
  // 04 12 20 28 36 44 52 60
  // 05 13 21 29 37 45 53 61
  // b3:
  // 06 14 22 30 38 46 54 62
  // 07 15 23 31 39 47 55 63
  const int16x8x2_t b0 = vtrnq_s16(vreinterpretq_s16_s32(a0246_lo.val[0]),
                                   vreinterpretq_s16_s32(a1357_lo.val[0]));
  const int16x8x2_t b1 = vtrnq_s16(vreinterpretq_s16_s32(a0246_lo.val[1]),
                                   vreinterpretq_s16_s32(a1357_lo.val[1]));
  const int16x8x2_t b2 = vtrnq_s16(vreinterpretq_s16_s32(a0246_hi.val[0]),
                                   vreinterpretq_s16_s32(a1357_hi.val[0]));
  const int16x8x2_t b3 = vtrnq_s16(vreinterpretq_s16_s32(a0246_hi.val[1]),
                                   vreinterpretq_s16_s32(a1357_hi.val[1]));

  *a0 = b0.val[0];
  *a1 = b0.val[1];
  *a2 = b1.val[0];
  *a3 = b1.val[1];
  *a4 = b2.val[0];
  *a5 = b2.val[1];
  *a6 = b3.val[0];
  *a7 = b3.val[1];
}

void aom_hadamard_8x8_neon(const int16_t *src_diff, int src_stride,
                           int16_t *coeff) {
  int16x8_t a0 = vld1q_s16(src_diff);
  int16x8_t a1 = vld1q_s16(src_diff + src_stride);
  int16x8_t a2 = vld1q_s16(src_diff + 2 * src_stride);
  int16x8_t a3 = vld1q_s16(src_diff + 3 * src_stride);
  int16x8_t a4 = vld1q_s16(src_diff + 4 * src_stride);
  int16x8_t a5 = vld1q_s16(src_diff + 5 * src_stride);
  int16x8_t a6 = vld1q_s16(src_diff + 6 * src_stride);
  int16x8_t a7 = vld1q_s16(src_diff + 7 * src_stride);

  hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);

  transpose8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);

  hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);

  // Skip the second transpose because it is not required.

  vst1q_s16(coeff + 0, a0);
  vst1q_s16(coeff + 8, a1);
  vst1q_s16(coeff + 16, a2);
  vst1q_s16(coeff + 24, a3);
  vst1q_s16(coeff + 32, a4);
  vst1q_s16(coeff + 40, a5);
  vst1q_s16(coeff + 48, a6);
  vst1q_s16(coeff + 56, a7);
}

void aom_hadamard_16x16_neon(const int16_t *src_diff, int src_stride,
                             int16_t *coeff) {
  int i;

  /* Rearrange 16x16 to 8x32 and remove stride.
   * Top left first. */
  aom_hadamard_8x8_neon(src_diff + 0 + 0 * src_stride, src_stride, coeff + 0);
  /* Top right. */
  aom_hadamard_8x8_neon(src_diff + 8 + 0 * src_stride, src_stride, coeff + 64);
  /* Bottom left. */
  aom_hadamard_8x8_neon(src_diff + 0 + 8 * src_stride, src_stride, coeff + 128);
  /* Bottom right. */
  aom_hadamard_8x8_neon(src_diff + 8 + 8 * src_stride, src_stride, coeff + 192);

  for (i = 0; i < 64; i += 8) {
    const int16x8_t a0 = vld1q_s16(coeff + 0);
    const int16x8_t a1 = vld1q_s16(coeff + 64);
    const int16x8_t a2 = vld1q_s16(coeff + 128);
    const int16x8_t a3 = vld1q_s16(coeff + 192);

    const int16x8_t b0 = vhaddq_s16(a0, a1);
    const int16x8_t b1 = vhsubq_s16(a0, a1);
    const int16x8_t b2 = vhaddq_s16(a2, a3);
    const int16x8_t b3 = vhsubq_s16(a2, a3);

    const int16x8_t c0 = vaddq_s16(b0, b2);
    const int16x8_t c1 = vaddq_s16(b1, b3);
    const int16x8_t c2 = vsubq_s16(b0, b2);
    const int16x8_t c3 = vsubq_s16(b1, b3);

    vst1q_s16(coeff + 0, c0);
    vst1q_s16(coeff + 64, c1);
    vst1q_s16(coeff + 128, c2);
    vst1q_s16(coeff + 192, c3);

    coeff += 8;
  }
}
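The butterfly network in hadamard8x8_one_pass is easier to read in scalar form. A faithful scalar transcription of one lane (the NEON version runs eight of these in parallel; this is a reading aid, not a replacement):

// Scalar model of hadamard8x8_one_pass above for a single 8-element lane.
static void hadamard8_one_pass_scalar(int16_t a[8]) {
  const int16_t b0 = a[0] + a[1], b1 = a[0] - a[1];
  const int16_t b2 = a[2] + a[3], b3 = a[2] - a[3];
  const int16_t b4 = a[4] + a[5], b5 = a[4] - a[5];
  const int16_t b6 = a[6] + a[7], b7 = a[6] - a[7];
  const int16_t c0 = b0 + b2, c1 = b1 + b3, c2 = b0 - b2, c3 = b1 - b3;
  const int16_t c4 = b4 + b6, c5 = b5 + b7, c6 = b4 - b6, c7 = b5 - b7;
  a[0] = c0 + c4; a[1] = c2 - c6; a[2] = c0 - c4; a[3] = c2 + c6;
  a[4] = c3 + c7; a[5] = c3 - c7; a[6] = c1 - c5; a[7] = c1 + c5;
}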
@@ -1,49 +0,0 @@
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <arm_neon.h>

#include "./aom_dsp_rtcd.h"
#include "./aom_config.h"
#include "aom/aom_integer.h"

void aom_lpf_vertical_4_dual_neon(uint8_t *s, int p, const uint8_t *blimit0,
                                  const uint8_t *limit0, const uint8_t *thresh0,
                                  const uint8_t *blimit1, const uint8_t *limit1,
                                  const uint8_t *thresh1) {
  aom_lpf_vertical_4_neon(s, p, blimit0, limit0, thresh0);
  aom_lpf_vertical_4_neon(s + 8 * p, p, blimit1, limit1, thresh1);
}

#if HAVE_NEON_ASM
void aom_lpf_horizontal_8_dual_neon(
    uint8_t *s, int p /* pitch */, const uint8_t *blimit0,
    const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1,
    const uint8_t *limit1, const uint8_t *thresh1) {
  aom_lpf_horizontal_8_neon(s, p, blimit0, limit0, thresh0);
  aom_lpf_horizontal_8_neon(s + 8, p, blimit1, limit1, thresh1);
}

void aom_lpf_vertical_8_dual_neon(uint8_t *s, int p, const uint8_t *blimit0,
                                  const uint8_t *limit0, const uint8_t *thresh0,
                                  const uint8_t *blimit1, const uint8_t *limit1,
                                  const uint8_t *thresh1) {
  aom_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0);
  aom_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1);
}

void aom_lpf_vertical_16_dual_neon(uint8_t *s, int p, const uint8_t *blimit,
                                   const uint8_t *limit,
                                   const uint8_t *thresh) {
  aom_lpf_vertical_16_neon(s, p, blimit, limit, thresh);
  aom_lpf_vertical_16_neon(s + 8 * p, p, blimit, limit, thresh);
}
#endif  // HAVE_NEON_ASM
@@ -1,98 +0,0 @@
;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;


    EXPORT  |aom_sad16x16_media|

    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2

; r0    const unsigned char *src_ptr
; r1    int src_stride
; r2    const unsigned char *ref_ptr
; r3    int ref_stride
|aom_sad16x16_media| PROC
    stmfd   sp!, {r4-r12, lr}

    pld     [r0, r1, lsl #0]
    pld     [r2, r3, lsl #0]
    pld     [r0, r1, lsl #1]
    pld     [r2, r3, lsl #1]

    mov     r4, #0              ; sad = 0;
    mov     r5, #8              ; loop count

loop
    ; 1st row
    ldr     r6, [r0, #0x0]      ; load 4 src pixels (1A)
    ldr     r8, [r2, #0x0]      ; load 4 ref pixels (1A)
    ldr     r7, [r0, #0x4]      ; load 4 src pixels (1A)
    ldr     r9, [r2, #0x4]      ; load 4 ref pixels (1A)
    ldr     r10, [r0, #0x8]     ; load 4 src pixels (1B)
    ldr     r11, [r0, #0xC]     ; load 4 src pixels (1B)

    usada8  r4, r8, r6, r4      ; calculate sad for 4 pixels
    usad8   r8, r7, r9          ; calculate sad for 4 pixels

    ldr     r12, [r2, #0x8]     ; load 4 ref pixels (1B)
    ldr     lr, [r2, #0xC]      ; load 4 ref pixels (1B)

    add     r0, r0, r1          ; set src pointer to next row
    add     r2, r2, r3          ; set dst pointer to next row

    pld     [r0, r1, lsl #1]
    pld     [r2, r3, lsl #1]

    usada8  r4, r10, r12, r4    ; calculate sad for 4 pixels
    usada8  r8, r11, lr, r8     ; calculate sad for 4 pixels

    ldr     r6, [r0, #0x0]      ; load 4 src pixels (2A)
    ldr     r7, [r0, #0x4]      ; load 4 src pixels (2A)
    add     r4, r4, r8          ; add partial sad values

    ; 2nd row
    ldr     r8, [r2, #0x0]      ; load 4 ref pixels (2A)
    ldr     r9, [r2, #0x4]      ; load 4 ref pixels (2A)
    ldr     r10, [r0, #0x8]     ; load 4 src pixels (2B)
    ldr     r11, [r0, #0xC]     ; load 4 src pixels (2B)

    usada8  r4, r6, r8, r4      ; calculate sad for 4 pixels
    usad8   r8, r7, r9          ; calculate sad for 4 pixels

    ldr     r12, [r2, #0x8]     ; load 4 ref pixels (2B)
    ldr     lr, [r2, #0xC]      ; load 4 ref pixels (2B)

    add     r0, r0, r1          ; set src pointer to next row
    add     r2, r2, r3          ; set dst pointer to next row

    usada8  r4, r10, r12, r4    ; calculate sad for 4 pixels
    usada8  r8, r11, lr, r8     ; calculate sad for 4 pixels

    pld     [r0, r1, lsl #1]
    pld     [r2, r3, lsl #1]

    subs    r5, r5, #1          ; decrement loop counter
    add     r4, r4, r8          ; add partial sad values

    bne     loop

    mov     r0, r4              ; return sad
    ldmfd   sp!, {r4-r12, pc}

    ENDP

    END
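In scalar terms the routine above is a plain sum of absolute differences over a 16x16 block; usad8/usada8 fold four byte differences per instruction and the loop body covers two rows per iteration. A C reference for comparison:

#include <stdint.h>
#include <stdlib.h>

// Plain C counterpart of aom_sad16x16_media.
static unsigned int sad16x16_sketch(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride) {
  unsigned int sad = 0;
  int x, y;
  for (y = 0; y < 16; ++y) {
    for (x = 0; x < 16; ++x) sad += (unsigned int)abs(src[x] - ref[x]);
    src += src_stride;
    ref += ref_stride;
  }
  return sad;
}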
@@ -1,39 +0,0 @@
;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;


    EXPORT  |aom_push_neon|
    EXPORT  |aom_pop_neon|

    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2

|aom_push_neon| PROC
    vst1.i64 {d8, d9, d10, d11}, [r0]!
    vst1.i64 {d12, d13, d14, d15}, [r0]!
    bx      lr

    ENDP

|aom_pop_neon| PROC
    vld1.i64 {d8, d9, d10, d11}, [r0]!
    vld1.i64 {d12, d13, d14, d15}, [r0]!
    bx      lr

    ENDP

    END
@@ -1,81 +0,0 @@
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include "./aom_config.h"
#include "./aom_dsp_rtcd.h"
#include "aom/aom_integer.h"
#include "aom_ports/mem.h"

#if HAVE_MEDIA
static const int16_t bilinear_filters_media[8][2] = { { 128, 0 }, { 112, 16 },
                                                      { 96, 32 }, { 80, 48 },
                                                      { 64, 64 }, { 48, 80 },
                                                      { 32, 96 }, { 16, 112 } };

extern void aom_filter_block2d_bil_first_pass_media(
    const uint8_t *src_ptr, uint16_t *dst_ptr, uint32_t src_pitch,
    uint32_t height, uint32_t width, const int16_t *filter);

extern void aom_filter_block2d_bil_second_pass_media(
    const uint16_t *src_ptr, uint8_t *dst_ptr, int32_t src_pitch,
    uint32_t height, uint32_t width, const int16_t *filter);

unsigned int aom_sub_pixel_variance8x8_media(
    const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset,
    const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse) {
  uint16_t first_pass[10 * 8];
  uint8_t second_pass[8 * 8];
  const int16_t *HFilter, *VFilter;

  HFilter = bilinear_filters_media[xoffset];
  VFilter = bilinear_filters_media[yoffset];

  aom_filter_block2d_bil_first_pass_media(src_ptr, first_pass,
                                          src_pixels_per_line, 9, 8, HFilter);
  aom_filter_block2d_bil_second_pass_media(first_pass, second_pass, 8, 8, 8,
                                           VFilter);

  return aom_variance8x8_media(second_pass, 8, dst_ptr, dst_pixels_per_line,
                               sse);
}

unsigned int aom_sub_pixel_variance16x16_media(
    const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset,
    const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse) {
  uint16_t first_pass[36 * 16];
  uint8_t second_pass[20 * 16];
  const int16_t *HFilter, *VFilter;
  unsigned int var;

  if (xoffset == 4 && yoffset == 0) {
    var = aom_variance_halfpixvar16x16_h_media(
        src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse);
  } else if (xoffset == 0 && yoffset == 4) {
    var = aom_variance_halfpixvar16x16_v_media(
        src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse);
  } else if (xoffset == 4 && yoffset == 4) {
    var = aom_variance_halfpixvar16x16_hv_media(
        src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse);
  } else {
    HFilter = bilinear_filters_media[xoffset];
    VFilter = bilinear_filters_media[yoffset];

    aom_filter_block2d_bil_first_pass_media(
        src_ptr, first_pass, src_pixels_per_line, 17, 16, HFilter);
    aom_filter_block2d_bil_second_pass_media(first_pass, second_pass, 16, 16,
                                             16, VFilter);

    var = aom_variance16x16_media(second_pass, 16, dst_ptr,
                                  dst_pixels_per_line, sse);
  }
  return var;
}
#endif  // HAVE_MEDIA
@@ -1,185 +0,0 @@
;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;


    EXPORT  |aom_variance_halfpixvar16x16_h_media|

    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2

; r0    unsigned char *src_ptr
; r1    int source_stride
; r2    unsigned char *ref_ptr
; r3    int recon_stride
; stack unsigned int *sse
|aom_variance_halfpixvar16x16_h_media| PROC

    stmfd   sp!, {r4-r12, lr}

    pld     [r0, r1, lsl #0]
    pld     [r2, r3, lsl #0]

    mov     r8, #0              ; initialize sum = 0
    ldr     r10, c80808080
    mov     r11, #0             ; initialize sse = 0
    mov     r12, #16            ; set loop counter to 16 (=block height)
    mov     lr, #0              ; constant zero
loop
    ; 1st 4 pixels
    ldr     r4, [r0, #0]        ; load 4 src pixels
    ldr     r6, [r0, #1]        ; load 4 src pixels with 1 byte offset
    ldr     r5, [r2, #0]        ; load 4 ref pixels

    ; bilinear interpolation
    mvn     r6, r6
    uhsub8  r4, r4, r6
    eor     r4, r4, r10

    usub8   r6, r4, r5          ; calculate difference
    pld     [r0, r1, lsl #1]
    sel     r7, r6, lr          ; select bytes with positive difference
    usub8   r6, r5, r4          ; calculate difference with reversed operands
    pld     [r2, r3, lsl #1]
    sel     r6, r6, lr          ; select bytes with negative difference

    ; calculate partial sums
    usad8   r4, r7, lr          ; calculate sum of positive differences
    usad8   r5, r6, lr          ; calculate sum of negative differences
    orr     r6, r6, r7          ; differences of all 4 pixels
    ; calculate total sum
    adds    r8, r8, r4          ; add positive differences to sum
    subs    r8, r8, r5          ; subtract negative differences from sum

    ; calculate sse
    uxtb16  r5, r6              ; byte (two pixels) to halfwords
    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)

    ; 2nd 4 pixels
    ldr     r4, [r0, #4]        ; load 4 src pixels
    ldr     r6, [r0, #5]        ; load 4 src pixels with 1 byte offset
    ldr     r5, [r2, #4]        ; load 4 ref pixels

    ; bilinear interpolation
    mvn     r6, r6
    uhsub8  r4, r4, r6
    eor     r4, r4, r10

    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)

    usub8   r6, r4, r5          ; calculate difference
    sel     r7, r6, lr          ; select bytes with positive difference
    usub8   r6, r5, r4          ; calculate difference with reversed operands
    sel     r6, r6, lr          ; select bytes with negative difference

    ; calculate partial sums
    usad8   r4, r7, lr          ; calculate sum of positive differences
    usad8   r5, r6, lr          ; calculate sum of negative differences
    orr     r6, r6, r7          ; differences of all 4 pixels

    ; calculate total sum
    add     r8, r8, r4          ; add positive differences to sum
    sub     r8, r8, r5          ; subtract negative differences from sum

    ; calculate sse
    uxtb16  r5, r6              ; byte (two pixels) to halfwords
    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)

    ; 3rd 4 pixels
    ldr     r4, [r0, #8]        ; load 4 src pixels
    ldr     r6, [r0, #9]        ; load 4 src pixels with 1 byte offset
    ldr     r5, [r2, #8]        ; load 4 ref pixels

    ; bilinear interpolation
    mvn     r6, r6
    uhsub8  r4, r4, r6
    eor     r4, r4, r10

    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)

    usub8   r6, r4, r5          ; calculate difference
    sel     r7, r6, lr          ; select bytes with positive difference
    usub8   r6, r5, r4          ; calculate difference with reversed operands
    sel     r6, r6, lr          ; select bytes with negative difference

    ; calculate partial sums
    usad8   r4, r7, lr          ; calculate sum of positive differences
    usad8   r5, r6, lr          ; calculate sum of negative differences
    orr     r6, r6, r7          ; differences of all 4 pixels

    ; calculate total sum
    add     r8, r8, r4          ; add positive differences to sum
    sub     r8, r8, r5          ; subtract negative differences from sum

    ; calculate sse
    uxtb16  r5, r6              ; byte (two pixels) to halfwords
    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)

    ; 4th 4 pixels
    ldr     r4, [r0, #12]       ; load 4 src pixels
    ldr     r6, [r0, #13]       ; load 4 src pixels with 1 byte offset
    ldr     r5, [r2, #12]       ; load 4 ref pixels

    ; bilinear interpolation
    mvn     r6, r6
    uhsub8  r4, r4, r6
    eor     r4, r4, r10

    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)

    usub8   r6, r4, r5          ; calculate difference
    add     r0, r0, r1          ; set src_ptr to next row
    sel     r7, r6, lr          ; select bytes with positive difference
    usub8   r6, r5, r4          ; calculate difference with reversed operands
    add     r2, r2, r3          ; set dst_ptr to next row
    sel     r6, r6, lr          ; select bytes with negative difference

    ; calculate partial sums
    usad8   r4, r7, lr          ; calculate sum of positive differences
    usad8   r5, r6, lr          ; calculate sum of negative differences
    orr     r6, r6, r7          ; differences of all 4 pixels

    ; calculate total sum
    add     r8, r8, r4          ; add positive differences to sum
    sub     r8, r8, r5          ; subtract negative differences from sum

    ; calculate sse
    uxtb16  r5, r6              ; byte (two pixels) to halfwords
    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)

    subs    r12, r12, #1

    bne     loop

    ; return stuff
    ldr     r6, [sp, #40]       ; get address of sse
    mul     r0, r8, r8          ; sum * sum
    str     r11, [r6]           ; store sse
    sub     r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))

    ldmfd   sp!, {r4-r12, pc}

    ENDP

c80808080
    DCD     0x80808080

    END
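Two things above are worth unpacking. The mvn/uhsub8/eor triple is a SIMD-within-a-register trick: per byte, ((a - ~b) >> 1) XOR 0x80 equals the rounded average (a + b + 1) >> 1 without risking overflow. And the epilogue applies the variance identity sse - sum^2 / N with N = 256 pixels, hence the lsr #8. A hedged scalar model of the whole routine:

// Scalar model of aom_variance_halfpixvar16x16_h_media: horizontal
// half-pel average of src, then variance against ref over 16x16 pixels.
static unsigned int halfpixvar16x16_h_sketch(const uint8_t *src,
                                             int src_stride,
                                             const uint8_t *ref,
                                             int ref_stride,
                                             unsigned int *sse) {
  int sum = 0, x, y;
  unsigned int sq = 0;
  for (y = 0; y < 16; ++y) {
    for (x = 0; x < 16; ++x) {
      const int avg = (src[x] + src[x + 1] + 1) >> 1;  // half-pel sample
      const int diff = avg - ref[x];
      sum += diff;
      sq += (unsigned int)(diff * diff);
    }
    src += src_stride;
    ref += ref_stride;
  }
  *sse = sq;
  return sq - (unsigned int)((sum * sum) >> 8);  // sse - sum^2 / 256
}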
@@ -1,225 +0,0 @@
|
||||
;
|
||||
; Copyright (c) 2016, Alliance for Open Media. All rights reserved
|
||||
;
|
||||
; This source code is subject to the terms of the BSD 2 Clause License and
|
||||
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
|
||||
; was not distributed with this source code in the LICENSE file, you can
|
||||
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
|
||||
; Media Patent License 1.0 was not distributed with this source code in the
|
||||
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
|
||||
;
|
||||
|
||||
;
|
||||
|
||||
|
||||
EXPORT |aom_variance_halfpixvar16x16_hv_media|
|
||||
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
|
||||
; r0 unsigned char *src_ptr
|
||||
; r1 int source_stride
|
||||
; r2 unsigned char *ref_ptr
|
||||
; r3 int recon_stride
|
||||
; stack unsigned int *sse
|
||||
|aom_variance_halfpixvar16x16_hv_media| PROC
|
||||
|
||||
stmfd sp!, {r4-r12, lr}
|
||||
|
||||
pld [r0, r1, lsl #0]
|
||||
pld [r2, r3, lsl #0]
|
||||
|
||||
mov r8, #0 ; initialize sum = 0
|
||||
ldr r10, c80808080
|
||||
mov r11, #0 ; initialize sse = 0
|
||||
mov r12, #16 ; set loop counter to 16 (=block height)
|
||||
mov lr, #0 ; constant zero
|
||||
loop
|
||||
add r9, r0, r1 ; pointer to pixels on the next row
|
||||
; 1st 4 pixels
|
||||
ldr r4, [r0, #0] ; load source pixels a, row N
|
||||
ldr r6, [r0, #1] ; load source pixels b, row N
|
||||
ldr r5, [r9, #0] ; load source pixels c, row N+1
|
||||
ldr r7, [r9, #1] ; load source pixels d, row N+1
|
||||
|
||||
; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
|
||||
mvn r6, r6
|
||||
uhsub8 r4, r4, r6
|
||||
eor r4, r4, r10
|
||||
; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
|
||||
mvn r7, r7
|
||||
uhsub8 r5, r5, r7
|
||||
eor r5, r5, r10
|
||||
; z = (x + y + 1) >> 1, interpolate half pixel values vertically
|
||||
mvn r5, r5
|
||||
uhsub8 r4, r4, r5
|
||||
ldr r5, [r2, #0] ; load 4 ref pixels
|
||||
eor r4, r4, r10
|
||||
|
||||
usub8 r6, r4, r5 ; calculate difference
|
||||
pld [r0, r1, lsl #1]
|
||||
sel r7, r6, lr ; select bytes with positive difference
|
||||
usub8 r6, r5, r4 ; calculate difference with reversed operands
|
||||
pld [r2, r3, lsl #1]
|
||||
    sel     r6, r6, lr          ; select bytes with negative difference

    ; calculate partial sums
    usad8   r4, r7, lr          ; calculate sum of positive differences
    usad8   r5, r6, lr          ; calculate sum of negative differences
    orr     r6, r6, r7          ; differences of all 4 pixels
    ; calculate total sum
    adds    r8, r8, r4          ; add positive differences to sum
    subs    r8, r8, r5          ; subtract negative differences from sum

    ; calculate sse
    uxtb16  r5, r6              ; byte (two pixels) to halfwords
    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)

    ; 2nd 4 pixels
    ldr     r4, [r0, #4]        ; load source pixels a, row N
    ldr     r6, [r0, #5]        ; load source pixels b, row N
    ldr     r5, [r9, #4]        ; load source pixels c, row N+1

    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)

    ldr     r7, [r9, #5]        ; load source pixels d, row N+1

    ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
    mvn     r6, r6
    uhsub8  r4, r4, r6
    eor     r4, r4, r10
    ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
    mvn     r7, r7
    uhsub8  r5, r5, r7
    eor     r5, r5, r10
    ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
    mvn     r5, r5
    uhsub8  r4, r4, r5
    ldr     r5, [r2, #4]        ; load 4 ref pixels
    eor     r4, r4, r10

    usub8   r6, r4, r5          ; calculate difference
    sel     r7, r6, lr          ; select bytes with positive difference
    usub8   r6, r5, r4          ; calculate difference with reversed operands
    sel     r6, r6, lr          ; select bytes with negative difference

    ; calculate partial sums
    usad8   r4, r7, lr          ; calculate sum of positive differences
    usad8   r5, r6, lr          ; calculate sum of negative differences
    orr     r6, r6, r7          ; differences of all 4 pixels

    ; calculate total sum
    add     r8, r8, r4          ; add positive differences to sum
    sub     r8, r8, r5          ; subtract negative differences from sum

    ; calculate sse
    uxtb16  r5, r6              ; byte (two pixels) to halfwords
    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)

    ; 3rd 4 pixels
    ldr     r4, [r0, #8]        ; load source pixels a, row N
    ldr     r6, [r0, #9]        ; load source pixels b, row N
    ldr     r5, [r9, #8]        ; load source pixels c, row N+1

    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)

    ldr     r7, [r9, #9]        ; load source pixels d, row N+1

    ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
    mvn     r6, r6
    uhsub8  r4, r4, r6
    eor     r4, r4, r10
    ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
    mvn     r7, r7
    uhsub8  r5, r5, r7
    eor     r5, r5, r10
    ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
    mvn     r5, r5
    uhsub8  r4, r4, r5
    ldr     r5, [r2, #8]        ; load 4 ref pixels
    eor     r4, r4, r10

    usub8   r6, r4, r5          ; calculate difference
    sel     r7, r6, lr          ; select bytes with positive difference
    usub8   r6, r5, r4          ; calculate difference with reversed operands
    sel     r6, r6, lr          ; select bytes with negative difference

    ; calculate partial sums
    usad8   r4, r7, lr          ; calculate sum of positive differences
    usad8   r5, r6, lr          ; calculate sum of negative differences
    orr     r6, r6, r7          ; differences of all 4 pixels

    ; calculate total sum
    add     r8, r8, r4          ; add positive differences to sum
    sub     r8, r8, r5          ; subtract negative differences from sum

    ; calculate sse
    uxtb16  r5, r6              ; byte (two pixels) to halfwords
    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)

    ; 4th 4 pixels
    ldr     r4, [r0, #12]       ; load source pixels a, row N
    ldr     r6, [r0, #13]       ; load source pixels b, row N
    ldr     r5, [r9, #12]       ; load source pixels c, row N+1
    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
    ldr     r7, [r9, #13]       ; load source pixels d, row N+1

    ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
    mvn     r6, r6
    uhsub8  r4, r4, r6
    eor     r4, r4, r10
    ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
    mvn     r7, r7
    uhsub8  r5, r5, r7
    eor     r5, r5, r10
    ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
    mvn     r5, r5
    uhsub8  r4, r4, r5
    ldr     r5, [r2, #12]       ; load 4 ref pixels
    eor     r4, r4, r10

    usub8   r6, r4, r5          ; calculate difference
    add     r0, r0, r1          ; set src_ptr to next row
    sel     r7, r6, lr          ; select bytes with positive difference
    usub8   r6, r5, r4          ; calculate difference with reversed operands
    add     r2, r2, r3          ; set dst_ptr to next row
    sel     r6, r6, lr          ; select bytes with negative difference

    ; calculate partial sums
    usad8   r4, r7, lr          ; calculate sum of positive differences
    usad8   r5, r6, lr          ; calculate sum of negative differences
    orr     r6, r6, r7          ; differences of all 4 pixels

    ; calculate total sum
    add     r8, r8, r4          ; add positive differences to sum
    sub     r8, r8, r5          ; subtract negative differences from sum

    ; calculate sse
    uxtb16  r5, r6              ; byte (two pixels) to halfwords
    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
    subs    r12, r12, #1
    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)

    bne     loop

    ; return stuff
    ldr     r6, [sp, #40]       ; get address of sse
    mul     r0, r8, r8          ; sum * sum
    str     r11, [r6]           ; store sse
    sub     r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))

    ldmfd   sp!, {r4-r12, pc}

    ENDP

c80808080
    DCD     0x80808080

    END
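For reference, the mvn/uhsub8/eor sequence used for each `(a + b + 1) >> 1` step above computes a rounded per-byte average without unpacking: with nb = ~b, the identity (a + b + 1) >> 1 == (((a - nb) >> 1) + 0x80) mod 256 holds in every 8-bit lane, UHSUB8 supplies the per-lane halving subtract (bits 8:1 of the 9-bit difference), and the EOR with the c80808080 constant adds the 0x80 bias. A minimal C sketch of the same identity (the helper name is illustrative, not part of the library):

#include <stdint.h>

/* (a + b + 1) >> 1 in each byte lane, mirroring mvn/uhsub8/eor above. */
static uint32_t rounded_avg_u8x4(uint32_t a, uint32_t b) {
  const uint32_t nb = ~b; /* mvn */
  uint32_t out = 0;
  int lane;
  for (lane = 0; lane < 4; ++lane) { /* uhsub8, modeled one lane at a time */
    const uint32_t ai = (a >> (8 * lane)) & 0xff;
    const uint32_t ni = (nb >> (8 * lane)) & 0xff;
    const uint32_t d9 = (ai - ni) & 0x1ff; /* 9-bit two's complement diff */
    out |= ((d9 >> 1) & 0xff) << (8 * lane); /* keep bits 8:1 */
  }
  return out ^ 0x80808080u; /* eor r4, r4, r10 with r10 = 0x80808080 */
}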
@@ -1,187 +0,0 @@
;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;

    EXPORT  |aom_variance_halfpixvar16x16_v_media|

    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2

; r0    unsigned char *src_ptr
; r1    int source_stride
; r2    unsigned char *ref_ptr
; r3    int recon_stride
; stack unsigned int *sse
|aom_variance_halfpixvar16x16_v_media| PROC

    stmfd   sp!, {r4-r12, lr}

    pld     [r0, r1, lsl #0]
    pld     [r2, r3, lsl #0]

    mov     r8, #0              ; initialize sum = 0
    ldr     r10, c80808080
    mov     r11, #0             ; initialize sse = 0
    mov     r12, #16            ; set loop counter to 16 (=block height)
    mov     lr, #0              ; constant zero
loop
    add     r9, r0, r1          ; set src pointer to next row
    ; 1st 4 pixels
    ldr     r4, [r0, #0]        ; load 4 src pixels
    ldr     r6, [r9, #0]        ; load 4 src pixels from next row
    ldr     r5, [r2, #0]        ; load 4 ref pixels

    ; bilinear interpolation
    mvn     r6, r6
    uhsub8  r4, r4, r6
    eor     r4, r4, r10

    usub8   r6, r4, r5          ; calculate difference
    pld     [r0, r1, lsl #1]
    sel     r7, r6, lr          ; select bytes with positive difference
    usub8   r6, r5, r4          ; calculate difference with reversed operands
    pld     [r2, r3, lsl #1]
    sel     r6, r6, lr          ; select bytes with negative difference

    ; calculate partial sums
    usad8   r4, r7, lr          ; calculate sum of positive differences
    usad8   r5, r6, lr          ; calculate sum of negative differences
    orr     r6, r6, r7          ; differences of all 4 pixels
    ; calculate total sum
    adds    r8, r8, r4          ; add positive differences to sum
    subs    r8, r8, r5          ; subtract negative differences from sum

    ; calculate sse
    uxtb16  r5, r6              ; byte (two pixels) to halfwords
    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)

    ; 2nd 4 pixels
    ldr     r4, [r0, #4]        ; load 4 src pixels
    ldr     r6, [r9, #4]        ; load 4 src pixels from next row
    ldr     r5, [r2, #4]        ; load 4 ref pixels

    ; bilinear interpolation
    mvn     r6, r6
    uhsub8  r4, r4, r6
    eor     r4, r4, r10

    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)

    usub8   r6, r4, r5          ; calculate difference
    sel     r7, r6, lr          ; select bytes with positive difference
    usub8   r6, r5, r4          ; calculate difference with reversed operands
    sel     r6, r6, lr          ; select bytes with negative difference

    ; calculate partial sums
    usad8   r4, r7, lr          ; calculate sum of positive differences
    usad8   r5, r6, lr          ; calculate sum of negative differences
    orr     r6, r6, r7          ; differences of all 4 pixels

    ; calculate total sum
    add     r8, r8, r4          ; add positive differences to sum
    sub     r8, r8, r5          ; subtract negative differences from sum

    ; calculate sse
    uxtb16  r5, r6              ; byte (two pixels) to halfwords
    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)

    ; 3rd 4 pixels
    ldr     r4, [r0, #8]        ; load 4 src pixels
    ldr     r6, [r9, #8]        ; load 4 src pixels from next row
    ldr     r5, [r2, #8]        ; load 4 ref pixels

    ; bilinear interpolation
    mvn     r6, r6
    uhsub8  r4, r4, r6
    eor     r4, r4, r10

    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)

    usub8   r6, r4, r5          ; calculate difference
    sel     r7, r6, lr          ; select bytes with positive difference
    usub8   r6, r5, r4          ; calculate difference with reversed operands
    sel     r6, r6, lr          ; select bytes with negative difference

    ; calculate partial sums
    usad8   r4, r7, lr          ; calculate sum of positive differences
    usad8   r5, r6, lr          ; calculate sum of negative differences
    orr     r6, r6, r7          ; differences of all 4 pixels

    ; calculate total sum
    add     r8, r8, r4          ; add positive differences to sum
    sub     r8, r8, r5          ; subtract negative differences from sum

    ; calculate sse
    uxtb16  r5, r6              ; byte (two pixels) to halfwords
    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)

    ; 4th 4 pixels
    ldr     r4, [r0, #12]       ; load 4 src pixels
    ldr     r6, [r9, #12]       ; load 4 src pixels from next row
    ldr     r5, [r2, #12]       ; load 4 ref pixels

    ; bilinear interpolation
    mvn     r6, r6
    uhsub8  r4, r4, r6
    eor     r4, r4, r10

    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)

    usub8   r6, r4, r5          ; calculate difference
    add     r0, r0, r1          ; set src_ptr to next row
    sel     r7, r6, lr          ; select bytes with positive difference
    usub8   r6, r5, r4          ; calculate difference with reversed operands
    add     r2, r2, r3          ; set dst_ptr to next row
    sel     r6, r6, lr          ; select bytes with negative difference

    ; calculate partial sums
    usad8   r4, r7, lr          ; calculate sum of positive differences
    usad8   r5, r6, lr          ; calculate sum of negative differences
    orr     r6, r6, r7          ; differences of all 4 pixels

    ; calculate total sum
    add     r8, r8, r4          ; add positive differences to sum
    sub     r8, r8, r5          ; subtract negative differences from sum

    ; calculate sse
    uxtb16  r5, r6              ; byte (two pixels) to halfwords
    uxtb16  r7, r6, ror #8      ; another two pixels to halfwords
    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)

    subs    r12, r12, #1

    bne     loop

    ; return stuff
    ldr     r6, [sp, #40]       ; get address of sse
    mul     r0, r8, r8          ; sum * sum
    str     r11, [r6]           ; store sse
    sub     r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))

    ldmfd   sp!, {r4-r12, pc}

    ENDP

c80808080
    DCD     0x80808080

    END
@@ -1,361 +0,0 @@
;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;

    EXPORT  |aom_variance16x16_media|
    EXPORT  |aom_variance8x8_media|
    EXPORT  |aom_mse16x16_media|

    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2

; r0    unsigned char *src_ptr
; r1    int source_stride
; r2    unsigned char *ref_ptr
; r3    int recon_stride
; stack unsigned int *sse
|aom_variance16x16_media| PROC

    stmfd   sp!, {r4-r12, lr}

    pld     [r0, r1, lsl #0]
    pld     [r2, r3, lsl #0]

    mov     r8, #0              ; initialize sum = 0
    mov     r11, #0             ; initialize sse = 0
    mov     r12, #16            ; set loop counter to 16 (=block height)

loop16x16
    ; 1st 4 pixels
    ldr     r4, [r0, #0]        ; load 4 src pixels
    ldr     r5, [r2, #0]        ; load 4 ref pixels

    mov     lr, #0              ; constant zero

    usub8   r6, r4, r5          ; calculate difference
    pld     [r0, r1, lsl #1]
    sel     r7, r6, lr          ; select bytes with positive difference
    usub8   r9, r5, r4          ; calculate difference with reversed operands
    pld     [r2, r3, lsl #1]
    sel     r6, r9, lr          ; select bytes with negative difference

    ; calculate partial sums
    usad8   r4, r7, lr          ; calculate sum of positive differences
    usad8   r5, r6, lr          ; calculate sum of negative differences
    orr     r6, r6, r7          ; differences of all 4 pixels
    ; calculate total sum
    adds    r8, r8, r4          ; add positive differences to sum
    subs    r8, r8, r5          ; subtract negative differences from sum

    ; calculate sse
    uxtb16  r5, r6              ; byte (two pixels) to halfwords
    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)

    ; 2nd 4 pixels
    ldr     r4, [r0, #4]        ; load 4 src pixels
    ldr     r5, [r2, #4]        ; load 4 ref pixels
    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)

    usub8   r6, r4, r5          ; calculate difference
    sel     r7, r6, lr          ; select bytes with positive difference
    usub8   r9, r5, r4          ; calculate difference with reversed operands
    sel     r6, r9, lr          ; select bytes with negative difference

    ; calculate partial sums
    usad8   r4, r7, lr          ; calculate sum of positive differences
    usad8   r5, r6, lr          ; calculate sum of negative differences
    orr     r6, r6, r7          ; differences of all 4 pixels

    ; calculate total sum
    add     r8, r8, r4          ; add positive differences to sum
    sub     r8, r8, r5          ; subtract negative differences from sum

    ; calculate sse
    uxtb16  r5, r6              ; byte (two pixels) to halfwords
    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)

    ; 3rd 4 pixels
    ldr     r4, [r0, #8]        ; load 4 src pixels
    ldr     r5, [r2, #8]        ; load 4 ref pixels
    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)

    usub8   r6, r4, r5          ; calculate difference
    sel     r7, r6, lr          ; select bytes with positive difference
    usub8   r9, r5, r4          ; calculate difference with reversed operands
    sel     r6, r9, lr          ; select bytes with negative difference

    ; calculate partial sums
    usad8   r4, r7, lr          ; calculate sum of positive differences
    usad8   r5, r6, lr          ; calculate sum of negative differences
    orr     r6, r6, r7          ; differences of all 4 pixels

    ; calculate total sum
    add     r8, r8, r4          ; add positive differences to sum
    sub     r8, r8, r5          ; subtract negative differences from sum

    ; calculate sse
    uxtb16  r5, r6              ; byte (two pixels) to halfwords
    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)

    ; 4th 4 pixels
    ldr     r4, [r0, #12]       ; load 4 src pixels
    ldr     r5, [r2, #12]       ; load 4 ref pixels
    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)

    usub8   r6, r4, r5          ; calculate difference
    add     r0, r0, r1          ; set src_ptr to next row
    sel     r7, r6, lr          ; select bytes with positive difference
    usub8   r9, r5, r4          ; calculate difference with reversed operands
    add     r2, r2, r3          ; set dst_ptr to next row
    sel     r6, r9, lr          ; select bytes with negative difference

    ; calculate partial sums
    usad8   r4, r7, lr          ; calculate sum of positive differences
    usad8   r5, r6, lr          ; calculate sum of negative differences
    orr     r6, r6, r7          ; differences of all 4 pixels

    ; calculate total sum
    add     r8, r8, r4          ; add positive differences to sum
    sub     r8, r8, r5          ; subtract negative differences from sum

    ; calculate sse
    uxtb16  r5, r6              ; byte (two pixels) to halfwords
    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)

    subs    r12, r12, #1

    bne     loop16x16

    ; return stuff
    ldr     r6, [sp, #40]       ; get address of sse
    mul     r0, r8, r8          ; sum * sum
    str     r11, [r6]           ; store sse
    sub     r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))

    ldmfd   sp!, {r4-r12, pc}

    ENDP

; r0    unsigned char *src_ptr
; r1    int source_stride
; r2    unsigned char *ref_ptr
; r3    int recon_stride
; stack unsigned int *sse
|aom_variance8x8_media| PROC

    push    {r4-r10, lr}

    pld     [r0, r1, lsl #0]
    pld     [r2, r3, lsl #0]

    mov     r12, #8             ; set loop counter to 8 (=block height)
    mov     r4, #0              ; initialize sum = 0
    mov     r5, #0              ; initialize sse = 0

loop8x8
    ; 1st 4 pixels
    ldr     r6, [r0, #0x0]      ; load 4 src pixels
    ldr     r7, [r2, #0x0]      ; load 4 ref pixels

    mov     lr, #0              ; constant zero

    usub8   r8, r6, r7          ; calculate difference
    pld     [r0, r1, lsl #1]
    sel     r10, r8, lr         ; select bytes with positive difference
    usub8   r9, r7, r6          ; calculate difference with reversed operands
    pld     [r2, r3, lsl #1]
    sel     r8, r9, lr          ; select bytes with negative difference

    ; calculate partial sums
    usad8   r6, r10, lr         ; calculate sum of positive differences
    usad8   r7, r8, lr          ; calculate sum of negative differences
    orr     r8, r8, r10         ; differences of all 4 pixels
    ; calculate total sum
    add     r4, r4, r6          ; add positive differences to sum
    sub     r4, r4, r7          ; subtract negative differences from sum

    ; calculate sse
    uxtb16  r7, r8              ; byte (two pixels) to halfwords
    uxtb16  r10, r8, ror #8     ; another two pixels to halfwords
    smlad   r5, r7, r7, r5      ; dual signed multiply, add and accumulate (1)

    ; 2nd 4 pixels
    ldr     r6, [r0, #0x4]      ; load 4 src pixels
    ldr     r7, [r2, #0x4]      ; load 4 ref pixels
    smlad   r5, r10, r10, r5    ; dual signed multiply, add and accumulate (2)

    usub8   r8, r6, r7          ; calculate difference
    add     r0, r0, r1          ; set src_ptr to next row
    sel     r10, r8, lr         ; select bytes with positive difference
    usub8   r9, r7, r6          ; calculate difference with reversed operands
    add     r2, r2, r3          ; set dst_ptr to next row
    sel     r8, r9, lr          ; select bytes with negative difference

    ; calculate partial sums
    usad8   r6, r10, lr         ; calculate sum of positive differences
    usad8   r7, r8, lr          ; calculate sum of negative differences
    orr     r8, r8, r10         ; differences of all 4 pixels

    ; calculate total sum
    add     r4, r4, r6          ; add positive differences to sum
    sub     r4, r4, r7          ; subtract negative differences from sum

    ; calculate sse
    uxtb16  r7, r8              ; byte (two pixels) to halfwords
    uxtb16  r10, r8, ror #8     ; another two pixels to halfwords
    smlad   r5, r7, r7, r5      ; dual signed multiply, add and accumulate (1)
    subs    r12, r12, #1        ; next row
    smlad   r5, r10, r10, r5    ; dual signed multiply, add and accumulate (2)

    bne     loop8x8

    ; return stuff
    ldr     r8, [sp, #32]       ; get address of sse
    mul     r1, r4, r4          ; sum * sum
    str     r5, [r8]            ; store sse
    sub     r0, r5, r1, ASR #6  ; return (sse - ((sum * sum) >> 6))

    pop     {r4-r10, pc}

    ENDP

; r0    unsigned char *src_ptr
; r1    int source_stride
; r2    unsigned char *ref_ptr
; r3    int recon_stride
; stack unsigned int *sse
;
;note: Based on aom_variance16x16_media. In this function, sum is never used.
;      So, we can remove this part of calculation.

|aom_mse16x16_media| PROC

    push    {r4-r9, lr}

    pld     [r0, r1, lsl #0]
    pld     [r2, r3, lsl #0]

    mov     r12, #16            ; set loop counter to 16 (=block height)
    mov     r4, #0              ; initialize sse = 0

loopmse
    ; 1st 4 pixels
    ldr     r5, [r0, #0x0]      ; load 4 src pixels
    ldr     r6, [r2, #0x0]      ; load 4 ref pixels

    mov     lr, #0              ; constant zero

    usub8   r8, r5, r6          ; calculate difference
    pld     [r0, r1, lsl #1]
    sel     r7, r8, lr          ; select bytes with positive difference
    usub8   r9, r6, r5          ; calculate difference with reversed operands
    pld     [r2, r3, lsl #1]
    sel     r8, r9, lr          ; select bytes with negative difference

    ; calculate partial sums
    usad8   r5, r7, lr          ; calculate sum of positive differences
    usad8   r6, r8, lr          ; calculate sum of negative differences
    orr     r8, r8, r7          ; differences of all 4 pixels

    ldr     r5, [r0, #0x4]      ; load 4 src pixels

    ; calculate sse
    uxtb16  r6, r8              ; byte (two pixels) to halfwords
    uxtb16  r7, r8, ror #8      ; another two pixels to halfwords
    smlad   r4, r6, r6, r4      ; dual signed multiply, add and accumulate (1)

    ; 2nd 4 pixels
    ldr     r6, [r2, #0x4]      ; load 4 ref pixels
    smlad   r4, r7, r7, r4      ; dual signed multiply, add and accumulate (2)

    usub8   r8, r5, r6          ; calculate difference
    sel     r7, r8, lr          ; select bytes with positive difference
    usub8   r9, r6, r5          ; calculate difference with reversed operands
    sel     r8, r9, lr          ; select bytes with negative difference

    ; calculate partial sums
    usad8   r5, r7, lr          ; calculate sum of positive differences
    usad8   r6, r8, lr          ; calculate sum of negative differences
    orr     r8, r8, r7          ; differences of all 4 pixels
    ldr     r5, [r0, #0x8]      ; load 4 src pixels
    ; calculate sse
    uxtb16  r6, r8              ; byte (two pixels) to halfwords
    uxtb16  r7, r8, ror #8      ; another two pixels to halfwords
    smlad   r4, r6, r6, r4      ; dual signed multiply, add and accumulate (1)

    ; 3rd 4 pixels
    ldr     r6, [r2, #0x8]      ; load 4 ref pixels
    smlad   r4, r7, r7, r4      ; dual signed multiply, add and accumulate (2)

    usub8   r8, r5, r6          ; calculate difference
    sel     r7, r8, lr          ; select bytes with positive difference
    usub8   r9, r6, r5          ; calculate difference with reversed operands
    sel     r8, r9, lr          ; select bytes with negative difference

    ; calculate partial sums
    usad8   r5, r7, lr          ; calculate sum of positive differences
    usad8   r6, r8, lr          ; calculate sum of negative differences
    orr     r8, r8, r7          ; differences of all 4 pixels

    ldr     r5, [r0, #0xc]      ; load 4 src pixels

    ; calculate sse
    uxtb16  r6, r8              ; byte (two pixels) to halfwords
    uxtb16  r7, r8, ror #8      ; another two pixels to halfwords
    smlad   r4, r6, r6, r4      ; dual signed multiply, add and accumulate (1)

    ; 4th 4 pixels
    ldr     r6, [r2, #0xc]      ; load 4 ref pixels
    smlad   r4, r7, r7, r4      ; dual signed multiply, add and accumulate (2)

    usub8   r8, r5, r6          ; calculate difference
    add     r0, r0, r1          ; set src_ptr to next row
    sel     r7, r8, lr          ; select bytes with positive difference
    usub8   r9, r6, r5          ; calculate difference with reversed operands
    add     r2, r2, r3          ; set dst_ptr to next row
    sel     r8, r9, lr          ; select bytes with negative difference

    ; calculate partial sums
    usad8   r5, r7, lr          ; calculate sum of positive differences
    usad8   r6, r8, lr          ; calculate sum of negative differences
    orr     r8, r8, r7          ; differences of all 4 pixels

    subs    r12, r12, #1        ; next row

    ; calculate sse
    uxtb16  r6, r8              ; byte (two pixels) to halfwords
    uxtb16  r7, r8, ror #8      ; another two pixels to halfwords
    smlad   r4, r6, r6, r4      ; dual signed multiply, add and accumulate (1)
    smlad   r4, r7, r7, r4      ; dual signed multiply, add and accumulate (2)

    bne     loopmse

    ; return stuff
    ldr     r1, [sp, #28]       ; get address of sse
    mov     r0, r4              ; return sse
    str     r4, [r1]            ; store sse

    pop     {r4-r9, pc}

    ENDP

    END
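The epilogues above implement the usual variance identity: variance = sse - sum^2 / N, with N = 256 pixels for the 16x16 kernels (shift by 8) and N = 64 for the 8x8 kernel (shift by 6); aom_mse16x16_media skips the sum entirely and returns sse directly. A scalar C reference of the same computation (the helper name is hypothetical, for illustration only):

#include <stdint.h>

/* variance = sse - (sum * sum) / N; shift = 8 for 16x16, 6 for 8x8. */
static uint32_t block_variance(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride,
                               int w, int h, int shift, uint32_t *sse) {
  int64_t sum = 0;
  uint32_t s = 0;
  int i, j;
  for (i = 0; i < h; ++i) {
    for (j = 0; j < w; ++j) {
      const int d = src[i * src_stride + j] - ref[i * ref_stride + j];
      sum += d;                /* signed sum of differences */
      s += (uint32_t)(d * d);  /* sum of squared differences */
    }
  }
  *sse = s;
  return s - (uint32_t)((sum * sum) >> shift);
}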
@@ -1,240 +0,0 @@
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef AOM_DSP_BITREADER_H_
#define AOM_DSP_BITREADER_H_

#include <assert.h>
#include <limits.h>

#include "./aom_config.h"
#if CONFIG_EC_ADAPT && !CONFIG_EC_MULTISYMBOL
#error "CONFIG_EC_ADAPT is enabled without enabling CONFIG_EC_MULTISYMBOL."
#endif

#include "aom/aomdx.h"
#include "aom/aom_integer.h"
#if CONFIG_ANS
#include "aom_dsp/ansreader.h"
#elif CONFIG_DAALA_EC
#include "aom_dsp/daalaboolreader.h"
#else
#include "aom_dsp/dkboolreader.h"
#endif
#include "aom_dsp/prob.h"
#include "av1/common/odintrin.h"

#if CONFIG_ACCOUNTING
#include "av1/common/accounting.h"
#define ACCT_STR_NAME acct_str
#define ACCT_STR_PARAM , const char *ACCT_STR_NAME
#define ACCT_STR_ARG(s) , s
#else
#define ACCT_STR_PARAM
#define ACCT_STR_ARG(s)
#endif

#define aom_read(r, prob, ACCT_STR_NAME) \
  aom_read_(r, prob ACCT_STR_ARG(ACCT_STR_NAME))
#define aom_read_bit(r, ACCT_STR_NAME) \
  aom_read_bit_(r ACCT_STR_ARG(ACCT_STR_NAME))
#define aom_read_tree(r, tree, probs, ACCT_STR_NAME) \
  aom_read_tree_(r, tree, probs ACCT_STR_ARG(ACCT_STR_NAME))
#define aom_read_literal(r, bits, ACCT_STR_NAME) \
  aom_read_literal_(r, bits ACCT_STR_ARG(ACCT_STR_NAME))
#define aom_read_tree_bits(r, tree, probs, ACCT_STR_NAME) \
  aom_read_tree_bits_(r, tree, probs ACCT_STR_ARG(ACCT_STR_NAME))
#define aom_read_symbol(r, cdf, nsymbs, ACCT_STR_NAME) \
  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))

#ifdef __cplusplus
extern "C" {
#endif

#if CONFIG_ANS
typedef struct AnsDecoder aom_reader;
#elif CONFIG_DAALA_EC
typedef struct daala_reader aom_reader;
#else
typedef struct aom_dk_reader aom_reader;
#endif

static INLINE int aom_reader_init(aom_reader *r, const uint8_t *buffer,
                                  size_t size, aom_decrypt_cb decrypt_cb,
                                  void *decrypt_state) {
#if CONFIG_ANS
  (void)decrypt_cb;
  (void)decrypt_state;
  assert(size <= INT_MAX);
  return ans_read_init(r, buffer, size);
#elif CONFIG_DAALA_EC
  (void)decrypt_cb;
  (void)decrypt_state;
  return aom_daala_reader_init(r, buffer, size);
#else
  return aom_dk_reader_init(r, buffer, size, decrypt_cb, decrypt_state);
#endif
}

static INLINE const uint8_t *aom_reader_find_end(aom_reader *r) {
#if CONFIG_ANS
  (void)r;
  assert(0 && "Use the raw buffer size with ANS");
  return NULL;
#elif CONFIG_DAALA_EC
  return aom_daala_reader_find_end(r);
#else
  return aom_dk_reader_find_end(r);
#endif
}

static INLINE int aom_reader_has_error(aom_reader *r) {
#if CONFIG_ANS
  return ans_reader_has_error(r);
#elif CONFIG_DAALA_EC
  return aom_daala_reader_has_error(r);
#else
  return aom_dk_reader_has_error(r);
#endif
}

// Returns the position in the bit reader in bits.
static INLINE uint32_t aom_reader_tell(const aom_reader *r) {
#if CONFIG_ANS
  (void)r;
  assert(0 && "aom_reader_tell() is unimplemented for ANS");
  return 0;
#elif CONFIG_DAALA_EC
  return aom_daala_reader_tell(r);
#else
  return aom_dk_reader_tell(r);
#endif
}

// Returns the position in the bit reader in 1/8th bits.
static INLINE uint32_t aom_reader_tell_frac(const aom_reader *r) {
#if CONFIG_ANS
  (void)r;
  assert(0 && "aom_reader_tell_frac() is unimplemented for ANS");
  return 0;
#elif CONFIG_DAALA_EC
  return aom_daala_reader_tell_frac(r);
#else
  return aom_dk_reader_tell_frac(r);
#endif
}

#if CONFIG_ACCOUNTING
static INLINE void aom_process_accounting(const aom_reader *r ACCT_STR_PARAM) {
  if (r->accounting != NULL) {
    uint32_t tell_frac;
    tell_frac = aom_reader_tell_frac(r);
    aom_accounting_record(r->accounting, ACCT_STR_NAME,
                          tell_frac - r->accounting->last_tell_frac);
    r->accounting->last_tell_frac = tell_frac;
  }
}
#endif

static INLINE int aom_read_(aom_reader *r, int prob ACCT_STR_PARAM) {
  int ret;
#if CONFIG_ANS
  ret = uabs_read(r, prob);
#elif CONFIG_DAALA_EC
  ret = aom_daala_read(r, prob);
#else
  ret = aom_dk_read(r, prob);
#endif
#if CONFIG_ACCOUNTING
  if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
#endif
  return ret;
}

static INLINE int aom_read_bit_(aom_reader *r ACCT_STR_PARAM) {
  int ret;
#if CONFIG_ANS
  ret = uabs_read_bit(r);  // Non trivial optimization at half probability
#else
  ret = aom_read(r, 128, NULL);  // aom_prob_half
#endif
#if CONFIG_ACCOUNTING
  if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
#endif
  return ret;
}

static INLINE int aom_read_literal_(aom_reader *r, int bits ACCT_STR_PARAM) {
  int literal = 0, bit;

  for (bit = bits - 1; bit >= 0; bit--) literal |= aom_read_bit(r, NULL) << bit;
#if CONFIG_ACCOUNTING
  if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
#endif
  return literal;
}

static INLINE int aom_read_tree_bits_(aom_reader *r, const aom_tree_index *tree,
                                      const aom_prob *probs ACCT_STR_PARAM) {
  aom_tree_index i = 0;

  while ((i = tree[i + aom_read(r, probs[i >> 1], NULL)]) > 0) continue;
#if CONFIG_ACCOUNTING
  if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
#endif
  return -i;
}

static INLINE int aom_read_tree_(aom_reader *r, const aom_tree_index *tree,
                                 const aom_prob *probs ACCT_STR_PARAM) {
  int ret;
#if CONFIG_DAALA_EC
  ret = daala_read_tree_bits(r, tree, probs);
#else
  ret = aom_read_tree_bits(r, tree, probs, NULL);
#endif
#if CONFIG_ACCOUNTING
  if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
#endif
  return ret;
}

#if CONFIG_EC_MULTISYMBOL
static INLINE int aom_read_symbol_(aom_reader *r, aom_cdf_prob *cdf,
                                   int nsymbs ACCT_STR_PARAM) {
  int ret;
#if CONFIG_RANS
  (void)nsymbs;
  ret = rans_read(r, cdf);
#elif CONFIG_DAALA_EC
  ret = daala_read_symbol(r, cdf, nsymbs);
#else
#error \
    "CONFIG_EC_MULTISYMBOL is selected without a valid backing entropy " \
    "coder. Enable daala_ec or ans for a valid configuration."
#endif

#if CONFIG_EC_ADAPT
  update_cdf(cdf, ret, nsymbs);
#endif

#if CONFIG_ACCOUNTING
  if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
#endif
  return ret;
}
#endif  // CONFIG_EC_MULTISYMBOL

#ifdef __cplusplus
}  // extern "C"
#endif

#endif  // AOM_DSP_BITREADER_H_
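The `while ((i = tree[i + aom_read(...)]) > 0)` loop in aom_read_tree_bits_ above walks a binary tree stored as a flat array: positive entries index the next node pair, and non-positive entries store a leaf as the negated token. A self-contained sketch of that traversal over a hypothetical three-symbol tree (the table and toy bit source are illustrative, not from the library):

#include <stdio.h>

typedef int tree_index; /* stands in for aom_tree_index */

/* Hypothetical 3-symbol tree in the flat layout the loop expects:
 * entries > 0 point at the next node pair, entries <= 0 store -token. */
static const tree_index kTree[4] = { 0, 2, -1, -2 };

/* Toy bit source so the example is self-contained. */
static int read_bit(const int **bits) { return *(*bits)++; }

static int read_tree(const tree_index *tree, const int **bits) {
  tree_index i = 0;
  while ((i = tree[i + read_bit(bits)]) > 0) continue;
  return -i; /* leaf reached: recover the token value */
}

int main(void) {
  const int stream[] = { 1, 0, /* -> token 1 */ 0 /* -> token 0 */ };
  const int *p = stream;
  printf("%d\n", read_tree(kTree, &p)); /* prints 1 */
  printf("%d\n", read_tree(kTree, &p)); /* prints 0 */
  return 0;
}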
@@ -1,47 +0,0 @@
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
#include "./aom_config.h"
#include "./bitreader_buffer.h"

size_t aom_rb_bytes_read(struct aom_read_bit_buffer *rb) {
  return (rb->bit_offset + 7) >> 3;
}

int aom_rb_read_bit(struct aom_read_bit_buffer *rb) {
  const size_t off = rb->bit_offset;
  const size_t p = off >> 3;
  const int q = 7 - (int)(off & 0x7);
  if (rb->bit_buffer + p < rb->bit_buffer_end) {
    const int bit = (rb->bit_buffer[p] >> q) & 1;
    rb->bit_offset = off + 1;
    return bit;
  } else {
    rb->error_handler(rb->error_handler_data);
    return 0;
  }
}

int aom_rb_read_literal(struct aom_read_bit_buffer *rb, int bits) {
  int value = 0, bit;
  for (bit = bits - 1; bit >= 0; bit--) value |= aom_rb_read_bit(rb) << bit;
  return value;
}

int aom_rb_read_signed_literal(struct aom_read_bit_buffer *rb, int bits) {
  const int value = aom_rb_read_literal(rb, bits);
  return aom_rb_read_bit(rb) ? -value : value;
}

int aom_rb_read_inv_signed_literal(struct aom_read_bit_buffer *rb, int bits) {
  const int nbits = sizeof(unsigned) * 8 - bits - 1;
  const unsigned value = (unsigned)aom_rb_read_literal(rb, bits + 1) << nbits;
  return ((int)value) >> nbits;
}
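aom_rb_read_inv_signed_literal above sign-extends a (bits + 1)-bit two's complement field by parking it at the top of the word and arithmetically shifting it back down. The same idiom in isolation (example values chosen for illustration; like the library code, it assumes the usual arithmetic behavior of >> on signed int):

#include <stdio.h>

/* Sign-extend the low (bits + 1) bits of raw. */
static int sign_extend(unsigned raw, int bits) {
  const int nbits = (int)(sizeof(unsigned) * 8) - bits - 1;
  return ((int)(raw << nbits)) >> nbits;
}

int main(void) {
  /* A 5-bit field (bits = 4): 0x1f is -1, 0x0f is +15. */
  printf("%d\n", sign_extend(0x1fu, 4)); /* prints -1 */
  printf("%d\n", sign_extend(0x0fu, 4)); /* prints 15 */
  return 0;
}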
@@ -1,48 +0,0 @@
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef AOM_DSP_BITREADER_BUFFER_H_
#define AOM_DSP_BITREADER_BUFFER_H_

#include <limits.h>

#include "aom/aom_integer.h"

#ifdef __cplusplus
extern "C" {
#endif

typedef void (*aom_rb_error_handler)(void *data);

struct aom_read_bit_buffer {
  const uint8_t *bit_buffer;
  const uint8_t *bit_buffer_end;
  size_t bit_offset;

  void *error_handler_data;
  aom_rb_error_handler error_handler;
};

size_t aom_rb_bytes_read(struct aom_read_bit_buffer *rb);

int aom_rb_read_bit(struct aom_read_bit_buffer *rb);

int aom_rb_read_literal(struct aom_read_bit_buffer *rb, int bits);

int aom_rb_read_signed_literal(struct aom_read_bit_buffer *rb, int bits);

int aom_rb_read_inv_signed_literal(struct aom_read_bit_buffer *rb, int bits);

#ifdef __cplusplus
}  // extern "C"
#endif

#endif  // AOM_DSP_BITREADER_BUFFER_H_
@@ -1,179 +0,0 @@
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef AOM_DSP_BITWRITER_H_
#define AOM_DSP_BITWRITER_H_

#include <assert.h>
#include "./aom_config.h"
#if CONFIG_EC_ADAPT && !CONFIG_EC_MULTISYMBOL
#error "CONFIG_EC_ADAPT is enabled without enabling CONFIG_EC_MULTISYMBOL"
#endif

#if CONFIG_ANS
#include "aom_dsp/buf_ans.h"
#elif CONFIG_DAALA_EC
#include "aom_dsp/daalaboolwriter.h"
#else
#include "aom_dsp/dkboolwriter.h"
#endif
#include "aom_dsp/prob.h"

#if CONFIG_RD_DEBUG
#include "av1/encoder/cost.h"
#endif

#ifdef __cplusplus
extern "C" {
#endif

#if CONFIG_ANS
typedef struct BufAnsCoder aom_writer;
#elif CONFIG_DAALA_EC
typedef struct daala_writer aom_writer;
#else
typedef struct aom_dk_writer aom_writer;
#endif

typedef struct TOKEN_STATS { int64_t cost; } TOKEN_STATS;

static INLINE void aom_start_encode(aom_writer *bc, uint8_t *buffer) {
#if CONFIG_ANS
  (void)bc;
  (void)buffer;
  assert(0 && "buf_ans requires a more complicated startup procedure");
#elif CONFIG_DAALA_EC
  aom_daala_start_encode(bc, buffer);
#else
  aom_dk_start_encode(bc, buffer);
#endif
}

static INLINE void aom_stop_encode(aom_writer *bc) {
#if CONFIG_ANS
  (void)bc;
  assert(0 && "buf_ans requires a more complicated shutdown procedure");
#elif CONFIG_DAALA_EC
  aom_daala_stop_encode(bc);
#else
  aom_dk_stop_encode(bc);
#endif
}

static INLINE void aom_write(aom_writer *br, int bit, int probability) {
#if CONFIG_ANS
  buf_uabs_write(br, bit, probability);
#elif CONFIG_DAALA_EC
  aom_daala_write(br, bit, probability);
#else
  aom_dk_write(br, bit, probability);
#endif
}

static INLINE void aom_write_record(aom_writer *br, int bit, int probability,
                                    TOKEN_STATS *token_stats) {
  aom_write(br, bit, probability);
#if CONFIG_RD_DEBUG
  token_stats->cost += av1_cost_bit(probability, bit);
#else
  (void)token_stats;
#endif
}

static INLINE void aom_write_bit(aom_writer *w, int bit) {
  aom_write(w, bit, 128);  // aom_prob_half
}

static INLINE void aom_write_bit_record(aom_writer *w, int bit,
                                        TOKEN_STATS *token_stats) {
  aom_write_record(w, bit, 128, token_stats);  // aom_prob_half
}

static INLINE void aom_write_literal(aom_writer *w, int data, int bits) {
  int bit;

  for (bit = bits - 1; bit >= 0; bit--) aom_write_bit(w, 1 & (data >> bit));
}

static INLINE void aom_write_tree_bits(aom_writer *w, const aom_tree_index *tr,
                                       const aom_prob *probs, int bits, int len,
                                       aom_tree_index i) {
  do {
    const int bit = (bits >> --len) & 1;
    aom_write(w, bit, probs[i >> 1]);
    i = tr[i + bit];
  } while (len);
}

static INLINE void aom_write_tree_bits_record(aom_writer *w,
                                              const aom_tree_index *tr,
                                              const aom_prob *probs, int bits,
                                              int len, aom_tree_index i,
                                              TOKEN_STATS *token_stats) {
  do {
    const int bit = (bits >> --len) & 1;
    aom_write_record(w, bit, probs[i >> 1], token_stats);
    i = tr[i + bit];
  } while (len);
}

static INLINE void aom_write_tree(aom_writer *w, const aom_tree_index *tree,
                                  const aom_prob *probs, int bits, int len,
                                  aom_tree_index i) {
#if CONFIG_DAALA_EC
  daala_write_tree_bits(w, tree, probs, bits, len, i);
#else
  aom_write_tree_bits(w, tree, probs, bits, len, i);
#endif
}

static INLINE void aom_write_tree_record(aom_writer *w,
                                         const aom_tree_index *tree,
                                         const aom_prob *probs, int bits,
                                         int len, aom_tree_index i,
                                         TOKEN_STATS *token_stats) {
#if CONFIG_DAALA_EC
  (void)token_stats;
  daala_write_tree_bits(w, tree, probs, bits, len, i);
#else
  aom_write_tree_bits_record(w, tree, probs, bits, len, i, token_stats);
#endif
}

#if CONFIG_EC_MULTISYMBOL
static INLINE void aom_write_symbol(aom_writer *w, int symb, aom_cdf_prob *cdf,
                                    int nsymbs) {
#if CONFIG_RANS
  struct rans_sym s;
  (void)nsymbs;
  assert(cdf);
  s.cum_prob = symb > 0 ? cdf[symb - 1] : 0;
  s.prob = cdf[symb] - s.cum_prob;
  buf_rans_write(w, &s);
#elif CONFIG_DAALA_EC
  daala_write_symbol(w, symb, cdf, nsymbs);
#else
#error \
    "CONFIG_EC_MULTISYMBOL is selected without a valid backing entropy " \
    "coder. Enable daala_ec or ans for a valid configuration."
#endif

#if CONFIG_EC_ADAPT
  update_cdf(cdf, symb, nsymbs);
#endif
}
#endif  // CONFIG_EC_MULTISYMBOL

#ifdef __cplusplus
}  // extern "C"
#endif

#endif  // AOM_DSP_BITWRITER_H_
@@ -1,43 +0,0 @@
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <limits.h>
#include <stdlib.h>

#include "./aom_config.h"
#include "./bitwriter_buffer.h"

size_t aom_wb_bytes_written(const struct aom_write_bit_buffer *wb) {
  return wb->bit_offset / CHAR_BIT + (wb->bit_offset % CHAR_BIT > 0);
}

void aom_wb_write_bit(struct aom_write_bit_buffer *wb, int bit) {
  const int off = (int)wb->bit_offset;
  const int p = off / CHAR_BIT;
  const int q = CHAR_BIT - 1 - off % CHAR_BIT;
  if (q == CHAR_BIT - 1) {
    wb->bit_buffer[p] = bit << q;
  } else {
    wb->bit_buffer[p] &= ~(1 << q);
    wb->bit_buffer[p] |= bit << q;
  }
  wb->bit_offset = off + 1;
}

void aom_wb_write_literal(struct aom_write_bit_buffer *wb, int data, int bits) {
  int bit;
  for (bit = bits - 1; bit >= 0; bit--) aom_wb_write_bit(wb, (data >> bit) & 1);
}

void aom_wb_write_inv_signed_literal(struct aom_write_bit_buffer *wb, int data,
                                     int bits) {
  aom_wb_write_literal(wb, data, bits + 1);
}
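aom_wb_write_bit packs bits MSB-first within each byte, the mirror image of aom_rb_read_bit in bitreader_buffer.c, so literals written with aom_wb_write_literal read back unchanged with aom_rb_read_literal. A standalone round-trip sketch of that MSB-first packing (local helpers, not the library structs):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* MSB-first bit packing, as in aom_wb_write_bit / aom_rb_read_bit. */
static void put_bit(uint8_t *buf, size_t off, int bit) {
  buf[off >> 3] &= (uint8_t)~(1u << (7 - (off & 7)));
  buf[off >> 3] |= (uint8_t)(bit << (7 - (off & 7)));
}
static int get_bit(const uint8_t *buf, size_t off) {
  return (buf[off >> 3] >> (7 - (off & 7))) & 1;
}

int main(void) {
  uint8_t buf[4];
  size_t off = 0;
  int bit, value = 0;
  memset(buf, 0, sizeof(buf));
  /* Write the 6-bit literal 0x2a MSB-first, then read it back. */
  for (bit = 5; bit >= 0; bit--) put_bit(buf, off++, (0x2a >> bit) & 1);
  for (off = 0, bit = 5; bit >= 0; bit--) value |= get_bit(buf, off++) << bit;
  printf("0x%x\n", value); /* prints 0x2a */
  return 0;
}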
@@ -1,39 +0,0 @@
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef AOM_DSP_BITWRITER_BUFFER_H_
#define AOM_DSP_BITWRITER_BUFFER_H_

#include "aom/aom_integer.h"

#ifdef __cplusplus
extern "C" {
#endif

struct aom_write_bit_buffer {
  uint8_t *bit_buffer;
  size_t bit_offset;
};

size_t aom_wb_bytes_written(const struct aom_write_bit_buffer *wb);

void aom_wb_write_bit(struct aom_write_bit_buffer *wb, int bit);

void aom_wb_write_literal(struct aom_write_bit_buffer *wb, int data, int bits);

void aom_wb_write_inv_signed_literal(struct aom_write_bit_buffer *wb, int data,
                                     int bits);

#ifdef __cplusplus
}  // extern "C"
#endif

#endif  // AOM_DSP_BITWRITER_BUFFER_H_
@@ -1,42 +0,0 @@
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef AOM_DSP_BLEND_H_
#define AOM_DSP_BLEND_H_

#include "aom_ports/mem.h"

// Various blending functions and macros.
// See also the aom_blend_* functions in aom_dsp_rtcd.h

// Alpha blending with alpha values from the range [0, 64], where 64
// means use the first input and 0 means use the second input.

#define AOM_BLEND_A64_ROUND_BITS 6
#define AOM_BLEND_A64_MAX_ALPHA (1 << AOM_BLEND_A64_ROUND_BITS)  // 64

#define AOM_BLEND_A64(a, v0, v1)                                          \
  ROUND_POWER_OF_TWO((a) * (v0) + (AOM_BLEND_A64_MAX_ALPHA - (a)) * (v1), \
                     AOM_BLEND_A64_ROUND_BITS)

// Alpha blending with alpha values from the range [0, 256], where 256
// means use the first input and 0 means use the second input.
#define AOM_BLEND_A256_ROUND_BITS 8
#define AOM_BLEND_A256_MAX_ALPHA (1 << AOM_BLEND_A256_ROUND_BITS)  // 256

#define AOM_BLEND_A256(a, v0, v1)                                          \
  ROUND_POWER_OF_TWO((a) * (v0) + (AOM_BLEND_A256_MAX_ALPHA - (a)) * (v1), \
                     AOM_BLEND_A256_ROUND_BITS)

// Blending by averaging.
#define AOM_BLEND_AVG(v0, v1) ROUND_POWER_OF_TWO((v0) + (v1), 1)

#endif  // AOM_DSP_BLEND_H_
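AOM_BLEND_A64 above is a rounded fixed-point lerp: for alpha a in [0, 64] it evaluates to (a * v0 + (64 - a) * v1 + 32) >> 6. A quick standalone usage sketch, with the ROUND_POWER_OF_TWO rounding helper restated inline so the example compiles on its own:

#include <stdio.h>

#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n)-1))) >> (n))
#define AOM_BLEND_A64_ROUND_BITS 6
#define AOM_BLEND_A64_MAX_ALPHA (1 << AOM_BLEND_A64_ROUND_BITS)  // 64
#define AOM_BLEND_A64(a, v0, v1)                                          \
  ROUND_POWER_OF_TWO((a) * (v0) + (AOM_BLEND_A64_MAX_ALPHA - (a)) * (v1), \
                     AOM_BLEND_A64_ROUND_BITS)

int main(void) {
  /* alpha 64 -> first input, 0 -> second input, 16 -> 1/4 of the way. */
  printf("%d\n", AOM_BLEND_A64(64, 200, 40)); /* prints 200 */
  printf("%d\n", AOM_BLEND_A64(0, 200, 40));  /* prints 40 */
  printf("%d\n", AOM_BLEND_A64(16, 200, 40)); /* prints 80 */
  return 0;
}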
@@ -1,71 +0,0 @@
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <assert.h>

#include "aom/aom_integer.h"
#include "aom_ports/mem.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/blend.h"

#include "./aom_dsp_rtcd.h"

void aom_blend_a64_hmask_c(uint8_t *dst, uint32_t dst_stride,
                           const uint8_t *src0, uint32_t src0_stride,
                           const uint8_t *src1, uint32_t src1_stride,
                           const uint8_t *mask, int h, int w) {
  int i, j;

  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));

  assert(h >= 1);
  assert(w >= 1);
  assert(IS_POWER_OF_TWO(h));
  assert(IS_POWER_OF_TWO(w));

  for (i = 0; i < h; ++i) {
    for (j = 0; j < w; ++j) {
      dst[i * dst_stride + j] = AOM_BLEND_A64(
          mask[j], src0[i * src0_stride + j], src1[i * src1_stride + j]);
    }
  }
}

#if CONFIG_AOM_HIGHBITDEPTH
void aom_highbd_blend_a64_hmask_c(uint8_t *dst_8, uint32_t dst_stride,
                                  const uint8_t *src0_8, uint32_t src0_stride,
                                  const uint8_t *src1_8, uint32_t src1_stride,
                                  const uint8_t *mask, int h, int w, int bd) {
  int i, j;
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
  const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8);
  const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8);
  (void)bd;

  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));

  assert(h >= 1);
  assert(w >= 1);
  assert(IS_POWER_OF_TWO(h));
  assert(IS_POWER_OF_TWO(w));

  assert(bd == 8 || bd == 10 || bd == 12);

  for (i = 0; i < h; ++i) {
    for (j = 0; j < w; ++j) {
      dst[i * dst_stride + j] = AOM_BLEND_A64(
          mask[j], src0[i * src0_stride + j], src1[i * src1_stride + j]);
    }
  }
}
#endif  // CONFIG_AOM_HIGHBITDEPTH
@@ -1,145 +0,0 @@
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <assert.h>

#include "aom/aom_integer.h"
#include "aom_ports/mem.h"
#include "aom_dsp/blend.h"
#include "aom_dsp/aom_dsp_common.h"

#include "./aom_dsp_rtcd.h"

// Blending with alpha mask. Mask values come from the range [0, 64],
// as described for AOM_BLEND_A64 in aom_dsp/blend.h. src0 or src1 can
// be the same as dst, or dst can be different from both sources.

void aom_blend_a64_mask_c(uint8_t *dst, uint32_t dst_stride,
                          const uint8_t *src0, uint32_t src0_stride,
                          const uint8_t *src1, uint32_t src1_stride,
                          const uint8_t *mask, uint32_t mask_stride, int h,
                          int w, int subh, int subw) {
  int i, j;

  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));

  assert(h >= 1);
  assert(w >= 1);
  assert(IS_POWER_OF_TWO(h));
  assert(IS_POWER_OF_TWO(w));

  if (subw == 0 && subh == 0) {
    for (i = 0; i < h; ++i) {
      for (j = 0; j < w; ++j) {
        const int m = mask[i * mask_stride + j];
        dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
                                                src1[i * src1_stride + j]);
      }
    }
  } else if (subw == 1 && subh == 1) {
    for (i = 0; i < h; ++i) {
      for (j = 0; j < w; ++j) {
        const int m = ROUND_POWER_OF_TWO(
            mask[(2 * i) * mask_stride + (2 * j)] +
                mask[(2 * i + 1) * mask_stride + (2 * j)] +
                mask[(2 * i) * mask_stride + (2 * j + 1)] +
                mask[(2 * i + 1) * mask_stride + (2 * j + 1)],
            2);
        dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
                                                src1[i * src1_stride + j]);
      }
    }
  } else if (subw == 1 && subh == 0) {
    for (i = 0; i < h; ++i) {
      for (j = 0; j < w; ++j) {
        const int m = AOM_BLEND_AVG(mask[i * mask_stride + (2 * j)],
                                    mask[i * mask_stride + (2 * j + 1)]);
        dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
                                                src1[i * src1_stride + j]);
      }
    }
  } else {
    for (i = 0; i < h; ++i) {
      for (j = 0; j < w; ++j) {
        const int m = AOM_BLEND_AVG(mask[(2 * i) * mask_stride + j],
                                    mask[(2 * i + 1) * mask_stride + j]);
        dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
                                                src1[i * src1_stride + j]);
      }
    }
  }
}

#if CONFIG_AOM_HIGHBITDEPTH
void aom_highbd_blend_a64_mask_c(uint8_t *dst_8, uint32_t dst_stride,
                                 const uint8_t *src0_8, uint32_t src0_stride,
                                 const uint8_t *src1_8, uint32_t src1_stride,
                                 const uint8_t *mask, uint32_t mask_stride,
                                 int h, int w, int subh, int subw, int bd) {
  int i, j;
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
  const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8);
  const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8);
  (void)bd;

  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));

  assert(h >= 1);
  assert(w >= 1);
  assert(IS_POWER_OF_TWO(h));
  assert(IS_POWER_OF_TWO(w));

  assert(bd == 8 || bd == 10 || bd == 12);

  if (subw == 0 && subh == 0) {
    for (i = 0; i < h; ++i) {
      for (j = 0; j < w; ++j) {
        const int m = mask[i * mask_stride + j];
        dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
                                                src1[i * src1_stride + j]);
      }
    }
  } else if (subw == 1 && subh == 1) {
    for (i = 0; i < h; ++i) {
      for (j = 0; j < w; ++j) {
        const int m = ROUND_POWER_OF_TWO(
            mask[(2 * i) * mask_stride + (2 * j)] +
                mask[(2 * i + 1) * mask_stride + (2 * j)] +
                mask[(2 * i) * mask_stride + (2 * j + 1)] +
                mask[(2 * i + 1) * mask_stride + (2 * j + 1)],
            2);
        dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
                                                src1[i * src1_stride + j]);
      }
    }
  } else if (subw == 1 && subh == 0) {
    for (i = 0; i < h; ++i) {
      for (j = 0; j < w; ++j) {
        const int m = AOM_BLEND_AVG(mask[i * mask_stride + (2 * j)],
                                    mask[i * mask_stride + (2 * j + 1)]);
        dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
                                                src1[i * src1_stride + j]);
      }
    }
  } else {
    for (i = 0; i < h; ++i) {
      for (j = 0; j < w; ++j) {
        const int m = AOM_BLEND_AVG(mask[(2 * i) * mask_stride + j],
                                    mask[(2 * i + 1) * mask_stride + j]);
        dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
                                                src1[i * src1_stride + j]);
      }
    }
  }
}
#endif  // CONFIG_AOM_HIGHBITDEPTH
@@ -1,73 +0,0 @@
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <assert.h>

#include "aom/aom_integer.h"
#include "aom_ports/mem.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/blend.h"

#include "./aom_dsp_rtcd.h"

void aom_blend_a64_vmask_c(uint8_t *dst, uint32_t dst_stride,
                           const uint8_t *src0, uint32_t src0_stride,
                           const uint8_t *src1, uint32_t src1_stride,
                           const uint8_t *mask, int h, int w) {
  int i, j;

  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));

  assert(h >= 1);
  assert(w >= 1);
  assert(IS_POWER_OF_TWO(h));
  assert(IS_POWER_OF_TWO(w));

  for (i = 0; i < h; ++i) {
    const int m = mask[i];
    for (j = 0; j < w; ++j) {
      dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
                                              src1[i * src1_stride + j]);
    }
  }
}

#if CONFIG_AOM_HIGHBITDEPTH
void aom_highbd_blend_a64_vmask_c(uint8_t *dst_8, uint32_t dst_stride,
                                  const uint8_t *src0_8, uint32_t src0_stride,
                                  const uint8_t *src1_8, uint32_t src1_stride,
                                  const uint8_t *mask, int h, int w, int bd) {
  int i, j;
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
  const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8);
  const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8);
  (void)bd;

  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));

  assert(h >= 1);
  assert(w >= 1);
  assert(IS_POWER_OF_TWO(h));
  assert(IS_POWER_OF_TWO(w));

  assert(bd == 8 || bd == 10 || bd == 12);

  for (i = 0; i < h; ++i) {
    const int m = mask[i];
    for (j = 0; j < w; ++j) {
      dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
                                              src1[i * src1_stride + j]);
    }
  }
}
#endif  // CONFIG_AOM_HIGHBITDEPTH
@@ -1,42 +0,0 @@
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <string.h>

#include "aom_dsp/buf_ans.h"
#include "aom_mem/aom_mem.h"
#include "aom/internal/aom_codec_internal.h"

void aom_buf_ans_alloc(struct BufAnsCoder *c,
                       struct aom_internal_error_info *error, int size_hint) {
  c->error = error;
  c->size = size_hint;
  AOM_CHECK_MEM_ERROR(error, c->buf, aom_malloc(c->size * sizeof(*c->buf)));
  // Initialize to overfull to trigger the assert in write.
  c->offset = c->size + 1;
}

void aom_buf_ans_free(struct BufAnsCoder *c) {
  aom_free(c->buf);
  c->buf = NULL;
  c->size = 0;
}

void aom_buf_ans_grow(struct BufAnsCoder *c) {
  struct buffered_ans_symbol *new_buf = NULL;
  int new_size = c->size * 2;
  AOM_CHECK_MEM_ERROR(c->error, new_buf,
                      aom_malloc(new_size * sizeof(*new_buf)));
  memcpy(new_buf, c->buf, c->size * sizeof(*c->buf));
  aom_free(c->buf);
  c->buf = new_buf;
  c->size = new_size;
}
@@ -1,112 +0,0 @@
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef AOM_DSP_BUF_ANS_H_
#define AOM_DSP_BUF_ANS_H_
// Buffered forward ANS writer.
// Symbols are written to the writer in forward (decode) order and serialized
// backwards due to ANS's stack-like behavior.

#include <assert.h>
#include "./aom_config.h"
#include "aom/aom_integer.h"
#include "aom_dsp/ans.h"
#include "aom_dsp/answriter.h"

#ifdef __cplusplus
extern "C" {
#endif  // __cplusplus

#define ANS_METHOD_UABS 0
#define ANS_METHOD_RANS 1

struct buffered_ans_symbol {
  unsigned int method : 1;  // one of ANS_METHOD_UABS or ANS_METHOD_RANS
  // TODO(aconverse): Should be possible to write this in terms of start for ABS
  unsigned int val_start : RANS_PROB_BITS;  // Boolean value for ABS
                                            // start in symbol cycle for Rans
  unsigned int prob : RANS_PROB_BITS;       // Probability of this symbol
};

struct BufAnsCoder {
  struct aom_internal_error_info *error;
  struct buffered_ans_symbol *buf;
  int size;
  int offset;
};

void aom_buf_ans_alloc(struct BufAnsCoder *c,
                       struct aom_internal_error_info *error, int size_hint);

void aom_buf_ans_free(struct BufAnsCoder *c);

void aom_buf_ans_grow(struct BufAnsCoder *c);

static INLINE void buf_ans_write_reset(struct BufAnsCoder *const c) {
  c->offset = 0;
}

static INLINE void buf_uabs_write(struct BufAnsCoder *const c, uint8_t val,
                                  AnsP8 prob) {
  assert(c->offset <= c->size);
  if (c->offset == c->size) {
    aom_buf_ans_grow(c);
  }
  c->buf[c->offset].method = ANS_METHOD_UABS;
  c->buf[c->offset].val_start = val;
  c->buf[c->offset].prob = prob;
  ++c->offset;
}

static INLINE void buf_rans_write(struct BufAnsCoder *const c,
                                  const struct rans_sym *const sym) {
  assert(c->offset <= c->size);
  if (c->offset == c->size) {
    aom_buf_ans_grow(c);
  }
  c->buf[c->offset].method = ANS_METHOD_RANS;
  c->buf[c->offset].val_start = sym->cum_prob;
  c->buf[c->offset].prob = sym->prob;
  ++c->offset;
}

static INLINE void buf_ans_flush(const struct BufAnsCoder *const c,
                                 struct AnsCoder *ans) {
  int offset;
  for (offset = c->offset - 1; offset >= 0; --offset) {
    if (c->buf[offset].method == ANS_METHOD_RANS) {
      struct rans_sym sym;
      sym.prob = c->buf[offset].prob;
      sym.cum_prob = c->buf[offset].val_start;
      rans_write(ans, &sym);
    } else {
      uabs_write(ans, (uint8_t)c->buf[offset].val_start,
                 (AnsP8)c->buf[offset].prob);
    }
  }
}

static INLINE void buf_uabs_write_bit(struct BufAnsCoder *c, int bit) {
  buf_uabs_write(c, bit, 128);
}

static INLINE void buf_uabs_write_literal(struct BufAnsCoder *c, int literal,
                                          int bits) {
  int bit;

  assert(bits < 31);
  for (bit = bits - 1; bit >= 0; bit--)
    buf_uabs_write_bit(c, 1 & (literal >> bit));
}
#ifdef __cplusplus
}  // extern "C"
#endif  // __cplusplus
#endif  // AOM_DSP_BUF_ANS_H_
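The header above captures the key idea of buffered ANS: symbols are queued in decode order, then buf_ans_flush() replays them in reverse so the stack-like ANS coder emits them last-in first-out. A hypothetical usage sketch under that reading (AnsCoder setup is elided and assumed to come from aom_dsp/answriter.h; the probability value is arbitrary):

/* Sketch only: queue two binary symbols forward, then flush backwards. */
void buf_ans_usage_sketch(struct BufAnsCoder *buf, struct AnsCoder *ans) {
  buf_ans_write_reset(buf);
  buf_uabs_write_bit(buf, 1); /* first symbol in decode order */
  buf_uabs_write(buf, 0, 37); /* a zero bit with an 8-bit probability */
  buf_ans_flush(buf, ans);    /* replayed in reverse into 'ans' */
}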
@@ -1,37 +0,0 @@
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include "aom_dsp/daalaboolreader.h"

int aom_daala_reader_init(daala_reader *r, const uint8_t *buffer, int size) {
  if (size && !buffer) {
    return 1;
  }
  r->buffer_end = buffer + size;
  r->buffer = buffer;
  od_ec_dec_init(&r->ec, buffer, size - 1);
#if CONFIG_ACCOUNTING
  r->accounting = NULL;
#endif
  return 0;
}

const uint8_t *aom_daala_reader_find_end(daala_reader *r) {
  return r->buffer_end;
}

uint32_t aom_daala_reader_tell(const daala_reader *r) {
  return od_ec_dec_tell(&r->ec);
}

uint32_t aom_daala_reader_tell_frac(const daala_reader *r) {
  return od_ec_dec_tell_frac(&r->ec);
}
@@ -1,87 +0,0 @@
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef AOM_DSP_DAALABOOLREADER_H_
#define AOM_DSP_DAALABOOLREADER_H_

#include "aom/aom_integer.h"
#include "aom_dsp/entdec.h"
#include "aom_dsp/prob.h"
#if CONFIG_ACCOUNTING
#include "av1/common/accounting.h"
#endif

#ifdef __cplusplus
extern "C" {
#endif

struct daala_reader {
  const uint8_t *buffer;
  const uint8_t *buffer_end;
  od_ec_dec ec;
#if CONFIG_ACCOUNTING
  Accounting *accounting;
#endif
};

typedef struct daala_reader daala_reader;

int aom_daala_reader_init(daala_reader *r, const uint8_t *buffer, int size);
const uint8_t *aom_daala_reader_find_end(daala_reader *r);
uint32_t aom_daala_reader_tell(const daala_reader *r);
uint32_t aom_daala_reader_tell_frac(const daala_reader *r);

static INLINE int aom_daala_read(daala_reader *r, int prob) {
  if (prob == 128) {
    return od_ec_dec_bits(&r->ec, 1, "aom_bits");
  } else {
    int p = ((prob << 15) + (256 - prob)) >> 8;
    return od_ec_decode_bool_q15(&r->ec, p);
  }
}

static INLINE int aom_daala_read_bit(daala_reader *r) {
  return aom_daala_read(r, 128);
}

static INLINE int aom_daala_reader_has_error(daala_reader *r) {
  return r->ec.error;
}

static INLINE int daala_read_tree_bits(daala_reader *r,
                                       const aom_tree_index *tree,
                                       const aom_prob *probs) {
  aom_tree_index i = 0;
  do {
    aom_cdf_prob cdf[16];
    aom_tree_index index[16];
    int path[16];
    int dist[16];
    int nsymbs;
    int symb;
    nsymbs = tree_to_cdf(tree, probs, i, cdf, index, path, dist);
    symb = od_ec_decode_cdf_q15(&r->ec, cdf, nsymbs);
    OD_ASSERT(symb >= 0 && symb < nsymbs);
    i = index[symb];
  } while (i > 0);
  return -i;
}

static INLINE int daala_read_symbol(daala_reader *r, const aom_cdf_prob *cdf,
                                    int nsymbs) {
  return od_ec_decode_cdf_q15(&r->ec, cdf, nsymbs);
}

#ifdef __cplusplus
}  // extern "C"
#endif

#endif
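The expression p = ((prob << 15) + (256 - prob)) >> 8 in aom_daala_read() rescales an 8-bit probability into Q15 while keeping it strictly inside (0, 32768). A worked check of that arithmetic (illustrative, not part of the original file):

#include <assert.h>
static void check_q15_mapping(void) {
  /* prob = 1 maps to 128, prob = 128 to 16384 (one half in Q15), and
     prob = 255 to 32640, so the Q15 value never hits 0 or 32768. */
  assert((((1 << 15) + 255) >> 8) == 128);
  assert((((128 << 15) + 128) >> 8) == 16384);
  assert((((255 << 15) + 1) >> 8) == 32640);
}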
@@ -1,32 +0,0 @@
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <string.h>
#include "aom_dsp/daalaboolwriter.h"

void aom_daala_start_encode(daala_writer *br, uint8_t *source) {
  br->buffer = source;
  br->pos = 0;
  od_ec_enc_init(&br->ec, 62025);
}

void aom_daala_stop_encode(daala_writer *br) {
  uint32_t daala_bytes;
  unsigned char *daala_data;
  daala_data = od_ec_enc_done(&br->ec, &daala_bytes);
  memcpy(br->buffer, daala_data, daala_bytes);
  br->pos = daala_bytes;
  /* Prevent ec bitstream from being detected as a superframe marker.
     Must always be added, so that rawbits knows the exact length of the
     bitstream. */
  br->buffer[br->pos++] = 0;
  od_ec_enc_clear(&br->ec);
}
@@ -1,90 +0,0 @@
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef AOM_DSP_DAALABOOLWRITER_H_
#define AOM_DSP_DAALABOOLWRITER_H_

#include "aom_dsp/entenc.h"
#include "aom_dsp/prob.h"

#ifdef __cplusplus
extern "C" {
#endif

struct daala_writer {
  unsigned int pos;
  uint8_t *buffer;
  od_ec_enc ec;
};

typedef struct daala_writer daala_writer;

void aom_daala_start_encode(daala_writer *w, uint8_t *buffer);
void aom_daala_stop_encode(daala_writer *w);

static INLINE void aom_daala_write(daala_writer *w, int bit, int prob) {
  if (prob == 128) {
    od_ec_enc_bits(&w->ec, bit, 1);
  } else {
    int p = ((prob << 15) + (256 - prob)) >> 8;
    od_ec_encode_bool_q15(&w->ec, bit, p);
  }
}

static INLINE void daala_write_tree_bits(daala_writer *w,
                                         const aom_tree_index *tree,
                                         const aom_prob *probs, int bits,
                                         int len, aom_tree_index i) {
  aom_tree_index root;
  root = i;
  do {
    aom_cdf_prob cdf[16];
    aom_tree_index index[16];
    int path[16];
    int dist[16];
    int nsymbs;
    int symb;
    int j;
    /* Compute the CDF of the binary tree using the given probabilities. */
    nsymbs = tree_to_cdf(tree, probs, root, cdf, index, path, dist);
    /* Find the symbol to code. */
    symb = -1;
    for (j = 0; j < nsymbs; j++) {
      /* If this symbol codes a leaf node, */
      if (index[j] <= 0) {
        if (len == dist[j] && path[j] == bits) {
          symb = j;
          break;
        }
      } else {
        if (len > dist[j] && path[j] == bits >> (len - dist[j])) {
          symb = j;
          break;
        }
      }
    }
    OD_ASSERT(symb != -1);
    od_ec_encode_cdf_q15(&w->ec, symb, cdf, nsymbs);
    bits &= (1 << (len - dist[symb])) - 1;
    len -= dist[symb];
  } while (len);
}

static INLINE void daala_write_symbol(daala_writer *w, int symb,
                                      const aom_cdf_prob *cdf, int nsymbs) {
  od_ec_encode_cdf_q15(&w->ec, symb, cdf, nsymbs);
}

#ifdef __cplusplus
}  // extern "C"
#endif

#endif
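Taken together with daalaboolreader, the writer and reader form a symmetric pair: whatever probability the encoder passed to aom_daala_write(), the decoder must pass the same value to aom_daala_read() to stay in sync. A hypothetical round-trip sketch using only the entry points declared above (buffer size and probability are arbitrary choices, not values from the source):

void daala_roundtrip_sketch(void) {
  uint8_t buf[1024];
  daala_writer w;
  daala_reader r;
  aom_daala_start_encode(&w, buf);
  aom_daala_write(&w, 1, 200); /* one bit under an 8-bit probability */
  aom_daala_stop_encode(&w);   /* appends the trailing zero byte */
  aom_daala_reader_init(&r, buf, (int)w.pos);
  int bit = aom_daala_read(&r, 200); /* same probability on both sides */
  (void)bit;                         /* bit == 1 on a correct round trip */
}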
@@ -1,180 +0,0 @@
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef AOM_DSP_DKBOOLREADER_H_
#define AOM_DSP_DKBOOLREADER_H_

#include <assert.h>
#include <stddef.h>
#include <limits.h>

#include "./aom_config.h"
#if CONFIG_BITSTREAM_DEBUG
#include <assert.h>
#include <stdio.h>
#include "aom_util/debug_util.h"
#endif  // CONFIG_BITSTREAM_DEBUG

#include "aom_ports/mem.h"
#include "aom/aomdx.h"
#include "aom/aom_integer.h"
#include "aom_dsp/prob.h"
#if CONFIG_ACCOUNTING
#include "av1/common/accounting.h"
#endif

#ifdef __cplusplus
extern "C" {
#endif

typedef size_t BD_VALUE;

#define BD_VALUE_SIZE ((int)sizeof(BD_VALUE) * CHAR_BIT)

// This is meant to be a large, positive constant that can still be efficiently
// loaded as an immediate (on platforms like ARM, for example).
// Even relatively modest values like 100 would work fine.
#define LOTS_OF_BITS 0x40000000

struct aom_dk_reader {
  // Be careful when reordering this struct; it may impact the cache negatively.
  BD_VALUE value;
  unsigned int range;
  int count;
  const uint8_t *buffer_start;
  const uint8_t *buffer_end;
  const uint8_t *buffer;
  aom_decrypt_cb decrypt_cb;
  void *decrypt_state;
  uint8_t clear_buffer[sizeof(BD_VALUE) + 1];
#if CONFIG_ACCOUNTING
  Accounting *accounting;
#endif
};

int aom_dk_reader_init(struct aom_dk_reader *r, const uint8_t *buffer,
                       size_t size, aom_decrypt_cb decrypt_cb,
                       void *decrypt_state);

void aom_dk_reader_fill(struct aom_dk_reader *r);

const uint8_t *aom_dk_reader_find_end(struct aom_dk_reader *r);

static INLINE uint32_t aom_dk_reader_tell(const struct aom_dk_reader *r) {
  const uint32_t bits_read = (r->buffer - r->buffer_start) * CHAR_BIT;
  const int count =
      (r->count < LOTS_OF_BITS) ? r->count : r->count - LOTS_OF_BITS;
  assert(r->buffer >= r->buffer_start);
  return bits_read - (count + CHAR_BIT);
}

/*The resolution of fractional-precision bit usage measurements, i.e.,
   3 => 1/8th bits.*/
#define DK_BITRES (3)

static INLINE uint32_t aom_dk_reader_tell_frac(const struct aom_dk_reader *r) {
  uint32_t num_bits;
  uint32_t range;
  int l;
  int i;
  num_bits = aom_dk_reader_tell(r) << DK_BITRES;
  range = r->range;
  l = 0;
  for (i = DK_BITRES; i-- > 0;) {
    int b;
    range = range * range >> 7;
    b = (int)(range >> 8);
    l = l << 1 | b;
    range >>= b;
  }
  return num_bits - l;
}

static INLINE int aom_dk_reader_has_error(struct aom_dk_reader *r) {
  // Check if we have reached the end of the buffer.
  //
  // Variable 'count' stores the number of bits in the 'value' buffer, minus
  // 8. The top byte is part of the algorithm, and the remainder is buffered
  // to be shifted into it. So if count == 8, the top 16 bits of 'value' are
  // occupied, 8 for the algorithm and 8 in the buffer.
  //
  // When reading a byte from the user's buffer, count is filled with 8 and
  // one byte is filled into the value buffer. When we reach the end of the
  // data, count is additionally filled with LOTS_OF_BITS. So when
  // count == LOTS_OF_BITS - 1, the user's data has been exhausted.
  //
  // Returns 1 if we have tried to decode bits after the end of stream was
  // encountered, 0 if no error occurred.
  return r->count > BD_VALUE_SIZE && r->count < LOTS_OF_BITS;
}

static INLINE int aom_dk_read(struct aom_dk_reader *r, int prob) {
  unsigned int bit = 0;
  BD_VALUE value;
  BD_VALUE bigsplit;
  int count;
  unsigned int range;
  unsigned int split = (r->range * prob + (256 - prob)) >> CHAR_BIT;

  if (r->count < 0) aom_dk_reader_fill(r);

  value = r->value;
  count = r->count;

  bigsplit = (BD_VALUE)split << (BD_VALUE_SIZE - CHAR_BIT);

  range = split;

  if (value >= bigsplit) {
    range = r->range - split;
    value = value - bigsplit;
    bit = 1;
  }

  {
    register int shift = aom_norm[range];
    range <<= shift;
    value <<= shift;
    count -= shift;
  }
  r->value = value;
  r->count = count;
  r->range = range;

#if CONFIG_BITSTREAM_DEBUG
  {
    int ref_bit, ref_prob;
    const int queue_r = bitstream_queue_get_read();
    const int frame_idx = bitstream_queue_get_frame_read();
    bitstream_queue_pop(&ref_bit, &ref_prob);
    if (prob != ref_prob) {
      fprintf(
          stderr,
          "\n *** prob error, frame_idx_r %d prob %d ref_prob %d queue_r %d\n",
          frame_idx, prob, ref_prob, queue_r);
      assert(0);
    }
    if ((int)bit != ref_bit) {
      fprintf(stderr, "\n *** bit error, frame_idx_r %d bit %d ref_bit %d\n",
              frame_idx, bit, ref_bit);
      assert(0);
    }
  }
#endif  // CONFIG_BITSTREAM_DEBUG

  return bit;
}

#ifdef __cplusplus
}  // extern "C"
#endif

#endif  // AOM_DSP_DKBOOLREADER_H_
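The split computation in aom_dk_read() partitions the current range in proportion to the 8-bit probability of the zero arm. A worked check of that arithmetic (illustrative, not part of the original file):

#include <assert.h>
static void check_dk_split(void) {
  const unsigned range = 255, prob = 128;
  const unsigned split = (range * prob + (256 - prob)) >> 8;
  assert(split == 128); /* prob == 128 splits the 255-wide range in half */
}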
@@ -1,44 +0,0 @@
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <assert.h>

#include "./dkboolwriter.h"

static INLINE void aom_dk_write_bit(aom_dk_writer *w, int bit) {
  aom_dk_write(w, bit, 128);  // aom_prob_half
}

void aom_dk_start_encode(aom_dk_writer *br, uint8_t *source) {
  br->lowvalue = 0;
  br->range = 255;
  br->count = -24;
  br->buffer = source;
  br->pos = 0;
  aom_dk_write_bit(br, 0);
}

void aom_dk_stop_encode(aom_dk_writer *br) {
  int i;

#if CONFIG_BITSTREAM_DEBUG
  bitstream_queue_set_skip_write(1);
#endif  // CONFIG_BITSTREAM_DEBUG

  for (i = 0; i < 32; i++) aom_dk_write_bit(br, 0);

#if CONFIG_BITSTREAM_DEBUG
  bitstream_queue_set_skip_write(0);
#endif  // CONFIG_BITSTREAM_DEBUG

  // Ensure there's no ambiguous collision with any index marker bytes
  if ((br->buffer[br->pos - 1] & 0xe0) == 0xc0) br->buffer[br->pos++] = 0;
}
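The final check in aom_dk_stop_encode() guards against the last byte mimicking an index marker. A small illustration of the mask (my reading of the check, not code from the file): (b & 0xe0) == 0xc0 matches exactly the bytes 0xc0 through 0xdf, so a trailing zero byte is appended whenever the stream happens to end in that range.

static int looks_like_index_marker(unsigned char b) {
  return (b & 0xe0) == 0xc0; /* true for 0xc0..0xdf only */
}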
@@ -1,104 +0,0 @@
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef AOM_DSP_DKBOOLWRITER_H_
#define AOM_DSP_DKBOOLWRITER_H_

#include "./aom_config.h"

#if CONFIG_BITSTREAM_DEBUG
#include <stdio.h>
#include "aom_util/debug_util.h"
#endif  // CONFIG_BITSTREAM_DEBUG

#include "aom_dsp/prob.h"
#include "aom_ports/mem.h"

#ifdef __cplusplus
extern "C" {
#endif

typedef struct aom_dk_writer {
  unsigned int lowvalue;
  unsigned int range;
  int count;
  unsigned int pos;
  uint8_t *buffer;
} aom_dk_writer;

void aom_dk_start_encode(aom_dk_writer *bc, uint8_t *buffer);
void aom_dk_stop_encode(aom_dk_writer *bc);

static INLINE void aom_dk_write(aom_dk_writer *br, int bit, int probability) {
  unsigned int split;
  int count = br->count;
  unsigned int range = br->range;
  unsigned int lowvalue = br->lowvalue;
  register int shift;

#if CONFIG_BITSTREAM_DEBUG
  // int queue_r = 0;
  // int frame_idx_r = 0;
  // int queue_w = bitstream_queue_get_write();
  // int frame_idx_w = bitstream_queue_get_frame_write();
  // if (frame_idx_w == frame_idx_r && queue_w == queue_r) {
  //   fprintf(stderr, "\n *** bitstream queue at frame_idx_w %d queue_w %d\n",
  //           frame_idx_w, queue_w);
  // }
  bitstream_queue_push(bit, probability);
#endif  // CONFIG_BITSTREAM_DEBUG

  split = 1 + (((range - 1) * probability) >> 8);

  range = split;

  if (bit) {
    lowvalue += split;
    range = br->range - split;
  }

  shift = aom_norm[range];

  range <<= shift;
  count += shift;

  if (count >= 0) {
    int offset = shift - count;

    if ((lowvalue << (offset - 1)) & 0x80000000) {
      int x = br->pos - 1;

      while (x >= 0 && br->buffer[x] == 0xff) {
        br->buffer[x] = 0;
        x--;
      }

      br->buffer[x] += 1;
    }

    br->buffer[br->pos++] = (lowvalue >> (24 - offset));
    lowvalue <<= offset;
    shift = count;
    lowvalue &= 0xffffff;
    count -= 8;
  }

  lowvalue <<= shift;
  br->count = count;
  br->lowvalue = lowvalue;
  br->range = range;
}

#ifdef __cplusplus
}  // extern "C"
#endif

#endif  // AOM_DSP_DKBOOLWRITER_H_
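The while loop inside aom_dk_write() is the classic carry-propagation step of an arithmetic coder: when the new low value overflows, the carry walks back through any 0xff bytes already written. A standalone sketch of just that step (illustrative helper extracted for clarity; the encoder's framing guarantees a non-0xff byte exists before the run):

static void propagate_carry_sketch(unsigned char *buf, int last) {
  int x = last;
  while (x >= 0 && buf[x] == 0xff) {
    buf[x] = 0; /* 0xff + carry wraps to 0x00; keep carrying */
    x--;
  }
  buf[x] += 1; /* the first non-0xff byte absorbs the carry */
}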
@@ -1,80 +0,0 @@
/*
 * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifdef HAVE_CONFIG_H
#include "./config.h"
#endif

#include "aom_dsp/entcode.h"

/*CDFs for uniform probability distributions of small sizes (2 through 16,
   inclusive).*/
// clang-format off
const uint16_t OD_UNIFORM_CDFS_Q15[135] = {
  16384, 32768,
  10923, 21845, 32768,
  8192, 16384, 24576, 32768,
  6554, 13107, 19661, 26214, 32768,
  5461, 10923, 16384, 21845, 27307, 32768,
  4681, 9362, 14043, 18725, 23406, 28087, 32768,
  4096, 8192, 12288, 16384, 20480, 24576, 28672, 32768,
  3641, 7282, 10923, 14564, 18204, 21845, 25486, 29127, 32768,
  3277, 6554, 9830, 13107, 16384, 19661, 22938, 26214, 29491, 32768,
  2979, 5958, 8937, 11916, 14895, 17873, 20852, 23831, 26810, 29789, 32768,
  2731, 5461, 8192, 10923, 13653, 16384, 19115, 21845, 24576, 27307, 30037,
   32768,
  2521, 5041, 7562, 10082, 12603, 15124, 17644, 20165, 22686, 25206, 27727,
   30247, 32768,
  2341, 4681, 7022, 9362, 11703, 14043, 16384, 18725, 21065, 23406, 25746,
   28087, 30427, 32768,
  2185, 4369, 6554, 8738, 10923, 13107, 15292, 17476, 19661, 21845, 24030,
   26214, 28399, 30583, 32768,
  2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432, 20480, 22528,
   24576, 26624, 28672, 30720, 32768
};
// clang-format on

/*Given the current total integer number of bits used and the current value of
   rng, computes the fractional number of bits used to OD_BITRES precision.
  This is used by od_ec_enc_tell_frac() and od_ec_dec_tell_frac().
  nbits_total: The number of whole bits currently used, i.e., the value
                returned by od_ec_enc_tell() or od_ec_dec_tell().
  rng: The current value of rng from either the encoder or decoder state.
  Return: The number of bits scaled by 2**OD_BITRES.
          This will always be slightly larger than the exact value (e.g., all
           rounding error is in the positive direction).*/
uint32_t od_ec_tell_frac(uint32_t nbits_total, uint32_t rng) {
  uint32_t nbits;
  int l;
  int i;
  /*To handle the non-integral number of bits still left in the encoder/decoder
     state, we compute the worst-case number of bits of val that must be
     encoded to ensure that the value is inside the range for any possible
     subsequent bits.
    The computation here is independent of val itself (the decoder does not
     even track that value), even though the real number of bits used after
     od_ec_enc_done() may be 1 smaller if rng is a power of two and the
     corresponding trailing bits of val are all zeros.
    If we did try to track that special case, then coding a value with a
     probability of 1/(1 << n) might sometimes appear to use more than n bits.
    This may help explain the surprising result that a newly initialized
     encoder or decoder claims to have used 1 bit.*/
  nbits = nbits_total << OD_BITRES;
  l = 0;
  for (i = OD_BITRES; i-- > 0;) {
    int b;
    rng = rng * rng >> 15;
    b = (int)(rng >> 16);
    l = l << 1 | b;
    rng >>= b;
  }
  return nbits - l;
}
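The loop in od_ec_tell_frac() extracts OD_BITRES bits of log2(rng/32768) by repeated squaring: each squaring doubles the logarithm, and the top bit of the result becomes one fractional bit. A worked check of that behavior (illustrative, not part of the original file):

#include <assert.h>
static void check_tell_frac(void) {
  /* rng == 32768 (an exact power of two) leaves no fractional residue... */
  assert(od_ec_tell_frac(10, 32768) == (10u << 3));
  /* ...while rng == 46341 ~= sqrt(2) * 2^15 means half a bit of headroom,
     so the result is 4 eighth-bits below the whole-bit count. */
  assert(od_ec_tell_frac(10, 46341) == (10u << 3) - 4);
}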
@@ -1,105 +0,0 @@
/*
 * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#if !defined(_entcode_H)
#define _entcode_H (1)
#include <limits.h>
#include <stddef.h>
#include "av1/common/odintrin.h"

/*Set this flag to 1 to enable a "reduced overhead" version of the entropy
   coder.
  This uses a partition function that more accurately follows the input
   probability estimates at the expense of some additional CPU cost (though
   still an order of magnitude less than a full division).

  In classic arithmetic coding, the partition function maps a value x in the
   range [0, ft] to a value y in [0, r] with 0 < ft <= r via
    y = x*r/ft.
  Any deviation from this value increases coding inefficiency.

  To avoid divisions, we require ft <= r < 2*ft (enforcing it by shifting up
   ft if necessary), and replace that function with
    y = x + OD_MINI(x, r - ft).
  This counts values of x smaller than r - ft double compared to values larger
   than r - ft, which over-estimates the probability of symbols at the start of
   the alphabet, and under-estimates the probability of symbols at the end of
   the alphabet.
  The overall coding inefficiency assuming accurate probability models and
   independent symbols is in the 1% range, which is similar to that of CABAC.

  To reduce overhead even further, we split this into two cases:
  1) r - ft > ft - (r - ft).
     That is, we have more values of x that are double-counted than
      single-counted.
     In this case, we still double-count the first 2*r - 3*ft values of x, but
      after that we alternate between single-counting and double-counting for
      the rest.
  2) r - ft < ft - (r - ft).
     That is, we have more values of x that are single-counted than
      double-counted.
     In this case, we alternate between single-counting and double-counting for
      the first 2*(r - ft) values of x, and single-count the rest.
  For two equiprobable symbols in different places in the alphabet, this
   reduces the maximum ratio of over-estimation to under-estimation from 2:1
   for the previous partition function to either 4:3 or 3:2 (for each of the
   two cases above, respectively), assuming symbol probabilities significantly
   greater than 1/32768.
  That reduces the worst-case per-symbol overhead from 1 bit to 0.58 bits.

  The resulting function is
    e = OD_MAXI(2*r - 3*ft, 0);
    y = x + OD_MINI(x, e) + OD_MINI(OD_MAXI(x - e, 0) >> 1, r - ft).
  Here, e is a value that is greater than 0 in case 1, and 0 in case 2.
  This function is about 3 times as expensive to evaluate as the high-overhead
   version, but still an order of magnitude cheaper than a division, since it
   is composed only of very simple operations.
  Because we want to fit in 16-bit registers and must use unsigned values to do
   so, we use saturating subtraction to enforce the maximums with 0.

  Enabling this reduces the measured overhead in ectest from 0.805% to 0.621%
   (vs. 0.022% for the division-based partition function with r much greater
   than ft).
  It improves performance on ntt-short-1 by about 0.3%.*/
#define OD_EC_REDUCED_OVERHEAD (1)

/*OPT: od_ec_window must be at least 32 bits, but if you have fast arithmetic
   on a larger type, you can speed up the decoder by using it here.*/
typedef uint32_t od_ec_window;

#define OD_EC_WINDOW_SIZE ((int)sizeof(od_ec_window) * CHAR_BIT)

/*Unsigned subtraction with unsigned saturation.
  This implementation of the macro is intentionally chosen to increase the
   number of common subexpressions in the reduced-overhead partition function.
  This matters for C code, but it would not for hardware with a saturating
   subtraction instruction.*/
#define OD_SUBSATU(a, b) ((a)-OD_MINI(a, b))

/*The number of bits to use for the range-coded part of unsigned integers.*/
#define OD_EC_UINT_BITS (4)

/*The resolution of fractional-precision bit usage measurements, i.e.,
   3 => 1/8th bits.*/
#define OD_BITRES (3)

extern const uint16_t OD_UNIFORM_CDFS_Q15[135];

/*Returns a Q15 CDF for a uniform probability distribution of the given size.
  n: The size of the distribution.
     This must be at least 2, and no more than 16.*/
#define OD_UNIFORM_CDF_Q15(n) (OD_UNIFORM_CDFS_Q15 + ((n) * ((n)-1) >> 1) - 1)

/*See entcode.c for further documentation.*/

OD_WARN_UNUSED_RESULT uint32_t od_ec_tell_frac(uint32_t nbits_total,
                                               uint32_t rng);

#endif
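OD_UNIFORM_CDF_Q15 indexes the packed table from entcode.c: the CDFs for sizes 2 through n-1 occupy the preceding 2 + 3 + ... + (n-1) entries, which is why the offset is n*(n-1)/2 - 1. A worked check (illustrative, not part of the original file):

#include <assert.h>
static void check_uniform_cdf(void) {
  /* For n == 4 the macro points at {8192, 16384, 24576, 32768}. */
  const uint16_t *cdf = OD_UNIFORM_CDF_Q15(4);
  assert(cdf[0] == 8192 && cdf[3] == 32768);
}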
494  aom_dsp/entdec.c
@@ -1,494 +0,0 @@
/*
 * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifdef HAVE_CONFIG_H
#include "./config.h"
#endif

#include "aom_dsp/entdec.h"

/*A range decoder.
  This is an entropy decoder based upon \cite{Mar79}, which is itself a
   rediscovery of the FIFO arithmetic code introduced by \cite{Pas76}.
  It is very similar to arithmetic encoding, except that encoding is done with
   digits in any base, instead of with bits, and so it is faster when using
   larger bases (i.e.: a byte).
  The author claims an average waste of $\frac{1}{2}\log_b(2b)$ bits, where $b$
   is the base, longer than the theoretical optimum, but to my knowledge there
   is no published justification for this claim.
  This only seems true when using near-infinite precision arithmetic so that
   the process is carried out with no rounding errors.

  An excellent description of implementation details is available at
   http://www.arturocampos.com/ac_range.html
  A recent work \cite{MNW98} which proposes several changes to arithmetic
   encoding for efficiency actually re-discovers many of the principles
   behind range encoding, and presents a good theoretical analysis of them.

  End of stream is handled by writing out the smallest number of bits that
   ensures that the stream will be correctly decoded regardless of the value of
   any subsequent bits.
  od_ec_dec_tell() can be used to determine how many bits were needed to decode
   all the symbols thus far; other data can be packed in the remaining bits of
   the input buffer.
  @PHDTHESIS{Pas76,
    author="Richard Clark Pasco",
    title="Source coding algorithms for fast data compression",
    school="Dept. of Electrical Engineering, Stanford University",
    address="Stanford, CA",
    month=May,
    year=1976,
    URL="http://www.richpasco.org/scaffdc.pdf"
  }
  @INPROCEEDINGS{Mar79,
    author="Martin, G.N.N.",
    title="Range encoding: an algorithm for removing redundancy from a
     digitised message",
    booktitle="Video & Data Recording Conference",
    year=1979,
    address="Southampton",
    month=Jul,
    URL="http://www.compressconsult.com/rangecoder/rngcod.pdf.gz"
  }
  @ARTICLE{MNW98,
    author="Alistair Moffat and Radford Neal and Ian H. Witten",
    title="Arithmetic Coding Revisited",
    journal="{ACM} Transactions on Information Systems",
    year=1998,
    volume=16,
    number=3,
    pages="256--294",
    month=Jul,
    URL="http://researchcommons.waikato.ac.nz/bitstream/handle/10289/78/content.pdf"
  }*/

/*This is meant to be a large, positive constant that can still be efficiently
   loaded as an immediate (on platforms like ARM, for example).
  Even relatively modest values like 100 would work fine.*/
#define OD_EC_LOTS_OF_BITS (0x4000)

static void od_ec_dec_refill(od_ec_dec *dec) {
  int s;
  od_ec_window dif;
  int16_t cnt;
  const unsigned char *bptr;
  const unsigned char *end;
  dif = dec->dif;
  cnt = dec->cnt;
  bptr = dec->bptr;
  end = dec->end;
  s = OD_EC_WINDOW_SIZE - 9 - (cnt + 15);
  for (; s >= 0 && bptr < end; s -= 8, bptr++) {
    OD_ASSERT(s <= OD_EC_WINDOW_SIZE - 8);
    dif |= (od_ec_window)bptr[0] << s;
    cnt += 8;
  }
  if (bptr >= end) {
    dec->tell_offs += OD_EC_LOTS_OF_BITS - cnt;
    cnt = OD_EC_LOTS_OF_BITS;
  }
  dec->dif = dif;
  dec->cnt = cnt;
  dec->bptr = bptr;
}

/*Takes updated dif and range values, renormalizes them so that
   32768 <= rng < 65536 (reading more bytes from the stream into dif if
   necessary), and stores them back in the decoder context.
  dif: The new value of dif.
  rng: The new value of the range.
  ret: The value to return.
  Return: ret.
          This allows the compiler to jump to this function via a tail-call.*/
static int od_ec_dec_normalize(od_ec_dec *dec, od_ec_window dif, unsigned rng,
                               int ret) {
  int d;
  OD_ASSERT(rng <= 65535U);
  d = 16 - OD_ILOG_NZ(rng);
  dec->cnt -= d;
  dec->dif = dif << d;
  dec->rng = rng << d;
  if (dec->cnt < 0) od_ec_dec_refill(dec);
  return ret;
}

/*Initializes the decoder.
  buf: The input buffer to use.*/
void od_ec_dec_init(od_ec_dec *dec, const unsigned char *buf,
                    uint32_t storage) {
  dec->buf = buf;
  dec->eptr = buf + storage;
  dec->end_window = 0;
  dec->nend_bits = 0;
  dec->tell_offs = 10 - (OD_EC_WINDOW_SIZE - 8);
  dec->end = buf + storage;
  dec->bptr = buf;
  dec->dif = 0;
  dec->rng = 0x8000;
  dec->cnt = -15;
  dec->error = 0;
  od_ec_dec_refill(dec);
}

/*Decode a bit that has an fz/ft probability of being a zero.
  fz: The probability that the bit is zero, scaled by ft.
  ft: The total probability.
      This must be at least 16384 and no more than 32768.
  Return: The value decoded (0 or 1).*/
int od_ec_decode_bool(od_ec_dec *dec, unsigned fz, unsigned ft) {
  od_ec_window dif;
  od_ec_window vw;
  unsigned r;
  int s;
  unsigned v;
  int ret;
  OD_ASSERT(0 < fz);
  OD_ASSERT(fz < ft);
  OD_ASSERT(16384 <= ft);
  OD_ASSERT(ft <= 32768U);
  dif = dec->dif;
  r = dec->rng;
  OD_ASSERT(dif >> (OD_EC_WINDOW_SIZE - 16) < r);
  OD_ASSERT(ft <= r);
  s = r - ft >= ft;
  ft <<= s;
  fz <<= s;
  OD_ASSERT(r - ft < ft);
#if OD_EC_REDUCED_OVERHEAD
  {
    unsigned d;
    unsigned e;
    d = r - ft;
    e = OD_SUBSATU(2 * d, ft);
    v = fz + OD_MINI(fz, e) + OD_MINI(OD_SUBSATU(fz, e) >> 1, d);
  }
#else
  v = fz + OD_MINI(fz, r - ft);
#endif
  vw = (od_ec_window)v << (OD_EC_WINDOW_SIZE - 16);
  ret = dif >= vw;
  if (ret) dif -= vw;
  r = ret ? r - v : v;
  return od_ec_dec_normalize(dec, dif, r, ret);
}

/*Decode a bit that has an fz probability of being a zero in Q15.
  This is a simpler, lower overhead version of od_ec_decode_bool() for use when
   ft == 32768.
  To be decoded properly by this function, symbols cannot have been encoded by
   od_ec_encode(), but must have been encoded with one of the equivalent _q15()
   or _dyadic() functions instead.
  fz: The probability that the bit is zero, scaled by 32768.
  Return: The value decoded (0 or 1).*/
int od_ec_decode_bool_q15(od_ec_dec *dec, unsigned fz) {
  od_ec_window dif;
  od_ec_window vw;
  unsigned r;
  unsigned r_new;
  unsigned v;
  int ret;
  OD_ASSERT(0 < fz);
  OD_ASSERT(fz < 32768U);
  dif = dec->dif;
  r = dec->rng;
  OD_ASSERT(dif >> (OD_EC_WINDOW_SIZE - 16) < r);
  OD_ASSERT(32768U <= r);
  v = fz * (uint32_t)r >> 15;
  vw = (od_ec_window)v << (OD_EC_WINDOW_SIZE - 16);
  ret = 0;
  r_new = v;
  if (dif >= vw) {
    r_new = r - v;
    dif -= vw;
    ret = 1;
  }
  return od_ec_dec_normalize(dec, dif, r_new, ret);
}

/*Decodes a symbol given a cumulative distribution function (CDF) table.
  cdf: The CDF, such that symbol s falls in the range
        [s > 0 ? cdf[s - 1] : 0, cdf[s]).
       The values must be monotonically non-decreasing, and cdf[nsyms - 1]
        must be at least 16384, and no more than 32768.
  nsyms: The number of symbols in the alphabet.
         This should be at most 16.
  Return: The decoded symbol s.*/
int od_ec_decode_cdf(od_ec_dec *dec, const uint16_t *cdf, int nsyms) {
  od_ec_window dif;
  unsigned r;
  unsigned c;
  unsigned d;
#if OD_EC_REDUCED_OVERHEAD
  unsigned e;
#endif
  int s;
  unsigned u;
  unsigned v;
  unsigned q;
  unsigned fl;
  unsigned fh;
  unsigned ft;
  int ret;
  dif = dec->dif;
  r = dec->rng;
  OD_ASSERT(dif >> (OD_EC_WINDOW_SIZE - 16) < r);
  OD_ASSERT(nsyms > 0);
  ft = cdf[nsyms - 1];
  OD_ASSERT(16384 <= ft);
  OD_ASSERT(ft <= 32768U);
  OD_ASSERT(ft <= r);
  s = r - ft >= ft;
  ft <<= s;
  d = r - ft;
  OD_ASSERT(d < ft);
  c = (unsigned)(dif >> (OD_EC_WINDOW_SIZE - 16));
  q = OD_MAXI((int)(c >> 1), (int)(c - d));
#if OD_EC_REDUCED_OVERHEAD
  e = OD_SUBSATU(2 * d, ft);
  /*The correctness of this inverse partition function is not obvious, but it
     was checked exhaustively for all possible values of r, ft, and c.
    TODO: It should be possible to optimize this better than the compiler,
     given that we do not care about the accuracy of negative results (as we
     will not use them).
    It would also be nice to get rid of the 32-bit dividend, as it requires a
     32x32->64 bit multiply to invert.*/
  q = OD_MAXI((int)q, (int)((2 * (int32_t)c + 1 - (int32_t)e) / 3));
#endif
  q >>= s;
  OD_ASSERT(q < ft >> s);
  fl = 0;
  ret = 0;
  for (fh = cdf[ret]; fh <= q; fh = cdf[++ret]) fl = fh;
  OD_ASSERT(fh <= ft >> s);
  fl <<= s;
  fh <<= s;
#if OD_EC_REDUCED_OVERHEAD
  u = fl + OD_MINI(fl, e) + OD_MINI(OD_SUBSATU(fl, e) >> 1, d);
  v = fh + OD_MINI(fh, e) + OD_MINI(OD_SUBSATU(fh, e) >> 1, d);
#else
  u = fl + OD_MINI(fl, d);
  v = fh + OD_MINI(fh, d);
#endif
  r = v - u;
  dif -= (od_ec_window)u << (OD_EC_WINDOW_SIZE - 16);
  return od_ec_dec_normalize(dec, dif, r, ret);
}

/*Decodes a symbol given a cumulative distribution function (CDF) table.
  cdf: The CDF, such that symbol s falls in the range
        [s > 0 ? cdf[s - 1] : 0, cdf[s]).
       The values must be monotonically non-decreasing, and cdf[nsyms - 1]
        must be at least 2, and no more than 32768.
  nsyms: The number of symbols in the alphabet.
         This should be at most 16.
  Return: The decoded symbol s.*/
int od_ec_decode_cdf_unscaled(od_ec_dec *dec, const uint16_t *cdf, int nsyms) {
  od_ec_window dif;
  unsigned r;
  unsigned c;
  unsigned d;
#if OD_EC_REDUCED_OVERHEAD
  unsigned e;
#endif
  int s;
  unsigned u;
  unsigned v;
  unsigned q;
  unsigned fl;
  unsigned fh;
  unsigned ft;
  int ret;
  dif = dec->dif;
  r = dec->rng;
  OD_ASSERT(dif >> (OD_EC_WINDOW_SIZE - 16) < r);
  OD_ASSERT(nsyms > 0);
  ft = cdf[nsyms - 1];
  OD_ASSERT(2 <= ft);
  OD_ASSERT(ft <= 32768U);
  s = 15 - OD_ILOG_NZ(ft - 1);
  ft <<= s;
  OD_ASSERT(ft <= r);
  if (r - ft >= ft) {
    ft <<= 1;
    s++;
  }
  d = r - ft;
  OD_ASSERT(d < ft);
  c = (unsigned)(dif >> (OD_EC_WINDOW_SIZE - 16));
  q = OD_MAXI((int)(c >> 1), (int)(c - d));
#if OD_EC_REDUCED_OVERHEAD
  e = OD_SUBSATU(2 * d, ft);
  /*TODO: See TODO above.*/
  q = OD_MAXI((int)q, (int)((2 * (int32_t)c + 1 - (int32_t)e) / 3));
#endif
  q >>= s;
  OD_ASSERT(q < ft >> s);
  fl = 0;
  ret = 0;
  for (fh = cdf[ret]; fh <= q; fh = cdf[++ret]) fl = fh;
  OD_ASSERT(fh <= ft >> s);
  fl <<= s;
  fh <<= s;
#if OD_EC_REDUCED_OVERHEAD
  u = fl + OD_MINI(fl, e) + OD_MINI(OD_SUBSATU(fl, e) >> 1, d);
  v = fh + OD_MINI(fh, e) + OD_MINI(OD_SUBSATU(fh, e) >> 1, d);
#else
  u = fl + OD_MINI(fl, d);
  v = fh + OD_MINI(fh, d);
#endif
  r = v - u;
  dif -= (od_ec_window)u << (OD_EC_WINDOW_SIZE - 16);
  return od_ec_dec_normalize(dec, dif, r, ret);
}

/*Decodes a symbol given a cumulative distribution function (CDF) table that
   sums to a power of two.
  This is a simpler, lower overhead version of od_ec_decode_cdf() for use when
   cdf[nsyms - 1] is a power of two.
  To be decoded properly by this function, symbols cannot have been encoded by
   od_ec_encode(), but must have been encoded with one of the equivalent _q15()
   functions instead.
  cdf: The CDF, such that symbol s falls in the range
        [s > 0 ? cdf[s - 1] : 0, cdf[s]).
       The values must be monotonically non-decreasing, and cdf[nsyms - 1]
        must be exactly 1 << ftb.
  nsyms: The number of symbols in the alphabet.
         This should be at most 16.
  ftb: The number of bits of precision in the cumulative distribution.
       This must be no more than 15.
  Return: The decoded symbol s.*/
int od_ec_decode_cdf_unscaled_dyadic(od_ec_dec *dec, const uint16_t *cdf,
                                     int nsyms, unsigned ftb) {
  od_ec_window dif;
  unsigned r;
  unsigned c;
  unsigned u;
  unsigned v;
  int ret;
  (void)nsyms;
  dif = dec->dif;
  r = dec->rng;
  OD_ASSERT(dif >> (OD_EC_WINDOW_SIZE - 16) < r);
  OD_ASSERT(ftb <= 15);
  OD_ASSERT(cdf[nsyms - 1] == 1U << ftb);
  OD_ASSERT(32768U <= r);
  c = (unsigned)(dif >> (OD_EC_WINDOW_SIZE - 16));
  v = 0;
  ret = -1;
  do {
    u = v;
    v = cdf[++ret] * (uint32_t)r >> ftb;
  } while (v <= c);
  OD_ASSERT(v <= r);
  r = v - u;
  dif -= (od_ec_window)u << (OD_EC_WINDOW_SIZE - 16);
  return od_ec_dec_normalize(dec, dif, r, ret);
}

/*Decodes a symbol given a cumulative distribution function (CDF) table in Q15.
  This is a simpler, lower overhead version of od_ec_decode_cdf() for use when
   cdf[nsyms - 1] == 32768.
  To be decoded properly by this function, symbols cannot have been encoded by
   od_ec_encode(), but must have been encoded with one of the equivalent _q15()
   or _dyadic() functions instead.
  cdf: The CDF, such that symbol s falls in the range
        [s > 0 ? cdf[s - 1] : 0, cdf[s]).
       The values must be monotonically non-decreasing, and cdf[nsyms - 1]
        must be 32768.
  nsyms: The number of symbols in the alphabet.
         This should be at most 16.
  Return: The decoded symbol s.*/
int od_ec_decode_cdf_q15(od_ec_dec *dec, const uint16_t *cdf, int nsyms) {
  return od_ec_decode_cdf_unscaled_dyadic(dec, cdf, nsyms, 15);
}

/*Extracts a raw unsigned integer with a non-power-of-2 range from the stream.
  The integer must have been encoded with od_ec_enc_uint().
  ft: The number of integers that can be decoded (one more than the max).
      This must be at least 2, and no more than 2**29.
  Return: The decoded bits.*/
uint32_t od_ec_dec_uint(od_ec_dec *dec, uint32_t ft) {
  OD_ASSERT(ft >= 2);
  OD_ASSERT(ft <= (uint32_t)1 << (25 + OD_EC_UINT_BITS));
  if (ft > 1U << OD_EC_UINT_BITS) {
    uint32_t t;
    int ft1;
    int ftb;
    ft--;
    ftb = OD_ILOG_NZ(ft) - OD_EC_UINT_BITS;
    ft1 = (int)(ft >> ftb) + 1;
    t = od_ec_decode_cdf_q15(dec, OD_UNIFORM_CDF_Q15(ft1), ft1);
    t = t << ftb | od_ec_dec_bits(dec, ftb, "");
    if (t <= ft) return t;
    dec->error = 1;
    return ft;
  }
  return od_ec_decode_cdf_q15(dec, OD_UNIFORM_CDF_Q15(ft), (int)ft);
}

/*Extracts a sequence of raw bits from the stream.
  The bits must have been encoded with od_ec_enc_bits().
  ftb: The number of bits to extract.
       This must be between 0 and 25, inclusive.
  Return: The decoded bits.*/
uint32_t od_ec_dec_bits_(od_ec_dec *dec, unsigned ftb) {
  od_ec_window window;
  int available;
  uint32_t ret;
  OD_ASSERT(ftb <= 25);
  window = dec->end_window;
  available = dec->nend_bits;
  if ((unsigned)available < ftb) {
    const unsigned char *buf;
    const unsigned char *eptr;
    buf = dec->buf;
    eptr = dec->eptr;
    OD_ASSERT(available <= OD_EC_WINDOW_SIZE - 8);
    do {
      if (eptr <= buf) {
        dec->tell_offs += OD_EC_LOTS_OF_BITS - available;
        available = OD_EC_LOTS_OF_BITS;
        break;
      }
      window |= (od_ec_window)*--eptr << available;
      available += 8;
    } while (available <= OD_EC_WINDOW_SIZE - 8);
    dec->eptr = eptr;
  }
  ret = (uint32_t)window & (((uint32_t)1 << ftb) - 1);
  window >>= ftb;
  available -= ftb;
  dec->end_window = window;
  dec->nend_bits = available;
  return ret;
}

/*Returns the number of bits "used" by the decoded symbols so far.
  This same number can be computed in either the encoder or the decoder, and is
   suitable for making coding decisions.
  Return: The number of bits.
          This will always be slightly larger than the exact value (e.g., all
           rounding error is in the positive direction).*/
int od_ec_dec_tell(const od_ec_dec *dec) {
  return ((dec->end - dec->eptr) + (dec->bptr - dec->buf)) * 8 - dec->cnt -
         dec->nend_bits + dec->tell_offs;
}

/*Returns the number of bits "used" by the decoded symbols so far.
  This same number can be computed in either the encoder or the decoder, and is
   suitable for making coding decisions.
  Return: The number of bits scaled by 2**OD_BITRES.
          This will always be slightly larger than the exact value (e.g., all
           rounding error is in the positive direction).*/
uint32_t od_ec_dec_tell_frac(const od_ec_dec *dec) {
  return od_ec_tell_frac(od_ec_dec_tell(dec), dec->rng);
}
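A hypothetical usage sketch of the decoder API above, assuming the buffer was produced by the matching entenc.c encoder (the probability and range values here are arbitrary illustrations):

void entdec_usage_sketch(const unsigned char *buf, uint32_t size) {
  od_ec_dec dec;
  od_ec_dec_init(&dec, buf, size);
  int bit = od_ec_decode_bool_q15(&dec, 16384); /* P(zero) = 1/2 in Q15 */
  uint32_t v = od_ec_dec_uint(&dec, 10);        /* uniform value in [0, 10) */
  (void)bit;
  (void)v;
}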
101  aom_dsp/entdec.h
@@ -1,101 +0,0 @@
/*
 * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#if !defined(_entdec_H)
#define _entdec_H (1)
#include <limits.h>
#include "aom_dsp/entcode.h"

#ifdef __cplusplus
extern "C" {
#endif

typedef struct od_ec_dec od_ec_dec;

#if OD_ACCOUNTING
#define OD_ACC_STR , char *acc_str
#define od_ec_dec_bits(dec, ftb, str) od_ec_dec_bits_(dec, ftb, str)
#else
#define OD_ACC_STR
#define od_ec_dec_bits(dec, ftb, str) od_ec_dec_bits_(dec, ftb)
#endif

/*The entropy decoder context.*/
struct od_ec_dec {
  /*The start of the current input buffer.*/
  const unsigned char *buf;
  /*The read pointer for the raw bits.*/
  const unsigned char *eptr;
  /*Bits that will be read from/written at the end.*/
  od_ec_window end_window;
  /*Number of valid bits in end_window.*/
  int nend_bits;
  /*An offset used to keep track of tell after reaching the end of the stream.
    This is constant throughout most of the decoding process, but becomes
     important once we hit the end of the buffer and stop incrementing pointers
     (and instead pretend cnt/nend_bits have lots of bits).*/
  int32_t tell_offs;
  /*The end of the current input buffer.*/
  const unsigned char *end;
  /*The read pointer for the entropy-coded bits.*/
  const unsigned char *bptr;
  /*The difference between the coded value and the low end of the current
     range.*/
  od_ec_window dif;
  /*The number of values in the current range.*/
  uint16_t rng;
  /*The number of bits of data in the current value.*/
  int16_t cnt;
  /*Nonzero if an error occurred.*/
  int error;
};

/*See entdec.c for further documentation.*/

void od_ec_dec_init(od_ec_dec *dec, const unsigned char *buf, uint32_t storage)
    OD_ARG_NONNULL(1) OD_ARG_NONNULL(2);

OD_WARN_UNUSED_RESULT int od_ec_decode_bool(od_ec_dec *dec, unsigned fz,
                                            unsigned ft) OD_ARG_NONNULL(1);
OD_WARN_UNUSED_RESULT int od_ec_decode_bool_q15(od_ec_dec *dec, unsigned fz)
    OD_ARG_NONNULL(1);
OD_WARN_UNUSED_RESULT int od_ec_decode_cdf(od_ec_dec *dec, const uint16_t *cdf,
                                           int nsyms) OD_ARG_NONNULL(1)
    OD_ARG_NONNULL(2);
OD_WARN_UNUSED_RESULT int od_ec_decode_cdf_q15(od_ec_dec *dec,
                                               const uint16_t *cdf, int nsyms)
    OD_ARG_NONNULL(1) OD_ARG_NONNULL(2);
OD_WARN_UNUSED_RESULT int od_ec_decode_cdf_unscaled(od_ec_dec *dec,
                                                    const uint16_t *cdf,
                                                    int nsyms) OD_ARG_NONNULL(1)
    OD_ARG_NONNULL(2);
OD_WARN_UNUSED_RESULT int od_ec_decode_cdf_unscaled_dyadic(od_ec_dec *dec,
                                                           const uint16_t *cdf,
                                                           int nsyms,
                                                           unsigned _ftb)
    OD_ARG_NONNULL(1) OD_ARG_NONNULL(2);

OD_WARN_UNUSED_RESULT uint32_t od_ec_dec_uint(od_ec_dec *dec, uint32_t ft)
    OD_ARG_NONNULL(1);

OD_WARN_UNUSED_RESULT uint32_t od_ec_dec_bits_(od_ec_dec *dec, unsigned ftb)
    OD_ARG_NONNULL(1);

OD_WARN_UNUSED_RESULT int od_ec_dec_tell(const od_ec_dec *dec)
    OD_ARG_NONNULL(1);
OD_WARN_UNUSED_RESULT uint32_t od_ec_dec_tell_frac(const od_ec_dec *dec)
    OD_ARG_NONNULL(1);

#ifdef __cplusplus
}  // extern "C"
#endif

#endif
686  aom_dsp/entenc.c
@@ -1,686 +0,0 @@
/*
 * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifdef HAVE_CONFIG_H
#include "./config.h"
#endif

#include <stdlib.h>
#include <string.h>
#include "aom_dsp/entenc.h"

/*A range encoder.
  See entdec.c and the references for implementation details \cite{Mar79,MNW98}.

  @INPROCEEDINGS{Mar79,
   author="Martin, G.N.N.",
   title="Range encoding: an algorithm for removing redundancy from a digitised
    message",
   booktitle="Video \& Data Recording Conference",
   year=1979,
   address="Southampton",
   month=Jul,
   URL="http://www.compressconsult.com/rangecoder/rngcod.pdf.gz"
  }
  @ARTICLE{MNW98,
   author="Alistair Moffat and Radford Neal and Ian H. Witten",
   title="Arithmetic Coding Revisited",
   journal="{ACM} Transactions on Information Systems",
   year=1998,
   volume=16,
   number=3,
   pages="256--294",
   month=Jul,
   URL="http://researchcommons.waikato.ac.nz/bitstream/handle/10289/78/content.pdf"
  }*/

/*Takes updated low and range values, renormalizes them so that
  32768 <= rng < 65536 (flushing bytes from low to the pre-carry buffer if
  necessary), and stores them back in the encoder context.
  low: The new value of low.
  rng: The new value of the range.*/
static void od_ec_enc_normalize(od_ec_enc *enc, od_ec_window low,
                                unsigned rng) {
  int d;
  int c;
  int s;
  c = enc->cnt;
  OD_ASSERT(rng <= 65535U);
  d = 16 - OD_ILOG_NZ(rng);
  s = c + d;
  /*TODO: Right now we flush every time we have at least one byte available.
    Instead we should use an od_ec_window and flush right before we're about to
    shift bits off the end of the window.
    For a 32-bit window this is about the same amount of work, but for a 64-bit
    window it should be a fair win.*/
  if (s >= 0) {
    uint16_t *buf;
    uint32_t storage;
    uint32_t offs;
    unsigned m;
    buf = enc->precarry_buf;
    storage = enc->precarry_storage;
    offs = enc->offs;
    if (offs + 2 > storage) {
      storage = 2 * storage + 2;
      buf = (uint16_t *)realloc(buf, sizeof(*buf) * storage);
      if (buf == NULL) {
        enc->error = -1;
        enc->offs = 0;
        return;
      }
      enc->precarry_buf = buf;
      enc->precarry_storage = storage;
    }
    c += 16;
    m = (1 << c) - 1;
    if (s >= 8) {
      OD_ASSERT(offs < storage);
      buf[offs++] = (uint16_t)(low >> c);
      low &= m;
      c -= 8;
      m >>= 8;
    }
    OD_ASSERT(offs < storage);
    buf[offs++] = (uint16_t)(low >> c);
    s = c + d - 24;
    low &= m;
    enc->offs = offs;
  }
  enc->low = low << d;
  enc->rng = rng << d;
  enc->cnt = s;
}

/*Initializes the encoder.
  size: The initial size of the buffer, in bytes.*/
void od_ec_enc_init(od_ec_enc *enc, uint32_t size) {
  od_ec_enc_reset(enc);
  enc->buf = (unsigned char *)malloc(sizeof(*enc->buf) * size);
  enc->storage = size;
  if (size > 0 && enc->buf == NULL) {
    enc->storage = 0;
    enc->error = -1;
  }
  enc->precarry_buf = (uint16_t *)malloc(sizeof(*enc->precarry_buf) * size);
  enc->precarry_storage = size;
  if (size > 0 && enc->precarry_buf == NULL) {
    enc->precarry_storage = 0;
    enc->error = -1;
  }
}

/*Reinitializes the encoder.*/
void od_ec_enc_reset(od_ec_enc *enc) {
  enc->end_offs = 0;
  enc->end_window = 0;
  enc->nend_bits = 0;
  enc->offs = 0;
  enc->low = 0;
  enc->rng = 0x8000;
  /*This is initialized to -9 so that it crosses zero after we've accumulated
    one byte + one carry bit.*/
  enc->cnt = -9;
  enc->error = 0;
#if OD_MEASURE_EC_OVERHEAD
  enc->entropy = 0;
  enc->nb_symbols = 0;
#endif
}

/*Frees the buffers used by the encoder.*/
void od_ec_enc_clear(od_ec_enc *enc) {
  free(enc->precarry_buf);
  free(enc->buf);
}

/*Encodes a symbol given its scaled frequency information.
  The frequency information must be discernable by the decoder, assuming it
  has read only the previous symbols from the stream.
  You can change the frequency information, or even the entire source alphabet,
  so long as the decoder can tell from the context of the previously encoded
  information that it is supposed to do so as well.
  fl: The cumulative frequency of all symbols that come before the one to be
      encoded.
  fh: The cumulative frequency of all symbols up to and including the one to
      be encoded.
      Together with fl, this defines the range [fl, fh) in which the decoded
      value will fall.
  ft: The sum of the frequencies of all the symbols.
      This must be at least 16384, and no more than 32768.*/
static void od_ec_encode(od_ec_enc *enc, unsigned fl, unsigned fh,
                         unsigned ft) {
  od_ec_window l;
  unsigned r;
  int s;
  unsigned d;
  unsigned u;
  unsigned v;
  OD_ASSERT(fl < fh);
  OD_ASSERT(fh <= ft);
  OD_ASSERT(16384 <= ft);
  OD_ASSERT(ft <= 32768U);
  l = enc->low;
  r = enc->rng;
  OD_ASSERT(ft <= r);
  s = r - ft >= ft;
  ft <<= s;
  fl <<= s;
  fh <<= s;
  d = r - ft;
  OD_ASSERT(d < ft);
#if OD_EC_REDUCED_OVERHEAD
  {
    unsigned e;
    e = OD_SUBSATU(2 * d, ft);
    u = fl + OD_MINI(fl, e) + OD_MINI(OD_SUBSATU(fl, e) >> 1, d);
    v = fh + OD_MINI(fh, e) + OD_MINI(OD_SUBSATU(fh, e) >> 1, d);
  }
#else
  u = fl + OD_MINI(fl, d);
  v = fh + OD_MINI(fh, d);
#endif
  r = v - u;
  l += u;
  od_ec_enc_normalize(enc, l, r);
#if OD_MEASURE_EC_OVERHEAD
  enc->entropy -= OD_LOG2((double)(fh - fl) / ft);
  enc->nb_symbols++;
#endif
}

/*Encodes a symbol given its frequency in Q15.
  This is like od_ec_encode() when ft == 32768, but is simpler and has lower
  overhead.
  Symbols encoded with this function cannot be properly decoded with
  od_ec_decode(), and must be decoded with one of the equivalent _q15()
  functions instead.
  fl: The cumulative frequency of all symbols that come before the one to be
      encoded.
  fh: The cumulative frequency of all symbols up to and including the one to
      be encoded.*/
static void od_ec_encode_q15(od_ec_enc *enc, unsigned fl, unsigned fh) {
  od_ec_window l;
  unsigned r;
  unsigned u;
  unsigned v;
  OD_ASSERT(fl < fh);
  OD_ASSERT(fh <= 32768U);
  l = enc->low;
  r = enc->rng;
  OD_ASSERT(32768U <= r);
  u = fl * (uint32_t)r >> 15;
  v = fh * (uint32_t)r >> 15;
  r = v - u;
  l += u;
  od_ec_enc_normalize(enc, l, r);
#if OD_MEASURE_EC_OVERHEAD
  enc->entropy -= OD_LOG2((double)(fh - fl) / 32768.);
  enc->nb_symbols++;
#endif
}
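
To make the Q15 partition in od_ec_encode_q15() concrete, here is the arithmetic for one hypothetical coder state (fl, fh and r are illustrative values, not taken from any real stream):

/*Hypothetical worked example of the Q15 partition above.
  With r = 40000, fl = 8192 (0.25 in Q15) and fh = 24576 (0.75 in Q15):
    u = 8192 * 40000 >> 15  = 10000
    v = 24576 * 40000 >> 15 = 30000
  so the new range is v - u = 20000 and low advances by u = 10000, i.e. the
  symbol receives almost exactly (fh - fl)/32768 = 1/2 of the current range.*/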

/*Encodes a symbol given its frequency information with an arbitrary scale.
  This operates just like od_ec_encode(), but does not require that ft be at
  least 16384.
  fl: The cumulative frequency of all symbols that come before the one to be
      encoded.
  fh: The cumulative frequency of all symbols up to and including the one to
      be encoded.
  ft: The sum of the frequencies of all the symbols.
      This must be at least 2 and no more than 32768.*/
static void od_ec_encode_unscaled(od_ec_enc *enc, unsigned fl, unsigned fh,
                                  unsigned ft) {
  int s;
  OD_ASSERT(fl < fh);
  OD_ASSERT(fh <= ft);
  OD_ASSERT(2 <= ft);
  OD_ASSERT(ft <= 32768U);
  s = 15 - OD_ILOG_NZ(ft - 1);
  od_ec_encode(enc, fl << s, fh << s, ft << s);
}

/*Encode a bit that has an fz/ft probability of being a zero.
  val: The value to encode (0 or 1).
  fz: The probability that val is zero, scaled by ft.
  ft: The total probability.
      This must be at least 16384 and no more than 32768.*/
void od_ec_encode_bool(od_ec_enc *enc, int val, unsigned fz, unsigned ft) {
  od_ec_window l;
  unsigned r;
  int s;
  unsigned v;
  OD_ASSERT(0 < fz);
  OD_ASSERT(fz < ft);
  OD_ASSERT(16384 <= ft);
  OD_ASSERT(ft <= 32768U);
  l = enc->low;
  r = enc->rng;
  OD_ASSERT(ft <= r);
  s = r - ft >= ft;
  ft <<= s;
  fz <<= s;
  OD_ASSERT(r - ft < ft);
#if OD_EC_REDUCED_OVERHEAD
  {
    unsigned d;
    unsigned e;
    d = r - ft;
    e = OD_SUBSATU(2 * d, ft);
    v = fz + OD_MINI(fz, e) + OD_MINI(OD_SUBSATU(fz, e) >> 1, d);
  }
#else
  v = fz + OD_MINI(fz, r - ft);
#endif
  if (val) l += v;
  r = val ? r - v : v;
  od_ec_enc_normalize(enc, l, r);
#if OD_MEASURE_EC_OVERHEAD
  enc->entropy -= OD_LOG2((double)(val ? ft - fz : fz) / ft);
  enc->nb_symbols++;
#endif
}

/*Encode a bit that has an fz probability of being a zero in Q15.
  This is a simpler, lower overhead version of od_ec_encode_bool() for use when
  ft == 32768.
  Symbols encoded with this function cannot be properly decoded with
  od_ec_decode(), and must be decoded with one of the equivalent _q15()
  functions instead.
  val: The value to encode (0 or 1).
  fz: The probability that val is zero, scaled by 32768.*/
void od_ec_encode_bool_q15(od_ec_enc *enc, int val, unsigned fz) {
  od_ec_window l;
  unsigned r;
  unsigned v;
  OD_ASSERT(0 < fz);
  OD_ASSERT(fz < 32768U);
  l = enc->low;
  r = enc->rng;
  OD_ASSERT(32768U <= r);
  v = fz * (uint32_t)r >> 15;
  if (val) l += v;
  r = val ? r - v : v;
  od_ec_enc_normalize(enc, l, r);
#if OD_MEASURE_EC_OVERHEAD
  enc->entropy -= OD_LOG2((double)(val ? 32768 - fz : fz) / 32768.);
  enc->nb_symbols++;
#endif
}

/*Encodes a symbol given a cumulative distribution function (CDF) table.
  s: The index of the symbol to encode.
  cdf: The CDF, such that symbol s falls in the range
        [s > 0 ? cdf[s - 1] : 0, cdf[s]).
       The values must be monotonically non-decreasing, and the last value
       must be at least 16384, and no more than 32768.
  nsyms: The number of symbols in the alphabet.
         This should be at most 16.*/
void od_ec_encode_cdf(od_ec_enc *enc, int s, const uint16_t *cdf, int nsyms) {
  OD_ASSERT(s >= 0);
  OD_ASSERT(s < nsyms);
  od_ec_encode(enc, s > 0 ? cdf[s - 1] : 0, cdf[s], cdf[nsyms - 1]);
}

/*Encodes a symbol given a cumulative distribution function (CDF) table in Q15.
  This is a simpler, lower overhead version of od_ec_encode_cdf() for use when
  cdf[nsyms - 1] == 32768.
  Symbols encoded with this function cannot be properly decoded with
  od_ec_decode(), and must be decoded with one of the equivalent _q15()
  functions instead.
  s: The index of the symbol to encode.
  cdf: The CDF, such that symbol s falls in the range
        [s > 0 ? cdf[s - 1] : 0, cdf[s]).
       The values must be monotonically non-decreasing, and the last value
       must be exactly 32768.
  nsyms: The number of symbols in the alphabet.
         This should be at most 16.*/
void od_ec_encode_cdf_q15(od_ec_enc *enc, int s, const uint16_t *cdf,
                          int nsyms) {
  (void)nsyms;
  OD_ASSERT(s >= 0);
  OD_ASSERT(s < nsyms);
  OD_ASSERT(cdf[nsyms - 1] == 32768U);
  od_ec_encode_q15(enc, s > 0 ? cdf[s - 1] : 0, cdf[s]);
}
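
A minimal encode-side usage sketch for the Q15 CDF interface; the 4-symbol CDF and the symbol array are hypothetical, and the snippet assumes the surrounding file's context:

/*Usage sketch: encode a sequence of symbols against a fixed Q15 CDF.
  The CDF values are hypothetical; the last entry must be exactly 32768.*/
static void encode_example(od_ec_enc *enc, const int *syms, int n) {
  static const uint16_t cdf[4] = { 16384, 24576, 28672, 32768 };
  int i;
  for (i = 0; i < n; i++) od_ec_encode_cdf_q15(enc, syms[i], cdf, 4);
}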

/*Encodes a symbol given a cumulative distribution function (CDF) table.
  s: The index of the symbol to encode.
  cdf: The CDF, such that symbol s falls in the range
        [s > 0 ? cdf[s - 1] : 0, cdf[s]).
       The values must be monotonically non-decreasing, and the last value
       must be at least 2, and no more than 32768.
  nsyms: The number of symbols in the alphabet.
         This should be at most 16.*/
void od_ec_encode_cdf_unscaled(od_ec_enc *enc, int s, const uint16_t *cdf,
                               int nsyms) {
  OD_ASSERT(s >= 0);
  OD_ASSERT(s < nsyms);
  od_ec_encode_unscaled(enc, s > 0 ? cdf[s - 1] : 0, cdf[s], cdf[nsyms - 1]);
}

/*Equivalent to od_ec_encode_cdf_q15() with the cdf scaled by
   (1 << (15 - ftb)).
  s: The index of the symbol to encode.
  cdf: The CDF, such that symbol s falls in the range
        [s > 0 ? cdf[s - 1] : 0, cdf[s]).
       The values must be monotonically non-decreasing, and the last value
       must be exactly 1 << ftb.
  nsyms: The number of symbols in the alphabet.
         This should be at most 16.
  ftb: The number of bits of precision in the cumulative distribution.
       This must be no more than 15.*/
void od_ec_encode_cdf_unscaled_dyadic(od_ec_enc *enc, int s,
                                      const uint16_t *cdf, int nsyms,
                                      unsigned ftb) {
  (void)nsyms;
  OD_ASSERT(s >= 0);
  OD_ASSERT(s < nsyms);
  OD_ASSERT(ftb <= 15);
  OD_ASSERT(cdf[nsyms - 1] == 1U << ftb);
  od_ec_encode_q15(enc, s > 0 ? cdf[s - 1] << (15 - ftb) : 0,
                   cdf[s] << (15 - ftb));
}

/*Encodes a raw unsigned integer in the stream.
  fl: The integer to encode.
  ft: The number of integers that can be encoded (one more than the max).
      This must be at least 2, and no more than 2**29.*/
void od_ec_enc_uint(od_ec_enc *enc, uint32_t fl, uint32_t ft) {
  OD_ASSERT(ft >= 2);
  OD_ASSERT(fl < ft);
  OD_ASSERT(ft <= (uint32_t)1 << (25 + OD_EC_UINT_BITS));
  if (ft > 1U << OD_EC_UINT_BITS) {
    int ft1;
    int ftb;
    ft--;
    ftb = OD_ILOG_NZ(ft) - OD_EC_UINT_BITS;
    ft1 = (int)(ft >> ftb) + 1;
    od_ec_encode_cdf_q15(enc, (int)(fl >> ftb), OD_UNIFORM_CDF_Q15(ft1), ft1);
    od_ec_enc_bits(enc, fl & (((uint32_t)1 << ftb) - 1), ftb);
  } else {
    od_ec_encode_cdf_q15(enc, (int)fl, OD_UNIFORM_CDF_Q15(ft), (int)ft);
  }
}
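
The split above factors a large integer into an entropy-coded high part and raw low bits. A hypothetical worked example, assuming OD_EC_UINT_BITS == 4 (entcode.h, which defines it, is not shown in this diff):

/*Hypothetical worked example of the split in od_ec_enc_uint(), assuming
  OD_EC_UINT_BITS == 4.  Encoding fl = 700 with ft = 1000:
    ft - 1 = 999, OD_ILOG_NZ(999) = 10, so ftb = 10 - 4 = 6
    ft1 = (999 >> 6) + 1 = 16   (high part uses a uniform 16-symbol CDF)
    high symbol = 700 >> 6 = 10 (entropy coded via od_ec_encode_cdf_q15())
    raw bits    = 700 & 63 = 60 (written with od_ec_enc_bits(), 6 bits)*/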

/*Encodes a sequence of raw bits in the stream.
  fl: The bits to encode.
  ftb: The number of bits to encode.
       This must be between 0 and 25, inclusive.*/
void od_ec_enc_bits(od_ec_enc *enc, uint32_t fl, unsigned ftb) {
  od_ec_window end_window;
  int nend_bits;
  OD_ASSERT(ftb <= 25);
  OD_ASSERT(fl < (uint32_t)1 << ftb);
#if OD_MEASURE_EC_OVERHEAD
  enc->entropy += ftb;
#endif
  end_window = enc->end_window;
  nend_bits = enc->nend_bits;
  if (nend_bits + ftb > OD_EC_WINDOW_SIZE) {
    unsigned char *buf;
    uint32_t storage;
    uint32_t end_offs;
    buf = enc->buf;
    storage = enc->storage;
    end_offs = enc->end_offs;
    if (end_offs + (OD_EC_WINDOW_SIZE >> 3) >= storage) {
      unsigned char *new_buf;
      uint32_t new_storage;
      new_storage = 2 * storage + (OD_EC_WINDOW_SIZE >> 3);
      new_buf = (unsigned char *)malloc(sizeof(*new_buf) * new_storage);
      if (new_buf == NULL) {
        enc->error = -1;
        enc->end_offs = 0;
        return;
      }
      OD_COPY(new_buf + new_storage - end_offs, buf + storage - end_offs,
              end_offs);
      storage = new_storage;
      free(buf);
      enc->buf = buf = new_buf;
      enc->storage = storage;
    }
    do {
      OD_ASSERT(end_offs < storage);
      buf[storage - ++end_offs] = (unsigned char)end_window;
      end_window >>= 8;
      nend_bits -= 8;
    } while (nend_bits >= 8);
    enc->end_offs = end_offs;
  }
  OD_ASSERT(nend_bits + ftb <= OD_EC_WINDOW_SIZE);
  end_window |= (od_ec_window)fl << nend_bits;
  nend_bits += ftb;
  enc->end_window = end_window;
  enc->nend_bits = nend_bits;
}

/*Overwrites a few bits at the very start of an existing stream, after they
  have already been encoded.
  This makes it possible to have a few flags up front, where it is easy for
  decoders to access them without parsing the whole stream, even if their
  values are not determined until late in the encoding process, without having
  to buffer all the intermediate symbols in the encoder.
  In order for this to work, at least nbits bits must have already been encoded
  using probabilities that are an exact power of two.
  The encoder can verify the number of encoded bits is sufficient, but cannot
  check this latter condition.
  val: The bits to encode (in the least nbits significant bits).
       They will be decoded in order from most-significant to least.
  nbits: The number of bits to overwrite.
         This must be no more than 8.*/
void od_ec_enc_patch_initial_bits(od_ec_enc *enc, unsigned val, int nbits) {
  int shift;
  unsigned mask;
  OD_ASSERT(nbits >= 0);
  OD_ASSERT(nbits <= 8);
  OD_ASSERT(val < 1U << nbits);
  shift = 8 - nbits;
  mask = ((1U << nbits) - 1) << shift;
  if (enc->offs > 0) {
    /*The first byte has been finalized.*/
    enc->precarry_buf[0] =
        (uint16_t)((enc->precarry_buf[0] & ~mask) | val << shift);
  } else if (9 + enc->cnt + (enc->rng == 0x8000) > nbits) {
    /*The first byte has yet to be output.*/
    enc->low = (enc->low & ~((od_ec_window)mask << (16 + enc->cnt))) |
               (od_ec_window)val << (16 + enc->cnt + shift);
  } else {
    /*The encoder hasn't even encoded nbits of data yet.*/
    enc->error = -1;
  }
}
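
A usage sketch of the patching mechanism described above. The placeholder bits are encoded with fz = 16384 (exactly 1/2 in Q15, a power of two, as the precondition requires); the final flag values are hypothetical:

/*Usage sketch: reserve two flag bits up front, then patch them once their
  values are known late in the encode.*/
static void patch_example(od_ec_enc *enc) {
  od_ec_encode_bool_q15(enc, 0, 16384); /*placeholder bit 0*/
  od_ec_encode_bool_q15(enc, 0, 16384); /*placeholder bit 1*/
  /*... encode the rest of the stream ...*/
  od_ec_enc_patch_initial_bits(enc, 0x2, 2); /*final flags: binary 10*/
}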

#if OD_MEASURE_EC_OVERHEAD
#include <stdio.h>
#endif

/*Indicates that there are no more symbols to encode.
  All remaining output bytes are flushed to the output buffer.
  od_ec_enc_reset() should be called before using the encoder again.
  nbytes: Returns the size of the encoded data in the returned buffer.
  Return: A pointer to the start of the final buffer, or NULL if there was an
           encoding error.*/
unsigned char *od_ec_enc_done(od_ec_enc *enc, uint32_t *nbytes) {
  unsigned char *out;
  uint32_t storage;
  uint16_t *buf;
  uint32_t offs;
  uint32_t end_offs;
  int nend_bits;
  od_ec_window m;
  od_ec_window e;
  od_ec_window l;
  unsigned r;
  int c;
  int s;
  if (enc->error) return NULL;
#if OD_MEASURE_EC_OVERHEAD
  {
    uint32_t tell;
    /* Don't count the 1 bit we lose to raw bits as overhead. */
    tell = od_ec_enc_tell(enc) - 1;
    fprintf(stderr, "overhead: %f%%\n",
            100 * (tell - enc->entropy) / enc->entropy);
    fprintf(stderr, "efficiency: %f bits/symbol\n",
            (double)tell / enc->nb_symbols);
  }
#endif
  /*We output the minimum number of bits that ensures that the symbols encoded
    thus far will be decoded correctly regardless of the bits that follow.*/
  l = enc->low;
  r = enc->rng;
  c = enc->cnt;
  s = 9;
  m = 0x7FFF;
  e = (l + m) & ~m;
  while ((e | m) >= l + r) {
    s++;
    m >>= 1;
    e = (l + m) & ~m;
  }
  s += c;
  offs = enc->offs;
  buf = enc->precarry_buf;
  if (s > 0) {
    unsigned n;
    storage = enc->precarry_storage;
    if (offs + ((s + 7) >> 3) > storage) {
      storage = storage * 2 + ((s + 7) >> 3);
      buf = (uint16_t *)realloc(buf, sizeof(*buf) * storage);
      if (buf == NULL) {
        enc->error = -1;
        return NULL;
      }
      enc->precarry_buf = buf;
      enc->precarry_storage = storage;
    }
    n = (1 << (c + 16)) - 1;
    do {
      OD_ASSERT(offs < storage);
      buf[offs++] = (uint16_t)(e >> (c + 16));
      e &= n;
      s -= 8;
      c -= 8;
      n >>= 8;
    } while (s > 0);
  }
  /*Make sure there's enough room for the entropy-coded bits and the raw
     bits.*/
  out = enc->buf;
  storage = enc->storage;
  end_offs = enc->end_offs;
  e = enc->end_window;
  nend_bits = enc->nend_bits;
  s = -s;
  c = OD_MAXI((nend_bits - s + 7) >> 3, 0);
  if (offs + end_offs + c > storage) {
    storage = offs + end_offs + c;
    out = (unsigned char *)realloc(out, sizeof(*out) * storage);
    if (out == NULL) {
      enc->error = -1;
      return NULL;
    }
    OD_MOVE(out + storage - end_offs, out + enc->storage - end_offs, end_offs);
    enc->buf = out;
    enc->storage = storage;
  }
  /*If we have buffered raw bits, flush them as well.*/
  while (nend_bits > s) {
    OD_ASSERT(end_offs < storage);
    out[storage - ++end_offs] = (unsigned char)e;
    e >>= 8;
    nend_bits -= 8;
  }
  *nbytes = offs + end_offs;
  /*Perform carry propagation.*/
  OD_ASSERT(offs + end_offs <= storage);
  out = out + storage - (offs + end_offs);
  c = 0;
  end_offs = offs;
  while (offs-- > 0) {
    c = buf[offs] + c;
    out[offs] = (unsigned char)c;
    c >>= 8;
  }
  /*Add any remaining raw bits to the last byte.
    There is guaranteed to be enough room, because nend_bits <= s.*/
  OD_ASSERT(nend_bits <= 0 || end_offs > 0);
  if (nend_bits > 0) out[end_offs - 1] |= (unsigned char)e;
  /*Note: Unless there's an allocation error, if you keep encoding into the
    current buffer and call this function again later, everything will work
    just fine (you won't get a new packet out, but you will get a single
    buffer with the new data appended to the old).
    However, this function is O(N) where N is the amount of data coded so far,
    so calling it more than once for a given packet is a bad idea.*/
  return out;
}

/*Returns the number of bits "used" by the encoded symbols so far.
  This same number can be computed in either the encoder or the decoder, and is
  suitable for making coding decisions.
  Warning: The value returned by this function can decrease compared to an
   earlier call, even after encoding more data, if there is an encoding error
   (i.e., a failure to allocate enough space for the output buffer).
  Return: The number of bits.
          This will always be slightly larger than the exact value (e.g., all
           rounding error is in the positive direction).*/
int od_ec_enc_tell(const od_ec_enc *enc) {
  /*The 10 here counteracts the offset of -9 baked into cnt, and adds 1 extra
    bit, which we reserve for terminating the stream.*/
  return (enc->offs + enc->end_offs) * 8 + enc->cnt + enc->nend_bits + 10;
}
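
Plugging hypothetical state values into the formula above shows how the estimate is assembled:

/*Hypothetical worked example of od_ec_enc_tell(): with offs = 10 finalized
  entropy-coded bytes, end_offs = 2 raw-bit bytes, cnt = -3 and nend_bits = 5,
  the estimate is
    (10 + 2) * 8 + (-3) + 5 + 10 = 108 bits,
  an upper bound on the cost of the symbols encoded so far.*/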

/*Returns the number of bits "used" by the encoded symbols so far.
  This same number can be computed in either the encoder or the decoder, and is
  suitable for making coding decisions.
  Warning: The value returned by this function can decrease compared to an
   earlier call, even after encoding more data, if there is an encoding error
   (i.e., a failure to allocate enough space for the output buffer).
  Return: The number of bits scaled by 2**OD_BITRES.
          This will always be slightly larger than the exact value (e.g., all
           rounding error is in the positive direction).*/
uint32_t od_ec_enc_tell_frac(const od_ec_enc *enc) {
  return od_ec_tell_frac(od_ec_enc_tell(enc), enc->rng);
}

/*Saves an entropy coder checkpoint to dst.
  This allows an encoder to reverse a series of entropy coder
   decisions if it decides that the information would have been
   better coded some other way.*/
void od_ec_enc_checkpoint(od_ec_enc *dst, const od_ec_enc *src) {
  OD_COPY(dst, src, 1);
}

/*Restores an entropy coder checkpoint saved by od_ec_enc_checkpoint.
  This can only be used to restore from checkpoints earlier in the target
   state's history: you can not switch backwards and forwards or otherwise
   switch to a state which isn't a causal ancestor of the current state.
  Restore is also incompatible with patching the initial bits, as the
   changes will remain in the restored version.*/
void od_ec_enc_rollback(od_ec_enc *dst, const od_ec_enc *src) {
  unsigned char *buf;
  uint32_t storage;
  uint16_t *precarry_buf;
  uint32_t precarry_storage;
  OD_ASSERT(dst->storage >= src->storage);
  OD_ASSERT(dst->precarry_storage >= src->precarry_storage);
  buf = dst->buf;
  storage = dst->storage;
  precarry_buf = dst->precarry_buf;
  precarry_storage = dst->precarry_storage;
  OD_COPY(dst, src, 1);
  dst->buf = buf;
  dst->storage = storage;
  dst->precarry_buf = precarry_buf;
  dst->precarry_storage = precarry_storage;
}
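
A usage sketch of the checkpoint/rollback pair above, measuring the cost of a trial decision and then undoing it; encode_trial() is a hypothetical callback:

/*Usage sketch: trial-encode a decision, measure its cost, and undo it.*/
static uint32_t trial_cost(od_ec_enc *enc) {
  od_ec_enc ckpt;
  uint32_t before;
  uint32_t after;
  od_ec_enc_checkpoint(&ckpt, enc);
  before = od_ec_enc_tell_frac(enc);
  encode_trial(enc); /*hypothetical: encodes the candidate symbols*/
  after = od_ec_enc_tell_frac(enc);
  od_ec_enc_rollback(enc, &ckpt);
  return after - before; /*cost in bits, scaled by 2**OD_BITRES*/
}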

aom_dsp/entenc.h
@@ -1,103 +0,0 @@
/*
 * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#if !defined(_entenc_H)
#define _entenc_H (1)
#include <stddef.h>
#include "aom_dsp/entcode.h"

#ifdef __cplusplus
extern "C" {
#endif

typedef struct od_ec_enc od_ec_enc;

#define OD_MEASURE_EC_OVERHEAD (0)

/*The entropy encoder context.*/
struct od_ec_enc {
  /*Buffered output.
    This contains only the raw bits until the final call to od_ec_enc_done(),
    where all the arithmetic-coded data gets prepended to it.*/
  unsigned char *buf;
  /*The size of the buffer.*/
  uint32_t storage;
  /*The offset at which the last byte containing raw bits was written.*/
  uint32_t end_offs;
  /*Bits that will be read from/written at the end.*/
  od_ec_window end_window;
  /*Number of valid bits in end_window.*/
  int nend_bits;
  /*A buffer for output bytes with their associated carry flags.*/
  uint16_t *precarry_buf;
  /*The size of the pre-carry buffer.*/
  uint32_t precarry_storage;
  /*The offset at which the next entropy-coded byte will be written.*/
  uint32_t offs;
  /*The low end of the current range.*/
  od_ec_window low;
  /*The number of values in the current range.*/
  uint16_t rng;
  /*The number of bits of data in the current value.*/
  int16_t cnt;
  /*Nonzero if an error occurred.*/
  int error;
#if OD_MEASURE_EC_OVERHEAD
  double entropy;
  int nb_symbols;
#endif
};

/*See entenc.c for further documentation.*/

void od_ec_enc_init(od_ec_enc *enc, uint32_t size) OD_ARG_NONNULL(1);
void od_ec_enc_reset(od_ec_enc *enc) OD_ARG_NONNULL(1);
void od_ec_enc_clear(od_ec_enc *enc) OD_ARG_NONNULL(1);

void od_ec_encode_bool(od_ec_enc *enc, int val, unsigned fz, unsigned _ft)
    OD_ARG_NONNULL(1);
void od_ec_encode_bool_q15(od_ec_enc *enc, int val, unsigned fz_q15)
    OD_ARG_NONNULL(1);
void od_ec_encode_cdf(od_ec_enc *enc, int s, const uint16_t *cdf, int nsyms)
    OD_ARG_NONNULL(1) OD_ARG_NONNULL(3);
void od_ec_encode_cdf_q15(od_ec_enc *enc, int s, const uint16_t *cdf, int nsyms)
    OD_ARG_NONNULL(1) OD_ARG_NONNULL(3);
void od_ec_encode_cdf_unscaled(od_ec_enc *enc, int s, const uint16_t *cdf,
                               int nsyms) OD_ARG_NONNULL(1) OD_ARG_NONNULL(3);
void od_ec_encode_cdf_unscaled_dyadic(od_ec_enc *enc, int s,
                                      const uint16_t *cdf, int nsyms,
                                      unsigned ftb) OD_ARG_NONNULL(1)
    OD_ARG_NONNULL(3);

void od_ec_enc_uint(od_ec_enc *enc, uint32_t fl, uint32_t ft) OD_ARG_NONNULL(1);

void od_ec_enc_bits(od_ec_enc *enc, uint32_t fl, unsigned ftb)
    OD_ARG_NONNULL(1);

void od_ec_enc_patch_initial_bits(od_ec_enc *enc, unsigned val, int nbits)
    OD_ARG_NONNULL(1);
OD_WARN_UNUSED_RESULT unsigned char *od_ec_enc_done(od_ec_enc *enc,
                                                    uint32_t *nbytes)
    OD_ARG_NONNULL(1) OD_ARG_NONNULL(2);

OD_WARN_UNUSED_RESULT int od_ec_enc_tell(const od_ec_enc *enc)
    OD_ARG_NONNULL(1);
OD_WARN_UNUSED_RESULT uint32_t od_ec_enc_tell_frac(const od_ec_enc *enc)
    OD_ARG_NONNULL(1);

void od_ec_enc_checkpoint(od_ec_enc *dst, const od_ec_enc *src);
void od_ec_enc_rollback(od_ec_enc *dst, const od_ec_enc *src);

#ifdef __cplusplus
}  // extern "C"
#endif

#endif
aom_dsp/fwd_txfm.h
@@ -1,26 +0,0 @@
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef AOM_DSP_FWD_TXFM_H_
#define AOM_DSP_FWD_TXFM_H_

#include "aom_dsp/txfm_common.h"

static INLINE tran_high_t fdct_round_shift(tran_high_t input) {
  tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
  // TODO(debargha, peter.derivaz): Find new bounds for this assert
  // and make the bounds consts.
  // assert(INT16_MIN <= rv && rv <= INT16_MAX);
  return rv;
}

void aom_fdct32(const tran_high_t *input, tran_high_t *output, int round);
#endif  // AOM_DSP_FWD_TXFM_H_
aom_dsp/mips/common_dspr2.c
@@ -1,31 +0,0 @@
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include "aom_dsp/mips/common_dspr2.h"

#if HAVE_DSPR2
uint8_t aom_ff_cropTbl_a[256 + 2 * CROP_WIDTH];
uint8_t *aom_ff_cropTbl;

void aom_dsputil_static_init(void) {
  int i;

  for (i = 0; i < 256; i++) aom_ff_cropTbl_a[i + CROP_WIDTH] = i;

  for (i = 0; i < CROP_WIDTH; i++) {
    aom_ff_cropTbl_a[i] = 0;
    aom_ff_cropTbl_a[i + CROP_WIDTH + 256] = 255;
  }

  aom_ff_cropTbl = &aom_ff_cropTbl_a[CROP_WIDTH];
}

#endif
aom_dsp/prob.c
@@ -1,226 +0,0 @@
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include "./aom_config.h"

#if CONFIG_EC_MULTISYMBOL
#include <string.h>
#endif

#include "aom_dsp/prob.h"

#if CONFIG_DAALA_EC
#include "aom_dsp/entcode.h"
#endif

const uint8_t aom_norm[256] = {
  0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};

static unsigned int tree_merge_probs_impl(unsigned int i,
                                          const aom_tree_index *tree,
                                          const aom_prob *pre_probs,
                                          const unsigned int *counts,
                                          aom_prob *probs) {
  const int l = tree[i];
  const unsigned int left_count =
      (l <= 0) ? counts[-l]
               : tree_merge_probs_impl(l, tree, pre_probs, counts, probs);
  const int r = tree[i + 1];
  const unsigned int right_count =
      (r <= 0) ? counts[-r]
               : tree_merge_probs_impl(r, tree, pre_probs, counts, probs);
  const unsigned int ct[2] = { left_count, right_count };
  probs[i >> 1] = mode_mv_merge_probs(pre_probs[i >> 1], ct);
  return left_count + right_count;
}

void aom_tree_merge_probs(const aom_tree_index *tree, const aom_prob *pre_probs,
                          const unsigned int *counts, aom_prob *probs) {
  tree_merge_probs_impl(0, tree, pre_probs, counts, probs);
}

#if CONFIG_EC_MULTISYMBOL
typedef struct tree_node tree_node;

struct tree_node {
  aom_tree_index index;
  uint8_t probs[16];
  uint8_t prob;
  int path;
  int len;
  int l;
  int r;
  aom_cdf_prob pdf;
};

/* Compute the probability of this node in Q23 */
static uint32_t tree_node_prob(tree_node n, int i) {
  uint32_t prob;
  /* 1.0 in Q23 */
  prob = 16777216;
  for (; i < n.len; i++) {
    prob = prob * n.probs[i] >> 8;
  }
  return prob;
}

static int tree_node_cmp(tree_node a, tree_node b) {
  int i;
  uint32_t pa;
  uint32_t pb;
  for (i = 0; i < AOMMIN(a.len, b.len) && a.probs[i] == b.probs[i]; i++) {
  }
  pa = tree_node_prob(a, i);
  pb = tree_node_prob(b, i);
  return pa > pb ? 1 : pa < pb ? -1 : 0;
}

/* Given a Q15 probability for symbol subtree rooted at tree[n], this function
   computes the probability of each symbol (defined as a node that has no
   children). */
static aom_cdf_prob tree_node_compute_probs(tree_node *tree, int n,
                                            aom_cdf_prob pdf) {
  if (tree[n].l == 0) {
    /* This prevents probability computations in Q15 that underflow from
       producing a symbol that has zero probability. */
    if (pdf == 0) pdf = 1;
    tree[n].pdf = pdf;
    return pdf;
  } else {
    /* We process the smaller probability first, */
    if (tree[n].prob < 128) {
      aom_cdf_prob lp;
      aom_cdf_prob rp;
      lp = (((uint32_t)pdf) * tree[n].prob + 128) >> 8;
      lp = tree_node_compute_probs(tree, tree[n].l, lp);
      rp = tree_node_compute_probs(tree, tree[n].r, lp > pdf ? 0 : pdf - lp);
      return lp + rp;
    } else {
      aom_cdf_prob rp;
      aom_cdf_prob lp;
      rp = (((uint32_t)pdf) * (256 - tree[n].prob) + 128) >> 8;
      rp = tree_node_compute_probs(tree, tree[n].r, rp);
      lp = tree_node_compute_probs(tree, tree[n].l, rp > pdf ? 0 : pdf - rp);
      return lp + rp;
    }
  }
}

static int tree_node_extract(tree_node *tree, int n, int symb,
                             aom_cdf_prob *pdf, aom_tree_index *index,
                             int *path, int *len) {
  if (tree[n].l == 0) {
    pdf[symb] = tree[n].pdf;
    if (index != NULL) index[symb] = tree[n].index;
    if (path != NULL) path[symb] = tree[n].path;
    if (len != NULL) len[symb] = tree[n].len;
    return symb + 1;
  } else {
    symb = tree_node_extract(tree, tree[n].l, symb, pdf, index, path, len);
    return tree_node_extract(tree, tree[n].r, symb, pdf, index, path, len);
  }
}

int tree_to_cdf(const aom_tree_index *tree, const aom_prob *probs,
                aom_tree_index root, aom_cdf_prob *cdf, aom_tree_index *index,
                int *path, int *len) {
  tree_node symb[2 * 16 - 1];
  int nodes;
  int next[16];
  int size;
  int nsymbs;
  int i;
  /* Create the root node with probability 1 in Q15. */
  symb[0].index = root;
  symb[0].path = 0;
  symb[0].len = 0;
  symb[0].l = symb[0].r = 0;
  nodes = 1;
  next[0] = 0;
  size = 1;
  nsymbs = 1;
  while (size > 0 && nsymbs < 16) {
    int m;
    tree_node n;
    aom_tree_index j;
    uint8_t prob;
    m = 0;
    /* Find the internal node with the largest probability. */
    for (i = 1; i < size; i++) {
      if (tree_node_cmp(symb[next[i]], symb[next[m]]) > 0) m = i;
    }
    i = next[m];
    memmove(&next[m], &next[m + 1], sizeof(*next) * (size - (m + 1)));
    size--;
    /* Split this symbol into two symbols */
    n = symb[i];
    j = n.index;
    prob = probs[j >> 1];
    /* Left */
    n.index = tree[j];
    n.path <<= 1;
    n.len++;
    n.probs[n.len - 1] = prob;
    symb[nodes] = n;
    if (n.index > 0) {
      next[size++] = nodes;
    }
    /* Right */
    n.index = tree[j + 1];
    n.path += 1;
    n.probs[n.len - 1] = 256 - prob;
    symb[nodes + 1] = n;
    if (n.index > 0) {
      next[size++] = nodes + 1;
    }
    symb[i].prob = prob;
    symb[i].l = nodes;
    symb[i].r = nodes + 1;
    nodes += 2;
    nsymbs++;
  }
  /* Compute the probabilities of each symbol in Q15 */
  tree_node_compute_probs(symb, 0, 32768);
  /* Extract the cdf, index, path and length */
  tree_node_extract(symb, 0, 0, cdf, index, path, len);
  /* Convert to CDF */
  for (i = 1; i < nsymbs; i++) {
    cdf[i] = cdf[i - 1] + cdf[i];
  }
  return nsymbs;
}
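
A usage sketch for tree_to_cdf() on a hypothetical 3-symbol tree: node 0 splits into leaf 0 and node 2, and node 2 splits into leaves 1 and 2. With probs = {128, 192}, the leaf probabilities come out as 16384, 12288 and 4096 in Q15, so the resulting CDF is {16384, 28672, 32768}:

/*Usage sketch: convert a hypothetical 3-symbol binary tree to a Q15 CDF.
  index/path/len outputs are optional and passed as NULL here.*/
static void tree_example(void) {
  static const aom_tree_index tree[TREE_SIZE(3)] = { 0, 2, -1, -2 };
  static const aom_prob probs[2] = { 128, 192 };
  aom_cdf_prob cdf[3];
  tree_to_cdf(tree, probs, 0, cdf, NULL, NULL, NULL);
  /*cdf[] now holds { 16384, 28672, 32768 }.*/
}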

/* This code assumes that tree contains as unique leaf nodes the integer values
   0 to len - 1 and produces the forward and inverse mapping tables in ind[]
   and inv[] respectively. */
void av1_indices_from_tree(int *ind, int *inv, int len,
                           const aom_tree_index *tree) {
  int i;
  int index;
  for (i = index = 0; i < TREE_SIZE(len); i++) {
    const aom_tree_index j = tree[i];
    if (j <= 0) {
      inv[index] = -j;
      ind[-j] = index++;
    }
  }
}
#endif
aom_dsp/prob.h
@@ -1,158 +0,0 @@
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef AOM_DSP_PROB_H_
#define AOM_DSP_PROB_H_

#include "./aom_config.h"
#include "./aom_dsp_common.h"

#include "aom_ports/bitops.h"
#include "aom_ports/mem.h"

#ifdef __cplusplus
extern "C" {
#endif

typedef uint8_t aom_prob;

// TODO(negge): Rename this aom_prob once we remove vpxbool.
typedef uint16_t aom_cdf_prob;

#define MAX_PROB 255

#define aom_prob_half ((aom_prob)128)

typedef int8_t aom_tree_index;

#define TREE_SIZE(leaf_count) (-2 + 2 * (leaf_count))

#define aom_complement(x) (255 - x)

#define MODE_MV_COUNT_SAT 20

/* We build coding trees compactly in arrays.
   Each node of the tree is a pair of aom_tree_indices.
   Array index often references a corresponding probability table.
   Index <= 0 means done encoding/decoding and value = -Index,
   Index > 0 means need another bit, specification at index.
   Nonnegative indices are always even; processing begins at node 0. */

typedef const aom_tree_index aom_tree[];

static INLINE aom_prob clip_prob(int p) {
  return (p > 255) ? 255 : (p < 1) ? 1 : p;
}

static INLINE aom_prob get_prob(int num, int den) {
  return (den == 0) ? 128u : clip_prob(((int64_t)num * 256 + (den >> 1)) / den);
}

static INLINE aom_prob get_binary_prob(int n0, int n1) {
  return get_prob(n0, n0 + n1);
}

/* This function assumes prob1 and prob2 are already within [1,255] range. */
static INLINE aom_prob weighted_prob(int prob1, int prob2, int factor) {
  return ROUND_POWER_OF_TWO(prob1 * (256 - factor) + prob2 * factor, 8);
}

static INLINE aom_prob merge_probs(aom_prob pre_prob, const unsigned int ct[2],
                                   unsigned int count_sat,
                                   unsigned int max_update_factor) {
  const aom_prob prob = get_binary_prob(ct[0], ct[1]);
  const unsigned int count = AOMMIN(ct[0] + ct[1], count_sat);
  const unsigned int factor = max_update_factor * count / count_sat;
  return weighted_prob(pre_prob, prob, factor);
}

// MODE_MV_MAX_UPDATE_FACTOR (128) * count / MODE_MV_COUNT_SAT;
static const int count_to_update_factor[MODE_MV_COUNT_SAT + 1] = {
  0,  6,  12, 19, 25, 32, 38, 44, 51, 57, 64,
  70, 76, 83, 89, 96, 102, 108, 115, 121, 128
};

static INLINE aom_prob mode_mv_merge_probs(aom_prob pre_prob,
                                           const unsigned int ct[2]) {
  const unsigned int den = ct[0] + ct[1];
  if (den == 0) {
    return pre_prob;
  } else {
    const unsigned int count = AOMMIN(den, MODE_MV_COUNT_SAT);
    const unsigned int factor = count_to_update_factor[count];
    const aom_prob prob =
        clip_prob(((int64_t)(ct[0]) * 256 + (den >> 1)) / den);
    return weighted_prob(pre_prob, prob, factor);
  }
}

void aom_tree_merge_probs(const aom_tree_index *tree, const aom_prob *pre_probs,
                          const unsigned int *counts, aom_prob *probs);

#if CONFIG_EC_MULTISYMBOL
int tree_to_cdf(const aom_tree_index *tree, const aom_prob *probs,
                aom_tree_index root, aom_cdf_prob *cdf, aom_tree_index *ind,
                int *pth, int *len);

static INLINE void av1_tree_to_cdf(const aom_tree_index *tree,
                                   const aom_prob *probs, aom_cdf_prob *cdf) {
  aom_tree_index index[16];
  int path[16];
  int dist[16];
  tree_to_cdf(tree, probs, 0, cdf, index, path, dist);
}

#define av1_tree_to_cdf_1D(tree, probs, cdf, u) \
  do {                                          \
    int i;                                      \
    for (i = 0; i < u; i++) {                   \
      av1_tree_to_cdf(tree, probs[i], cdf[i]);  \
    }                                           \
  } while (0)

#define av1_tree_to_cdf_2D(tree, probs, cdf, v, u)     \
  do {                                                 \
    int j;                                             \
    int i;                                             \
    for (j = 0; j < v; j++) {                          \
      for (i = 0; i < u; i++) {                        \
        av1_tree_to_cdf(tree, probs[j][i], cdf[j][i]); \
      }                                                \
    }                                                  \
  } while (0)

void av1_indices_from_tree(int *ind, int *inv, int len,
                           const aom_tree_index *tree);
#endif

DECLARE_ALIGNED(16, extern const uint8_t, aom_norm[256]);

#if CONFIG_EC_ADAPT
static INLINE void update_cdf(aom_cdf_prob *cdf, int val, int nsymbs) {
  const int rate = 4 + get_msb(nsymbs);
  int i, diff, tmp;
  for (i = 0; i < nsymbs; ++i) {
    tmp = (i + 1) << (12 - rate);
    cdf[i] -= ((cdf[i] - tmp) >> rate);
  }
  diff = 32768 - cdf[nsymbs - 1];

  for (i = val; i < nsymbs; ++i) {
    cdf[i] += diff;
  }
}
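
A hypothetical adaptation step for update_cdf(): with nsymbs = 4, rate = 4 + get_msb(4) = 6, so every entry first decays toward (i + 1) << 6 and the diff correction then pushes the entries from val upward back up, widening symbol val's interval and landing the last entry back on exactly 32768:

/*Usage sketch: one adaptation step starting from a uniform 4-symbol CDF.*/
static INLINE void adapt_example(void) {
  aom_cdf_prob cdf[4] = { 8192, 16384, 24576, 32768 };
  update_cdf(cdf, 2, 4);
  /*After the decay pass cdf[3] is 32260, so diff = 508 is added to
    cdf[2] and cdf[3], restoring cdf[3] to exactly 32768.*/
}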
#endif

#ifdef __cplusplus
}  // extern "C"
#endif

#endif  // AOM_DSP_PROB_H_
aom_dsp/psnr.h
@@ -1,67 +0,0 @@
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef AOM_DSP_PSNR_H_
#define AOM_DSP_PSNR_H_

#include "aom_scale/yv12config.h"

#define MAX_PSNR 100.0

#ifdef __cplusplus
extern "C" {
#endif

typedef struct {
  double psnr[4];       // total/y/u/v
  uint64_t sse[4];      // total/y/u/v
  uint32_t samples[4];  // total/y/u/v
} PSNR_STATS;

/*!\brief Converts SSE to PSNR
 *
 * Converts sum of squared errors (SSE) to peak signal-to-noise ratio (PSNR).
 *
 * \param[in] samples Number of samples
 * \param[in] peak Max sample value
 * \param[in] sse Sum of squared errors
 */
double aom_sse_to_psnr(double samples, double peak, double sse);
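
For reference, a sketch of the conversion this declares, assuming the usual PSNR definition (the diff does not show the implementation); the frame size and SSE below are hypothetical:

/*Assumed formula: PSNR = 10 * log10(samples * peak^2 / sse), capped at
  MAX_PSNR.  E.g., an 8-bit 1920x1080 luma plane (peak = 255,
  samples = 2073600) with sse = 8294400 (mean squared error of 4) gives
  10 * log10(255^2 / 4), roughly 42.11 dB.*/
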
int64_t aom_get_y_sse_part(const YV12_BUFFER_CONFIG *a,
                           const YV12_BUFFER_CONFIG *b, int hstart, int width,
                           int vstart, int height);
int64_t aom_get_y_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b);
int64_t aom_get_u_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b);
int64_t aom_get_v_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b);
#if CONFIG_AOM_HIGHBITDEPTH
int64_t aom_highbd_get_y_sse_part(const YV12_BUFFER_CONFIG *a,
                                  const YV12_BUFFER_CONFIG *b, int hstart,
                                  int width, int vstart, int height);
int64_t aom_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a,
                             const YV12_BUFFER_CONFIG *b);
int64_t aom_highbd_get_u_sse(const YV12_BUFFER_CONFIG *a,
                             const YV12_BUFFER_CONFIG *b);
int64_t aom_highbd_get_v_sse(const YV12_BUFFER_CONFIG *a,
                             const YV12_BUFFER_CONFIG *b);
void aom_calc_highbd_psnr(const YV12_BUFFER_CONFIG *a,
                          const YV12_BUFFER_CONFIG *b, PSNR_STATS *psnr,
                          unsigned int bit_depth, unsigned int in_bit_depth);
#endif
void aom_calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b,
                   PSNR_STATS *psnr);

double aom_psnrhvs(const YV12_BUFFER_CONFIG *source,
                   const YV12_BUFFER_CONFIG *dest, double *phvs_y,
                   double *phvs_u, double *phvs_v, uint32_t bd, uint32_t in_bd);
#ifdef __cplusplus
}  // extern "C"
#endif
#endif  // AOM_DSP_PSNR_H_
aom_dsp/quantize.c
@@ -1,686 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
|
||||
*
|
||||
* This source code is subject to the terms of the BSD 2 Clause License and
|
||||
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
|
||||
* was not distributed with this source code in the LICENSE file, you can
|
||||
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
|
||||
* Media Patent License 1.0 was not distributed with this source code in the
|
||||
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
|
||||
*/
|
||||
|
||||
#include "aom_dsp/quantize.h"
|
||||
#include "aom_mem/aom_mem.h"
|
||||
|
||||
#if CONFIG_AOM_QM
|
||||
void aom_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block,
|
||||
const int16_t *round_ptr, const int16_t quant,
|
||||
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
|
||||
const int16_t dequant_ptr, uint16_t *eob_ptr,
|
||||
const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr) {
|
||||
const int rc = 0;
|
||||
const int coeff = coeff_ptr[rc];
|
||||
const int coeff_sign = (coeff >> 31);
|
||||
const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
|
||||
int64_t tmp, eob = -1;
|
||||
int32_t tmp32;
|
||||
int dequant =
|
||||
(dequant_ptr * iqm_ptr[rc] + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
|
||||
|
||||
memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
|
||||
memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
|
||||
|
||||
if (!skip_block) {
|
||||
tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
|
||||
tmp32 = (int32_t)((tmp * qm_ptr[rc] * quant) >> (16 + AOM_QM_BITS));
|
||||
qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;
|
||||
dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant;
|
||||
if (tmp32) eob = 0;
|
||||
}
|
||||
*eob_ptr = eob + 1;
|
||||
}
|
||||
|
||||
#if CONFIG_AOM_HIGHBITDEPTH
|
||||
void aom_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs,
|
||||
int skip_block, const int16_t *round_ptr,
|
||||
const int16_t quant, tran_low_t *qcoeff_ptr,
|
||||
tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr,
|
||||
uint16_t *eob_ptr, const qm_val_t *qm_ptr,
|
||||
const qm_val_t *iqm_ptr) {
|
||||
int eob = -1;
|
||||
int dequant =
|
||||
(dequant_ptr * iqm_ptr[0] + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
|
||||
|
||||
memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
|
||||
memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
|
||||
|
||||
if (!skip_block) {
|
||||
const int coeff = coeff_ptr[0];
|
||||
const int coeff_sign = (coeff >> 31);
|
||||
const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
|
||||
const int64_t tmp = abs_coeff + round_ptr[0];
|
||||
const uint32_t abs_qcoeff =
|
||||
(uint32_t)((tmp * qm_ptr[0] * quant) >> (16 + AOM_QM_BITS));
|
||||
qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
|
||||
dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant;
|
||||
if (abs_qcoeff) eob = 0;
|
||||
}
|
||||
*eob_ptr = eob + 1;
|
||||
}
|
||||
#endif
|
||||
|
||||
void aom_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
|
||||
const int16_t *round_ptr, const int16_t quant,
|
||||
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
|
||||
const int16_t dequant_ptr, uint16_t *eob_ptr,
|
||||
const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr) {
|
||||
const int n_coeffs = 1024;
|
||||
const int rc = 0;
|
||||
const int coeff = coeff_ptr[rc];
|
||||
const int coeff_sign = (coeff >> 31);
|
||||
const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
|
||||
int64_t tmp, eob = -1;
|
||||
int32_t tmp32;
|
||||
int dequant;
|
||||
|
||||
memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
|
||||
memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
|
||||
|
||||
if (!skip_block) {
|
||||
tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1),
|
||||
INT16_MIN, INT16_MAX);
|
||||
tmp32 = (int32_t)((tmp * qm_ptr[rc] * quant) >> (15 + AOM_QM_BITS));
|
||||
qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;
|
||||
dequant =
|
||||
(dequant_ptr * iqm_ptr[rc] + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
|
||||
dqcoeff_ptr[rc] = (qcoeff_ptr[rc] * dequant) / 2;
|
||||
if (tmp32) eob = 0;
|
||||
}
|
||||
*eob_ptr = eob + 1;
|
||||
}
|
||||
|
||||
#if CONFIG_AOM_HIGHBITDEPTH
|
||||
void aom_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
|
||||
const int16_t *round_ptr, const int16_t quant,
|
||||
tran_low_t *qcoeff_ptr,
|
||||
tran_low_t *dqcoeff_ptr,
|
||||
const int16_t dequant_ptr, uint16_t *eob_ptr,
|
||||
const qm_val_t *qm_ptr,
|
||||
const qm_val_t *iqm_ptr) {
|
||||
const int n_coeffs = 1024;
|
||||
int eob = -1;
|
||||
int dequant;
|
||||
|
||||
memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
|
||||
memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
|
||||
|
||||
if (!skip_block) {
|
||||
const int coeff = coeff_ptr[0];
|
||||
const int coeff_sign = (coeff >> 31);
|
||||
const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
|
||||
const int64_t tmp = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[0], 1);
|
||||
const uint32_t abs_qcoeff =
|
||||
(uint32_t)((tmp * qm_ptr[0] * quant) >> (15 + AOM_QM_BITS));
|
||||
qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
|
||||
dequant =
|
||||
(dequant_ptr * iqm_ptr[0] + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
|
||||
dqcoeff_ptr[0] = (qcoeff_ptr[0] * dequant) / 2;
|
||||
if (abs_qcoeff) eob = 0;
|
||||
}
|
||||
*eob_ptr = eob + 1;
|
||||
}
|
||||
#endif
|
||||
|
||||
void aom_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
|
||||
int skip_block, const int16_t *zbin_ptr,
|
||||
const int16_t *round_ptr, const int16_t *quant_ptr,
|
||||
const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
|
||||
tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
|
||||
uint16_t *eob_ptr, const int16_t *scan,
|
||||
const int16_t *iscan, const qm_val_t *qm_ptr,
|
||||
const qm_val_t *iqm_ptr) {
|
||||
int i, non_zero_count = (int)n_coeffs, eob = -1;
|
||||
const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] };
|
||||
const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
|
||||
(void)iscan;
|
||||
|
||||
memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
|
||||
memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
|
||||
|
||||
if (!skip_block) {
|
||||
// Pre-scan pass
|
||||
for (i = (int)n_coeffs - 1; i >= 0; i--) {
|
||||
const int rc = scan[i];
|
||||
const qm_val_t wt = qm_ptr[rc];
|
||||
const int coeff = coeff_ptr[rc] * wt;
|
||||
|
||||
if (coeff < (zbins[rc != 0] << AOM_QM_BITS) &&
|
||||
coeff > (nzbins[rc != 0] << AOM_QM_BITS))
|
||||
non_zero_count--;
|
||||
else
|
||||
break;
|
||||
}
|
||||
|
||||
// Quantization pass: All coefficients with index >= zero_flag are
|
||||
// skippable. Note: zero_flag can be zero.
|
||||
for (i = 0; i < non_zero_count; i++) {
|
||||
const int rc = scan[i];
|
||||
const qm_val_t wt = qm_ptr[rc];
|
||||
const int coeff = coeff_ptr[rc];
|
||||
const int coeff_sign = (coeff >> 31);
|
||||
const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
|
||||
int dequant;
|
||||
|
||||
if (abs_coeff * wt >= (zbins[rc != 0] << AOM_QM_BITS)) {
|
||||
int32_t tmp32;
|
||||
int64_t tmp =
|
||||
clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
|
||||
tmp = tmp * wt;
|
||||
tmp32 = ((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
|
||||
quant_shift_ptr[rc != 0]) >>
|
||||
(16 + AOM_QM_BITS); // quantization
|
||||
dequant =
|
||||
(dequant_ptr[rc != 0] * iqm_ptr[rc] + (1 << (AOM_QM_BITS - 1))) >>
|
||||
AOM_QM_BITS;
|
||||
qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;
|
||||
dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant;
|
||||
|
||||
if (tmp32) eob = i;
|
||||
}
|
||||
}
|
||||
}
|
||||
*eob_ptr = eob + 1;
|
||||
}

#if CONFIG_AOM_HIGHBITDEPTH
void aom_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                             int skip_block, const int16_t *zbin_ptr,
                             const int16_t *round_ptr, const int16_t *quant_ptr,
                             const int16_t *quant_shift_ptr,
                             tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                             const int16_t *dequant_ptr, uint16_t *eob_ptr,
                             const int16_t *scan, const int16_t *iscan,
                             const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr) {
  int i, non_zero_count = (int)n_coeffs, eob = -1;
  const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] };
  const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
  int dequant;
  (void)iscan;

  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));

  if (!skip_block) {
    // Pre-scan pass
    for (i = (int)n_coeffs - 1; i >= 0; i--) {
      const int rc = scan[i];
      const qm_val_t wt = qm_ptr[rc];
      const int coeff = coeff_ptr[rc] * wt;

      if (coeff < (zbins[rc != 0] << AOM_QM_BITS) &&
          coeff > (nzbins[rc != 0] << AOM_QM_BITS))
        non_zero_count--;
      else
        break;
    }

    // Quantization pass: all coefficients with index >= non_zero_count are
    // skippable. Note: non_zero_count can be zero.
    for (i = 0; i < non_zero_count; i++) {
      const int rc = scan[i];
      const int coeff = coeff_ptr[rc];
      const qm_val_t wt = qm_ptr[rc];
      const int coeff_sign = (coeff >> 31);
      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;

      if (abs_coeff * wt >= (zbins[rc != 0] << AOM_QM_BITS)) {
        const int64_t tmp1 = abs_coeff + round_ptr[rc != 0];
        const int64_t tmpw = tmp1 * wt;
        const int64_t tmp2 = ((tmpw * quant_ptr[rc != 0]) >> 16) + tmpw;
        const uint32_t abs_qcoeff =
            (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> (16 + AOM_QM_BITS));
        qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
        dequant =
            (dequant_ptr[rc != 0] * iqm_ptr[rc] + (1 << (AOM_QM_BITS - 1))) >>
            AOM_QM_BITS;
        dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant;
        if (abs_qcoeff) eob = i;
      }
    }
  }
  *eob_ptr = eob + 1;
}
#endif

void aom_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                            int skip_block, const int16_t *zbin_ptr,
                            const int16_t *round_ptr, const int16_t *quant_ptr,
                            const int16_t *quant_shift_ptr,
                            tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                            const int16_t *dequant_ptr, uint16_t *eob_ptr,
                            const int16_t *scan, const int16_t *iscan,
                            const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr) {
  const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], 1),
                         ROUND_POWER_OF_TWO(zbin_ptr[1], 1) };
  const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };

  int idx = 0;
  int idx_arr[1024];
  int i, eob = -1;
  int dequant;
  (void)iscan;

  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));

  if (!skip_block) {
    // Pre-scan pass
    for (i = 0; i < n_coeffs; i++) {
      const int rc = scan[i];
      const qm_val_t wt = qm_ptr[rc];
      const int coeff = coeff_ptr[rc] * wt;

      // If the coefficient is out of the base ZBIN range, keep it for
      // quantization.
      if (coeff >= (zbins[rc != 0] << AOM_QM_BITS) ||
          coeff <= (nzbins[rc != 0] << AOM_QM_BITS))
        idx_arr[idx++] = i;
    }

    // Quantization pass: only process the coefficients selected in
    // pre-scan pass. Note: idx can be zero.
    for (i = 0; i < idx; i++) {
      const int rc = scan[idx_arr[i]];
      const int coeff = coeff_ptr[rc];
      const int coeff_sign = (coeff >> 31);
      const qm_val_t wt = qm_ptr[rc];
      int64_t tmp;
      int tmp32;
      int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
      abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
      tmp = clamp(abs_coeff, INT16_MIN, INT16_MAX);
      tmp = tmp * wt;
      tmp32 = ((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
               quant_shift_ptr[rc != 0]) >>
              (15 + AOM_QM_BITS);

      qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;
      dequant =
          (dequant_ptr[rc != 0] * iqm_ptr[rc] + (1 << (AOM_QM_BITS - 1))) >>
          AOM_QM_BITS;
      dqcoeff_ptr[rc] = (qcoeff_ptr[rc] * dequant) / 2;

      if (tmp32) eob = idx_arr[i];
    }
  }
  *eob_ptr = eob + 1;
}
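
// Editor's note (an interpretation, not from the original file): relative to
// the plain path, this 32x32 variant halves zbin and round up front via
// ROUND_POWER_OF_TWO(x, 1), shifts by (15 + AOM_QM_BITS) instead of
// (16 + AOM_QM_BITS) after quant_shift, and halves the reconstruction
// ("/ 2"), so the larger transform is quantized with one extra bit of
// effective precision.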

#if CONFIG_AOM_HIGHBITDEPTH
void aom_highbd_quantize_b_32x32_c(
    const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
    const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
    const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
    const qm_val_t *iqm_ptr) {
  const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], 1),
                         ROUND_POWER_OF_TWO(zbin_ptr[1], 1) };
  const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };

  int idx = 0;
  int idx_arr[1024];
  int i, eob = -1;
  int dequant;
  (void)iscan;

  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));

  if (!skip_block) {
    // Pre-scan pass
    for (i = 0; i < n_coeffs; i++) {
      const int rc = scan[i];
      const qm_val_t wt = qm_ptr[rc];
      const int coeff = coeff_ptr[rc] * wt;

      // If the coefficient is out of the base ZBIN range, keep it for
      // quantization.
      if (coeff >= (zbins[rc != 0] << AOM_QM_BITS) ||
          coeff <= (nzbins[rc != 0] << AOM_QM_BITS))
        idx_arr[idx++] = i;
    }

    // Quantization pass: only process the coefficients selected in
    // pre-scan pass. Note: idx can be zero.
    for (i = 0; i < idx; i++) {
      const int rc = scan[idx_arr[i]];
      const int coeff = coeff_ptr[rc];
      const int coeff_sign = (coeff >> 31);
      const qm_val_t wt = qm_ptr[rc];
      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
      const int64_t tmp1 =
          abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
      const int64_t tmpw = tmp1 * wt;
      const int64_t tmp2 = ((tmpw * quant_ptr[rc != 0]) >> 16) + tmpw;
      const uint32_t abs_qcoeff =
          (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> (15 + AOM_QM_BITS));
      qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
      dequant =
          (dequant_ptr[rc != 0] * iqm_ptr[rc] + (1 << (AOM_QM_BITS - 1))) >>
          AOM_QM_BITS;
      dqcoeff_ptr[rc] = (qcoeff_ptr[rc] * dequant) / 2;
      if (abs_qcoeff) eob = idx_arr[i];
    }
  }
  *eob_ptr = eob + 1;
}
#endif
#else
void aom_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block,
                     const int16_t *round_ptr, const int16_t quant,
                     tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                     const int16_t dequant_ptr, uint16_t *eob_ptr) {
  const int rc = 0;
  const int coeff = coeff_ptr[rc];
  const int coeff_sign = (coeff >> 31);
  const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
  int tmp, eob = -1;

  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));

  if (!skip_block) {
    tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
    tmp = (tmp * quant) >> 16;
    qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
    dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr;
    if (tmp) eob = 0;
  }
  *eob_ptr = eob + 1;
}
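
// Editor's sketch (not part of the original file): one plausible way to
// drive the DC-only quantizer above on a 16-coefficient block. All constants
// here are made-up example values, not encoder-derived tables.
static void example_quantize_dc_usage(void) {
  tran_low_t coeff[16] = { -21 };  // only the DC term is non-zero
  tran_low_t qcoeff[16], dqcoeff[16];
  const int16_t round[2] = { 8, 8 };  // hypothetical rounding offsets
  uint16_t eob;
  // quant == 0x4000 is 0.25 in Q16, and dequant == 4 inverts that scale:
  // abs(-21) + 8 = 29; (29 * 0x4000) >> 16 = 7; sign restored gives -7.
  aom_quantize_dc(coeff, 16, 0 /* skip_block */, round, 0x4000, qcoeff,
                  dqcoeff, 4, &eob);
  // Now qcoeff[0] == -7, dqcoeff[0] == -28 and eob == 1.
}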

#if CONFIG_AOM_HIGHBITDEPTH
void aom_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs,
                            int skip_block, const int16_t *round_ptr,
                            const int16_t quant, tran_low_t *qcoeff_ptr,
                            tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr,
                            uint16_t *eob_ptr) {
  int eob = -1;

  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));

  if (!skip_block) {
    const int coeff = coeff_ptr[0];
    const int coeff_sign = (coeff >> 31);
    const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
    const int64_t tmp = abs_coeff + round_ptr[0];
    const uint32_t abs_qcoeff = (uint32_t)((tmp * quant) >> 16);
    qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
    dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr;
    if (abs_qcoeff) eob = 0;
  }
  *eob_ptr = eob + 1;
}
#endif

void aom_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
                           const int16_t *round_ptr, const int16_t quant,
                           tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                           const int16_t dequant_ptr, uint16_t *eob_ptr) {
  const int n_coeffs = 1024;
  const int rc = 0;
  const int coeff = coeff_ptr[rc];
  const int coeff_sign = (coeff >> 31);
  const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
  int tmp, eob = -1;

  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));

  if (!skip_block) {
    tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1),
                INT16_MIN, INT16_MAX);
    tmp = (tmp * quant) >> 15;
    qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
    dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr / 2;
    if (tmp) eob = 0;
  }
  *eob_ptr = eob + 1;
}

#if CONFIG_AOM_HIGHBITDEPTH
void aom_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
                                  const int16_t *round_ptr, const int16_t quant,
                                  tran_low_t *qcoeff_ptr,
                                  tran_low_t *dqcoeff_ptr,
                                  const int16_t dequant_ptr,
                                  uint16_t *eob_ptr) {
  const int n_coeffs = 1024;
  int eob = -1;

  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));

  if (!skip_block) {
    const int coeff = coeff_ptr[0];
    const int coeff_sign = (coeff >> 31);
    const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
    const int64_t tmp = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[0], 1);
    const uint32_t abs_qcoeff = (uint32_t)((tmp * quant) >> 15);
    qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
    dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr / 2;
    if (abs_qcoeff) eob = 0;
  }
  *eob_ptr = eob + 1;
}
#endif

void aom_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                      int skip_block, const int16_t *zbin_ptr,
                      const int16_t *round_ptr, const int16_t *quant_ptr,
                      const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
                      tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
                      uint16_t *eob_ptr, const int16_t *scan,
                      const int16_t *iscan) {
  int i, non_zero_count = (int)n_coeffs, eob = -1;
  const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] };
  const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
  (void)iscan;

  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));

  if (!skip_block) {
    // Pre-scan pass
    for (i = (int)n_coeffs - 1; i >= 0; i--) {
      const int rc = scan[i];
      const int coeff = coeff_ptr[rc];

      if (coeff < zbins[rc != 0] && coeff > nzbins[rc != 0])
        non_zero_count--;
      else
        break;
    }

    // Quantization pass: all coefficients with index >= non_zero_count are
    // skippable. Note: non_zero_count can be zero.
    for (i = 0; i < non_zero_count; i++) {
      const int rc = scan[i];
      const int coeff = coeff_ptr[rc];
      const int coeff_sign = (coeff >> 31);
      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;

      if (abs_coeff >= zbins[rc != 0]) {
        int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
        tmp = ((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
               quant_shift_ptr[rc != 0]) >>
              16;  // quantization
        qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
        dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];

        if (tmp) eob = i;
      }
    }
  }
  *eob_ptr = eob + 1;
}
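
// Editor's note (illustrative, not part of the original file): the pre-scan
// above walks the scan order backwards and trims the trailing run of
// coefficients that sit strictly inside the zero bin. With hypothetical
// zbins of { 24, 21 }, trailing AC coefficients in the open interval
// (-21, 21) are dropped before the quantization loop ever runs.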

#if CONFIG_AOM_HIGHBITDEPTH
void aom_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                             int skip_block, const int16_t *zbin_ptr,
                             const int16_t *round_ptr, const int16_t *quant_ptr,
                             const int16_t *quant_shift_ptr,
                             tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                             const int16_t *dequant_ptr, uint16_t *eob_ptr,
                             const int16_t *scan, const int16_t *iscan) {
  int i, non_zero_count = (int)n_coeffs, eob = -1;
  const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] };
  const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
  (void)iscan;

  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));

  if (!skip_block) {
    // Pre-scan pass
    for (i = (int)n_coeffs - 1; i >= 0; i--) {
      const int rc = scan[i];
      const int coeff = coeff_ptr[rc];

      if (coeff < zbins[rc != 0] && coeff > nzbins[rc != 0])
        non_zero_count--;
      else
        break;
    }

    // Quantization pass: all coefficients with index >= non_zero_count are
    // skippable. Note: non_zero_count can be zero.
    for (i = 0; i < non_zero_count; i++) {
      const int rc = scan[i];
      const int coeff = coeff_ptr[rc];
      const int coeff_sign = (coeff >> 31);
      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;

      if (abs_coeff >= zbins[rc != 0]) {
        const int64_t tmp1 = abs_coeff + round_ptr[rc != 0];
        const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
        const uint32_t abs_qcoeff =
            (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 16);
        qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
        dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
        if (abs_qcoeff) eob = i;
      }
    }
  }
  *eob_ptr = eob + 1;
}
#endif

void aom_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                            int skip_block, const int16_t *zbin_ptr,
                            const int16_t *round_ptr, const int16_t *quant_ptr,
                            const int16_t *quant_shift_ptr,
                            tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                            const int16_t *dequant_ptr, uint16_t *eob_ptr,
                            const int16_t *scan, const int16_t *iscan) {
  const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], 1),
                         ROUND_POWER_OF_TWO(zbin_ptr[1], 1) };
  const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };

  int idx = 0;
  int idx_arr[1024];
  int i, eob = -1;
  (void)iscan;

  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));

  if (!skip_block) {
    // Pre-scan pass
    for (i = 0; i < n_coeffs; i++) {
      const int rc = scan[i];
      const int coeff = coeff_ptr[rc];

      // If the coefficient is out of the base ZBIN range, keep it for
      // quantization.
      if (coeff >= zbins[rc != 0] || coeff <= nzbins[rc != 0])
        idx_arr[idx++] = i;
    }

    // Quantization pass: only process the coefficients selected in
    // pre-scan pass. Note: idx can be zero.
    for (i = 0; i < idx; i++) {
      const int rc = scan[idx_arr[i]];
      const int coeff = coeff_ptr[rc];
      const int coeff_sign = (coeff >> 31);
      int tmp;
      int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
      abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
      abs_coeff = clamp(abs_coeff, INT16_MIN, INT16_MAX);
      tmp = ((((abs_coeff * quant_ptr[rc != 0]) >> 16) + abs_coeff) *
             quant_shift_ptr[rc != 0]) >>
            15;

      qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
      dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;

      if (tmp) eob = idx_arr[i];
    }
  }
  *eob_ptr = eob + 1;
}

#if CONFIG_AOM_HIGHBITDEPTH
void aom_highbd_quantize_b_32x32_c(
    const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
    const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
    const int16_t *scan, const int16_t *iscan) {
  const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], 1),
                         ROUND_POWER_OF_TWO(zbin_ptr[1], 1) };
  const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };

  int idx = 0;
  int idx_arr[1024];
  int i, eob = -1;
  (void)iscan;

  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));

  if (!skip_block) {
    // Pre-scan pass
    for (i = 0; i < n_coeffs; i++) {
      const int rc = scan[i];
      const int coeff = coeff_ptr[rc];

      // If the coefficient is out of the base ZBIN range, keep it for
      // quantization.
      if (coeff >= zbins[rc != 0] || coeff <= nzbins[rc != 0])
        idx_arr[idx++] = i;
    }

    // Quantization pass: only process the coefficients selected in
    // pre-scan pass. Note: idx can be zero.
    for (i = 0; i < idx; i++) {
      const int rc = scan[idx_arr[i]];
      const int coeff = coeff_ptr[rc];
      const int coeff_sign = (coeff >> 31);
      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
      const int64_t tmp1 =
          abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
      const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
      const uint32_t abs_qcoeff =
          (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15);
      qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
      dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
      if (abs_qcoeff) eob = idx_arr[i];
    }
  }
  *eob_ptr = eob + 1;
}
#endif
#endif

@@ -1,91 +0,0 @@
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef AOM_DSP_QUANTIZE_H_
#define AOM_DSP_QUANTIZE_H_

#include "./aom_config.h"
#include "aom_dsp/aom_dsp_common.h"

#ifdef __cplusplus
extern "C" {
#endif

#if CONFIG_AOM_QM
void aom_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block,
                     const int16_t *round_ptr, const int16_t quant_ptr,
                     tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                     const int16_t dequant_ptr, uint16_t *eob_ptr,
                     const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr);
void aom_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
                           const int16_t *round_ptr, const int16_t quant_ptr,
                           tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                           const int16_t dequant_ptr, uint16_t *eob_ptr,
                           const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr);
void aom_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                      int skip_block, const int16_t *zbin_ptr,
                      const int16_t *round_ptr, const int16_t *quant_ptr,
                      const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
                      tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
                      uint16_t *eob_ptr, const int16_t *scan,
                      const int16_t *iscan, const qm_val_t *qm_ptr,
                      const qm_val_t *iqm_ptr);
#if CONFIG_AOM_HIGHBITDEPTH
void aom_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs,
                            int skip_block, const int16_t *round_ptr,
                            const int16_t quant_ptr, tran_low_t *qcoeff_ptr,
                            tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr,
                            uint16_t *eob_ptr, const qm_val_t *qm_ptr,
                            const qm_val_t *iqm_ptr);
void aom_highbd_quantize_dc_32x32(
    const tran_low_t *coeff_ptr, int skip_block, const int16_t *round_ptr,
    const int16_t quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
    const int16_t dequant_ptr, uint16_t *eob_ptr, const qm_val_t *qm_ptr,
    const qm_val_t *iqm_ptr);
void aom_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                             int skip_block, const int16_t *zbin_ptr,
                             const int16_t *round_ptr, const int16_t *quant_ptr,
                             const int16_t *quant_shift_ptr,
                             tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                             const int16_t *dequant_ptr, uint16_t *eob_ptr,
                             const int16_t *scan, const int16_t *iscan,
                             const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr);
#endif
#else
void aom_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block,
                     const int16_t *round_ptr, const int16_t quant_ptr,
                     tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                     const int16_t dequant_ptr, uint16_t *eob_ptr);
void aom_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
                           const int16_t *round_ptr, const int16_t quant_ptr,
                           tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                           const int16_t dequant_ptr, uint16_t *eob_ptr);

#if CONFIG_AOM_HIGHBITDEPTH
void aom_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs,
                            int skip_block, const int16_t *round_ptr,
                            const int16_t quant_ptr, tran_low_t *qcoeff_ptr,
                            tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr,
                            uint16_t *eob_ptr);
void aom_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
                                  const int16_t *round_ptr,
                                  const int16_t quant_ptr,
                                  tran_low_t *qcoeff_ptr,
                                  tran_low_t *dqcoeff_ptr,
                                  const int16_t dequant_ptr, uint16_t *eob_ptr);
#endif
#endif

#ifdef __cplusplus
}  // extern "C"
#endif

#endif  // AOM_DSP_QUANTIZE_H_

aom_dsp/sad.c
@@ -1,512 +0,0 @@
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <stdlib.h>

#include "./aom_config.h"
#include "./aom_dsp_rtcd.h"

#include "aom/aom_integer.h"
#include "aom_ports/mem.h"

/* Sum the difference between every corresponding element of the buffers. */
static INLINE unsigned int sad(const uint8_t *a, int a_stride, const uint8_t *b,
                               int b_stride, int width, int height) {
  int y, x;
  unsigned int sad = 0;

  for (y = 0; y < height; y++) {
    for (x = 0; x < width; x++) sad += abs(a[x] - b[x]);

    a += a_stride;
    b += b_stride;
  }
  return sad;
}
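
// Editor's sketch (not part of the original file): a minimal check of the
// plain-C kernel above on a 2x2 block stored with stride 2.
static unsigned int example_sad_2x2(void) {
  const uint8_t a[4] = { 10, 20, 30, 40 };
  const uint8_t b[4] = { 12, 18, 30, 44 };
  // |10 - 12| + |20 - 18| + |30 - 30| + |40 - 44| = 2 + 2 + 0 + 4 = 8
  return sad(a, 2, b, 2, 2, 2);
}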

#define sadMxN(m, n)                                                        \
  unsigned int aom_sad##m##x##n##_c(const uint8_t *src, int src_stride,     \
                                    const uint8_t *ref, int ref_stride) {   \
    return sad(src, src_stride, ref, ref_stride, m, n);                     \
  }                                                                         \
  unsigned int aom_sad##m##x##n##_avg_c(const uint8_t *src, int src_stride, \
                                        const uint8_t *ref, int ref_stride, \
                                        const uint8_t *second_pred) {       \
    uint8_t comp_pred[m * n];                                               \
    aom_comp_avg_pred_c(comp_pred, second_pred, m, n, ref, ref_stride);     \
    return sad(src, src_stride, comp_pred, m, m, n);                        \
  }

// depending on call sites, pass **ref_array to avoid & in subsequent call and
// de-dup with 4D below.
#define sadMxNxK(m, n, k)                                                   \
  void aom_sad##m##x##n##x##k##_c(const uint8_t *src, int src_stride,       \
                                  const uint8_t *ref_array, int ref_stride, \
                                  uint32_t *sad_array) {                    \
    int i;                                                                  \
    for (i = 0; i < k; ++i)                                                 \
      sad_array[i] =                                                        \
          aom_sad##m##x##n##_c(src, src_stride, &ref_array[i], ref_stride); \
  }

// This appears to be equivalent to the above when k == 4 and refs is const
#define sadMxNx4D(m, n)                                                    \
  void aom_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride,         \
                               const uint8_t *const ref_array[],           \
                               int ref_stride, uint32_t *sad_array) {      \
    int i;                                                                 \
    for (i = 0; i < 4; ++i)                                                \
      sad_array[i] =                                                       \
          aom_sad##m##x##n##_c(src, src_stride, ref_array[i], ref_stride); \
  }
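
// Editor's note (illustrative, not part of the original file): for example,
// sadMxNx4D(4, 4) below expands to a function equivalent to
//
//   void aom_sad4x4x4d_c(const uint8_t *src, int src_stride,
//                        const uint8_t *const ref_array[], int ref_stride,
//                        uint32_t *sad_array) {
//     int i;
//     for (i = 0; i < 4; ++i)
//       sad_array[i] =
//           aom_sad4x4_c(src, src_stride, ref_array[i], ref_stride);
//   }
//
// i.e. one SAD per candidate reference, written into sad_array.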

/* clang-format off */
#if CONFIG_AV1 && CONFIG_EXT_PARTITION
// 128x128
sadMxN(128, 128)
sadMxNxK(128, 128, 3)
sadMxNxK(128, 128, 8)
sadMxNx4D(128, 128)

// 128x64
sadMxN(128, 64)
sadMxNx4D(128, 64)

// 64x128
sadMxN(64, 128)
sadMxNx4D(64, 128)
#endif  // CONFIG_AV1 && CONFIG_EXT_PARTITION

// 64x64
sadMxN(64, 64)
sadMxNxK(64, 64, 3)
sadMxNxK(64, 64, 8)
sadMxNx4D(64, 64)

// 64x32
sadMxN(64, 32)
sadMxNx4D(64, 32)

// 32x64
sadMxN(32, 64)
sadMxNx4D(32, 64)

// 32x32
sadMxN(32, 32)
sadMxNxK(32, 32, 3)
sadMxNxK(32, 32, 8)
sadMxNx4D(32, 32)

// 32x16
sadMxN(32, 16)
sadMxNx4D(32, 16)

// 16x32
sadMxN(16, 32)
sadMxNx4D(16, 32)

// 16x16
sadMxN(16, 16)
sadMxNxK(16, 16, 3)
sadMxNxK(16, 16, 8)
sadMxNx4D(16, 16)

// 16x8
sadMxN(16, 8)
sadMxNxK(16, 8, 3)
sadMxNxK(16, 8, 8)
sadMxNx4D(16, 8)

// 8x16
sadMxN(8, 16)
sadMxNxK(8, 16, 3)
sadMxNxK(8, 16, 8)
sadMxNx4D(8, 16)

// 8x8
sadMxN(8, 8)
sadMxNxK(8, 8, 3)
sadMxNxK(8, 8, 8)
sadMxNx4D(8, 8)

// 8x4
sadMxN(8, 4)
sadMxNxK(8, 4, 8)
sadMxNx4D(8, 4)

// 4x8
sadMxN(4, 8)
sadMxNxK(4, 8, 8)
sadMxNx4D(4, 8)

// 4x4
sadMxN(4, 4)
sadMxNxK(4, 4, 3)
sadMxNxK(4, 4, 8)
sadMxNx4D(4, 4)
/* clang-format on */

#if CONFIG_AOM_HIGHBITDEPTH
static INLINE
unsigned int highbd_sad(const uint8_t *a8, int a_stride, const uint8_t *b8,
                        int b_stride, int width, int height) {
  int y, x;
  unsigned int sad = 0;
  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
  const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
  for (y = 0; y < height; y++) {
    for (x = 0; x < width; x++) sad += abs(a[x] - b[x]);

    a += a_stride;
    b += b_stride;
  }
  return sad;
}

static INLINE unsigned int highbd_sadb(const uint8_t *a8, int a_stride,
                                       const uint16_t *b, int b_stride,
                                       int width, int height) {
  int y, x;
  unsigned int sad = 0;
  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
  for (y = 0; y < height; y++) {
    for (x = 0; x < width; x++) sad += abs(a[x] - b[x]);

    a += a_stride;
    b += b_stride;
  }
  return sad;
}

#define highbd_sadMxN(m, n)                                                    \
  unsigned int aom_highbd_sad##m##x##n##_c(const uint8_t *src, int src_stride, \
                                           const uint8_t *ref,                 \
                                           int ref_stride) {                   \
    return highbd_sad(src, src_stride, ref, ref_stride, m, n);                 \
  }                                                                            \
  unsigned int aom_highbd_sad##m##x##n##_avg_c(                                \
      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride,  \
      const uint8_t *second_pred) {                                            \
    uint16_t comp_pred[m * n];                                                 \
    aom_highbd_comp_avg_pred_c(comp_pred, second_pred, m, n, ref, ref_stride); \
    return highbd_sadb(src, src_stride, comp_pred, m, m, n);                   \
  }

#define highbd_sadMxNxK(m, n, k)                                             \
  void aom_highbd_sad##m##x##n##x##k##_c(                                    \
      const uint8_t *src, int src_stride, const uint8_t *ref_array,          \
      int ref_stride, uint32_t *sad_array) {                                 \
    int i;                                                                   \
    for (i = 0; i < k; ++i) {                                                \
      sad_array[i] = aom_highbd_sad##m##x##n##_c(src, src_stride,            \
                                                 &ref_array[i], ref_stride); \
    }                                                                        \
  }

#define highbd_sadMxNx4D(m, n)                                               \
  void aom_highbd_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride,    \
                                      const uint8_t *const ref_array[],      \
                                      int ref_stride, uint32_t *sad_array) { \
    int i;                                                                   \
    for (i = 0; i < 4; ++i) {                                                \
      sad_array[i] = aom_highbd_sad##m##x##n##_c(src, src_stride,            \
                                                 ref_array[i], ref_stride);  \
    }                                                                        \
  }

/* clang-format off */
#if CONFIG_AV1 && CONFIG_EXT_PARTITION
// 128x128
highbd_sadMxN(128, 128)
highbd_sadMxNxK(128, 128, 3)
highbd_sadMxNxK(128, 128, 8)
highbd_sadMxNx4D(128, 128)

// 128x64
highbd_sadMxN(128, 64)
highbd_sadMxNx4D(128, 64)

// 64x128
highbd_sadMxN(64, 128)
highbd_sadMxNx4D(64, 128)
#endif  // CONFIG_AV1 && CONFIG_EXT_PARTITION

// 64x64
highbd_sadMxN(64, 64)
highbd_sadMxNxK(64, 64, 3)
highbd_sadMxNxK(64, 64, 8)
highbd_sadMxNx4D(64, 64)

// 64x32
highbd_sadMxN(64, 32)
highbd_sadMxNx4D(64, 32)

// 32x64
highbd_sadMxN(32, 64)
highbd_sadMxNx4D(32, 64)

// 32x32
highbd_sadMxN(32, 32)
highbd_sadMxNxK(32, 32, 3)
highbd_sadMxNxK(32, 32, 8)
highbd_sadMxNx4D(32, 32)

// 32x16
highbd_sadMxN(32, 16)
highbd_sadMxNx4D(32, 16)

// 16x32
highbd_sadMxN(16, 32)
highbd_sadMxNx4D(16, 32)

// 16x16
highbd_sadMxN(16, 16)
highbd_sadMxNxK(16, 16, 3)
highbd_sadMxNxK(16, 16, 8)
highbd_sadMxNx4D(16, 16)

// 16x8
highbd_sadMxN(16, 8)
highbd_sadMxNxK(16, 8, 3)
highbd_sadMxNxK(16, 8, 8)
highbd_sadMxNx4D(16, 8)

// 8x16
highbd_sadMxN(8, 16)
highbd_sadMxNxK(8, 16, 3)
highbd_sadMxNxK(8, 16, 8)
highbd_sadMxNx4D(8, 16)

// 8x8
highbd_sadMxN(8, 8)
highbd_sadMxNxK(8, 8, 3)
highbd_sadMxNxK(8, 8, 8)
highbd_sadMxNx4D(8, 8)

// 8x4
highbd_sadMxN(8, 4)
highbd_sadMxNxK(8, 4, 8)
highbd_sadMxNx4D(8, 4)

// 4x8
highbd_sadMxN(4, 8)
highbd_sadMxNxK(4, 8, 8)
highbd_sadMxNx4D(4, 8)

// 4x4
highbd_sadMxN(4, 4)
highbd_sadMxNxK(4, 4, 3)
highbd_sadMxNxK(4, 4, 8)
highbd_sadMxNx4D(4, 4)
/* clang-format on */
#endif  // CONFIG_AOM_HIGHBITDEPTH

#if CONFIG_AV1 && CONFIG_EXT_INTER
static INLINE
unsigned int masked_sad(const uint8_t *a, int a_stride, const uint8_t *b,
                        int b_stride, const uint8_t *m, int m_stride,
                        int width, int height) {
  int y, x;
  unsigned int sad = 0;

  for (y = 0; y < height; y++) {
    for (x = 0; x < width; x++) sad += m[x] * abs(a[x] - b[x]);

    a += a_stride;
    b += b_stride;
    m += m_stride;
  }
  sad = (sad + 31) >> 6;

  return sad;
}
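
// Editor's note (an interpretation, not from the original file): the mask
// values act as per-pixel weights with a nominal full weight of 64, so the
// accumulated sum is scaled by up to 64; "(sad + 31) >> 6" divides that
// scale back out with rounding. A uniform mask of 64 over a block would make
// the masked SAD agree with the plain SAD up to that rounding.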

#define MASKSADMxN(m, n)                                                      \
  unsigned int aom_masked_sad##m##x##n##_c(                                   \
      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
      const uint8_t *msk, int msk_stride) {                                   \
    return masked_sad(src, src_stride, ref, ref_stride, msk, msk_stride, m,   \
                      n);                                                     \
  }

/* clang-format off */
#if CONFIG_EXT_PARTITION
MASKSADMxN(128, 128)
MASKSADMxN(128, 64)
MASKSADMxN(64, 128)
#endif  // CONFIG_EXT_PARTITION
MASKSADMxN(64, 64)
MASKSADMxN(64, 32)
MASKSADMxN(32, 64)
MASKSADMxN(32, 32)
MASKSADMxN(32, 16)
MASKSADMxN(16, 32)
MASKSADMxN(16, 16)
MASKSADMxN(16, 8)
MASKSADMxN(8, 16)
MASKSADMxN(8, 8)
MASKSADMxN(8, 4)
MASKSADMxN(4, 8)
MASKSADMxN(4, 4)
/* clang-format on */

#if CONFIG_AOM_HIGHBITDEPTH
static INLINE
unsigned int highbd_masked_sad(const uint8_t *a8, int a_stride,
                               const uint8_t *b8, int b_stride,
                               const uint8_t *m, int m_stride, int width,
                               int height) {
  int y, x;
  unsigned int sad = 0;
  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
  const uint16_t *b = CONVERT_TO_SHORTPTR(b8);

  for (y = 0; y < height; y++) {
    for (x = 0; x < width; x++) sad += m[x] * abs(a[x] - b[x]);

    a += a_stride;
    b += b_stride;
    m += m_stride;
  }
  sad = (sad + 31) >> 6;

  return sad;
}

#define HIGHBD_MASKSADMXN(m, n)                                               \
  unsigned int aom_highbd_masked_sad##m##x##n##_c(                            \
      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
      const uint8_t *msk, int msk_stride) {                                   \
    return highbd_masked_sad(src, src_stride, ref, ref_stride, msk,           \
                             msk_stride, m, n);                               \
  }

#if CONFIG_EXT_PARTITION
HIGHBD_MASKSADMXN(128, 128)
HIGHBD_MASKSADMXN(128, 64)
HIGHBD_MASKSADMXN(64, 128)
#endif  // CONFIG_EXT_PARTITION
HIGHBD_MASKSADMXN(64, 64)
HIGHBD_MASKSADMXN(64, 32)
HIGHBD_MASKSADMXN(32, 64)
HIGHBD_MASKSADMXN(32, 32)
HIGHBD_MASKSADMXN(32, 16)
HIGHBD_MASKSADMXN(16, 32)
HIGHBD_MASKSADMXN(16, 16)
HIGHBD_MASKSADMXN(16, 8)
HIGHBD_MASKSADMXN(8, 16)
HIGHBD_MASKSADMXN(8, 8)
HIGHBD_MASKSADMXN(8, 4)
HIGHBD_MASKSADMXN(4, 8)
HIGHBD_MASKSADMXN(4, 4)
#endif  // CONFIG_AOM_HIGHBITDEPTH
#endif  // CONFIG_AV1 && CONFIG_EXT_INTER

#if CONFIG_AV1 && CONFIG_MOTION_VAR
// pre: predictor being evaluated
// wsrc: target weighted prediction (has been *4096 to keep precision)
// mask: 2d weights (scaled by 4096)
static INLINE unsigned int obmc_sad(const uint8_t *pre, int pre_stride,
                                    const int32_t *wsrc, const int32_t *mask,
                                    int width, int height) {
  int y, x;
  unsigned int sad = 0;

  for (y = 0; y < height; y++) {
    for (x = 0; x < width; x++)
      sad += ROUND_POWER_OF_TWO(abs(wsrc[x] - pre[x] * mask[x]), 12);

    pre += pre_stride;
    wsrc += width;
    mask += width;
  }

  return sad;
}
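
// Editor's note (illustrative, not part of the original file): since wsrc
// and mask are both in Q12 (scaled by 4096), each term above is
// |wsrc - pre * mask| in Q12 and ROUND_POWER_OF_TWO(..., 12) returns it to
// integer pixel units. For one sample with pre = 100, mask = 4096 and
// wsrc = 100 * 4096, the term is exactly 0.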

#define OBMCSADMxN(m, n)                                                     \
  unsigned int aom_obmc_sad##m##x##n##_c(const uint8_t *ref, int ref_stride, \
                                         const int32_t *wsrc,                \
                                         const int32_t *mask) {              \
    return obmc_sad(ref, ref_stride, wsrc, mask, m, n);                      \
  }

/* clang-format off */
#if CONFIG_EXT_PARTITION
OBMCSADMxN(128, 128)
OBMCSADMxN(128, 64)
OBMCSADMxN(64, 128)
#endif  // CONFIG_EXT_PARTITION
OBMCSADMxN(64, 64)
OBMCSADMxN(64, 32)
OBMCSADMxN(32, 64)
OBMCSADMxN(32, 32)
OBMCSADMxN(32, 16)
OBMCSADMxN(16, 32)
OBMCSADMxN(16, 16)
OBMCSADMxN(16, 8)
OBMCSADMxN(8, 16)
OBMCSADMxN(8, 8)
OBMCSADMxN(8, 4)
OBMCSADMxN(4, 8)
OBMCSADMxN(4, 4)
/* clang-format on */

#if CONFIG_AOM_HIGHBITDEPTH
static INLINE
unsigned int highbd_obmc_sad(const uint8_t *pre8, int pre_stride,
                             const int32_t *wsrc, const int32_t *mask,
                             int width, int height) {
  int y, x;
  unsigned int sad = 0;
  const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);

  for (y = 0; y < height; y++) {
    for (x = 0; x < width; x++)
      sad += ROUND_POWER_OF_TWO(abs(wsrc[x] - pre[x] * mask[x]), 12);

    pre += pre_stride;
    wsrc += width;
    mask += width;
  }

  return sad;
}

#define HIGHBD_OBMCSADMXN(m, n)                                \
  unsigned int aom_highbd_obmc_sad##m##x##n##_c(               \
      const uint8_t *ref, int ref_stride, const int32_t *wsrc, \
      const int32_t *mask) {                                   \
    return highbd_obmc_sad(ref, ref_stride, wsrc, mask, m, n); \
  }

/* clang-format off */
#if CONFIG_EXT_PARTITION
HIGHBD_OBMCSADMXN(128, 128)
HIGHBD_OBMCSADMXN(128, 64)
HIGHBD_OBMCSADMXN(64, 128)
#endif  // CONFIG_EXT_PARTITION
HIGHBD_OBMCSADMXN(64, 64)
HIGHBD_OBMCSADMXN(64, 32)
HIGHBD_OBMCSADMXN(32, 64)
HIGHBD_OBMCSADMXN(32, 32)
HIGHBD_OBMCSADMXN(32, 16)
HIGHBD_OBMCSADMXN(16, 32)
HIGHBD_OBMCSADMXN(16, 16)
HIGHBD_OBMCSADMXN(16, 8)
HIGHBD_OBMCSADMXN(8, 16)
HIGHBD_OBMCSADMXN(8, 8)
HIGHBD_OBMCSADMXN(8, 4)
HIGHBD_OBMCSADMXN(4, 8)
HIGHBD_OBMCSADMXN(4, 4)
/* clang-format on */
#endif  // CONFIG_AOM_HIGHBITDEPTH
#endif  // CONFIG_AV1 && CONFIG_MOTION_VAR

@@ -1,259 +0,0 @@
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef _V128_INTRINSICS_H
#define _V128_INTRINSICS_H

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "./v128_intrinsics_c.h"
#include "./v64_intrinsics.h"

/* Fallback to plain, unoptimised C. */

typedef c_v128 v128;

SIMD_INLINE uint32_t v128_low_u32(v128 a) { return c_v128_low_u32(a); }
SIMD_INLINE v64 v128_low_v64(v128 a) { return c_v128_low_v64(a); }
SIMD_INLINE v64 v128_high_v64(v128 a) { return c_v128_high_v64(a); }
SIMD_INLINE v128 v128_from_64(uint64_t hi, uint64_t lo) {
  return c_v128_from_64(hi, lo);
}
SIMD_INLINE v128 v128_from_v64(v64 hi, v64 lo) {
  return c_v128_from_v64(hi, lo);
}
SIMD_INLINE v128 v128_from_32(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
  return c_v128_from_32(a, b, c, d);
}

SIMD_INLINE v128 v128_load_unaligned(const void *p) {
  return c_v128_load_unaligned(p);
}
SIMD_INLINE v128 v128_load_aligned(const void *p) {
  return c_v128_load_aligned(p);
}

SIMD_INLINE void v128_store_unaligned(void *p, v128 a) {
  c_v128_store_unaligned(p, a);
}
SIMD_INLINE void v128_store_aligned(void *p, v128 a) {
  c_v128_store_aligned(p, a);
}

SIMD_INLINE v128 v128_align(v128 a, v128 b, const unsigned int c) {
  return c_v128_align(a, b, c);
}

SIMD_INLINE v128 v128_zero() { return c_v128_zero(); }
SIMD_INLINE v128 v128_dup_8(uint8_t x) { return c_v128_dup_8(x); }
SIMD_INLINE v128 v128_dup_16(uint16_t x) { return c_v128_dup_16(x); }
SIMD_INLINE v128 v128_dup_32(uint32_t x) { return c_v128_dup_32(x); }

typedef uint32_t sad128_internal;
SIMD_INLINE sad128_internal v128_sad_u8_init() { return c_v128_sad_u8_init(); }
SIMD_INLINE sad128_internal v128_sad_u8(sad128_internal s, v128 a, v128 b) {
  return c_v128_sad_u8(s, a, b);
}
SIMD_INLINE uint32_t v128_sad_u8_sum(sad128_internal s) {
  return c_v128_sad_u8_sum(s);
}
typedef uint32_t ssd128_internal;
SIMD_INLINE ssd128_internal v128_ssd_u8_init() { return c_v128_ssd_u8_init(); }
SIMD_INLINE ssd128_internal v128_ssd_u8(ssd128_internal s, v128 a, v128 b) {
  return c_v128_ssd_u8(s, a, b);
}
SIMD_INLINE uint32_t v128_ssd_u8_sum(ssd128_internal s) {
  return c_v128_ssd_u8_sum(s);
}
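
// Editor's sketch (not part of the original header): the SAD/SSD helpers
// above follow an init/accumulate/finalise pattern. A hypothetical SAD over
// 32 bytes using only wrappers defined in this file:
SIMD_INLINE uint32_t example_v128_sad_32(const uint8_t *p, const uint8_t *q) {
  sad128_internal s = v128_sad_u8_init();
  s = v128_sad_u8(s, v128_load_unaligned(p), v128_load_unaligned(q));
  s = v128_sad_u8(s, v128_load_unaligned(p + 16), v128_load_unaligned(q + 16));
  return v128_sad_u8_sum(s);
}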

SIMD_INLINE int64_t v128_dotp_s16(v128 a, v128 b) {
  return c_v128_dotp_s16(a, b);
}
SIMD_INLINE uint64_t v128_hadd_u8(v128 a) { return c_v128_hadd_u8(a); }

SIMD_INLINE v128 v128_or(v128 a, v128 b) { return c_v128_or(a, b); }
SIMD_INLINE v128 v128_xor(v128 a, v128 b) { return c_v128_xor(a, b); }
SIMD_INLINE v128 v128_and(v128 a, v128 b) { return c_v128_and(a, b); }
SIMD_INLINE v128 v128_andn(v128 a, v128 b) { return c_v128_andn(a, b); }

SIMD_INLINE v128 v128_add_8(v128 a, v128 b) { return c_v128_add_8(a, b); }
SIMD_INLINE v128 v128_add_16(v128 a, v128 b) { return c_v128_add_16(a, b); }
SIMD_INLINE v128 v128_sadd_s16(v128 a, v128 b) { return c_v128_sadd_s16(a, b); }
SIMD_INLINE v128 v128_add_32(v128 a, v128 b) { return c_v128_add_32(a, b); }
SIMD_INLINE v128 v128_padd_s16(v128 a) { return c_v128_padd_s16(a); }
SIMD_INLINE v128 v128_sub_8(v128 a, v128 b) { return c_v128_sub_8(a, b); }
SIMD_INLINE v128 v128_ssub_u8(v128 a, v128 b) { return c_v128_ssub_u8(a, b); }
SIMD_INLINE v128 v128_ssub_s8(v128 a, v128 b) { return c_v128_ssub_s8(a, b); }
SIMD_INLINE v128 v128_sub_16(v128 a, v128 b) { return c_v128_sub_16(a, b); }
SIMD_INLINE v128 v128_ssub_s16(v128 a, v128 b) { return c_v128_ssub_s16(a, b); }
SIMD_INLINE v128 v128_sub_32(v128 a, v128 b) { return c_v128_sub_32(a, b); }
SIMD_INLINE v128 v128_abs_s16(v128 a) { return c_v128_abs_s16(a); }

SIMD_INLINE v128 v128_mul_s16(v64 a, v64 b) { return c_v128_mul_s16(a, b); }
SIMD_INLINE v128 v128_mullo_s16(v128 a, v128 b) {
  return c_v128_mullo_s16(a, b);
}
SIMD_INLINE v128 v128_mulhi_s16(v128 a, v128 b) {
  return c_v128_mulhi_s16(a, b);
}
SIMD_INLINE v128 v128_mullo_s32(v128 a, v128 b) {
  return c_v128_mullo_s32(a, b);
}
SIMD_INLINE v128 v128_madd_s16(v128 a, v128 b) { return c_v128_madd_s16(a, b); }
SIMD_INLINE v128 v128_madd_us8(v128 a, v128 b) { return c_v128_madd_us8(a, b); }

SIMD_INLINE v128 v128_avg_u8(v128 a, v128 b) { return c_v128_avg_u8(a, b); }
SIMD_INLINE v128 v128_rdavg_u8(v128 a, v128 b) { return c_v128_rdavg_u8(a, b); }
SIMD_INLINE v128 v128_avg_u16(v128 a, v128 b) { return c_v128_avg_u16(a, b); }
SIMD_INLINE v128 v128_min_u8(v128 a, v128 b) { return c_v128_min_u8(a, b); }
SIMD_INLINE v128 v128_max_u8(v128 a, v128 b) { return c_v128_max_u8(a, b); }
SIMD_INLINE v128 v128_min_s8(v128 a, v128 b) { return c_v128_min_s8(a, b); }
SIMD_INLINE v128 v128_max_s8(v128 a, v128 b) { return c_v128_max_s8(a, b); }
SIMD_INLINE v128 v128_min_s16(v128 a, v128 b) { return c_v128_min_s16(a, b); }
SIMD_INLINE v128 v128_max_s16(v128 a, v128 b) { return c_v128_max_s16(a, b); }

SIMD_INLINE v128 v128_ziplo_8(v128 a, v128 b) { return c_v128_ziplo_8(a, b); }
SIMD_INLINE v128 v128_ziphi_8(v128 a, v128 b) { return c_v128_ziphi_8(a, b); }
SIMD_INLINE v128 v128_ziplo_16(v128 a, v128 b) { return c_v128_ziplo_16(a, b); }
SIMD_INLINE v128 v128_ziphi_16(v128 a, v128 b) { return c_v128_ziphi_16(a, b); }
SIMD_INLINE v128 v128_ziplo_32(v128 a, v128 b) { return c_v128_ziplo_32(a, b); }
SIMD_INLINE v128 v128_ziphi_32(v128 a, v128 b) { return c_v128_ziphi_32(a, b); }
SIMD_INLINE v128 v128_ziplo_64(v128 a, v128 b) { return c_v128_ziplo_64(a, b); }
SIMD_INLINE v128 v128_ziphi_64(v128 a, v128 b) { return c_v128_ziphi_64(a, b); }
SIMD_INLINE v128 v128_zip_8(v64 a, v64 b) { return c_v128_zip_8(a, b); }
SIMD_INLINE v128 v128_zip_16(v64 a, v64 b) { return c_v128_zip_16(a, b); }
SIMD_INLINE v128 v128_zip_32(v64 a, v64 b) { return c_v128_zip_32(a, b); }
SIMD_INLINE v128 v128_unziplo_8(v128 a, v128 b) {
  return c_v128_unziplo_8(a, b);
}
SIMD_INLINE v128 v128_unziphi_8(v128 a, v128 b) {
  return c_v128_unziphi_8(a, b);
}
SIMD_INLINE v128 v128_unziplo_16(v128 a, v128 b) {
  return c_v128_unziplo_16(a, b);
}
SIMD_INLINE v128 v128_unziphi_16(v128 a, v128 b) {
  return c_v128_unziphi_16(a, b);
}
SIMD_INLINE v128 v128_unziplo_32(v128 a, v128 b) {
  return c_v128_unziplo_32(a, b);
}
SIMD_INLINE v128 v128_unziphi_32(v128 a, v128 b) {
  return c_v128_unziphi_32(a, b);
}
SIMD_INLINE v128 v128_unpack_u8_s16(v64 a) { return c_v128_unpack_u8_s16(a); }
SIMD_INLINE v128 v128_unpacklo_u8_s16(v128 a) {
  return c_v128_unpacklo_u8_s16(a);
}
SIMD_INLINE v128 v128_unpackhi_u8_s16(v128 a) {
  return c_v128_unpackhi_u8_s16(a);
}
SIMD_INLINE v128 v128_pack_s32_s16(v128 a, v128 b) {
  return c_v128_pack_s32_s16(a, b);
}
SIMD_INLINE v128 v128_pack_s16_u8(v128 a, v128 b) {
  return c_v128_pack_s16_u8(a, b);
}
SIMD_INLINE v128 v128_pack_s16_s8(v128 a, v128 b) {
  return c_v128_pack_s16_s8(a, b);
}
SIMD_INLINE v128 v128_unpack_u16_s32(v64 a) { return c_v128_unpack_u16_s32(a); }
SIMD_INLINE v128 v128_unpack_s16_s32(v64 a) { return c_v128_unpack_s16_s32(a); }
SIMD_INLINE v128 v128_unpacklo_u16_s32(v128 a) {
  return c_v128_unpacklo_u16_s32(a);
}
SIMD_INLINE v128 v128_unpacklo_s16_s32(v128 a) {
  return c_v128_unpacklo_s16_s32(a);
}
SIMD_INLINE v128 v128_unpackhi_u16_s32(v128 a) {
  return c_v128_unpackhi_u16_s32(a);
}
SIMD_INLINE v128 v128_unpackhi_s16_s32(v128 a) {
  return c_v128_unpackhi_s16_s32(a);
}
SIMD_INLINE v128 v128_shuffle_8(v128 a, v128 pattern) {
  return c_v128_shuffle_8(a, pattern);
}

SIMD_INLINE v128 v128_cmpgt_s8(v128 a, v128 b) { return c_v128_cmpgt_s8(a, b); }
SIMD_INLINE v128 v128_cmplt_s8(v128 a, v128 b) { return c_v128_cmplt_s8(a, b); }
SIMD_INLINE v128 v128_cmpeq_8(v128 a, v128 b) { return c_v128_cmpeq_8(a, b); }
SIMD_INLINE v128 v128_cmpgt_s16(v128 a, v128 b) {
  return c_v128_cmpgt_s16(a, b);
}
SIMD_INLINE v128 v128_cmplt_s16(v128 a, v128 b) {
  return c_v128_cmplt_s16(a, b);
}
SIMD_INLINE v128 v128_cmpeq_16(v128 a, v128 b) { return c_v128_cmpeq_16(a, b); }

SIMD_INLINE v128 v128_shl_8(v128 a, unsigned int c) {
  return c_v128_shl_8(a, c);
}
SIMD_INLINE v128 v128_shr_u8(v128 a, unsigned int c) {
  return c_v128_shr_u8(a, c);
}
SIMD_INLINE v128 v128_shr_s8(v128 a, unsigned int c) {
  return c_v128_shr_s8(a, c);
}
SIMD_INLINE v128 v128_shl_16(v128 a, unsigned int c) {
  return c_v128_shl_16(a, c);
}
SIMD_INLINE v128 v128_shr_u16(v128 a, unsigned int c) {
  return c_v128_shr_u16(a, c);
}
SIMD_INLINE v128 v128_shr_s16(v128 a, unsigned int c) {
  return c_v128_shr_s16(a, c);
}
SIMD_INLINE v128 v128_shl_32(v128 a, unsigned int c) {
  return c_v128_shl_32(a, c);
}
SIMD_INLINE v128 v128_shr_u32(v128 a, unsigned int c) {
  return c_v128_shr_u32(a, c);
}
SIMD_INLINE v128 v128_shr_s32(v128 a, unsigned int c) {
  return c_v128_shr_s32(a, c);
}

SIMD_INLINE v128 v128_shr_n_byte(v128 a, const unsigned int n) {
  return c_v128_shr_n_byte(a, n);
}
SIMD_INLINE v128 v128_shl_n_byte(v128 a, const unsigned int n) {
  return c_v128_shl_n_byte(a, n);
}
SIMD_INLINE v128 v128_shl_n_8(v128 a, const unsigned int n) {
  return c_v128_shl_n_8(a, n);
}
SIMD_INLINE v128 v128_shl_n_16(v128 a, const unsigned int n) {
  return c_v128_shl_n_16(a, n);
}
SIMD_INLINE v128 v128_shl_n_32(v128 a, const unsigned int n) {
  return c_v128_shl_n_32(a, n);
}
SIMD_INLINE v128 v128_shr_n_u8(v128 a, const unsigned int n) {
  return c_v128_shr_n_u8(a, n);
}
SIMD_INLINE v128 v128_shr_n_u16(v128 a, const unsigned int n) {
  return c_v128_shr_n_u16(a, n);
}
SIMD_INLINE v128 v128_shr_n_u32(v128 a, const unsigned int n) {
  return c_v128_shr_n_u32(a, n);
}
SIMD_INLINE v128 v128_shr_n_s8(v128 a, const unsigned int n) {
  return c_v128_shr_n_s8(a, n);
}
SIMD_INLINE v128 v128_shr_n_s16(v128 a, const unsigned int n) {
  return c_v128_shr_n_s16(a, n);
}
SIMD_INLINE v128 v128_shr_n_s32(v128 a, const unsigned int n) {
  return c_v128_shr_n_s32(a, n);
}

#endif /* _V128_INTRINSICS_H */

@@ -1,655 +0,0 @@
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef _V128_INTRINSICS_H
#define _V128_INTRINSICS_H

#include <arm_neon.h>
#include "./v64_intrinsics_arm.h"

typedef int64x2_t v128;

SIMD_INLINE uint32_t v128_low_u32(v128 a) {
  return v64_low_u32(vget_low_s64(a));
}

SIMD_INLINE v64 v128_low_v64(v128 a) { return vget_low_s64(a); }

SIMD_INLINE v64 v128_high_v64(v128 a) { return vget_high_s64(a); }

SIMD_INLINE v128 v128_from_v64(v64 a, v64 b) { return vcombine_s64(b, a); }

SIMD_INLINE v128 v128_from_64(uint64_t a, uint64_t b) {
  return vcombine_s64((uint64x1_t)b, (uint64x1_t)a);
}

SIMD_INLINE v128 v128_from_32(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
  return vcombine_s64(v64_from_32(c, d), v64_from_32(a, b));
}

SIMD_INLINE v128 v128_load_aligned(const void *p) {
  return vreinterpretq_s64_u8(vld1q_u8((const uint8_t *)p));
}

SIMD_INLINE v128 v128_load_unaligned(const void *p) {
  return v128_load_aligned(p);
}

SIMD_INLINE void v128_store_aligned(void *p, v128 r) {
  vst1q_u8((uint8_t *)p, vreinterpretq_u8_s64(r));
}

SIMD_INLINE void v128_store_unaligned(void *p, v128 r) {
  vst1q_u8((uint8_t *)p, vreinterpretq_u8_s64(r));
}

SIMD_INLINE v128 v128_align(v128 a, v128 b, const unsigned int c) {
// The following functions require an immediate.
// Some compilers will check this during optimisation, others won't.
#if __OPTIMIZE__ && !__clang__
  return c ? vreinterpretq_s64_s8(
                 vextq_s8(vreinterpretq_s8_s64(b), vreinterpretq_s8_s64(a), c))
           : b;
#else
  return c < 8 ? v128_from_v64(v64_align(v128_low_v64(a), v128_high_v64(b), c),
                               v64_align(v128_high_v64(b), v128_low_v64(b), c))
               : v128_from_v64(
                     v64_align(v128_high_v64(a), v128_low_v64(a), c - 8),
                     v64_align(v128_low_v64(a), v128_high_v64(b), c - 8));
#endif
}

SIMD_INLINE v128 v128_zero() { return vreinterpretq_s64_u8(vdupq_n_u8(0)); }

SIMD_INLINE v128 v128_ones() { return vreinterpretq_s64_u8(vdupq_n_u8(-1)); }

SIMD_INLINE v128 v128_dup_8(uint8_t x) {
  return vreinterpretq_s64_u8(vdupq_n_u8(x));
}

SIMD_INLINE v128 v128_dup_16(uint16_t x) {
  return vreinterpretq_s64_u16(vdupq_n_u16(x));
}

SIMD_INLINE v128 v128_dup_32(uint32_t x) {
  return vreinterpretq_s64_u32(vdupq_n_u32(x));
}

SIMD_INLINE int64_t v128_dotp_s16(v128 a, v128 b) {
  return v64_dotp_s16(vget_high_s64(a), vget_high_s64(b)) +
         v64_dotp_s16(vget_low_s64(a), vget_low_s64(b));
}

SIMD_INLINE uint64_t v128_hadd_u8(v128 x) {
  uint64x2_t t = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vreinterpretq_u8_s64(x))));
  return vget_lane_s32(
      vreinterpret_s32_u64(vadd_u64(vget_high_u64(t), vget_low_u64(t))), 0);
}

SIMD_INLINE v128 v128_padd_s16(v128 a) {
  return vreinterpretq_s64_s32(vpaddlq_s16(vreinterpretq_s16_s64(a)));
}

typedef struct { sad64_internal hi, lo; } sad128_internal;

SIMD_INLINE sad128_internal v128_sad_u8_init() {
  sad128_internal s;
  s.hi = s.lo = vdupq_n_u16(0);
  return s;
}

/* Implementation dependent return value. Result must be finalised with
   v128_sad_u8_sum().
   The result for more than 32 v128_sad_u8() calls is undefined. */
SIMD_INLINE sad128_internal v128_sad_u8(sad128_internal s, v128 a, v128 b) {
  sad128_internal r;
  r.hi = v64_sad_u8(s.hi, vget_high_s64(a), vget_high_s64(b));
  r.lo = v64_sad_u8(s.lo, vget_low_s64(a), vget_low_s64(b));
  return r;
}

SIMD_INLINE uint32_t v128_sad_u8_sum(sad128_internal s) {
  return (uint32_t)(v64_sad_u8_sum(s.hi) + v64_sad_u8_sum(s.lo));
}

typedef struct { ssd64_internal hi, lo; } ssd128_internal;

SIMD_INLINE ssd128_internal v128_ssd_u8_init() {
  ssd128_internal s;
  s.hi = s.lo = (ssd64_internal)(uint64_t)0;
  return s;
}

/* Implementation dependent return value. Result must be finalised with
 * v128_ssd_u8_sum(). */
SIMD_INLINE ssd128_internal v128_ssd_u8(ssd128_internal s, v128 a, v128 b) {
  ssd128_internal r;
  r.hi = v64_ssd_u8(s.hi, vget_high_s64(a), vget_high_s64(b));
  r.lo = v64_ssd_u8(s.lo, vget_low_s64(a), vget_low_s64(b));
  return r;
}

SIMD_INLINE uint32_t v128_ssd_u8_sum(ssd128_internal s) {
  return (uint32_t)(v64_ssd_u8_sum(s.hi) + v64_ssd_u8_sum(s.lo));
}

SIMD_INLINE v128 v128_or(v128 x, v128 y) { return vorrq_s64(x, y); }

SIMD_INLINE v128 v128_xor(v128 x, v128 y) { return veorq_s64(x, y); }

SIMD_INLINE v128 v128_and(v128 x, v128 y) { return vandq_s64(x, y); }

SIMD_INLINE v128 v128_andn(v128 x, v128 y) { return vbicq_s64(x, y); }

SIMD_INLINE v128 v128_add_8(v128 x, v128 y) {
  return vreinterpretq_s64_u8(
      vaddq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
}

SIMD_INLINE v128 v128_add_16(v128 x, v128 y) {
  return vreinterpretq_s64_s16(
      vaddq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y)));
}

SIMD_INLINE v128 v128_sadd_s16(v128 x, v128 y) {
  return vreinterpretq_s64_s16(
      vqaddq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y)));
}

SIMD_INLINE v128 v128_add_32(v128 x, v128 y) {
  return vreinterpretq_s64_u32(
      vaddq_u32(vreinterpretq_u32_s64(x), vreinterpretq_u32_s64(y)));
}

SIMD_INLINE v128 v128_sub_8(v128 x, v128 y) {
  return vreinterpretq_s64_u8(
      vsubq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
}

SIMD_INLINE v128 v128_sub_u8(v128 x, v128 y) {
  return vreinterpretq_s64_u8(
      vqsubq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
}

SIMD_INLINE v128 v128_sub_16(v128 x, v128 y) {
  return vreinterpretq_s64_s16(
      vsubq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y)));
}

SIMD_INLINE v128 v128_ssub_s16(v128 x, v128 y) {
  return vreinterpretq_s64_s16(
      vqsubq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y)));
}

SIMD_INLINE v128 v128_ssub_u8(v128 x, v128 y) {
  return vreinterpretq_s64_u8(
      vqsubq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
}

SIMD_INLINE v128 v128_ssub_s8(v128 x, v128 y) {
  return vreinterpretq_s64_s8(
      vqsubq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y)));
}

SIMD_INLINE v128 v128_sub_32(v128 x, v128 y) {
  return vreinterpretq_s64_s32(
      vsubq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y)));
}

SIMD_INLINE v128 v128_abs_s16(v128 x) {
  return vreinterpretq_s64_s16(vabsq_s16(vreinterpretq_s16_s64(x)));
}
|
||||
|
||||
SIMD_INLINE v128 v128_mul_s16(v64 a, v64 b) {
|
||||
return vreinterpretq_s64_s32(
|
||||
vmull_s16(vreinterpret_s16_s64(a), vreinterpret_s16_s64(b)));
|
||||
}
|
||||
|
||||
SIMD_INLINE v128 v128_mullo_s16(v128 a, v128 b) {
|
||||
return vreinterpretq_s64_s16(
|
||||
vmulq_s16(vreinterpretq_s16_s64(a), vreinterpretq_s16_s64(b)));
|
||||
}
|
||||
|
||||
SIMD_INLINE v128 v128_mulhi_s16(v128 a, v128 b) {
|
||||
return v128_from_v64(v64_mulhi_s16(vget_high_s64(a), vget_high_s64(b)),
|
||||
v64_mulhi_s16(vget_low_s64(a), vget_low_s64(b)));
|
||||
}
|
||||
|
||||
SIMD_INLINE v128 v128_mullo_s32(v128 a, v128 b) {
|
||||
return vreinterpretq_s64_s32(
|
||||
vmulq_s32(vreinterpretq_s32_s64(a), vreinterpretq_s32_s64(b)));
|
||||
}
|
||||
|
||||
SIMD_INLINE v128 v128_madd_s16(v128 a, v128 b) {
|
||||
return v128_from_v64(v64_madd_s16(vget_high_s64(a), vget_high_s64(b)),
|
||||
v64_madd_s16(vget_low_s64(a), vget_low_s64(b)));
|
||||
}
|
||||
|
||||
SIMD_INLINE v128 v128_madd_us8(v128 a, v128 b) {
|
||||
return v128_from_v64(v64_madd_us8(vget_high_s64(a), vget_high_s64(b)),
|
||||
v64_madd_us8(vget_low_s64(a), vget_low_s64(b)));
|
||||
}
|
||||
|
||||
SIMD_INLINE v128 v128_avg_u8(v128 x, v128 y) {
|
||||
return vreinterpretq_s64_u8(
|
||||
vrhaddq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
|
||||
}
|
||||
|
||||
SIMD_INLINE v128 v128_rdavg_u8(v128 x, v128 y) {
|
||||
return vreinterpretq_s64_u8(
|
||||
vhaddq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
|
||||
}
|
||||
|
||||
SIMD_INLINE v128 v128_avg_u16(v128 x, v128 y) {
|
||||
return vreinterpretq_s64_u16(
|
||||
vrhaddq_u16(vreinterpretq_u16_s64(x), vreinterpretq_u16_s64(y)));
|
||||
}
|
||||
|
||||
SIMD_INLINE v128 v128_min_u8(v128 x, v128 y) {
|
||||
return vreinterpretq_s64_u8(
|
||||
vminq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
|
||||
}
|
||||
|
||||
SIMD_INLINE v128 v128_max_u8(v128 x, v128 y) {
|
||||
return vreinterpretq_s64_u8(
|
||||
vmaxq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
|
||||
}
|
||||
|
||||
SIMD_INLINE v128 v128_min_s8(v128 x, v128 y) {
|
||||
return vreinterpretq_s64_s8(
|
||||
vminq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y)));
|
||||
}
|
||||
|
||||
SIMD_INLINE v128 v128_max_s8(v128 x, v128 y) {
|
||||
return vreinterpretq_s64_s8(
|
||||
vmaxq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y)));
|
||||
}
|
||||
|
||||
SIMD_INLINE v128 v128_min_s16(v128 x, v128 y) {
|
||||
return vreinterpretq_s64_s16(
|
||||
vminq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y)));
|
||||
}
|
||||
|
||||
SIMD_INLINE v128 v128_max_s16(v128 x, v128 y) {
|
||||
return vreinterpretq_s64_s16(
|
||||
vmaxq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y)));
|
||||
}
|
||||
|
||||
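/* Editorial note, not in the original header: v128 is defined earlier in
   this file (outside this excerpt) as the 64-bit-lane NEON type, which is
   why every helper above reinterprets to the lane width it needs (e.g.
   vreinterpretq_u8_s64) and back with vreinterpretq_s64_*. These
   reinterpret casts are purely type-level and compile to no instructions. */
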
SIMD_INLINE v128 v128_ziplo_8(v128 x, v128 y) {
  uint8x16x2_t r = vzipq_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x));
  return vreinterpretq_s64_u8(r.val[0]);
}

SIMD_INLINE v128 v128_ziphi_8(v128 x, v128 y) {
  uint8x16x2_t r = vzipq_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x));
  return vreinterpretq_s64_u8(r.val[1]);
}

SIMD_INLINE v128 v128_zip_8(v64 x, v64 y) {
  uint8x8x2_t r = vzip_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x));
  return vreinterpretq_s64_u8(vcombine_u8(r.val[0], r.val[1]));
}

SIMD_INLINE v128 v128_ziplo_16(v128 x, v128 y) {
  int16x8x2_t r = vzipq_s16(vreinterpretq_s16_s64(y), vreinterpretq_s16_s64(x));
  return vreinterpretq_s64_s16(r.val[0]);
}

SIMD_INLINE v128 v128_ziphi_16(v128 x, v128 y) {
  int16x8x2_t r = vzipq_s16(vreinterpretq_s16_s64(y), vreinterpretq_s16_s64(x));
  return vreinterpretq_s64_s16(r.val[1]);
}

SIMD_INLINE v128 v128_zip_16(v64 x, v64 y) {
  uint16x4x2_t r = vzip_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x));
  return vreinterpretq_s64_u16(vcombine_u16(r.val[0], r.val[1]));
}

SIMD_INLINE v128 v128_ziplo_32(v128 x, v128 y) {
  int32x4x2_t r = vzipq_s32(vreinterpretq_s32_s64(y), vreinterpretq_s32_s64(x));
  return vreinterpretq_s64_s32(r.val[0]);
}

SIMD_INLINE v128 v128_ziphi_32(v128 x, v128 y) {
  int32x4x2_t r = vzipq_s32(vreinterpretq_s32_s64(y), vreinterpretq_s32_s64(x));
  return vreinterpretq_s64_s32(r.val[1]);
}

SIMD_INLINE v128 v128_zip_32(v64 x, v64 y) {
  uint32x2x2_t r = vzip_u32(vreinterpret_u32_s64(y), vreinterpret_u32_s64(x));
  return vreinterpretq_s64_u32(vcombine_u32(r.val[0], r.val[1]));
}

SIMD_INLINE v128 v128_ziplo_64(v128 a, v128 b) {
  return v128_from_v64(vget_low_u64((uint64x2_t)a),
                       vget_low_u64((uint64x2_t)b));
}

SIMD_INLINE v128 v128_ziphi_64(v128 a, v128 b) {
  return v128_from_v64(vget_high_u64((uint64x2_t)a),
                       vget_high_u64((uint64x2_t)b));
}

SIMD_INLINE v128 v128_unziplo_8(v128 x, v128 y) {
  uint8x16x2_t r = vuzpq_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x));
  return vreinterpretq_s64_u8(r.val[0]);
}

SIMD_INLINE v128 v128_unziphi_8(v128 x, v128 y) {
  uint8x16x2_t r = vuzpq_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x));
  return vreinterpretq_s64_u8(r.val[1]);
}

SIMD_INLINE v128 v128_unziplo_16(v128 x, v128 y) {
  uint16x8x2_t r =
      vuzpq_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x));
  return vreinterpretq_s64_u16(r.val[0]);
}

SIMD_INLINE v128 v128_unziphi_16(v128 x, v128 y) {
  uint16x8x2_t r =
      vuzpq_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x));
  return vreinterpretq_s64_u16(r.val[1]);
}

SIMD_INLINE v128 v128_unziplo_32(v128 x, v128 y) {
  uint32x4x2_t r =
      vuzpq_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x));
  return vreinterpretq_s64_u32(r.val[0]);
}

SIMD_INLINE v128 v128_unziphi_32(v128 x, v128 y) {
  uint32x4x2_t r =
      vuzpq_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x));
  return vreinterpretq_s64_u32(r.val[1]);
}

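/* Editorial illustration, not in the original header: zip interleaves the
   two inputs and unzip separates even/odd lanes. For bytes, with
   x = {x0..x15} and y = {y0..y15} (low lane first):
     v128_ziplo_8(x, y)   -> {y0, x0, y1, x1, ..., y7, x7}
     v128_unziplo_8(x, y) -> {y0, y2, ..., y14, x0, x2, ..., x14}
   which is why the vzipq/vuzpq calls above pass y before x. */
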
SIMD_INLINE v128 v128_unpack_u8_s16(v64 a) {
  return vreinterpretq_s64_u16(vmovl_u8(vreinterpret_u8_s64(a)));
}

SIMD_INLINE v128 v128_unpacklo_u8_s16(v128 a) {
  return vreinterpretq_s64_u16(vmovl_u8(vreinterpret_u8_s64(vget_low_s64(a))));
}

SIMD_INLINE v128 v128_unpackhi_u8_s16(v128 a) {
  return vreinterpretq_s64_u16(vmovl_u8(vreinterpret_u8_s64(vget_high_s64(a))));
}

SIMD_INLINE v128 v128_pack_s32_s16(v128 a, v128 b) {
  return v128_from_v64(
      vreinterpret_s64_s16(vqmovn_s32(vreinterpretq_s32_s64(a))),
      vreinterpret_s64_s16(vqmovn_s32(vreinterpretq_s32_s64(b))));
}

SIMD_INLINE v128 v128_pack_s16_u8(v128 a, v128 b) {
  return v128_from_v64(
      vreinterpret_s64_u8(vqmovun_s16(vreinterpretq_s16_s64(a))),
      vreinterpret_s64_u8(vqmovun_s16(vreinterpretq_s16_s64(b))));
}

SIMD_INLINE v128 v128_pack_s16_s8(v128 a, v128 b) {
  return v128_from_v64(
      vreinterpret_s64_s8(vqmovn_s16(vreinterpretq_s16_s64(a))),
      vreinterpret_s64_s8(vqmovn_s16(vreinterpretq_s16_s64(b))));
}

SIMD_INLINE v128 v128_unpack_u16_s32(v64 a) {
  return vreinterpretq_s64_u32(vmovl_u16(vreinterpret_u16_s64(a)));
}

SIMD_INLINE v128 v128_unpack_s16_s32(v64 a) {
  return vreinterpretq_s64_s32(vmovl_s16(vreinterpret_s16_s64(a)));
}

SIMD_INLINE v128 v128_unpacklo_u16_s32(v128 a) {
  return vreinterpretq_s64_u32(
      vmovl_u16(vreinterpret_u16_s64(vget_low_s64(a))));
}

SIMD_INLINE v128 v128_unpacklo_s16_s32(v128 a) {
  return vreinterpretq_s64_s32(
      vmovl_s16(vreinterpret_s16_s64(vget_low_s64(a))));
}

SIMD_INLINE v128 v128_unpackhi_u16_s32(v128 a) {
  return vreinterpretq_s64_u32(
      vmovl_u16(vreinterpret_u16_s64(vget_high_s64(a))));
}

SIMD_INLINE v128 v128_unpackhi_s16_s32(v128 a) {
  return vreinterpretq_s64_s32(
      vmovl_s16(vreinterpret_s16_s64(vget_high_s64(a))));
}

SIMD_INLINE v128 v128_shuffle_8(v128 x, v128 pattern) {
  return v128_from_64(
      (uint64_t)vreinterpret_s64_u8(
          vtbl2_u8((uint8x8x2_t){ { vget_low_u8(vreinterpretq_u8_s64(x)),
                                    vget_high_u8(vreinterpretq_u8_s64(x)) } },
                   vreinterpret_u8_s64(vget_high_s64(pattern)))),
      (uint64_t)vreinterpret_s64_u8(
          vtbl2_u8((uint8x8x2_t){ { vget_low_u8(vreinterpretq_u8_s64(x)),
                                    vget_high_u8(vreinterpretq_u8_s64(x)) } },
                   vreinterpret_u8_s64(vget_low_s64(pattern)))));
}

SIMD_INLINE v128 v128_cmpgt_s8(v128 x, v128 y) {
  return vreinterpretq_s64_u8(
      vcgtq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y)));
}

SIMD_INLINE v128 v128_cmplt_s8(v128 x, v128 y) {
  return vreinterpretq_s64_u8(
      vcltq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y)));
}

SIMD_INLINE v128 v128_cmpeq_8(v128 x, v128 y) {
  return vreinterpretq_s64_u8(
      vceqq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
}

SIMD_INLINE v128 v128_cmpgt_s16(v128 x, v128 y) {
  return vreinterpretq_s64_u16(
      vcgtq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y)));
}

SIMD_INLINE v128 v128_cmplt_s16(v128 x, v128 y) {
  return vreinterpretq_s64_u16(
      vcltq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y)));
}

SIMD_INLINE v128 v128_cmpeq_16(v128 x, v128 y) {
  return vreinterpretq_s64_u16(
      vceqq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y)));
}

SIMD_INLINE v128 v128_shl_8(v128 a, unsigned int c) {
  return (c > 7) ? v128_zero() : vreinterpretq_s64_u8(vshlq_u8(
                                     vreinterpretq_u8_s64(a), vdupq_n_s8(c)));
}

SIMD_INLINE v128 v128_shr_u8(v128 a, unsigned int c) {
  return (c > 7) ? v128_zero() : vreinterpretq_s64_u8(vshlq_u8(
                                     vreinterpretq_u8_s64(a), vdupq_n_s8(-c)));
}

SIMD_INLINE v128 v128_shr_s8(v128 a, unsigned int c) {
  return (c > 7) ? v128_ones() : vreinterpretq_s64_s8(vshlq_s8(
                                     vreinterpretq_s8_s64(a), vdupq_n_s8(-c)));
}

SIMD_INLINE v128 v128_shl_16(v128 a, unsigned int c) {
  return (c > 15) ? v128_zero()
                  : vreinterpretq_s64_u16(
                        vshlq_u16(vreinterpretq_u16_s64(a), vdupq_n_s16(c)));
}

SIMD_INLINE v128 v128_shr_u16(v128 a, unsigned int c) {
  return (c > 15) ? v128_zero()
                  : vreinterpretq_s64_u16(
                        vshlq_u16(vreinterpretq_u16_s64(a), vdupq_n_s16(-c)));
}

SIMD_INLINE v128 v128_shr_s16(v128 a, unsigned int c) {
  return (c > 15) ? v128_ones()
                  : vreinterpretq_s64_s16(
                        vshlq_s16(vreinterpretq_s16_s64(a), vdupq_n_s16(-c)));
}

SIMD_INLINE v128 v128_shl_32(v128 a, unsigned int c) {
  return (c > 31) ? v128_zero()
                  : vreinterpretq_s64_u32(
                        vshlq_u32(vreinterpretq_u32_s64(a), vdupq_n_s32(c)));
}

SIMD_INLINE v128 v128_shr_u32(v128 a, unsigned int c) {
  return (c > 31) ? v128_zero()
                  : vreinterpretq_s64_u32(
                        vshlq_u32(vreinterpretq_u32_s64(a), vdupq_n_s32(-c)));
}

SIMD_INLINE v128 v128_shr_s32(v128 a, unsigned int c) {
  return (c > 31) ? v128_ones()
                  : vreinterpretq_s64_s32(
                        vshlq_s32(vreinterpretq_s32_s64(a), vdupq_n_s32(-c)));
}

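/* Editorial note, not in the original header: NEON has no vector right shift
   by a run-time scalar count, so the v128_shr_* helpers above shift left by
   a negated count (vshlq_* shifts right when the per-lane count is
   negative). Counts of the full lane width or more would be undefined in
   the intrinsic, hence the explicit v128_zero()/v128_ones() guards. */
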
#if __OPTIMIZE__ && !__clang__

SIMD_INLINE v128 v128_shl_n_byte(v128 a, const unsigned int n) {
  return n < 8
             ? v128_from_64(
                   (uint64_t)vorr_u64(
                       vshl_n_u64(vreinterpret_u64_s64(vget_high_s64(a)),
                                  n * 8),
                       vshr_n_u64(vreinterpret_u64_s64(vget_low_s64(a)),
                                  (8 - n) * 8)),
                   (uint64_t)vshl_n_u64(vreinterpret_u64_s64(vget_low_s64(a)),
                                        n * 8))
             : (n == 8 ? v128_from_64(
                             (uint64_t)vreinterpret_u64_s64(vget_low_s64(a)), 0)
                       : v128_from_64((uint64_t)vshl_n_u64(
                                          vreinterpret_u64_s64(vget_low_s64(a)),
                                          (n - 8) * 8),
                                      0));
}

SIMD_INLINE v128 v128_shr_n_byte(v128 a, const unsigned int n) {
  return n < 8
             ? v128_from_64(
                   vshr_n_u64(vreinterpret_u64_s64(vget_high_s64(a)), n * 8),
                   vorr_u64(
                       vshr_n_u64(vreinterpret_u64_s64(vget_low_s64(a)), n * 8),
                       vshl_n_u64(vreinterpret_u64_s64(vget_high_s64(a)),
                                  (8 - n) * 8)))
             : (n == 8
                    ? v128_from_64(0, vreinterpret_u64_s64(vget_high_s64(a)))
                    : v128_from_64(
                          0, vshr_n_u64(vreinterpret_u64_s64(vget_high_s64(a)),
                                        (n - 8) * 8)));
}

SIMD_INLINE v128 v128_shl_n_8(v128 a, const unsigned int c) {
  return vreinterpretq_s64_u8(vshlq_n_u8(vreinterpretq_u8_s64(a), c));
}

SIMD_INLINE v128 v128_shr_n_u8(v128 a, const unsigned int c) {
  return vreinterpretq_s64_u8(vshrq_n_u8(vreinterpretq_u8_s64(a), c));
}

SIMD_INLINE v128 v128_shr_n_s8(v128 a, const unsigned int c) {
  return vreinterpretq_s64_s8(vshrq_n_s8(vreinterpretq_s8_s64(a), c));
}

SIMD_INLINE v128 v128_shl_n_16(v128 a, const unsigned int c) {
  return vreinterpretq_s64_u16(vshlq_n_u16(vreinterpretq_u16_s64(a), c));
}

SIMD_INLINE v128 v128_shr_n_u16(v128 a, const unsigned int c) {
  return vreinterpretq_s64_u16(vshrq_n_u16(vreinterpretq_u16_s64(a), c));
}

SIMD_INLINE v128 v128_shr_n_s16(v128 a, const unsigned int c) {
  return vreinterpretq_s64_s16(vshrq_n_s16(vreinterpretq_s16_s64(a), c));
}

SIMD_INLINE v128 v128_shl_n_32(v128 a, const unsigned int c) {
  return vreinterpretq_s64_u32(vshlq_n_u32(vreinterpretq_u32_s64(a), c));
}

SIMD_INLINE v128 v128_shr_n_u32(v128 a, const unsigned int c) {
  return vreinterpretq_s64_u32(vshrq_n_u32(vreinterpretq_u32_s64(a), c));
}

SIMD_INLINE v128 v128_shr_n_s32(v128 a, const unsigned int c) {
  return vreinterpretq_s64_s32(vshrq_n_s32(vreinterpretq_s32_s64(a), c));
}

#else

SIMD_INLINE v128 v128_shl_n_byte(v128 a, const unsigned int n) {
  if (n < 8)
    return v128_from_v64(v64_or(v64_shl_n_byte(v128_high_v64(a), n),
                                v64_shr_n_byte(v128_low_v64(a), 8 - n)),
                         v64_shl_n_byte(v128_low_v64(a), n));
  else
    return v128_from_v64(v64_shl_n_byte(v128_low_v64(a), n - 8), v64_zero());
}

SIMD_INLINE v128 v128_shr_n_byte(v128 a, const unsigned int n) {
  if (n < 8)
    return v128_from_v64(v64_shr_n_byte(v128_high_v64(a), n),
                         v64_or(v64_shr_n_byte(v128_low_v64(a), n),
                                v64_shl_n_byte(v128_high_v64(a), 8 - n)));
  else
    return v128_from_v64(v64_zero(), v64_shr_n_byte(v128_high_v64(a), n - 8));
}

SIMD_INLINE v128 v128_shl_n_8(v128 a, const unsigned int c) {
  return v128_shl_8(a, c);
}

SIMD_INLINE v128 v128_shr_n_u8(v128 a, const unsigned int c) {
  return v128_shr_u8(a, c);
}

SIMD_INLINE v128 v128_shr_n_s8(v128 a, const unsigned int c) {
  return v128_shr_s8(a, c);
}

SIMD_INLINE v128 v128_shl_n_16(v128 a, const unsigned int c) {
  return v128_shl_16(a, c);
}

SIMD_INLINE v128 v128_shr_n_u16(v128 a, const unsigned int c) {
  return v128_shr_u16(a, c);
}

SIMD_INLINE v128 v128_shr_n_s16(v128 a, const unsigned int c) {
  return v128_shr_s16(a, c);
}

SIMD_INLINE v128 v128_shl_n_32(v128 a, const unsigned int c) {
  return v128_shl_32(a, c);
}

SIMD_INLINE v128 v128_shr_n_u32(v128 a, const unsigned int c) {
  return v128_shr_u32(a, c);
}

SIMD_INLINE v128 v128_shr_n_s32(v128 a, const unsigned int c) {
  return v128_shr_s32(a, c);
}

#endif

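/* Editorial note, not in the original header: the two branches above exist
   because vshl_n_u64() and the other _n_ intrinsics require compile-time
   constant shift counts. When GCC optimises, the `const unsigned int`
   arguments constant-propagate and the intrinsic forms are usable; under
   clang (which checks the constraint before inlining) or without
   optimisation, the build falls back to the v64-based run-time variants. */
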
#endif /* _V128_INTRINSICS_H */

@@ -1,684 +0,0 @@
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef _V128_INTRINSICS_C_H
#define _V128_INTRINSICS_C_H

#include <stdio.h>
#include <stdlib.h>
#include "./v64_intrinsics_c.h"
#include "./aom_config.h"

typedef union {
  uint8_t u8[16];
  uint16_t u16[8];
  uint32_t u32[4];
  uint64_t u64[2];
  int8_t s8[16];
  int16_t s16[8];
  int32_t s32[4];
  int64_t s64[2];
  c_v64 v64[2];
} c_v128;

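/* Editorial note, not in the original header: this union is the core trick
   of the C reference implementation. The same 16 bytes can be read at any
   lane width, so for example c_v128_dup_16(0x0102).u8[0] is 2 on a
   little-endian target and 1 on a big-endian one, which is why several
   helpers below consult CONFIG_BIG_ENDIAN. */
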
SIMD_INLINE uint32_t c_v128_low_u32(c_v128 a) { return a.u32[0]; }

SIMD_INLINE c_v64 c_v128_low_v64(c_v128 a) { return a.v64[0]; }

SIMD_INLINE c_v64 c_v128_high_v64(c_v128 a) { return a.v64[1]; }

SIMD_INLINE c_v128 c_v128_from_64(uint64_t hi, uint64_t lo) {
  c_v128 t;
  t.u64[1] = hi;
  t.u64[0] = lo;
  return t;
}

SIMD_INLINE c_v128 c_v128_from_v64(c_v64 hi, c_v64 lo) {
  c_v128 t;
  t.v64[1] = hi;
  t.v64[0] = lo;
  return t;
}

SIMD_INLINE c_v128 c_v128_from_32(uint32_t a, uint32_t b, uint32_t c,
                                  uint32_t d) {
  c_v128 t;
  t.u32[3] = a;
  t.u32[2] = b;
  t.u32[1] = c;
  t.u32[0] = d;
  return t;
}

SIMD_INLINE c_v128 c_v128_load_unaligned(const void *p) {
  c_v128 t;
  uint8_t *pp = (uint8_t *)p;
  uint8_t *q = (uint8_t *)&t;
  int c;
  for (c = 0; c < 16; c++) q[c] = pp[c];
  return t;
}

SIMD_INLINE c_v128 c_v128_load_aligned(const void *p) {
  if (simd_check && (uintptr_t)p & 15) {
    fprintf(stderr, "Error: unaligned v128 load at %p\n", p);
    abort();
  }
  return c_v128_load_unaligned(p);
}

SIMD_INLINE void c_v128_store_unaligned(void *p, c_v128 a) {
  uint8_t *pp = (uint8_t *)p;
  uint8_t *q = (uint8_t *)&a;
  int c;
  for (c = 0; c < 16; c++) pp[c] = q[c];
}

SIMD_INLINE void c_v128_store_aligned(void *p, c_v128 a) {
  if (simd_check && (uintptr_t)p & 15) {
    fprintf(stderr, "Error: unaligned v128 store at %p\n", p);
    abort();
  }
  c_v128_store_unaligned(p, a);
}

SIMD_INLINE c_v128 c_v128_zero() {
  c_v128 t;
  t.u64[1] = t.u64[0] = 0;
  return t;
}

SIMD_INLINE c_v128 c_v128_dup_8(uint8_t x) {
  c_v128 t;
  t.v64[1] = t.v64[0] = c_v64_dup_8(x);
  return t;
}

SIMD_INLINE c_v128 c_v128_dup_16(uint16_t x) {
  c_v128 t;
  t.v64[1] = t.v64[0] = c_v64_dup_16(x);
  return t;
}

SIMD_INLINE c_v128 c_v128_dup_32(uint32_t x) {
  c_v128 t;
  t.v64[1] = t.v64[0] = c_v64_dup_32(x);
  return t;
}

SIMD_INLINE int64_t c_v128_dotp_s16(c_v128 a, c_v128 b) {
  return c_v64_dotp_s16(a.v64[1], b.v64[1]) +
         c_v64_dotp_s16(a.v64[0], b.v64[0]);
}

SIMD_INLINE uint64_t c_v128_hadd_u8(c_v128 a) {
  return c_v64_hadd_u8(a.v64[1]) + c_v64_hadd_u8(a.v64[0]);
}

typedef uint32_t c_sad128_internal;

SIMD_INLINE c_sad128_internal c_v128_sad_u8_init() { return 0; }

/* Implementation dependent return value. Result must be finalised with
   v128_sad_u8_sum().
   The result for more than 32 v128_sad_u8() calls is undefined. */
SIMD_INLINE c_sad128_internal c_v128_sad_u8(c_sad128_internal s, c_v128 a,
                                            c_v128 b) {
  int c;
  for (c = 0; c < 16; c++)
    s += a.u8[c] > b.u8[c] ? a.u8[c] - b.u8[c] : b.u8[c] - a.u8[c];
  return s;
}

SIMD_INLINE uint32_t c_v128_sad_u8_sum(c_sad128_internal s) { return s; }

typedef uint32_t c_ssd128_internal;

SIMD_INLINE c_ssd128_internal c_v128_ssd_u8_init() { return 0; }

/* Implementation dependent return value. Result must be finalised with
 * v128_ssd_u8_sum(). */
SIMD_INLINE c_ssd128_internal c_v128_ssd_u8(c_ssd128_internal s, c_v128 a,
                                            c_v128 b) {
  int c;
  for (c = 0; c < 16; c++) s += (a.u8[c] - b.u8[c]) * (a.u8[c] - b.u8[c]);
  return s;
}

SIMD_INLINE uint32_t c_v128_ssd_u8_sum(c_ssd128_internal s) { return s; }

SIMD_INLINE c_v128 c_v128_or(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_or(a.v64[1], b.v64[1]),
                         c_v64_or(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_xor(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_xor(a.v64[1], b.v64[1]),
                         c_v64_xor(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_and(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_and(a.v64[1], b.v64[1]),
                         c_v64_and(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_andn(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_andn(a.v64[1], b.v64[1]),
                         c_v64_andn(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_add_8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_add_8(a.v64[1], b.v64[1]),
                         c_v64_add_8(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_add_16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_add_16(a.v64[1], b.v64[1]),
                         c_v64_add_16(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_sadd_s16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_sadd_s16(a.v64[1], b.v64[1]),
                         c_v64_sadd_s16(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_add_32(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_add_32(a.v64[1], b.v64[1]),
                         c_v64_add_32(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_padd_s16(c_v128 a) {
  c_v128 t;
  t.s32[0] = (int32_t)a.s16[0] + (int32_t)a.s16[1];
  t.s32[1] = (int32_t)a.s16[2] + (int32_t)a.s16[3];
  t.s32[2] = (int32_t)a.s16[4] + (int32_t)a.s16[5];
  t.s32[3] = (int32_t)a.s16[6] + (int32_t)a.s16[7];
  return t;
}

SIMD_INLINE c_v128 c_v128_sub_8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_sub_8(a.v64[1], b.v64[1]),
                         c_v64_sub_8(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_ssub_u8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_ssub_u8(a.v64[1], b.v64[1]),
                         c_v64_ssub_u8(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_ssub_s8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_ssub_s8(a.v64[1], b.v64[1]),
                         c_v64_ssub_s8(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_sub_16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_sub_16(a.v64[1], b.v64[1]),
                         c_v64_sub_16(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_ssub_s16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_ssub_s16(a.v64[1], b.v64[1]),
                         c_v64_ssub_s16(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_sub_32(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_sub_32(a.v64[1], b.v64[1]),
                         c_v64_sub_32(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_abs_s16(c_v128 a) {
  return c_v128_from_v64(c_v64_abs_s16(a.v64[1]), c_v64_abs_s16(a.v64[0]));
}

SIMD_INLINE c_v128 c_v128_mul_s16(c_v64 a, c_v64 b) {
  c_v64 lo_bits = c_v64_mullo_s16(a, b);
  c_v64 hi_bits = c_v64_mulhi_s16(a, b);
  return c_v128_from_v64(c_v64_ziphi_16(hi_bits, lo_bits),
                         c_v64_ziplo_16(hi_bits, lo_bits));
}

SIMD_INLINE c_v128 c_v128_mullo_s16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_mullo_s16(a.v64[1], b.v64[1]),
                         c_v64_mullo_s16(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_mulhi_s16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_mulhi_s16(a.v64[1], b.v64[1]),
                         c_v64_mulhi_s16(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_mullo_s32(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_mullo_s32(a.v64[1], b.v64[1]),
                         c_v64_mullo_s32(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_madd_s16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_madd_s16(a.v64[1], b.v64[1]),
                         c_v64_madd_s16(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_madd_us8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_madd_us8(a.v64[1], b.v64[1]),
                         c_v64_madd_us8(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_avg_u8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_avg_u8(a.v64[1], b.v64[1]),
                         c_v64_avg_u8(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_rdavg_u8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_rdavg_u8(a.v64[1], b.v64[1]),
                         c_v64_rdavg_u8(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_avg_u16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_avg_u16(a.v64[1], b.v64[1]),
                         c_v64_avg_u16(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_min_u8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_min_u8(a.v64[1], b.v64[1]),
                         c_v64_min_u8(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_max_u8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_max_u8(a.v64[1], b.v64[1]),
                         c_v64_max_u8(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_min_s8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_min_s8(a.v64[1], b.v64[1]),
                         c_v64_min_s8(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_max_s8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_max_s8(a.v64[1], b.v64[1]),
                         c_v64_max_s8(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_min_s16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_min_s16(a.v64[1], b.v64[1]),
                         c_v64_min_s16(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_max_s16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_max_s16(a.v64[1], b.v64[1]),
                         c_v64_max_s16(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_ziplo_8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_ziphi_8(a.v64[0], b.v64[0]),
                         c_v64_ziplo_8(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_ziphi_8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_ziphi_8(a.v64[1], b.v64[1]),
                         c_v64_ziplo_8(a.v64[1], b.v64[1]));
}

SIMD_INLINE c_v128 c_v128_ziplo_16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_ziphi_16(a.v64[0], b.v64[0]),
                         c_v64_ziplo_16(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_ziphi_16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_ziphi_16(a.v64[1], b.v64[1]),
                         c_v64_ziplo_16(a.v64[1], b.v64[1]));
}

SIMD_INLINE c_v128 c_v128_ziplo_32(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_ziphi_32(a.v64[0], b.v64[0]),
                         c_v64_ziplo_32(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_ziphi_32(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_ziphi_32(a.v64[1], b.v64[1]),
                         c_v64_ziplo_32(a.v64[1], b.v64[1]));
}

SIMD_INLINE c_v128 c_v128_ziplo_64(c_v128 a, c_v128 b) {
  return c_v128_from_v64(a.v64[0], b.v64[0]);
}

SIMD_INLINE c_v128 c_v128_ziphi_64(c_v128 a, c_v128 b) {
  return c_v128_from_v64(a.v64[1], b.v64[1]);
}

SIMD_INLINE c_v128 c_v128_zip_8(c_v64 a, c_v64 b) {
  return c_v128_from_v64(c_v64_ziphi_8(a, b), c_v64_ziplo_8(a, b));
}

SIMD_INLINE c_v128 c_v128_zip_16(c_v64 a, c_v64 b) {
  return c_v128_from_v64(c_v64_ziphi_16(a, b), c_v64_ziplo_16(a, b));
}

SIMD_INLINE c_v128 c_v128_zip_32(c_v64 a, c_v64 b) {
  return c_v128_from_v64(c_v64_ziphi_32(a, b), c_v64_ziplo_32(a, b));
}

SIMD_INLINE c_v128 _c_v128_unzip_8(c_v128 a, c_v128 b, int mode) {
  c_v128 t;
  if (mode) {
    t.u8[15] = b.u8[15];
    t.u8[14] = b.u8[13];
    t.u8[13] = b.u8[11];
    t.u8[12] = b.u8[9];
    t.u8[11] = b.u8[7];
    t.u8[10] = b.u8[5];
    t.u8[9] = b.u8[3];
    t.u8[8] = b.u8[1];
    t.u8[7] = a.u8[15];
    t.u8[6] = a.u8[13];
    t.u8[5] = a.u8[11];
    t.u8[4] = a.u8[9];
    t.u8[3] = a.u8[7];
    t.u8[2] = a.u8[5];
    t.u8[1] = a.u8[3];
    t.u8[0] = a.u8[1];
  } else {
    t.u8[15] = a.u8[14];
    t.u8[14] = a.u8[12];
    t.u8[13] = a.u8[10];
    t.u8[12] = a.u8[8];
    t.u8[11] = a.u8[6];
    t.u8[10] = a.u8[4];
    t.u8[9] = a.u8[2];
    t.u8[8] = a.u8[0];
    t.u8[7] = b.u8[14];
    t.u8[6] = b.u8[12];
    t.u8[5] = b.u8[10];
    t.u8[4] = b.u8[8];
    t.u8[3] = b.u8[6];
    t.u8[2] = b.u8[4];
    t.u8[1] = b.u8[2];
    t.u8[0] = b.u8[0];
  }
  return t;
}

SIMD_INLINE c_v128 c_v128_unziplo_8(c_v128 a, c_v128 b) {
  return CONFIG_BIG_ENDIAN ? _c_v128_unzip_8(a, b, 1)
                           : _c_v128_unzip_8(a, b, 0);
}

SIMD_INLINE c_v128 c_v128_unziphi_8(c_v128 a, c_v128 b) {
  return CONFIG_BIG_ENDIAN ? _c_v128_unzip_8(b, a, 0)
                           : _c_v128_unzip_8(b, a, 1);
}

SIMD_INLINE c_v128 _c_v128_unzip_16(c_v128 a, c_v128 b, int mode) {
  c_v128 t;
  if (mode) {
    t.u16[7] = b.u16[7];
    t.u16[6] = b.u16[5];
    t.u16[5] = b.u16[3];
    t.u16[4] = b.u16[1];
    t.u16[3] = a.u16[7];
    t.u16[2] = a.u16[5];
    t.u16[1] = a.u16[3];
    t.u16[0] = a.u16[1];
  } else {
    t.u16[7] = a.u16[6];
    t.u16[6] = a.u16[4];
    t.u16[5] = a.u16[2];
    t.u16[4] = a.u16[0];
    t.u16[3] = b.u16[6];
    t.u16[2] = b.u16[4];
    t.u16[1] = b.u16[2];
    t.u16[0] = b.u16[0];
  }
  return t;
}

SIMD_INLINE c_v128 c_v128_unziplo_16(c_v128 a, c_v128 b) {
  return CONFIG_BIG_ENDIAN ? _c_v128_unzip_16(a, b, 1)
                           : _c_v128_unzip_16(a, b, 0);
}

SIMD_INLINE c_v128 c_v128_unziphi_16(c_v128 a, c_v128 b) {
  return CONFIG_BIG_ENDIAN ? _c_v128_unzip_16(b, a, 0)
                           : _c_v128_unzip_16(b, a, 1);
}

SIMD_INLINE c_v128 _c_v128_unzip_32(c_v128 a, c_v128 b, int mode) {
  c_v128 t;
  if (mode) {
    t.u32[3] = b.u32[3];
    t.u32[2] = b.u32[1];
    t.u32[1] = a.u32[3];
    t.u32[0] = a.u32[1];
  } else {
    t.u32[3] = a.u32[2];
    t.u32[2] = a.u32[0];
    t.u32[1] = b.u32[2];
    t.u32[0] = b.u32[0];
  }
  return t;
}

SIMD_INLINE c_v128 c_v128_unziplo_32(c_v128 a, c_v128 b) {
  return CONFIG_BIG_ENDIAN ? _c_v128_unzip_32(a, b, 1)
                           : _c_v128_unzip_32(a, b, 0);
}

SIMD_INLINE c_v128 c_v128_unziphi_32(c_v128 a, c_v128 b) {
  return CONFIG_BIG_ENDIAN ? _c_v128_unzip_32(b, a, 0)
                           : _c_v128_unzip_32(b, a, 1);
}

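/* Editorial note, not in the original header: the wrappers above flip the
   unzip mode (and, for the "hi" variants, the argument order) under
   CONFIG_BIG_ENDIAN so that the union's lane numbering keeps matching what
   the SIMD back ends produce on little-endian hardware. */
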
SIMD_INLINE c_v128 c_v128_unpack_u8_s16(c_v64 a) {
  return c_v128_from_v64(c_v64_unpackhi_u8_s16(a), c_v64_unpacklo_u8_s16(a));
}

SIMD_INLINE c_v128 c_v128_unpacklo_u8_s16(c_v128 a) {
  return c_v128_from_v64(c_v64_unpackhi_u8_s16(a.v64[0]),
                         c_v64_unpacklo_u8_s16(a.v64[0]));
}

SIMD_INLINE c_v128 c_v128_unpackhi_u8_s16(c_v128 a) {
  return c_v128_from_v64(c_v64_unpackhi_u8_s16(a.v64[1]),
                         c_v64_unpacklo_u8_s16(a.v64[1]));
}

SIMD_INLINE c_v128 c_v128_pack_s32_s16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_pack_s32_s16(a.v64[1], a.v64[0]),
                         c_v64_pack_s32_s16(b.v64[1], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_pack_s16_u8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_pack_s16_u8(a.v64[1], a.v64[0]),
                         c_v64_pack_s16_u8(b.v64[1], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_pack_s16_s8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_pack_s16_s8(a.v64[1], a.v64[0]),
                         c_v64_pack_s16_s8(b.v64[1], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_unpack_u16_s32(c_v64 a) {
  return c_v128_from_v64(c_v64_unpackhi_u16_s32(a), c_v64_unpacklo_u16_s32(a));
}

SIMD_INLINE c_v128 c_v128_unpack_s16_s32(c_v64 a) {
  return c_v128_from_v64(c_v64_unpackhi_s16_s32(a), c_v64_unpacklo_s16_s32(a));
}

SIMD_INLINE c_v128 c_v128_unpacklo_u16_s32(c_v128 a) {
  return c_v128_from_v64(c_v64_unpackhi_u16_s32(a.v64[0]),
                         c_v64_unpacklo_u16_s32(a.v64[0]));
}

SIMD_INLINE c_v128 c_v128_unpacklo_s16_s32(c_v128 a) {
  return c_v128_from_v64(c_v64_unpackhi_s16_s32(a.v64[0]),
                         c_v64_unpacklo_s16_s32(a.v64[0]));
}

SIMD_INLINE c_v128 c_v128_unpackhi_u16_s32(c_v128 a) {
  return c_v128_from_v64(c_v64_unpackhi_u16_s32(a.v64[1]),
                         c_v64_unpacklo_u16_s32(a.v64[1]));
}

SIMD_INLINE c_v128 c_v128_unpackhi_s16_s32(c_v128 a) {
  return c_v128_from_v64(c_v64_unpackhi_s16_s32(a.v64[1]),
                         c_v64_unpacklo_s16_s32(a.v64[1]));
}

SIMD_INLINE c_v128 c_v128_shuffle_8(c_v128 a, c_v128 pattern) {
  c_v128 t;
  int c;
  for (c = 0; c < 16; c++) {
    if (pattern.u8[c] & ~15) {
      fprintf(stderr, "Undefined v128_shuffle_8 index %d/%d\n", pattern.u8[c],
              c);
      abort();
    }
    t.u8[c] = a.u8[CONFIG_BIG_ENDIAN ? 15 - (pattern.u8[c] & 15)
                                     : pattern.u8[c] & 15];
  }
  return t;
}

SIMD_INLINE c_v128 c_v128_cmpgt_s8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_cmpgt_s8(a.v64[1], b.v64[1]),
                         c_v64_cmpgt_s8(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_cmplt_s8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_cmplt_s8(a.v64[1], b.v64[1]),
                         c_v64_cmplt_s8(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_cmpeq_8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_cmpeq_8(a.v64[1], b.v64[1]),
                         c_v64_cmpeq_8(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_cmpgt_s16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_cmpgt_s16(a.v64[1], b.v64[1]),
                         c_v64_cmpgt_s16(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_cmplt_s16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_cmplt_s16(a.v64[1], b.v64[1]),
                         c_v64_cmplt_s16(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_cmpeq_16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_cmpeq_16(a.v64[1], b.v64[1]),
                         c_v64_cmpeq_16(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_shl_n_byte(c_v128 a, const unsigned int n) {
  if (n < 8)
    return c_v128_from_v64(c_v64_or(c_v64_shl_n_byte(a.v64[1], n),
                                    c_v64_shr_n_byte(a.v64[0], 8 - n)),
                           c_v64_shl_n_byte(a.v64[0], n));
  else
    return c_v128_from_v64(c_v64_shl_n_byte(a.v64[0], n - 8), c_v64_zero());
}

SIMD_INLINE c_v128 c_v128_shr_n_byte(c_v128 a, const unsigned int n) {
  if (n < 8)
    return c_v128_from_v64(c_v64_shr_n_byte(a.v64[1], n),
                           c_v64_or(c_v64_shr_n_byte(a.v64[0], n),
                                    c_v64_shl_n_byte(a.v64[1], 8 - n)));
  else
    return c_v128_from_v64(c_v64_zero(), c_v64_shr_n_byte(a.v64[1], n - 8));
}

SIMD_INLINE c_v128 c_v128_align(c_v128 a, c_v128 b, const unsigned int c) {
  if (simd_check && c > 15) {
    fprintf(stderr, "Error: undefined alignment %d\n", c);
    abort();
  }
  return c ? c_v128_or(c_v128_shr_n_byte(b, c), c_v128_shl_n_byte(a, 16 - c))
           : b;
}

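/* Editorial sketch, not part of the original header: c_v128_align(a, b, c)
   returns bytes c..c+15 of the 256-bit value formed with a as the high half
   and b as the low half. A hypothetical self-check of that property for one
   offset (the function name is an assumption for illustration): */
SIMD_INLINE int c_v128_example_align_check(void) {
  c_v128 a = c_v128_dup_8(0xaa), b = c_v128_dup_8(0xbb);
  c_v128 r = c_v128_align(a, b, 4);
  /* The low 12 bytes come from b, the high 4 bytes from a. */
  return r.u8[0] == 0xbb && r.u8[11] == 0xbb && r.u8[12] == 0xaa &&
         r.u8[15] == 0xaa;
}
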
SIMD_INLINE c_v128 c_v128_shl_8(c_v128 a, const unsigned int c) {
  return c_v128_from_v64(c_v64_shl_8(a.v64[1], c), c_v64_shl_8(a.v64[0], c));
}

SIMD_INLINE c_v128 c_v128_shr_u8(c_v128 a, const unsigned int c) {
  return c_v128_from_v64(c_v64_shr_u8(a.v64[1], c), c_v64_shr_u8(a.v64[0], c));
}

SIMD_INLINE c_v128 c_v128_shr_s8(c_v128 a, const unsigned int c) {
  return c_v128_from_v64(c_v64_shr_s8(a.v64[1], c), c_v64_shr_s8(a.v64[0], c));
}

SIMD_INLINE c_v128 c_v128_shl_16(c_v128 a, const unsigned int c) {
  return c_v128_from_v64(c_v64_shl_16(a.v64[1], c), c_v64_shl_16(a.v64[0], c));
}

SIMD_INLINE c_v128 c_v128_shr_u16(c_v128 a, const unsigned int c) {
  return c_v128_from_v64(c_v64_shr_u16(a.v64[1], c),
                         c_v64_shr_u16(a.v64[0], c));
}

SIMD_INLINE c_v128 c_v128_shr_s16(c_v128 a, const unsigned int c) {
  return c_v128_from_v64(c_v64_shr_s16(a.v64[1], c),
                         c_v64_shr_s16(a.v64[0], c));
}

SIMD_INLINE c_v128 c_v128_shl_32(c_v128 a, const unsigned int c) {
  return c_v128_from_v64(c_v64_shl_32(a.v64[1], c), c_v64_shl_32(a.v64[0], c));
}

SIMD_INLINE c_v128 c_v128_shr_u32(c_v128 a, const unsigned int c) {
  return c_v128_from_v64(c_v64_shr_u32(a.v64[1], c),
                         c_v64_shr_u32(a.v64[0], c));
}

SIMD_INLINE c_v128 c_v128_shr_s32(c_v128 a, const unsigned int c) {
  return c_v128_from_v64(c_v64_shr_s32(a.v64[1], c),
                         c_v64_shr_s32(a.v64[0], c));
}

SIMD_INLINE c_v128 c_v128_shl_n_8(c_v128 a, const unsigned int n) {
  return c_v128_shl_8(a, n);
}

SIMD_INLINE c_v128 c_v128_shl_n_16(c_v128 a, const unsigned int n) {
  return c_v128_shl_16(a, n);
}

SIMD_INLINE c_v128 c_v128_shl_n_32(c_v128 a, const unsigned int n) {
  return c_v128_shl_32(a, n);
}

SIMD_INLINE c_v128 c_v128_shr_n_u8(c_v128 a, const unsigned int n) {
  return c_v128_shr_u8(a, n);
}

SIMD_INLINE c_v128 c_v128_shr_n_u16(c_v128 a, const unsigned int n) {
  return c_v128_shr_u16(a, n);
}

SIMD_INLINE c_v128 c_v128_shr_n_u32(c_v128 a, const unsigned int n) {
  return c_v128_shr_u32(a, n);
}

SIMD_INLINE c_v128 c_v128_shr_n_s8(c_v128 a, const unsigned int n) {
  return c_v128_shr_s8(a, n);
}

SIMD_INLINE c_v128 c_v128_shr_n_s16(c_v128 a, const unsigned int n) {
  return c_v128_shr_s16(a, n);
}

SIMD_INLINE c_v128 c_v128_shr_n_s32(c_v128 a, const unsigned int n) {
  return c_v128_shr_s32(a, n);
}

#endif /* _V128_INTRINSICS_C_H */

@@ -1,488 +0,0 @@
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef _V128_INTRINSICS_H
#define _V128_INTRINSICS_H

#include "./v64_intrinsics_x86.h"

typedef __m128i v128;

SIMD_INLINE uint32_t v128_low_u32(v128 a) {
  return (uint32_t)_mm_cvtsi128_si32(a);
}

SIMD_INLINE v64 v128_low_v64(v128 a) {
  return _mm_unpacklo_epi64(a, v64_zero());
}

SIMD_INLINE v64 v128_high_v64(v128 a) { return _mm_srli_si128(a, 8); }

SIMD_INLINE v128 v128_from_v64(v64 a, v64 b) {
  return _mm_unpacklo_epi64(b, a);
}

SIMD_INLINE v128 v128_from_64(uint64_t a, uint64_t b) {
  return v128_from_v64(v64_from_64(a), v64_from_64(b));
}

SIMD_INLINE v128 v128_from_32(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
  return _mm_set_epi32(a, b, c, d);
}

SIMD_INLINE v128 v128_load_aligned(const void *p) {
  return _mm_load_si128((__m128i *)p);
}

SIMD_INLINE v128 v128_load_unaligned(const void *p) {
#if defined(__SSSE3__)
  return (__m128i)_mm_lddqu_si128((__m128i *)p);
#else
  return _mm_loadu_si128((__m128i *)p);
#endif
}

SIMD_INLINE void v128_store_aligned(void *p, v128 a) {
  _mm_store_si128((__m128i *)p, a);
}

SIMD_INLINE void v128_store_unaligned(void *p, v128 a) {
  _mm_storeu_si128((__m128i *)p, a);
}

// The following function requires an immediate.
// Some compilers will check this during optimisation, others won't.
#if __OPTIMIZE__ && !__clang__
#if defined(__SSSE3__)
SIMD_INLINE v128 v128_align(v128 a, v128 b, const unsigned int c) {
  return c ? _mm_alignr_epi8(a, b, c) : b;
}
#else
#define v128_align(a, b, c) \
  ((c) ? _mm_or_si128(_mm_srli_si128(b, c), _mm_slli_si128(a, 16 - (c))) : (b))
#endif
#else
#if defined(__SSSE3__)
#define v128_align(a, b, c) ((c) ? _mm_alignr_epi8(a, b, c) : (b))
#else
#define v128_align(a, b, c) \
  ((c) ? _mm_or_si128(_mm_srli_si128(b, c), _mm_slli_si128(a, 16 - (c))) : (b))
#endif
#endif

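/* Editorial note, not in the original header: _mm_alignr_epi8() (SSSE3) and
   the _mm_srli_si128/_mm_slli_si128 fallback all require immediate shift
   counts. The inline-function form is only safe when the compiler is known
   to constant-fold the argument, which is why the other configurations
   above drop to #define macros that keep the count a literal at the point
   of use. */
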
SIMD_INLINE v128 v128_zero() { return _mm_setzero_si128(); }

SIMD_INLINE v128 v128_dup_8(uint8_t x) { return _mm_set1_epi8(x); }

SIMD_INLINE v128 v128_dup_16(uint16_t x) { return _mm_set1_epi16(x); }

SIMD_INLINE v128 v128_dup_32(uint32_t x) { return _mm_set1_epi32(x); }

SIMD_INLINE v128 v128_add_8(v128 a, v128 b) { return _mm_add_epi8(a, b); }

SIMD_INLINE v128 v128_add_16(v128 a, v128 b) { return _mm_add_epi16(a, b); }

SIMD_INLINE v128 v128_sadd_s16(v128 a, v128 b) { return _mm_adds_epi16(a, b); }

SIMD_INLINE v128 v128_add_32(v128 a, v128 b) { return _mm_add_epi32(a, b); }

SIMD_INLINE v128 v128_padd_s16(v128 a) {
  return _mm_madd_epi16(a, _mm_set1_epi16(1));
}

SIMD_INLINE v128 v128_sub_8(v128 a, v128 b) { return _mm_sub_epi8(a, b); }

SIMD_INLINE v128 v128_ssub_u8(v128 a, v128 b) { return _mm_subs_epu8(a, b); }

SIMD_INLINE v128 v128_ssub_s8(v128 a, v128 b) { return _mm_subs_epi8(a, b); }

SIMD_INLINE v128 v128_sub_16(v128 a, v128 b) { return _mm_sub_epi16(a, b); }

SIMD_INLINE v128 v128_ssub_s16(v128 a, v128 b) { return _mm_subs_epi16(a, b); }

SIMD_INLINE v128 v128_sub_32(v128 a, v128 b) { return _mm_sub_epi32(a, b); }

SIMD_INLINE v128 v128_abs_s16(v128 a) {
#if defined(__SSSE3__)
  return _mm_abs_epi16(a);
#else
  return _mm_max_epi16(a, _mm_sub_epi16(_mm_setzero_si128(), a));
#endif
}

SIMD_INLINE v128 v128_ziplo_8(v128 a, v128 b) {
  return _mm_unpacklo_epi8(b, a);
}

SIMD_INLINE v128 v128_ziphi_8(v128 a, v128 b) {
  return _mm_unpackhi_epi8(b, a);
}

SIMD_INLINE v128 v128_ziplo_16(v128 a, v128 b) {
  return _mm_unpacklo_epi16(b, a);
}

SIMD_INLINE v128 v128_ziphi_16(v128 a, v128 b) {
  return _mm_unpackhi_epi16(b, a);
}

SIMD_INLINE v128 v128_ziplo_32(v128 a, v128 b) {
  return _mm_unpacklo_epi32(b, a);
}

SIMD_INLINE v128 v128_ziphi_32(v128 a, v128 b) {
  return _mm_unpackhi_epi32(b, a);
}

SIMD_INLINE v128 v128_ziplo_64(v128 a, v128 b) {
  return _mm_unpacklo_epi64(b, a);
}

SIMD_INLINE v128 v128_ziphi_64(v128 a, v128 b) {
  return _mm_unpackhi_epi64(b, a);
}

SIMD_INLINE v128 v128_zip_8(v64 a, v64 b) { return _mm_unpacklo_epi8(b, a); }

SIMD_INLINE v128 v128_zip_16(v64 a, v64 b) { return _mm_unpacklo_epi16(b, a); }

SIMD_INLINE v128 v128_zip_32(v64 a, v64 b) { return _mm_unpacklo_epi32(b, a); }

SIMD_INLINE v128 v128_unziphi_8(v128 a, v128 b) {
  return _mm_packs_epi16(_mm_srai_epi16(b, 8), _mm_srai_epi16(a, 8));
}

SIMD_INLINE v128 v128_unziplo_8(v128 a, v128 b) {
#if defined(__SSSE3__)
#ifdef __x86_64__
  v128 order = _mm_cvtsi64_si128(0x0e0c0a0806040200LL);
#else
  v128 order = _mm_set_epi32(0, 0, 0x0e0c0a08, 0x06040200);
#endif
  return _mm_unpacklo_epi64(_mm_shuffle_epi8(b, order),
                            _mm_shuffle_epi8(a, order));
#else
  return v128_unziphi_8(_mm_slli_si128(a, 1), _mm_slli_si128(b, 1));
#endif
}

SIMD_INLINE v128 v128_unziphi_16(v128 a, v128 b) {
  return _mm_packs_epi32(_mm_srai_epi32(b, 16), _mm_srai_epi32(a, 16));
}

SIMD_INLINE v128 v128_unziplo_16(v128 a, v128 b) {
#if defined(__SSSE3__)
#ifdef __x86_64__
  v128 order = _mm_cvtsi64_si128(0x0d0c090805040100LL);
#else
  v128 order = _mm_set_epi32(0, 0, 0x0d0c0908, 0x05040100);
#endif
  return _mm_unpacklo_epi64(_mm_shuffle_epi8(b, order),
                            _mm_shuffle_epi8(a, order));
#else
  return v128_unziphi_16(_mm_slli_si128(a, 2), _mm_slli_si128(b, 2));
#endif
}

SIMD_INLINE v128 v128_unziphi_32(v128 a, v128 b) {
  return _mm_castps_si128(_mm_shuffle_ps(
      _mm_castsi128_ps(b), _mm_castsi128_ps(a), _MM_SHUFFLE(3, 1, 3, 1)));
}

SIMD_INLINE v128 v128_unziplo_32(v128 a, v128 b) {
  return _mm_castps_si128(_mm_shuffle_ps(
      _mm_castsi128_ps(b), _mm_castsi128_ps(a), _MM_SHUFFLE(2, 0, 2, 0)));
}

SIMD_INLINE v128 v128_unpack_u8_s16(v64 a) {
  return _mm_unpacklo_epi8(a, _mm_setzero_si128());
}

SIMD_INLINE v128 v128_unpacklo_u8_s16(v128 a) {
  return _mm_unpacklo_epi8(a, _mm_setzero_si128());
}

SIMD_INLINE v128 v128_unpackhi_u8_s16(v128 a) {
  return _mm_unpackhi_epi8(a, _mm_setzero_si128());
}

SIMD_INLINE v128 v128_pack_s32_s16(v128 a, v128 b) {
  return _mm_packs_epi32(b, a);
}

SIMD_INLINE v128 v128_pack_s16_u8(v128 a, v128 b) {
  return _mm_packus_epi16(b, a);
}

SIMD_INLINE v128 v128_pack_s16_s8(v128 a, v128 b) {
  return _mm_packs_epi16(b, a);
}

SIMD_INLINE v128 v128_unpack_u16_s32(v64 a) {
  return _mm_unpacklo_epi16(a, _mm_setzero_si128());
}

SIMD_INLINE v128 v128_unpack_s16_s32(v64 a) {
  return _mm_srai_epi32(_mm_unpacklo_epi16(a, a), 16);
}

SIMD_INLINE v128 v128_unpacklo_u16_s32(v128 a) {
  return _mm_unpacklo_epi16(a, _mm_setzero_si128());
}

SIMD_INLINE v128 v128_unpacklo_s16_s32(v128 a) {
  return _mm_srai_epi32(_mm_unpacklo_epi16(a, a), 16);
}

SIMD_INLINE v128 v128_unpackhi_u16_s32(v128 a) {
  return _mm_unpackhi_epi16(a, _mm_setzero_si128());
}

SIMD_INLINE v128 v128_unpackhi_s16_s32(v128 a) {
  return _mm_srai_epi32(_mm_unpackhi_epi16(a, a), 16);
}

SIMD_INLINE v128 v128_shuffle_8(v128 x, v128 pattern) {
#if defined(__SSSE3__)
  return _mm_shuffle_epi8(x, pattern);
#else
  v128 output;
  unsigned char *input = (unsigned char *)&x;
  unsigned char *index = (unsigned char *)&pattern;
  char *selected = (char *)&output;
  int counter;

  for (counter = 0; counter < 16; counter++) {
    selected[counter] = input[index[counter] & 15];
  }

  return output;
#endif
}

SIMD_INLINE int64_t v128_dotp_s16(v128 a, v128 b) {
  v128 r = _mm_madd_epi16(a, b);
#if defined(__SSE4_1__) && defined(__x86_64__)
  v128 c = _mm_add_epi64(_mm_cvtepi32_epi64(r),
                         _mm_cvtepi32_epi64(_mm_srli_si128(r, 8)));
  return _mm_cvtsi128_si64(_mm_add_epi64(c, _mm_srli_si128(c, 8)));
#else
  return (int64_t)_mm_cvtsi128_si32(r) +
         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 4)) +
         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 8)) +
         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 12));
#endif
}

SIMD_INLINE uint64_t v128_hadd_u8(v128 a) {
  v128 t = _mm_sad_epu8(a, _mm_setzero_si128());
  return v64_low_u32(v128_low_v64(t)) + v64_low_u32(v128_high_v64(t));
}

typedef v128 sad128_internal;

SIMD_INLINE sad128_internal v128_sad_u8_init() { return _mm_setzero_si128(); }

/* Implementation dependent return value. Result must be finalised with
   v128_sad_u8_sum().
   The result for more than 32 v128_sad_u8() calls is undefined. */
SIMD_INLINE sad128_internal v128_sad_u8(sad128_internal s, v128 a, v128 b) {
  return _mm_add_epi64(s, _mm_sad_epu8(a, b));
}

SIMD_INLINE uint32_t v128_sad_u8_sum(sad128_internal s) {
  return v128_low_u32(_mm_add_epi32(s, _mm_unpackhi_epi64(s, s)));
}

typedef v128 ssd128_internal;

SIMD_INLINE ssd128_internal v128_ssd_u8_init() { return _mm_setzero_si128(); }

/* Implementation dependent return value. Result must be finalised with
 * v128_ssd_u8_sum(). */
SIMD_INLINE ssd128_internal v128_ssd_u8(ssd128_internal s, v128 a, v128 b) {
  v128 l = _mm_sub_epi16(_mm_unpacklo_epi8(a, _mm_setzero_si128()),
                         _mm_unpacklo_epi8(b, _mm_setzero_si128()));
  v128 h = _mm_sub_epi16(_mm_unpackhi_epi8(a, _mm_setzero_si128()),
                         _mm_unpackhi_epi8(b, _mm_setzero_si128()));
  v128 rl = _mm_madd_epi16(l, l);
  v128 rh = _mm_madd_epi16(h, h);
  v128 c = _mm_cvtsi32_si128(32);
  rl = _mm_add_epi32(rl, _mm_srli_si128(rl, 8));
  rl = _mm_add_epi32(rl, _mm_srli_si128(rl, 4));
  rh = _mm_add_epi32(rh, _mm_srli_si128(rh, 8));
  rh = _mm_add_epi32(rh, _mm_srli_si128(rh, 4));
  return _mm_add_epi64(
      s, _mm_srl_epi64(_mm_sll_epi64(_mm_unpacklo_epi64(rl, rh), c), c));
}

SIMD_INLINE uint32_t v128_ssd_u8_sum(ssd128_internal s) {
  return v128_low_u32(_mm_add_epi32(s, _mm_unpackhi_epi64(s, s)));
}

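/* Editorial note, not in the original header: in v128_ssd_u8() above, each
   _mm_madd_epi16() lane is a 32-bit partial sum; the two pairs of
   add-and-shift reductions leave one 32-bit total in the low lane of rl and
   of rh. _mm_unpacklo_epi64() then places those totals in the two 64-bit
   lanes, and the shift-left-then-right by 32 (the count held in c)
   zero-extends them, discarding the stale upper 32 bits before they are
   added to the running 64-bit accumulator s. */
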
SIMD_INLINE v128 v128_or(v128 a, v128 b) { return _mm_or_si128(a, b); }

SIMD_INLINE v128 v128_xor(v128 a, v128 b) { return _mm_xor_si128(a, b); }

SIMD_INLINE v128 v128_and(v128 a, v128 b) { return _mm_and_si128(a, b); }

SIMD_INLINE v128 v128_andn(v128 a, v128 b) { return _mm_andnot_si128(b, a); }

SIMD_INLINE v128 v128_mul_s16(v64 a, v64 b) {
  v64 lo_bits = v64_mullo_s16(a, b);
  v64 hi_bits = v64_mulhi_s16(a, b);
  return v128_from_v64(v64_ziphi_16(hi_bits, lo_bits),
                       v64_ziplo_16(hi_bits, lo_bits));
}

SIMD_INLINE v128 v128_mullo_s16(v128 a, v128 b) {
  return _mm_mullo_epi16(a, b);
}

SIMD_INLINE v128 v128_mulhi_s16(v128 a, v128 b) {
  return _mm_mulhi_epi16(a, b);
}

SIMD_INLINE v128 v128_mullo_s32(v128 a, v128 b) {
#if defined(__SSE4_1__)
  return _mm_mullo_epi32(a, b);
#else
  return _mm_unpacklo_epi32(
      _mm_shuffle_epi32(_mm_mul_epu32(a, b), 8),
      _mm_shuffle_epi32(
          _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4)), 8));
#endif
}

SIMD_INLINE v128 v128_madd_s16(v128 a, v128 b) { return _mm_madd_epi16(a, b); }

SIMD_INLINE v128 v128_madd_us8(v128 a, v128 b) {
#if defined(__SSSE3__)
  return _mm_maddubs_epi16(a, b);
#else
  return _mm_packs_epi32(
      _mm_madd_epi16(_mm_unpacklo_epi8(a, _mm_setzero_si128()),
                     _mm_srai_epi16(_mm_unpacklo_epi8(b, b), 8)),
      _mm_madd_epi16(_mm_unpackhi_epi8(a, _mm_setzero_si128()),
                     _mm_srai_epi16(_mm_unpackhi_epi8(b, b), 8)));
#endif
}

SIMD_INLINE v128 v128_avg_u8(v128 a, v128 b) { return _mm_avg_epu8(a, b); }

SIMD_INLINE v128 v128_rdavg_u8(v128 a, v128 b) {
  return _mm_sub_epi8(_mm_avg_epu8(a, b),
                      _mm_and_si128(_mm_xor_si128(a, b), v128_dup_8(1)));
}

SIMD_INLINE v128 v128_avg_u16(v128 a, v128 b) { return _mm_avg_epu16(a, b); }

SIMD_INLINE v128 v128_min_u8(v128 a, v128 b) { return _mm_min_epu8(a, b); }

SIMD_INLINE v128 v128_max_u8(v128 a, v128 b) { return _mm_max_epu8(a, b); }

SIMD_INLINE v128 v128_min_s8(v128 a, v128 b) {
#if defined(__SSE4_1__)
  return _mm_min_epi8(a, b);
#else
  v128 mask = _mm_cmplt_epi8(a, b);
  return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a));
#endif
}

SIMD_INLINE v128 v128_max_s8(v128 a, v128 b) {
#if defined(__SSE4_1__)
  return _mm_max_epi8(a, b);
#else
  v128 mask = _mm_cmplt_epi8(b, a);
  return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a));
#endif
}

SIMD_INLINE v128 v128_min_s16(v128 a, v128 b) { return _mm_min_epi16(a, b); }

SIMD_INLINE v128 v128_max_s16(v128 a, v128 b) { return _mm_max_epi16(a, b); }

SIMD_INLINE v128 v128_cmpgt_s8(v128 a, v128 b) { return _mm_cmpgt_epi8(a, b); }

SIMD_INLINE v128 v128_cmplt_s8(v128 a, v128 b) { return _mm_cmplt_epi8(a, b); }

SIMD_INLINE v128 v128_cmpeq_8(v128 a, v128 b) { return _mm_cmpeq_epi8(a, b); }

SIMD_INLINE v128 v128_cmpgt_s16(v128 a, v128 b) {
  return _mm_cmpgt_epi16(a, b);
}

SIMD_INLINE v128 v128_cmplt_s16(v128 a, v128 b) {
  return _mm_cmplt_epi16(a, b);
}

SIMD_INLINE v128 v128_cmpeq_16(v128 a, v128 b) { return _mm_cmpeq_epi16(a, b); }

SIMD_INLINE v128 v128_shl_8(v128 a, unsigned int c) {
|
||||
return _mm_and_si128(_mm_set1_epi8((uint8_t)(0xff << c)),
|
||||
_mm_sll_epi16(a, _mm_cvtsi32_si128(c)));
|
||||
}
|
||||
|
||||
SIMD_INLINE v128 v128_shr_u8(v128 a, unsigned int c) {
|
||||
return _mm_and_si128(_mm_set1_epi8(0xff >> c),
|
||||
_mm_srl_epi16(a, _mm_cvtsi32_si128(c)));
|
||||
}
|
||||
|
||||
SIMD_INLINE v128 v128_shr_s8(v128 a, unsigned int c) {
|
||||
__m128i x = _mm_cvtsi32_si128(c + 8);
|
||||
return _mm_packs_epi16(_mm_sra_epi16(_mm_unpacklo_epi8(a, a), x),
|
||||
_mm_sra_epi16(_mm_unpackhi_epi8(a, a), x));
|
||||
}
|
||||
|
||||
SIMD_INLINE v128 v128_shl_16(v128 a, unsigned int c) {
|
||||
return _mm_sll_epi16(a, _mm_cvtsi32_si128(c));
|
||||
}
|
||||
|
||||
SIMD_INLINE v128 v128_shr_u16(v128 a, unsigned int c) {
|
||||
return _mm_srl_epi16(a, _mm_cvtsi32_si128(c));
|
||||
}
|
||||
|
||||
SIMD_INLINE v128 v128_shr_s16(v128 a, unsigned int c) {
|
||||
return _mm_sra_epi16(a, _mm_cvtsi32_si128(c));
|
||||
}
|
||||
|
||||
SIMD_INLINE v128 v128_shl_32(v128 a, unsigned int c) {
|
||||
return _mm_sll_epi32(a, _mm_cvtsi32_si128(c));
|
||||
}
|
||||
|
||||
SIMD_INLINE v128 v128_shr_u32(v128 a, unsigned int c) {
|
||||
return _mm_srl_epi32(a, _mm_cvtsi32_si128(c));
|
||||
}
|
||||
|
||||
SIMD_INLINE v128 v128_shr_s32(v128 a, unsigned int c) {
|
||||
return _mm_sra_epi32(a, _mm_cvtsi32_si128(c));
|
||||
}
|
||||
|
||||
/* These intrinsics require immediate values, so we must use #defines
   to enforce that. */
#define v128_shl_n_byte(a, c) _mm_slli_si128(a, c)
#define v128_shr_n_byte(a, c) _mm_srli_si128(a, c)
#define v128_shl_n_8(a, c) \
  _mm_and_si128(_mm_set1_epi8((uint8_t)(0xff << (c))), _mm_slli_epi16(a, c))
#define v128_shr_n_u8(a, c) \
  _mm_and_si128(_mm_set1_epi8(0xff >> (c)), _mm_srli_epi16(a, c))
#define v128_shr_n_s8(a, c) \
  _mm_packs_epi16(_mm_srai_epi16(_mm_unpacklo_epi8(a, a), (c) + 8), \
                  _mm_srai_epi16(_mm_unpackhi_epi8(a, a), (c) + 8))
#define v128_shl_n_16(a, c) _mm_slli_epi16(a, c)
#define v128_shr_n_u16(a, c) _mm_srli_epi16(a, c)
#define v128_shr_n_s16(a, c) _mm_srai_epi16(a, c)
#define v128_shl_n_32(a, c) _mm_slli_epi32(a, c)
#define v128_shr_n_u32(a, c) _mm_srli_epi32(a, c)
#define v128_shr_n_s32(a, c) _mm_srai_epi32(a, c)
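/* Added usage sketch (not part of the original header): the _mm_slli_epi16
   family encodes the shift count as an instruction immediate, so the count
   given to these macros must be a compile-time constant. Hypothetical
   helper name: */
SIMD_INLINE v128 v128_demo_double_u16(v128 a) {
  return v128_shl_n_16(a, 1); /* fine: literal immediate */
}
/* For a count only known at run time, use the non-immediate form instead,
   e.g. v128_shl_16(a, c). */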
#endif /* _V128_INTRINSICS_H */
@@ -1,274 +0,0 @@
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef _V256_INTRINSICS_H
#define _V256_INTRINSICS_H

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "./v256_intrinsics_c.h"
#include "./v128_intrinsics.h"
#include "./v64_intrinsics.h"

/* Fallback to plain, unoptimised C. */

typedef c_v256 v256;
SIMD_INLINE uint32_t v256_low_u32(v256 a) { return c_v256_low_u32(a); }
SIMD_INLINE v64 v256_low_v64(v256 a) { return c_v256_low_v64(a); }
SIMD_INLINE v128 v256_low_v128(v256 a) { return c_v256_low_v128(a); }
SIMD_INLINE v128 v256_high_v128(v256 a) { return c_v256_high_v128(a); }
SIMD_INLINE v256 v256_from_v128(v128 hi, v128 lo) {
  return c_v256_from_v128(hi, lo);
}
SIMD_INLINE v256 v256_from_64(uint64_t a, uint64_t b, uint64_t c, uint64_t d) {
  return c_v256_from_64(a, b, c, d);
}
SIMD_INLINE v256 v256_from_v64(v64 a, v64 b, v64 c, v64 d) {
  return c_v256_from_v64(a, b, c, d);
}

SIMD_INLINE v256 v256_load_unaligned(const void *p) {
  return c_v256_load_unaligned(p);
}
SIMD_INLINE v256 v256_load_aligned(const void *p) {
  return c_v256_load_aligned(p);
}

SIMD_INLINE void v256_store_unaligned(void *p, v256 a) {
  c_v256_store_unaligned(p, a);
}
SIMD_INLINE void v256_store_aligned(void *p, v256 a) {
  c_v256_store_aligned(p, a);
}

SIMD_INLINE v256 v256_align(v256 a, v256 b, const unsigned int c) {
  return c_v256_align(a, b, c);
}

SIMD_INLINE v256 v256_zero() { return c_v256_zero(); }
SIMD_INLINE v256 v256_dup_8(uint8_t x) { return c_v256_dup_8(x); }
SIMD_INLINE v256 v256_dup_16(uint16_t x) { return c_v256_dup_16(x); }
SIMD_INLINE v256 v256_dup_32(uint32_t x) { return c_v256_dup_32(x); }

typedef uint32_t sad256_internal;
SIMD_INLINE sad256_internal v256_sad_u8_init() { return c_v256_sad_u8_init(); }
SIMD_INLINE sad256_internal v256_sad_u8(sad256_internal s, v256 a, v256 b) {
  return c_v256_sad_u8(s, a, b);
}
SIMD_INLINE uint32_t v256_sad_u8_sum(sad256_internal s) {
  return c_v256_sad_u8_sum(s);
}
typedef uint32_t ssd256_internal;
SIMD_INLINE ssd256_internal v256_ssd_u8_init() { return c_v256_ssd_u8_init(); }
SIMD_INLINE ssd256_internal v256_ssd_u8(ssd256_internal s, v256 a, v256 b) {
  return c_v256_ssd_u8(s, a, b);
}
SIMD_INLINE uint32_t v256_ssd_u8_sum(ssd256_internal s) {
  return c_v256_ssd_u8_sum(s);
}
SIMD_INLINE int64_t v256_dotp_s16(v256 a, v256 b) {
  return c_v256_dotp_s16(a, b);
}
SIMD_INLINE uint64_t v256_hadd_u8(v256 a) { return c_v256_hadd_u8(a); }

SIMD_INLINE v256 v256_or(v256 a, v256 b) { return c_v256_or(a, b); }
SIMD_INLINE v256 v256_xor(v256 a, v256 b) { return c_v256_xor(a, b); }
SIMD_INLINE v256 v256_and(v256 a, v256 b) { return c_v256_and(a, b); }
SIMD_INLINE v256 v256_andn(v256 a, v256 b) { return c_v256_andn(a, b); }

SIMD_INLINE v256 v256_add_8(v256 a, v256 b) { return c_v256_add_8(a, b); }
SIMD_INLINE v256 v256_add_16(v256 a, v256 b) { return c_v256_add_16(a, b); }
SIMD_INLINE v256 v256_sadd_s16(v256 a, v256 b) { return c_v256_sadd_s16(a, b); }
SIMD_INLINE v256 v256_add_32(v256 a, v256 b) { return c_v256_add_32(a, b); }
SIMD_INLINE v256 v256_padd_s16(v256 a) { return c_v256_padd_s16(a); }
SIMD_INLINE v256 v256_sub_8(v256 a, v256 b) { return c_v256_sub_8(a, b); }
SIMD_INLINE v256 v256_ssub_u8(v256 a, v256 b) { return c_v256_ssub_u8(a, b); }
SIMD_INLINE v256 v256_ssub_s8(v256 a, v256 b) { return c_v256_ssub_s8(a, b); }
SIMD_INLINE v256 v256_sub_16(v256 a, v256 b) { return c_v256_sub_16(a, b); }
SIMD_INLINE v256 v256_ssub_s16(v256 a, v256 b) { return c_v256_ssub_s16(a, b); }
SIMD_INLINE v256 v256_sub_32(v256 a, v256 b) { return c_v256_sub_32(a, b); }
SIMD_INLINE v256 v256_abs_s16(v256 a) { return c_v256_abs_s16(a); }

SIMD_INLINE v256 v256_mul_s16(v128 a, v128 b) { return c_v256_mul_s16(a, b); }
SIMD_INLINE v256 v256_mullo_s16(v256 a, v256 b) {
  return c_v256_mullo_s16(a, b);
}
SIMD_INLINE v256 v256_mulhi_s16(v256 a, v256 b) {
  return c_v256_mulhi_s16(a, b);
}
SIMD_INLINE v256 v256_mullo_s32(v256 a, v256 b) {
  return c_v256_mullo_s32(a, b);
}
SIMD_INLINE v256 v256_madd_s16(v256 a, v256 b) { return c_v256_madd_s16(a, b); }
SIMD_INLINE v256 v256_madd_us8(v256 a, v256 b) { return c_v256_madd_us8(a, b); }

SIMD_INLINE v256 v256_avg_u8(v256 a, v256 b) { return c_v256_avg_u8(a, b); }
SIMD_INLINE v256 v256_rdavg_u8(v256 a, v256 b) { return c_v256_rdavg_u8(a, b); }
SIMD_INLINE v256 v256_avg_u16(v256 a, v256 b) { return c_v256_avg_u16(a, b); }
SIMD_INLINE v256 v256_min_u8(v256 a, v256 b) { return c_v256_min_u8(a, b); }
SIMD_INLINE v256 v256_max_u8(v256 a, v256 b) { return c_v256_max_u8(a, b); }
SIMD_INLINE v256 v256_min_s8(v256 a, v256 b) { return c_v256_min_s8(a, b); }
SIMD_INLINE v256 v256_max_s8(v256 a, v256 b) { return c_v256_max_s8(a, b); }
SIMD_INLINE v256 v256_min_s16(v256 a, v256 b) { return c_v256_min_s16(a, b); }
SIMD_INLINE v256 v256_max_s16(v256 a, v256 b) { return c_v256_max_s16(a, b); }

SIMD_INLINE v256 v256_ziplo_8(v256 a, v256 b) { return c_v256_ziplo_8(a, b); }
SIMD_INLINE v256 v256_ziphi_8(v256 a, v256 b) { return c_v256_ziphi_8(a, b); }
SIMD_INLINE v256 v256_ziplo_16(v256 a, v256 b) { return c_v256_ziplo_16(a, b); }
SIMD_INLINE v256 v256_ziphi_16(v256 a, v256 b) { return c_v256_ziphi_16(a, b); }
SIMD_INLINE v256 v256_ziplo_32(v256 a, v256 b) { return c_v256_ziplo_32(a, b); }
SIMD_INLINE v256 v256_ziphi_32(v256 a, v256 b) { return c_v256_ziphi_32(a, b); }
SIMD_INLINE v256 v256_ziplo_64(v256 a, v256 b) { return c_v256_ziplo_64(a, b); }
SIMD_INLINE v256 v256_ziphi_64(v256 a, v256 b) { return c_v256_ziphi_64(a, b); }
SIMD_INLINE v256 v256_ziplo_128(v256 a, v256 b) {
  return c_v256_ziplo_128(a, b);
}
SIMD_INLINE v256 v256_ziphi_128(v256 a, v256 b) {
  return c_v256_ziphi_128(a, b);
}
SIMD_INLINE v256 v256_zip_8(v128 a, v128 b) { return c_v256_zip_8(a, b); }
SIMD_INLINE v256 v256_zip_16(v128 a, v128 b) { return c_v256_zip_16(a, b); }
SIMD_INLINE v256 v256_zip_32(v128 a, v128 b) { return c_v256_zip_32(a, b); }
SIMD_INLINE v256 v256_unziplo_8(v256 a, v256 b) {
  return c_v256_unziplo_8(a, b);
}
SIMD_INLINE v256 v256_unziphi_8(v256 a, v256 b) {
  return c_v256_unziphi_8(a, b);
}
SIMD_INLINE v256 v256_unziplo_16(v256 a, v256 b) {
  return c_v256_unziplo_16(a, b);
}
SIMD_INLINE v256 v256_unziphi_16(v256 a, v256 b) {
  return c_v256_unziphi_16(a, b);
}
SIMD_INLINE v256 v256_unziplo_32(v256 a, v256 b) {
  return c_v256_unziplo_32(a, b);
}
SIMD_INLINE v256 v256_unziphi_32(v256 a, v256 b) {
  return c_v256_unziphi_32(a, b);
}
SIMD_INLINE v256 v256_unpack_u8_s16(v128 a) { return c_v256_unpack_u8_s16(a); }
SIMD_INLINE v256 v256_unpacklo_u8_s16(v256 a) {
  return c_v256_unpacklo_u8_s16(a);
}
SIMD_INLINE v256 v256_unpackhi_u8_s16(v256 a) {
  return c_v256_unpackhi_u8_s16(a);
}
SIMD_INLINE v256 v256_pack_s32_s16(v256 a, v256 b) {
  return c_v256_pack_s32_s16(a, b);
}
SIMD_INLINE v256 v256_pack_s16_u8(v256 a, v256 b) {
  return c_v256_pack_s16_u8(a, b);
}
SIMD_INLINE v256 v256_pack_s16_s8(v256 a, v256 b) {
  return c_v256_pack_s16_s8(a, b);
}
SIMD_INLINE v256 v256_unpack_u16_s32(v128 a) {
  return c_v256_unpack_u16_s32(a);
}
SIMD_INLINE v256 v256_unpack_s16_s32(v128 a) {
  return c_v256_unpack_s16_s32(a);
}
SIMD_INLINE v256 v256_unpacklo_u16_s32(v256 a) {
  return c_v256_unpacklo_u16_s32(a);
}
SIMD_INLINE v256 v256_unpacklo_s16_s32(v256 a) {
  return c_v256_unpacklo_s16_s32(a);
}
SIMD_INLINE v256 v256_unpackhi_u16_s32(v256 a) {
  return c_v256_unpackhi_u16_s32(a);
}
SIMD_INLINE v256 v256_unpackhi_s16_s32(v256 a) {
  return c_v256_unpackhi_s16_s32(a);
}
SIMD_INLINE v256 v256_shuffle_8(v256 a, v256 pattern) {
  return c_v256_shuffle_8(a, pattern);
}
SIMD_INLINE v256 v256_pshuffle_8(v256 a, v256 pattern) {
  return c_v256_pshuffle_8(a, pattern);
}
SIMD_INLINE v256 v256_cmpgt_s8(v256 a, v256 b) { return c_v256_cmpgt_s8(a, b); }
SIMD_INLINE v256 v256_cmplt_s8(v256 a, v256 b) { return c_v256_cmplt_s8(a, b); }
SIMD_INLINE v256 v256_cmpeq_8(v256 a, v256 b) { return c_v256_cmpeq_8(a, b); }
SIMD_INLINE v256 v256_cmpgt_s16(v256 a, v256 b) {
  return c_v256_cmpgt_s16(a, b);
}
SIMD_INLINE v256 v256_cmplt_s16(v256 a, v256 b) {
  return c_v256_cmplt_s16(a, b);
}
SIMD_INLINE v256 v256_cmpeq_16(v256 a, v256 b) { return c_v256_cmpeq_16(a, b); }

SIMD_INLINE v256 v256_shl_8(v256 a, unsigned int c) {
  return c_v256_shl_8(a, c);
}
SIMD_INLINE v256 v256_shr_u8(v256 a, unsigned int c) {
  return c_v256_shr_u8(a, c);
}
SIMD_INLINE v256 v256_shr_s8(v256 a, unsigned int c) {
  return c_v256_shr_s8(a, c);
}
SIMD_INLINE v256 v256_shl_16(v256 a, unsigned int c) {
  return c_v256_shl_16(a, c);
}
SIMD_INLINE v256 v256_shr_u16(v256 a, unsigned int c) {
  return c_v256_shr_u16(a, c);
}
SIMD_INLINE v256 v256_shr_s16(v256 a, unsigned int c) {
  return c_v256_shr_s16(a, c);
}
SIMD_INLINE v256 v256_shl_32(v256 a, unsigned int c) {
  return c_v256_shl_32(a, c);
}
SIMD_INLINE v256 v256_shr_u32(v256 a, unsigned int c) {
  return c_v256_shr_u32(a, c);
}
SIMD_INLINE v256 v256_shr_s32(v256 a, unsigned int c) {
  return c_v256_shr_s32(a, c);
}

SIMD_INLINE v256 v256_shr_n_byte(v256 a, const unsigned int n) {
  return c_v256_shr_n_byte(a, n);
}
SIMD_INLINE v256 v256_shl_n_byte(v256 a, const unsigned int n) {
  return c_v256_shl_n_byte(a, n);
}
SIMD_INLINE v256 v256_shl_n_8(v256 a, const unsigned int n) {
  return c_v256_shl_n_8(a, n);
}
SIMD_INLINE v256 v256_shl_n_16(v256 a, const unsigned int n) {
  return c_v256_shl_n_16(a, n);
}
SIMD_INLINE v256 v256_shl_n_32(v256 a, const unsigned int n) {
  return c_v256_shl_n_32(a, n);
}
SIMD_INLINE v256 v256_shr_n_u8(v256 a, const unsigned int n) {
  return c_v256_shr_n_u8(a, n);
}
SIMD_INLINE v256 v256_shr_n_u16(v256 a, const unsigned int n) {
  return c_v256_shr_n_u16(a, n);
}
SIMD_INLINE v256 v256_shr_n_u32(v256 a, const unsigned int n) {
  return c_v256_shr_n_u32(a, n);
}
SIMD_INLINE v256 v256_shr_n_s8(v256 a, const unsigned int n) {
  return c_v256_shr_n_s8(a, n);
}
SIMD_INLINE v256 v256_shr_n_s16(v256 a, const unsigned int n) {
  return c_v256_shr_n_s16(a, n);
}
SIMD_INLINE v256 v256_shr_n_s32(v256 a, const unsigned int n) {
  return c_v256_shr_n_s32(a, n);
}

#endif /* _V256_INTRINSICS_H */
@@ -1,17 +0,0 @@
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef _V256_INTRINSICS_H
#define _V256_INTRINSICS_H

#include "./v256_intrinsics_v128.h"

#endif /* _V256_INTRINSICS_H */
@@ -1,701 +0,0 @@
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef _V256_INTRINSICS_C_H
#define _V256_INTRINSICS_C_H

#include <stdio.h>
#include <stdlib.h>
#include "./v128_intrinsics_c.h"
#include "./aom_config.h"

typedef union {
  uint8_t u8[32];
  uint16_t u16[16];
  uint32_t u32[8];
  uint64_t u64[4];
  int8_t s8[32];
  int16_t s16[16];
  int32_t s32[8];
  int64_t s64[4];
  c_v64 v64[4];
  c_v128 v128[2];
} c_v256;
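/* Added note (a sketch, not part of the original header): c_v256 is a plain
   union, so the same 32 bytes can be viewed at any lane width. For example,
   on a little-endian target the low byte of u64[0] is visible as u8[0].
   Hypothetical helper: */
SIMD_INLINE uint8_t c_v256_demo_low_byte(void) {
  c_v256 t;
  t.u64[3] = t.u64[2] = t.u64[1] = 0;
  t.u64[0] = 0x1234;
  return t.u8[0]; /* 0x34 on little-endian */
}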

SIMD_INLINE uint32_t c_v256_low_u32(c_v256 a) { return a.u32[0]; }

SIMD_INLINE c_v64 c_v256_low_v64(c_v256 a) { return a.v64[0]; }

SIMD_INLINE c_v128 c_v256_low_v128(c_v256 a) { return a.v128[0]; }

SIMD_INLINE c_v128 c_v256_high_v128(c_v256 a) { return a.v128[1]; }

SIMD_INLINE c_v256 c_v256_from_v128(c_v128 hi, c_v128 lo) {
  c_v256 t;
  t.v128[1] = hi;
  t.v128[0] = lo;
  return t;
}

SIMD_INLINE c_v256 c_v256_from_64(uint64_t a, uint64_t b, uint64_t c,
                                  uint64_t d) {
  c_v256 t;
  t.u64[3] = a;
  t.u64[2] = b;
  t.u64[1] = c;
  t.u64[0] = d;
  return t;
}

SIMD_INLINE c_v256 c_v256_from_v64(c_v64 a, c_v64 b, c_v64 c, c_v64 d) {
  c_v256 t;
  t.u64[3] = a.u64;
  t.u64[2] = b.u64;
  t.u64[1] = c.u64;
  t.u64[0] = d.u64;
  return t;
}

SIMD_INLINE c_v256 c_v256_load_unaligned(const void *p) {
  c_v256 t;
  uint8_t *pp = (uint8_t *)p;
  uint8_t *q = (uint8_t *)&t;
  int c;
  for (c = 0; c < 32; c++) q[c] = pp[c];
  return t;
}

SIMD_INLINE c_v256 c_v256_load_aligned(const void *p) {
  if (simd_check && (uintptr_t)p & 31) {
    fprintf(stderr, "Error: unaligned v256 load at %p\n", p);
    abort();
  }
  return c_v256_load_unaligned(p);
}

SIMD_INLINE void c_v256_store_unaligned(void *p, c_v256 a) {
  uint8_t *pp = (uint8_t *)p;
  uint8_t *q = (uint8_t *)&a;
  int c;
  for (c = 0; c < 32; c++) pp[c] = q[c];
}

SIMD_INLINE void c_v256_store_aligned(void *p, c_v256 a) {
  if (simd_check && (uintptr_t)p & 31) {
    fprintf(stderr, "Error: unaligned v256 store at %p\n", p);
    abort();
  }
  c_v256_store_unaligned(p, a);
}

SIMD_INLINE c_v256 c_v256_zero() {
  c_v256 t;
  t.u64[3] = t.u64[2] = t.u64[1] = t.u64[0] = 0;
  return t;
}

SIMD_INLINE c_v256 c_v256_dup_8(uint8_t x) {
  c_v256 t;
  t.v64[3] = t.v64[2] = t.v64[1] = t.v64[0] = c_v64_dup_8(x);
  return t;
}

SIMD_INLINE c_v256 c_v256_dup_16(uint16_t x) {
  c_v256 t;
  t.v64[3] = t.v64[2] = t.v64[1] = t.v64[0] = c_v64_dup_16(x);
  return t;
}

SIMD_INLINE c_v256 c_v256_dup_32(uint32_t x) {
  c_v256 t;
  t.v64[3] = t.v64[2] = t.v64[1] = t.v64[0] = c_v64_dup_32(x);
  return t;
}

SIMD_INLINE int64_t c_v256_dotp_s16(c_v256 a, c_v256 b) {
  return c_v128_dotp_s16(a.v128[1], b.v128[1]) +
         c_v128_dotp_s16(a.v128[0], b.v128[0]);
}

SIMD_INLINE uint64_t c_v256_hadd_u8(c_v256 a) {
  return c_v128_hadd_u8(a.v128[1]) + c_v128_hadd_u8(a.v128[0]);
}

typedef uint32_t c_sad256_internal;

SIMD_INLINE c_sad256_internal c_v256_sad_u8_init() { return 0; }

/* Implementation dependent return value. Result must be finalised with
   v256_sad_u8_sum().
   The result for more than 16 v256_sad_u8() calls is undefined. */
SIMD_INLINE c_sad256_internal c_v256_sad_u8(c_sad256_internal s, c_v256 a,
                                            c_v256 b) {
  int c;
  for (c = 0; c < 32; c++)
    s += a.u8[c] > b.u8[c] ? a.u8[c] - b.u8[c] : b.u8[c] - a.u8[c];
  return s;
}

SIMD_INLINE uint32_t c_v256_sad_u8_sum(c_sad256_internal s) { return s; }
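/* Added usage sketch (not part of the original header): the SAD accumulator
   follows an init / accumulate / sum protocol, e.g. over a few consecutive
   32-byte rows (hypothetical helper and buffers): */
SIMD_INLINE uint32_t c_v256_demo_sad_rows(const uint8_t *src,
                                          const uint8_t *ref, int rows) {
  c_sad256_internal s = c_v256_sad_u8_init();
  int r;
  for (r = 0; r < rows; r++)
    s = c_v256_sad_u8(s, c_v256_load_unaligned(src + 32 * r),
                      c_v256_load_unaligned(ref + 32 * r));
  return c_v256_sad_u8_sum(s);
}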

typedef uint32_t c_ssd256_internal;

SIMD_INLINE c_ssd256_internal c_v256_ssd_u8_init() { return 0; }

/* Implementation dependent return value. Result must be finalised with
 * v256_ssd_u8_sum(). */
SIMD_INLINE c_ssd256_internal c_v256_ssd_u8(c_ssd256_internal s, c_v256 a,
                                            c_v256 b) {
  int c;
  for (c = 0; c < 32; c++) s += (a.u8[c] - b.u8[c]) * (a.u8[c] - b.u8[c]);
  return s;
}

SIMD_INLINE uint32_t c_v256_ssd_u8_sum(c_ssd256_internal s) { return s; }

SIMD_INLINE c_v256 c_v256_or(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_or(a.v128[1], b.v128[1]),
                          c_v128_or(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_xor(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_xor(a.v128[1], b.v128[1]),
                          c_v128_xor(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_and(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_and(a.v128[1], b.v128[1]),
                          c_v128_and(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_andn(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_andn(a.v128[1], b.v128[1]),
                          c_v128_andn(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_add_8(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_add_8(a.v128[1], b.v128[1]),
                          c_v128_add_8(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_add_16(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_add_16(a.v128[1], b.v128[1]),
                          c_v128_add_16(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_sadd_s16(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_sadd_s16(a.v128[1], b.v128[1]),
                          c_v128_sadd_s16(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_add_32(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_add_32(a.v128[1], b.v128[1]),
                          c_v128_add_32(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_padd_s16(c_v256 a) {
  c_v256 t;
  t.s32[0] = (int32_t)a.s16[0] + (int32_t)a.s16[1];
  t.s32[1] = (int32_t)a.s16[2] + (int32_t)a.s16[3];
  t.s32[2] = (int32_t)a.s16[4] + (int32_t)a.s16[5];
  t.s32[3] = (int32_t)a.s16[6] + (int32_t)a.s16[7];
  t.s32[4] = (int32_t)a.s16[8] + (int32_t)a.s16[9];
  t.s32[5] = (int32_t)a.s16[10] + (int32_t)a.s16[11];
  t.s32[6] = (int32_t)a.s16[12] + (int32_t)a.s16[13];
  t.s32[7] = (int32_t)a.s16[14] + (int32_t)a.s16[15];
  return t;
}

SIMD_INLINE c_v256 c_v256_sub_8(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_sub_8(a.v128[1], b.v128[1]),
                          c_v128_sub_8(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_ssub_u8(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_ssub_u8(a.v128[1], b.v128[1]),
                          c_v128_ssub_u8(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_ssub_s8(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_ssub_s8(a.v128[1], b.v128[1]),
                          c_v128_ssub_s8(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_sub_16(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_sub_16(a.v128[1], b.v128[1]),
                          c_v128_sub_16(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_ssub_s16(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_ssub_s16(a.v128[1], b.v128[1]),
                          c_v128_ssub_s16(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_sub_32(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_sub_32(a.v128[1], b.v128[1]),
                          c_v128_sub_32(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_abs_s16(c_v256 a) {
  return c_v256_from_v128(c_v128_abs_s16(a.v128[1]), c_v128_abs_s16(a.v128[0]));
}

SIMD_INLINE c_v256 c_v256_mul_s16(c_v128 a, c_v128 b) {
  c_v128 lo_bits = c_v128_mullo_s16(a, b);
  c_v128 hi_bits = c_v128_mulhi_s16(a, b);
  return c_v256_from_v128(c_v128_ziphi_16(hi_bits, lo_bits),
                          c_v128_ziplo_16(hi_bits, lo_bits));
}

SIMD_INLINE c_v256 c_v256_mullo_s16(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_mullo_s16(a.v128[1], b.v128[1]),
                          c_v128_mullo_s16(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_mulhi_s16(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_mulhi_s16(a.v128[1], b.v128[1]),
                          c_v128_mulhi_s16(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_mullo_s32(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_mullo_s32(a.v128[1], b.v128[1]),
                          c_v128_mullo_s32(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_madd_s16(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_madd_s16(a.v128[1], b.v128[1]),
                          c_v128_madd_s16(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_madd_us8(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_madd_us8(a.v128[1], b.v128[1]),
                          c_v128_madd_us8(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_avg_u8(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_avg_u8(a.v128[1], b.v128[1]),
                          c_v128_avg_u8(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_rdavg_u8(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_rdavg_u8(a.v128[1], b.v128[1]),
                          c_v128_rdavg_u8(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_avg_u16(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_avg_u16(a.v128[1], b.v128[1]),
                          c_v128_avg_u16(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_min_u8(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_min_u8(a.v128[1], b.v128[1]),
                          c_v128_min_u8(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_max_u8(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_max_u8(a.v128[1], b.v128[1]),
                          c_v128_max_u8(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_min_s8(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_min_s8(a.v128[1], b.v128[1]),
                          c_v128_min_s8(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_max_s8(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_max_s8(a.v128[1], b.v128[1]),
                          c_v128_max_s8(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_min_s16(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_min_s16(a.v128[1], b.v128[1]),
                          c_v128_min_s16(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_max_s16(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_max_s16(a.v128[1], b.v128[1]),
                          c_v128_max_s16(a.v128[0], b.v128[0]));
}
SIMD_INLINE c_v256 c_v256_ziplo_8(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_ziphi_8(a.v128[0], b.v128[0]),
                          c_v128_ziplo_8(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_ziphi_8(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_ziphi_8(a.v128[1], b.v128[1]),
                          c_v128_ziplo_8(a.v128[1], b.v128[1]));
}

SIMD_INLINE c_v256 c_v256_ziplo_16(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_ziphi_16(a.v128[0], b.v128[0]),
                          c_v128_ziplo_16(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_ziphi_16(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_ziphi_16(a.v128[1], b.v128[1]),
                          c_v128_ziplo_16(a.v128[1], b.v128[1]));
}

SIMD_INLINE c_v256 c_v256_ziplo_32(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_ziphi_32(a.v128[0], b.v128[0]),
                          c_v128_ziplo_32(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_ziphi_32(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_ziphi_32(a.v128[1], b.v128[1]),
                          c_v128_ziplo_32(a.v128[1], b.v128[1]));
}

SIMD_INLINE c_v256 c_v256_ziplo_64(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_ziphi_64(a.v128[0], b.v128[0]),
                          c_v128_ziplo_64(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_ziphi_64(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_ziphi_64(a.v128[1], b.v128[1]),
                          c_v128_ziplo_64(a.v128[1], b.v128[1]));
}

SIMD_INLINE c_v256 c_v256_ziplo_128(c_v256 a, c_v256 b) {
  return c_v256_from_v128(a.v128[0], b.v128[0]);
}

SIMD_INLINE c_v256 c_v256_ziphi_128(c_v256 a, c_v256 b) {
  return c_v256_from_v128(a.v128[1], b.v128[1]);
}

SIMD_INLINE c_v256 c_v256_zip_8(c_v128 a, c_v128 b) {
  return c_v256_from_v128(c_v128_ziphi_8(a, b), c_v128_ziplo_8(a, b));
}

SIMD_INLINE c_v256 c_v256_zip_16(c_v128 a, c_v128 b) {
  return c_v256_from_v128(c_v128_ziphi_16(a, b), c_v128_ziplo_16(a, b));
}

SIMD_INLINE c_v256 c_v256_zip_32(c_v128 a, c_v128 b) {
  return c_v256_from_v128(c_v128_ziphi_32(a, b), c_v128_ziplo_32(a, b));
}
SIMD_INLINE c_v256 _c_v256_unzip_8(c_v256 a, c_v256 b, int mode) {
  c_v256 t;
  int i;
  if (mode) {
    for (i = 0; i < 16; i++) {
      t.u8[i] = a.u8[i * 2 + 1];
      t.u8[i + 16] = b.u8[i * 2 + 1];
    }
  } else {
    for (i = 0; i < 16; i++) {
      t.u8[i] = b.u8[i * 2];
      t.u8[i + 16] = a.u8[i * 2];
    }
  }
  return t;
}
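/* Added note (a sketch, not part of the original header): mode picks the byte
   phase kept by the deinterleave: mode == 1 keeps the odd-indexed bytes (a's
   in the low half of the result, b's in the high half), mode == 0 keeps the
   even-indexed bytes with the operands swapped. The unziplo/unziphi wrappers
   below choose mode and operand order per endianness. */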

SIMD_INLINE c_v256 c_v256_unziplo_8(c_v256 a, c_v256 b) {
  return CONFIG_BIG_ENDIAN ? _c_v256_unzip_8(a, b, 1)
                           : _c_v256_unzip_8(a, b, 0);
}

SIMD_INLINE c_v256 c_v256_unziphi_8(c_v256 a, c_v256 b) {
  return CONFIG_BIG_ENDIAN ? _c_v256_unzip_8(b, a, 0)
                           : _c_v256_unzip_8(b, a, 1);
}

SIMD_INLINE c_v256 _c_v256_unzip_16(c_v256 a, c_v256 b, int mode) {
  c_v256 t;
  int i;
  if (mode) {
    for (i = 0; i < 8; i++) {
      t.u16[i] = a.u16[i * 2 + 1];
      t.u16[i + 8] = b.u16[i * 2 + 1];
    }
  } else {
    for (i = 0; i < 8; i++) {
      t.u16[i] = b.u16[i * 2];
      t.u16[i + 8] = a.u16[i * 2];
    }
  }
  return t;
}

SIMD_INLINE c_v256 c_v256_unziplo_16(c_v256 a, c_v256 b) {
  return CONFIG_BIG_ENDIAN ? _c_v256_unzip_16(a, b, 1)
                           : _c_v256_unzip_16(a, b, 0);
}

SIMD_INLINE c_v256 c_v256_unziphi_16(c_v256 a, c_v256 b) {
  return CONFIG_BIG_ENDIAN ? _c_v256_unzip_16(b, a, 0)
                           : _c_v256_unzip_16(b, a, 1);
}

SIMD_INLINE c_v256 _c_v256_unzip_32(c_v256 a, c_v256 b, int mode) {
  c_v256 t;
  if (mode) {
    t.u32[7] = b.u32[7];
    t.u32[6] = b.u32[5];
    t.u32[5] = b.u32[3];
    t.u32[4] = b.u32[1];
    t.u32[3] = a.u32[7];
    t.u32[2] = a.u32[5];
    t.u32[1] = a.u32[3];
    t.u32[0] = a.u32[1];
  } else {
    t.u32[7] = a.u32[6];
    t.u32[6] = a.u32[4];
    t.u32[5] = a.u32[2];
    t.u32[4] = a.u32[0];
    t.u32[3] = b.u32[6];
    t.u32[2] = b.u32[4];
    t.u32[1] = b.u32[2];
    t.u32[0] = b.u32[0];
  }
  return t;
}

SIMD_INLINE c_v256 c_v256_unziplo_32(c_v256 a, c_v256 b) {
  return CONFIG_BIG_ENDIAN ? _c_v256_unzip_32(a, b, 1)
                           : _c_v256_unzip_32(a, b, 0);
}

SIMD_INLINE c_v256 c_v256_unziphi_32(c_v256 a, c_v256 b) {
  return CONFIG_BIG_ENDIAN ? _c_v256_unzip_32(b, a, 0)
                           : _c_v256_unzip_32(b, a, 1);
}

SIMD_INLINE c_v256 c_v256_unpack_u8_s16(c_v128 a) {
  return c_v256_from_v128(c_v128_unpackhi_u8_s16(a), c_v128_unpacklo_u8_s16(a));
}

SIMD_INLINE c_v256 c_v256_unpacklo_u8_s16(c_v256 a) {
  return c_v256_from_v128(c_v128_unpackhi_u8_s16(a.v128[0]),
                          c_v128_unpacklo_u8_s16(a.v128[0]));
}

SIMD_INLINE c_v256 c_v256_unpackhi_u8_s16(c_v256 a) {
  return c_v256_from_v128(c_v128_unpackhi_u8_s16(a.v128[1]),
                          c_v128_unpacklo_u8_s16(a.v128[1]));
}

SIMD_INLINE c_v256 c_v256_pack_s32_s16(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_pack_s32_s16(a.v128[1], a.v128[0]),
                          c_v128_pack_s32_s16(b.v128[1], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_pack_s16_u8(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_pack_s16_u8(a.v128[1], a.v128[0]),
                          c_v128_pack_s16_u8(b.v128[1], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_pack_s16_s8(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_pack_s16_s8(a.v128[1], a.v128[0]),
                          c_v128_pack_s16_s8(b.v128[1], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_unpack_u16_s32(c_v128 a) {
  return c_v256_from_v128(c_v128_unpackhi_u16_s32(a),
                          c_v128_unpacklo_u16_s32(a));
}

SIMD_INLINE c_v256 c_v256_unpack_s16_s32(c_v128 a) {
  return c_v256_from_v128(c_v128_unpackhi_s16_s32(a),
                          c_v128_unpacklo_s16_s32(a));
}

SIMD_INLINE c_v256 c_v256_unpacklo_u16_s32(c_v256 a) {
  return c_v256_from_v128(c_v128_unpackhi_u16_s32(a.v128[0]),
                          c_v128_unpacklo_u16_s32(a.v128[0]));
}

SIMD_INLINE c_v256 c_v256_unpacklo_s16_s32(c_v256 a) {
  return c_v256_from_v128(c_v128_unpackhi_s16_s32(a.v128[0]),
                          c_v128_unpacklo_s16_s32(a.v128[0]));
}

SIMD_INLINE c_v256 c_v256_unpackhi_u16_s32(c_v256 a) {
  return c_v256_from_v128(c_v128_unpackhi_u16_s32(a.v128[1]),
                          c_v128_unpacklo_u16_s32(a.v128[1]));
}

SIMD_INLINE c_v256 c_v256_unpackhi_s16_s32(c_v256 a) {
  return c_v256_from_v128(c_v128_unpackhi_s16_s32(a.v128[1]),
                          c_v128_unpacklo_s16_s32(a.v128[1]));
}

SIMD_INLINE c_v256 c_v256_shuffle_8(c_v256 a, c_v256 pattern) {
  c_v256 t;
  int c;
  for (c = 0; c < 32; c++) {
    if (pattern.u8[c] & ~31) {
      fprintf(stderr, "Undefined v256_shuffle_8 index %d/%d\n", pattern.u8[c],
              c);
      abort();
    }
    t.u8[c] = a.u8[CONFIG_BIG_ENDIAN ? 31 - (pattern.u8[c] & 31)
                                     : pattern.u8[c] & 31];
  }
  return t;
}

// Pairwise / dual-lane shuffle: shuffle the two 128 bit lanes independently.
SIMD_INLINE c_v256 c_v256_pshuffle_8(c_v256 a, c_v256 pattern) {
  return c_v256_from_v128(
      c_v128_shuffle_8(c_v256_high_v128(a), c_v256_high_v128(pattern)),
      c_v128_shuffle_8(c_v256_low_v128(a), c_v256_low_v128(pattern)));
}
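/* Added note (a sketch, not part of the original header): unlike
   c_v256_shuffle_8, whose indices 0..31 select from the full 256-bit vector,
   c_v256_pshuffle_8 shuffles each 128-bit half with the matching half of the
   pattern, so indices only select within their own half. E.g. an all-zero
   pattern broadcasts byte 0 of each half (hypothetical helper): */
SIMD_INLINE c_v256 c_v256_demo_broadcast_halves(c_v256 a) {
  return c_v256_pshuffle_8(a, c_v256_dup_8(0));
}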

SIMD_INLINE c_v256 c_v256_cmpgt_s8(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_cmpgt_s8(a.v128[1], b.v128[1]),
                          c_v128_cmpgt_s8(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_cmplt_s8(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_cmplt_s8(a.v128[1], b.v128[1]),
                          c_v128_cmplt_s8(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_cmpeq_8(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_cmpeq_8(a.v128[1], b.v128[1]),
                          c_v128_cmpeq_8(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_cmpgt_s16(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_cmpgt_s16(a.v128[1], b.v128[1]),
                          c_v128_cmpgt_s16(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_cmplt_s16(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_cmplt_s16(a.v128[1], b.v128[1]),
                          c_v128_cmplt_s16(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_cmpeq_16(c_v256 a, c_v256 b) {
  return c_v256_from_v128(c_v128_cmpeq_16(a.v128[1], b.v128[1]),
                          c_v128_cmpeq_16(a.v128[0], b.v128[0]));
}

SIMD_INLINE c_v256 c_v256_shl_n_byte(c_v256 a, const unsigned int n) {
  if (n < 16)
    return c_v256_from_v128(c_v128_or(c_v128_shl_n_byte(a.v128[1], n),
                                      c_v128_shr_n_byte(a.v128[0], 16 - n)),
                            c_v128_shl_n_byte(a.v128[0], n));
  else if (n > 16)
    return c_v256_from_v128(c_v128_shl_n_byte(a.v128[0], n - 16),
                            c_v128_zero());
  else
    return c_v256_from_v128(c_v256_low_v128(a), c_v128_zero());
}

SIMD_INLINE c_v256 c_v256_shr_n_byte(c_v256 a, const unsigned int n) {
  if (n < 16)
    return c_v256_from_v128(c_v128_shr_n_byte(a.v128[1], n),
                            c_v128_or(c_v128_shr_n_byte(a.v128[0], n),
                                      c_v128_shl_n_byte(a.v128[1], 16 - n)));
  else if (n > 16)
    return c_v256_from_v128(c_v128_zero(),
                            c_v128_shr_n_byte(a.v128[1], n - 16));
  else
    return c_v256_from_v128(c_v128_zero(), c_v256_high_v128(a));
}

SIMD_INLINE c_v256 c_v256_align(c_v256 a, c_v256 b, const unsigned int c) {
  if (simd_check && c > 31) {
    fprintf(stderr, "Error: undefined alignment %d\n", c);
    abort();
  }
  return c ? c_v256_or(c_v256_shr_n_byte(b, c), c_v256_shl_n_byte(a, 32 - c))
           : b;
}
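/* Added usage sketch (not part of the original header): c_v256_align treats
   a:b as a 64-byte concatenation and extracts 32 bytes starting c bytes into
   b, i.e. bytes b[c..31] followed by a[0..c-1]. For instance, c == 1 shifts
   in the low byte of a (hypothetical helper): */
SIMD_INLINE c_v256 c_v256_demo_align1(c_v256 a, c_v256 b) {
  return c_v256_align(a, b, 1); /* result.u8[31] == a.u8[0] */
}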

SIMD_INLINE c_v256 c_v256_shl_8(c_v256 a, const unsigned int c) {
  return c_v256_from_v128(c_v128_shl_8(a.v128[1], c),
                          c_v128_shl_8(a.v128[0], c));
}

SIMD_INLINE c_v256 c_v256_shr_u8(c_v256 a, const unsigned int c) {
  return c_v256_from_v128(c_v128_shr_u8(a.v128[1], c),
                          c_v128_shr_u8(a.v128[0], c));
}

SIMD_INLINE c_v256 c_v256_shr_s8(c_v256 a, const unsigned int c) {
  return c_v256_from_v128(c_v128_shr_s8(a.v128[1], c),
                          c_v128_shr_s8(a.v128[0], c));
}

SIMD_INLINE c_v256 c_v256_shl_16(c_v256 a, const unsigned int c) {
  return c_v256_from_v128(c_v128_shl_16(a.v128[1], c),
                          c_v128_shl_16(a.v128[0], c));
}

SIMD_INLINE c_v256 c_v256_shr_u16(c_v256 a, const unsigned int c) {
  return c_v256_from_v128(c_v128_shr_u16(a.v128[1], c),
                          c_v128_shr_u16(a.v128[0], c));
}

SIMD_INLINE c_v256 c_v256_shr_s16(c_v256 a, const unsigned int c) {
  return c_v256_from_v128(c_v128_shr_s16(a.v128[1], c),
                          c_v128_shr_s16(a.v128[0], c));
}

SIMD_INLINE c_v256 c_v256_shl_32(c_v256 a, const unsigned int c) {
  return c_v256_from_v128(c_v128_shl_32(a.v128[1], c),
                          c_v128_shl_32(a.v128[0], c));
}

SIMD_INLINE c_v256 c_v256_shr_u32(c_v256 a, const unsigned int c) {
  return c_v256_from_v128(c_v128_shr_u32(a.v128[1], c),
                          c_v128_shr_u32(a.v128[0], c));
}

SIMD_INLINE c_v256 c_v256_shr_s32(c_v256 a, const unsigned int c) {
  return c_v256_from_v128(c_v128_shr_s32(a.v128[1], c),
                          c_v128_shr_s32(a.v128[0], c));
}

SIMD_INLINE c_v256 c_v256_shl_n_8(c_v256 a, const unsigned int n) {
  return c_v256_shl_8(a, n);
}

SIMD_INLINE c_v256 c_v256_shl_n_16(c_v256 a, const unsigned int n) {
  return c_v256_shl_16(a, n);
}

SIMD_INLINE c_v256 c_v256_shl_n_32(c_v256 a, const unsigned int n) {
  return c_v256_shl_32(a, n);
}

SIMD_INLINE c_v256 c_v256_shr_n_u8(c_v256 a, const unsigned int n) {
  return c_v256_shr_u8(a, n);
}

SIMD_INLINE c_v256 c_v256_shr_n_u16(c_v256 a, const unsigned int n) {
  return c_v256_shr_u16(a, n);
}

SIMD_INLINE c_v256 c_v256_shr_n_u32(c_v256 a, const unsigned int n) {
  return c_v256_shr_u32(a, n);
}

SIMD_INLINE c_v256 c_v256_shr_n_s8(c_v256 a, const unsigned int n) {
  return c_v256_shr_s8(a, n);
}

SIMD_INLINE c_v256 c_v256_shr_n_s16(c_v256 a, const unsigned int n) {
  return c_v256_shr_s16(a, n);
}

SIMD_INLINE c_v256 c_v256_shr_n_s32(c_v256 a, const unsigned int n) {
  return c_v256_shr_s32(a, n);
}

#endif /* _V256_INTRINSICS_C_H */
@@ -1,525 +0,0 @@
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef _V256_INTRINSICS_V128_H
#define _V256_INTRINSICS_V128_H

#if HAVE_NEON
#include "./v128_intrinsics_arm.h"
#elif HAVE_SSE2
#include "./v128_intrinsics_x86.h"
#else
#include "./v128_intrinsics.h"
#endif

typedef struct { v128 lo, hi; } v256;

SIMD_INLINE uint32_t v256_low_u32(v256 a) { return v128_low_u32(a.lo); }

SIMD_INLINE v64 v256_low_v64(v256 a) { return v128_low_v64(a.lo); }

SIMD_INLINE v128 v256_low_v128(v256 a) { return a.lo; }

SIMD_INLINE v128 v256_high_v128(v256 a) { return a.hi; }

SIMD_INLINE v256 v256_from_v128(v128 hi, v128 lo) {
  v256 t;
  t.hi = hi;
  t.lo = lo;
  return t;
}

SIMD_INLINE v256 v256_from_64(uint64_t a, uint64_t b, uint64_t c, uint64_t d) {
  return v256_from_v128(v128_from_64(a, b), v128_from_64(c, d));
}

SIMD_INLINE v256 v256_from_v64(v64 a, v64 b, v64 c, v64 d) {
  return v256_from_v128(v128_from_v64(a, b), v128_from_v64(c, d));
}

SIMD_INLINE v256 v256_load_unaligned(const void *p) {
  return v256_from_v128(v128_load_unaligned((uint8_t *)p + 16),
                        v128_load_unaligned(p));
}

SIMD_INLINE v256 v256_load_aligned(const void *p) {
  return v256_from_v128(v128_load_aligned((uint8_t *)p + 16),
                        v128_load_aligned(p));
}

SIMD_INLINE void v256_store_unaligned(void *p, v256 a) {
  v128_store_unaligned(p, a.lo);
  v128_store_unaligned((uint8_t *)p + 16, a.hi);
}

SIMD_INLINE void v256_store_aligned(void *p, v256 a) {
  v128_store_aligned(p, a.lo);
  v128_store_aligned((uint8_t *)p + 16, a.hi);
}

SIMD_INLINE v256 v256_zero() {
  return v256_from_v128(v128_zero(), v128_zero());
}

SIMD_INLINE v256 v256_dup_8(uint8_t x) {
  v128 t = v128_dup_8(x);
  return v256_from_v128(t, t);
}

SIMD_INLINE v256 v256_dup_16(uint16_t x) {
  v128 t = v128_dup_16(x);
  return v256_from_v128(t, t);
}

SIMD_INLINE v256 v256_dup_32(uint32_t x) {
  v128 t = v128_dup_32(x);
  return v256_from_v128(t, t);
}

SIMD_INLINE int64_t v256_dotp_s16(v256 a, v256 b) {
  return v128_dotp_s16(a.hi, b.hi) + v128_dotp_s16(a.lo, b.lo);
}

SIMD_INLINE uint64_t v256_hadd_u8(v256 a) {
  return v128_hadd_u8(a.hi) + v128_hadd_u8(a.lo);
}

typedef struct {
  sad128_internal hi;
  sad128_internal lo;
} sad256_internal;

SIMD_INLINE sad256_internal v256_sad_u8_init() {
  sad256_internal t;
  t.hi = v128_sad_u8_init();
  t.lo = v128_sad_u8_init();
  return t;
}

/* Implementation dependent return value. Result must be finalised with
   v256_sad_u8_sum().
   The result for more than 16 v256_sad_u8() calls is undefined. */
SIMD_INLINE sad256_internal v256_sad_u8(sad256_internal s, v256 a, v256 b) {
  sad256_internal t;
  t.hi = v128_sad_u8(s.hi, a.hi, b.hi);
  t.lo = v128_sad_u8(s.lo, a.lo, b.lo);
  return t;
}

SIMD_INLINE uint32_t v256_sad_u8_sum(sad256_internal s) {
  return v128_sad_u8_sum(s.hi) + v128_sad_u8_sum(s.lo);
}

typedef struct {
  ssd128_internal hi;
  ssd128_internal lo;
} ssd256_internal;

SIMD_INLINE ssd256_internal v256_ssd_u8_init() {
  ssd256_internal t;
  t.hi = v128_ssd_u8_init();
  t.lo = v128_ssd_u8_init();
  return t;
}

/* Implementation dependent return value. Result must be finalised with
 * v256_ssd_u8_sum(). */
SIMD_INLINE ssd256_internal v256_ssd_u8(ssd256_internal s, v256 a, v256 b) {
  ssd256_internal t;
  t.hi = v128_ssd_u8(s.hi, a.hi, b.hi);
  t.lo = v128_ssd_u8(s.lo, a.lo, b.lo);
  return t;
}

SIMD_INLINE uint32_t v256_ssd_u8_sum(ssd256_internal s) {
  return v128_ssd_u8_sum(s.hi) + v128_ssd_u8_sum(s.lo);
}
SIMD_INLINE v256 v256_or(v256 a, v256 b) {
  return v256_from_v128(v128_or(a.hi, b.hi), v128_or(a.lo, b.lo));
}

SIMD_INLINE v256 v256_xor(v256 a, v256 b) {
  return v256_from_v128(v128_xor(a.hi, b.hi), v128_xor(a.lo, b.lo));
}

SIMD_INLINE v256 v256_and(v256 a, v256 b) {
  return v256_from_v128(v128_and(a.hi, b.hi), v128_and(a.lo, b.lo));
}

SIMD_INLINE v256 v256_andn(v256 a, v256 b) {
  return v256_from_v128(v128_andn(a.hi, b.hi), v128_andn(a.lo, b.lo));
}

SIMD_INLINE v256 v256_add_8(v256 a, v256 b) {
  return v256_from_v128(v128_add_8(a.hi, b.hi), v128_add_8(a.lo, b.lo));
}

SIMD_INLINE v256 v256_add_16(v256 a, v256 b) {
  return v256_from_v128(v128_add_16(a.hi, b.hi), v128_add_16(a.lo, b.lo));
}

SIMD_INLINE v256 v256_sadd_s16(v256 a, v256 b) {
  return v256_from_v128(v128_sadd_s16(a.hi, b.hi), v128_sadd_s16(a.lo, b.lo));
}

SIMD_INLINE v256 v256_add_32(v256 a, v256 b) {
  return v256_from_v128(v128_add_32(a.hi, b.hi), v128_add_32(a.lo, b.lo));
}

SIMD_INLINE v256 v256_padd_s16(v256 a) {
  return v256_from_v128(v128_padd_s16(a.hi), v128_padd_s16(a.lo));
}

SIMD_INLINE v256 v256_sub_8(v256 a, v256 b) {
  return v256_from_v128(v128_sub_8(a.hi, b.hi), v128_sub_8(a.lo, b.lo));
}

SIMD_INLINE v256 v256_ssub_u8(v256 a, v256 b) {
  return v256_from_v128(v128_ssub_u8(a.hi, b.hi), v128_ssub_u8(a.lo, b.lo));
}

SIMD_INLINE v256 v256_ssub_s8(v256 a, v256 b) {
  return v256_from_v128(v128_ssub_s8(a.hi, b.hi), v128_ssub_s8(a.lo, b.lo));
}

SIMD_INLINE v256 v256_sub_16(v256 a, v256 b) {
  return v256_from_v128(v128_sub_16(a.hi, b.hi), v128_sub_16(a.lo, b.lo));
}

SIMD_INLINE v256 v256_ssub_s16(v256 a, v256 b) {
  return v256_from_v128(v128_ssub_s16(a.hi, b.hi), v128_ssub_s16(a.lo, b.lo));
}

SIMD_INLINE v256 v256_sub_32(v256 a, v256 b) {
  return v256_from_v128(v128_sub_32(a.hi, b.hi), v128_sub_32(a.lo, b.lo));
}

SIMD_INLINE v256 v256_abs_s16(v256 a) {
  return v256_from_v128(v128_abs_s16(a.hi), v128_abs_s16(a.lo));
}

SIMD_INLINE v256 v256_mul_s16(v128 a, v128 b) {
  v128 lo_bits = v128_mullo_s16(a, b);
  v128 hi_bits = v128_mulhi_s16(a, b);
  return v256_from_v128(v128_ziphi_16(hi_bits, lo_bits),
                        v128_ziplo_16(hi_bits, lo_bits));
}

SIMD_INLINE v256 v256_mullo_s16(v256 a, v256 b) {
  return v256_from_v128(v128_mullo_s16(a.hi, b.hi), v128_mullo_s16(a.lo, b.lo));
}

SIMD_INLINE v256 v256_mulhi_s16(v256 a, v256 b) {
  return v256_from_v128(v128_mulhi_s16(a.hi, b.hi), v128_mulhi_s16(a.lo, b.lo));
}

SIMD_INLINE v256 v256_mullo_s32(v256 a, v256 b) {
  return v256_from_v128(v128_mullo_s32(a.hi, b.hi), v128_mullo_s32(a.lo, b.lo));
}

SIMD_INLINE v256 v256_madd_s16(v256 a, v256 b) {
  return v256_from_v128(v128_madd_s16(a.hi, b.hi), v128_madd_s16(a.lo, b.lo));
}

SIMD_INLINE v256 v256_madd_us8(v256 a, v256 b) {
  return v256_from_v128(v128_madd_us8(a.hi, b.hi), v128_madd_us8(a.lo, b.lo));
}

SIMD_INLINE v256 v256_avg_u8(v256 a, v256 b) {
  return v256_from_v128(v128_avg_u8(a.hi, b.hi), v128_avg_u8(a.lo, b.lo));
}

SIMD_INLINE v256 v256_rdavg_u8(v256 a, v256 b) {
  return v256_from_v128(v128_rdavg_u8(a.hi, b.hi), v128_rdavg_u8(a.lo, b.lo));
}

SIMD_INLINE v256 v256_avg_u16(v256 a, v256 b) {
  return v256_from_v128(v128_avg_u16(a.hi, b.hi), v128_avg_u16(a.lo, b.lo));
}

SIMD_INLINE v256 v256_min_u8(v256 a, v256 b) {
  return v256_from_v128(v128_min_u8(a.hi, b.hi), v128_min_u8(a.lo, b.lo));
}

SIMD_INLINE v256 v256_max_u8(v256 a, v256 b) {
  return v256_from_v128(v128_max_u8(a.hi, b.hi), v128_max_u8(a.lo, b.lo));
}

SIMD_INLINE v256 v256_min_s8(v256 a, v256 b) {
  return v256_from_v128(v128_min_s8(a.hi, b.hi), v128_min_s8(a.lo, b.lo));
}

SIMD_INLINE v256 v256_max_s8(v256 a, v256 b) {
  return v256_from_v128(v128_max_s8(a.hi, b.hi), v128_max_s8(a.lo, b.lo));
}

SIMD_INLINE v256 v256_min_s16(v256 a, v256 b) {
  return v256_from_v128(v128_min_s16(a.hi, b.hi), v128_min_s16(a.lo, b.lo));
}

SIMD_INLINE v256 v256_max_s16(v256 a, v256 b) {
  return v256_from_v128(v128_max_s16(a.hi, b.hi), v128_max_s16(a.lo, b.lo));
}
SIMD_INLINE v256 v256_ziplo_8(v256 a, v256 b) {
  return v256_from_v128(v128_ziphi_8(a.lo, b.lo), v128_ziplo_8(a.lo, b.lo));
}

SIMD_INLINE v256 v256_ziphi_8(v256 a, v256 b) {
  return v256_from_v128(v128_ziphi_8(a.hi, b.hi), v128_ziplo_8(a.hi, b.hi));
}

SIMD_INLINE v256 v256_ziplo_16(v256 a, v256 b) {
  return v256_from_v128(v128_ziphi_16(a.lo, b.lo), v128_ziplo_16(a.lo, b.lo));
}

SIMD_INLINE v256 v256_ziphi_16(v256 a, v256 b) {
  return v256_from_v128(v128_ziphi_16(a.hi, b.hi), v128_ziplo_16(a.hi, b.hi));
}

SIMD_INLINE v256 v256_ziplo_32(v256 a, v256 b) {
  return v256_from_v128(v128_ziphi_32(a.lo, b.lo), v128_ziplo_32(a.lo, b.lo));
}

SIMD_INLINE v256 v256_ziphi_32(v256 a, v256 b) {
  return v256_from_v128(v128_ziphi_32(a.hi, b.hi), v128_ziplo_32(a.hi, b.hi));
}

SIMD_INLINE v256 v256_ziplo_64(v256 a, v256 b) {
  return v256_from_v128(v128_ziphi_64(a.lo, b.lo), v128_ziplo_64(a.lo, b.lo));
}

SIMD_INLINE v256 v256_ziphi_64(v256 a, v256 b) {
  return v256_from_v128(v128_ziphi_64(a.hi, b.hi), v128_ziplo_64(a.hi, b.hi));
}

SIMD_INLINE v256 v256_ziplo_128(v256 a, v256 b) {
  return v256_from_v128(a.lo, b.lo);
}

SIMD_INLINE v256 v256_ziphi_128(v256 a, v256 b) {
  return v256_from_v128(a.hi, b.hi);
}

SIMD_INLINE v256 v256_zip_8(v128 a, v128 b) {
  return v256_from_v128(v128_ziphi_8(a, b), v128_ziplo_8(a, b));
}

SIMD_INLINE v256 v256_zip_16(v128 a, v128 b) {
  return v256_from_v128(v128_ziphi_16(a, b), v128_ziplo_16(a, b));
}

SIMD_INLINE v256 v256_zip_32(v128 a, v128 b) {
  return v256_from_v128(v128_ziphi_32(a, b), v128_ziplo_32(a, b));
}

SIMD_INLINE v256 v256_unziplo_8(v256 a, v256 b) {
  return v256_from_v128(v128_unziplo_8(a.hi, a.lo), v128_unziplo_8(b.hi, b.lo));
}

SIMD_INLINE v256 v256_unziphi_8(v256 a, v256 b) {
  return v256_from_v128(v128_unziphi_8(a.hi, a.lo), v128_unziphi_8(b.hi, b.lo));
}

SIMD_INLINE v256 v256_unziplo_16(v256 a, v256 b) {
  return v256_from_v128(v128_unziplo_16(a.hi, a.lo),
                        v128_unziplo_16(b.hi, b.lo));
}

SIMD_INLINE v256 v256_unziphi_16(v256 a, v256 b) {
  return v256_from_v128(v128_unziphi_16(a.hi, a.lo),
                        v128_unziphi_16(b.hi, b.lo));
}

SIMD_INLINE v256 v256_unziplo_32(v256 a, v256 b) {
  return v256_from_v128(v128_unziplo_32(a.hi, a.lo),
                        v128_unziplo_32(b.hi, b.lo));
}

SIMD_INLINE v256 v256_unziphi_32(v256 a, v256 b) {
  return v256_from_v128(v128_unziphi_32(a.hi, a.lo),
                        v128_unziphi_32(b.hi, b.lo));
}

SIMD_INLINE v256 v256_unpack_u8_s16(v128 a) {
  return v256_from_v128(v128_unpackhi_u8_s16(a), v128_unpacklo_u8_s16(a));
}

SIMD_INLINE v256 v256_unpacklo_u8_s16(v256 a) {
  return v256_from_v128(v128_unpackhi_u8_s16(a.lo), v128_unpacklo_u8_s16(a.lo));
}

SIMD_INLINE v256 v256_unpackhi_u8_s16(v256 a) {
  return v256_from_v128(v128_unpackhi_u8_s16(a.hi), v128_unpacklo_u8_s16(a.hi));
}

SIMD_INLINE v256 v256_pack_s32_s16(v256 a, v256 b) {
  return v256_from_v128(v128_pack_s32_s16(a.hi, a.lo),
                        v128_pack_s32_s16(b.hi, b.lo));
}

SIMD_INLINE v256 v256_pack_s16_u8(v256 a, v256 b) {
  return v256_from_v128(v128_pack_s16_u8(a.hi, a.lo),
                        v128_pack_s16_u8(b.hi, b.lo));
}

SIMD_INLINE v256 v256_pack_s16_s8(v256 a, v256 b) {
  return v256_from_v128(v128_pack_s16_s8(a.hi, a.lo),
                        v128_pack_s16_s8(b.hi, b.lo));
}

SIMD_INLINE v256 v256_unpack_u16_s32(v128 a) {
  return v256_from_v128(v128_unpackhi_u16_s32(a), v128_unpacklo_u16_s32(a));
}

SIMD_INLINE v256 v256_unpack_s16_s32(v128 a) {
  return v256_from_v128(v128_unpackhi_s16_s32(a), v128_unpacklo_s16_s32(a));
}

SIMD_INLINE v256 v256_unpacklo_u16_s32(v256 a) {
  return v256_from_v128(v128_unpackhi_u16_s32(a.lo),
                        v128_unpacklo_u16_s32(a.lo));
}

SIMD_INLINE v256 v256_unpacklo_s16_s32(v256 a) {
  return v256_from_v128(v128_unpackhi_s16_s32(a.lo),
                        v128_unpacklo_s16_s32(a.lo));
}

SIMD_INLINE v256 v256_unpackhi_u16_s32(v256 a) {
  return v256_from_v128(v128_unpackhi_u16_s32(a.hi),
                        v128_unpacklo_u16_s32(a.hi));
}

SIMD_INLINE v256 v256_unpackhi_s16_s32(v256 a) {
  return v256_from_v128(v128_unpackhi_s16_s32(a.hi),
                        v128_unpacklo_s16_s32(a.hi));
}

SIMD_INLINE v256 v256_shuffle_8(v256 a, v256 pattern) {
  v128 c16 = v128_dup_8(16);
  v128 maskhi = v128_cmplt_s8(pattern.hi, c16);
  v128 masklo = v128_cmplt_s8(pattern.lo, c16);
  return v256_from_v128(
      v128_or(
          v128_and(v128_shuffle_8(a.lo, pattern.hi), maskhi),
          v128_andn(v128_shuffle_8(a.hi, v128_sub_8(pattern.hi, c16)), maskhi)),
      v128_or(v128_and(v128_shuffle_8(a.lo, pattern.lo), masklo),
              v128_andn(v128_shuffle_8(a.hi, v128_sub_8(pattern.lo, c16)),
                        masklo)));
}

SIMD_INLINE v256 v256_pshuffle_8(v256 a, v256 pattern) {
  return v256_from_v128(
      v128_shuffle_8(v256_high_v128(a), v256_high_v128(pattern)),
      v128_shuffle_8(v256_low_v128(a), v256_low_v128(pattern)));
}
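/* Added note (a sketch, not part of the original header): v256_shuffle_8
   emulates a 32-byte shuffle with 16-byte shuffles. For each output half it
   computes two candidates, one shuffled from a.lo with the pattern as given
   and one from a.hi with the pattern minus 16, then selects per byte with the
   mask (pattern < 16): v128_and keeps the low-half candidate where the mask
   is all-ones, and v128_andn keeps the high-half candidate where the mask is
   zero. */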

SIMD_INLINE v256 v256_cmpgt_s8(v256 a, v256 b) {
  return v256_from_v128(v128_cmpgt_s8(a.hi, b.hi), v128_cmpgt_s8(a.lo, b.lo));
}

SIMD_INLINE v256 v256_cmplt_s8(v256 a, v256 b) {
  return v256_from_v128(v128_cmplt_s8(a.hi, b.hi), v128_cmplt_s8(a.lo, b.lo));
}

SIMD_INLINE v256 v256_cmpeq_8(v256 a, v256 b) {
  return v256_from_v128(v128_cmpeq_8(a.hi, b.hi), v128_cmpeq_8(a.lo, b.lo));
}

SIMD_INLINE v256 v256_cmpgt_s16(v256 a, v256 b) {
  return v256_from_v128(v128_cmpgt_s16(a.hi, b.hi), v128_cmpgt_s16(a.lo, b.lo));
}

SIMD_INLINE v256 v256_cmplt_s16(v256 a, v256 b) {
  return v256_from_v128(v128_cmplt_s16(a.hi, b.hi), v128_cmplt_s16(a.lo, b.lo));
}

SIMD_INLINE v256 v256_cmpeq_16(v256 a, v256 b) {
  return v256_from_v128(v128_cmpeq_16(a.hi, b.hi), v128_cmpeq_16(a.lo, b.lo));
}

SIMD_INLINE v256 v256_shl_8(v256 a, const unsigned int c) {
  return v256_from_v128(v128_shl_8(a.hi, c), v128_shl_8(a.lo, c));
}

SIMD_INLINE v256 v256_shr_u8(v256 a, const unsigned int c) {
  return v256_from_v128(v128_shr_u8(a.hi, c), v128_shr_u8(a.lo, c));
}

SIMD_INLINE v256 v256_shr_s8(v256 a, const unsigned int c) {
  return v256_from_v128(v128_shr_s8(a.hi, c), v128_shr_s8(a.lo, c));
}

SIMD_INLINE v256 v256_shl_16(v256 a, const unsigned int c) {
  return v256_from_v128(v128_shl_16(a.hi, c), v128_shl_16(a.lo, c));
}

SIMD_INLINE v256 v256_shr_u16(v256 a, const unsigned int c) {
  return v256_from_v128(v128_shr_u16(a.hi, c), v128_shr_u16(a.lo, c));
}

SIMD_INLINE v256 v256_shr_s16(v256 a, const unsigned int c) {
  return v256_from_v128(v128_shr_s16(a.hi, c), v128_shr_s16(a.lo, c));
}

SIMD_INLINE v256 v256_shl_32(v256 a, const unsigned int c) {
  return v256_from_v128(v128_shl_32(a.hi, c), v128_shl_32(a.lo, c));
}

SIMD_INLINE v256 v256_shr_u32(v256 a, const unsigned int c) {
  return v256_from_v128(v128_shr_u32(a.hi, c), v128_shr_u32(a.lo, c));
}

SIMD_INLINE v256 v256_shr_s32(v256 a, const unsigned int c) {
  return v256_from_v128(v128_shr_s32(a.hi, c), v128_shr_s32(a.lo, c));
}

/* These intrinsics require immediate values, so we must use #defines
   to enforce that. */
#define v256_shl_n_byte(a, n)                                                 \
  ((n) < 16 ? v256_from_v128(v128_or(v128_shl_n_byte(a.hi, n),                \
                                     v128_shr_n_byte(a.lo, 16 - (n))),        \
                             v128_shl_n_byte(a.lo, (n)))                      \
            : v256_from_v128((n) > 16 ? v128_shl_n_byte(a.lo, (n)-16) : a.lo, \
                             v128_zero()))

#define v256_shr_n_byte(a, n)                                          \
  ((n) < 16 ? v256_from_v128(v128_shr_n_byte(a.hi, n),                 \
                             v128_or(v128_shr_n_byte(a.lo, n),         \
                                     v128_shl_n_byte(a.hi, 16 - (n)))) \
            : v256_from_v128(v128_zero(),                              \
                             (n) > 16 ? v128_shr_n_byte(a.hi, (n)-16) : a.hi))

#define v256_align(a, b, c) \
  ((c) ? v256_or(v256_shr_n_byte(b, c), v256_shl_n_byte(a, 32 - (c))) : b)
|
||||
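/* Usage sketch (added commentary, not part of the original header): the
   byte-shift and align macros only accept compile-time constants, since they
   expand to the v128 immediate forms. */
#if 0 /* illustrative only */
static v256 example_extract(v256 a, v256 b) {
  /* Bytes 4..31 of b followed by bytes 0..3 of a, as one 32-byte vector. */
  return v256_align(a, b, 4);
}
#endif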
#define v256_shl_n_8(a, n) \
  v256_from_v128(v128_shl_n_8(a.hi, n), v128_shl_n_8(a.lo, n))
#define v256_shl_n_16(a, n) \
  v256_from_v128(v128_shl_n_16(a.hi, n), v128_shl_n_16(a.lo, n))
#define v256_shl_n_32(a, n) \
  v256_from_v128(v128_shl_n_32(a.hi, n), v128_shl_n_32(a.lo, n))
#define v256_shr_n_u8(a, n) \
  v256_from_v128(v128_shr_n_u8(a.hi, n), v128_shr_n_u8(a.lo, n))
#define v256_shr_n_u16(a, n) \
  v256_from_v128(v128_shr_n_u16(a.hi, n), v128_shr_n_u16(a.lo, n))
#define v256_shr_n_u32(a, n) \
  v256_from_v128(v128_shr_n_u32(a.hi, n), v128_shr_n_u32(a.lo, n))
#define v256_shr_n_s8(a, n) \
  v256_from_v128(v128_shr_n_s8(a.hi, n), v128_shr_n_s8(a.lo, n))
#define v256_shr_n_s16(a, n) \
  v256_from_v128(v128_shr_n_s16(a.hi, n), v128_shr_n_s16(a.lo, n))
#define v256_shr_n_s32(a, n) \
  v256_from_v128(v128_shr_n_s32(a.hi, n), v128_shr_n_s32(a.lo, n))

#endif /* _V256_INTRINSICS_V128_H */
@@ -1,528 +0,0 @@
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef _V256_INTRINSICS_H
#define _V256_INTRINSICS_H

#if !defined(__AVX2__)

#include "./v256_intrinsics_v128.h"

#else

// The __m256i type seems to cause problems for g++'s mangling prior to
// version 5, but adding -fabi-version=0 fixes this.
#if !defined(__clang__) && __GNUC__ < 5 && defined(__AVX2__) && \
    defined(__cplusplus)
#pragma GCC optimize "-fabi-version=0"
#endif

#include <immintrin.h>
#include "./v128_intrinsics_x86.h"

typedef __m256i v256;

SIMD_INLINE uint32_t v256_low_u32(v256 a) {
  return (uint32_t)_mm_cvtsi128_si32(_mm256_extracti128_si256(a, 0));
}

SIMD_INLINE v64 v256_low_v64(v256 a) {
  return _mm_unpacklo_epi64(_mm256_extracti128_si256(a, 0), v64_zero());
}

SIMD_INLINE v128 v256_low_v128(v256 a) {
  return _mm256_extracti128_si256(a, 0);
}

SIMD_INLINE v128 v256_high_v128(v256 a) {
  return _mm256_extracti128_si256(a, 1);
}

SIMD_INLINE v256 v256_from_v128(v128 a, v128 b) {
  // gcc seems to be missing _mm256_set_m128i()
  return _mm256_insertf128_si256(
      _mm256_insertf128_si256(_mm256_setzero_si256(), b, 0), a, 1);
}

SIMD_INLINE v256 v256_from_v64(v64 a, v64 b, v64 c, v64 d) {
  return v256_from_v128(v128_from_v64(a, b), v128_from_v64(c, d));
}

SIMD_INLINE v256 v256_from_64(uint64_t a, uint64_t b, uint64_t c, uint64_t d) {
  return v256_from_v128(v128_from_64(a, b), v128_from_64(c, d));
}

SIMD_INLINE v256 v256_load_aligned(const void *p) {
  return _mm256_load_si256((const __m256i *)p);
}

SIMD_INLINE v256 v256_load_unaligned(const void *p) {
  return _mm256_loadu_si256((const __m256i *)p);
}

SIMD_INLINE void v256_store_aligned(void *p, v256 a) {
  _mm256_store_si256((__m256i *)p, a);
}

SIMD_INLINE void v256_store_unaligned(void *p, v256 a) {
  _mm256_storeu_si256((__m256i *)p, a);
}

SIMD_INLINE v256 v256_zero() { return _mm256_setzero_si256(); }

SIMD_INLINE v256 v256_dup_8(uint8_t x) { return _mm256_set1_epi8(x); }

SIMD_INLINE v256 v256_dup_16(uint16_t x) { return _mm256_set1_epi16(x); }

SIMD_INLINE v256 v256_dup_32(uint32_t x) { return _mm256_set1_epi32(x); }

SIMD_INLINE v256 v256_add_8(v256 a, v256 b) { return _mm256_add_epi8(a, b); }

SIMD_INLINE v256 v256_add_16(v256 a, v256 b) { return _mm256_add_epi16(a, b); }

SIMD_INLINE v256 v256_sadd_s16(v256 a, v256 b) {
  return _mm256_adds_epi16(a, b);
}

SIMD_INLINE v256 v256_add_32(v256 a, v256 b) { return _mm256_add_epi32(a, b); }

SIMD_INLINE v256 v256_padd_s16(v256 a) {
  return _mm256_madd_epi16(a, _mm256_set1_epi16(1));
}

SIMD_INLINE v256 v256_sub_8(v256 a, v256 b) { return _mm256_sub_epi8(a, b); }

SIMD_INLINE v256 v256_ssub_u8(v256 a, v256 b) { return _mm256_subs_epu8(a, b); }

SIMD_INLINE v256 v256_ssub_s8(v256 a, v256 b) { return _mm256_subs_epi8(a, b); }

SIMD_INLINE v256 v256_sub_16(v256 a, v256 b) { return _mm256_sub_epi16(a, b); }

SIMD_INLINE v256 v256_ssub_s16(v256 a, v256 b) {
  return _mm256_subs_epi16(a, b);
}

SIMD_INLINE v256 v256_sub_32(v256 a, v256 b) { return _mm256_sub_epi32(a, b); }

SIMD_INLINE v256 v256_abs_s16(v256 a) { return _mm256_abs_epi16(a); }

// AVX doesn't have direct intrinsics to zip/unzip the 8, 16 and 32 bit
// lanes of the lower or upper halves of a 256-bit vector, because the
// unpack/pack intrinsics operate on the 256-bit input as two independent
// 128-bit vectors.
SIMD_INLINE v256 v256_ziplo_8(v256 a, v256 b) {
  return v256_from_v128(v128_ziphi_8(v256_low_v128(a), v256_low_v128(b)),
                        v128_ziplo_8(v256_low_v128(a), v256_low_v128(b)));
}

SIMD_INLINE v256 v256_ziphi_8(v256 a, v256 b) {
  return v256_from_v128(v128_ziphi_8(v256_high_v128(a), v256_high_v128(b)),
                        v128_ziplo_8(v256_high_v128(a), v256_high_v128(b)));
}
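/* Usage sketch (added commentary, not part of the original header): a
   full-width byte interleave of two 32-byte rows built from the two helpers
   above; v256_ziplo_8 interleaves the low 16 bytes of each input and
   v256_ziphi_8 the high 16 bytes. */
#if 0 /* illustrative only */
static void interleave_rows(const uint8_t *x, const uint8_t *y, uint8_t *out) {
  v256 a = v256_load_unaligned(x);
  v256 b = v256_load_unaligned(y);
  v256_store_unaligned(out, v256_ziplo_8(a, b));
  v256_store_unaligned(out + 32, v256_ziphi_8(a, b));
}
#endif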

SIMD_INLINE v256 v256_ziplo_16(v256 a, v256 b) {
  return v256_from_v128(v128_ziphi_16(v256_low_v128(a), v256_low_v128(b)),
                        v128_ziplo_16(v256_low_v128(a), v256_low_v128(b)));
}

SIMD_INLINE v256 v256_ziphi_16(v256 a, v256 b) {
  return v256_from_v128(v128_ziphi_16(v256_high_v128(a), v256_high_v128(b)),
                        v128_ziplo_16(v256_high_v128(a), v256_high_v128(b)));
}

SIMD_INLINE v256 v256_ziplo_32(v256 a, v256 b) {
  return v256_from_v128(v128_ziphi_32(v256_low_v128(a), v256_low_v128(b)),
                        v128_ziplo_32(v256_low_v128(a), v256_low_v128(b)));
}

SIMD_INLINE v256 v256_ziphi_32(v256 a, v256 b) {
  return v256_from_v128(v128_ziphi_32(v256_high_v128(a), v256_high_v128(b)),
                        v128_ziplo_32(v256_high_v128(a), v256_high_v128(b)));
}

SIMD_INLINE v256 v256_ziplo_64(v256 a, v256 b) {
  return v256_from_v128(v128_ziphi_64(v256_low_v128(a), v256_low_v128(b)),
                        v128_ziplo_64(v256_low_v128(a), v256_low_v128(b)));
}

SIMD_INLINE v256 v256_ziphi_64(v256 a, v256 b) {
  return v256_from_v128(v128_ziphi_64(v256_high_v128(a), v256_high_v128(b)),
                        v128_ziplo_64(v256_high_v128(a), v256_high_v128(b)));
}

SIMD_INLINE v256 v256_ziplo_128(v256 a, v256 b) {
  return v256_from_v128(v256_low_v128(a), v256_low_v128(b));
}

SIMD_INLINE v256 v256_ziphi_128(v256 a, v256 b) {
  return v256_from_v128(v256_high_v128(a), v256_high_v128(b));
}

SIMD_INLINE v256 v256_zip_8(v128 a, v128 b) {
  return v256_from_v128(v128_ziphi_8(a, b), v128_ziplo_8(a, b));
}

SIMD_INLINE v256 v256_zip_16(v128 a, v128 b) {
  return v256_from_v128(v128_ziphi_16(a, b), v128_ziplo_16(a, b));
}

SIMD_INLINE v256 v256_zip_32(v128 a, v128 b) {
  return v256_from_v128(v128_ziphi_32(a, b), v128_ziplo_32(a, b));
}

SIMD_INLINE v256 v256_unziplo_8(v256 a, v256 b) {
  return v256_from_v128(v128_unziplo_8(v256_high_v128(a), v256_low_v128(a)),
                        v128_unziplo_8(v256_high_v128(b), v256_low_v128(b)));
}

SIMD_INLINE v256 v256_unziphi_8(v256 a, v256 b) {
  return v256_from_v128(v128_unziphi_8(v256_high_v128(a), v256_low_v128(a)),
                        v128_unziphi_8(v256_high_v128(b), v256_low_v128(b)));
}

SIMD_INLINE v256 v256_unziplo_16(v256 a, v256 b) {
  return v256_from_v128(v128_unziplo_16(v256_high_v128(a), v256_low_v128(a)),
                        v128_unziplo_16(v256_high_v128(b), v256_low_v128(b)));
}

SIMD_INLINE v256 v256_unziphi_16(v256 a, v256 b) {
  return v256_from_v128(v128_unziphi_16(v256_high_v128(a), v256_low_v128(a)),
                        v128_unziphi_16(v256_high_v128(b), v256_low_v128(b)));
}

SIMD_INLINE v256 v256_unziplo_32(v256 a, v256 b) {
  return v256_from_v128(v128_unziplo_32(v256_high_v128(a), v256_low_v128(a)),
                        v128_unziplo_32(v256_high_v128(b), v256_low_v128(b)));
}

SIMD_INLINE v256 v256_unziphi_32(v256 a, v256 b) {
  return v256_from_v128(v128_unziphi_32(v256_high_v128(a), v256_low_v128(a)),
                        v128_unziphi_32(v256_high_v128(b), v256_low_v128(b)));
}

SIMD_INLINE v256 v256_unpack_u8_s16(v128 a) {
  return v256_from_v128(v128_unpackhi_u8_s16(a), v128_unpacklo_u8_s16(a));
}

SIMD_INLINE v256 v256_unpacklo_u8_s16(v256 a) {
  return v256_from_v128(v128_unpackhi_u8_s16(v256_low_v128(a)),
                        v128_unpacklo_u8_s16(v256_low_v128(a)));
}

SIMD_INLINE v256 v256_unpackhi_u8_s16(v256 a) {
  return v256_from_v128(v128_unpackhi_u8_s16(v256_high_v128(a)),
                        v128_unpacklo_u8_s16(v256_high_v128(a)));
}

SIMD_INLINE v256 v256_pack_s32_s16(v256 a, v256 b) {
  return v256_from_v128(v128_pack_s32_s16(v256_high_v128(a), v256_low_v128(a)),
                        v128_pack_s32_s16(v256_high_v128(b), v256_low_v128(b)));
}

SIMD_INLINE v256 v256_pack_s16_u8(v256 a, v256 b) {
  return v256_from_v128(v128_pack_s16_u8(v256_high_v128(a), v256_low_v128(a)),
                        v128_pack_s16_u8(v256_high_v128(b), v256_low_v128(b)));
}

SIMD_INLINE v256 v256_pack_s16_s8(v256 a, v256 b) {
  return v256_from_v128(v128_pack_s16_s8(v256_high_v128(a), v256_low_v128(a)),
                        v128_pack_s16_s8(v256_high_v128(b), v256_low_v128(b)));
}

SIMD_INLINE v256 v256_unpack_u16_s32(v128 a) {
  return v256_from_v128(v128_unpackhi_u16_s32(a), v128_unpacklo_u16_s32(a));
}

SIMD_INLINE v256 v256_unpack_s16_s32(v128 a) {
  return v256_from_v128(v128_unpackhi_s16_s32(a), v128_unpacklo_s16_s32(a));
}

SIMD_INLINE v256 v256_unpacklo_u16_s32(v256 a) {
  return v256_from_v128(v128_unpackhi_u16_s32(v256_low_v128(a)),
                        v128_unpacklo_u16_s32(v256_low_v128(a)));
}

SIMD_INLINE v256 v256_unpacklo_s16_s32(v256 a) {
  return v256_from_v128(v128_unpackhi_s16_s32(v256_low_v128(a)),
                        v128_unpacklo_s16_s32(v256_low_v128(a)));
}

SIMD_INLINE v256 v256_unpackhi_u16_s32(v256 a) {
  return v256_from_v128(v128_unpackhi_u16_s32(v256_high_v128(a)),
                        v128_unpacklo_u16_s32(v256_high_v128(a)));
}

SIMD_INLINE v256 v256_unpackhi_s16_s32(v256 a) {
  return v256_from_v128(v128_unpackhi_s16_s32(v256_high_v128(a)),
                        v128_unpacklo_s16_s32(v256_high_v128(a)));
}
SIMD_INLINE v256 v256_shuffle_8(v256 a, v256 pattern) {
  v128 c16 = v128_dup_8(16);
  v128 hi = v256_high_v128(pattern);
  v128 lo = v256_low_v128(pattern);
  v128 maskhi = v128_cmplt_s8(hi, c16);
  v128 masklo = v128_cmplt_s8(lo, c16);
  return v256_from_v128(
      v128_or(v128_and(v128_shuffle_8(v256_low_v128(a), hi), maskhi),
              v128_andn(v128_shuffle_8(v256_high_v128(a), v128_sub_8(hi, c16)),
                        maskhi)),
      v128_or(v128_and(v128_shuffle_8(v256_low_v128(a), lo), masklo),
              v128_andn(v128_shuffle_8(v256_high_v128(a), v128_sub_8(lo, c16)),
                        masklo)));
}

SIMD_INLINE v256 v256_pshuffle_8(v256 a, v256 pattern) {
  return _mm256_shuffle_epi8(a, pattern);
}

SIMD_INLINE int64_t v256_dotp_s16(v256 a, v256 b) {
  v256 r = _mm256_madd_epi16(a, b);
#if defined(__x86_64__)
  v128 t;
  r = _mm256_add_epi64(_mm256_cvtepi32_epi64(v256_high_v128(r)),
                       _mm256_cvtepi32_epi64(v256_low_v128(r)));
  t = v256_low_v128(_mm256_add_epi64(
      r, _mm256_permute2x128_si256(r, r, _MM_SHUFFLE(2, 0, 0, 1))));
  return _mm_cvtsi128_si64(_mm_add_epi64(t, _mm_srli_si128(t, 8)));
#else
  v128 l = v256_low_v128(r);
  v128 h = v256_high_v128(r);
  return (int64_t)_mm_cvtsi128_si32(l) +
         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 4)) +
         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 8)) +
         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 12)) +
         (int64_t)_mm_cvtsi128_si32(h) +
         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 4)) +
         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 8)) +
         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 12));
#endif
}

SIMD_INLINE uint64_t v256_hadd_u8(v256 a) {
  v256 t = _mm256_sad_epu8(a, _mm256_setzero_si256());
  v128 lo = v256_low_v128(t);
  v128 hi = v256_high_v128(t);
  lo = v128_add_32(lo, hi);
  return v64_low_u32(v128_low_v64(lo)) + v64_low_u32(v128_high_v64(lo));
}

typedef v256 sad256_internal;

SIMD_INLINE sad256_internal v256_sad_u8_init() {
  return _mm256_setzero_si256();
}

/* Implementation dependent return value. Result must be finalised with
   v256_sad_u8_sum().
   The result for more than 32 v256_sad_u8() calls is undefined. */
SIMD_INLINE sad256_internal v256_sad_u8(sad256_internal s, v256 a, v256 b) {
  return _mm256_add_epi64(s, _mm256_sad_epu8(a, b));
}

SIMD_INLINE uint32_t v256_sad_u8_sum(sad256_internal s) {
  v256 t = _mm256_add_epi32(s, _mm256_unpackhi_epi64(s, s));
  return v128_low_u32(_mm_add_epi32(v256_high_v128(t), v256_low_v128(t)));
}
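/* Usage sketch (added commentary, not part of the original header): the
   init/accumulate/sum protocol described above, for up to 32 rows of 32
   bytes. */
#if 0 /* illustrative only */
static uint32_t sad_32xh(const uint8_t *src, const uint8_t *ref, int stride,
                         int h) {
  sad256_internal s = v256_sad_u8_init();
  int i;
  for (i = 0; i < h; i++)
    s = v256_sad_u8(s, v256_load_unaligned(src + i * stride),
                    v256_load_unaligned(ref + i * stride));
  return v256_sad_u8_sum(s);
}
#endif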

typedef v256 ssd256_internal;

SIMD_INLINE ssd256_internal v256_ssd_u8_init() {
  return _mm256_setzero_si256();
}

/* Implementation dependent return value. Result must be finalised with
 * v256_ssd_u8_sum(). */
SIMD_INLINE ssd256_internal v256_ssd_u8(ssd256_internal s, v256 a, v256 b) {
  v256 l = _mm256_sub_epi16(_mm256_unpacklo_epi8(a, _mm256_setzero_si256()),
                            _mm256_unpacklo_epi8(b, _mm256_setzero_si256()));
  v256 h = _mm256_sub_epi16(_mm256_unpackhi_epi8(a, _mm256_setzero_si256()),
                            _mm256_unpackhi_epi8(b, _mm256_setzero_si256()));
  v256 rl = _mm256_madd_epi16(l, l);
  v256 rh = _mm256_madd_epi16(h, h);
  v128 c = _mm_cvtsi32_si128(32);
  rl = _mm256_add_epi32(rl, _mm256_srli_si256(rl, 8));
  rl = _mm256_add_epi32(rl, _mm256_srli_si256(rl, 4));
  rh = _mm256_add_epi32(rh, _mm256_srli_si256(rh, 8));
  rh = _mm256_add_epi32(rh, _mm256_srli_si256(rh, 4));
  return _mm256_add_epi64(
      s,
      _mm256_srl_epi64(_mm256_sll_epi64(_mm256_unpacklo_epi64(rl, rh), c), c));
}

SIMD_INLINE uint32_t v256_ssd_u8_sum(ssd256_internal s) {
  v256 t = _mm256_add_epi32(s, _mm256_unpackhi_epi64(s, s));
  return v128_low_u32(_mm_add_epi32(v256_high_v128(t), v256_low_v128(t)));
}

SIMD_INLINE v256 v256_or(v256 a, v256 b) { return _mm256_or_si256(a, b); }

SIMD_INLINE v256 v256_xor(v256 a, v256 b) { return _mm256_xor_si256(a, b); }

SIMD_INLINE v256 v256_and(v256 a, v256 b) { return _mm256_and_si256(a, b); }

SIMD_INLINE v256 v256_andn(v256 a, v256 b) { return _mm256_andnot_si256(b, a); }

SIMD_INLINE v256 v256_mul_s16(v64 a, v64 b) {
  v128 lo_bits = v128_mullo_s16(a, b);
  v128 hi_bits = v128_mulhi_s16(a, b);
  return v256_from_v128(v128_ziphi_16(hi_bits, lo_bits),
                        v128_ziplo_16(hi_bits, lo_bits));
}

SIMD_INLINE v256 v256_mullo_s16(v256 a, v256 b) {
  return _mm256_mullo_epi16(a, b);
}

SIMD_INLINE v256 v256_mulhi_s16(v256 a, v256 b) {
  return _mm256_mulhi_epi16(a, b);
}

SIMD_INLINE v256 v256_mullo_s32(v256 a, v256 b) {
  return _mm256_mullo_epi32(a, b);
}

SIMD_INLINE v256 v256_madd_s16(v256 a, v256 b) {
  return _mm256_madd_epi16(a, b);
}

SIMD_INLINE v256 v256_madd_us8(v256 a, v256 b) {
  return _mm256_maddubs_epi16(a, b);
}

SIMD_INLINE v256 v256_avg_u8(v256 a, v256 b) { return _mm256_avg_epu8(a, b); }

SIMD_INLINE v256 v256_rdavg_u8(v256 a, v256 b) {
  return _mm256_sub_epi8(
      _mm256_avg_epu8(a, b),
      _mm256_and_si256(_mm256_xor_si256(a, b), v256_dup_8(1)));
}

SIMD_INLINE v256 v256_avg_u16(v256 a, v256 b) { return _mm256_avg_epu16(a, b); }

SIMD_INLINE v256 v256_min_u8(v256 a, v256 b) { return _mm256_min_epu8(a, b); }

SIMD_INLINE v256 v256_max_u8(v256 a, v256 b) { return _mm256_max_epu8(a, b); }

SIMD_INLINE v256 v256_min_s8(v256 a, v256 b) { return _mm256_min_epi8(a, b); }

SIMD_INLINE v256 v256_max_s8(v256 a, v256 b) { return _mm256_max_epi8(a, b); }

SIMD_INLINE v256 v256_min_s16(v256 a, v256 b) { return _mm256_min_epi16(a, b); }

SIMD_INLINE v256 v256_max_s16(v256 a, v256 b) { return _mm256_max_epi16(a, b); }

SIMD_INLINE v256 v256_cmpgt_s8(v256 a, v256 b) {
  return _mm256_cmpgt_epi8(a, b);
}

SIMD_INLINE v256 v256_cmplt_s8(v256 a, v256 b) {
  return v256_andn(_mm256_cmpgt_epi8(b, a), _mm256_cmpeq_epi8(b, a));
}

SIMD_INLINE v256 v256_cmpeq_8(v256 a, v256 b) {
  return _mm256_cmpeq_epi8(a, b);
}

SIMD_INLINE v256 v256_cmpgt_s16(v256 a, v256 b) {
  return _mm256_cmpgt_epi16(a, b);
}

SIMD_INLINE v256 v256_cmplt_s16(v256 a, v256 b) {
  return v256_andn(_mm256_cmpgt_epi16(b, a), _mm256_cmpeq_epi16(b, a));
}

SIMD_INLINE v256 v256_cmpeq_16(v256 a, v256 b) {
  return _mm256_cmpeq_epi16(a, b);
}

SIMD_INLINE v256 v256_shl_8(v256 a, unsigned int c) {
  return _mm256_and_si256(_mm256_set1_epi8((uint8_t)(0xff << c)),
                          _mm256_sll_epi16(a, _mm_cvtsi32_si128(c)));
}

SIMD_INLINE v256 v256_shr_u8(v256 a, unsigned int c) {
  return _mm256_and_si256(_mm256_set1_epi8(0xff >> c),
                          _mm256_srl_epi16(a, _mm_cvtsi32_si128(c)));
}

SIMD_INLINE v256 v256_shr_s8(v256 a, unsigned int c) {
  __m128i x = _mm_cvtsi32_si128(c + 8);
  return _mm256_packs_epi16(_mm256_sra_epi16(_mm256_unpacklo_epi8(a, a), x),
                            _mm256_sra_epi16(_mm256_unpackhi_epi8(a, a), x));
}

SIMD_INLINE v256 v256_shl_16(v256 a, unsigned int c) {
  return _mm256_sll_epi16(a, _mm_cvtsi32_si128(c));
}

SIMD_INLINE v256 v256_shr_u16(v256 a, unsigned int c) {
  return _mm256_srl_epi16(a, _mm_cvtsi32_si128(c));
}

SIMD_INLINE v256 v256_shr_s16(v256 a, unsigned int c) {
  return _mm256_sra_epi16(a, _mm_cvtsi32_si128(c));
}

SIMD_INLINE v256 v256_shl_32(v256 a, unsigned int c) {
  return _mm256_sll_epi32(a, _mm_cvtsi32_si128(c));
}

SIMD_INLINE v256 v256_shr_u32(v256 a, unsigned int c) {
  return _mm256_srl_epi32(a, _mm_cvtsi32_si128(c));
}

SIMD_INLINE v256 v256_shr_s32(v256 a, unsigned int c) {
  return _mm256_sra_epi32(a, _mm_cvtsi32_si128(c));
}

/* These intrinsics require immediate values, so we must use #defines
   to enforce that. */
// _mm256_slli_si256 works on 128 bit lanes and can't be used
#define v256_shl_n_byte(a, n)                                                 \
  ((n) < 16                                                                   \
       ? v256_from_v128(v128_or(v128_shl_n_byte(v256_high_v128(a), n),        \
                                v128_shr_n_byte(v256_low_v128(a), 16 - (n))), \
                        v128_shl_n_byte(v256_low_v128(a), n))                 \
       : v256_from_v128(v128_shl_n_byte(v256_low_v128(a), (n)-16),            \
                        v128_zero()))

// _mm256_srli_si256 works on 128 bit lanes and can't be used
#define v256_shr_n_byte(a, n)                                                \
  ((n) < 16                                                                  \
       ? _mm256_alignr_epi8(                                                 \
             _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(2, 0, 0, 1)), a, n) \
       : ((n) > 16                                                           \
              ? _mm256_srli_si256(                                           \
                    _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(2, 0, 0, 1)), \
                    (n)-16)                                                  \
              : _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(2, 0, 0, 1))))

// _mm256_alignr_epi8 works on two 128 bit lanes and can't be used
#define v256_align(a, b, c) \
  ((c) ? v256_or(v256_shr_n_byte(b, c), v256_shl_n_byte(a, 32 - c)) : b)

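/* Note (added commentary, not part of the original header): as with the v128
   fallback, v256_align(a, b, c) evaluates to bytes c..31 of b followed by
   bytes 0..c-1 of a, i.e. the familiar per-lane alignr semantics extended to
   the full 256-bit width via the permute/alignr combination above. */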
#define v256_shl_n_8(a, c)                                   \
  _mm256_and_si256(_mm256_set1_epi8((uint8_t)(0xff << (c))), \
                   _mm256_slli_epi16(a, c))
#define v256_shr_n_u8(a, c) \
  _mm256_and_si256(_mm256_set1_epi8(0xff >> (c)), _mm256_srli_epi16(a, c))
#define v256_shr_n_s8(a, c)                                                  \
  _mm256_packs_epi16(_mm256_srai_epi16(_mm256_unpacklo_epi8(a, a), (c) + 8), \
                     _mm256_srai_epi16(_mm256_unpackhi_epi8(a, a), (c) + 8))
#define v256_shl_n_16(a, c) _mm256_slli_epi16(a, c)
#define v256_shr_n_u16(a, c) _mm256_srli_epi16(a, c)
#define v256_shr_n_s16(a, c) _mm256_srai_epi16(a, c)
#define v256_shl_n_32(a, c) _mm256_slli_epi32(a, c)
#define v256_shr_n_u32(a, c) _mm256_srli_epi32(a, c)
#define v256_shr_n_s32(a, c) _mm256_srai_epi32(a, c)

#endif

#endif /* _V256_INTRINSICS_H */
@@ -1,221 +0,0 @@
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef _V64_INTRINSICS_H
#define _V64_INTRINSICS_H

#include <stdio.h>
#include <stdlib.h>
#include "./v64_intrinsics_c.h"

/* Fallback to plain, unoptimised C. */

typedef c_v64 v64;

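/* Note (added commentary, not part of the original header): every v64_*
   function in this file simply forwards to its c_v64_* counterpart from
   v64_intrinsics_c.h, so this build doubles as the bit-exact reference
   implementation for porting and debugging the optimised back ends. */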
SIMD_INLINE uint32_t v64_low_u32(v64 a) { return c_v64_low_u32(a); }
SIMD_INLINE uint32_t v64_high_u32(v64 a) { return c_v64_high_u32(a); }
SIMD_INLINE int32_t v64_low_s32(v64 a) { return c_v64_low_s32(a); }
SIMD_INLINE int32_t v64_high_s32(v64 a) { return c_v64_high_s32(a); }
SIMD_INLINE v64 v64_from_32(uint32_t x, uint32_t y) {
  return c_v64_from_32(x, y);
}
SIMD_INLINE v64 v64_from_64(uint64_t x) { return c_v64_from_64(x); }
SIMD_INLINE uint64_t v64_u64(v64 x) { return c_v64_u64(x); }
SIMD_INLINE v64 v64_from_16(uint16_t a, uint16_t b, uint16_t c, uint16_t d) {
  return c_v64_from_16(a, b, c, d);
}

SIMD_INLINE uint32_t u32_load_unaligned(const void *p) {
  return c_u32_load_unaligned(p);
}
SIMD_INLINE uint32_t u32_load_aligned(const void *p) {
  return c_u32_load_aligned(p);
}
SIMD_INLINE void u32_store_unaligned(void *p, uint32_t a) {
  c_u32_store_unaligned(p, a);
}
SIMD_INLINE void u32_store_aligned(void *p, uint32_t a) {
  c_u32_store_aligned(p, a);
}

SIMD_INLINE v64 v64_load_unaligned(const void *p) {
  return c_v64_load_unaligned(p);
}
SIMD_INLINE v64 v64_load_aligned(const void *p) {
  return c_v64_load_aligned(p);
}

SIMD_INLINE void v64_store_unaligned(void *p, v64 a) {
  c_v64_store_unaligned(p, a);
}
SIMD_INLINE void v64_store_aligned(void *p, v64 a) {
  c_v64_store_aligned(p, a);
}

SIMD_INLINE v64 v64_align(v64 a, v64 b, const unsigned int c) {
  return c_v64_align(a, b, c);
}

SIMD_INLINE v64 v64_zero() { return c_v64_zero(); }
SIMD_INLINE v64 v64_dup_8(uint8_t x) { return c_v64_dup_8(x); }
SIMD_INLINE v64 v64_dup_16(uint16_t x) { return c_v64_dup_16(x); }
SIMD_INLINE v64 v64_dup_32(uint32_t x) { return c_v64_dup_32(x); }

SIMD_INLINE v64 v64_add_8(v64 a, v64 b) { return c_v64_add_8(a, b); }
SIMD_INLINE v64 v64_add_16(v64 a, v64 b) { return c_v64_add_16(a, b); }
SIMD_INLINE v64 v64_sadd_s16(v64 a, v64 b) { return c_v64_sadd_s16(a, b); }
SIMD_INLINE v64 v64_add_32(v64 a, v64 b) { return c_v64_add_32(a, b); }
SIMD_INLINE v64 v64_sub_8(v64 a, v64 b) { return c_v64_sub_8(a, b); }
SIMD_INLINE v64 v64_ssub_u8(v64 a, v64 b) { return c_v64_ssub_u8(a, b); }
SIMD_INLINE v64 v64_ssub_s8(v64 a, v64 b) { return c_v64_ssub_s8(a, b); }
SIMD_INLINE v64 v64_sub_16(v64 a, v64 b) { return c_v64_sub_16(a, b); }
SIMD_INLINE v64 v64_ssub_s16(v64 a, v64 b) { return c_v64_ssub_s16(a, b); }
SIMD_INLINE v64 v64_sub_32(v64 a, v64 b) { return c_v64_sub_32(a, b); }
SIMD_INLINE v64 v64_abs_s16(v64 a) { return c_v64_abs_s16(a); }

SIMD_INLINE v64 v64_ziplo_8(v64 a, v64 b) { return c_v64_ziplo_8(a, b); }
SIMD_INLINE v64 v64_ziphi_8(v64 a, v64 b) { return c_v64_ziphi_8(a, b); }
SIMD_INLINE v64 v64_ziplo_16(v64 a, v64 b) { return c_v64_ziplo_16(a, b); }
SIMD_INLINE v64 v64_ziphi_16(v64 a, v64 b) { return c_v64_ziphi_16(a, b); }
SIMD_INLINE v64 v64_ziplo_32(v64 a, v64 b) { return c_v64_ziplo_32(a, b); }
SIMD_INLINE v64 v64_ziphi_32(v64 a, v64 b) { return c_v64_ziphi_32(a, b); }
SIMD_INLINE v64 v64_unziplo_8(v64 a, v64 b) { return c_v64_unziplo_8(a, b); }
SIMD_INLINE v64 v64_unziphi_8(v64 a, v64 b) { return c_v64_unziphi_8(a, b); }
SIMD_INLINE v64 v64_unziplo_16(v64 a, v64 b) { return c_v64_unziplo_16(a, b); }
SIMD_INLINE v64 v64_unziphi_16(v64 a, v64 b) { return c_v64_unziphi_16(a, b); }
SIMD_INLINE v64 v64_unpacklo_u8_s16(v64 a) { return c_v64_unpacklo_u8_s16(a); }
SIMD_INLINE v64 v64_unpackhi_u8_s16(v64 a) { return c_v64_unpackhi_u8_s16(a); }
SIMD_INLINE v64 v64_pack_s32_s16(v64 a, v64 b) {
  return c_v64_pack_s32_s16(a, b);
}
SIMD_INLINE v64 v64_pack_s16_u8(v64 a, v64 b) {
  return c_v64_pack_s16_u8(a, b);
}
SIMD_INLINE v64 v64_pack_s16_s8(v64 a, v64 b) {
  return c_v64_pack_s16_s8(a, b);
}
SIMD_INLINE v64 v64_unpacklo_u16_s32(v64 a) {
  return c_v64_unpacklo_u16_s32(a);
}
SIMD_INLINE v64 v64_unpacklo_s16_s32(v64 a) {
  return c_v64_unpacklo_s16_s32(a);
}
SIMD_INLINE v64 v64_unpackhi_u16_s32(v64 a) {
  return c_v64_unpackhi_u16_s32(a);
}
SIMD_INLINE v64 v64_unpackhi_s16_s32(v64 a) {
  return c_v64_unpackhi_s16_s32(a);
}
SIMD_INLINE v64 v64_shuffle_8(v64 a, v64 pattern) {
  return c_v64_shuffle_8(a, pattern);
}

typedef uint32_t sad64_internal;
SIMD_INLINE sad64_internal v64_sad_u8_init() { return c_v64_sad_u8_init(); }
SIMD_INLINE sad64_internal v64_sad_u8(sad64_internal s, v64 a, v64 b) {
  return c_v64_sad_u8(s, a, b);
}
SIMD_INLINE uint32_t v64_sad_u8_sum(sad64_internal s) {
  return c_v64_sad_u8_sum(s);
}
typedef uint32_t ssd64_internal;
SIMD_INLINE ssd64_internal v64_ssd_u8_init() { return c_v64_ssd_u8_init(); }
SIMD_INLINE ssd64_internal v64_ssd_u8(ssd64_internal s, v64 a, v64 b) {
  return c_v64_ssd_u8(s, a, b);
}
SIMD_INLINE uint32_t v64_ssd_u8_sum(ssd64_internal s) {
  return c_v64_ssd_u8_sum(s);
}
SIMD_INLINE int64_t v64_dotp_su8(v64 a, v64 b) { return c_v64_dotp_su8(a, b); }
SIMD_INLINE int64_t v64_dotp_s16(v64 a, v64 b) { return c_v64_dotp_s16(a, b); }
SIMD_INLINE uint64_t v64_hadd_u8(v64 a) { return c_v64_hadd_u8(a); }
SIMD_INLINE int64_t v64_hadd_s16(v64 a) { return c_v64_hadd_s16(a); }

SIMD_INLINE v64 v64_or(v64 a, v64 b) { return c_v64_or(a, b); }
SIMD_INLINE v64 v64_xor(v64 a, v64 b) { return c_v64_xor(a, b); }
SIMD_INLINE v64 v64_and(v64 a, v64 b) { return c_v64_and(a, b); }
SIMD_INLINE v64 v64_andn(v64 a, v64 b) { return c_v64_andn(a, b); }

SIMD_INLINE v64 v64_mullo_s16(v64 a, v64 b) { return c_v64_mullo_s16(a, b); }
SIMD_INLINE v64 v64_mulhi_s16(v64 a, v64 b) { return c_v64_mulhi_s16(a, b); }
SIMD_INLINE v64 v64_mullo_s32(v64 a, v64 b) { return c_v64_mullo_s32(a, b); }
SIMD_INLINE v64 v64_madd_s16(v64 a, v64 b) { return c_v64_madd_s16(a, b); }
SIMD_INLINE v64 v64_madd_us8(v64 a, v64 b) { return c_v64_madd_us8(a, b); }

SIMD_INLINE v64 v64_avg_u8(v64 a, v64 b) { return c_v64_avg_u8(a, b); }
SIMD_INLINE v64 v64_rdavg_u8(v64 a, v64 b) { return c_v64_rdavg_u8(a, b); }
SIMD_INLINE v64 v64_avg_u16(v64 a, v64 b) { return c_v64_avg_u16(a, b); }
SIMD_INLINE v64 v64_min_u8(v64 a, v64 b) { return c_v64_min_u8(a, b); }
SIMD_INLINE v64 v64_max_u8(v64 a, v64 b) { return c_v64_max_u8(a, b); }
SIMD_INLINE v64 v64_min_s8(v64 a, v64 b) { return c_v64_min_s8(a, b); }
SIMD_INLINE v64 v64_max_s8(v64 a, v64 b) { return c_v64_max_s8(a, b); }
SIMD_INLINE v64 v64_min_s16(v64 a, v64 b) { return c_v64_min_s16(a, b); }
SIMD_INLINE v64 v64_max_s16(v64 a, v64 b) { return c_v64_max_s16(a, b); }

SIMD_INLINE v64 v64_cmpgt_s8(v64 a, v64 b) { return c_v64_cmpgt_s8(a, b); }
SIMD_INLINE v64 v64_cmplt_s8(v64 a, v64 b) { return c_v64_cmplt_s8(a, b); }
SIMD_INLINE v64 v64_cmpeq_8(v64 a, v64 b) { return c_v64_cmpeq_8(a, b); }
SIMD_INLINE v64 v64_cmpgt_s16(v64 a, v64 b) { return c_v64_cmpgt_s16(a, b); }
SIMD_INLINE v64 v64_cmplt_s16(v64 a, v64 b) { return c_v64_cmplt_s16(a, b); }
SIMD_INLINE v64 v64_cmpeq_16(v64 a, v64 b) { return c_v64_cmpeq_16(a, b); }

SIMD_INLINE v64 v64_shl_8(v64 a, unsigned int n) { return c_v64_shl_8(a, n); }
SIMD_INLINE v64 v64_shr_u8(v64 a, unsigned int n) { return c_v64_shr_u8(a, n); }
SIMD_INLINE v64 v64_shr_s8(v64 a, unsigned int n) { return c_v64_shr_s8(a, n); }
SIMD_INLINE v64 v64_shl_16(v64 a, unsigned int n) { return c_v64_shl_16(a, n); }
SIMD_INLINE v64 v64_shr_u16(v64 a, unsigned int n) {
  return c_v64_shr_u16(a, n);
}
SIMD_INLINE v64 v64_shr_s16(v64 a, unsigned int n) {
  return c_v64_shr_s16(a, n);
}
SIMD_INLINE v64 v64_shl_32(v64 a, unsigned int n) { return c_v64_shl_32(a, n); }
SIMD_INLINE v64 v64_shr_u32(v64 a, unsigned int n) {
  return c_v64_shr_u32(a, n);
}
SIMD_INLINE v64 v64_shr_s32(v64 a, unsigned int n) {
  return c_v64_shr_s32(a, n);
}
SIMD_INLINE v64 v64_shr_n_byte(v64 a, const unsigned int n) {
  return c_v64_shr_n_byte(a, n);
}
SIMD_INLINE v64 v64_shl_n_byte(v64 a, const unsigned int n) {
  return c_v64_shl_n_byte(a, n);
}
SIMD_INLINE v64 v64_shl_n_8(v64 a, const unsigned int c) {
  return c_v64_shl_n_8(a, c);
}
SIMD_INLINE v64 v64_shr_n_u8(v64 a, const unsigned int c) {
  return c_v64_shr_n_u8(a, c);
}
SIMD_INLINE v64 v64_shr_n_s8(v64 a, const unsigned int c) {
  return c_v64_shr_n_s8(a, c);
}
SIMD_INLINE v64 v64_shl_n_16(v64 a, const unsigned int c) {
  return c_v64_shl_n_16(a, c);
}
SIMD_INLINE v64 v64_shr_n_u16(v64 a, const unsigned int c) {
  return c_v64_shr_n_u16(a, c);
}
SIMD_INLINE v64 v64_shr_n_s16(v64 a, const unsigned int c) {
  return c_v64_shr_n_s16(a, c);
}
SIMD_INLINE v64 v64_shl_n_32(v64 a, const unsigned int c) {
  return c_v64_shl_n_32(a, c);
}
SIMD_INLINE v64 v64_shr_n_u32(v64 a, const unsigned int c) {
  return c_v64_shr_n_u32(a, c);
}
SIMD_INLINE v64 v64_shr_n_s32(v64 a, const unsigned int c) {
  return c_v64_shr_n_s32(a, c);
}

#endif /* _V64_INTRINSICS_H */
@@ -1,578 +0,0 @@
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef _V64_INTRINSICS_H
#define _V64_INTRINSICS_H

#include <arm_neon.h>
#include "./v64_intrinsics_arm.h"
#include "aom_ports/arm.h"

#ifdef AOM_INCOMPATIBLE_GCC
#error Incompatible gcc
#endif

typedef int64x1_t v64;

SIMD_INLINE uint32_t v64_low_u32(v64 a) {
  return vget_lane_u32(vreinterpret_u32_s64(a), 0);
}

SIMD_INLINE uint32_t v64_high_u32(v64 a) {
  return vget_lane_u32(vreinterpret_u32_s64(a), 1);
}

SIMD_INLINE int32_t v64_low_s32(v64 a) {
  return vget_lane_s32(vreinterpret_s32_s64(a), 0);
}

SIMD_INLINE int32_t v64_high_s32(v64 a) {
  return vget_lane_s32(vreinterpret_s32_s64(a), 1);
}

SIMD_INLINE v64 v64_from_16(uint16_t a, uint16_t b, uint16_t c, uint16_t d) {
  return vcreate_s64((uint64_t)a << 48 | (uint64_t)b << 32 | (uint64_t)c << 16 |
                     d);
}

SIMD_INLINE v64 v64_from_32(uint32_t x, uint32_t y) {
  return vcreate_s64((uint64_t)x << 32 | y);
}

SIMD_INLINE v64 v64_from_64(uint64_t x) { return vcreate_s64(x); }

SIMD_INLINE uint64_t v64_u64(v64 x) { return (uint64_t)x; }

SIMD_INLINE uint32_t u32_load_aligned(const void *p) {
  return *((uint32_t *)p);
}

SIMD_INLINE uint32_t u32_load_unaligned(const void *p) {
  return vget_lane_u32(vreinterpret_u32_u8(vld1_u8((const uint8_t *)p)), 0);
}

SIMD_INLINE void u32_store_aligned(void *p, uint32_t a) {
  *((uint32_t *)p) = a;
}

SIMD_INLINE void u32_store_unaligned(void *p, uint32_t a) {
#if __clang__
  vst1_lane_u32((uint32_t *)p, vreinterpret_u32_s64((uint64x1_t)(uint64_t)a),
                0);
#elif __CC_ARM
  *(__packed uint32_t *)p = a;
#elif __GNUC__
  *((__attribute((packed)) uint32_t *)p) = a;
#else
  vst1_lane_u32((uint32_t *)p, vreinterpret_u32_s64((uint64x1_t)(uint64_t)a),
                0);
#endif
}

SIMD_INLINE v64 v64_load_aligned(const void *p) {
  return vreinterpret_s64_u8(vld1_u8((const uint8_t *)p));
}

SIMD_INLINE v64 v64_load_unaligned(const void *p) {
  return v64_load_aligned(p);
}

SIMD_INLINE void v64_store_aligned(void *p, v64 r) {
  vst1_u8((uint8_t *)p, vreinterpret_u8_s64(r));
}

SIMD_INLINE void v64_store_unaligned(void *p, v64 r) {
  vst1_u8((uint8_t *)p, vreinterpret_u8_s64(r));
}

// The following function requires an immediate.
// Some compilers will check this if it's optimising, others won't.
SIMD_INLINE v64 v64_align(v64 a, v64 b, const unsigned int c) {
#if __OPTIMIZE__ && !__clang__
  return c ? vreinterpret_s64_s8(
                 vext_s8(vreinterpret_s8_s64(b), vreinterpret_s8_s64(a), c))
           : b;
#else
  return c ? v64_from_64(((uint64_t)b >> c * 8) | ((uint64_t)a << (8 - c) * 8))
           : b;
#endif
}

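/* Usage sketch (added commentary, not part of the original header): with a
   constant shift count, the optimised path above compiles down to a single
   vext instruction. */
#if 0 /* illustrative only */
static v64 example_align(v64 a, v64 b) {
  return v64_align(a, b, 3); /* bytes 3..7 of b, then bytes 0..2 of a */
}
#endif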
SIMD_INLINE v64 v64_zero() { return vreinterpret_s64_u8(vdup_n_u8(0)); }

SIMD_INLINE v64 v64_ones() { return vreinterpret_s64_u8(vdup_n_u8(-1)); }

SIMD_INLINE v64 v64_dup_8(uint8_t x) {
  return vreinterpret_s64_u8(vdup_n_u8(x));
}

SIMD_INLINE v64 v64_dup_16(uint16_t x) {
  return vreinterpret_s64_u16(vdup_n_u16(x));
}

SIMD_INLINE v64 v64_dup_32(uint32_t x) {
  return vreinterpret_s64_u32(vdup_n_u32(x));
}

SIMD_INLINE int64_t v64_dotp_su8(v64 x, v64 y) {
  int64x2_t r = vpaddlq_s32(vpaddlq_s16(
      vmulq_s16(vmovl_s8(vreinterpret_s8_s64(x)),
                vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(y))))));
  return (int64_t)vadd_s64(vget_high_s64(r), vget_low_s64(r));
}

SIMD_INLINE int64_t v64_dotp_s16(v64 x, v64 y) {
  int64x2_t r =
      vpaddlq_s32(vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
  return (int64_t)(vget_high_s64(r) + vget_low_s64(r));
}

SIMD_INLINE uint64_t v64_hadd_u8(v64 x) {
  return (uint64_t)vpaddl_u32(vpaddl_u16(vpaddl_u8(vreinterpret_u8_s64(x))));
}

SIMD_INLINE int64_t v64_hadd_s16(v64 a) {
  return (int64_t)vpaddl_s32(vpaddl_s16(vreinterpret_s16_s64(a)));
}

typedef uint16x8_t sad64_internal;

SIMD_INLINE sad64_internal v64_sad_u8_init() { return vdupq_n_u16(0); }

/* Implementation dependent return value. Result must be finalised with
   v64_sad_u8_sum().
   The result for more than 32 v64_sad_u8() calls is undefined. */
SIMD_INLINE sad64_internal v64_sad_u8(sad64_internal s, v64 a, v64 b) {
  return vabal_u8(s, vreinterpret_u8_s64(a), vreinterpret_u8_s64(b));
}

SIMD_INLINE uint32_t v64_sad_u8_sum(sad64_internal s) {
  uint64x2_t r = vpaddlq_u32(vpaddlq_u16(s));
  return (uint32_t)(uint64_t)(vget_high_u64(r) + vget_low_u64(r));
}

typedef int64x1_t ssd64_internal;

SIMD_INLINE ssd64_internal v64_ssd_u8_init() {
  return (ssd64_internal)(uint64_t)0;
}

/* Implementation dependent return value. Result must be finalised with
 * v64_ssd_u8_sum(). */
SIMD_INLINE ssd64_internal v64_ssd_u8(ssd64_internal s, v64 a, v64 b) {
  uint8x8_t t = vabd_u8(vreinterpret_u8_s64(a), vreinterpret_u8_s64(b));
  uint64x2_t r = vpaddlq_u32(vpaddlq_u16(vmull_u8(t, t)));
  return vadd_u64(s, vadd_u64(vget_high_u64(r), vget_low_u64(r)));
}

SIMD_INLINE uint32_t v64_ssd_u8_sum(ssd64_internal s) {
  return (uint32_t)(uint64_t)s;
}

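/* Note (added commentary, not part of the original header): v64_sad_u8
   accumulates |a - b| into eight 16-bit lanes via vabal_u8; the conservative
   32-call limit noted above keeps the 16-bit lanes safely away from overflow
   before v64_sad_u8_sum widens and folds them. */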
SIMD_INLINE v64 v64_or(v64 x, v64 y) { return vorr_s64(x, y); }

SIMD_INLINE v64 v64_xor(v64 x, v64 y) { return veor_s64(x, y); }

SIMD_INLINE v64 v64_and(v64 x, v64 y) { return vand_s64(x, y); }

SIMD_INLINE v64 v64_andn(v64 x, v64 y) { return vbic_s64(x, y); }

SIMD_INLINE v64 v64_add_8(v64 x, v64 y) {
  return vreinterpret_s64_u8(
      vadd_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
}

SIMD_INLINE v64 v64_add_16(v64 x, v64 y) {
  return vreinterpret_s64_s16(
      vadd_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
}

SIMD_INLINE v64 v64_sadd_s16(v64 x, v64 y) {
  return vreinterpret_s64_s16(
      vqadd_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
}

SIMD_INLINE v64 v64_add_32(v64 x, v64 y) {
  return vreinterpret_s64_u32(
      vadd_u32(vreinterpret_u32_s64(x), vreinterpret_u32_s64(y)));
}

SIMD_INLINE v64 v64_sub_8(v64 x, v64 y) {
  return vreinterpret_s64_u8(
      vsub_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
}

SIMD_INLINE v64 v64_sub_16(v64 x, v64 y) {
  return vreinterpret_s64_s16(
      vsub_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
}

SIMD_INLINE v64 v64_ssub_s16(v64 x, v64 y) {
  return vreinterpret_s64_s16(
      vqsub_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
}

SIMD_INLINE v64 v64_ssub_u8(v64 x, v64 y) {
  return vreinterpret_s64_u8(
      vqsub_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
}

SIMD_INLINE v64 v64_ssub_s8(v64 x, v64 y) {
  return vreinterpret_s64_s8(
      vqsub_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y)));
}

SIMD_INLINE v64 v64_sub_32(v64 x, v64 y) {
  return vreinterpret_s64_s32(
      vsub_s32(vreinterpret_s32_s64(x), vreinterpret_s32_s64(y)));
}

SIMD_INLINE v64 v64_abs_s16(v64 x) {
  return vreinterpret_s64_s16(vabs_s16(vreinterpret_s16_s64(x)));
}

SIMD_INLINE v64 v64_mullo_s16(v64 x, v64 y) {
  return vreinterpret_s64_s16(
      vmul_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
}

SIMD_INLINE v64 v64_mulhi_s16(v64 x, v64 y) {
  return vreinterpret_s64_s16(vmovn_s32(vshrq_n_s32(
      vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)), 16)));
}

SIMD_INLINE v64 v64_mullo_s32(v64 x, v64 y) {
  return vreinterpret_s64_s32(
      vmul_s32(vreinterpret_s32_s64(x), vreinterpret_s32_s64(y)));
}

SIMD_INLINE v64 v64_madd_s16(v64 x, v64 y) {
  int32x4_t t = vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y));
  return vreinterpret_s64_s32(
      vpadd_s32(vreinterpret_s32_s64(vget_low_s64(vreinterpretq_s64_s32(t))),
                vreinterpret_s32_s64(vget_high_s64(vreinterpretq_s64_s32(t)))));
}

SIMD_INLINE v64 v64_madd_us8(v64 x, v64 y) {
  return vreinterpret_s64_s16(vqmovn_s32(vpaddlq_s16(
      vaddq_s16(vmull_s8(vadd_s8(vreinterpret_s8_s64(x), vdup_n_s8(-128)),
                         vreinterpret_s8_s64(y)),
                vshlq_n_s16(vmovl_s8(vreinterpret_s8_s64(y)), 7)))));
}

SIMD_INLINE v64 v64_avg_u8(v64 x, v64 y) {
  return vreinterpret_s64_u8(
      vrhadd_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
}

SIMD_INLINE v64 v64_rdavg_u8(v64 x, v64 y) {
  return vreinterpret_s64_u8(
      vhadd_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
}

SIMD_INLINE v64 v64_avg_u16(v64 x, v64 y) {
  return vreinterpret_s64_u16(
      vrhadd_u16(vreinterpret_u16_s64(x), vreinterpret_u16_s64(y)));
}

SIMD_INLINE v64 v64_max_u8(v64 x, v64 y) {
  return vreinterpret_s64_u8(
      vmax_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
}

SIMD_INLINE v64 v64_min_u8(v64 x, v64 y) {
  return vreinterpret_s64_u8(
      vmin_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
}

SIMD_INLINE v64 v64_max_s8(v64 x, v64 y) {
  return vreinterpret_s64_s8(
      vmax_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y)));
}

SIMD_INLINE v64 v64_min_s8(v64 x, v64 y) {
  return vreinterpret_s64_s8(
      vmin_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y)));
}

SIMD_INLINE v64 v64_max_s16(v64 x, v64 y) {
  return vreinterpret_s64_s16(
      vmax_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
}

SIMD_INLINE v64 v64_min_s16(v64 x, v64 y) {
  return vreinterpret_s64_s16(
      vmin_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
}

SIMD_INLINE v64 v64_ziplo_8(v64 x, v64 y) {
  uint8x8x2_t r = vzip_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x));
  return vreinterpret_s64_u8(r.val[0]);
}

SIMD_INLINE v64 v64_ziphi_8(v64 x, v64 y) {
  uint8x8x2_t r = vzip_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x));
  return vreinterpret_s64_u8(r.val[1]);
}

SIMD_INLINE v64 v64_ziplo_16(v64 x, v64 y) {
  int16x4x2_t r = vzip_s16(vreinterpret_s16_s64(y), vreinterpret_s16_s64(x));
  return vreinterpret_s64_s16(r.val[0]);
}

SIMD_INLINE v64 v64_ziphi_16(v64 x, v64 y) {
  int16x4x2_t r = vzip_s16(vreinterpret_s16_s64(y), vreinterpret_s16_s64(x));
  return vreinterpret_s64_s16(r.val[1]);
}

SIMD_INLINE v64 v64_ziplo_32(v64 x, v64 y) {
  int32x2x2_t r = vzip_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x));
  return vreinterpret_s64_s32(r.val[0]);
}

SIMD_INLINE v64 v64_ziphi_32(v64 x, v64 y) {
  int32x2x2_t r = vzip_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x));
  return vreinterpret_s64_s32(r.val[1]);
}

SIMD_INLINE v64 v64_unpacklo_u8_s16(v64 a) {
  return vreinterpret_s64_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_s64(a))));
}

SIMD_INLINE v64 v64_unpackhi_u8_s16(v64 a) {
  return vreinterpret_s64_u16(vget_high_u16(vmovl_u8(vreinterpret_u8_s64(a))));
}

SIMD_INLINE v64 v64_pack_s32_s16(v64 x, v64 y) {
  return vreinterpret_s64_s16(vqmovn_s32(
      vcombine_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x))));
}

SIMD_INLINE v64 v64_pack_s16_u8(v64 x, v64 y) {
  return vreinterpret_s64_u8(vqmovun_s16(vreinterpretq_s16_s32(
      vcombine_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x)))));
}

SIMD_INLINE v64 v64_pack_s16_s8(v64 x, v64 y) {
  return vreinterpret_s64_s8(vqmovn_s16(vreinterpretq_s16_s32(
      vcombine_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x)))));
}

SIMD_INLINE v64 v64_unziplo_8(v64 x, v64 y) {
  uint8x8x2_t r = vuzp_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x));
  return vreinterpret_s64_u8(r.val[0]);
}

SIMD_INLINE v64 v64_unziphi_8(v64 x, v64 y) {
  uint8x8x2_t r = vuzp_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x));
  return vreinterpret_s64_u8(r.val[1]);
}

SIMD_INLINE v64 v64_unziplo_16(v64 x, v64 y) {
  uint16x4x2_t r = vuzp_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x));
  return vreinterpret_s64_u16(r.val[0]);
}

SIMD_INLINE v64 v64_unziphi_16(v64 x, v64 y) {
  uint16x4x2_t r = vuzp_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x));
  return vreinterpret_s64_u16(r.val[1]);
}

SIMD_INLINE v64 v64_unpacklo_s16_s32(v64 x) {
  return vreinterpret_s64_s32(vget_low_s32(vmovl_s16(vreinterpret_s16_s64(x))));
}

SIMD_INLINE v64 v64_unpacklo_u16_s32(v64 x) {
  return vreinterpret_s64_u32(vget_low_u32(vmovl_u16(vreinterpret_u16_s64(x))));
}

SIMD_INLINE v64 v64_unpackhi_s16_s32(v64 x) {
  return vreinterpret_s64_s32(
      vget_high_s32(vmovl_s16(vreinterpret_s16_s64(x))));
}

SIMD_INLINE v64 v64_unpackhi_u16_s32(v64 x) {
  return vreinterpret_s64_u32(
      vget_high_u32(vmovl_u16(vreinterpret_u16_s64(x))));
}

SIMD_INLINE v64 v64_shuffle_8(v64 x, v64 pattern) {
  return vreinterpret_s64_u8(
      vtbl1_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(pattern)));
}

SIMD_INLINE v64 v64_cmpgt_s8(v64 x, v64 y) {
  return vreinterpret_s64_u8(
      vcgt_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y)));
}

SIMD_INLINE v64 v64_cmplt_s8(v64 x, v64 y) {
  return vreinterpret_s64_u8(
      vclt_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y)));
}

SIMD_INLINE v64 v64_cmpeq_8(v64 x, v64 y) {
  return vreinterpret_s64_u8(
      vceq_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
}

SIMD_INLINE v64 v64_cmpgt_s16(v64 x, v64 y) {
  return vreinterpret_s64_u16(
      vcgt_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
}

SIMD_INLINE v64 v64_cmplt_s16(v64 x, v64 y) {
  return vreinterpret_s64_u16(
      vclt_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
}

SIMD_INLINE v64 v64_cmpeq_16(v64 x, v64 y) {
  return vreinterpret_s64_u16(
      vceq_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
}

SIMD_INLINE v64 v64_shl_8(v64 a, unsigned int c) {
  return vreinterpret_s64_u8(vshl_u8(vreinterpret_u8_s64(a), vdup_n_s8(c)));
}

SIMD_INLINE v64 v64_shr_u8(v64 a, unsigned int c) {
  return vreinterpret_s64_u8(vshl_u8(vreinterpret_u8_s64(a), vdup_n_s8(-c)));
}

SIMD_INLINE v64 v64_shr_s8(v64 a, unsigned int c) {
  return vreinterpret_s64_s8(vshl_s8(vreinterpret_s8_s64(a), vdup_n_s8(-c)));
}

SIMD_INLINE v64 v64_shl_16(v64 a, unsigned int c) {
  return vreinterpret_s64_u16(vshl_u16(vreinterpret_u16_s64(a), vdup_n_s16(c)));
}

SIMD_INLINE v64 v64_shr_u16(v64 a, unsigned int c) {
  return vreinterpret_s64_u16(
      vshl_u16(vreinterpret_u16_s64(a), vdup_n_s16(-(int)c)));
}

SIMD_INLINE v64 v64_shr_s16(v64 a, unsigned int c) {
  return vreinterpret_s64_s16(
      vshl_s16(vreinterpret_s16_s64(a), vdup_n_s16(-(int)c)));
}

SIMD_INLINE v64 v64_shl_32(v64 a, unsigned int c) {
  return vreinterpret_s64_u32(vshl_u32(vreinterpret_u32_s64(a), vdup_n_s32(c)));
}

SIMD_INLINE v64 v64_shr_u32(v64 a, unsigned int c) {
  return vreinterpret_s64_u32(
      vshl_u32(vreinterpret_u32_s64(a), vdup_n_s32(-(int)c)));
}

SIMD_INLINE v64 v64_shr_s32(v64 a, unsigned int c) {
  return vreinterpret_s64_s32(
      vshl_s32(vreinterpret_s32_s64(a), vdup_n_s32(-(int)c)));
}

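/* Note (added commentary, not part of the original header): NEON has no
   separate variable right-shift instruction, so the v64_shr_* helpers above
   use vshl with a negated per-lane shift count, which shifts right. */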
// The following functions require an immediate.
// Some compilers will check this during optimisation, others won't.
#if __OPTIMIZE__ && !__clang__

SIMD_INLINE v64 v64_shl_n_byte(v64 a, const unsigned int c) {
  return vshl_n_s64(a, c * 8);
}

SIMD_INLINE v64 v64_shr_n_byte(v64 a, const unsigned int c) {
  return c ? (v64)vshr_n_u64(vreinterpret_u64_s64(a), c * 8) : a;
}

SIMD_INLINE v64 v64_shl_n_8(v64 a, const unsigned int c) {
  return vreinterpret_s64_u8(vshl_n_u8(vreinterpret_u8_s64(a), c));
}

SIMD_INLINE v64 v64_shr_n_u8(v64 a, const unsigned int c) {
  return vreinterpret_s64_u8(vshr_n_u8(vreinterpret_u8_s64(a), c));
}

SIMD_INLINE v64 v64_shr_n_s8(v64 a, const unsigned int c) {
  return vreinterpret_s64_s8(vshr_n_s8(vreinterpret_s8_s64(a), c));
}

SIMD_INLINE v64 v64_shl_n_16(v64 a, const unsigned int c) {
  return vreinterpret_s64_u16(vshl_n_u16(vreinterpret_u16_s64(a), c));
}

SIMD_INLINE v64 v64_shr_n_u16(v64 a, const unsigned int c) {
  return vreinterpret_s64_u16(vshr_n_u16(vreinterpret_u16_s64(a), c));
}

SIMD_INLINE v64 v64_shr_n_s16(v64 a, const unsigned int c) {
  return vreinterpret_s64_s16(vshr_n_s16(vreinterpret_s16_s64(a), c));
}

SIMD_INLINE v64 v64_shl_n_32(v64 a, const unsigned int c) {
  return vreinterpret_s64_u32(vshl_n_u32(vreinterpret_u32_s64(a), c));
}

SIMD_INLINE v64 v64_shr_n_u32(v64 a, const unsigned int c) {
  return vreinterpret_s64_u32(vshr_n_u32(vreinterpret_u32_s64(a), c));
}

SIMD_INLINE v64 v64_shr_n_s32(v64 a, const unsigned int c) {
  return vreinterpret_s64_s32(vshr_n_s32(vreinterpret_s32_s64(a), c));
}

#else

SIMD_INLINE v64 v64_shl_n_byte(v64 a, const unsigned int c) {
  return v64_from_64(v64_u64(a) << c * 8);
}

SIMD_INLINE v64 v64_shr_n_byte(v64 a, const unsigned int c) {
  return v64_from_64(v64_u64(a) >> c * 8);
}

SIMD_INLINE v64 v64_shl_n_8(v64 a, const unsigned int c) {
  return v64_shl_8(a, c);
}

SIMD_INLINE v64 v64_shr_n_u8(v64 a, const unsigned int c) {
  return v64_shr_u8(a, c);
}

SIMD_INLINE v64 v64_shr_n_s8(v64 a, const unsigned int c) {
  return v64_shr_s8(a, c);
}

SIMD_INLINE v64 v64_shl_n_16(v64 a, const unsigned int c) {
  return v64_shl_16(a, c);
}

SIMD_INLINE v64 v64_shr_n_u16(v64 a, const unsigned int c) {
  return v64_shr_u16(a, c);
}

SIMD_INLINE v64 v64_shr_n_s16(v64 a, const unsigned int c) {
  return v64_shr_s16(a, c);
}

SIMD_INLINE v64 v64_shl_n_32(v64 a, const unsigned int c) {
  return v64_shl_32(a, c);
}

SIMD_INLINE v64 v64_shr_n_u32(v64 a, const unsigned int c) {
  return v64_shr_u32(a, c);
}

SIMD_INLINE v64 v64_shr_n_s32(v64 a, const unsigned int c) {
  return v64_shr_s32(a, c);
}

#endif

#endif /* _V64_INTRINSICS_H */
@@ -1,887 +0,0 @@
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef _V64_INTRINSICS_C_H
#define _V64_INTRINSICS_C_H

/* Note: This implements the intrinsics in plain, unoptimised C.
   Intended for reference, porting or debugging. */

#include <stdio.h>
#include <stdlib.h>
#include "./aom_config.h"

extern const int simd_check;

typedef union {
  uint8_t u8[8];
  uint16_t u16[4];
  uint32_t u32[2];
  uint64_t u64;
  int8_t s8[8];
  int16_t s16[4];
  int32_t s32[2];
  int64_t s64;
} c_v64;

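/* Note (added commentary, not part of the original header): all lane views in
   the union alias the same 8 bytes; the CONFIG_BIG_ENDIAN-based indexing used
   below keeps the "low" and "high" halves consistent with the little-endian
   SIMD back ends when this reference code runs on big-endian targets. */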
SIMD_INLINE uint32_t c_v64_low_u32(c_v64 a) { return a.u32[CONFIG_BIG_ENDIAN]; }

SIMD_INLINE uint32_t c_v64_high_u32(c_v64 a) {
  return a.u32[!CONFIG_BIG_ENDIAN];
}

SIMD_INLINE int32_t c_v64_low_s32(c_v64 a) { return a.s32[CONFIG_BIG_ENDIAN]; }

SIMD_INLINE int32_t c_v64_high_s32(c_v64 a) {
  return a.s32[!CONFIG_BIG_ENDIAN];
}

SIMD_INLINE c_v64 c_v64_from_32(uint32_t x, uint32_t y) {
  c_v64 t;
  t.u32[!CONFIG_BIG_ENDIAN] = x;
  t.u32[CONFIG_BIG_ENDIAN] = y;
  return t;
}

SIMD_INLINE c_v64 c_v64_from_64(uint64_t x) {
  c_v64 t;
  t.u64 = x;
  return t;
}

SIMD_INLINE uint64_t c_v64_u64(c_v64 x) { return x.u64; }

SIMD_INLINE c_v64 c_v64_from_16(uint16_t a, uint16_t b, uint16_t c,
                                uint16_t d) {
  c_v64 t;
  if (CONFIG_BIG_ENDIAN) {
    t.u16[0] = a;
    t.u16[1] = b;
    t.u16[2] = c;
    t.u16[3] = d;
  } else {
    t.u16[3] = a;
    t.u16[2] = b;
    t.u16[1] = c;
    t.u16[0] = d;
  }
  return t;
}

SIMD_INLINE uint32_t c_u32_load_unaligned(const void *p) {
  uint32_t t;
  uint8_t *pp = (uint8_t *)p;
  uint8_t *q = (uint8_t *)&t;
  int c;
  for (c = 0; c < 4; c++) q[c] = pp[c];
  return t;
}

SIMD_INLINE void c_u32_store_unaligned(void *p, uint32_t a) {
  uint8_t *pp = (uint8_t *)p;
  uint8_t *q = (uint8_t *)&a;
  int c;
  for (c = 0; c < 4; c++) pp[c] = q[c];
}

SIMD_INLINE uint32_t c_u32_load_aligned(const void *p) {
  if (simd_check && (uintptr_t)p & 3) {
    fprintf(stderr, "Error: Unaligned u32 load at %p\n", p);
    abort();
  }
  return c_u32_load_unaligned(p);
}

SIMD_INLINE void c_u32_store_aligned(void *p, uint32_t a) {
  if (simd_check && (uintptr_t)p & 3) {
    fprintf(stderr, "Error: Unaligned u32 store at %p\n", p);
    abort();
  }
  c_u32_store_unaligned(p, a);
}

SIMD_INLINE c_v64 c_v64_load_unaligned(const void *p) {
  c_v64 t;
  uint8_t *pp = (uint8_t *)p;
  uint8_t *q = (uint8_t *)&t;
  int c;
  for (c = 0; c < 8; c++) q[c] = pp[c];
  return t;
}

SIMD_INLINE c_v64 c_v64_load_aligned(const void *p) {
  if (simd_check && (uintptr_t)p & 7) {
    fprintf(stderr, "Error: Unaligned c_v64 load at %p\n", p);
    abort();
  }
  return c_v64_load_unaligned(p);
}

SIMD_INLINE void c_v64_store_unaligned(void *p, c_v64 a) {
  uint8_t *q = (uint8_t *)p;
  uint8_t *r = (uint8_t *)&a;
  int c;
  for (c = 0; c < 8; c++) q[c] = r[c];
}

SIMD_INLINE void c_v64_store_aligned(void *p, c_v64 a) {
  if (simd_check && (uintptr_t)p & 7) {
    fprintf(stderr, "Error: Unaligned c_v64 store at %p\n", p);
    abort();
  }
  c_v64_store_unaligned(p, a);
}

SIMD_INLINE c_v64 c_v64_zero() {
|
||||
c_v64 t;
|
||||
t.u64 = 0;
|
||||
return t;
|
||||
}
|
||||
|
||||
SIMD_INLINE c_v64 c_v64_dup_8(uint8_t x) {
|
||||
c_v64 t;
|
||||
t.u8[0] = t.u8[1] = t.u8[2] = t.u8[3] = t.u8[4] = t.u8[5] = t.u8[6] =
|
||||
t.u8[7] = x;
|
||||
return t;
|
||||
}
|
||||
|
||||
SIMD_INLINE c_v64 c_v64_dup_16(uint16_t x) {
|
||||
c_v64 t;
|
||||
t.u16[0] = t.u16[1] = t.u16[2] = t.u16[3] = x;
|
||||
return t;
|
||||
}
|
||||
|
||||
SIMD_INLINE c_v64 c_v64_dup_32(uint32_t x) {
|
||||
c_v64 t;
|
||||
t.u32[0] = t.u32[1] = x;
|
||||
return t;
|
||||
}
|
||||
|
||||
SIMD_INLINE c_v64 c_v64_add_8(c_v64 a, c_v64 b) {
|
||||
c_v64 t;
|
||||
int c;
|
||||
for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] + b.u8[c];
|
||||
return t;
|
||||
}
|
||||
|
||||
SIMD_INLINE c_v64 c_v64_add_16(c_v64 a, c_v64 b) {
|
||||
c_v64 t;
|
||||
int c;
|
||||
for (c = 0; c < 4; c++) t.u16[c] = a.u16[c] + b.u16[c];
|
||||
return t;
|
||||
}
|
||||
|
||||
SIMD_INLINE c_v64 c_v64_sadd_s16(c_v64 a, c_v64 b) {
|
||||
c_v64 t;
|
||||
int c;
|
||||
for (c = 0; c < 4; c++)
|
||||
t.s16[c] = (int32_t)a.s16[c] + (int32_t)b.s16[c] > 32767
|
||||
? 32767
|
||||
: (int32_t)a.s16[c] + (int32_t)b.s16[c] < -32768
|
||||
? -32768
|
||||
: (int32_t)a.s16[c] + (int32_t)b.s16[c];
|
||||
return t;
|
||||
}
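
/* Example: saturating adds clamp instead of wrapping. With
   a = c_v64_dup_16(30000) and b = c_v64_dup_16(5000),
   c_v64_sadd_s16(a, b) yields 32767 in every lane, whereas
   c_v64_add_16(a, b) wraps modulo 2^16 (to -30536 when read as s16). */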

SIMD_INLINE c_v64 c_v64_add_32(c_v64 a, c_v64 b) {
  c_v64 t;
  t.u32[0] = a.u32[0] + b.u32[0];
  t.u32[1] = a.u32[1] + b.u32[1];
  return t;
}

SIMD_INLINE c_v64 c_v64_sub_8(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] - b.u8[c];
  return t;
}

SIMD_INLINE c_v64 c_v64_ssub_u8(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++)
    t.u8[c] = (int32_t)((uint32_t)a.u8[c] - (uint32_t)b.u8[c]) < 0
                  ? 0
                  : a.u8[c] - b.u8[c];
  return t;
}

SIMD_INLINE c_v64 c_v64_ssub_s8(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++) {
    int16_t d = (int16_t)a.s8[c] - (int16_t)b.s8[c];
    t.s8[c] = d > 127 ? 127 : (d < -128 ? -128 : d);
  }
  return t;
}

SIMD_INLINE c_v64 c_v64_sub_16(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 4; c++) t.u16[c] = a.u16[c] - b.u16[c];
  return t;
}

SIMD_INLINE c_v64 c_v64_ssub_s16(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 4; c++)
    t.s16[c] = (int32_t)a.s16[c] - (int32_t)b.s16[c] < -32768
                   ? -32768
                   : (int32_t)a.s16[c] - (int32_t)b.s16[c] > 32767
                         ? 32767
                         : (int32_t)a.s16[c] - (int32_t)b.s16[c];
  return t;
}

SIMD_INLINE c_v64 c_v64_sub_32(c_v64 a, c_v64 b) {
  c_v64 t;
  t.u32[0] = a.u32[0] - b.u32[0];
  t.u32[1] = a.u32[1] - b.u32[1];
  return t;
}

SIMD_INLINE c_v64 c_v64_abs_s16(c_v64 a) {
  c_v64 t;
  int c;
  for (c = 0; c < 4; c++)
    t.u16[c] = (int16_t)a.u16[c] > 0 ? a.u16[c] : -a.u16[c];
  return t;
}

SIMD_INLINE c_v64 _c_v64_zip_8(c_v64 a, c_v64 b, int mode) {
  c_v64 t;
  if (mode) {
    t.u8[7] = a.u8[7];
    t.u8[6] = b.u8[7];
    t.u8[5] = a.u8[6];
    t.u8[4] = b.u8[6];
    t.u8[3] = a.u8[5];
    t.u8[2] = b.u8[5];
    t.u8[1] = a.u8[4];
    t.u8[0] = b.u8[4];
  } else {
    t.u8[7] = a.u8[3];
    t.u8[6] = b.u8[3];
    t.u8[5] = a.u8[2];
    t.u8[4] = b.u8[2];
    t.u8[3] = a.u8[1];
    t.u8[2] = b.u8[1];
    t.u8[1] = a.u8[0];
    t.u8[0] = b.u8[0];
  }
  return t;
}

SIMD_INLINE c_v64 c_v64_ziplo_8(c_v64 a, c_v64 b) {
  return CONFIG_BIG_ENDIAN ? _c_v64_zip_8(b, a, 1) : _c_v64_zip_8(a, b, 0);
}

SIMD_INLINE c_v64 c_v64_ziphi_8(c_v64 a, c_v64 b) {
  return CONFIG_BIG_ENDIAN ? _c_v64_zip_8(b, a, 0) : _c_v64_zip_8(a, b, 1);
}

SIMD_INLINE c_v64 _c_v64_zip_16(c_v64 a, c_v64 b, int mode) {
  c_v64 t;
  if (mode) {
    t.u16[3] = a.u16[3];
    t.u16[2] = b.u16[3];
    t.u16[1] = a.u16[2];
    t.u16[0] = b.u16[2];
  } else {
    t.u16[3] = a.u16[1];
    t.u16[2] = b.u16[1];
    t.u16[1] = a.u16[0];
    t.u16[0] = b.u16[0];
  }
  return t;
}

SIMD_INLINE c_v64 c_v64_ziplo_16(c_v64 a, c_v64 b) {
  return CONFIG_BIG_ENDIAN ? _c_v64_zip_16(b, a, 1) : _c_v64_zip_16(a, b, 0);
}

SIMD_INLINE c_v64 c_v64_ziphi_16(c_v64 a, c_v64 b) {
  return CONFIG_BIG_ENDIAN ? _c_v64_zip_16(b, a, 0) : _c_v64_zip_16(a, b, 1);
}

SIMD_INLINE c_v64 _c_v64_zip_32(c_v64 a, c_v64 b, int mode) {
  c_v64 t;
  if (mode) {
    t.u32[1] = a.u32[1];
    t.u32[0] = b.u32[1];
  } else {
    t.u32[1] = a.u32[0];
    t.u32[0] = b.u32[0];
  }
  return t;
}

SIMD_INLINE c_v64 c_v64_ziplo_32(c_v64 a, c_v64 b) {
  return CONFIG_BIG_ENDIAN ? _c_v64_zip_32(b, a, 1) : _c_v64_zip_32(a, b, 0);
}

SIMD_INLINE c_v64 c_v64_ziphi_32(c_v64 a, c_v64 b) {
  return CONFIG_BIG_ENDIAN ? _c_v64_zip_32(b, a, 0) : _c_v64_zip_32(a, b, 1);
}

SIMD_INLINE c_v64 _c_v64_unzip_8(c_v64 a, c_v64 b, int mode) {
  c_v64 t;
  if (mode) {
    t.u8[7] = b.u8[7];
    t.u8[6] = b.u8[5];
    t.u8[5] = b.u8[3];
    t.u8[4] = b.u8[1];
    t.u8[3] = a.u8[7];
    t.u8[2] = a.u8[5];
    t.u8[1] = a.u8[3];
    t.u8[0] = a.u8[1];
  } else {
    t.u8[7] = a.u8[6];
    t.u8[6] = a.u8[4];
    t.u8[5] = a.u8[2];
    t.u8[4] = a.u8[0];
    t.u8[3] = b.u8[6];
    t.u8[2] = b.u8[4];
    t.u8[1] = b.u8[2];
    t.u8[0] = b.u8[0];
  }
  return t;
}

SIMD_INLINE c_v64 c_v64_unziplo_8(c_v64 a, c_v64 b) {
  return CONFIG_BIG_ENDIAN ? _c_v64_unzip_8(a, b, 1) : _c_v64_unzip_8(a, b, 0);
}

SIMD_INLINE c_v64 c_v64_unziphi_8(c_v64 a, c_v64 b) {
  return CONFIG_BIG_ENDIAN ? _c_v64_unzip_8(b, a, 0) : _c_v64_unzip_8(b, a, 1);
}

SIMD_INLINE c_v64 _c_v64_unzip_16(c_v64 a, c_v64 b, int mode) {
  c_v64 t;
  if (mode) {
    t.u16[3] = b.u16[3];
    t.u16[2] = b.u16[1];
    t.u16[1] = a.u16[3];
    t.u16[0] = a.u16[1];
  } else {
    t.u16[3] = a.u16[2];
    t.u16[2] = a.u16[0];
    t.u16[1] = b.u16[2];
    t.u16[0] = b.u16[0];
  }
  return t;
}

SIMD_INLINE c_v64 c_v64_unziplo_16(c_v64 a, c_v64 b) {
  return CONFIG_BIG_ENDIAN ? _c_v64_unzip_16(a, b, 1)
                           : _c_v64_unzip_16(a, b, 0);
}

SIMD_INLINE c_v64 c_v64_unziphi_16(c_v64 a, c_v64 b) {
  return CONFIG_BIG_ENDIAN ? _c_v64_unzip_16(b, a, 0)
                           : _c_v64_unzip_16(b, a, 1);
}

SIMD_INLINE c_v64 c_v64_unpacklo_u8_s16(c_v64 a) {
  c_v64 t;
  int endian = !!CONFIG_BIG_ENDIAN * 4;
  t.s16[3] = (int16_t)a.u8[3 + endian];
  t.s16[2] = (int16_t)a.u8[2 + endian];
  t.s16[1] = (int16_t)a.u8[1 + endian];
  t.s16[0] = (int16_t)a.u8[0 + endian];
  return t;
}

SIMD_INLINE c_v64 c_v64_unpackhi_u8_s16(c_v64 a) {
  c_v64 t;
  int endian = !!CONFIG_BIG_ENDIAN * 4;
  t.s16[3] = (int16_t)a.u8[7 - endian];
  t.s16[2] = (int16_t)a.u8[6 - endian];
  t.s16[1] = (int16_t)a.u8[5 - endian];
  t.s16[0] = (int16_t)a.u8[4 - endian];
  return t;
}

SIMD_INLINE c_v64 c_v64_pack_s32_s16(c_v64 a, c_v64 b) {
  c_v64 t;
  if (CONFIG_BIG_ENDIAN) {
    c_v64 u = a;
    a = b;
    b = u;
  }
  t.s16[3] = a.s32[1] > 32767 ? 32767 : a.s32[1] < -32768 ? -32768 : a.s32[1];
  t.s16[2] = a.s32[0] > 32767 ? 32767 : a.s32[0] < -32768 ? -32768 : a.s32[0];
  t.s16[1] = b.s32[1] > 32767 ? 32767 : b.s32[1] < -32768 ? -32768 : b.s32[1];
  t.s16[0] = b.s32[0] > 32767 ? 32767 : b.s32[0] < -32768 ? -32768 : b.s32[0];
  return t;
}

SIMD_INLINE c_v64 c_v64_pack_s16_u8(c_v64 a, c_v64 b) {
  c_v64 t;
  if (CONFIG_BIG_ENDIAN) {
    c_v64 u = a;
    a = b;
    b = u;
  }
  t.u8[7] = a.s16[3] > 255 ? 255 : a.s16[3] < 0 ? 0 : a.s16[3];
  t.u8[6] = a.s16[2] > 255 ? 255 : a.s16[2] < 0 ? 0 : a.s16[2];
  t.u8[5] = a.s16[1] > 255 ? 255 : a.s16[1] < 0 ? 0 : a.s16[1];
  t.u8[4] = a.s16[0] > 255 ? 255 : a.s16[0] < 0 ? 0 : a.s16[0];
  t.u8[3] = b.s16[3] > 255 ? 255 : b.s16[3] < 0 ? 0 : b.s16[3];
  t.u8[2] = b.s16[2] > 255 ? 255 : b.s16[2] < 0 ? 0 : b.s16[2];
  t.u8[1] = b.s16[1] > 255 ? 255 : b.s16[1] < 0 ? 0 : b.s16[1];
  t.u8[0] = b.s16[0] > 255 ? 255 : b.s16[0] < 0 ? 0 : b.s16[0];
  return t;
}

SIMD_INLINE c_v64 c_v64_pack_s16_s8(c_v64 a, c_v64 b) {
  c_v64 t;
  if (CONFIG_BIG_ENDIAN) {
    c_v64 u = a;
    a = b;
    b = u;
  }
  t.u8[7] = a.s16[3] > 127 ? 127 : a.s16[3] < -128 ? 128 : a.s16[3];
  t.u8[6] = a.s16[2] > 127 ? 127 : a.s16[2] < -128 ? 128 : a.s16[2];
  t.u8[5] = a.s16[1] > 127 ? 127 : a.s16[1] < -128 ? 128 : a.s16[1];
  t.u8[4] = a.s16[0] > 127 ? 127 : a.s16[0] < -128 ? 128 : a.s16[0];
  t.u8[3] = b.s16[3] > 127 ? 127 : b.s16[3] < -128 ? 128 : b.s16[3];
  t.u8[2] = b.s16[2] > 127 ? 127 : b.s16[2] < -128 ? 128 : b.s16[2];
  t.u8[1] = b.s16[1] > 127 ? 127 : b.s16[1] < -128 ? 128 : b.s16[1];
  t.u8[0] = b.s16[0] > 127 ? 127 : b.s16[0] < -128 ? 128 : b.s16[0];
  return t;
}

SIMD_INLINE c_v64 c_v64_unpacklo_u16_s32(c_v64 a) {
  c_v64 t;
  t.s32[1] = a.u16[1 + !!CONFIG_BIG_ENDIAN * 2];
  t.s32[0] = a.u16[0 + !!CONFIG_BIG_ENDIAN * 2];
  return t;
}

SIMD_INLINE c_v64 c_v64_unpacklo_s16_s32(c_v64 a) {
  c_v64 t;
  t.s32[1] = a.s16[1 + !!CONFIG_BIG_ENDIAN * 2];
  t.s32[0] = a.s16[0 + !!CONFIG_BIG_ENDIAN * 2];
  return t;
}

SIMD_INLINE c_v64 c_v64_unpackhi_u16_s32(c_v64 a) {
  c_v64 t;
  t.s32[1] = a.u16[3 - !!CONFIG_BIG_ENDIAN * 2];
  t.s32[0] = a.u16[2 - !!CONFIG_BIG_ENDIAN * 2];
  return t;
}

SIMD_INLINE c_v64 c_v64_unpackhi_s16_s32(c_v64 a) {
  c_v64 t;
  t.s32[1] = a.s16[3 - !!CONFIG_BIG_ENDIAN * 2];
  t.s32[0] = a.s16[2 - !!CONFIG_BIG_ENDIAN * 2];
  return t;
}

SIMD_INLINE c_v64 c_v64_shuffle_8(c_v64 a, c_v64 pattern) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++) {
    if (simd_check && (pattern.u8[c] & ~7)) {
      fprintf(stderr, "Error: Undefined v64_shuffle_8 index %d/%d\n",
              pattern.u8[c], c);
      abort();
    }
    t.u8[c] =
        a.u8[CONFIG_BIG_ENDIAN ? 7 - (pattern.u8[c] & 7) : pattern.u8[c] & 7];
  }
  return t;
}
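
/* Example: the pattern selects source bytes by index, so a descending index
   vector reverses the bytes. On a little-endian build, with
   v = c_v64_from_64(0x0807060504030201ULL) and
   p = c_v64_from_64(0x0001020304050607ULL) (indices 7..0 from lane 0 up),
   c_v64_shuffle_8(v, p).u64 == 0x0102030405060708. */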

SIMD_INLINE int64_t c_v64_dotp_su8(c_v64 a, c_v64 b) {
  return a.s8[7] * b.u8[7] + a.s8[6] * b.u8[6] + a.s8[5] * b.u8[5] +
         a.s8[4] * b.u8[4] + a.s8[3] * b.u8[3] + a.s8[2] * b.u8[2] +
         a.s8[1] * b.u8[1] + a.s8[0] * b.u8[0];
}

SIMD_INLINE int64_t c_v64_dotp_s16(c_v64 a, c_v64 b) {
  return (int64_t)(a.s16[3] * b.s16[3] + a.s16[2] * b.s16[2]) +
         (int64_t)(a.s16[1] * b.s16[1] + a.s16[0] * b.s16[0]);
}

SIMD_INLINE uint64_t c_v64_hadd_u8(c_v64 a) {
  return a.u8[7] + a.u8[6] + a.u8[5] + a.u8[4] + a.u8[3] + a.u8[2] + a.u8[1] +
         a.u8[0];
}

SIMD_INLINE int64_t c_v64_hadd_s16(c_v64 a) {
  return a.s16[3] + a.s16[2] + a.s16[1] + a.s16[0];
}

typedef uint32_t c_sad64_internal;

/* Implementation dependent return value. Result must be finalised with
   v64_sad_u8_sum().
   The result for more than 32 v64_sad_u8() calls is undefined. */
SIMD_INLINE c_sad64_internal c_v64_sad_u8_init() { return 0; }

SIMD_INLINE c_sad64_internal c_v64_sad_u8(c_sad64_internal s, c_v64 a,
                                          c_v64 b) {
  int c;
  for (c = 0; c < 8; c++)
    s += a.u8[c] > b.u8[c] ? a.u8[c] - b.u8[c] : b.u8[c] - a.u8[c];
  return s;
}

SIMD_INLINE uint32_t c_v64_sad_u8_sum(c_sad64_internal s) { return s; }
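
/* Typical use of the init/accumulate/finalise pattern, e.g. the SAD of two
   8x8 blocks (src, ref and their strides are illustrative):

     c_sad64_internal acc = c_v64_sad_u8_init();
     for (r = 0; r < 8; r++)
       acc = c_v64_sad_u8(acc, c_v64_load_unaligned(src + r * src_stride),
                          c_v64_load_unaligned(ref + r * ref_stride));
     sad = c_v64_sad_u8_sum(acc);  // 8 calls, well under the 32-call limit
*/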

typedef uint32_t c_ssd64_internal;

/* Implementation dependent return value. Result must be finalised with
 * v64_ssd_u8_sum(). */
SIMD_INLINE c_ssd64_internal c_v64_ssd_u8_init() { return 0; }

SIMD_INLINE c_ssd64_internal c_v64_ssd_u8(c_ssd64_internal s, c_v64 a,
                                          c_v64 b) {
  int c;
  for (c = 0; c < 8; c++) s += (a.u8[c] - b.u8[c]) * (a.u8[c] - b.u8[c]);
  return s;
}

SIMD_INLINE uint32_t c_v64_ssd_u8_sum(c_ssd64_internal s) { return s; }

SIMD_INLINE c_v64 c_v64_or(c_v64 a, c_v64 b) {
  c_v64 t;
  t.u64 = a.u64 | b.u64;
  return t;
}

SIMD_INLINE c_v64 c_v64_xor(c_v64 a, c_v64 b) {
  c_v64 t;
  t.u64 = a.u64 ^ b.u64;
  return t;
}

SIMD_INLINE c_v64 c_v64_and(c_v64 a, c_v64 b) {
  c_v64 t;
  t.u64 = a.u64 & b.u64;
  return t;
}

SIMD_INLINE c_v64 c_v64_andn(c_v64 a, c_v64 b) {
  c_v64 t;
  t.u64 = a.u64 & ~b.u64;
  return t;
}

SIMD_INLINE c_v64 c_v64_mullo_s16(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 4; c++) t.s16[c] = (int16_t)(a.s16[c] * b.s16[c]);
  return t;
}

SIMD_INLINE c_v64 c_v64_mulhi_s16(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 4; c++) t.s16[c] = (a.s16[c] * b.s16[c]) >> 16;
  return t;
}

SIMD_INLINE c_v64 c_v64_mullo_s32(c_v64 a, c_v64 b) {
  c_v64 t;
  t.s32[0] = a.s32[0] * b.s32[0];
  t.s32[1] = a.s32[1] * b.s32[1];
  return t;
}

SIMD_INLINE c_v64 c_v64_madd_s16(c_v64 a, c_v64 b) {
  c_v64 t;
  t.s32[0] = a.s16[0] * b.s16[0] + a.s16[1] * b.s16[1];
  t.s32[1] = a.s16[2] * b.s16[2] + a.s16[3] * b.s16[3];
  return t;
}

SIMD_INLINE c_v64 c_v64_madd_us8(c_v64 a, c_v64 b) {
  c_v64 t;
  int32_t u;
  u = a.u8[0] * b.s8[0] + a.u8[1] * b.s8[1];
  t.s16[0] = u > 32767 ? 32767 : u < -32768 ? -32768 : u;
  u = a.u8[2] * b.s8[2] + a.u8[3] * b.s8[3];
  t.s16[1] = u > 32767 ? 32767 : u < -32768 ? -32768 : u;
  u = a.u8[4] * b.s8[4] + a.u8[5] * b.s8[5];
  t.s16[2] = u > 32767 ? 32767 : u < -32768 ? -32768 : u;
  u = a.u8[6] * b.s8[6] + a.u8[7] * b.s8[7];
  t.s16[3] = u > 32767 ? 32767 : u < -32768 ? -32768 : u;
  return t;
}

SIMD_INLINE c_v64 c_v64_avg_u8(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++) t.u8[c] = (a.u8[c] + b.u8[c] + 1) >> 1;
  return t;
}

SIMD_INLINE c_v64 c_v64_rdavg_u8(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++) t.u8[c] = (a.u8[c] + b.u8[c]) >> 1;
  return t;
}

SIMD_INLINE c_v64 c_v64_avg_u16(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 4; c++) t.u16[c] = (a.u16[c] + b.u16[c] + 1) >> 1;
  return t;
}

SIMD_INLINE c_v64 c_v64_min_u8(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] > b.u8[c] ? b.u8[c] : a.u8[c];
  return t;
}

SIMD_INLINE c_v64 c_v64_max_u8(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] > b.u8[c] ? a.u8[c] : b.u8[c];
  return t;
}

SIMD_INLINE c_v64 c_v64_min_s8(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++) t.s8[c] = a.s8[c] > b.s8[c] ? b.s8[c] : a.s8[c];
  return t;
}

SIMD_INLINE c_v64 c_v64_max_s8(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++) t.s8[c] = a.s8[c] > b.s8[c] ? a.s8[c] : b.s8[c];
  return t;
}

SIMD_INLINE c_v64 c_v64_min_s16(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 4; c++) t.s16[c] = a.s16[c] > b.s16[c] ? b.s16[c] : a.s16[c];
  return t;
}

SIMD_INLINE c_v64 c_v64_max_s16(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 4; c++) t.s16[c] = a.s16[c] > b.s16[c] ? a.s16[c] : b.s16[c];
  return t;
}

SIMD_INLINE c_v64 c_v64_cmpgt_s8(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++) t.s8[c] = -(a.s8[c] > b.s8[c]);
  return t;
}

SIMD_INLINE c_v64 c_v64_cmplt_s8(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++) t.s8[c] = -(a.s8[c] < b.s8[c]);
  return t;
}

SIMD_INLINE c_v64 c_v64_cmpeq_8(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++) t.s8[c] = -(a.u8[c] == b.u8[c]);
  return t;
}

SIMD_INLINE c_v64 c_v64_cmpgt_s16(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 4; c++) t.s16[c] = -(a.s16[c] > b.s16[c]);
  return t;
}

SIMD_INLINE c_v64 c_v64_cmplt_s16(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 4; c++) t.s16[c] = -(a.s16[c] < b.s16[c]);
  return t;
}

SIMD_INLINE c_v64 c_v64_cmpeq_16(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 4; c++) t.s16[c] = -(a.u16[c] == b.u16[c]);
  return t;
}

SIMD_INLINE c_v64 c_v64_shl_8(c_v64 a, unsigned int n) {
  c_v64 t;
  int c;
  if (simd_check && n > 7) {
    fprintf(stderr, "Error: Undefined u8 shift left %d\n", n);
    abort();
  }
  for (c = 0; c < 8; c++) t.s8[c] = a.u8[c] << n;
  return t;
}

SIMD_INLINE c_v64 c_v64_shr_u8(c_v64 a, unsigned int n) {
  c_v64 t;
  int c;
  if (simd_check && n > 7) {
    fprintf(stderr, "Error: Undefined u8 shift right %d\n", n);
    abort();
  }
  for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] >> n;
  return t;
}

SIMD_INLINE c_v64 c_v64_shr_s8(c_v64 a, unsigned int n) {
  c_v64 t;
  int c;
  if (simd_check && n > 7) {
    fprintf(stderr, "Error: Undefined s8 shift right %d\n", n);
    abort();
  }
  for (c = 0; c < 8; c++) t.s8[c] = a.s8[c] >> n;
  return t;
}

SIMD_INLINE c_v64 c_v64_shl_16(c_v64 a, unsigned int n) {
  c_v64 t;
  int c;
  if (simd_check && n > 15) {
    fprintf(stderr, "Error: Undefined u16 shift left %d\n", n);
    abort();
  }
  for (c = 0; c < 4; c++) t.u16[c] = a.u16[c] << n;
  return t;
}

SIMD_INLINE c_v64 c_v64_shr_u16(c_v64 a, unsigned int n) {
  c_v64 t;
  int c;
  if (simd_check && n > 15) {
    fprintf(stderr, "Error: Undefined u16 shift right %d\n", n);
    abort();
  }
  for (c = 0; c < 4; c++) t.u16[c] = a.u16[c] >> n;
  return t;
}

SIMD_INLINE c_v64 c_v64_shr_s16(c_v64 a, unsigned int n) {
  c_v64 t;
  int c;
  if (simd_check && n > 15) {
    fprintf(stderr, "Error: undefined s16 shift right %d\n", n);
    abort();
  }
  for (c = 0; c < 4; c++) t.s16[c] = a.s16[c] >> n;
  return t;
}

SIMD_INLINE c_v64 c_v64_shl_32(c_v64 a, unsigned int n) {
  c_v64 t;
  if (simd_check && n > 31) {
    fprintf(stderr, "Error: undefined u32 shift left %d\n", n);
    abort();
  }
  t.u32[1] = a.u32[1] << n;
  t.u32[0] = a.u32[0] << n;
  return t;
}

SIMD_INLINE c_v64 c_v64_shr_u32(c_v64 a, unsigned int n) {
  c_v64 t;
  if (simd_check && n > 31) {
    fprintf(stderr, "Error: undefined u32 shift right %d\n", n);
    abort();
  }
  t.u32[1] = a.u32[1] >> n;
  t.u32[0] = a.u32[0] >> n;
  return t;
}

SIMD_INLINE c_v64 c_v64_shr_s32(c_v64 a, unsigned int n) {
  c_v64 t;
  if (simd_check && n > 31) {
    fprintf(stderr, "Error: undefined s32 shift right %d\n", n);
    abort();
  }
  t.s32[1] = a.s32[1] >> n;
  t.s32[0] = a.s32[0] >> n;
  return t;
}

SIMD_INLINE c_v64 c_v64_shr_n_byte(c_v64 x, const unsigned int i) {
  c_v64 t;
  t.u64 = x.u64 >> i * 8;
  return t;
}

SIMD_INLINE c_v64 c_v64_shl_n_byte(c_v64 x, const unsigned int i) {
  c_v64 t;
  t.u64 = x.u64 << i * 8;
  return t;
}

SIMD_INLINE c_v64 c_v64_align(c_v64 a, c_v64 b, const unsigned int c) {
  if (simd_check && c > 7) {
    fprintf(stderr, "Error: undefined alignment %d\n", c);
    abort();
  }
  return c ? c_v64_or(c_v64_shr_n_byte(b, c), c_v64_shl_n_byte(a, 8 - c)) : b;
}
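
/* Example: c_v64_align extracts an 8-byte window straddling two vectors.
   With a.u64 = 0xAABBCCDDEEFF0011 and b.u64 = 0x2233445566778899,
   c_v64_align(a, b, 3).u64 == 0xFF00112233445566: the low 3 bytes of a are
   concatenated above the high 5 bytes of b. */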

SIMD_INLINE c_v64 c_v64_shl_n_8(c_v64 a, const unsigned int c) {
  return c_v64_shl_8(a, c);
}

SIMD_INLINE c_v64 c_v64_shr_n_u8(c_v64 a, const unsigned int c) {
  return c_v64_shr_u8(a, c);
}

SIMD_INLINE c_v64 c_v64_shr_n_s8(c_v64 a, const unsigned int c) {
  return c_v64_shr_s8(a, c);
}

SIMD_INLINE c_v64 c_v64_shl_n_16(c_v64 a, const unsigned int c) {
  return c_v64_shl_16(a, c);
}

SIMD_INLINE c_v64 c_v64_shr_n_u16(c_v64 a, const unsigned int c) {
  return c_v64_shr_u16(a, c);
}

SIMD_INLINE c_v64 c_v64_shr_n_s16(c_v64 a, const unsigned int c) {
  return c_v64_shr_s16(a, c);
}

SIMD_INLINE c_v64 c_v64_shl_n_32(c_v64 a, const unsigned int c) {
  return c_v64_shl_32(a, c);
}

SIMD_INLINE c_v64 c_v64_shr_n_u32(c_v64 a, const unsigned int c) {
  return c_v64_shr_u32(a, c);
}

SIMD_INLINE c_v64 c_v64_shr_n_s32(c_v64 a, const unsigned int c) {
  return c_v64_shr_s32(a, c);
}

#endif /* _V64_INTRINSICS_C_H */
@@ -1,451 +0,0 @@
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef _V64_INTRINSICS_H
#define _V64_INTRINSICS_H

#include <emmintrin.h>
#if defined(__SSSE3__)
#include <tmmintrin.h>
#endif
#if defined(__SSE4_1__)
#include <smmintrin.h>
#endif

typedef __m128i v64;

SIMD_INLINE uint32_t v64_low_u32(v64 a) {
  return (uint32_t)_mm_cvtsi128_si32(a);
}

SIMD_INLINE uint32_t v64_high_u32(v64 a) {
  return (uint32_t)_mm_cvtsi128_si32(_mm_srli_si128(a, 4));
}

SIMD_INLINE int32_t v64_low_s32(v64 a) { return (int32_t)_mm_cvtsi128_si32(a); }

SIMD_INLINE int32_t v64_high_s32(v64 a) {
  return (int32_t)_mm_cvtsi128_si32(_mm_srli_si128(a, 4));
}

SIMD_INLINE v64 v64_from_16(uint16_t a, uint16_t b, uint16_t c, uint16_t d) {
  return _mm_packs_epi32(
      _mm_set_epi32((int16_t)a, (int16_t)b, (int16_t)c, (int16_t)d),
      _mm_setzero_si128());
}

SIMD_INLINE v64 v64_from_32(uint32_t x, uint32_t y) {
  return _mm_set_epi32(0, 0, x, y);
}

SIMD_INLINE v64 v64_from_64(uint64_t x) {
#ifdef __x86_64__
  return _mm_cvtsi64_si128(x);
#else
  return _mm_set_epi32(0, 0, x >> 32, (uint32_t)x);
#endif
}

SIMD_INLINE uint64_t v64_u64(v64 x) {
  return (uint64_t)v64_low_u32(x) | ((uint64_t)v64_high_u32(x) << 32);
}

SIMD_INLINE uint32_t u32_load_aligned(const void *p) {
  return *((uint32_t *)p);
}

SIMD_INLINE uint32_t u32_load_unaligned(const void *p) {
  return *((uint32_t *)p);
}

SIMD_INLINE void u32_store_aligned(void *p, uint32_t a) {
  *((uint32_t *)p) = a;
}

SIMD_INLINE void u32_store_unaligned(void *p, uint32_t a) {
  *((uint32_t *)p) = a;
}

SIMD_INLINE v64 v64_load_aligned(const void *p) {
  return _mm_loadl_epi64((__m128i *)p);
}

SIMD_INLINE v64 v64_load_unaligned(const void *p) {
  return _mm_loadl_epi64((__m128i *)p);
}

SIMD_INLINE void v64_store_aligned(void *p, v64 a) {
  _mm_storel_epi64((__m128i *)p, a);
}

SIMD_INLINE void v64_store_unaligned(void *p, v64 a) {
  _mm_storel_epi64((__m128i *)p, a);
}

// The following function requires its shift count to be an immediate
// (a compile-time constant).
#if __OPTIMIZE__
#define v64_align(a, b, c) \
  ((c) ? _mm_srli_si128(_mm_unpacklo_epi64(b, a), (c)) : b)
#else
#define v64_align(a, b, c) \
  ((c) ? v64_from_64((v64_u64(b) >> (c)*8) | (v64_u64(a) << (8 - (c)) * 8)) \
       : (b))
#endif
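
// Example: under #if __OPTIMIZE__ the macro expands to _mm_srli_si128, whose
// byte count must be a compile-time literal, so v64_align(a, b, c) is only
// valid for constant c. v64_align(a, b, 2), for instance, yields bytes 2..7
// of b in the low lanes followed by bytes 0..1 of a.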

SIMD_INLINE v64 v64_zero() { return _mm_setzero_si128(); }

SIMD_INLINE v64 v64_dup_8(uint8_t x) { return _mm_set1_epi8(x); }

SIMD_INLINE v64 v64_dup_16(uint16_t x) { return _mm_set1_epi16(x); }

SIMD_INLINE v64 v64_dup_32(uint32_t x) { return _mm_set1_epi32(x); }

SIMD_INLINE v64 v64_add_8(v64 a, v64 b) { return _mm_add_epi8(a, b); }

SIMD_INLINE v64 v64_add_16(v64 a, v64 b) { return _mm_add_epi16(a, b); }

SIMD_INLINE v64 v64_sadd_s16(v64 a, v64 b) { return _mm_adds_epi16(a, b); }

SIMD_INLINE v64 v64_add_32(v64 a, v64 b) { return _mm_add_epi32(a, b); }

SIMD_INLINE v64 v64_sub_8(v64 a, v64 b) { return _mm_sub_epi8(a, b); }

SIMD_INLINE v64 v64_ssub_u8(v64 a, v64 b) { return _mm_subs_epu8(a, b); }

SIMD_INLINE v64 v64_ssub_s8(v64 a, v64 b) { return _mm_subs_epi8(a, b); }

SIMD_INLINE v64 v64_sub_16(v64 a, v64 b) { return _mm_sub_epi16(a, b); }

SIMD_INLINE v64 v64_ssub_s16(v64 a, v64 b) { return _mm_subs_epi16(a, b); }

SIMD_INLINE v64 v64_sub_32(v64 a, v64 b) { return _mm_sub_epi32(a, b); }

SIMD_INLINE v64 v64_abs_s16(v64 a) {
#if defined(__SSSE3__)
  return _mm_abs_epi16(a);
#else
  return _mm_max_epi16(a, _mm_sub_epi16(_mm_setzero_si128(), a));
#endif
}

SIMD_INLINE v64 v64_ziplo_8(v64 a, v64 b) { return _mm_unpacklo_epi8(b, a); }

SIMD_INLINE v64 v64_ziphi_8(v64 a, v64 b) {
  return _mm_srli_si128(_mm_unpacklo_epi8(b, a), 8);
}

SIMD_INLINE v64 v64_ziplo_16(v64 a, v64 b) { return _mm_unpacklo_epi16(b, a); }

SIMD_INLINE v64 v64_ziphi_16(v64 a, v64 b) {
  return _mm_srli_si128(_mm_unpacklo_epi16(b, a), 8);
}

SIMD_INLINE v64 v64_ziplo_32(v64 a, v64 b) { return _mm_unpacklo_epi32(b, a); }

SIMD_INLINE v64 v64_ziphi_32(v64 a, v64 b) {
  return _mm_srli_si128(_mm_unpacklo_epi32(b, a), 8);
}

SIMD_INLINE v64 v64_pack_s32_s16(v64 a, v64 b) {
  __m128i t = _mm_unpacklo_epi64(b, a);
  return _mm_packs_epi32(t, t);
}

SIMD_INLINE v64 v64_pack_s16_u8(v64 a, v64 b) {
  __m128i t = _mm_unpacklo_epi64(b, a);
  return _mm_packus_epi16(t, t);
}

SIMD_INLINE v64 v64_pack_s16_s8(v64 a, v64 b) {
  __m128i t = _mm_unpacklo_epi64(b, a);
  return _mm_packs_epi16(t, t);
}

SIMD_INLINE v64 v64_unziphi_8(v64 a, v64 b) {
#if defined(__SSSE3__)
  return _mm_shuffle_epi8(_mm_unpacklo_epi64(b, a),
                          v64_from_64(0x0f0d0b0907050301LL));
#else
  return _mm_packus_epi16(
      _mm_unpacklo_epi64(_mm_srli_epi16(b, 8), _mm_srli_epi16(a, 8)),
      _mm_setzero_si128());
#endif
}

SIMD_INLINE v64 v64_unziplo_8(v64 a, v64 b) {
#if defined(__SSSE3__)
  return _mm_shuffle_epi8(_mm_unpacklo_epi64(b, a),
                          v64_from_64(0x0e0c0a0806040200LL));
#else
  return v64_unziphi_8(_mm_slli_si128(a, 1), _mm_slli_si128(b, 1));
#endif
}

SIMD_INLINE v64 v64_unziphi_16(v64 a, v64 b) {
#if defined(__SSSE3__)
  return _mm_shuffle_epi8(_mm_unpacklo_epi64(b, a),
                          v64_from_64(0x0f0e0b0a07060302LL));
#else
  return _mm_packs_epi32(
      _mm_unpacklo_epi64(_mm_srai_epi32(b, 16), _mm_srai_epi32(a, 16)),
      _mm_setzero_si128());
#endif
}

SIMD_INLINE v64 v64_unziplo_16(v64 a, v64 b) {
#if defined(__SSSE3__)
  return _mm_shuffle_epi8(_mm_unpacklo_epi64(b, a),
                          v64_from_64(0x0d0c090805040100LL));
#else
  return v64_unziphi_16(_mm_slli_si128(a, 2), _mm_slli_si128(b, 2));
#endif
}

SIMD_INLINE v64 v64_unpacklo_u8_s16(v64 a) {
  return _mm_unpacklo_epi8(a, _mm_setzero_si128());
}

SIMD_INLINE v64 v64_unpackhi_u8_s16(v64 a) {
  return _mm_srli_si128(_mm_unpacklo_epi8(a, _mm_setzero_si128()), 8);
}

SIMD_INLINE v64 v64_unpacklo_u16_s32(v64 a) {
  return _mm_unpacklo_epi16(a, _mm_setzero_si128());
}

SIMD_INLINE v64 v64_unpacklo_s16_s32(v64 a) {
  return _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), a), 16);
}

SIMD_INLINE v64 v64_unpackhi_u16_s32(v64 a) {
  return _mm_srli_si128(_mm_unpacklo_epi16(a, _mm_setzero_si128()), 8);
}

SIMD_INLINE v64 v64_unpackhi_s16_s32(v64 a) {
  return _mm_srli_si128(
      _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), a), 16), 8);
}

SIMD_INLINE v64 v64_shuffle_8(v64 x, v64 pattern) {
#if defined(__SSSE3__)
  return _mm_shuffle_epi8(x, pattern);
#else
  v64 output;
  unsigned char *input = (unsigned char *)&x;
  unsigned char *index = (unsigned char *)&pattern;
  char *selected = (char *)&output;
  int counter;

  for (counter = 0; counter < 8; counter++) {
    selected[counter] = input[index[counter]];
  }

  return output;
#endif
}

SIMD_INLINE int64_t v64_dotp_su8(v64 a, v64 b) {
  __m128i r, r1, r2, z;
  z = _mm_setzero_si128();
  r1 = _mm_madd_epi16(_mm_slli_epi16(_mm_unpacklo_epi8(a, z), 8),
                      _mm_unpacklo_epi8(b, z));
  r2 = _mm_srli_si128(r1, 8);
  r = _mm_add_epi32(r1, r2);
  r = _mm_add_epi32(r, _mm_srli_si128(r, 4));
  return ((int32_t)v64_low_u32(r)) >> 8;
}

SIMD_INLINE int64_t v64_dotp_s16(v64 a, v64 b) {
  __m128i r = _mm_madd_epi16(a, b);
#if defined(__SSE4_1__) && defined(__x86_64__)
  __m128i x = _mm_cvtepi32_epi64(r);
  return _mm_cvtsi128_si64(_mm_add_epi64(x, _mm_srli_si128(x, 8)));
#else
  return (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 4)) +
         (int64_t)_mm_cvtsi128_si32(r);
#endif
}

SIMD_INLINE uint64_t v64_hadd_u8(v64 a) {
  return v64_low_u32(_mm_sad_epu8(a, _mm_setzero_si128()));
}

SIMD_INLINE int64_t v64_hadd_s16(v64 a) {
  return v64_dotp_s16(a, v64_dup_16(1));
}

typedef v64 sad64_internal;

SIMD_INLINE sad64_internal v64_sad_u8_init() { return _mm_setzero_si128(); }

/* Implementation dependent return value. Result must be finalised with
   v64_sad_u8_sum().
   The result for more than 32 v64_sad_u8() calls is undefined. */
SIMD_INLINE sad64_internal v64_sad_u8(sad64_internal s, v64 a, v64 b) {
  return _mm_add_epi64(s, _mm_sad_epu8(a, b));
}

SIMD_INLINE uint32_t v64_sad_u8_sum(sad64_internal s) { return v64_low_u32(s); }

typedef v64 ssd64_internal;

SIMD_INLINE ssd64_internal v64_ssd_u8_init() { return _mm_setzero_si128(); }

/* Implementation dependent return value. Result must be finalised with
 * v64_ssd_u8_sum(). */
SIMD_INLINE ssd64_internal v64_ssd_u8(ssd64_internal s, v64 a, v64 b) {
  v64 l = v64_sub_16(v64_ziplo_8(v64_zero(), a), v64_ziplo_8(v64_zero(), b));
  v64 h = v64_sub_16(v64_ziphi_8(v64_zero(), a), v64_ziphi_8(v64_zero(), b));
  v64 r = v64_add_32(_mm_madd_epi16(l, l), _mm_madd_epi16(h, h));
  return _mm_add_epi64(
      s, v64_ziplo_32(v64_zero(), _mm_add_epi32(r, _mm_srli_si128(r, 4))));
}
SIMD_INLINE uint32_t v64_ssd_u8_sum(ssd64_internal s) { return v64_low_u32(s); }

SIMD_INLINE v64 v64_or(v64 a, v64 b) { return _mm_or_si128(a, b); }

SIMD_INLINE v64 v64_xor(v64 a, v64 b) { return _mm_xor_si128(a, b); }

SIMD_INLINE v64 v64_and(v64 a, v64 b) { return _mm_and_si128(a, b); }

SIMD_INLINE v64 v64_andn(v64 a, v64 b) { return _mm_andnot_si128(b, a); }

SIMD_INLINE v64 v64_mullo_s16(v64 a, v64 b) { return _mm_mullo_epi16(a, b); }

SIMD_INLINE v64 v64_mulhi_s16(v64 a, v64 b) { return _mm_mulhi_epi16(a, b); }

SIMD_INLINE v64 v64_mullo_s32(v64 a, v64 b) {
#if defined(__SSE4_1__)
  return _mm_mullo_epi32(a, b);
#else
  return _mm_unpacklo_epi32(
      _mm_mul_epu32(a, b),
      _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4)));
#endif
}

SIMD_INLINE v64 v64_madd_s16(v64 a, v64 b) { return _mm_madd_epi16(a, b); }

SIMD_INLINE v64 v64_madd_us8(v64 a, v64 b) {
#if defined(__SSSE3__)
  return _mm_maddubs_epi16(a, b);
#else
  __m128i t = _mm_madd_epi16(_mm_unpacklo_epi8(a, _mm_setzero_si128()),
                             _mm_srai_epi16(_mm_unpacklo_epi8(b, b), 8));
  return _mm_packs_epi32(t, t);
#endif
}

SIMD_INLINE v64 v64_avg_u8(v64 a, v64 b) { return _mm_avg_epu8(a, b); }

SIMD_INLINE v64 v64_rdavg_u8(v64 a, v64 b) {
  return _mm_sub_epi8(_mm_avg_epu8(a, b),
                      _mm_and_si128(_mm_xor_si128(a, b), v64_dup_8(1)));
}
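
/* v64_rdavg_u8 gets a rounding-down average out of PAVGB, which rounds up:
   _mm_avg_epu8 computes (a + b + 1) >> 1 per lane, which overshoots by one
   exactly when a and b differ in their lowest bit, so subtracting
   (a ^ b) & 1 yields (a + b) >> 1. */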

SIMD_INLINE v64 v64_avg_u16(v64 a, v64 b) { return _mm_avg_epu16(a, b); }

SIMD_INLINE v64 v64_min_u8(v64 a, v64 b) { return _mm_min_epu8(a, b); }

SIMD_INLINE v64 v64_max_u8(v64 a, v64 b) { return _mm_max_epu8(a, b); }

SIMD_INLINE v64 v64_min_s8(v64 a, v64 b) {
#if defined(__SSE4_1__)
  return _mm_min_epi8(a, b);
#else
  v64 mask = _mm_cmplt_epi8(a, b);
  return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a));
#endif
}

SIMD_INLINE v64 v64_max_s8(v64 a, v64 b) {
#if defined(__SSE4_1__)
  return _mm_max_epi8(a, b);
#else
  v64 mask = _mm_cmplt_epi8(b, a);
  return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a));
#endif
}

SIMD_INLINE v64 v64_min_s16(v64 a, v64 b) { return _mm_min_epi16(a, b); }

SIMD_INLINE v64 v64_max_s16(v64 a, v64 b) { return _mm_max_epi16(a, b); }

SIMD_INLINE v64 v64_cmpgt_s8(v64 a, v64 b) { return _mm_cmpgt_epi8(a, b); }

SIMD_INLINE v64 v64_cmplt_s8(v64 a, v64 b) { return _mm_cmplt_epi8(a, b); }

SIMD_INLINE v64 v64_cmpeq_8(v64 a, v64 b) { return _mm_cmpeq_epi8(a, b); }

SIMD_INLINE v64 v64_cmpgt_s16(v64 a, v64 b) { return _mm_cmpgt_epi16(a, b); }

SIMD_INLINE v64 v64_cmplt_s16(v64 a, v64 b) { return _mm_cmplt_epi16(a, b); }

SIMD_INLINE v64 v64_cmpeq_16(v64 a, v64 b) { return _mm_cmpeq_epi16(a, b); }

SIMD_INLINE v64 v64_shl_8(v64 a, unsigned int c) {
  return _mm_and_si128(_mm_set1_epi8((uint8_t)(0xff << c)),
                       _mm_sll_epi16(a, _mm_cvtsi32_si128(c)));
}

SIMD_INLINE v64 v64_shr_u8(v64 a, unsigned int c) {
  return _mm_and_si128(_mm_set1_epi8(0xff >> c),
                       _mm_srl_epi16(a, _mm_cvtsi32_si128(c)));
}

SIMD_INLINE v64 v64_shr_s8(v64 a, unsigned int c) {
  return _mm_packs_epi16(
      _mm_sra_epi16(_mm_unpacklo_epi8(a, a), _mm_cvtsi32_si128(c + 8)), a);
}

SIMD_INLINE v64 v64_shl_16(v64 a, unsigned int c) {
  return _mm_sll_epi16(a, _mm_cvtsi32_si128(c));
}

SIMD_INLINE v64 v64_shr_u16(v64 a, unsigned int c) {
  return _mm_srl_epi16(a, _mm_cvtsi32_si128(c));
}

SIMD_INLINE v64 v64_shr_s16(v64 a, unsigned int c) {
  return _mm_sra_epi16(a, _mm_cvtsi32_si128(c));
}

SIMD_INLINE v64 v64_shl_32(v64 a, unsigned int c) {
  return _mm_sll_epi32(a, _mm_cvtsi32_si128(c));
}

SIMD_INLINE v64 v64_shr_u32(v64 a, unsigned int c) {
  return _mm_srl_epi32(a, _mm_cvtsi32_si128(c));
}

SIMD_INLINE v64 v64_shr_s32(v64 a, unsigned int c) {
  return _mm_sra_epi32(a, _mm_cvtsi32_si128(c));
}

/* These intrinsics require immediate values, so we must use #defines
   to enforce that. */
#define v64_shl_n_byte(a, c) _mm_slli_si128(a, c)
#define v64_shr_n_byte(a, c) _mm_srli_si128(_mm_unpacklo_epi64(a, a), c + 8)
#define v64_shl_n_8(a, c) \
  _mm_and_si128(_mm_set1_epi8((uint8_t)(0xff << (c))), _mm_slli_epi16(a, c))
#define v64_shr_n_u8(a, c) \
  _mm_and_si128(_mm_set1_epi8(0xff >> (c)), _mm_srli_epi16(a, c))
#define v64_shr_n_s8(a, c) \
  _mm_packs_epi16(_mm_srai_epi16(_mm_unpacklo_epi8(a, a), (c) + 8), a)
#define v64_shl_n_16(a, c) _mm_slli_epi16(a, c)
#define v64_shr_n_u16(a, c) _mm_srli_epi16(a, c)
#define v64_shr_n_s16(a, c) _mm_srai_epi16(a, c)
#define v64_shl_n_32(a, c) _mm_slli_epi32(a, c)
#define v64_shr_n_u32(a, c) _mm_srli_epi32(a, c)
#define v64_shr_n_s32(a, c) _mm_srai_epi32(a, c)

#endif /* _V64_INTRINSICS_H */
aom_dsp/variance.c (1219 lines): diff suppressed because it is too large.
@@ -1,124 +0,0 @@
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef AOM_DSP_VARIANCE_H_
#define AOM_DSP_VARIANCE_H_

#include "./aom_config.h"

#include "aom/aom_integer.h"

#ifdef __cplusplus
extern "C" {
#endif

#define FILTER_BITS 7
#define FILTER_WEIGHT 128

typedef unsigned int (*aom_sad_fn_t)(const uint8_t *a, int a_stride,
                                     const uint8_t *b, int b_stride);

typedef unsigned int (*aom_sad_avg_fn_t)(const uint8_t *a, int a_stride,
                                         const uint8_t *b, int b_stride,
                                         const uint8_t *second_pred);

typedef void (*aom_copy32xn_fn_t)(const uint8_t *a, int a_stride, uint8_t *b,
                                  int b_stride, int n);

typedef void (*aom_sad_multi_fn_t)(const uint8_t *a, int a_stride,
                                   const uint8_t *b, int b_stride,
                                   unsigned int *sad_array);

typedef void (*aom_sad_multi_d_fn_t)(const uint8_t *a, int a_stride,
                                     const uint8_t *const b_array[],
                                     int b_stride, unsigned int *sad_array);

typedef unsigned int (*aom_variance_fn_t)(const uint8_t *a, int a_stride,
                                          const uint8_t *b, int b_stride,
                                          unsigned int *sse);

typedef unsigned int (*aom_subpixvariance_fn_t)(const uint8_t *a, int a_stride,
                                                int xoffset, int yoffset,
                                                const uint8_t *b, int b_stride,
                                                unsigned int *sse);

typedef unsigned int (*aom_subp_avg_variance_fn_t)(
    const uint8_t *a, int a_stride, int xoffset, int yoffset, const uint8_t *b,
    int b_stride, unsigned int *sse, const uint8_t *second_pred);

#if CONFIG_AV1 && CONFIG_EXT_INTER
typedef unsigned int (*aom_masked_sad_fn_t)(const uint8_t *src, int src_stride,
                                            const uint8_t *ref, int ref_stride,
                                            const uint8_t *msk_ptr,
                                            int msk_stride);
typedef unsigned int (*aom_masked_variance_fn_t)(
    const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride,
    const uint8_t *msk, int msk_stride, unsigned int *sse);
typedef unsigned int (*aom_masked_subpixvariance_fn_t)(
    const uint8_t *src, int src_stride, int xoffset, int yoffset,
    const uint8_t *ref, int ref_stride, const uint8_t *msk, int msk_stride,
    unsigned int *sse);
#endif  // CONFIG_AV1 && CONFIG_EXT_INTER

#if CONFIG_AV1 && CONFIG_MOTION_VAR
typedef unsigned int (*aom_obmc_sad_fn_t)(const uint8_t *pred, int pred_stride,
                                          const int32_t *wsrc,
                                          const int32_t *msk);
typedef unsigned int (*aom_obmc_variance_fn_t)(const uint8_t *pred,
                                               int pred_stride,
                                               const int32_t *wsrc,
                                               const int32_t *msk,
                                               unsigned int *sse);
typedef unsigned int (*aom_obmc_subpixvariance_fn_t)(
    const uint8_t *pred, int pred_stride, int xoffset, int yoffset,
    const int32_t *wsrc, const int32_t *msk, unsigned int *sse);
#endif  // CONFIG_AV1 && CONFIG_MOTION_VAR

#if CONFIG_AV1
typedef struct aom_variance_vtable {
  aom_sad_fn_t sdf;
  aom_sad_avg_fn_t sdaf;
  aom_variance_fn_t vf;
  aom_subpixvariance_fn_t svf;
  aom_subp_avg_variance_fn_t svaf;
  aom_sad_multi_fn_t sdx3f;
  aom_sad_multi_fn_t sdx8f;
  aom_sad_multi_d_fn_t sdx4df;
#if CONFIG_EXT_INTER
  aom_masked_sad_fn_t msdf;
  aom_masked_variance_fn_t mvf;
  aom_masked_subpixvariance_fn_t msvf;
#endif  // CONFIG_EXT_INTER
#if CONFIG_MOTION_VAR
  aom_obmc_sad_fn_t osdf;
  aom_obmc_variance_fn_t ovf;
  aom_obmc_subpixvariance_fn_t osvf;
#endif  // CONFIG_MOTION_VAR
} aom_variance_fn_ptr_t;
#endif  // CONFIG_AV1
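
/* Sketch: how one table entry is typically wired up per block size. The
   kernel names follow the RTCD naming convention and are illustrative only:

     aom_variance_fn_ptr_t fn8x8 = { 0 };
     fn8x8.sdf = aom_sad8x8;      // plain SAD for 8x8 blocks
     fn8x8.vf = aom_variance8x8;  // variance, also returning *sse

     unsigned int sse;
     unsigned int var = fn8x8.vf(src, src_stride, ref, ref_stride, &sse);
*/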

void aom_highbd_var_filter_block2d_bil_first_pass(
    const uint8_t *src_ptr8, uint16_t *output_ptr,
    unsigned int src_pixels_per_line, int pixel_step,
    unsigned int output_height, unsigned int output_width,
    const uint8_t *filter);

void aom_highbd_var_filter_block2d_bil_second_pass(
    const uint16_t *src_ptr, uint16_t *output_ptr,
    unsigned int src_pixels_per_line, unsigned int pixel_step,
    unsigned int output_height, unsigned int output_width,
    const uint8_t *filter);

#ifdef __cplusplus
}  // extern "C"
#endif

#endif  // AOM_DSP_VARIANCE_H_
@@ -1,36 +0,0 @@
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include "aom/aom_integer.h"

#include "./aom_dsp_rtcd.h"

// To start out, just dispatch to the function using the 2D mask and
// pass mask stride as 0. This can be improved upon if necessary.

void aom_blend_a64_hmask_sse4_1(uint8_t *dst, uint32_t dst_stride,
                                const uint8_t *src0, uint32_t src0_stride,
                                const uint8_t *src1, uint32_t src1_stride,
                                const uint8_t *mask, int h, int w) {
  aom_blend_a64_mask_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                            src1_stride, mask, 0, h, w, 0, 0);
}

#if CONFIG_AOM_HIGHBITDEPTH
void aom_highbd_blend_a64_hmask_sse4_1(
    uint8_t *dst_8, uint32_t dst_stride, const uint8_t *src0_8,
    uint32_t src0_stride, const uint8_t *src1_8, uint32_t src1_stride,
    const uint8_t *mask, int h, int w, int bd) {
  aom_highbd_blend_a64_mask_sse4_1(dst_8, dst_stride, src0_8, src0_stride,
                                   src1_8, src1_stride, mask, 0, h, w, 0, 0,
                                   bd);
}
#endif  // CONFIG_AOM_HIGHBITDEPTH
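
/* Usage sketch (names as declared above): blending two 8x8 predictors with a
   horizontal-only mask. Passing mask stride 0 makes every row reuse the same
   mask row:

     aom_blend_a64_hmask_sse4_1(dst, dst_stride, pred0, 8, pred1, 8,
                                hmask, 8, 8);
*/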
@@ -1,924 +0,0 @@
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <smmintrin.h>  // SSE4.1

#include <assert.h>

#include "aom/aom_integer.h"
#include "aom_ports/mem.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/blend.h"

#include "aom_dsp/x86/synonyms.h"
#include "aom_dsp/x86/blend_sse4.h"

#include "./aom_dsp_rtcd.h"

//////////////////////////////////////////////////////////////////////////////
// No sub-sampling
//////////////////////////////////////////////////////////////////////////////

static void blend_a64_mask_w4_sse4_1(uint8_t *dst, uint32_t dst_stride,
                                     const uint8_t *src0, uint32_t src0_stride,
                                     const uint8_t *src1, uint32_t src1_stride,
                                     const uint8_t *mask, uint32_t mask_stride,
                                     int h, int w) {
  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);

  (void)w;

  do {
    const __m128i v_m0_b = xx_loadl_32(mask);
    const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b);
    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);

    const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w);

    const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);

    xx_storel_32(dst, v_res_b);

    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += mask_stride;
  } while (--h);
}
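
/* Conceptually, each output pixel is the 6-bit alpha blend defined in
   aom_dsp/blend.h:

     dst[i] = (m * src0[i] + (AOM_BLEND_A64_MAX_ALPHA - m) * src1[i] + 32) >> 6

   with m in [0, AOM_BLEND_A64_MAX_ALPHA]; blend_4()/blend_8() evaluate this
   on 4 or 8 widened 16-bit lanes at a time. */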
|
||||
|
||||
static void blend_a64_mask_w8_sse4_1(uint8_t *dst, uint32_t dst_stride,
|
||||
const uint8_t *src0, uint32_t src0_stride,
|
||||
const uint8_t *src1, uint32_t src1_stride,
|
||||
const uint8_t *mask, uint32_t mask_stride,
|
||||
int h, int w) {
|
||||
const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
|
||||
|
||||
(void)w;
|
||||
|
||||
do {
|
||||
const __m128i v_m0_b = xx_loadl_64(mask);
|
||||
const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b);
|
||||
const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
|
||||
|
||||
const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w);
|
||||
|
||||
const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
|
||||
|
||||
xx_storel_64(dst, v_res_b);
|
||||
|
||||
dst += dst_stride;
|
||||
src0 += src0_stride;
|
||||
src1 += src1_stride;
|
||||
mask += mask_stride;
|
||||
} while (--h);
|
||||
}
|
||||
|
||||
static void blend_a64_mask_w16n_sse4_1(
|
||||
uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
|
||||
uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
|
||||
const uint8_t *mask, uint32_t mask_stride, int h, int w) {
|
||||
const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
|
||||
|
||||
do {
|
||||
int c;
|
||||
for (c = 0; c < w; c += 16) {
|
||||
const __m128i v_m0l_b = xx_loadl_64(mask + c);
|
||||
const __m128i v_m0h_b = xx_loadl_64(mask + c + 8);
|
||||
const __m128i v_m0l_w = _mm_cvtepu8_epi16(v_m0l_b);
|
||||
const __m128i v_m0h_w = _mm_cvtepu8_epi16(v_m0h_b);
|
||||
const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w);
|
||||
const __m128i v_m1h_w = _mm_sub_epi16(v_maxval_w, v_m0h_w);
|
||||
|
||||
const __m128i v_resl_w = blend_8(src0 + c, src1 + c, v_m0l_w, v_m1l_w);
|
||||
const __m128i v_resh_w =
|
||||
blend_8(src0 + c + 8, src1 + c + 8, v_m0h_w, v_m1h_w);
|
||||
|
||||
const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);
|
||||
|
||||
xx_storeu_128(dst + c, v_res_b);
|
||||
}
|
||||
dst += dst_stride;
|
||||
src0 += src0_stride;
|
||||
src1 += src1_stride;
|
||||
mask += mask_stride;
|
||||
} while (--h);
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
// Horizontal sub-sampling
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
static void blend_a64_mask_sx_w4_sse4_1(
|
||||
uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
|
||||
uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
|
||||
const uint8_t *mask, uint32_t mask_stride, int h, int w) {
|
||||
const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
|
||||
0xff, 0, 0xff, 0, 0xff, 0, 0xff);
|
||||
const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
|
||||
|
||||
(void)w;
|
||||
|
||||
do {
|
||||
const __m128i v_r_b = xx_loadl_64(mask);
|
||||
const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1));
|
||||
|
||||
const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b);
|
||||
const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
|
||||
|
||||
const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w);
|
||||
|
||||
const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
|
||||
|
||||
xx_storel_32(dst, v_res_b);
|
||||
|
||||
dst += dst_stride;
|
||||
src0 += src0_stride;
|
||||
src1 += src1_stride;
|
||||
mask += mask_stride;
|
||||
} while (--h);
|
||||
}
|
||||
|
||||
static void blend_a64_mask_sx_w8_sse4_1(
|
||||
uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
|
||||
uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
|
||||
const uint8_t *mask, uint32_t mask_stride, int h, int w) {
|
||||
const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
|
||||
0xff, 0, 0xff, 0, 0xff, 0, 0xff);
|
||||
const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
|
||||
|
||||
(void)w;
|
||||
|
||||
do {
|
||||
const __m128i v_r_b = xx_loadu_128(mask);
|
||||
const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1));
|
||||
|
||||
const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b);
|
||||
const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
|
||||
|
||||
const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w);
|
||||
|
||||
const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
|
||||
|
||||
xx_storel_64(dst, v_res_b);
|
||||
|
||||
dst += dst_stride;
|
||||
src0 += src0_stride;
|
||||
src1 += src1_stride;
|
||||
mask += mask_stride;
|
||||
} while (--h);
|
||||
}
|
||||
|
||||
static void blend_a64_mask_sx_w16n_sse4_1(
|
||||
uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
|
||||
uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
|
||||
const uint8_t *mask, uint32_t mask_stride, int h, int w) {
|
||||
const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
|
||||
0xff, 0, 0xff, 0, 0xff, 0, 0xff);
|
||||
const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
|
||||
|
||||
do {
|
||||
int c;
|
||||
for (c = 0; c < w; c += 16) {
|
||||
const __m128i v_rl_b = xx_loadu_128(mask + 2 * c);
|
||||
const __m128i v_rh_b = xx_loadu_128(mask + 2 * c + 16);
|
||||
const __m128i v_al_b = _mm_avg_epu8(v_rl_b, _mm_srli_si128(v_rl_b, 1));
|
||||
const __m128i v_ah_b = _mm_avg_epu8(v_rh_b, _mm_srli_si128(v_rh_b, 1));
|
||||
|
||||
const __m128i v_m0l_w = _mm_and_si128(v_al_b, v_zmask_b);
|
||||
const __m128i v_m0h_w = _mm_and_si128(v_ah_b, v_zmask_b);
|
||||
const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w);
|
||||
const __m128i v_m1h_w = _mm_sub_epi16(v_maxval_w, v_m0h_w);
|
||||
|
||||
const __m128i v_resl_w = blend_8(src0 + c, src1 + c, v_m0l_w, v_m1l_w);
|
||||
const __m128i v_resh_w =
|
||||
blend_8(src0 + c + 8, src1 + c + 8, v_m0h_w, v_m1h_w);
|
||||
|
||||
const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);
|
||||
|
||||
xx_storeu_128(dst + c, v_res_b);
|
||||
}
|
||||
dst += dst_stride;
|
||||
src0 += src0_stride;
|
||||
src1 += src1_stride;
|
||||
mask += mask_stride;
|
||||
} while (--h);
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
// Vertical sub-sampling
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
static void blend_a64_mask_sy_w4_sse4_1(
    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);

  (void)w;

  do {
    const __m128i v_ra_b = xx_loadl_32(mask);
    const __m128i v_rb_b = xx_loadl_32(mask + mask_stride);
    const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);

    const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b);
    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);

    const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w);

    const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);

    xx_storel_32(dst, v_res_b);

    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += 2 * mask_stride;
  } while (--h);
}

static void blend_a64_mask_sy_w8_sse4_1(
    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);

  (void)w;

  do {
    const __m128i v_ra_b = xx_loadl_64(mask);
    const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
    const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);

    const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b);
    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);

    const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w);

    const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);

    xx_storel_64(dst, v_res_b);

    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += 2 * mask_stride;
  } while (--h);
}

static void blend_a64_mask_sy_w16n_sse4_1(
    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
  const __m128i v_zero = _mm_setzero_si128();
  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);

  do {
    int c;
    for (c = 0; c < w; c += 16) {
      const __m128i v_ra_b = xx_loadu_128(mask + c);
      const __m128i v_rb_b = xx_loadu_128(mask + c + mask_stride);
      const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);

      const __m128i v_m0l_w = _mm_cvtepu8_epi16(v_a_b);
      const __m128i v_m0h_w = _mm_unpackhi_epi8(v_a_b, v_zero);
      const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w);
      const __m128i v_m1h_w = _mm_sub_epi16(v_maxval_w, v_m0h_w);

      const __m128i v_resl_w = blend_8(src0 + c, src1 + c, v_m0l_w, v_m1l_w);
      const __m128i v_resh_w =
          blend_8(src0 + c + 8, src1 + c + 8, v_m0h_w, v_m1h_w);

      const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);

      xx_storeu_128(dst + c, v_res_b);
    }
    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += 2 * mask_stride;
  } while (--h);
}

//////////////////////////////////////////////////////////////////////////////
// Horizontal and Vertical sub-sampling
//////////////////////////////////////////////////////////////////////////////

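// With 2:1 sub-sampling in both directions, four mask bytes reduce to one
// coefficient. The two rows are summed bytewise first (safe: each value is
// at most AOM_BLEND_A64_MAX_ALPHA == 64, so the byte sum cannot overflow),
// the even/odd byte lanes are then split out to 16 bits and added, and
// xx_roundn_epu16(sum, 2) computes the rounded average (sum + 2) >> 2.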
static void blend_a64_mask_sx_sy_w4_sse4_1(
    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
  const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
                                         0xff, 0, 0xff, 0, 0xff, 0, 0xff);
  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);

  (void)w;

  do {
    const __m128i v_ra_b = xx_loadl_64(mask);
    const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
    const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
    const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b);
    const __m128i v_rvsb_w =
        _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), v_zmask_b);
    const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w);

    const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);

    const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w);

    const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);

    xx_storel_32(dst, v_res_b);

    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += 2 * mask_stride;
  } while (--h);
}

static void blend_a64_mask_sx_sy_w8_sse4_1(
    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
  const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
                                         0xff, 0, 0xff, 0, 0xff, 0, 0xff);
  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);

  (void)w;

  do {
    const __m128i v_ra_b = xx_loadu_128(mask);
    const __m128i v_rb_b = xx_loadu_128(mask + mask_stride);
    const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
    const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b);
    const __m128i v_rvsb_w =
        _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), v_zmask_b);
    const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w);

    const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);

    const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w);

    const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);

    xx_storel_64(dst, v_res_b);

    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += 2 * mask_stride;
  } while (--h);
}

static void blend_a64_mask_sx_sy_w16n_sse4_1(
    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
  const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
                                         0xff, 0, 0xff, 0, 0xff, 0, 0xff);
  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);

  do {
    int c;
    for (c = 0; c < w; c += 16) {
      const __m128i v_ral_b = xx_loadu_128(mask + 2 * c);
      const __m128i v_rah_b = xx_loadu_128(mask + 2 * c + 16);
      const __m128i v_rbl_b = xx_loadu_128(mask + mask_stride + 2 * c);
      const __m128i v_rbh_b = xx_loadu_128(mask + mask_stride + 2 * c + 16);
      const __m128i v_rvsl_b = _mm_add_epi8(v_ral_b, v_rbl_b);
      const __m128i v_rvsh_b = _mm_add_epi8(v_rah_b, v_rbh_b);
      const __m128i v_rvsal_w = _mm_and_si128(v_rvsl_b, v_zmask_b);
      const __m128i v_rvsah_w = _mm_and_si128(v_rvsh_b, v_zmask_b);
      const __m128i v_rvsbl_w =
          _mm_and_si128(_mm_srli_si128(v_rvsl_b, 1), v_zmask_b);
      const __m128i v_rvsbh_w =
          _mm_and_si128(_mm_srli_si128(v_rvsh_b, 1), v_zmask_b);
      const __m128i v_rsl_w = _mm_add_epi16(v_rvsal_w, v_rvsbl_w);
      const __m128i v_rsh_w = _mm_add_epi16(v_rvsah_w, v_rvsbh_w);

      const __m128i v_m0l_w = xx_roundn_epu16(v_rsl_w, 2);
      const __m128i v_m0h_w = xx_roundn_epu16(v_rsh_w, 2);
      const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w);
      const __m128i v_m1h_w = _mm_sub_epi16(v_maxval_w, v_m0h_w);

      const __m128i v_resl_w = blend_8(src0 + c, src1 + c, v_m0l_w, v_m1l_w);
      const __m128i v_resh_w =
          blend_8(src0 + c + 8, src1 + c + 8, v_m0h_w, v_m1h_w);

      const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);

      xx_storeu_128(dst + c, v_res_b);
    }
    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += 2 * mask_stride;
  } while (--h);
}

//////////////////////////////////////////////////////////////////////////////
// Dispatch
//////////////////////////////////////////////////////////////////////////////

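// Note on the table lookup below: w is asserted to be a power of two, so
// (w >> 2) & 3 evaluates to 1 for w == 4, 2 for w == 8, and 0 for any
// w >= 16 (all of which are multiples of 16 and take the w16n kernels).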
void aom_blend_a64_mask_sse4_1(uint8_t *dst, uint32_t dst_stride,
                               const uint8_t *src0, uint32_t src0_stride,
                               const uint8_t *src1, uint32_t src1_stride,
                               const uint8_t *mask, uint32_t mask_stride, int h,
                               int w, int suby, int subx) {
  typedef void (*blend_fn)(
      uint8_t * dst, uint32_t dst_stride, const uint8_t *src0,
      uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
      const uint8_t *mask, uint32_t mask_stride, int h, int w);

  // Dimensions are: width_index X subx X suby
  static const blend_fn blend[3][2][2] = {
    {  // w % 16 == 0
      { blend_a64_mask_w16n_sse4_1, blend_a64_mask_sy_w16n_sse4_1 },
      { blend_a64_mask_sx_w16n_sse4_1, blend_a64_mask_sx_sy_w16n_sse4_1 } },
    {  // w == 4
      { blend_a64_mask_w4_sse4_1, blend_a64_mask_sy_w4_sse4_1 },
      { blend_a64_mask_sx_w4_sse4_1, blend_a64_mask_sx_sy_w4_sse4_1 } },
    {  // w == 8
      { blend_a64_mask_w8_sse4_1, blend_a64_mask_sy_w8_sse4_1 },
      { blend_a64_mask_sx_w8_sse4_1, blend_a64_mask_sx_sy_w8_sse4_1 } }
  };

  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));

  assert(h >= 1);
  assert(w >= 1);
  assert(IS_POWER_OF_TWO(h));
  assert(IS_POWER_OF_TWO(w));

  if (UNLIKELY((h | w) & 3)) {  // if (w <= 2 || h <= 2)
    aom_blend_a64_mask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride,
                         mask, mask_stride, h, w, suby, subx);
  } else {
    blend[(w >> 2) & 3][subx != 0][suby != 0](dst, dst_stride, src0,
                                              src0_stride, src1, src1_stride,
                                              mask, mask_stride, h, w);
  }
}

#if CONFIG_AOM_HIGHBITDEPTH
//////////////////////////////////////////////////////////////////////////////
// No sub-sampling
//////////////////////////////////////////////////////////////////////////////

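// The high bit-depth kernels share their control flow: each INLINE helper
// below takes a blend_unit_fn (blend_4/8_b10 or blend_4/8_b12 from
// blend_sse4.h), and after inlining the indirect call should fold away to a
// direct one in each thin wrapper.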
static INLINE void blend_a64_mask_bn_w4_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) {
  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);

  do {
    const __m128i v_m0_b = xx_loadl_32(mask);
    const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b);
    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);

    const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);

    xx_storel_64(dst, v_res_w);

    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += mask_stride;
  } while (--h);
}

static void blend_a64_mask_b10_w4_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
  (void)w;
  blend_a64_mask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                              src1_stride, mask, mask_stride, h, blend_4_b10);
}

static void blend_a64_mask_b12_w4_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
  (void)w;
  blend_a64_mask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                              src1_stride, mask, mask_stride, h, blend_4_b12);
}

static INLINE void blend_a64_mask_bn_w8n_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int h, int w,
    blend_unit_fn blend) {
  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);

  do {
    int c;
    for (c = 0; c < w; c += 8) {
      const __m128i v_m0_b = xx_loadl_64(mask + c);
      const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b);
      const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);

      const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);

      xx_storeu_128(dst + c, v_res_w);
    }
    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += mask_stride;
  } while (--h);
}

static void blend_a64_mask_b10_w8n_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
  blend_a64_mask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                               src1_stride, mask, mask_stride, h, w,
                               blend_8_b10);
}

static void blend_a64_mask_b12_w8n_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
  blend_a64_mask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                               src1_stride, mask, mask_stride, h, w,
                               blend_8_b12);
}

//////////////////////////////////////////////////////////////////////////////
// Horizontal sub-sampling
//////////////////////////////////////////////////////////////////////////////

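// For 2:1 horizontal sub-sampling, _mm_avg_epu8 of the row with itself
// shifted right by one byte averages each pair of neighbouring mask values;
// masking with v_zmask_b then keeps only the even byte lanes, leaving the
// averaged coefficients already zero-extended to 16 bits.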
static INLINE void blend_a64_mask_bn_sx_w4_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) {
  const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
                                         0xff, 0, 0xff, 0, 0xff, 0, 0xff);
  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);

  do {
    const __m128i v_r_b = xx_loadl_64(mask);
    const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1));

    const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b);
    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);

    const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);

    xx_storel_64(dst, v_res_w);

    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += mask_stride;
  } while (--h);
}

static void blend_a64_mask_b10_sx_w4_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
  (void)w;
  blend_a64_mask_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                                 src1_stride, mask, mask_stride, h,
                                 blend_4_b10);
}

static void blend_a64_mask_b12_sx_w4_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
  (void)w;
  blend_a64_mask_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                                 src1_stride, mask, mask_stride, h,
                                 blend_4_b12);
}

static INLINE void blend_a64_mask_bn_sx_w8n_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int h, int w,
    blend_unit_fn blend) {
  const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
                                         0xff, 0, 0xff, 0, 0xff, 0, 0xff);
  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);

  do {
    int c;
    for (c = 0; c < w; c += 8) {
      const __m128i v_r_b = xx_loadu_128(mask + 2 * c);
      const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1));

      const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b);
      const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);

      const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);

      xx_storeu_128(dst + c, v_res_w);
    }
    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += mask_stride;
  } while (--h);
}

static void blend_a64_mask_b10_sx_w8n_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
  blend_a64_mask_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                                  src1_stride, mask, mask_stride, h, w,
                                  blend_8_b10);
}

static void blend_a64_mask_b12_sx_w8n_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
  blend_a64_mask_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                                  src1_stride, mask, mask_stride, h, w,
                                  blend_8_b12);
}

//////////////////////////////////////////////////////////////////////////////
// Vertical sub-sampling
//////////////////////////////////////////////////////////////////////////////

static INLINE void blend_a64_mask_bn_sy_w4_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) {
  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);

  do {
    const __m128i v_ra_b = xx_loadl_32(mask);
    const __m128i v_rb_b = xx_loadl_32(mask + mask_stride);
    const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);

    const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b);
    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);

    const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);

    xx_storel_64(dst, v_res_w);

    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += 2 * mask_stride;
  } while (--h);
}

static void blend_a64_mask_b10_sy_w4_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
  (void)w;
  blend_a64_mask_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                                 src1_stride, mask, mask_stride, h,
                                 blend_4_b10);
}

static void blend_a64_mask_b12_sy_w4_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
  (void)w;
  blend_a64_mask_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                                 src1_stride, mask, mask_stride, h,
                                 blend_4_b12);
}

static INLINE void blend_a64_mask_bn_sy_w8n_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int h, int w,
    blend_unit_fn blend) {
  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);

  do {
    int c;
    for (c = 0; c < w; c += 8) {
      const __m128i v_ra_b = xx_loadl_64(mask + c);
      const __m128i v_rb_b = xx_loadl_64(mask + c + mask_stride);
      const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);

      const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b);
      const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);

      const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);

      xx_storeu_128(dst + c, v_res_w);
    }
    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += 2 * mask_stride;
  } while (--h);
}

static void blend_a64_mask_b10_sy_w8n_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
  blend_a64_mask_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                                  src1_stride, mask, mask_stride, h, w,
                                  blend_8_b10);
}

static void blend_a64_mask_b12_sy_w8n_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
  blend_a64_mask_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                                  src1_stride, mask, mask_stride, h, w,
                                  blend_8_b12);
}

//////////////////////////////////////////////////////////////////////////////
// Horizontal and Vertical sub-sampling
//////////////////////////////////////////////////////////////////////////////

static INLINE void blend_a64_mask_bn_sx_sy_w4_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) {
  const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
                                         0xff, 0, 0xff, 0, 0xff, 0, 0xff);
  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);

  do {
    const __m128i v_ra_b = xx_loadl_64(mask);
    const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
    const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
    const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b);
    const __m128i v_rvsb_w =
        _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), v_zmask_b);
    const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w);

    const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);

    const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);

    xx_storel_64(dst, v_res_w);

    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += 2 * mask_stride;
  } while (--h);
}

static void blend_a64_mask_b10_sx_sy_w4_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
  (void)w;
  blend_a64_mask_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                                    src1_stride, mask, mask_stride, h,
                                    blend_4_b10);
}

static void blend_a64_mask_b12_sx_sy_w4_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
  (void)w;
  blend_a64_mask_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                                    src1_stride, mask, mask_stride, h,
                                    blend_4_b12);
}

static INLINE void blend_a64_mask_bn_sx_sy_w8n_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int h, int w,
    blend_unit_fn blend) {
  const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
                                         0xff, 0, 0xff, 0, 0xff, 0, 0xff);
  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);

  do {
    int c;
    for (c = 0; c < w; c += 8) {
      const __m128i v_ra_b = xx_loadu_128(mask + 2 * c);
      const __m128i v_rb_b = xx_loadu_128(mask + 2 * c + mask_stride);
      const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
      const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b);
      const __m128i v_rvsb_w =
          _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), v_zmask_b);
      const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w);

      const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
      const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);

      const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);

      xx_storeu_128(dst + c, v_res_w);
    }
    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += 2 * mask_stride;
  } while (--h);
}

static void blend_a64_mask_b10_sx_sy_w8n_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
  blend_a64_mask_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                                     src1_stride, mask, mask_stride, h, w,
                                     blend_8_b10);
}

static void blend_a64_mask_b12_sx_sy_w8n_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int h, int w) {
  blend_a64_mask_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                                     src1_stride, mask, mask_stride, h, w,
                                     blend_8_b12);
}

//////////////////////////////////////////////////////////////////////////////
// Dispatch
//////////////////////////////////////////////////////////////////////////////

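// The first index of the table below collapses bd == 8 and bd == 10 into
// the same row: both can use the 16-bit b10 kernels, while bd == 12 needs
// the 32-bit b12 path (see blend_sse4.h). The width index (w >> 2) & 1 is 1
// only for w == 4; every other permitted power-of-two width is a multiple
// of 8.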
void aom_highbd_blend_a64_mask_sse4_1(uint8_t *dst_8, uint32_t dst_stride,
                                      const uint8_t *src0_8,
                                      uint32_t src0_stride,
                                      const uint8_t *src1_8,
                                      uint32_t src1_stride, const uint8_t *mask,
                                      uint32_t mask_stride, int h, int w,
                                      int suby, int subx, int bd) {
  typedef void (*blend_fn)(
      uint16_t * dst, uint32_t dst_stride, const uint16_t *src0,
      uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
      const uint8_t *mask, uint32_t mask_stride, int h, int w);

  // Dimensions are: bd_index X width_index X subx X suby
  static const blend_fn blend[2][2][2][2] = {
    {  // bd == 8 or 10
      {  // w % 8 == 0
        { blend_a64_mask_b10_w8n_sse4_1, blend_a64_mask_b10_sy_w8n_sse4_1 },
        { blend_a64_mask_b10_sx_w8n_sse4_1,
          blend_a64_mask_b10_sx_sy_w8n_sse4_1 } },
      {  // w == 4
        { blend_a64_mask_b10_w4_sse4_1, blend_a64_mask_b10_sy_w4_sse4_1 },
        { blend_a64_mask_b10_sx_w4_sse4_1,
          blend_a64_mask_b10_sx_sy_w4_sse4_1 } } },
    {  // bd == 12
      {  // w % 8 == 0
        { blend_a64_mask_b12_w8n_sse4_1, blend_a64_mask_b12_sy_w8n_sse4_1 },
        { blend_a64_mask_b12_sx_w8n_sse4_1,
          blend_a64_mask_b12_sx_sy_w8n_sse4_1 } },
      {  // w == 4
        { blend_a64_mask_b12_w4_sse4_1, blend_a64_mask_b12_sy_w4_sse4_1 },
        { blend_a64_mask_b12_sx_w4_sse4_1,
          blend_a64_mask_b12_sx_sy_w4_sse4_1 } } }
  };

  assert(IMPLIES(src0_8 == dst_8, src0_stride == dst_stride));
  assert(IMPLIES(src1_8 == dst_8, src1_stride == dst_stride));

  assert(h >= 1);
  assert(w >= 1);
  assert(IS_POWER_OF_TWO(h));
  assert(IS_POWER_OF_TWO(w));

  assert(bd == 8 || bd == 10 || bd == 12);
  if (UNLIKELY((h | w) & 3)) {  // if (w <= 2 || h <= 2)
    aom_highbd_blend_a64_mask_c(dst_8, dst_stride, src0_8, src0_stride, src1_8,
                                src1_stride, mask, mask_stride, h, w, suby,
                                subx, bd);
  } else {
    uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8);
    const uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8);
    const uint16_t *const src1 = CONVERT_TO_SHORTPTR(src1_8);

    blend[bd == 12][(w >> 2) & 1][subx != 0][suby != 0](
        dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
        mask_stride, h, w);
  }
}
#endif  // CONFIG_AOM_HIGHBITDEPTH
@@ -1,285 +0,0 @@
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <smmintrin.h>  // SSE4.1

#include <assert.h>

#include "aom/aom_integer.h"
#include "aom_ports/mem.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/blend.h"

#include "aom_dsp/x86/synonyms.h"
#include "aom_dsp/x86/blend_sse4.h"

#include "./aom_dsp_rtcd.h"

//////////////////////////////////////////////////////////////////////////////
// Implementation - No sub-sampling
//////////////////////////////////////////////////////////////////////////////

static void blend_a64_vmask_w4_sse4_1(uint8_t *dst, uint32_t dst_stride,
                                      const uint8_t *src0, uint32_t src0_stride,
                                      const uint8_t *src1, uint32_t src1_stride,
                                      const uint8_t *mask, int h, int w) {
  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);

  (void)w;

  do {
    const __m128i v_m0_w = _mm_set1_epi16(*mask);
    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);

    const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w);

    const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);

    xx_storel_32(dst, v_res_b);

    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += 1;
  } while (--h);
}

static void blend_a64_vmask_w8_sse4_1(uint8_t *dst, uint32_t dst_stride,
                                      const uint8_t *src0, uint32_t src0_stride,
                                      const uint8_t *src1, uint32_t src1_stride,
                                      const uint8_t *mask, int h, int w) {
  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);

  (void)w;

  do {
    const __m128i v_m0_w = _mm_set1_epi16(*mask);
    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);

    const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w);

    const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);

    xx_storel_64(dst, v_res_b);

    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += 1;
  } while (--h);
}

static void blend_a64_vmask_w16n_sse4_1(uint8_t *dst, uint32_t dst_stride,
                                        const uint8_t *src0,
                                        uint32_t src0_stride,
                                        const uint8_t *src1,
                                        uint32_t src1_stride,
                                        const uint8_t *mask, int h, int w) {
  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);

  do {
    int c;
    const __m128i v_m0_w = _mm_set1_epi16(*mask);
    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
    for (c = 0; c < w; c += 16) {
      const __m128i v_resl_w = blend_8(src0 + c, src1 + c, v_m0_w, v_m1_w);
      const __m128i v_resh_w =
          blend_8(src0 + c + 8, src1 + c + 8, v_m0_w, v_m1_w);

      const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);

      xx_storeu_128(dst + c, v_res_b);
    }
    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += 1;
  } while (--h);
}

//////////////////////////////////////////////////////////////////////////////
// Dispatch
//////////////////////////////////////////////////////////////////////////////

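// w is a power of two, so w & 0xf indexes the table below directly: 4 and 8
// hit their dedicated kernels, 1 and 2 fall back to the C implementation,
// and any w >= 16 wraps to slot 0 (the w16n kernel). The NULL slots are
// unreachable given the power-of-two assertion.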
void aom_blend_a64_vmask_sse4_1(uint8_t *dst, uint32_t dst_stride,
                                const uint8_t *src0, uint32_t src0_stride,
                                const uint8_t *src1, uint32_t src1_stride,
                                const uint8_t *mask, int h, int w) {
  typedef void (*blend_fn)(uint8_t * dst, uint32_t dst_stride,
                           const uint8_t *src0, uint32_t src0_stride,
                           const uint8_t *src1, uint32_t src1_stride,
                           const uint8_t *mask, int h, int w);

  // Dimension: width_index
  static const blend_fn blend[9] = {
    blend_a64_vmask_w16n_sse4_1,  // w % 16 == 0
    aom_blend_a64_vmask_c,        // w == 1
    aom_blend_a64_vmask_c,        // w == 2
    NULL,                         // INVALID
    blend_a64_vmask_w4_sse4_1,    // w == 4
    NULL,                         // INVALID
    NULL,                         // INVALID
    NULL,                         // INVALID
    blend_a64_vmask_w8_sse4_1,    // w == 8
  };

  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));

  assert(h >= 1);
  assert(w >= 1);
  assert(IS_POWER_OF_TWO(h));
  assert(IS_POWER_OF_TWO(w));

  blend[w & 0xf](dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, h,
                 w);
}

#if CONFIG_AOM_HIGHBITDEPTH
//////////////////////////////////////////////////////////////////////////////
// Implementation - No sub-sampling
//////////////////////////////////////////////////////////////////////////////

static INLINE void blend_a64_vmask_bn_w4_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, int h, blend_unit_fn blend) {
  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);

  do {
    const __m128i v_m0_w = _mm_set1_epi16(*mask);
    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);

    const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);

    xx_storel_64(dst, v_res_w);

    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += 1;
  } while (--h);
}

static void blend_a64_vmask_b10_w4_sse4_1(uint16_t *dst, uint32_t dst_stride,
                                          const uint16_t *src0,
                                          uint32_t src0_stride,
                                          const uint16_t *src1,
                                          uint32_t src1_stride,
                                          const uint8_t *mask, int h, int w) {
  (void)w;
  blend_a64_vmask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                               src1_stride, mask, h, blend_4_b10);
}

static void blend_a64_vmask_b12_w4_sse4_1(uint16_t *dst, uint32_t dst_stride,
                                          const uint16_t *src0,
                                          uint32_t src0_stride,
                                          const uint16_t *src1,
                                          uint32_t src1_stride,
                                          const uint8_t *mask, int h, int w) {
  (void)w;
  blend_a64_vmask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                               src1_stride, mask, h, blend_4_b12);
}

static INLINE void blend_a64_vmask_bn_w8n_sse4_1(
    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
    const uint8_t *mask, int h, int w, blend_unit_fn blend) {
  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);

  do {
    int c;
    const __m128i v_m0_w = _mm_set1_epi16(*mask);
    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
    for (c = 0; c < w; c += 8) {
      const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);

      xx_storeu_128(dst + c, v_res_w);
    }
    dst += dst_stride;
    src0 += src0_stride;
    src1 += src1_stride;
    mask += 1;
  } while (--h);
}

static void blend_a64_vmask_b10_w8n_sse4_1(uint16_t *dst, uint32_t dst_stride,
                                           const uint16_t *src0,
                                           uint32_t src0_stride,
                                           const uint16_t *src1,
                                           uint32_t src1_stride,
                                           const uint8_t *mask, int h, int w) {
  blend_a64_vmask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                                src1_stride, mask, h, w, blend_8_b10);
}

static void blend_a64_vmask_b12_w8n_sse4_1(uint16_t *dst, uint32_t dst_stride,
                                           const uint16_t *src0,
                                           uint32_t src0_stride,
                                           const uint16_t *src1,
                                           uint32_t src1_stride,
                                           const uint8_t *mask, int h, int w) {
  blend_a64_vmask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
                                src1_stride, mask, h, w, blend_8_b12);
}

//////////////////////////////////////////////////////////////////////////////
// Dispatch
//////////////////////////////////////////////////////////////////////////////

void aom_highbd_blend_a64_vmask_sse4_1(
    uint8_t *dst_8, uint32_t dst_stride, const uint8_t *src0_8,
    uint32_t src0_stride, const uint8_t *src1_8, uint32_t src1_stride,
    const uint8_t *mask, int h, int w, int bd) {
  typedef void (*blend_fn)(uint16_t * dst, uint32_t dst_stride,
                           const uint16_t *src0, uint32_t src0_stride,
                           const uint16_t *src1, uint32_t src1_stride,
                           const uint8_t *mask, int h, int w);

  // Dimensions are: bd_index X width_index
  static const blend_fn blend[2][2] = {
    {
        // bd == 8 or 10
        blend_a64_vmask_b10_w8n_sse4_1,  // w % 8 == 0
        blend_a64_vmask_b10_w4_sse4_1,   // w == 4
    },
    {
        // bd == 12
        blend_a64_vmask_b12_w8n_sse4_1,  // w % 8 == 0
        blend_a64_vmask_b12_w4_sse4_1,   // w == 4
    }
  };

  assert(IMPLIES(src0_8 == dst_8, src0_stride == dst_stride));
  assert(IMPLIES(src1_8 == dst_8, src1_stride == dst_stride));

  assert(h >= 1);
  assert(w >= 1);
  assert(IS_POWER_OF_TWO(h));
  assert(IS_POWER_OF_TWO(w));

  assert(bd == 8 || bd == 10 || bd == 12);

  if (UNLIKELY((h | w) & 3)) {  // if (w <= 2 || h <= 2)
    aom_highbd_blend_a64_vmask_c(dst_8, dst_stride, src0_8, src0_stride, src1_8,
                                 src1_stride, mask, h, w, bd);
  } else {
    uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8);
    const uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8);
    const uint16_t *const src1 = CONVERT_TO_SHORTPTR(src1_8);

    blend[bd == 12][(w >> 2) & 1](dst, dst_stride, src0, src0_stride, src1,
                                  src1_stride, mask, h, w);
  }
}
#endif  // CONFIG_AOM_HIGHBITDEPTH
@@ -1,146 +0,0 @@
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef AOM_DSP_X86_BLEND_SSE4_H_
#define AOM_DSP_X86_BLEND_SSE4_H_

#include "aom_dsp/blend.h"
#include "aom_dsp/x86/synonyms.h"

//////////////////////////////////////////////////////////////////////////////
// Common kernels
//////////////////////////////////////////////////////////////////////////////

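// All kernels compute, per pixel,
//   res = (src0 * m0 + src1 * m1 + (1 << (AOM_BLEND_A64_ROUND_BITS - 1)))
//             >> AOM_BLEND_A64_ROUND_BITS
// where m0 is the mask value, m1 = AOM_BLEND_A64_MAX_ALPHA - m0, and
// AOM_BLEND_A64_MAX_ALPHA == 1 << AOM_BLEND_A64_ROUND_BITS == 64.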
static INLINE __m128i blend_4(const uint8_t *src0, const uint8_t *src1,
                              const __m128i v_m0_w, const __m128i v_m1_w) {
  const __m128i v_s0_b = xx_loadl_32(src0);
  const __m128i v_s1_b = xx_loadl_32(src1);
  const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b);
  const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b);

  const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
  const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);

  const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);

  const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS);

  return v_res_w;
}

static INLINE __m128i blend_8(const uint8_t *src0, const uint8_t *src1,
                              const __m128i v_m0_w, const __m128i v_m1_w) {
  const __m128i v_s0_b = xx_loadl_64(src0);
  const __m128i v_s1_b = xx_loadl_64(src1);
  const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b);
  const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b);

  const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
  const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);

  const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);

  const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS);

  return v_res_w;
}

#if CONFIG_AOM_HIGHBITDEPTH
typedef __m128i (*blend_unit_fn)(const uint16_t *src0, const uint16_t *src1,
                                 const __m128i v_m0_w, const __m128i v_m1_w);

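// For bd <= 10 the 16-bit arithmetic is exact: with m0 + m1 == 64 the sum
// s0 * m0 + s1 * m1 is at most 1023 * 64 == 65472 < 65536, so the b10
// kernels can stay entirely in 16 bits. At bd == 12 the sum can reach
// 4095 * 64 == 262080, which overflows 16 bits, so the b12 kernels further
// below widen to 32 bits with _mm_madd_epi16 instead.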
static INLINE __m128i blend_4_b10(const uint16_t *src0, const uint16_t *src1,
                                  const __m128i v_m0_w, const __m128i v_m1_w) {
  const __m128i v_s0_w = xx_loadl_64(src0);
  const __m128i v_s1_w = xx_loadl_64(src1);

  const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
  const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);

  const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);

  const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS);

  return v_res_w;
}

static INLINE __m128i blend_8_b10(const uint16_t *src0, const uint16_t *src1,
                                  const __m128i v_m0_w, const __m128i v_m1_w) {
  const __m128i v_s0_w = xx_loadu_128(src0);
  const __m128i v_s1_w = xx_loadu_128(src1);

  const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
  const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);

  const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);

  const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS);

  return v_res_w;
}

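// The b12 kernels interleave mask and source so that _mm_madd_epi16 forms
// s0 * m0 + s1 * m1 as a 32-bit sum. Shifting right by
// AOM_BLEND_A64_ROUND_BITS - 1 keeps the result within int16 range for the
// signed pack, and the final xx_round_epu16 halves with rounding, so the
// net effect equals ROUND_POWER_OF_TWO(sum, AOM_BLEND_A64_ROUND_BITS).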
static INLINE __m128i blend_4_b12(const uint16_t *src0, const uint16_t *src1,
                                  const __m128i v_m0_w, const __m128i v_m1_w) {
  const __m128i v_s0_w = xx_loadl_64(src0);
  const __m128i v_s1_w = xx_loadl_64(src1);

  // Interleave
  const __m128i v_m01_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w);
  const __m128i v_s01_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w);

  // Multiply-Add
  const __m128i v_sum_d = _mm_madd_epi16(v_s01_w, v_m01_w);

  // Scale
  const __m128i v_ssum_d =
      _mm_srli_epi32(v_sum_d, AOM_BLEND_A64_ROUND_BITS - 1);

  // Pack
  const __m128i v_pssum_d = _mm_packs_epi32(v_ssum_d, v_ssum_d);

  // Round
  const __m128i v_res_w = xx_round_epu16(v_pssum_d);

  return v_res_w;
}

static INLINE __m128i blend_8_b12(const uint16_t *src0, const uint16_t *src1,
                                  const __m128i v_m0_w, const __m128i v_m1_w) {
  const __m128i v_s0_w = xx_loadu_128(src0);
  const __m128i v_s1_w = xx_loadu_128(src1);

  // Interleave
  const __m128i v_m01l_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w);
  const __m128i v_m01h_w = _mm_unpackhi_epi16(v_m0_w, v_m1_w);
  const __m128i v_s01l_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w);
  const __m128i v_s01h_w = _mm_unpackhi_epi16(v_s0_w, v_s1_w);

  // Multiply-Add
  const __m128i v_suml_d = _mm_madd_epi16(v_s01l_w, v_m01l_w);
  const __m128i v_sumh_d = _mm_madd_epi16(v_s01h_w, v_m01h_w);

  // Scale
  const __m128i v_ssuml_d =
      _mm_srli_epi32(v_suml_d, AOM_BLEND_A64_ROUND_BITS - 1);
  const __m128i v_ssumh_d =
      _mm_srli_epi32(v_sumh_d, AOM_BLEND_A64_ROUND_BITS - 1);

  // Pack
  const __m128i v_pssum_d = _mm_packs_epi32(v_ssuml_d, v_ssumh_d);

  // Round
  const __m128i v_res_w = xx_round_epu16(v_pssum_d);

  return v_res_w;
}
#endif  // CONFIG_AOM_HIGHBITDEPTH

#endif  // AOM_DSP_X86_BLEND_SSE4_H_
@@ -1,862 +0,0 @@
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <emmintrin.h>  // SSE2

#include "aom_dsp/fwd_txfm.h"
#include "aom_dsp/txfm_common.h"
#include "aom_dsp/x86/txfm_common_sse2.h"

// Apply a 32-element forward DCT to 8 columns. This does not do any
// transposition of its output - the caller is expected to do that.
// The input buffers are the top and bottom halves of an 8x32 block.
void fdct32_8col(__m128i *in0, __m128i *in1) {
  // Constants
  // When we use them, in one case, they are all the same. In all others
  // it's a pair of them that we need to repeat four times. This is done
  // by constructing the 32 bit constant corresponding to that pair.
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(+cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_p24_p08 = pair_set_epi16(+cospi_24_64, cospi_8_64);
  const __m128i k__cospi_p12_p20 = pair_set_epi16(+cospi_12_64, cospi_20_64);
  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
  const __m128i k__cospi_p28_p04 = pair_set_epi16(+cospi_28_64, cospi_4_64);
  const __m128i k__cospi_m28_m04 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
  const __m128i k__cospi_m12_m20 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
  const __m128i k__cospi_p30_p02 = pair_set_epi16(+cospi_30_64, cospi_2_64);
  const __m128i k__cospi_p14_p18 = pair_set_epi16(+cospi_14_64, cospi_18_64);
  const __m128i k__cospi_p22_p10 = pair_set_epi16(+cospi_22_64, cospi_10_64);
  const __m128i k__cospi_p06_p26 = pair_set_epi16(+cospi_6_64, cospi_26_64);
  const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
  const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
  const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
  const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
  const __m128i k__cospi_p31_p01 = pair_set_epi16(+cospi_31_64, cospi_1_64);
  const __m128i k__cospi_p15_p17 = pair_set_epi16(+cospi_15_64, cospi_17_64);
  const __m128i k__cospi_p23_p09 = pair_set_epi16(+cospi_23_64, cospi_9_64);
  const __m128i k__cospi_p07_p25 = pair_set_epi16(+cospi_7_64, cospi_25_64);
  const __m128i k__cospi_m25_p07 = pair_set_epi16(-cospi_25_64, cospi_7_64);
  const __m128i k__cospi_m09_p23 = pair_set_epi16(-cospi_9_64, cospi_23_64);
  const __m128i k__cospi_m17_p15 = pair_set_epi16(-cospi_17_64, cospi_15_64);
  const __m128i k__cospi_m01_p31 = pair_set_epi16(-cospi_1_64, cospi_31_64);
  const __m128i k__cospi_p27_p05 = pair_set_epi16(+cospi_27_64, cospi_5_64);
  const __m128i k__cospi_p11_p21 = pair_set_epi16(+cospi_11_64, cospi_21_64);
  const __m128i k__cospi_p19_p13 = pair_set_epi16(+cospi_19_64, cospi_13_64);
  const __m128i k__cospi_p03_p29 = pair_set_epi16(+cospi_3_64, cospi_29_64);
  const __m128i k__cospi_m29_p03 = pair_set_epi16(-cospi_29_64, cospi_3_64);
  const __m128i k__cospi_m13_p19 = pair_set_epi16(-cospi_13_64, cospi_19_64);
  const __m128i k__cospi_m21_p11 = pair_set_epi16(-cospi_21_64, cospi_11_64);
  const __m128i k__cospi_m05_p27 = pair_set_epi16(-cospi_5_64, cospi_27_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);

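  // Each cross-multiplication stage below unpacks two 16-bit rows into
  // interleaved pairs and multiplies by a (cos, sin) constant pair with
  // _mm_madd_epi16, giving a * c0 + b * c1 in 32 bits. dct_const_round_shift
  // is then done manually: add DCT_CONST_ROUNDING == 1 << (DCT_CONST_BITS - 1)
  // and arithmetic-shift right by DCT_CONST_BITS before packing back to
  // 16 bits.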
  __m128i step1[32];
  __m128i step2[32];
  __m128i step3[32];
  __m128i out[32];
  // Stage 1
  {
    const __m128i *ina = in0;
    const __m128i *inb = in1 + 15;
    __m128i *step1a = &step1[0];
    __m128i *step1b = &step1[31];
    const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
    const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + 1));
    const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + 2));
    const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + 3));
    const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - 3));
    const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - 2));
    const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - 1));
    const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
    step1a[0] = _mm_add_epi16(ina0, inb0);
    step1a[1] = _mm_add_epi16(ina1, inb1);
    step1a[2] = _mm_add_epi16(ina2, inb2);
    step1a[3] = _mm_add_epi16(ina3, inb3);
    step1b[-3] = _mm_sub_epi16(ina3, inb3);
    step1b[-2] = _mm_sub_epi16(ina2, inb2);
    step1b[-1] = _mm_sub_epi16(ina1, inb1);
    step1b[-0] = _mm_sub_epi16(ina0, inb0);
  }
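  // The three blocks that follow repeat the same butterfly on the next
  // 4-row slices: rows 4..7 pair with rows 27..24, rows 8..11 with 23..20,
  // and rows 12..15 with 19..16, filling all 32 step1 entries.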
{
|
||||
const __m128i *ina = in0 + 4;
|
||||
const __m128i *inb = in1 + 11;
|
||||
__m128i *step1a = &step1[4];
|
||||
__m128i *step1b = &step1[27];
|
||||
const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
|
||||
const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + 1));
|
||||
const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + 2));
|
||||
const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + 3));
|
||||
const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - 3));
|
||||
const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - 2));
|
||||
const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - 1));
|
||||
const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
|
||||
step1a[0] = _mm_add_epi16(ina0, inb0);
|
||||
step1a[1] = _mm_add_epi16(ina1, inb1);
|
||||
step1a[2] = _mm_add_epi16(ina2, inb2);
|
||||
step1a[3] = _mm_add_epi16(ina3, inb3);
|
||||
step1b[-3] = _mm_sub_epi16(ina3, inb3);
|
||||
step1b[-2] = _mm_sub_epi16(ina2, inb2);
|
||||
step1b[-1] = _mm_sub_epi16(ina1, inb1);
|
||||
step1b[-0] = _mm_sub_epi16(ina0, inb0);
|
||||
}
|
||||
{
|
||||
const __m128i *ina = in0 + 8;
|
||||
const __m128i *inb = in1 + 7;
|
||||
__m128i *step1a = &step1[8];
|
||||
__m128i *step1b = &step1[23];
|
||||
const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
|
||||
const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + 1));
|
||||
const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + 2));
|
||||
const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + 3));
|
||||
const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - 3));
|
||||
const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - 2));
|
||||
const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - 1));
|
||||
const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
|
||||
step1a[0] = _mm_add_epi16(ina0, inb0);
|
||||
step1a[1] = _mm_add_epi16(ina1, inb1);
|
||||
step1a[2] = _mm_add_epi16(ina2, inb2);
|
||||
step1a[3] = _mm_add_epi16(ina3, inb3);
|
||||
step1b[-3] = _mm_sub_epi16(ina3, inb3);
|
||||
step1b[-2] = _mm_sub_epi16(ina2, inb2);
|
||||
step1b[-1] = _mm_sub_epi16(ina1, inb1);
|
||||
step1b[-0] = _mm_sub_epi16(ina0, inb0);
|
||||
}
|
||||
{
|
||||
const __m128i *ina = in0 + 12;
|
||||
const __m128i *inb = in1 + 3;
|
||||
__m128i *step1a = &step1[12];
|
||||
__m128i *step1b = &step1[19];
|
||||
const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
|
||||
const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + 1));
|
||||
const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + 2));
|
||||
const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + 3));
|
||||
const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - 3));
|
||||
const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - 2));
|
||||
const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - 1));
|
||||
const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
|
||||
step1a[0] = _mm_add_epi16(ina0, inb0);
|
||||
step1a[1] = _mm_add_epi16(ina1, inb1);
|
||||
step1a[2] = _mm_add_epi16(ina2, inb2);
|
||||
step1a[3] = _mm_add_epi16(ina3, inb3);
|
||||
step1b[-3] = _mm_sub_epi16(ina3, inb3);
|
||||
step1b[-2] = _mm_sub_epi16(ina2, inb2);
|
||||
step1b[-1] = _mm_sub_epi16(ina1, inb1);
|
||||
step1b[-0] = _mm_sub_epi16(ina0, inb0);
|
||||
}
|
||||
// Stage 2
|
||||
{
|
||||
step2[0] = _mm_add_epi16(step1[0], step1[15]);
|
||||
step2[1] = _mm_add_epi16(step1[1], step1[14]);
|
||||
step2[2] = _mm_add_epi16(step1[2], step1[13]);
|
||||
step2[3] = _mm_add_epi16(step1[3], step1[12]);
|
||||
step2[4] = _mm_add_epi16(step1[4], step1[11]);
|
||||
step2[5] = _mm_add_epi16(step1[5], step1[10]);
|
||||
step2[6] = _mm_add_epi16(step1[6], step1[9]);
|
||||
step2[7] = _mm_add_epi16(step1[7], step1[8]);
|
||||
step2[8] = _mm_sub_epi16(step1[7], step1[8]);
|
||||
step2[9] = _mm_sub_epi16(step1[6], step1[9]);
|
||||
step2[10] = _mm_sub_epi16(step1[5], step1[10]);
|
||||
step2[11] = _mm_sub_epi16(step1[4], step1[11]);
|
||||
step2[12] = _mm_sub_epi16(step1[3], step1[12]);
|
||||
step2[13] = _mm_sub_epi16(step1[2], step1[13]);
|
||||
step2[14] = _mm_sub_epi16(step1[1], step1[14]);
|
||||
step2[15] = _mm_sub_epi16(step1[0], step1[15]);
|
||||
}
|
||||
{
|
||||
const __m128i s2_20_0 = _mm_unpacklo_epi16(step1[27], step1[20]);
|
||||
const __m128i s2_20_1 = _mm_unpackhi_epi16(step1[27], step1[20]);
|
||||
const __m128i s2_21_0 = _mm_unpacklo_epi16(step1[26], step1[21]);
|
||||
const __m128i s2_21_1 = _mm_unpackhi_epi16(step1[26], step1[21]);
|
||||
const __m128i s2_22_0 = _mm_unpacklo_epi16(step1[25], step1[22]);
|
||||
const __m128i s2_22_1 = _mm_unpackhi_epi16(step1[25], step1[22]);
|
||||
const __m128i s2_23_0 = _mm_unpacklo_epi16(step1[24], step1[23]);
|
||||
const __m128i s2_23_1 = _mm_unpackhi_epi16(step1[24], step1[23]);
|
||||
const __m128i s2_20_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_m16);
|
||||
const __m128i s2_20_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_m16);
|
||||
const __m128i s2_21_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_m16);
|
||||
const __m128i s2_21_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_m16);
|
||||
const __m128i s2_22_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_m16);
|
||||
const __m128i s2_22_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_m16);
|
||||
const __m128i s2_23_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_m16);
|
||||
const __m128i s2_23_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_m16);
|
||||
const __m128i s2_24_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_p16);
|
||||
const __m128i s2_24_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_p16);
|
||||
const __m128i s2_25_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_p16);
|
||||
const __m128i s2_25_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_p16);
|
||||
const __m128i s2_26_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_p16);
|
||||
const __m128i s2_26_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_p16);
|
||||
const __m128i s2_27_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_p16);
|
||||
const __m128i s2_27_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_p16);
|
||||
// dct_const_round_shift
      const __m128i s2_20_4 = _mm_add_epi32(s2_20_2, k__DCT_CONST_ROUNDING);
      const __m128i s2_20_5 = _mm_add_epi32(s2_20_3, k__DCT_CONST_ROUNDING);
      const __m128i s2_21_4 = _mm_add_epi32(s2_21_2, k__DCT_CONST_ROUNDING);
      const __m128i s2_21_5 = _mm_add_epi32(s2_21_3, k__DCT_CONST_ROUNDING);
      const __m128i s2_22_4 = _mm_add_epi32(s2_22_2, k__DCT_CONST_ROUNDING);
      const __m128i s2_22_5 = _mm_add_epi32(s2_22_3, k__DCT_CONST_ROUNDING);
      const __m128i s2_23_4 = _mm_add_epi32(s2_23_2, k__DCT_CONST_ROUNDING);
      const __m128i s2_23_5 = _mm_add_epi32(s2_23_3, k__DCT_CONST_ROUNDING);
      const __m128i s2_24_4 = _mm_add_epi32(s2_24_2, k__DCT_CONST_ROUNDING);
      const __m128i s2_24_5 = _mm_add_epi32(s2_24_3, k__DCT_CONST_ROUNDING);
      const __m128i s2_25_4 = _mm_add_epi32(s2_25_2, k__DCT_CONST_ROUNDING);
      const __m128i s2_25_5 = _mm_add_epi32(s2_25_3, k__DCT_CONST_ROUNDING);
      const __m128i s2_26_4 = _mm_add_epi32(s2_26_2, k__DCT_CONST_ROUNDING);
      const __m128i s2_26_5 = _mm_add_epi32(s2_26_3, k__DCT_CONST_ROUNDING);
      const __m128i s2_27_4 = _mm_add_epi32(s2_27_2, k__DCT_CONST_ROUNDING);
      const __m128i s2_27_5 = _mm_add_epi32(s2_27_3, k__DCT_CONST_ROUNDING);
      const __m128i s2_20_6 = _mm_srai_epi32(s2_20_4, DCT_CONST_BITS);
      const __m128i s2_20_7 = _mm_srai_epi32(s2_20_5, DCT_CONST_BITS);
      const __m128i s2_21_6 = _mm_srai_epi32(s2_21_4, DCT_CONST_BITS);
      const __m128i s2_21_7 = _mm_srai_epi32(s2_21_5, DCT_CONST_BITS);
      const __m128i s2_22_6 = _mm_srai_epi32(s2_22_4, DCT_CONST_BITS);
      const __m128i s2_22_7 = _mm_srai_epi32(s2_22_5, DCT_CONST_BITS);
      const __m128i s2_23_6 = _mm_srai_epi32(s2_23_4, DCT_CONST_BITS);
      const __m128i s2_23_7 = _mm_srai_epi32(s2_23_5, DCT_CONST_BITS);
      const __m128i s2_24_6 = _mm_srai_epi32(s2_24_4, DCT_CONST_BITS);
      const __m128i s2_24_7 = _mm_srai_epi32(s2_24_5, DCT_CONST_BITS);
      const __m128i s2_25_6 = _mm_srai_epi32(s2_25_4, DCT_CONST_BITS);
      const __m128i s2_25_7 = _mm_srai_epi32(s2_25_5, DCT_CONST_BITS);
      const __m128i s2_26_6 = _mm_srai_epi32(s2_26_4, DCT_CONST_BITS);
      const __m128i s2_26_7 = _mm_srai_epi32(s2_26_5, DCT_CONST_BITS);
      const __m128i s2_27_6 = _mm_srai_epi32(s2_27_4, DCT_CONST_BITS);
      const __m128i s2_27_7 = _mm_srai_epi32(s2_27_5, DCT_CONST_BITS);
      // Combine
      step2[20] = _mm_packs_epi32(s2_20_6, s2_20_7);
      step2[21] = _mm_packs_epi32(s2_21_6, s2_21_7);
      step2[22] = _mm_packs_epi32(s2_22_6, s2_22_7);
      step2[23] = _mm_packs_epi32(s2_23_6, s2_23_7);
      step2[24] = _mm_packs_epi32(s2_24_6, s2_24_7);
      step2[25] = _mm_packs_epi32(s2_25_6, s2_25_7);
      step2[26] = _mm_packs_epi32(s2_26_6, s2_26_7);
      step2[27] = _mm_packs_epi32(s2_27_6, s2_27_7);
    }
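    // Note on the idiom above: unpacklo/unpackhi interleave two rows of 16-bit
    // coefficients so that _mm_madd_epi16 against a constant pair (e.g.
    // k__cospi_p16_m16 = {cospi_16_64, -cospi_16_64}) evaluates the butterfly
    // a * c0 + b * c1 in each 32-bit lane.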
    // Stage 3
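    // The first block is a mirror-and-fold of step2[0..7]:
    // step3[i] = step2[7 - i] + step2[i] for i = 0..3 and
    // step3[i] = step2[7 - i] - step2[i] for i = 4..7.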
    {
      step3[0] = _mm_add_epi16(step2[(8 - 1)], step2[0]);
      step3[1] = _mm_add_epi16(step2[(8 - 2)], step2[1]);
      step3[2] = _mm_add_epi16(step2[(8 - 3)], step2[2]);
      step3[3] = _mm_add_epi16(step2[(8 - 4)], step2[3]);
      step3[4] = _mm_sub_epi16(step2[(8 - 5)], step2[4]);
      step3[5] = _mm_sub_epi16(step2[(8 - 6)], step2[5]);
      step3[6] = _mm_sub_epi16(step2[(8 - 7)], step2[6]);
      step3[7] = _mm_sub_epi16(step2[(8 - 8)], step2[7]);
    }
    {
      const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]);
      const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]);
      const __m128i s3_11_0 = _mm_unpacklo_epi16(step2[12], step2[11]);
      const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]);
      const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16);
      const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16);
      const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16);
      const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16);
      const __m128i s3_12_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_p16);
      const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_p16);
      const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16);
      const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16);
      // dct_const_round_shift
      const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING);
      const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING);
      const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING);
      const __m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING);
      const __m128i s3_12_4 = _mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING);
      const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING);
      const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING);
      const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING);
      const __m128i s3_10_6 = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS);
      const __m128i s3_10_7 = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS);
      const __m128i s3_11_6 = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS);
      const __m128i s3_11_7 = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS);
      const __m128i s3_12_6 = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS);
      const __m128i s3_12_7 = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS);
      const __m128i s3_13_6 = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS);
      const __m128i s3_13_7 = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS);
      // Combine
      step3[10] = _mm_packs_epi32(s3_10_6, s3_10_7);
      step3[11] = _mm_packs_epi32(s3_11_6, s3_11_7);
      step3[12] = _mm_packs_epi32(s3_12_6, s3_12_7);
      step3[13] = _mm_packs_epi32(s3_13_6, s3_13_7);
    }
    {
      step3[16] = _mm_add_epi16(step2[23], step1[16]);
      step3[17] = _mm_add_epi16(step2[22], step1[17]);
      step3[18] = _mm_add_epi16(step2[21], step1[18]);
      step3[19] = _mm_add_epi16(step2[20], step1[19]);
      step3[20] = _mm_sub_epi16(step1[19], step2[20]);
      step3[21] = _mm_sub_epi16(step1[18], step2[21]);
      step3[22] = _mm_sub_epi16(step1[17], step2[22]);
      step3[23] = _mm_sub_epi16(step1[16], step2[23]);
      step3[24] = _mm_sub_epi16(step1[31], step2[24]);
      step3[25] = _mm_sub_epi16(step1[30], step2[25]);
      step3[26] = _mm_sub_epi16(step1[29], step2[26]);
      step3[27] = _mm_sub_epi16(step1[28], step2[27]);
      step3[28] = _mm_add_epi16(step2[27], step1[28]);
      step3[29] = _mm_add_epi16(step2[26], step1[29]);
      step3[30] = _mm_add_epi16(step2[25], step1[30]);
      step3[31] = _mm_add_epi16(step2[24], step1[31]);
    }

    // Stage 4
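    // Stage 4 recombines the 0..3 and 8..15 terms with add/sub butterflies,
    // then rotates the 18..21 / 26..29 terms using the cospi_8/cospi_24
    // constant pairs.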
    {
      step1[0] = _mm_add_epi16(step3[3], step3[0]);
      step1[1] = _mm_add_epi16(step3[2], step3[1]);
      step1[2] = _mm_sub_epi16(step3[1], step3[2]);
      step1[3] = _mm_sub_epi16(step3[0], step3[3]);
      step1[8] = _mm_add_epi16(step3[11], step2[8]);
      step1[9] = _mm_add_epi16(step3[10], step2[9]);
      step1[10] = _mm_sub_epi16(step2[9], step3[10]);
      step1[11] = _mm_sub_epi16(step2[8], step3[11]);
      step1[12] = _mm_sub_epi16(step2[15], step3[12]);
      step1[13] = _mm_sub_epi16(step2[14], step3[13]);
      step1[14] = _mm_add_epi16(step3[13], step2[14]);
      step1[15] = _mm_add_epi16(step3[12], step2[15]);
    }
    {
      const __m128i s1_05_0 = _mm_unpacklo_epi16(step3[6], step3[5]);
      const __m128i s1_05_1 = _mm_unpackhi_epi16(step3[6], step3[5]);
      const __m128i s1_05_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_m16);
      const __m128i s1_05_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_m16);
      const __m128i s1_06_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_p16);
      const __m128i s1_06_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_p16);
      // dct_const_round_shift
      const __m128i s1_05_4 = _mm_add_epi32(s1_05_2, k__DCT_CONST_ROUNDING);
      const __m128i s1_05_5 = _mm_add_epi32(s1_05_3, k__DCT_CONST_ROUNDING);
      const __m128i s1_06_4 = _mm_add_epi32(s1_06_2, k__DCT_CONST_ROUNDING);
      const __m128i s1_06_5 = _mm_add_epi32(s1_06_3, k__DCT_CONST_ROUNDING);
      const __m128i s1_05_6 = _mm_srai_epi32(s1_05_4, DCT_CONST_BITS);
      const __m128i s1_05_7 = _mm_srai_epi32(s1_05_5, DCT_CONST_BITS);
      const __m128i s1_06_6 = _mm_srai_epi32(s1_06_4, DCT_CONST_BITS);
      const __m128i s1_06_7 = _mm_srai_epi32(s1_06_5, DCT_CONST_BITS);
      // Combine
      step1[5] = _mm_packs_epi32(s1_05_6, s1_05_7);
      step1[6] = _mm_packs_epi32(s1_06_6, s1_06_7);
    }
    {
      const __m128i s1_18_0 = _mm_unpacklo_epi16(step3[18], step3[29]);
      const __m128i s1_18_1 = _mm_unpackhi_epi16(step3[18], step3[29]);
      const __m128i s1_19_0 = _mm_unpacklo_epi16(step3[19], step3[28]);
      const __m128i s1_19_1 = _mm_unpackhi_epi16(step3[19], step3[28]);
      const __m128i s1_20_0 = _mm_unpacklo_epi16(step3[20], step3[27]);
      const __m128i s1_20_1 = _mm_unpackhi_epi16(step3[20], step3[27]);
      const __m128i s1_21_0 = _mm_unpacklo_epi16(step3[21], step3[26]);
      const __m128i s1_21_1 = _mm_unpackhi_epi16(step3[21], step3[26]);
      const __m128i s1_18_2 = _mm_madd_epi16(s1_18_0, k__cospi_m08_p24);
      const __m128i s1_18_3 = _mm_madd_epi16(s1_18_1, k__cospi_m08_p24);
      const __m128i s1_19_2 = _mm_madd_epi16(s1_19_0, k__cospi_m08_p24);
      const __m128i s1_19_3 = _mm_madd_epi16(s1_19_1, k__cospi_m08_p24);
      const __m128i s1_20_2 = _mm_madd_epi16(s1_20_0, k__cospi_m24_m08);
      const __m128i s1_20_3 = _mm_madd_epi16(s1_20_1, k__cospi_m24_m08);
      const __m128i s1_21_2 = _mm_madd_epi16(s1_21_0, k__cospi_m24_m08);
      const __m128i s1_21_3 = _mm_madd_epi16(s1_21_1, k__cospi_m24_m08);
      const __m128i s1_26_2 = _mm_madd_epi16(s1_21_0, k__cospi_m08_p24);
      const __m128i s1_26_3 = _mm_madd_epi16(s1_21_1, k__cospi_m08_p24);
      const __m128i s1_27_2 = _mm_madd_epi16(s1_20_0, k__cospi_m08_p24);
      const __m128i s1_27_3 = _mm_madd_epi16(s1_20_1, k__cospi_m08_p24);
      const __m128i s1_28_2 = _mm_madd_epi16(s1_19_0, k__cospi_p24_p08);
      const __m128i s1_28_3 = _mm_madd_epi16(s1_19_1, k__cospi_p24_p08);
      const __m128i s1_29_2 = _mm_madd_epi16(s1_18_0, k__cospi_p24_p08);
      const __m128i s1_29_3 = _mm_madd_epi16(s1_18_1, k__cospi_p24_p08);
      // dct_const_round_shift
      const __m128i s1_18_4 = _mm_add_epi32(s1_18_2, k__DCT_CONST_ROUNDING);
      const __m128i s1_18_5 = _mm_add_epi32(s1_18_3, k__DCT_CONST_ROUNDING);
      const __m128i s1_19_4 = _mm_add_epi32(s1_19_2, k__DCT_CONST_ROUNDING);
      const __m128i s1_19_5 = _mm_add_epi32(s1_19_3, k__DCT_CONST_ROUNDING);
      const __m128i s1_20_4 = _mm_add_epi32(s1_20_2, k__DCT_CONST_ROUNDING);
      const __m128i s1_20_5 = _mm_add_epi32(s1_20_3, k__DCT_CONST_ROUNDING);
      const __m128i s1_21_4 = _mm_add_epi32(s1_21_2, k__DCT_CONST_ROUNDING);
      const __m128i s1_21_5 = _mm_add_epi32(s1_21_3, k__DCT_CONST_ROUNDING);
      const __m128i s1_26_4 = _mm_add_epi32(s1_26_2, k__DCT_CONST_ROUNDING);
      const __m128i s1_26_5 = _mm_add_epi32(s1_26_3, k__DCT_CONST_ROUNDING);
      const __m128i s1_27_4 = _mm_add_epi32(s1_27_2, k__DCT_CONST_ROUNDING);
      const __m128i s1_27_5 = _mm_add_epi32(s1_27_3, k__DCT_CONST_ROUNDING);
      const __m128i s1_28_4 = _mm_add_epi32(s1_28_2, k__DCT_CONST_ROUNDING);
      const __m128i s1_28_5 = _mm_add_epi32(s1_28_3, k__DCT_CONST_ROUNDING);
      const __m128i s1_29_4 = _mm_add_epi32(s1_29_2, k__DCT_CONST_ROUNDING);
      const __m128i s1_29_5 = _mm_add_epi32(s1_29_3, k__DCT_CONST_ROUNDING);
      const __m128i s1_18_6 = _mm_srai_epi32(s1_18_4, DCT_CONST_BITS);
      const __m128i s1_18_7 = _mm_srai_epi32(s1_18_5, DCT_CONST_BITS);
      const __m128i s1_19_6 = _mm_srai_epi32(s1_19_4, DCT_CONST_BITS);
      const __m128i s1_19_7 = _mm_srai_epi32(s1_19_5, DCT_CONST_BITS);
      const __m128i s1_20_6 = _mm_srai_epi32(s1_20_4, DCT_CONST_BITS);
      const __m128i s1_20_7 = _mm_srai_epi32(s1_20_5, DCT_CONST_BITS);
      const __m128i s1_21_6 = _mm_srai_epi32(s1_21_4, DCT_CONST_BITS);
      const __m128i s1_21_7 = _mm_srai_epi32(s1_21_5, DCT_CONST_BITS);
      const __m128i s1_26_6 = _mm_srai_epi32(s1_26_4, DCT_CONST_BITS);
      const __m128i s1_26_7 = _mm_srai_epi32(s1_26_5, DCT_CONST_BITS);
      const __m128i s1_27_6 = _mm_srai_epi32(s1_27_4, DCT_CONST_BITS);
      const __m128i s1_27_7 = _mm_srai_epi32(s1_27_5, DCT_CONST_BITS);
      const __m128i s1_28_6 = _mm_srai_epi32(s1_28_4, DCT_CONST_BITS);
      const __m128i s1_28_7 = _mm_srai_epi32(s1_28_5, DCT_CONST_BITS);
      const __m128i s1_29_6 = _mm_srai_epi32(s1_29_4, DCT_CONST_BITS);
      const __m128i s1_29_7 = _mm_srai_epi32(s1_29_5, DCT_CONST_BITS);
      // Combine
      step1[18] = _mm_packs_epi32(s1_18_6, s1_18_7);
      step1[19] = _mm_packs_epi32(s1_19_6, s1_19_7);
      step1[20] = _mm_packs_epi32(s1_20_6, s1_20_7);
      step1[21] = _mm_packs_epi32(s1_21_6, s1_21_7);
      step1[26] = _mm_packs_epi32(s1_26_6, s1_26_7);
      step1[27] = _mm_packs_epi32(s1_27_6, s1_27_7);
      step1[28] = _mm_packs_epi32(s1_28_6, s1_28_7);
      step1[29] = _mm_packs_epi32(s1_29_6, s1_29_7);
    }
    // Stage 5
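    // Stage 5 already yields four final coefficients: out[0], out[8], out[16]
    // and out[24].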
    {
      step2[4] = _mm_add_epi16(step1[5], step3[4]);
      step2[5] = _mm_sub_epi16(step3[4], step1[5]);
      step2[6] = _mm_sub_epi16(step3[7], step1[6]);
      step2[7] = _mm_add_epi16(step1[6], step3[7]);
    }
    {
      const __m128i out_00_0 = _mm_unpacklo_epi16(step1[0], step1[1]);
      const __m128i out_00_1 = _mm_unpackhi_epi16(step1[0], step1[1]);
      const __m128i out_08_0 = _mm_unpacklo_epi16(step1[2], step1[3]);
      const __m128i out_08_1 = _mm_unpackhi_epi16(step1[2], step1[3]);
      const __m128i out_00_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_p16);
      const __m128i out_00_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_p16);
      const __m128i out_16_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_m16);
      const __m128i out_16_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_m16);
      const __m128i out_08_2 = _mm_madd_epi16(out_08_0, k__cospi_p24_p08);
      const __m128i out_08_3 = _mm_madd_epi16(out_08_1, k__cospi_p24_p08);
      const __m128i out_24_2 = _mm_madd_epi16(out_08_0, k__cospi_m08_p24);
      const __m128i out_24_3 = _mm_madd_epi16(out_08_1, k__cospi_m08_p24);
      // dct_const_round_shift
      const __m128i out_00_4 = _mm_add_epi32(out_00_2, k__DCT_CONST_ROUNDING);
      const __m128i out_00_5 = _mm_add_epi32(out_00_3, k__DCT_CONST_ROUNDING);
      const __m128i out_16_4 = _mm_add_epi32(out_16_2, k__DCT_CONST_ROUNDING);
      const __m128i out_16_5 = _mm_add_epi32(out_16_3, k__DCT_CONST_ROUNDING);
      const __m128i out_08_4 = _mm_add_epi32(out_08_2, k__DCT_CONST_ROUNDING);
      const __m128i out_08_5 = _mm_add_epi32(out_08_3, k__DCT_CONST_ROUNDING);
      const __m128i out_24_4 = _mm_add_epi32(out_24_2, k__DCT_CONST_ROUNDING);
      const __m128i out_24_5 = _mm_add_epi32(out_24_3, k__DCT_CONST_ROUNDING);
      const __m128i out_00_6 = _mm_srai_epi32(out_00_4, DCT_CONST_BITS);
      const __m128i out_00_7 = _mm_srai_epi32(out_00_5, DCT_CONST_BITS);
      const __m128i out_16_6 = _mm_srai_epi32(out_16_4, DCT_CONST_BITS);
      const __m128i out_16_7 = _mm_srai_epi32(out_16_5, DCT_CONST_BITS);
      const __m128i out_08_6 = _mm_srai_epi32(out_08_4, DCT_CONST_BITS);
      const __m128i out_08_7 = _mm_srai_epi32(out_08_5, DCT_CONST_BITS);
      const __m128i out_24_6 = _mm_srai_epi32(out_24_4, DCT_CONST_BITS);
      const __m128i out_24_7 = _mm_srai_epi32(out_24_5, DCT_CONST_BITS);
      // Combine
      out[0] = _mm_packs_epi32(out_00_6, out_00_7);
      out[16] = _mm_packs_epi32(out_16_6, out_16_7);
      out[8] = _mm_packs_epi32(out_08_6, out_08_7);
      out[24] = _mm_packs_epi32(out_24_6, out_24_7);
    }
    {
      const __m128i s2_09_0 = _mm_unpacklo_epi16(step1[9], step1[14]);
      const __m128i s2_09_1 = _mm_unpackhi_epi16(step1[9], step1[14]);
      const __m128i s2_10_0 = _mm_unpacklo_epi16(step1[10], step1[13]);
      const __m128i s2_10_1 = _mm_unpackhi_epi16(step1[10], step1[13]);
      const __m128i s2_09_2 = _mm_madd_epi16(s2_09_0, k__cospi_m08_p24);
      const __m128i s2_09_3 = _mm_madd_epi16(s2_09_1, k__cospi_m08_p24);
      const __m128i s2_10_2 = _mm_madd_epi16(s2_10_0, k__cospi_m24_m08);
      const __m128i s2_10_3 = _mm_madd_epi16(s2_10_1, k__cospi_m24_m08);
      const __m128i s2_13_2 = _mm_madd_epi16(s2_10_0, k__cospi_m08_p24);
      const __m128i s2_13_3 = _mm_madd_epi16(s2_10_1, k__cospi_m08_p24);
      const __m128i s2_14_2 = _mm_madd_epi16(s2_09_0, k__cospi_p24_p08);
      const __m128i s2_14_3 = _mm_madd_epi16(s2_09_1, k__cospi_p24_p08);
      // dct_const_round_shift
      const __m128i s2_09_4 = _mm_add_epi32(s2_09_2, k__DCT_CONST_ROUNDING);
      const __m128i s2_09_5 = _mm_add_epi32(s2_09_3, k__DCT_CONST_ROUNDING);
      const __m128i s2_10_4 = _mm_add_epi32(s2_10_2, k__DCT_CONST_ROUNDING);
      const __m128i s2_10_5 = _mm_add_epi32(s2_10_3, k__DCT_CONST_ROUNDING);
      const __m128i s2_13_4 = _mm_add_epi32(s2_13_2, k__DCT_CONST_ROUNDING);
      const __m128i s2_13_5 = _mm_add_epi32(s2_13_3, k__DCT_CONST_ROUNDING);
      const __m128i s2_14_4 = _mm_add_epi32(s2_14_2, k__DCT_CONST_ROUNDING);
      const __m128i s2_14_5 = _mm_add_epi32(s2_14_3, k__DCT_CONST_ROUNDING);
      const __m128i s2_09_6 = _mm_srai_epi32(s2_09_4, DCT_CONST_BITS);
      const __m128i s2_09_7 = _mm_srai_epi32(s2_09_5, DCT_CONST_BITS);
      const __m128i s2_10_6 = _mm_srai_epi32(s2_10_4, DCT_CONST_BITS);
      const __m128i s2_10_7 = _mm_srai_epi32(s2_10_5, DCT_CONST_BITS);
      const __m128i s2_13_6 = _mm_srai_epi32(s2_13_4, DCT_CONST_BITS);
      const __m128i s2_13_7 = _mm_srai_epi32(s2_13_5, DCT_CONST_BITS);
      const __m128i s2_14_6 = _mm_srai_epi32(s2_14_4, DCT_CONST_BITS);
      const __m128i s2_14_7 = _mm_srai_epi32(s2_14_5, DCT_CONST_BITS);
      // Combine
      step2[9] = _mm_packs_epi32(s2_09_6, s2_09_7);
      step2[10] = _mm_packs_epi32(s2_10_6, s2_10_7);
      step2[13] = _mm_packs_epi32(s2_13_6, s2_13_7);
      step2[14] = _mm_packs_epi32(s2_14_6, s2_14_7);
    }
    {
      step2[16] = _mm_add_epi16(step1[19], step3[16]);
      step2[17] = _mm_add_epi16(step1[18], step3[17]);
      step2[18] = _mm_sub_epi16(step3[17], step1[18]);
      step2[19] = _mm_sub_epi16(step3[16], step1[19]);
      step2[20] = _mm_sub_epi16(step3[23], step1[20]);
      step2[21] = _mm_sub_epi16(step3[22], step1[21]);
      step2[22] = _mm_add_epi16(step1[21], step3[22]);
      step2[23] = _mm_add_epi16(step1[20], step3[23]);
      step2[24] = _mm_add_epi16(step1[27], step3[24]);
      step2[25] = _mm_add_epi16(step1[26], step3[25]);
      step2[26] = _mm_sub_epi16(step3[25], step1[26]);
      step2[27] = _mm_sub_epi16(step3[24], step1[27]);
      step2[28] = _mm_sub_epi16(step3[31], step1[28]);
      step2[29] = _mm_sub_epi16(step3[30], step1[29]);
      step2[30] = _mm_add_epi16(step1[29], step3[30]);
      step2[31] = _mm_add_epi16(step1[28], step3[31]);
    }
    // Stage 6
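    // Stage 6 emits out[4], out[12], out[20] and out[28] from the step2[4..7]
    // butterfly, pairing the cospi_28/cospi_4 and cospi_12/cospi_20 constants.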
    {
      const __m128i out_04_0 = _mm_unpacklo_epi16(step2[4], step2[7]);
      const __m128i out_04_1 = _mm_unpackhi_epi16(step2[4], step2[7]);
      const __m128i out_20_0 = _mm_unpacklo_epi16(step2[5], step2[6]);
      const __m128i out_20_1 = _mm_unpackhi_epi16(step2[5], step2[6]);
      const __m128i out_12_0 = _mm_unpacklo_epi16(step2[5], step2[6]);
      const __m128i out_12_1 = _mm_unpackhi_epi16(step2[5], step2[6]);
      const __m128i out_28_0 = _mm_unpacklo_epi16(step2[4], step2[7]);
      const __m128i out_28_1 = _mm_unpackhi_epi16(step2[4], step2[7]);
      const __m128i out_04_2 = _mm_madd_epi16(out_04_0, k__cospi_p28_p04);
      const __m128i out_04_3 = _mm_madd_epi16(out_04_1, k__cospi_p28_p04);
      const __m128i out_20_2 = _mm_madd_epi16(out_20_0, k__cospi_p12_p20);
      const __m128i out_20_3 = _mm_madd_epi16(out_20_1, k__cospi_p12_p20);
      const __m128i out_12_2 = _mm_madd_epi16(out_12_0, k__cospi_m20_p12);
      const __m128i out_12_3 = _mm_madd_epi16(out_12_1, k__cospi_m20_p12);
      const __m128i out_28_2 = _mm_madd_epi16(out_28_0, k__cospi_m04_p28);
      const __m128i out_28_3 = _mm_madd_epi16(out_28_1, k__cospi_m04_p28);
      // dct_const_round_shift
      const __m128i out_04_4 = _mm_add_epi32(out_04_2, k__DCT_CONST_ROUNDING);
      const __m128i out_04_5 = _mm_add_epi32(out_04_3, k__DCT_CONST_ROUNDING);
      const __m128i out_20_4 = _mm_add_epi32(out_20_2, k__DCT_CONST_ROUNDING);
      const __m128i out_20_5 = _mm_add_epi32(out_20_3, k__DCT_CONST_ROUNDING);
      const __m128i out_12_4 = _mm_add_epi32(out_12_2, k__DCT_CONST_ROUNDING);
      const __m128i out_12_5 = _mm_add_epi32(out_12_3, k__DCT_CONST_ROUNDING);
      const __m128i out_28_4 = _mm_add_epi32(out_28_2, k__DCT_CONST_ROUNDING);
      const __m128i out_28_5 = _mm_add_epi32(out_28_3, k__DCT_CONST_ROUNDING);
      const __m128i out_04_6 = _mm_srai_epi32(out_04_4, DCT_CONST_BITS);
      const __m128i out_04_7 = _mm_srai_epi32(out_04_5, DCT_CONST_BITS);
      const __m128i out_20_6 = _mm_srai_epi32(out_20_4, DCT_CONST_BITS);
      const __m128i out_20_7 = _mm_srai_epi32(out_20_5, DCT_CONST_BITS);
      const __m128i out_12_6 = _mm_srai_epi32(out_12_4, DCT_CONST_BITS);
      const __m128i out_12_7 = _mm_srai_epi32(out_12_5, DCT_CONST_BITS);
      const __m128i out_28_6 = _mm_srai_epi32(out_28_4, DCT_CONST_BITS);
      const __m128i out_28_7 = _mm_srai_epi32(out_28_5, DCT_CONST_BITS);
      // Combine
      out[4] = _mm_packs_epi32(out_04_6, out_04_7);
      out[20] = _mm_packs_epi32(out_20_6, out_20_7);
      out[12] = _mm_packs_epi32(out_12_6, out_12_7);
      out[28] = _mm_packs_epi32(out_28_6, out_28_7);
    }
    {
      step3[8] = _mm_add_epi16(step2[9], step1[8]);
      step3[9] = _mm_sub_epi16(step1[8], step2[9]);
      step3[10] = _mm_sub_epi16(step1[11], step2[10]);
      step3[11] = _mm_add_epi16(step2[10], step1[11]);
      step3[12] = _mm_add_epi16(step2[13], step1[12]);
      step3[13] = _mm_sub_epi16(step1[12], step2[13]);
      step3[14] = _mm_sub_epi16(step1[15], step2[14]);
      step3[15] = _mm_add_epi16(step2[14], step1[15]);
    }
    {
      const __m128i s3_17_0 = _mm_unpacklo_epi16(step2[17], step2[30]);
      const __m128i s3_17_1 = _mm_unpackhi_epi16(step2[17], step2[30]);
      const __m128i s3_18_0 = _mm_unpacklo_epi16(step2[18], step2[29]);
      const __m128i s3_18_1 = _mm_unpackhi_epi16(step2[18], step2[29]);
      const __m128i s3_21_0 = _mm_unpacklo_epi16(step2[21], step2[26]);
      const __m128i s3_21_1 = _mm_unpackhi_epi16(step2[21], step2[26]);
      const __m128i s3_22_0 = _mm_unpacklo_epi16(step2[22], step2[25]);
      const __m128i s3_22_1 = _mm_unpackhi_epi16(step2[22], step2[25]);
      const __m128i s3_17_2 = _mm_madd_epi16(s3_17_0, k__cospi_m04_p28);
      const __m128i s3_17_3 = _mm_madd_epi16(s3_17_1, k__cospi_m04_p28);
      const __m128i s3_18_2 = _mm_madd_epi16(s3_18_0, k__cospi_m28_m04);
      const __m128i s3_18_3 = _mm_madd_epi16(s3_18_1, k__cospi_m28_m04);
      const __m128i s3_21_2 = _mm_madd_epi16(s3_21_0, k__cospi_m20_p12);
      const __m128i s3_21_3 = _mm_madd_epi16(s3_21_1, k__cospi_m20_p12);
      const __m128i s3_22_2 = _mm_madd_epi16(s3_22_0, k__cospi_m12_m20);
      const __m128i s3_22_3 = _mm_madd_epi16(s3_22_1, k__cospi_m12_m20);
      const __m128i s3_25_2 = _mm_madd_epi16(s3_22_0, k__cospi_m20_p12);
      const __m128i s3_25_3 = _mm_madd_epi16(s3_22_1, k__cospi_m20_p12);
      const __m128i s3_26_2 = _mm_madd_epi16(s3_21_0, k__cospi_p12_p20);
      const __m128i s3_26_3 = _mm_madd_epi16(s3_21_1, k__cospi_p12_p20);
      const __m128i s3_29_2 = _mm_madd_epi16(s3_18_0, k__cospi_m04_p28);
      const __m128i s3_29_3 = _mm_madd_epi16(s3_18_1, k__cospi_m04_p28);
      const __m128i s3_30_2 = _mm_madd_epi16(s3_17_0, k__cospi_p28_p04);
      const __m128i s3_30_3 = _mm_madd_epi16(s3_17_1, k__cospi_p28_p04);
      // dct_const_round_shift
      const __m128i s3_17_4 = _mm_add_epi32(s3_17_2, k__DCT_CONST_ROUNDING);
      const __m128i s3_17_5 = _mm_add_epi32(s3_17_3, k__DCT_CONST_ROUNDING);
      const __m128i s3_18_4 = _mm_add_epi32(s3_18_2, k__DCT_CONST_ROUNDING);
      const __m128i s3_18_5 = _mm_add_epi32(s3_18_3, k__DCT_CONST_ROUNDING);
      const __m128i s3_21_4 = _mm_add_epi32(s3_21_2, k__DCT_CONST_ROUNDING);
      const __m128i s3_21_5 = _mm_add_epi32(s3_21_3, k__DCT_CONST_ROUNDING);
      const __m128i s3_22_4 = _mm_add_epi32(s3_22_2, k__DCT_CONST_ROUNDING);
      const __m128i s3_22_5 = _mm_add_epi32(s3_22_3, k__DCT_CONST_ROUNDING);
      const __m128i s3_17_6 = _mm_srai_epi32(s3_17_4, DCT_CONST_BITS);
      const __m128i s3_17_7 = _mm_srai_epi32(s3_17_5, DCT_CONST_BITS);
      const __m128i s3_18_6 = _mm_srai_epi32(s3_18_4, DCT_CONST_BITS);
      const __m128i s3_18_7 = _mm_srai_epi32(s3_18_5, DCT_CONST_BITS);
      const __m128i s3_21_6 = _mm_srai_epi32(s3_21_4, DCT_CONST_BITS);
      const __m128i s3_21_7 = _mm_srai_epi32(s3_21_5, DCT_CONST_BITS);
      const __m128i s3_22_6 = _mm_srai_epi32(s3_22_4, DCT_CONST_BITS);
      const __m128i s3_22_7 = _mm_srai_epi32(s3_22_5, DCT_CONST_BITS);
      const __m128i s3_25_4 = _mm_add_epi32(s3_25_2, k__DCT_CONST_ROUNDING);
      const __m128i s3_25_5 = _mm_add_epi32(s3_25_3, k__DCT_CONST_ROUNDING);
      const __m128i s3_26_4 = _mm_add_epi32(s3_26_2, k__DCT_CONST_ROUNDING);
      const __m128i s3_26_5 = _mm_add_epi32(s3_26_3, k__DCT_CONST_ROUNDING);
      const __m128i s3_29_4 = _mm_add_epi32(s3_29_2, k__DCT_CONST_ROUNDING);
      const __m128i s3_29_5 = _mm_add_epi32(s3_29_3, k__DCT_CONST_ROUNDING);
      const __m128i s3_30_4 = _mm_add_epi32(s3_30_2, k__DCT_CONST_ROUNDING);
      const __m128i s3_30_5 = _mm_add_epi32(s3_30_3, k__DCT_CONST_ROUNDING);
      const __m128i s3_25_6 = _mm_srai_epi32(s3_25_4, DCT_CONST_BITS);
      const __m128i s3_25_7 = _mm_srai_epi32(s3_25_5, DCT_CONST_BITS);
      const __m128i s3_26_6 = _mm_srai_epi32(s3_26_4, DCT_CONST_BITS);
      const __m128i s3_26_7 = _mm_srai_epi32(s3_26_5, DCT_CONST_BITS);
      const __m128i s3_29_6 = _mm_srai_epi32(s3_29_4, DCT_CONST_BITS);
      const __m128i s3_29_7 = _mm_srai_epi32(s3_29_5, DCT_CONST_BITS);
      const __m128i s3_30_6 = _mm_srai_epi32(s3_30_4, DCT_CONST_BITS);
      const __m128i s3_30_7 = _mm_srai_epi32(s3_30_5, DCT_CONST_BITS);
      // Combine
      step3[17] = _mm_packs_epi32(s3_17_6, s3_17_7);
      step3[18] = _mm_packs_epi32(s3_18_6, s3_18_7);
      step3[21] = _mm_packs_epi32(s3_21_6, s3_21_7);
      step3[22] = _mm_packs_epi32(s3_22_6, s3_22_7);
      // Combine
      step3[25] = _mm_packs_epi32(s3_25_6, s3_25_7);
      step3[26] = _mm_packs_epi32(s3_26_6, s3_26_7);
      step3[29] = _mm_packs_epi32(s3_29_6, s3_29_7);
      step3[30] = _mm_packs_epi32(s3_30_6, s3_30_7);
    }
    // Stage 7
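    // Stage 7 emits the remaining even-index coefficients (2, 6, 10, 14, 18,
    // 22, 26 and 30) from step3[8..15].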
    {
      const __m128i out_02_0 = _mm_unpacklo_epi16(step3[8], step3[15]);
      const __m128i out_02_1 = _mm_unpackhi_epi16(step3[8], step3[15]);
      const __m128i out_18_0 = _mm_unpacklo_epi16(step3[9], step3[14]);
      const __m128i out_18_1 = _mm_unpackhi_epi16(step3[9], step3[14]);
      const __m128i out_10_0 = _mm_unpacklo_epi16(step3[10], step3[13]);
      const __m128i out_10_1 = _mm_unpackhi_epi16(step3[10], step3[13]);
      const __m128i out_26_0 = _mm_unpacklo_epi16(step3[11], step3[12]);
      const __m128i out_26_1 = _mm_unpackhi_epi16(step3[11], step3[12]);
      const __m128i out_02_2 = _mm_madd_epi16(out_02_0, k__cospi_p30_p02);
      const __m128i out_02_3 = _mm_madd_epi16(out_02_1, k__cospi_p30_p02);
      const __m128i out_18_2 = _mm_madd_epi16(out_18_0, k__cospi_p14_p18);
      const __m128i out_18_3 = _mm_madd_epi16(out_18_1, k__cospi_p14_p18);
      const __m128i out_10_2 = _mm_madd_epi16(out_10_0, k__cospi_p22_p10);
      const __m128i out_10_3 = _mm_madd_epi16(out_10_1, k__cospi_p22_p10);
      const __m128i out_26_2 = _mm_madd_epi16(out_26_0, k__cospi_p06_p26);
      const __m128i out_26_3 = _mm_madd_epi16(out_26_1, k__cospi_p06_p26);
      const __m128i out_06_2 = _mm_madd_epi16(out_26_0, k__cospi_m26_p06);
      const __m128i out_06_3 = _mm_madd_epi16(out_26_1, k__cospi_m26_p06);
      const __m128i out_22_2 = _mm_madd_epi16(out_10_0, k__cospi_m10_p22);
      const __m128i out_22_3 = _mm_madd_epi16(out_10_1, k__cospi_m10_p22);
      const __m128i out_14_2 = _mm_madd_epi16(out_18_0, k__cospi_m18_p14);
      const __m128i out_14_3 = _mm_madd_epi16(out_18_1, k__cospi_m18_p14);
      const __m128i out_30_2 = _mm_madd_epi16(out_02_0, k__cospi_m02_p30);
      const __m128i out_30_3 = _mm_madd_epi16(out_02_1, k__cospi_m02_p30);
      // dct_const_round_shift
      const __m128i out_02_4 = _mm_add_epi32(out_02_2, k__DCT_CONST_ROUNDING);
      const __m128i out_02_5 = _mm_add_epi32(out_02_3, k__DCT_CONST_ROUNDING);
      const __m128i out_18_4 = _mm_add_epi32(out_18_2, k__DCT_CONST_ROUNDING);
      const __m128i out_18_5 = _mm_add_epi32(out_18_3, k__DCT_CONST_ROUNDING);
      const __m128i out_10_4 = _mm_add_epi32(out_10_2, k__DCT_CONST_ROUNDING);
      const __m128i out_10_5 = _mm_add_epi32(out_10_3, k__DCT_CONST_ROUNDING);
      const __m128i out_26_4 = _mm_add_epi32(out_26_2, k__DCT_CONST_ROUNDING);
      const __m128i out_26_5 = _mm_add_epi32(out_26_3, k__DCT_CONST_ROUNDING);
      const __m128i out_06_4 = _mm_add_epi32(out_06_2, k__DCT_CONST_ROUNDING);
      const __m128i out_06_5 = _mm_add_epi32(out_06_3, k__DCT_CONST_ROUNDING);
      const __m128i out_22_4 = _mm_add_epi32(out_22_2, k__DCT_CONST_ROUNDING);
      const __m128i out_22_5 = _mm_add_epi32(out_22_3, k__DCT_CONST_ROUNDING);
      const __m128i out_14_4 = _mm_add_epi32(out_14_2, k__DCT_CONST_ROUNDING);
      const __m128i out_14_5 = _mm_add_epi32(out_14_3, k__DCT_CONST_ROUNDING);
      const __m128i out_30_4 = _mm_add_epi32(out_30_2, k__DCT_CONST_ROUNDING);
      const __m128i out_30_5 = _mm_add_epi32(out_30_3, k__DCT_CONST_ROUNDING);
      const __m128i out_02_6 = _mm_srai_epi32(out_02_4, DCT_CONST_BITS);
      const __m128i out_02_7 = _mm_srai_epi32(out_02_5, DCT_CONST_BITS);
      const __m128i out_18_6 = _mm_srai_epi32(out_18_4, DCT_CONST_BITS);
      const __m128i out_18_7 = _mm_srai_epi32(out_18_5, DCT_CONST_BITS);
      const __m128i out_10_6 = _mm_srai_epi32(out_10_4, DCT_CONST_BITS);
      const __m128i out_10_7 = _mm_srai_epi32(out_10_5, DCT_CONST_BITS);
      const __m128i out_26_6 = _mm_srai_epi32(out_26_4, DCT_CONST_BITS);
      const __m128i out_26_7 = _mm_srai_epi32(out_26_5, DCT_CONST_BITS);
      const __m128i out_06_6 = _mm_srai_epi32(out_06_4, DCT_CONST_BITS);
      const __m128i out_06_7 = _mm_srai_epi32(out_06_5, DCT_CONST_BITS);
      const __m128i out_22_6 = _mm_srai_epi32(out_22_4, DCT_CONST_BITS);
      const __m128i out_22_7 = _mm_srai_epi32(out_22_5, DCT_CONST_BITS);
      const __m128i out_14_6 = _mm_srai_epi32(out_14_4, DCT_CONST_BITS);
      const __m128i out_14_7 = _mm_srai_epi32(out_14_5, DCT_CONST_BITS);
      const __m128i out_30_6 = _mm_srai_epi32(out_30_4, DCT_CONST_BITS);
      const __m128i out_30_7 = _mm_srai_epi32(out_30_5, DCT_CONST_BITS);
      // Combine
      out[2] = _mm_packs_epi32(out_02_6, out_02_7);
      out[18] = _mm_packs_epi32(out_18_6, out_18_7);
      out[10] = _mm_packs_epi32(out_10_6, out_10_7);
      out[26] = _mm_packs_epi32(out_26_6, out_26_7);
      out[6] = _mm_packs_epi32(out_06_6, out_06_7);
      out[22] = _mm_packs_epi32(out_22_6, out_22_7);
      out[14] = _mm_packs_epi32(out_14_6, out_14_7);
      out[30] = _mm_packs_epi32(out_30_6, out_30_7);
    }
    {
      step1[16] = _mm_add_epi16(step3[17], step2[16]);
      step1[17] = _mm_sub_epi16(step2[16], step3[17]);
      step1[18] = _mm_sub_epi16(step2[19], step3[18]);
      step1[19] = _mm_add_epi16(step3[18], step2[19]);
      step1[20] = _mm_add_epi16(step3[21], step2[20]);
      step1[21] = _mm_sub_epi16(step2[20], step3[21]);
      step1[22] = _mm_sub_epi16(step2[23], step3[22]);
      step1[23] = _mm_add_epi16(step3[22], step2[23]);
      step1[24] = _mm_add_epi16(step3[25], step2[24]);
      step1[25] = _mm_sub_epi16(step2[24], step3[25]);
      step1[26] = _mm_sub_epi16(step2[27], step3[26]);
      step1[27] = _mm_add_epi16(step3[26], step2[27]);
      step1[28] = _mm_add_epi16(step3[29], step2[28]);
      step1[29] = _mm_sub_epi16(step2[28], step3[29]);
      step1[30] = _mm_sub_epi16(step2[31], step3[30]);
      step1[31] = _mm_add_epi16(step3[30], step2[31]);
    }
    // Final stage --- output indices are bit-reversed.
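    // The two blocks below generate all odd-index coefficients (1, 3, ..., 31)
    // from step1[16..31].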
    {
      const __m128i out_01_0 = _mm_unpacklo_epi16(step1[16], step1[31]);
      const __m128i out_01_1 = _mm_unpackhi_epi16(step1[16], step1[31]);
      const __m128i out_17_0 = _mm_unpacklo_epi16(step1[17], step1[30]);
      const __m128i out_17_1 = _mm_unpackhi_epi16(step1[17], step1[30]);
      const __m128i out_09_0 = _mm_unpacklo_epi16(step1[18], step1[29]);
      const __m128i out_09_1 = _mm_unpackhi_epi16(step1[18], step1[29]);
      const __m128i out_25_0 = _mm_unpacklo_epi16(step1[19], step1[28]);
      const __m128i out_25_1 = _mm_unpackhi_epi16(step1[19], step1[28]);
      const __m128i out_01_2 = _mm_madd_epi16(out_01_0, k__cospi_p31_p01);
      const __m128i out_01_3 = _mm_madd_epi16(out_01_1, k__cospi_p31_p01);
      const __m128i out_17_2 = _mm_madd_epi16(out_17_0, k__cospi_p15_p17);
      const __m128i out_17_3 = _mm_madd_epi16(out_17_1, k__cospi_p15_p17);
      const __m128i out_09_2 = _mm_madd_epi16(out_09_0, k__cospi_p23_p09);
      const __m128i out_09_3 = _mm_madd_epi16(out_09_1, k__cospi_p23_p09);
      const __m128i out_25_2 = _mm_madd_epi16(out_25_0, k__cospi_p07_p25);
      const __m128i out_25_3 = _mm_madd_epi16(out_25_1, k__cospi_p07_p25);
      const __m128i out_07_2 = _mm_madd_epi16(out_25_0, k__cospi_m25_p07);
      const __m128i out_07_3 = _mm_madd_epi16(out_25_1, k__cospi_m25_p07);
      const __m128i out_23_2 = _mm_madd_epi16(out_09_0, k__cospi_m09_p23);
      const __m128i out_23_3 = _mm_madd_epi16(out_09_1, k__cospi_m09_p23);
      const __m128i out_15_2 = _mm_madd_epi16(out_17_0, k__cospi_m17_p15);
      const __m128i out_15_3 = _mm_madd_epi16(out_17_1, k__cospi_m17_p15);
      const __m128i out_31_2 = _mm_madd_epi16(out_01_0, k__cospi_m01_p31);
      const __m128i out_31_3 = _mm_madd_epi16(out_01_1, k__cospi_m01_p31);
      // dct_const_round_shift
      const __m128i out_01_4 = _mm_add_epi32(out_01_2, k__DCT_CONST_ROUNDING);
      const __m128i out_01_5 = _mm_add_epi32(out_01_3, k__DCT_CONST_ROUNDING);
      const __m128i out_17_4 = _mm_add_epi32(out_17_2, k__DCT_CONST_ROUNDING);
      const __m128i out_17_5 = _mm_add_epi32(out_17_3, k__DCT_CONST_ROUNDING);
      const __m128i out_09_4 = _mm_add_epi32(out_09_2, k__DCT_CONST_ROUNDING);
      const __m128i out_09_5 = _mm_add_epi32(out_09_3, k__DCT_CONST_ROUNDING);
      const __m128i out_25_4 = _mm_add_epi32(out_25_2, k__DCT_CONST_ROUNDING);
      const __m128i out_25_5 = _mm_add_epi32(out_25_3, k__DCT_CONST_ROUNDING);
      const __m128i out_07_4 = _mm_add_epi32(out_07_2, k__DCT_CONST_ROUNDING);
      const __m128i out_07_5 = _mm_add_epi32(out_07_3, k__DCT_CONST_ROUNDING);
      const __m128i out_23_4 = _mm_add_epi32(out_23_2, k__DCT_CONST_ROUNDING);
      const __m128i out_23_5 = _mm_add_epi32(out_23_3, k__DCT_CONST_ROUNDING);
      const __m128i out_15_4 = _mm_add_epi32(out_15_2, k__DCT_CONST_ROUNDING);
      const __m128i out_15_5 = _mm_add_epi32(out_15_3, k__DCT_CONST_ROUNDING);
      const __m128i out_31_4 = _mm_add_epi32(out_31_2, k__DCT_CONST_ROUNDING);
      const __m128i out_31_5 = _mm_add_epi32(out_31_3, k__DCT_CONST_ROUNDING);
      const __m128i out_01_6 = _mm_srai_epi32(out_01_4, DCT_CONST_BITS);
      const __m128i out_01_7 = _mm_srai_epi32(out_01_5, DCT_CONST_BITS);
      const __m128i out_17_6 = _mm_srai_epi32(out_17_4, DCT_CONST_BITS);
      const __m128i out_17_7 = _mm_srai_epi32(out_17_5, DCT_CONST_BITS);
      const __m128i out_09_6 = _mm_srai_epi32(out_09_4, DCT_CONST_BITS);
      const __m128i out_09_7 = _mm_srai_epi32(out_09_5, DCT_CONST_BITS);
      const __m128i out_25_6 = _mm_srai_epi32(out_25_4, DCT_CONST_BITS);
      const __m128i out_25_7 = _mm_srai_epi32(out_25_5, DCT_CONST_BITS);
      const __m128i out_07_6 = _mm_srai_epi32(out_07_4, DCT_CONST_BITS);
      const __m128i out_07_7 = _mm_srai_epi32(out_07_5, DCT_CONST_BITS);
      const __m128i out_23_6 = _mm_srai_epi32(out_23_4, DCT_CONST_BITS);
      const __m128i out_23_7 = _mm_srai_epi32(out_23_5, DCT_CONST_BITS);
      const __m128i out_15_6 = _mm_srai_epi32(out_15_4, DCT_CONST_BITS);
      const __m128i out_15_7 = _mm_srai_epi32(out_15_5, DCT_CONST_BITS);
      const __m128i out_31_6 = _mm_srai_epi32(out_31_4, DCT_CONST_BITS);
      const __m128i out_31_7 = _mm_srai_epi32(out_31_5, DCT_CONST_BITS);
      // Combine
      out[1] = _mm_packs_epi32(out_01_6, out_01_7);
      out[17] = _mm_packs_epi32(out_17_6, out_17_7);
      out[9] = _mm_packs_epi32(out_09_6, out_09_7);
      out[25] = _mm_packs_epi32(out_25_6, out_25_7);
      out[7] = _mm_packs_epi32(out_07_6, out_07_7);
      out[23] = _mm_packs_epi32(out_23_6, out_23_7);
      out[15] = _mm_packs_epi32(out_15_6, out_15_7);
      out[31] = _mm_packs_epi32(out_31_6, out_31_7);
    }
    {
      const __m128i out_05_0 = _mm_unpacklo_epi16(step1[20], step1[27]);
      const __m128i out_05_1 = _mm_unpackhi_epi16(step1[20], step1[27]);
      const __m128i out_21_0 = _mm_unpacklo_epi16(step1[21], step1[26]);
      const __m128i out_21_1 = _mm_unpackhi_epi16(step1[21], step1[26]);
      const __m128i out_13_0 = _mm_unpacklo_epi16(step1[22], step1[25]);
      const __m128i out_13_1 = _mm_unpackhi_epi16(step1[22], step1[25]);
      const __m128i out_29_0 = _mm_unpacklo_epi16(step1[23], step1[24]);
      const __m128i out_29_1 = _mm_unpackhi_epi16(step1[23], step1[24]);
      const __m128i out_05_2 = _mm_madd_epi16(out_05_0, k__cospi_p27_p05);
      const __m128i out_05_3 = _mm_madd_epi16(out_05_1, k__cospi_p27_p05);
      const __m128i out_21_2 = _mm_madd_epi16(out_21_0, k__cospi_p11_p21);
      const __m128i out_21_3 = _mm_madd_epi16(out_21_1, k__cospi_p11_p21);
      const __m128i out_13_2 = _mm_madd_epi16(out_13_0, k__cospi_p19_p13);
      const __m128i out_13_3 = _mm_madd_epi16(out_13_1, k__cospi_p19_p13);
      const __m128i out_29_2 = _mm_madd_epi16(out_29_0, k__cospi_p03_p29);
      const __m128i out_29_3 = _mm_madd_epi16(out_29_1, k__cospi_p03_p29);
      const __m128i out_03_2 = _mm_madd_epi16(out_29_0, k__cospi_m29_p03);
      const __m128i out_03_3 = _mm_madd_epi16(out_29_1, k__cospi_m29_p03);
      const __m128i out_19_2 = _mm_madd_epi16(out_13_0, k__cospi_m13_p19);
      const __m128i out_19_3 = _mm_madd_epi16(out_13_1, k__cospi_m13_p19);
      const __m128i out_11_2 = _mm_madd_epi16(out_21_0, k__cospi_m21_p11);
      const __m128i out_11_3 = _mm_madd_epi16(out_21_1, k__cospi_m21_p11);
      const __m128i out_27_2 = _mm_madd_epi16(out_05_0, k__cospi_m05_p27);
      const __m128i out_27_3 = _mm_madd_epi16(out_05_1, k__cospi_m05_p27);
      // dct_const_round_shift
      const __m128i out_05_4 = _mm_add_epi32(out_05_2, k__DCT_CONST_ROUNDING);
      const __m128i out_05_5 = _mm_add_epi32(out_05_3, k__DCT_CONST_ROUNDING);
      const __m128i out_21_4 = _mm_add_epi32(out_21_2, k__DCT_CONST_ROUNDING);
      const __m128i out_21_5 = _mm_add_epi32(out_21_3, k__DCT_CONST_ROUNDING);
      const __m128i out_13_4 = _mm_add_epi32(out_13_2, k__DCT_CONST_ROUNDING);
      const __m128i out_13_5 = _mm_add_epi32(out_13_3, k__DCT_CONST_ROUNDING);
      const __m128i out_29_4 = _mm_add_epi32(out_29_2, k__DCT_CONST_ROUNDING);
      const __m128i out_29_5 = _mm_add_epi32(out_29_3, k__DCT_CONST_ROUNDING);
      const __m128i out_03_4 = _mm_add_epi32(out_03_2, k__DCT_CONST_ROUNDING);
      const __m128i out_03_5 = _mm_add_epi32(out_03_3, k__DCT_CONST_ROUNDING);
      const __m128i out_19_4 = _mm_add_epi32(out_19_2, k__DCT_CONST_ROUNDING);
      const __m128i out_19_5 = _mm_add_epi32(out_19_3, k__DCT_CONST_ROUNDING);
      const __m128i out_11_4 = _mm_add_epi32(out_11_2, k__DCT_CONST_ROUNDING);
      const __m128i out_11_5 = _mm_add_epi32(out_11_3, k__DCT_CONST_ROUNDING);
      const __m128i out_27_4 = _mm_add_epi32(out_27_2, k__DCT_CONST_ROUNDING);
      const __m128i out_27_5 = _mm_add_epi32(out_27_3, k__DCT_CONST_ROUNDING);
      const __m128i out_05_6 = _mm_srai_epi32(out_05_4, DCT_CONST_BITS);
      const __m128i out_05_7 = _mm_srai_epi32(out_05_5, DCT_CONST_BITS);
      const __m128i out_21_6 = _mm_srai_epi32(out_21_4, DCT_CONST_BITS);
      const __m128i out_21_7 = _mm_srai_epi32(out_21_5, DCT_CONST_BITS);
      const __m128i out_13_6 = _mm_srai_epi32(out_13_4, DCT_CONST_BITS);
      const __m128i out_13_7 = _mm_srai_epi32(out_13_5, DCT_CONST_BITS);
      const __m128i out_29_6 = _mm_srai_epi32(out_29_4, DCT_CONST_BITS);
      const __m128i out_29_7 = _mm_srai_epi32(out_29_5, DCT_CONST_BITS);
      const __m128i out_03_6 = _mm_srai_epi32(out_03_4, DCT_CONST_BITS);
      const __m128i out_03_7 = _mm_srai_epi32(out_03_5, DCT_CONST_BITS);
      const __m128i out_19_6 = _mm_srai_epi32(out_19_4, DCT_CONST_BITS);
      const __m128i out_19_7 = _mm_srai_epi32(out_19_5, DCT_CONST_BITS);
      const __m128i out_11_6 = _mm_srai_epi32(out_11_4, DCT_CONST_BITS);
      const __m128i out_11_7 = _mm_srai_epi32(out_11_5, DCT_CONST_BITS);
      const __m128i out_27_6 = _mm_srai_epi32(out_27_4, DCT_CONST_BITS);
      const __m128i out_27_7 = _mm_srai_epi32(out_27_5, DCT_CONST_BITS);
      // Combine
      out[5] = _mm_packs_epi32(out_05_6, out_05_7);
      out[21] = _mm_packs_epi32(out_21_6, out_21_7);
      out[13] = _mm_packs_epi32(out_13_6, out_13_7);
      out[29] = _mm_packs_epi32(out_29_6, out_29_7);
      out[3] = _mm_packs_epi32(out_03_6, out_03_7);
      out[19] = _mm_packs_epi32(out_19_6, out_19_7);
      out[11] = _mm_packs_epi32(out_11_6, out_11_7);
      out[27] = _mm_packs_epi32(out_27_6, out_27_7);
    }

    // Output results
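    // The first 16 coefficient rows are written back to in0 and the remaining
    // 16 to in1, i.e. the transform overwrites its input buffers in place.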
    {
      int j;
      for (j = 0; j < 16; ++j) {
        _mm_storeu_si128((__m128i *)(in0 + j), out[j]);
        _mm_storeu_si128((__m128i *)(in1 + j), out[j + 16]);
      }
    }
  }  // NOLINT
@@ -1,24 +0,0 @@
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include "./aom_config.h"

#define FDCT32x32_2D_AVX2 aom_fdct32x32_rd_avx2
#define FDCT32x32_HIGH_PRECISION 0
#include "aom_dsp/x86/fwd_dct32x32_impl_avx2.h"
#undef FDCT32x32_2D_AVX2
#undef FDCT32x32_HIGH_PRECISION

#define FDCT32x32_2D_AVX2 aom_fdct32x32_avx2
#define FDCT32x32_HIGH_PRECISION 1
#include "aom_dsp/x86/fwd_dct32x32_impl_avx2.h"  // NOLINT
#undef FDCT32x32_2D_AVX2
#undef FDCT32x32_HIGH_PRECISION
@@ -1,35 +0,0 @@
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef AOM_DSP_X86_FWD_TXFM_AVX2_H
#define AOM_DSP_X86_FWD_TXFM_AVX2_H

#include "./aom_config.h"

static INLINE void storeu_output_avx2(const __m256i *coeff, tran_low_t *out) {
#if CONFIG_AOM_HIGHBITDEPTH
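  // tran_low_t is 32 bits wide in the high-bitdepth build, so the packed
  // 16-bit coefficients are sign-extended: cmpgt(zero, coeff) forms the sign
  // mask, unpacklo/unpackhi interleave it in, and the two permute2x128 ops
  // restore natural ordering across the 128-bit lanes.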
  const __m256i zero = _mm256_setzero_si256();
  const __m256i sign = _mm256_cmpgt_epi16(zero, *coeff);

  __m256i x0 = _mm256_unpacklo_epi16(*coeff, sign);
  __m256i x1 = _mm256_unpackhi_epi16(*coeff, sign);

  __m256i y0 = _mm256_permute2x128_si256(x0, x1, 0x20);
  __m256i y1 = _mm256_permute2x128_si256(x0, x1, 0x31);

  _mm256_storeu_si256((__m256i *)out, y0);
  _mm256_storeu_si256((__m256i *)(out + 8), y1);
#else
  _mm256_storeu_si256((__m256i *)out, *coeff);
#endif
}

#endif  // AOM_DSP_X86_FWD_TXFM_AVX2_H
@@ -1,363 +0,0 @@
/*
 * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include <emmintrin.h>
#include <stddef.h>

#include "./aom_config.h"
#include "./aom_dsp_rtcd.h"

typedef void (*SubtractWxHFuncType)(int16_t *diff, ptrdiff_t diff_stride,
                                    const uint16_t *src, ptrdiff_t src_stride,
                                    const uint16_t *pred,
                                    ptrdiff_t pred_stride);
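// Each kernel below computes diff = src - pred for one width x height tile of
// 16-bit (high-bitdepth) pixels; larger block sizes are stitched together
// from these tiles further down.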

static void subtract_4x4(int16_t *diff, ptrdiff_t diff_stride,
                         const uint16_t *src, ptrdiff_t src_stride,
                         const uint16_t *pred, ptrdiff_t pred_stride) {
  __m128i u0, u1, u2, u3;
  __m128i v0, v1, v2, v3;
  __m128i x0, x1, x2, x3;
  int64_t *store_diff = (int64_t *)(diff + 0 * diff_stride);

  u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride));
  u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride));
  u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride));
  u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride));

  v0 = _mm_loadu_si128((__m128i const *)(pred + 0 * pred_stride));
  v1 = _mm_loadu_si128((__m128i const *)(pred + 1 * pred_stride));
  v2 = _mm_loadu_si128((__m128i const *)(pred + 2 * pred_stride));
  v3 = _mm_loadu_si128((__m128i const *)(pred + 3 * pred_stride));

  x0 = _mm_sub_epi16(u0, v0);
  x1 = _mm_sub_epi16(u1, v1);
  x2 = _mm_sub_epi16(u2, v2);
  x3 = _mm_sub_epi16(u3, v3);

  _mm_storel_epi64((__m128i *)store_diff, x0);
  store_diff = (int64_t *)(diff + 1 * diff_stride);
  _mm_storel_epi64((__m128i *)store_diff, x1);
  store_diff = (int64_t *)(diff + 2 * diff_stride);
  _mm_storel_epi64((__m128i *)store_diff, x2);
  store_diff = (int64_t *)(diff + 3 * diff_stride);
  _mm_storel_epi64((__m128i *)store_diff, x3);
}

static void subtract_4x8(int16_t *diff, ptrdiff_t diff_stride,
                         const uint16_t *src, ptrdiff_t src_stride,
                         const uint16_t *pred, ptrdiff_t pred_stride) {
  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
  __m128i v0, v1, v2, v3, v4, v5, v6, v7;
  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
  int64_t *store_diff = (int64_t *)(diff + 0 * diff_stride);

  u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride));
  u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride));
  u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride));
  u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride));
  u4 = _mm_loadu_si128((__m128i const *)(src + 4 * src_stride));
  u5 = _mm_loadu_si128((__m128i const *)(src + 5 * src_stride));
  u6 = _mm_loadu_si128((__m128i const *)(src + 6 * src_stride));
  u7 = _mm_loadu_si128((__m128i const *)(src + 7 * src_stride));

  v0 = _mm_loadu_si128((__m128i const *)(pred + 0 * pred_stride));
  v1 = _mm_loadu_si128((__m128i const *)(pred + 1 * pred_stride));
  v2 = _mm_loadu_si128((__m128i const *)(pred + 2 * pred_stride));
  v3 = _mm_loadu_si128((__m128i const *)(pred + 3 * pred_stride));
  v4 = _mm_loadu_si128((__m128i const *)(pred + 4 * pred_stride));
  v5 = _mm_loadu_si128((__m128i const *)(pred + 5 * pred_stride));
  v6 = _mm_loadu_si128((__m128i const *)(pred + 6 * pred_stride));
  v7 = _mm_loadu_si128((__m128i const *)(pred + 7 * pred_stride));

  x0 = _mm_sub_epi16(u0, v0);
  x1 = _mm_sub_epi16(u1, v1);
  x2 = _mm_sub_epi16(u2, v2);
  x3 = _mm_sub_epi16(u3, v3);
  x4 = _mm_sub_epi16(u4, v4);
  x5 = _mm_sub_epi16(u5, v5);
  x6 = _mm_sub_epi16(u6, v6);
  x7 = _mm_sub_epi16(u7, v7);

  _mm_storel_epi64((__m128i *)store_diff, x0);
  store_diff = (int64_t *)(diff + 1 * diff_stride);
  _mm_storel_epi64((__m128i *)store_diff, x1);
  store_diff = (int64_t *)(diff + 2 * diff_stride);
  _mm_storel_epi64((__m128i *)store_diff, x2);
  store_diff = (int64_t *)(diff + 3 * diff_stride);
  _mm_storel_epi64((__m128i *)store_diff, x3);
  store_diff = (int64_t *)(diff + 4 * diff_stride);
  _mm_storel_epi64((__m128i *)store_diff, x4);
  store_diff = (int64_t *)(diff + 5 * diff_stride);
  _mm_storel_epi64((__m128i *)store_diff, x5);
  store_diff = (int64_t *)(diff + 6 * diff_stride);
  _mm_storel_epi64((__m128i *)store_diff, x6);
  store_diff = (int64_t *)(diff + 7 * diff_stride);
  _mm_storel_epi64((__m128i *)store_diff, x7);
}

static void subtract_8x4(int16_t *diff, ptrdiff_t diff_stride,
                         const uint16_t *src, ptrdiff_t src_stride,
                         const uint16_t *pred, ptrdiff_t pred_stride) {
  __m128i u0, u1, u2, u3;
  __m128i v0, v1, v2, v3;
  __m128i x0, x1, x2, x3;

  u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride));
  u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride));
  u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride));
  u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride));

  v0 = _mm_loadu_si128((__m128i const *)(pred + 0 * pred_stride));
  v1 = _mm_loadu_si128((__m128i const *)(pred + 1 * pred_stride));
  v2 = _mm_loadu_si128((__m128i const *)(pred + 2 * pred_stride));
  v3 = _mm_loadu_si128((__m128i const *)(pred + 3 * pred_stride));

  x0 = _mm_sub_epi16(u0, v0);
  x1 = _mm_sub_epi16(u1, v1);
  x2 = _mm_sub_epi16(u2, v2);
  x3 = _mm_sub_epi16(u3, v3);

  _mm_storeu_si128((__m128i *)(diff + 0 * diff_stride), x0);
  _mm_storeu_si128((__m128i *)(diff + 1 * diff_stride), x1);
  _mm_storeu_si128((__m128i *)(diff + 2 * diff_stride), x2);
  _mm_storeu_si128((__m128i *)(diff + 3 * diff_stride), x3);
}

static void subtract_8x8(int16_t *diff, ptrdiff_t diff_stride,
                         const uint16_t *src, ptrdiff_t src_stride,
                         const uint16_t *pred, ptrdiff_t pred_stride) {
  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
  __m128i v0, v1, v2, v3, v4, v5, v6, v7;
  __m128i x0, x1, x2, x3, x4, x5, x6, x7;

  u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride));
  u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride));
  u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride));
  u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride));
  u4 = _mm_loadu_si128((__m128i const *)(src + 4 * src_stride));
  u5 = _mm_loadu_si128((__m128i const *)(src + 5 * src_stride));
  u6 = _mm_loadu_si128((__m128i const *)(src + 6 * src_stride));
  u7 = _mm_loadu_si128((__m128i const *)(src + 7 * src_stride));

  v0 = _mm_loadu_si128((__m128i const *)(pred + 0 * pred_stride));
  v1 = _mm_loadu_si128((__m128i const *)(pred + 1 * pred_stride));
  v2 = _mm_loadu_si128((__m128i const *)(pred + 2 * pred_stride));
  v3 = _mm_loadu_si128((__m128i const *)(pred + 3 * pred_stride));
  v4 = _mm_loadu_si128((__m128i const *)(pred + 4 * pred_stride));
  v5 = _mm_loadu_si128((__m128i const *)(pred + 5 * pred_stride));
  v6 = _mm_loadu_si128((__m128i const *)(pred + 6 * pred_stride));
  v7 = _mm_loadu_si128((__m128i const *)(pred + 7 * pred_stride));

  x0 = _mm_sub_epi16(u0, v0);
  x1 = _mm_sub_epi16(u1, v1);
  x2 = _mm_sub_epi16(u2, v2);
  x3 = _mm_sub_epi16(u3, v3);
  x4 = _mm_sub_epi16(u4, v4);
  x5 = _mm_sub_epi16(u5, v5);
  x6 = _mm_sub_epi16(u6, v6);
  x7 = _mm_sub_epi16(u7, v7);

  _mm_storeu_si128((__m128i *)(diff + 0 * diff_stride), x0);
  _mm_storeu_si128((__m128i *)(diff + 1 * diff_stride), x1);
  _mm_storeu_si128((__m128i *)(diff + 2 * diff_stride), x2);
  _mm_storeu_si128((__m128i *)(diff + 3 * diff_stride), x3);
  _mm_storeu_si128((__m128i *)(diff + 4 * diff_stride), x4);
  _mm_storeu_si128((__m128i *)(diff + 5 * diff_stride), x5);
  _mm_storeu_si128((__m128i *)(diff + 6 * diff_stride), x6);
  _mm_storeu_si128((__m128i *)(diff + 7 * diff_stride), x7);
}
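// The remaining sizes are composed from the kernels above: advance by half a
// block (rows via stride << 3/4/5/6, columns via += 8/16/32/64) and call the
// half-size routine twice.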

static void subtract_8x16(int16_t *diff, ptrdiff_t diff_stride,
                          const uint16_t *src, ptrdiff_t src_stride,
                          const uint16_t *pred, ptrdiff_t pred_stride) {
  subtract_8x8(diff, diff_stride, src, src_stride, pred, pred_stride);
  diff += diff_stride << 3;
  src += src_stride << 3;
  pred += pred_stride << 3;
  subtract_8x8(diff, diff_stride, src, src_stride, pred, pred_stride);
}

static void subtract_16x8(int16_t *diff, ptrdiff_t diff_stride,
                          const uint16_t *src, ptrdiff_t src_stride,
                          const uint16_t *pred, ptrdiff_t pred_stride) {
  subtract_8x8(diff, diff_stride, src, src_stride, pred, pred_stride);
  diff += 8;
  src += 8;
  pred += 8;
  subtract_8x8(diff, diff_stride, src, src_stride, pred, pred_stride);
}

static void subtract_16x16(int16_t *diff, ptrdiff_t diff_stride,
                           const uint16_t *src, ptrdiff_t src_stride,
                           const uint16_t *pred, ptrdiff_t pred_stride) {
  subtract_16x8(diff, diff_stride, src, src_stride, pred, pred_stride);
  diff += diff_stride << 3;
  src += src_stride << 3;
  pred += pred_stride << 3;
  subtract_16x8(diff, diff_stride, src, src_stride, pred, pred_stride);
}

static void subtract_16x32(int16_t *diff, ptrdiff_t diff_stride,
                           const uint16_t *src, ptrdiff_t src_stride,
                           const uint16_t *pred, ptrdiff_t pred_stride) {
  subtract_16x16(diff, diff_stride, src, src_stride, pred, pred_stride);
  diff += diff_stride << 4;
  src += src_stride << 4;
  pred += pred_stride << 4;
  subtract_16x16(diff, diff_stride, src, src_stride, pred, pred_stride);
}

static void subtract_32x16(int16_t *diff, ptrdiff_t diff_stride,
                           const uint16_t *src, ptrdiff_t src_stride,
                           const uint16_t *pred, ptrdiff_t pred_stride) {
  subtract_16x16(diff, diff_stride, src, src_stride, pred, pred_stride);
  diff += 16;
  src += 16;
  pred += 16;
  subtract_16x16(diff, diff_stride, src, src_stride, pred, pred_stride);
}

static void subtract_32x32(int16_t *diff, ptrdiff_t diff_stride,
                           const uint16_t *src, ptrdiff_t src_stride,
                           const uint16_t *pred, ptrdiff_t pred_stride) {
  subtract_32x16(diff, diff_stride, src, src_stride, pred, pred_stride);
  diff += diff_stride << 4;
  src += src_stride << 4;
  pred += pred_stride << 4;
  subtract_32x16(diff, diff_stride, src, src_stride, pred, pred_stride);
}

static void subtract_32x64(int16_t *diff, ptrdiff_t diff_stride,
                           const uint16_t *src, ptrdiff_t src_stride,
                           const uint16_t *pred, ptrdiff_t pred_stride) {
  subtract_32x32(diff, diff_stride, src, src_stride, pred, pred_stride);
  diff += diff_stride << 5;
  src += src_stride << 5;
  pred += pred_stride << 5;
  subtract_32x32(diff, diff_stride, src, src_stride, pred, pred_stride);
}

static void subtract_64x32(int16_t *diff, ptrdiff_t diff_stride,
                           const uint16_t *src, ptrdiff_t src_stride,
                           const uint16_t *pred, ptrdiff_t pred_stride) {
  subtract_32x32(diff, diff_stride, src, src_stride, pred, pred_stride);
  diff += 32;
  src += 32;
  pred += 32;
  subtract_32x32(diff, diff_stride, src, src_stride, pred, pred_stride);
}

static void subtract_64x64(int16_t *diff, ptrdiff_t diff_stride,
                           const uint16_t *src, ptrdiff_t src_stride,
                           const uint16_t *pred, ptrdiff_t pred_stride) {
  subtract_64x32(diff, diff_stride, src, src_stride, pred, pred_stride);
  diff += diff_stride << 5;
  src += src_stride << 5;
  pred += pred_stride << 5;
  subtract_64x32(diff, diff_stride, src, src_stride, pred, pred_stride);
}

static void subtract_64x128(int16_t *diff, ptrdiff_t diff_stride,
                            const uint16_t *src, ptrdiff_t src_stride,
                            const uint16_t *pred, ptrdiff_t pred_stride) {
  subtract_64x64(diff, diff_stride, src, src_stride, pred, pred_stride);
  diff += diff_stride << 6;
  src += src_stride << 6;
  pred += pred_stride << 6;
  subtract_64x64(diff, diff_stride, src, src_stride, pred, pred_stride);
}

static void subtract_128x64(int16_t *diff, ptrdiff_t diff_stride,
                            const uint16_t *src, ptrdiff_t src_stride,
                            const uint16_t *pred, ptrdiff_t pred_stride) {
  subtract_64x64(diff, diff_stride, src, src_stride, pred, pred_stride);
  diff += 64;
  src += 64;
  pred += 64;
  subtract_64x64(diff, diff_stride, src, src_stride, pred, pred_stride);
}

static void subtract_128x128(int16_t *diff, ptrdiff_t diff_stride,
                             const uint16_t *src, ptrdiff_t src_stride,
                             const uint16_t *pred, ptrdiff_t pred_stride) {
  subtract_128x64(diff, diff_stride, src, src_stride, pred, pred_stride);
  diff += diff_stride << 6;
  src += src_stride << 6;
  pred += pred_stride << 6;
  subtract_128x64(diff, diff_stride, src, src_stride, pred, pred_stride);
}
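// Dispatch on (rows, cols). Note the kernels are named width x height, so
// rows == 8 with cols == 4 selects subtract_4x8.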
|
||||
|
||||
static SubtractWxHFuncType getSubtractFunc(int rows, int cols) {
  SubtractWxHFuncType ret_func_ptr = NULL;
  if (rows == 4) {
    if (cols == 4) {
      ret_func_ptr = subtract_4x4;
    } else if (cols == 8) {
      ret_func_ptr = subtract_8x4;
    }
  } else if (rows == 8) {
    if (cols == 4) {
      ret_func_ptr = subtract_4x8;
    } else if (cols == 8) {
      ret_func_ptr = subtract_8x8;
    } else if (cols == 16) {
      ret_func_ptr = subtract_16x8;
    }
  } else if (rows == 16) {
    if (cols == 8) {
      ret_func_ptr = subtract_8x16;
    } else if (cols == 16) {
      ret_func_ptr = subtract_16x16;
    } else if (cols == 32) {
      ret_func_ptr = subtract_32x16;
    }
  } else if (rows == 32) {
    if (cols == 16) {
      ret_func_ptr = subtract_16x32;
    } else if (cols == 32) {
      ret_func_ptr = subtract_32x32;
    } else if (cols == 64) {
      ret_func_ptr = subtract_64x32;
    }
  } else if (rows == 64) {
    if (cols == 32) {
      ret_func_ptr = subtract_32x64;
    } else if (cols == 64) {
      ret_func_ptr = subtract_64x64;
    } else if (cols == 128) {
      ret_func_ptr = subtract_128x64;
    }
  } else if (rows == 128) {
    if (cols == 64) {
      ret_func_ptr = subtract_64x128;
    } else if (cols == 128) {
      ret_func_ptr = subtract_128x128;
    }
  }
  if (!ret_func_ptr) {
    assert(0);
  }
  return ret_func_ptr;
}
void aom_highbd_subtract_block_sse2(int rows, int cols, int16_t *diff,
                                    ptrdiff_t diff_stride, const uint8_t *src8,
                                    ptrdiff_t src_stride, const uint8_t *pred8,
                                    ptrdiff_t pred_stride, int bd) {
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
  SubtractWxHFuncType func;
  (void)bd;

  func = getSubtractFunc(rows, cols);
  func(diff, diff_stride, src, src_stride, pred, pred_stride);
}
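A hypothetical call site, to make the calling convention concrete: rows come before cols, and high-bitdepth planes are uint16_t buffers wrapped with CONVERT_TO_BYTEPTR. This is a sketch under those assumptions, not code from the patch:

  DECLARE_ALIGNED(16, uint16_t, src16[32 * 32]);
  DECLARE_ALIGNED(16, uint16_t, pred16[32 * 32]);
  DECLARE_ALIGNED(16, int16_t, diff[32 * 32]);
  // ... fill src16 and pred16 with 10-bit samples ...
  aom_highbd_subtract_block_sse2(32, 32, diff, 32, CONVERT_TO_BYTEPTR(src16),
                                 32, CONVERT_TO_BYTEPTR(pred16), 32,
                                 /*bd=*/10);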
@@ -1,215 +0,0 @@
/*
 * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <smmintrin.h> /* SSE4.1 */

#include "./aom_config.h"
#include "./aom_dsp_rtcd.h"

#include "aom_dsp/variance.h"
#include "aom_dsp/aom_filter.h"

static INLINE void variance4x4_64_sse4_1(const uint8_t *a8, int a_stride,
                                         const uint8_t *b8, int b_stride,
                                         uint64_t *sse, int64_t *sum) {
  __m128i u0, u1, u2, u3;
  __m128i s0, s1, s2, s3;
  __m128i t0, t1, x0, y0;
  __m128i a0, a1, a2, a3;
  __m128i b0, b1, b2, b3;
  __m128i k_one_epi16 = _mm_set1_epi16((int16_t)1);

  uint16_t *a = CONVERT_TO_SHORTPTR(a8);
  uint16_t *b = CONVERT_TO_SHORTPTR(b8);

  a0 = _mm_loadl_epi64((__m128i const *)(a + 0 * a_stride));
  a1 = _mm_loadl_epi64((__m128i const *)(a + 1 * a_stride));
  a2 = _mm_loadl_epi64((__m128i const *)(a + 2 * a_stride));
  a3 = _mm_loadl_epi64((__m128i const *)(a + 3 * a_stride));

  b0 = _mm_loadl_epi64((__m128i const *)(b + 0 * b_stride));
  b1 = _mm_loadl_epi64((__m128i const *)(b + 1 * b_stride));
  b2 = _mm_loadl_epi64((__m128i const *)(b + 2 * b_stride));
  b3 = _mm_loadl_epi64((__m128i const *)(b + 3 * b_stride));

  // Interleave rows pairwise: u0/u1 hold the source rows, u2/u3 the
  // reference rows, so s0/s1 hold all 16 pixel differences.
  u0 = _mm_unpacklo_epi16(a0, a1);
  u1 = _mm_unpacklo_epi16(a2, a3);
  u2 = _mm_unpacklo_epi16(b0, b1);
  u3 = _mm_unpacklo_epi16(b2, b3);

  s0 = _mm_sub_epi16(u0, u2);
  s1 = _mm_sub_epi16(u1, u3);

  // Sum of differences: madd against 1 widens adjacent pairs to 32 bits.
  t0 = _mm_madd_epi16(s0, k_one_epi16);
  t1 = _mm_madd_epi16(s1, k_one_epi16);

  s2 = _mm_hadd_epi32(t0, t1);
  s3 = _mm_hadd_epi32(s2, s2);
  y0 = _mm_hadd_epi32(s3, s3);

  // Sum of squared differences: madd of each vector with itself.
  t0 = _mm_madd_epi16(s0, s0);
  t1 = _mm_madd_epi16(s1, s1);

  s2 = _mm_hadd_epi32(t0, t1);
  s3 = _mm_hadd_epi32(s2, s2);
  x0 = _mm_hadd_epi32(s3, s3);

  *sse = (uint64_t)_mm_extract_epi32(x0, 0);
  *sum = (int64_t)_mm_extract_epi32(y0, 0);
}
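For reference, a scalar sketch of what the intrinsic sequence above produces, namely the sum and the sum of squares of the 16 pixel differences (variance4x4_64_c is a hypothetical name, for illustration only):

static void variance4x4_64_c(const uint16_t *a, int a_stride,
                             const uint16_t *b, int b_stride,
                             uint64_t *sse, int64_t *sum) {
  *sse = 0;
  *sum = 0;
  for (int r = 0; r < 4; ++r) {
    for (int c = 0; c < 4; ++c) {
      const int d = a[c] - b[c];  // Fits easily in int even for 12-bit input.
      *sum += d;
      *sse += (uint64_t)(d * d);
    }
    a += a_stride;
    b += b_stride;
  }
}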
uint32_t aom_highbd_8_variance4x4_sse4_1(const uint8_t *a, int a_stride,
                                         const uint8_t *b, int b_stride,
                                         uint32_t *sse) {
  int64_t sum, diff;
  uint64_t local_sse;

  variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum);
  *sse = (uint32_t)local_sse;

  diff = (int64_t)*sse - ((sum * sum) >> 4);
  return (diff >= 0) ? (uint32_t)diff : 0;
}

uint32_t aom_highbd_10_variance4x4_sse4_1(const uint8_t *a, int a_stride,
                                          const uint8_t *b, int b_stride,
                                          uint32_t *sse) {
  int64_t sum, diff;
  uint64_t local_sse;

  variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum);
  *sse = (uint32_t)ROUND_POWER_OF_TWO(local_sse, 4);
  sum = ROUND_POWER_OF_TWO(sum, 2);

  diff = (int64_t)*sse - ((sum * sum) >> 4);
  return (diff >= 0) ? (uint32_t)diff : 0;
}

uint32_t aom_highbd_12_variance4x4_sse4_1(const uint8_t *a, int a_stride,
                                          const uint8_t *b, int b_stride,
                                          uint32_t *sse) {
  int64_t sum, diff;
  uint64_t local_sse;

  variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum);
  *sse = (uint32_t)ROUND_POWER_OF_TWO(local_sse, 8);
  sum = ROUND_POWER_OF_TWO(sum, 4);

  diff = (int64_t)*sse - ((sum * sum) >> 4);
  return (diff >= 0) ? (uint32_t)diff : 0;
}
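The three wrappers differ only in how they renormalize before applying the variance identity. Summarizing the code above, with n = 16 pixels and s = bd - 8 extra bits of precision:

//   sse' = ROUND_POWER_OF_TWO(sse, 2 * s)  // s = 0, 2, 4 for bd = 8, 10, 12
//   sum' = ROUND_POWER_OF_TWO(sum, s)
//   variance = max(0, sse' - (sum' * sum') / 16)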
// Sub-pixel
uint32_t aom_highbd_8_sub_pixel_variance4x4_sse4_1(
    const uint8_t *src, int src_stride, int xoffset, int yoffset,
    const uint8_t *dst, int dst_stride, uint32_t *sse) {
  uint16_t fdata3[(4 + 1) * 4];
  uint16_t temp2[4 * 4];

  aom_highbd_var_filter_block2d_bil_first_pass(
      src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
  aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
                                                bilinear_filters_2t[yoffset]);

  return aom_highbd_8_variance4x4(CONVERT_TO_BYTEPTR(temp2), 4, dst,
                                  dst_stride, sse);
}

uint32_t aom_highbd_10_sub_pixel_variance4x4_sse4_1(
    const uint8_t *src, int src_stride, int xoffset, int yoffset,
    const uint8_t *dst, int dst_stride, uint32_t *sse) {
  uint16_t fdata3[(4 + 1) * 4];
  uint16_t temp2[4 * 4];

  aom_highbd_var_filter_block2d_bil_first_pass(
      src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
  aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
                                                bilinear_filters_2t[yoffset]);

  return aom_highbd_10_variance4x4(CONVERT_TO_BYTEPTR(temp2), 4, dst,
                                   dst_stride, sse);
}

uint32_t aom_highbd_12_sub_pixel_variance4x4_sse4_1(
    const uint8_t *src, int src_stride, int xoffset, int yoffset,
    const uint8_t *dst, int dst_stride, uint32_t *sse) {
  uint16_t fdata3[(4 + 1) * 4];
  uint16_t temp2[4 * 4];

  aom_highbd_var_filter_block2d_bil_first_pass(
      src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
  aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
                                                bilinear_filters_2t[yoffset]);

  return aom_highbd_12_variance4x4(CONVERT_TO_BYTEPTR(temp2), 4, dst,
                                   dst_stride, sse);
}
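Both passes apply the same two-tap bilinear kernel, first horizontally across a (4 + 1)-row strip and then vertically. Per output pixel the step is the following sketch (FILTER_BITS is the filter precision constant from aom_filter.h; treat the exact value as an assumption here):

//   out = ROUND_POWER_OF_TWO(src[0] * filter[0] + src[pixel_step] * filter[1],
//                            FILTER_BITS)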
// Sub-pixel average

uint32_t aom_highbd_8_sub_pixel_avg_variance4x4_sse4_1(
    const uint8_t *src, int src_stride, int xoffset, int yoffset,
    const uint8_t *dst, int dst_stride, uint32_t *sse,
    const uint8_t *second_pred) {
  uint16_t fdata3[(4 + 1) * 4];
  uint16_t temp2[4 * 4];
  DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]);

  aom_highbd_var_filter_block2d_bil_first_pass(
      src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
  aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
                                                bilinear_filters_2t[yoffset]);

  aom_highbd_comp_avg_pred(temp3, second_pred, 4, 4, CONVERT_TO_BYTEPTR(temp2),
                           4);

  return aom_highbd_8_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst,
                                  dst_stride, sse);
}

uint32_t aom_highbd_10_sub_pixel_avg_variance4x4_sse4_1(
    const uint8_t *src, int src_stride, int xoffset, int yoffset,
    const uint8_t *dst, int dst_stride, uint32_t *sse,
    const uint8_t *second_pred) {
  uint16_t fdata3[(4 + 1) * 4];
  uint16_t temp2[4 * 4];
  DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]);

  aom_highbd_var_filter_block2d_bil_first_pass(
      src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
  aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
                                                bilinear_filters_2t[yoffset]);

  aom_highbd_comp_avg_pred(temp3, second_pred, 4, 4, CONVERT_TO_BYTEPTR(temp2),
                           4);

  return aom_highbd_10_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst,
                                   dst_stride, sse);
}

uint32_t aom_highbd_12_sub_pixel_avg_variance4x4_sse4_1(
    const uint8_t *src, int src_stride, int xoffset, int yoffset,
    const uint8_t *dst, int dst_stride, uint32_t *sse,
    const uint8_t *second_pred) {
  uint16_t fdata3[(4 + 1) * 4];
  uint16_t temp2[4 * 4];
  DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]);

  aom_highbd_var_filter_block2d_bil_first_pass(
      src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
  aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
                                                bilinear_filters_2t[yoffset]);

  aom_highbd_comp_avg_pred(temp3, second_pred, 4, 4, CONVERT_TO_BYTEPTR(temp2),
                           4);

  return aom_highbd_12_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst,
                                   dst_stride, sse);
}
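The only addition over the plain sub-pixel path is the compound-average step. A scalar sketch of what the aom_highbd_comp_avg_pred call contributes, assuming second_pred wraps a uint16_t plane per the highbd convention (illustration, not the library source):

  const uint16_t *second = CONVERT_TO_SHORTPTR(second_pred);
  for (int i = 0; i < 4 * 4; ++i)
    temp3[i] = (uint16_t)ROUND_POWER_OF_TWO(temp2[i] + second[i], 1);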
@@ -1,333 +0,0 @@
/*
 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <stdlib.h>
#include <emmintrin.h>
#include <tmmintrin.h>

#include "aom_ports/mem.h"
#include "./aom_config.h"
#include "aom/aom_integer.h"

static INLINE __m128i width8_load_2rows(const uint8_t *ptr, int stride) {
  __m128i temp1 = _mm_loadl_epi64((const __m128i *)ptr);
  __m128i temp2 = _mm_loadl_epi64((const __m128i *)(ptr + stride));
  return _mm_unpacklo_epi64(temp1, temp2);
}

static INLINE __m128i width4_load_4rows(const uint8_t *ptr, int stride) {
  __m128i temp1 = _mm_cvtsi32_si128(*(const uint32_t *)ptr);
  __m128i temp2 = _mm_cvtsi32_si128(*(const uint32_t *)(ptr + stride));
  __m128i temp3 = _mm_unpacklo_epi32(temp1, temp2);
  temp1 = _mm_cvtsi32_si128(*(const uint32_t *)(ptr + stride * 2));
  temp2 = _mm_cvtsi32_si128(*(const uint32_t *)(ptr + stride * 3));
  temp1 = _mm_unpacklo_epi32(temp1, temp2);
  return _mm_unpacklo_epi64(temp3, temp1);
}

static INLINE unsigned int masked_sad_ssse3(const uint8_t *a_ptr, int a_stride,
                                            const uint8_t *b_ptr, int b_stride,
                                            const uint8_t *m_ptr, int m_stride,
                                            int width, int height);

static INLINE unsigned int masked_sad8xh_ssse3(
    const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride,
    const uint8_t *m_ptr, int m_stride, int height);

static INLINE unsigned int masked_sad4xh_ssse3(
    const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride,
    const uint8_t *m_ptr, int m_stride, int height);

#define MASKSADMXN_SSSE3(m, n)                                                \
  unsigned int aom_masked_sad##m##x##n##_ssse3(                               \
      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
      const uint8_t *msk, int msk_stride) {                                   \
    return masked_sad_ssse3(src, src_stride, ref, ref_stride, msk,            \
                            msk_stride, m, n);                                \
  }

#if CONFIG_EXT_PARTITION
MASKSADMXN_SSSE3(128, 128)
MASKSADMXN_SSSE3(128, 64)
MASKSADMXN_SSSE3(64, 128)
#endif  // CONFIG_EXT_PARTITION
MASKSADMXN_SSSE3(64, 64)
MASKSADMXN_SSSE3(64, 32)
MASKSADMXN_SSSE3(32, 64)
MASKSADMXN_SSSE3(32, 32)
MASKSADMXN_SSSE3(32, 16)
MASKSADMXN_SSSE3(16, 32)
MASKSADMXN_SSSE3(16, 16)
MASKSADMXN_SSSE3(16, 8)

#define MASKSAD8XN_SSSE3(n)                                                   \
  unsigned int aom_masked_sad8x##n##_ssse3(                                   \
      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
      const uint8_t *msk, int msk_stride) {                                   \
    return masked_sad8xh_ssse3(src, src_stride, ref, ref_stride, msk,         \
                               msk_stride, n);                                \
  }

MASKSAD8XN_SSSE3(16)
MASKSAD8XN_SSSE3(8)
MASKSAD8XN_SSSE3(4)

#define MASKSAD4XN_SSSE3(n)                                                   \
  unsigned int aom_masked_sad4x##n##_ssse3(                                   \
      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
      const uint8_t *msk, int msk_stride) {                                   \
    return masked_sad4xh_ssse3(src, src_stride, ref, ref_stride, msk,         \
                               msk_stride, n);                                \
  }

MASKSAD4XN_SSSE3(8)
MASKSAD4XN_SSSE3(4)
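Each instantiation stamps out a thin public entry point; for example, MASKSAD4XN_SSSE3(8) above expands to:

unsigned int aom_masked_sad4x8_ssse3(const uint8_t *src, int src_stride,
                                     const uint8_t *ref, int ref_stride,
                                     const uint8_t *msk, int msk_stride) {
  return masked_sad4xh_ssse3(src, src_stride, ref, ref_stride, msk, msk_stride,
                             8);
}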
// For width a multiple of 16
// Assumes values in m are <= 64
static INLINE unsigned int masked_sad_ssse3(const uint8_t *a_ptr, int a_stride,
                                            const uint8_t *b_ptr, int b_stride,
                                            const uint8_t *m_ptr, int m_stride,
                                            int width, int height) {
  int y, x;
  __m128i a, b, m, temp1, temp2;
  __m128i res = _mm_setzero_si128();
  __m128i one = _mm_set1_epi16(1);
  // For each row
  for (y = 0; y < height; y++) {
    // Covering the full width
    for (x = 0; x < width; x += 16) {
      // Load a, b, m in xmm registers
      a = _mm_loadu_si128((const __m128i *)(a_ptr + x));
      b = _mm_loadu_si128((const __m128i *)(b_ptr + x));
      m = _mm_loadu_si128((const __m128i *)(m_ptr + x));

      // Calculate the difference between a & b
      temp1 = _mm_subs_epu8(a, b);
      temp2 = _mm_subs_epu8(b, a);
      temp1 = _mm_or_si128(temp1, temp2);

      // Multiply by m and add together
      temp2 = _mm_maddubs_epi16(temp1, m);
      // Pad out row result to 32 bit integers & add to running total
      res = _mm_add_epi32(res, _mm_madd_epi16(temp2, one));
    }
    // Move onto the next row
    a_ptr += a_stride;
    b_ptr += b_stride;
    m_ptr += m_stride;
  }
  res = _mm_hadd_epi32(res, _mm_setzero_si128());
  res = _mm_hadd_epi32(res, _mm_setzero_si128());
  // sad = (sad + 31) >> 6;
  return (_mm_cvtsi128_si32(res) + 31) >> 6;
}
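A scalar reference for the same computation, useful for checking the vector code; masked_sad_c is a hypothetical name, and abs comes from the <stdlib.h> include above. Mask values are at most 64, so the weighted sum is renormalized by (sum + 31) >> 6:

static unsigned int masked_sad_c(const uint8_t *a, int a_stride,
                                 const uint8_t *b, int b_stride,
                                 const uint8_t *m, int m_stride, int width,
                                 int height) {
  unsigned int sad = 0;
  for (int y = 0; y < height; ++y) {
    for (int x = 0; x < width; ++x) sad += m[x] * abs(a[x] - b[x]);
    a += a_stride;
    b += b_stride;
    m += m_stride;
  }
  return (sad + 31) >> 6;
}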
static INLINE unsigned int masked_sad8xh_ssse3(
    const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride,
    const uint8_t *m_ptr, int m_stride, int height) {
  int y;
  __m128i a, b, m, temp1, temp2, row_res;
  __m128i res = _mm_setzero_si128();
  __m128i one = _mm_set1_epi16(1);
  // Add the masked SAD for 2 rows at a time
  for (y = 0; y < height; y += 2) {
    // Load a, b, m in xmm registers
    a = width8_load_2rows(a_ptr, a_stride);
    b = width8_load_2rows(b_ptr, b_stride);
    m = width8_load_2rows(m_ptr, m_stride);

    // Calculate the difference between a & b
    temp1 = _mm_subs_epu8(a, b);
    temp2 = _mm_subs_epu8(b, a);
    temp1 = _mm_or_si128(temp1, temp2);

    // Multiply by m and add together
    row_res = _mm_maddubs_epi16(temp1, m);

    // Pad out row result to 32 bit integers & add to running total
    res = _mm_add_epi32(res, _mm_madd_epi16(row_res, one));

    // Move onto the next rows
    a_ptr += a_stride * 2;
    b_ptr += b_stride * 2;
    m_ptr += m_stride * 2;
  }
  res = _mm_hadd_epi32(res, _mm_setzero_si128());
  res = _mm_hadd_epi32(res, _mm_setzero_si128());
  // sad = (sad + 31) >> 6;
  return (_mm_cvtsi128_si32(res) + 31) >> 6;
}

static INLINE unsigned int masked_sad4xh_ssse3(
    const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride,
    const uint8_t *m_ptr, int m_stride, int height) {
  int y;
  __m128i a, b, m, temp1, temp2, row_res;
  __m128i res = _mm_setzero_si128();
  __m128i one = _mm_set1_epi16(1);
  // Add the masked SAD for 4 rows at a time
  for (y = 0; y < height; y += 4) {
    // Load a, b, m in xmm registers
    a = width4_load_4rows(a_ptr, a_stride);
    b = width4_load_4rows(b_ptr, b_stride);
    m = width4_load_4rows(m_ptr, m_stride);

    // Calculate the difference between a & b
    temp1 = _mm_subs_epu8(a, b);
    temp2 = _mm_subs_epu8(b, a);
    temp1 = _mm_or_si128(temp1, temp2);

    // Multiply by m and add together
    row_res = _mm_maddubs_epi16(temp1, m);

    // Pad out row result to 32 bit integers & add to running total
    res = _mm_add_epi32(res, _mm_madd_epi16(row_res, one));

    // Move onto the next rows
    a_ptr += a_stride * 4;
    b_ptr += b_stride * 4;
    m_ptr += m_stride * 4;
  }
  // Horizontally reduce the four 32-bit partial sums to a single result
  res = _mm_hadd_epi32(res, _mm_setzero_si128());
  res = _mm_hadd_epi32(res, _mm_setzero_si128());
  // sad = (sad + 31) >> 6;
  return (_mm_cvtsi128_si32(res) + 31) >> 6;
}
#if CONFIG_AOM_HIGHBITDEPTH
static INLINE __m128i highbd_width4_load_2rows(const uint16_t *ptr,
                                               int stride) {
  __m128i temp1 = _mm_loadl_epi64((const __m128i *)ptr);
  __m128i temp2 = _mm_loadl_epi64((const __m128i *)(ptr + stride));
  return _mm_unpacklo_epi64(temp1, temp2);
}

static INLINE unsigned int highbd_masked_sad_ssse3(
    const uint8_t *a8_ptr, int a_stride, const uint8_t *b8_ptr, int b_stride,
    const uint8_t *m_ptr, int m_stride, int width, int height);

static INLINE unsigned int highbd_masked_sad4xh_ssse3(
    const uint8_t *a8_ptr, int a_stride, const uint8_t *b8_ptr, int b_stride,
    const uint8_t *m_ptr, int m_stride, int height);

#define HIGHBD_MASKSADMXN_SSSE3(m, n)                                         \
  unsigned int aom_highbd_masked_sad##m##x##n##_ssse3(                        \
      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
      const uint8_t *msk, int msk_stride) {                                   \
    return highbd_masked_sad_ssse3(src, src_stride, ref, ref_stride, msk,     \
                                   msk_stride, m, n);                         \
  }

#if CONFIG_EXT_PARTITION
HIGHBD_MASKSADMXN_SSSE3(128, 128)
HIGHBD_MASKSADMXN_SSSE3(128, 64)
HIGHBD_MASKSADMXN_SSSE3(64, 128)
#endif  // CONFIG_EXT_PARTITION
HIGHBD_MASKSADMXN_SSSE3(64, 64)
HIGHBD_MASKSADMXN_SSSE3(64, 32)
HIGHBD_MASKSADMXN_SSSE3(32, 64)
HIGHBD_MASKSADMXN_SSSE3(32, 32)
HIGHBD_MASKSADMXN_SSSE3(32, 16)
HIGHBD_MASKSADMXN_SSSE3(16, 32)
HIGHBD_MASKSADMXN_SSSE3(16, 16)
HIGHBD_MASKSADMXN_SSSE3(16, 8)
HIGHBD_MASKSADMXN_SSSE3(8, 16)
HIGHBD_MASKSADMXN_SSSE3(8, 8)
HIGHBD_MASKSADMXN_SSSE3(8, 4)

#define HIGHBD_MASKSAD4XN_SSSE3(n)                                            \
  unsigned int aom_highbd_masked_sad4x##n##_ssse3(                            \
      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
      const uint8_t *msk, int msk_stride) {                                   \
    return highbd_masked_sad4xh_ssse3(src, src_stride, ref, ref_stride, msk,  \
                                      msk_stride, n);                         \
  }

HIGHBD_MASKSAD4XN_SSSE3(8)
HIGHBD_MASKSAD4XN_SSSE3(4)

// For width a multiple of 8
// Assumes values in m are <= 64
static INLINE unsigned int highbd_masked_sad_ssse3(
    const uint8_t *a8_ptr, int a_stride, const uint8_t *b8_ptr, int b_stride,
    const uint8_t *m_ptr, int m_stride, int width, int height) {
  int y, x;
  __m128i a, b, m, temp1, temp2;
  const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8_ptr);
  const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8_ptr);
  __m128i res = _mm_setzero_si128();
  // For each row
  for (y = 0; y < height; y++) {
    // Covering the full width
    for (x = 0; x < width; x += 8) {
      // Load a, b, m in xmm registers
      a = _mm_loadu_si128((const __m128i *)(a_ptr + x));
      b = _mm_loadu_si128((const __m128i *)(b_ptr + x));
      m = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(m_ptr + x)),
                            _mm_setzero_si128());

      // Calculate the difference between a & b
      temp1 = _mm_subs_epu16(a, b);
      temp2 = _mm_subs_epu16(b, a);
      temp1 = _mm_or_si128(temp1, temp2);

      // Add result of multiplying by m and add pairs together to running total
      res = _mm_add_epi32(res, _mm_madd_epi16(temp1, m));
    }
    // Move onto the next row
    a_ptr += a_stride;
    b_ptr += b_stride;
    m_ptr += m_stride;
  }
  res = _mm_hadd_epi32(res, _mm_setzero_si128());
  res = _mm_hadd_epi32(res, _mm_setzero_si128());
  // sad = (sad + 31) >> 6;
  return (_mm_cvtsi128_si32(res) + 31) >> 6;
}

static INLINE unsigned int highbd_masked_sad4xh_ssse3(
    const uint8_t *a8_ptr, int a_stride, const uint8_t *b8_ptr, int b_stride,
    const uint8_t *m_ptr, int m_stride, int height) {
  int y;
  __m128i a, b, m, temp1, temp2;
  const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8_ptr);
  const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8_ptr);
  __m128i res = _mm_setzero_si128();
  // Add the masked SAD for 2 rows at a time
  for (y = 0; y < height; y += 2) {
    // Load a, b, m in xmm registers
    a = highbd_width4_load_2rows(a_ptr, a_stride);
    b = highbd_width4_load_2rows(b_ptr, b_stride);
    temp1 = _mm_loadl_epi64((const __m128i *)m_ptr);
    temp2 = _mm_loadl_epi64((const __m128i *)(m_ptr + m_stride));
    m = _mm_unpacklo_epi8(_mm_unpacklo_epi32(temp1, temp2),
                          _mm_setzero_si128());

    // Calculate the difference between a & b
    temp1 = _mm_subs_epu16(a, b);
    temp2 = _mm_subs_epu16(b, a);
    temp1 = _mm_or_si128(temp1, temp2);

    // Multiply by m and add together
    res = _mm_add_epi32(res, _mm_madd_epi16(temp1, m));

    // Move onto the next rows
    a_ptr += a_stride * 2;
    b_ptr += b_stride * 2;
    m_ptr += m_stride * 2;
  }
  res = _mm_hadd_epi32(res, _mm_setzero_si128());
  res = _mm_hadd_epi32(res, _mm_setzero_si128());
  // sad = (sad + 31) >> 6;
  return (_mm_cvtsi128_si32(res) + 31) >> 6;
}
#endif  // CONFIG_AOM_HIGHBITDEPTH
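One sanity check on the shared (sad + 31) >> 6 renormalization in all of these kernels: a uniform mask of 64 makes each weighted term 64 * |a - b|, so the accumulator holds exactly 64 * SAD, and (64 * SAD + 31) >> 6 == SAD. A full-weight mask therefore degenerates to the plain SAD, while smaller mask values scale each pixel's contribution by m / 64.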
File diff suppressed because it is too large

Some files were not shown because too many files have changed in this diff