Compare commits

..

141 Commits

Author SHA1 Message Date
James Zern
7e198abb48 update ChangeLog
Change-Id: I3bd7347337323f727a979dccadddfd22f56ec1c2
2011-09-22 13:51:57 -07:00
Pascal Massimino
dfc9c1eaef Harmonize the dates
Change-Id: I6c6f4766bc217ab0bd2fa6520ce9e2c1cd742b44
2011-09-22 13:50:10 -07:00
Pascal Massimino
28ad70c56d Fix PNG decoding bug
some png with palette-based alpha channels were incorrectly
treated as alpha-less, causing segfault.

Change-Id: I03883590f9539848d5135d359b6f105d2b14329f
2011-09-20 07:59:43 -07:00
James Zern
846e93c5d1 Update AUTHORS & add .mailmap
Change-Id: I2dd07820cae50c85695da7c40b05780ffeef434a
2011-09-16 14:33:38 -07:00
James Zern
563e52d62e cosmetics after '76036f5 Refactor decoder library'
reference 'VP8X' consistently / spelling
sort header size defines alphabetically on their logical boundaries
drop useless cast
const updates

Change-Id: I9409814c5b89773ae8910dc74887f91435e6ac82
2011-09-16 12:32:03 -07:00
Urvang Joshi
76036f547e Refactor decoder library
- Move common defines to dec/webpi.h
 - Regularize naming and parameters of various "CheckAndSkip" functions.
   Also they return VP8StatusCode for clarity.
 - Move WebP header/chunk parsing functions to webpi.h
 - Fix a bug in static function GetFeatures()

Change-Id: Ibc3da4ba210d860551724dc40741959750d66b89
2011-09-16 10:55:24 -07:00
James Zern
377ef43c3e configure.ac: update AC_INIT params
change package name to libwebp, correct version and add bug & project
urls

Change-Id: I3b980ab8088729f90794ffd1ac7b33383983c812
2011-09-13 18:37:11 -07:00
Pascal Massimino
7a8d8762c1 use a user-visible MACRO for max width/height.
This has been pointed out as useful information to have in the header (for
the non VP8-specs savvy ones)

Change-Id: I494b1da41dfafce882a94e3677d1cd6206bc504b
2011-09-13 16:07:30 -07:00
Somnath Banerjee
d4e9f5598d NEON decode support in WebP
Change-Id: I0d6fa456ca68468353adcd64669f1737d1446f65
2011-09-13 16:00:47 -07:00
James Zern
0ee683b593 update libtool version-info
consistent with:
  http://www.gnu.org/software/libtool/manual/libtool.html#Versioning

current=2 matches the ABI version for both encode & decode

Change-Id: Ie175b2bdb8f5e14690f4cf6357fc0e0c6c78fe7d
2011-09-13 15:49:13 -07:00
James Zern
fdbe02c581 windows: match _cond_destroy logic w/return variable name
CloseHandle returns non-zero on success so earlier versions would leave
'ok' with a misleading value, though the return itself was correct.

Change-Id: I21b74a59d90f7bf1b484a55f3960962e933f577b
2011-09-13 15:48:33 -07:00
James Zern
206b686b39 README: correct advanced decode api pseudo-code
Change-Id: I09e5365cc15cb9b6c53a1d5d4e16a51bfc353b10
2011-09-13 15:47:53 -07:00
Pascal Massimino
6a32a0f5bf make VP8BitReader a typedef, for better re-use
Change-Id: Id91f8c5649f9fd078facc9f280a314377193b5e8
2011-09-13 15:47:24 -07:00
Pascal Massimino
b112e83647 create a libwebputils under src/utils
with bit_reader bit_writer and thread for now.

Change-Id: If961933fcfc43e60220913fe4d527230ba8f46bb
2011-09-13 15:34:15 -07:00
Pascal Massimino
ee697d9fc9 harmonize the include guards and #endif comments 2011-09-13 15:31:52 -07:00
Somnath Banerjee
a1ec07a618 Fixing compiler error in non x86 arch.
Compiler is not getting the definition of NULL.

Change-Id: I521a99c715bb43e633abd4a26d73ad25bbbafc94
2011-09-13 15:27:58 -07:00
Somnath Banerjee
dcfa509a28 Fixed recursive inclusion of bit_writer.h and vp8enci.h.
Was getting compiler error when I included bit_writer.h from non libwebp
directory. bit_writer.h includes vp8enci.h and that uses VP8BitWriter without
having its definition.

Change-Id: I1ca82594292979b9eb7e60e2fffb22c16768dd30
2011-09-13 15:26:39 -07:00
Pascal Massimino
e06ac0887f create a separate libwebpdsp under src/dsp
Gathers all DSP-related functions (and SSE2 implementations).
Clean-up some unwanted symbolic dependencies so that webp_encode,
webp_decode and webp_dsp are truly independent libraries.

+ opportunistic clean-up:
  * remove unneeded VP8DspInitTables(), now integrated in VP8DspInit()
  * make consistent use of VP8GetCPUInfo() in the various DspInit() funcs
  * change OUT macro to DST
2011-09-13 12:29:44 -07:00
James Zern
ebeb412aa5 use unsigned int for bitfields
uint8_t is a gcc extension which msvc similarly supports, but for
greater compatibility, and to match the change already made in
dec/vp8i.h, update the remaining bitfield to use unsigned int.

Change-Id: Id9dca470345871e00e82893255a306dfe5d3fa29
2011-09-13 12:19:00 -07:00
Pascal Massimino
341cc56a8c make kNewRange a static array
Change-Id: I78096e2a28e6f732e13df6bde790a1266053838c
2011-09-13 11:51:32 -07:00
James Zern
227a91e522 README: minor wording update
Change-Id: I2683f9d8a00a476ccb90bef911804378ec6a5558
2011-08-30 17:27:46 -07:00
James Zern
05bd8e6af5 add man pages to dist
converted to text & html and stored in doc/

Change-Id: I42572bc6f93ffe5270f3ff1b13ca150292fcede4
2011-08-30 14:31:00 -07:00
Pascal Massimino
812dfa1ae1 bump up versions in preparation for 0.1.3
Change-Id: Icbadada515a5be7810473ff530565c2cb4fb904d
2011-08-29 22:52:47 -07:00
Pascal Massimino
a5b78c81b5 wrap alpha-related options under WEBP_EXPERIMENTAL_FEATURES flag
Change-Id: I6d8f4b4ec7dca53054e28b969a536e7c93c1e5e2
2011-08-29 22:04:46 -07:00
James Zern
34dc790798 regen ChangeLog for 0.1.3-rc2
update AUTHORS as well

Change-Id: If24bb6366ed7458c21141566f951cf0e86ba3398
2011-08-29 18:42:53 -07:00
James Zern
7c43663036 Silence some (more) Visual Studio warnings.
Change-Id: Idac44feac894ab13630e032222c4744d6fa785df
2011-08-29 18:08:26 -07:00
James Zern
60306e8cf3 add top-level gitattributes
ignore .gitignore/.gitattributes when using git-archive

Change-Id: I14f5d0c756298fc48613859c93d16586dfb34855
2011-08-29 17:47:13 -07:00
Mikolaj Zalewski
2aa6b80efe Silence some Visual Studio warnings.
Change-Id: I62078af80bfcaa82bdc165fc2fc8fce2d2aad862
2011-08-29 10:37:13 +02:00
Pascal Massimino
4cbbb2901b Merge "bump up version for next freeze" 2011-08-26 16:24:23 -07:00
Pascal Massimino
a329167426 bump up version for next freeze 2011-08-26 16:23:14 -07:00
James Zern
c7e86abab6 cosmetics: fix comment line lengths
add additional '-' to //----... style comments globally instead of
polluting further commits

Change-Id: I951acc68b7b5384b4d6e235349b0067d1aa6fa8b
2011-08-26 12:19:33 -07:00
James Zern
c9e037ab3e makefile.unix: add simple dist target
Change-Id: I31647a97f5892b2dbdc9f4555ea38e2824ac3788
2011-08-25 21:33:28 -04:00
James Zern
87d58ce9cd makefile.unix: rule maintenance
- use built-in variables where appropriate
- fix some target dependencies

Change-Id: I33584d0e7d9c5341841da89b6aa8f02e1c7c6aea
2011-08-25 20:36:04 -04:00
Vikas Arora
d477de7759 mend 2011-08-25 10:06:50 +05:30
Vikas Arora
fac15ec78e Update NEWS & README for next release V0.1.3
Update ChangeLog, NEWS & README file for next release V0.1.3
2011-08-25 09:11:08 +05:30
Pascal Massimino
6215595cd1 Merge "add a -partition_limit option to limit the number of bits used by intra4x4" 2011-08-23 16:09:10 -07:00
Pascal Massimino
3814b76c48 Merge "reorganize chunk-parsing code" 2011-08-23 16:03:05 -07:00
Pascal Massimino
900286e091 add a -partition_limit option to limit the number of bits used by intra4x4
Although it degrades quality, this option is useful to avoid the 512k
limit for partition #0.
If not enough to reach the lower bound of 4bits per macroblock header,
one should also limit the number of segments used (down to -segments 1)

See the man file for extra details.

Change-Id: Ia59ffac13176c85b809ddd6340d37b54ee9487ea
2011-08-23 15:58:22 -07:00
Pascal Massimino
cd12b4b0ac add the missing cost for I4/I16 mode selection
was missing from the RD-computation of intra-4x4 score.
Doesn't change anything significantly, it's just More Correct.

Change-Id: I25c5b53a810d97e6fb7f98c549fd23bbe55e1bf4
2011-08-18 20:48:06 -07:00
Pascal Massimino
dfcc2136e1 reorganize chunk-parsing code
make room for future VP8X extensions

Change-Id: Ic78217c26f142403b733740b17980aa81602f83d
2011-08-18 11:07:13 -07:00
Pascal Massimino
3cf2030653 initialize pointers to function within VP8DspInit()
makes testing easier, allowing a reset to C-version

Change-Id: I707d8338fedff4ae993e52eefe730c236ca3dcb5
2011-08-18 10:03:19 -07:00
James Zern
d21b479581 Merge "windows: add decode threading support" 2011-08-05 16:31:50 -07:00
James Zern
473ae95324 fix hang on thread creation failure
with assertions enabled the code would abort in
WebPWorkerChangeState with:
Assertion `worker->status_ >= OK'

without them the code would hang in the _cond_wait.

this change makes WebPWorkerChangeState a no-op in this case.

Change-Id: Iea855568bbdef2865ae61ab54473b3a7c230e91a
2011-08-05 15:01:16 -07:00
James Zern
fccca4202f windows: add decode threading support
Change-Id: Iad923550569ceec1ff469852ec68e66de3f6062b
2011-08-05 12:28:40 -07:00
Pascal Massimino
a31f843a9f Use the exact PNG_INCLUDES/PNG_LIBS when testing for -lpng
(and same for jpeg)
fixes issue #87

Change-Id: I857a828058f9653be21fe97437bf5a0d2c30835e
2011-08-02 11:58:02 -07:00
James Zern
ad9b45f1c8 Merge "Makefile.vc: rule maintenance" 2011-07-22 18:57:57 -07:00
James Zern
565a2cab22 Makefile.vc: rule maintenance
Remove the bulk of xcopys from the TARGET rule as things are built to
the correct location.
Allow the clean rule to be appended and only delete DLLINC for dll
builds to avoid prompting by erase when it's undefined.

Change-Id: If88b2c68090099777b8be9f3a5fbde2c25ed66a6
2011-07-22 18:56:36 -07:00
James Zern
2d0da681fb makefile.unix: disable Wvla by default
bad merge from fc7815d restored the flag.

Change-Id: Ifbc6c5f78f80934c0ddd26bcd2ea8ea110ab6385
2011-07-22 18:52:17 -07:00
Pascal Massimino
fc7815d692 multi-thread decoding: ~25-30% faster
To be enabled with the flag WEBP_USE_THREAD.
For now it's only available on unix (pthread), when using Makefile.unix
Will be switched on more generally later.

In-loop filtering and output (=rescaling/yuv->rgb conversion)
is done in parallel to bitstream decoding, lagging 1 row behind.

Example:
examples/dwebp bryce.webp -v
Time to decode picture: 0.680s

examples/dwebp bryce.webp -v -mt
Time to decode picture: 0.515s

Change-Id: Ic30a897423137a3bdace9c4e30465ef758fe53f2
2011-07-22 15:14:18 -07:00
Pascal Massimino
acd8ba4229 io->teardown() was not always called upon error
Change-Id: I8317139b583124fc20754b9b19aa4dba51cd05af
2011-07-20 07:49:57 -07:00
James Zern
c85527b1ae Merge "Makefile.vc: add DLL configs" 2011-07-19 13:38:45 -07:00
James Zern
e1e9be3502 cosmetics: spelling/grammar in README and lib headers
Change-Id: Ib8648adf652d29dd38887e5e07b09b4aa3965c6e
2011-07-15 18:58:56 -07:00
James Zern
b4d0ef8f58 Makefile.vc: add DLL configs
(release|debug)-dynamic
These configurations will produce a dll in bin/ and an import lib under
lib/.
Currently the -noasm switch in the examples will be disabled for these
builds due to a dependency on VP8EncGetCPUInfo.

Change-Id: I2cbac0064f0e500698d14ffc03200791ca837090
2011-07-15 15:25:02 -07:00
Pascal Massimino
998754a734 remove unused nb_i4_ and nb_i16_ fields.
Change-Id: Idf361e5528bddb3b25f3bc12502fdfd5c4cf9149
2011-07-15 14:57:07 -07:00
Pascal Massimino
9f01ce3afa rename WebPDecBuffer::memory -> private_memory
This makes it clear that it shouldn't be used externally.

Change-Id: I10c04c6606abbe851b6a3b424832803e842b2057
2011-07-15 14:49:01 -07:00
Pascal Massimino
fb5d659bbd fix an overflow bug in LUT calculation
round(clip()) != clip(round())

Change-Id: Ia53f845b62e01bce672456cb7cdf8581f1a7ce44
2011-07-14 22:22:46 -07:00
James Zern
d646d5c743 swig: add WebPDecodeARGB
Change-Id: I6f22cf0f87a7274f2ed63e4aa96267a8155a5e35
2011-07-14 19:05:55 -07:00
Pascal Massimino
78aeed4088 add missing WebPDecodeARGBInto() and switch ARGB4444 to RGBA4444 as was intended 2011-07-14 11:41:23 -07:00
James Zern
cd7c5292e9 explicitly mark library functions as extern
Add WEBP_EXTERN(type) macro which should make Windows DLL builds simpler
by allowing the signature to be changed.

Change-Id: I0cfa45dff779985680b1a38ddff30973a0d26639
2011-07-13 17:48:39 -07:00
Pascal Massimino
19db59f80f add support for RGB565, ARGB4444 and ARGB colorspace (decoder)
RGB565 and ARGB4444 are only supported through the advanced decoding API.
ARGB being somewhat generic, there's an easy WebPDecodeARGB()
new function for convenience.

Patch by Vikas Arora (vikaas dot arora at gmail dot com)

Change-Id: Ic7b6f72bd70aca458d14e7fdd23679212430ebca
2011-07-13 09:08:58 -07:00
Pascal Massimino
c915fb2aa7 encoder speed-up: hardcode special level values
1-3% faster

Change-Id: Ib2131989fbf819bcbfa6456adbeea0ba27c914f7
2011-07-12 18:21:22 -07:00
Pascal Massimino
c558bdad28 Rename and improve the API to retrieve decoded area
Change-Id: Iec7e0a1361c27dcf2dc8445170ab5b400454fce9
2011-07-12 13:59:51 -07:00
James Zern
bf599d74a4 Merge "makefile.unix: disable -Wvla by default" 2011-07-12 13:38:10 -07:00
Pascal Massimino
c9ea03d770 SSE2 version of strong filtering
~10% faster decoding
Patch by Somnath Banerjee (somnath at google dot com)

Change-Id: I10e380c036ff61afe24afc26084a508ab01e8502
2011-07-11 23:04:26 -07:00
James Zern
993af3e29a makefile.unix: disable -Wvla by default
Rather than add bulk to test for the new flag, simply comment out -Wvla
to keep the base makefile simple.
-Wvla was added in gcc-4.3.0.

Change-Id: I21feb456d7498ea628defb436a50c3d828e7f971
2011-07-08 21:23:27 -04:00
James Zern
3827e1bce4 Merge "examples: (windows/WIC) add alpha support" 2011-07-08 14:10:17 -07:00
Pascal Massimino
e291fae0fc SSE2 functions for the fancy upsampler.
~5-10% faster.
Heavy 8bit arithmetic trickery!
Patch by Somnath Banerjee (somnath at google dot com)

Change-Id: I9fd2c511d9f631e9cf4b008c46127b49fb527b47
2011-07-07 18:12:53 -07:00
Pascal Massimino
a06bbe2e80 add WebPISetIOHooks() to set some custom hooks on the incremental decoder object.
Change-Id: I01e973a1e45e3d60dc11fd284df3cbb938cf0485
2011-07-07 16:38:03 -07:00
pascal massimino
7643a6f2ef Merge "makefile.unix: use uname to detect OSX environment" 2011-07-07 16:29:25 -07:00
Pascal Massimino
5142a0be3a export alpha channel (if present) when dumping to PGM format
Change-Id: Ica1818937fa03b29a749887d28f49fe675c8b1db
2011-07-07 16:08:15 -07:00
James Zern
14d5731c7e makefile.unix: use uname to detect OSX environment
HOSTTYPE is x86_64 on e.g.,
bash 3.2.48(1)-release
ProductName:	Mac OS X
ProductVersion:	10.6.7
BuildVersion:	10J869

intel-mac seems to be the value under tcsh.

Change-Id: I814ad6d3b733933057cea605917b185ff6d423d0
2011-06-24 21:29:59 -04:00
James Zern
0805706252 examples: quiet warnings
When WEBP_HAVE_(JPEG|PNG) were undefined the stub functions would
produce warnings for unused parameters.

Change-Id: I79b2457c769d1f9382be834162de019c5427f94b
2011-06-24 16:50:02 -04:00
James Zern
3cfe0888ae examples: (windows/WIC) add alpha support
Portions based on a patch by Ismail Keskin (iskeskin at gmail dot com)
Fixes issue #83.

Change-Id: Ib54188910354240a2731b0a3a3d0915d17af2233
2011-06-22 12:01:48 -07:00
Pascal Massimino
13ed94b8ad add compile warning for variable-length-array 2011-06-20 17:42:25 -07:00
Pascal Massimino
5a18eb1a31 Merge "add Advanced Decoding Interface" 2011-06-20 16:30:01 -07:00
Pascal Massimino
5c4f27f9f5 add missing \n 2011-06-20 15:47:35 -07:00
Pascal Massimino
f4c4e416c0 80 cols fix 2011-06-20 15:39:40 -07:00
Pascal Massimino
d260310511 add Advanced Decoding Interface
You can now use WebPDecBuffer, WebPBitstreamFeatures and WebPDecoderOptions
to have better control over the decoding process (and the speed/quality tradeoff).

WebPDecoderOptions allows you to:
 - turn fancy upsampler on/off
 - turn in-loop filter on/off
 - perform on-the-fly cropping
 - perform on-the-fly rescaling
(and more to come. Not all features are implemented yet).

On-the-fly cropping and scaling save quite a lot of memory
(as the decoding operation will now scale with the output's size, not
the input's one). It saves some CPU too (since for instance,
in-loop filtering is partially turned off where it doesn't matter,
and some YUV->RGB conversion operations are omitted too).

The scaler uses summed area, so is mainly meant to be used for
downscaling (like: for generating thumbnails or previews).

Incremental decoding works with these new options.
More doc to come soon.

dwebp is now using the new decoding interface, with the new flags:
  -nofancy
  -nofilter
  -crop top left width height
  -scale width height

Change-Id: I08baf2fa291941686f4ef70a9cc2e4137874e85e
2011-06-20 15:30:52 -07:00
Pascal Massimino
bd2f65f67c sse2 version of the complex filter
12-15% faster.
(only inner edge is implemented for now)

patch by Somnath Banerjee (somnath at google dot com)
2011-06-20 00:27:47 -07:00
Pascal Massimino
96ed9ce0fb perform two idct transforms at a time when possible
patch by Christian Duvivier (cduvivier at google dot com)
2011-06-20 00:22:37 -07:00
Pascal Massimino
01af7b69cd use aligned stored 2011-06-18 08:27:16 -07:00
James Zern
0e1d1fdfe0 Merge "Makefile.vc: add experimental target" 2011-06-17 18:34:29 -07:00
James Zern
2a1292a61f Makefile.vc: add experimental target
Defines WEBP_EXPERIMENTAL_FEATURES for the build.

Change-Id: I1f118ca07018a92bfdf86f562781971d4382fc6e
2011-06-17 18:32:15 -07:00
James Zern
23bf351e71 Enable decode SSE2 for Visual Studio
Change-Id: If32f8b1cfe415b2f9330af36a5dd0e31e49582b2
2011-06-17 18:12:01 -07:00
Somnath Banerjee
131a4b7b7b dec/dsp_sse2: fix visual studio compile
This addresses issue #80

Change-Id: Ia81ae21f85266dd64d39da63ff2fae33f9a572dc
2011-06-17 18:04:08 -07:00
James Zern
00d9d6807c swig: file reorganization
Rather than %include'ing decode.h and potentially picking up new
(unsupported) functions, explicitly list the desired functions as with
encode.
Reorganize a bit to contain most of the language specific additions to
one area. This fixes the visibility of the wrap_* functions in java. The
method modifiers need to come before the function prototypes.

Change-Id: I595df4d1a60edcb263923b5a2621879d3b6233cf
2011-06-17 14:32:15 -07:00
pascal massimino
7fc7e0d9eb Merge "swig/java: basic encode support" 2011-06-16 13:45:50 -07:00
Pascal Massimino
3be57b166f fix MSVC compile for WEBP_EXPERIMENTAL_FEATURES
patch by Ismail Keskin (iskeskin at gmail dot com)
fixes issue #82
2011-06-16 13:33:07 -07:00
James Zern
40a7e347ff dec/dsp: disable sse2 for Visual Studio builds
This is a temporary workaround for issue #80.

Currently dec/dsp_sse2 is not included in Makefile.vc. Simply adding it
will cause a build error, however, as cl does not support aligning
function parameters producing, e.g.,
src\dec\dsp_sse2.c(228) : error C2719: 'q1': formal parameter with
__declspec(align('16')) won't be aligned

Change-Id: Id29e6802dd29110e59c4f6d13ffa5d4793c750a0
2011-06-15 13:53:14 -07:00
Pascal Massimino
e4d540c842 add SSE2 code for transform
Pretty similar to the encoder's version
3% faster decoding on average

patch by Christian Duvivier (cduvivier at google dot com)
2011-06-15 10:51:31 -07:00
James Zern
54f2170a15 swig/java: basic encode support
Wrap WebPEncode???* to provide an interface similar to decode.
As only WebPGetEncoderVersion is wrapped directly from encode.h avoid
including it in the swig file to reduce %ignore's.
This change also removes unnecessary incremental decoding related enums.

Change-Id: I0b5424026aa6ae012c6a29ad2f2301c2681ca301
2011-06-15 10:37:53 -07:00
Pascal Massimino
c5d4584b2c call function pointers instead of C-version
will potentially call SSE2 version instead of the plain-C one
catch by Christian Duvivier (cduvivier at google dot com)
2011-06-14 18:57:50 -07:00
pascal massimino
ea43f045b5 Merge "configure: mingw32 targets: test for WIC support" 2011-06-10 16:11:51 -07:00
Pascal Massimino
a11009d7fc SSE2 version of simple in-loop filtering
~10% faster decoding

Patch by Somnath Banerjee (somnath at google dot com)

Change-Id: I200db408272b4f61cda9d9261d2d4370a698d6c4
2011-06-10 15:10:18 -07:00
Pascal Massimino
42548da9e3 shave one unneeded filter-cache line
There was 1 unneeded sample line allocated for the filter cache in the case of simple filtering.
+ Add an explaining comment.

Change-Id: I775a596c8b8643e773e0eade8aa341dc23fb290f
2011-06-09 12:08:08 -07:00
James Zern
31f9dc6fdd configure: mingw32 targets: test for WIC support
Replace usage of _WIN32 in examples as this does not guarantee the
presence of wincodec.h.
mingw-w64 notably includes wincodec.h, though other releases do not.

Under cygwin the following can be used for a WIC enabled binary:
./configure --target=i686-pc-mingw32 CC=i686-w64-mingw32-gcc.exe

Change-Id: Ica6a714c3356a8eaf88486a1c3f5aa6adde394c0
2011-06-08 11:32:41 -07:00
Pascal Massimino
1955969925 Merge "split expression in two." 2011-06-08 09:08:54 -07:00
Pascal Massimino
415dbe4625 split expression in two.
makes order of evaluation strictly defined.
(cf http://en.wikipedia.org/wiki/Sequence_point)

Suggestion by mr dot gnu dot jr at gmail dot com
2011-06-08 09:06:03 -07:00
James Zern
e29072a8ce configure: test for zlib only w/--enable-experimental
Only builds with --enable-experimental require zlib currently.
A base install of mingw will not include the development headers and
library. libwebp itself will now build in such environments.
Additionally, remove -lz from **/Makefile.am, -lz will be added to LIBS
by AC_CHECK_LIB when necessary.

Change-Id: Iae8319cdf00162ecb7ed44661c02f40beb34f155
2011-06-07 14:12:53 -07:00
James Zern
b2b0090b4c Simplify Visual Studio ifdefs
Use _MSC_VER as the intrinsics compile without /arch:SSE2 on x86.
Also avoids applying the same flag to all files which defeated the
purpose of the runtime cpu-detection.

Thanks to Frank B. for the suggestion!

Change-Id: Iae9933a3cee704e663d9bbd53d0fa68e8c025425
2011-06-03 11:40:15 -07:00
Pascal Massimino
ca7a2fd66d Add error reporting from encoding failures.
picture->error_code can be looked up for finer error diagnosis.
Added readable error messages to cwebp too.

Should close bug #75 (http://code.google.com/p/webp/issues/detail?id=75)

Change-Id: I8889d06642d90702f698cd5c27441a058ddb3636
2011-06-02 07:06:39 -07:00
pascal massimino
6c9405dbfb Merge "Makefile.vc: require CFG with clean target" 2011-06-01 17:04:35 -07:00
James Zern
0424ecd996 Makefile.vc: require CFG with clean target
Allows clean to target a specific CFG output directory which has the
side benefit of supporting out-of-tree builds.
Earlier versions would clean the entire source tree possibly removing
multiple builds.

Change-Id: I63c0f32f73c0035f2b7fae0a88c02de3805d264b
2011-06-01 15:55:22 -07:00
James Zern
003417c7c7 Enable SSE2 for Visual Studio builds
Based on the remnants of change #2273.
Adds an auto-detect for ARCH based on the environment.

Change-Id: I4644eae7509f3982a8b385b49beac03675a2e0e8
2011-06-01 12:57:18 -07:00
Pascal Massimino
af10db4aa4 little speed up for VP8BitUpdate()
1% faster on average

Patch by Somnath Banerjee (somnath at google dot com)

Change-Id: I44cbc125024d3b7ba8621643e9161b72f0eac281
2011-05-29 22:35:44 -07:00
Pascal Massimino
e71418f899 more MSVC files to ignore
original patch by Vladimir Panteleev (vladimir at thecybershadow dot net)
2011-05-27 10:07:51 -07:00
Pascal Massimino
46d90363af cosmetics 2011-05-27 10:04:36 -07:00
Pascal Massimino
edf59ab320 typo fix 2011-05-27 09:55:47 -07:00
Mikolaj Zalewski
72229f5f34 Add support for x64 and SSE2 builds under Windows. 2011-05-26 16:50:46 +02:00
Pascal Massimino
92e5c6e1d4 VP8GetInfo() + WebPResetDecParams()
- add an internal VP8GetInfo() to parse header
- add WebPResetDecParams() for proper initialization

Change-Id: Ic39ea634d1d8016d25bdcfef2cb0d00b6dad83e9
2011-05-19 19:17:05 -07:00
Pascal Massimino
416b7a6b95 raise the fixed-point precision for the rescaler
for super-large upscaling factor (32x and up), 20bits was not enough.

Change-Id: I7b0d1975d0609948d464cfc3aeff1a70df16dc57
2011-05-16 17:12:37 -07:00
Pascal Massimino
aa87e4e063 fix alignment
Change-Id: I72d2c7a0faf240d3cada1b6fb86ffb32bead3eb2
2011-05-16 14:39:47 -07:00
Pascal Massimino
eb66670c6f disable WEBP_EXPERIMENTAL_FEATURES
Change-Id: I39caa72d261a0fd668910b1bfe067489c48a2de1
2011-05-09 12:10:28 -07:00
Pascal Massimino
c5ae7f653a typo fix: USE_ => WEBP_
Change-Id: I8d31320ff8a28bc89b943606a695421f9eda692d
2011-05-06 20:09:41 -07:00
James Zern
d041efae00 swig: add libwebp.jar/libwebp_java_wrap.c
for the swig-impaired

Change-Id: I10711c057c11bd9e59bc382fe3c6337883f5c4a6
2011-05-06 18:56:01 -07:00
James Zern
f6fb3877ed add swig interface
Currently only supports a subset of decode functions and likely only
works fully for java.

For java bindings:
The generated java source can be compiled and the class files added to
libwebp.jar.
The generated jni source can be compiled to, e.g., libwebp_jni.so, which
can then be loaded via System.loadLibrary("webp_jni").

Change-Id: I8225933cbaf85c9cfa4b78c2e5f167cee8b62408
2011-05-06 16:41:00 -07:00
Pascal Massimino
e9273902f1 align buffer for double too
sometimes, gcc inserts sse2 storeu instructions (like in VP8InitFilter())
with alignment requirements.

Bug was visible 'sometimes' in non-debug mode, when trying to use -af.

Change-Id: If3ec282bbbb9f9d0d33ca4b2c4bed46cd26fe495
2011-05-05 19:32:38 -07:00
Pascal Massimino
842c009b38 fix -strong option
was checking the presence of a useless second argument

Change-Id: I2c524a79fc7317cb8b6146c10265ee9462cd1958
2011-05-05 18:10:08 -07:00
Pascal Massimino
d0a7038792 Merge "cosmetics" 2011-05-05 16:47:19 -07:00
Pascal Massimino
fc0a02e55d fix the dichotomy loop
we were reading past the end of the dqs[] array.

reported by Mathias Schindler (on cygwin only)
http://code.google.com/p/webp/issues/detail?id=71

Change-Id: Ib38c4c139e3cac3e8915626d63e16b403d6bbd63
2011-05-05 16:35:12 -07:00
Pascal Massimino
38369c03e0 cosmetics
Change-Id: I39c3bc671ab427dcb3e23aedee7fe8d81f4cb860
2011-05-04 23:25:40 -07:00
Pascal Massimino
8dfc4c6f17 factorize and unify GetAlpha() between the C and SSE2 version
patch by Christian Duvivier (cduvivier at google dot com)

Change-Id: I47ac75010aa4036cf09f13d23043e654c4966a00
2011-05-04 17:02:35 -07:00
Pascal Massimino
6d0e66c23e prepare experimentation with yuv444 / 422
+ add a simple rescaling function: WebPPictureRescale() for encoding
+ clean-up the memory management around the alpha plane
+ fix some includes path by using "../webp/xxx.h" instead of "webp/xxx.h"

New flags for 'cwebp':
 -resize <width> <height>
 -444  (no effect)
 -422  (no effect)
 -400

Change-Id: I25a95f901493f939c2dd789e658493b83bd1abfa
2011-05-04 15:41:08 -07:00
Pascal Massimino
79cc49f5eb add a --enable-experimental option to './configure'
This will enable USE_EXPERIMENTAL_FEATURES instead of having a header-level #define

Change-Id: I860b007f64220ddf92d0becd18f7d100c718f8d1
2011-05-04 15:17:54 -07:00
Pascal Massimino
d757523889 sse2 version of CollectHistogram()
~3% faster encoding.

Patch by Christian Duvivier (cduvivier at google dot com)

Change-Id: I8c11d63d0cffb35e145fe0ea74cb66a53f4950d9
2011-04-28 17:00:57 -07:00
Pascal Massimino
c1c728d617 add an extra #ifdef WEBP_EXPERIMENTAL_FEATURES to avoid 'unused variable' warning
Change-Id: I8a42781eab1fad75e13cb85ac3f768d2405726e0
2011-04-28 16:31:14 -07:00
Pascal Massimino
60c61d2d54 always call VP*EncDeleteAlpha() unconditionally, for simplicity
Change-Id: I145f94adf6ea11618170c0955e6ef2fa60756f1f
2011-04-28 16:30:48 -07:00
Pascal Massimino
0f8c63849d simply don't call WriteExtensions() if WEBP_EXPERIMENTAL_FEATURES is not defined
Change-Id: I81867dcc505376c800ba65d81f6a50e02161e707
2011-04-28 16:03:34 -07:00
Pascal Massimino
47c661d50f rename swap -> swap_rb
Change-Id: I054a4517376a027611cffb2484b22248adae2b77
2011-04-28 16:01:46 -07:00
Pascal Massimino
10d55bbb06 move chunk[] declaration out of the for() loop
Change-Id: I10df4b5d6d186bae2b9b1a174aab13c401c54b01
2011-04-28 15:52:44 -07:00
Pascal Massimino
517cec21b9 fix indentation
Change-Id: I868ec38999dc5249e9b93048049dd51422113677
2011-04-28 15:52:06 -07:00
Pascal Massimino
f7d9e261c5 fix merge problems
Change-Id: I6d0763734139d2852896872c80c5e7fa7889945a
2011-04-26 11:02:38 -07:00
Pascal Massimino
8fd42b3a81 add a stride 'a_stride' for the alpha plane
(instead of hardcoding it to 'width')

Change-Id: Ibce97285bfe8ff4aea45b5797f53a53d3f60adab
2011-04-26 07:31:57 -07:00
Pascal Massimino
b8dcbf2f35 fix alpha-plane copy and crop methods
the alpha-plane was not reallocated properly.

Change-Id: I5db445c7086b3c7c5cf98631d714350119dd7c2a
2011-04-26 06:37:45 -07:00
Pascal Massimino
cdef89de0e fix some 'unused variable' warning
Change-Id: I1d2747480675b68f57d7ef7bf0b0010d267cb32b
2011-04-26 06:34:39 -07:00
Pascal Massimino
fb29c26201 SSE2 version of the fwd transform and the squared sum metric
average 10% faster encoding.

Patch by Christian Duvivier (cduvivier at google dot com)

Change-Id: Iff1bba402b280522af323f73e0c817a3d665354a
2011-04-26 00:03:55 -07:00
Pascal Massimino
2ab4b72f53 EXPERIMENTAL: add support for alpha channel
This is a (minor) bitstream change: if the 'color_space' bit is set to '1'
(which is normally an undefined/invalid behaviour), we add extra data at the
end of partition #0 (so-called 'extensions')

Namely, we add the size of the extension data as 3 bytes (little-endian),
followed by a set of bits telling which extensions we're incorporating.
The data then _precedes_ these trailing tags.

This is all experimental, and you'll need to have
'#define WEBP_EXPERIMENTAL_FEATURES' in webp/types.h to enable this code
(at your own risk! :))

Still, this hack produces almost-valid WebP files for decoders that don't
check this color_space bit. In particular, previous 'dwebp' (and for instance
Chrome) will recognize these files and decode them, but without the alpha
of course. Other decoders will just see random extra stuff at the end of
partition #0.

To experiment with the alpha-channel, you need to compile on Unix platform
and use PNGs for input/output.

If 'alpha.png' is a source with alpha channel, then you can try (on Unix):

  cwebp alpha.png -o alpha.webp
  dwebp alpha.webp -o test.png

cwebp now has a '-noalpha' flag to ignore any alpha information from the
source, if present.

More hacking and experimenting welcome!

Change-Id: I3c7b1fd8411c9e7a9f77690e898479ad85c52f3e
2011-04-25 23:29:39 -07:00
Pascal Massimino
cfbf88a6c4 add SSE2 functions. ~2x faster encoding on average.
For now, SSE2 functions are compiled a-minima: only on platforms
where __SSE2__ is defined. Let's later add some autoconf-based
config to enable/disable at will.

One can disable SSE2 at run-time by hooking-up VP8GetInfo.
There is a new option "-noasm" in cwebp for that.
Output should be binary the same between C and SSE2 version. If not,
that's a bug!

patch by Christian Duvivier (cduvivier at google dot com)

Change-Id: Iae006c3cdcb7e8280e846cedb94d239dab1e42ae
2011-04-22 15:34:23 -07:00
Pascal Massimino
e7ff3f9af6 merge two ITransforms together when applicable and change the TTransform
to return the sum directly.

output is bitwise the same, speed up 1-2%. This is preparatory to a
more efficient SSE2 implementation.

Change-Id: I0bcdf05808c93420fbe9dcb75e5e7e55a4ae5b89
2011-04-21 13:35:56 -07:00
Pascal Massimino
ca554137d2 fix WebPIDecGetRGB() to accept any RGB(A) mode, not just MODE_RGB
Change-Id: I8780582ecd0868c84e2b1310addebd6c8989e727
2011-04-20 08:42:08 -07:00
Pascal Massimino
8aa50efd38 fix some 'man' typos 2011-04-18 18:50:34 -07:00
77 changed files with 11561 additions and 1818 deletions

3
.gitattributes vendored Normal file
View File

@@ -0,0 +1,3 @@
.gitattributes export-ignore
.gitignore export-ignore
.mailmap export-ignore

4
.gitignore vendored
View File

@@ -8,6 +8,7 @@
/config.*
/configure
/depcomp
/dist
/install-sh
/libtool
/ltmain.sh
@@ -16,3 +17,6 @@
Makefile
Makefile.in
examples/[cd]webp
/output
*.idb
*.pdb

2
.mailmap Normal file
View File

@@ -0,0 +1,2 @@
Mikołaj Zalewski <mikolajz@google.com>
Pascal Massimino <pascal.massimino@gmail.com>

View File

@@ -1,7 +1,9 @@
Contributors:
- James Zern (jzern at google dot com)
- Jan Engelhardt (jengelh at medozas dot de)
- Mikołaj Zalewski (mikolajz at google dot com)
- Pascal Massimino (pascal dot massimino at gmail dot com)
- pierre.php@gmail.com
- Somnath Banerjee (somnath at google dot com)
- Pierre Joye (pierre dot php at gmail dot com)
- Somnath Banerjee (somnath dot banerjee at gmail dot com)
- Urvang Joshi (urvang at google dot com)
- Vikas Arora (vikasa at google dot com)

View File

@@ -2,29 +2,41 @@ LOCAL_PATH:= $(call my-dir)
include $(CLEAR_VARS)
LOCAL_SRC_FILES := \
src/dec/bits.c \
src/dec/alpha.c \
src/dec/dsp.c \
src/dec/frame.c \
src/dec/idec.c \
src/dec/layer.c \
src/dec/quant.c \
src/dec/tree.c \
src/dec/vp8.c \
src/dec/webp.c \
src/dec/yuv.c \
src/dec/io.c \
src/dec/buffer.c \
src/dsp/yuv.c \
src/dsp/upsampling.c \
src/dsp/cpu.c \
src/dsp/dec.c \
src/dsp/dec_neon.c \
src/dsp/enc.c \
src/enc/alpha.c \
src/enc/analysis.c \
src/enc/bit_writer.c \
src/enc/config.c \
src/enc/dsp.c \
src/enc/filter.c \
src/enc/frame.c \
src/enc/iterator.c \
src/enc/layer.c \
src/enc/picture.c \
src/enc/quant.c \
src/enc/syntax.c \
src/enc/tree.c \
src/enc/webpenc.c
src/utils/bit_reader.c \
src/utils/bit_writer.c \
src/utils/thread.c \
LOCAL_CFLAGS := -Wall -DANDROID -DHAVE_MALLOC_H -DHAVE_PTHREAD \
LOCAL_CFLAGS := -Wall -DANDROID -DHAVE_MALLOC_H -DHAVE_PTHREAD -DWEBP_USE_THREAD \
-finline-functions -frename-registers -ffast-math \
-s -fomit-frame-pointer -Isrc/webp

142
ChangeLog
View File

@@ -1,4 +1,144 @@
d3dc2d4 update ChangeLog
dfc9c1e Harmonize the dates
28ad70c Fix PNG decoding bug
846e93c Update AUTHORS & add .mailmap
563e52d cosmetics after '76036f5 Refactor decoder library'
76036f5 Refactor decoder library
377ef43 configure.ac: update AC_INIT params
7a8d876 use a user-visible MACRO for max width/height.
d4e9f55 NEON decode support in WebP
0ee683b update libtool version-info
fdbe02c windows: match _cond_destroy logic w/return variable name
206b686 README: correct advanced decode api pseudo-code
6a32a0f make VP8BitReader a typedef, for better re-use
b112e83 create a libwebputils under src/utils
ee697d9 harmonize the include guards and #endif comments
a1ec07a Fixing compiler error in non x86 arch.
dcfa509 Fixed recursive inclusion of bit_writer.h and vp8enci.h.
e06ac08 create a separate libwebpdsp under src/dsp
ebeb412 use unsigned int for bitfields
341cc56 make kNewRange a static array
227a91e README: minor wording update
05bd8e6 add man pages to dist
812dfa1 bump up versions in preparations for 0.1.3
a5b78c8 wrap alpha-related options under WEBP_EXPERIMENTAL_FEATURES flag
34dc790 regen ChangeLog for 0.1.3-rc2
7c43663 Silence some (more) Visual Studio warnings.
60306e8 add top-level gitattributes
2aa6b80 Slience some Visual Studio warnings.
4cbbb29 Merge "bump up version for next freeze"
a329167 bump up version for next freeze
c7e86ab cosmetics: fix comment line lengths
c9e037a makefile.unix: add simple dist target
87d58ce makefile.unix: rule maintenance
d477de7 mend
fac15ec Update NEWS & README for next release V0.1.3
6215595 Merge "add a -partition_limit option to limit the number of bits used by intra4x4"
3814b76 Merge "reorganize chunk-parsing code"
900286e add a -partition_limit option to limit the number of bits used by intra4x4
cd12b4b add the missing cost for I4/I16 mode selection
dfcc213 reorganize chunk-parsing code
3cf2030 initialize pointers to function within VP8DspInit()
d21b479 Merge "windows: add decode threading support"
473ae95 fix hang on thread creation failure
fccca42 windows: add decode threading support
a31f843 Use the exact PNG_INCLUDES/PNG_LIBS when testing for -lpng
ad9b45f Merge "Makefile.vc: rule maintenance"
565a2ca Makefile.vc: rule maintenance
2d0da68 makefile.unix: disable Wvla by default
fc7815d multi-thread decoding: ~25-30% faster
acd8ba4 io->teardown() was not always called upon error
c85527b Merge "Makefile.vc: add DLL configs"
e1e9be3 cosmetics: spelling/grammar in README and lib headers
b4d0ef8 Makefile.vc: add DLL configs
998754a remove unused nb_i4_ and nb_i16_ fields.
9f01ce3 rename WebPDecBuffer::memory -> private_memory
fb5d659 fix an overflow bug in LUT calculation
d646d5c swig: add WebPDecodeARGB
78aeed4 add missing WebPDecodeARGBInto() and switch ARGB4444 to RGBA4444 as was intended
cd7c529 explicitly mark library functions as extern
19db59f add support for RGB565, ARGB4444 and ARGB colorspace (decoder)
c915fb2 encoder speed-up: hardcode special level values
c558bda Rename and improve the API to retrieve decoded area
bf599d7 Merge "makefile.unix: disable -Wvla by default"
c9ea03d SSE2 version of strong filtering
993af3e makefile.unix: disable -Wvla by default
3827e1b Merge "examples: (windows/WIC) add alpha support"
e291fae SSE2 functions for the fancy upsampler.
a06bbe2 add WebPISetIOHooks() to set some custom hooks on the incremental decoder object.
7643a6f Merge "makefile.unix: use uname to detect OSX environment"
5142a0b export alpha channel (if present) when dumping to PGM format
14d5731 makefile.unix: use uname to detect OSX environment
0805706 examples: quiet warnings
3cfe088 examples: (windows/WIC) add alpha support
13ed94b add compile warning for variable-length-array
5a18eb1 Merge "add Advanced Decoding Interface"
5c4f27f add missing \n
f4c4e41 80 cols fix
d260310 add Advanced Decoding Interface
bd2f65f sse2 version of the complex filter
96ed9ce perform two idct transforms at a time when possible
01af7b6 use aligned stored
0e1d1fd Merge "Makefile.vc: add experimental target"
2a1292a Makefile.vc: add experimental target
23bf351 Enable decode SSE2 for Visual Studio
131a4b7 dec/dsp_sse2: fix visual studio compile
00d9d68 swig: file reorganization
7fc7e0d Merge "swig/java: basic encode support"
3be57b1 fix MSVC compile for WEBP_EXPERIMENTAL_FEATURES
40a7e34 dec/dsp: disable sse2 for Visual Studio builds
e4d540c add SSE2 code for transform
54f2170 swig/java: basic encode support
c5d4584 call function pointers instead of C-version
ea43f04 Merge "configure: mingw32 targets: test for WIC support"
a11009d SSE2 version of simple in-loop filtering
42548da shave one unneeded filter-cache line
31f9dc6 configure: mingw32 targets: test for WIC support
1955969 Merge "split expression in two."
415dbe4 split expression in two.
e29072a configure: test for zlib only w/--enable-experimental
b2b0090 Simplify Visual Studio ifdefs
ca7a2fd Add error reporting from encoding failures.
6c9405d Merge "Makefile.vc: require CFG with clean target"
0424ecd Makefile.vc: require CFG with clean target
003417c Enable SSE2 for Visual Studio builds
af10db4 little speed up for VP8BitUpdate()
e71418f more MSVC files to ignore
46d9036 cosmetics
edf59ab typo fix
72229f5 Add support for x64 and SSE2 builds under Windows.
92e5c6e VP8GetInfo() + WebPResetDecParams()
416b7a6 raise the fixed-point precision for the rescaler
aa87e4e fix alignment
eb66670 disable WEBP_EXPERIMENTAL_FEATURES
c5ae7f6 typo fix: USE_ => WEBP_
d041efa swig: add libwebp.jar/libwebp_java_wrap.c
f6fb387 add swig interface
e927390 align buffer for double too
842c009 fix -strong option
d0a7038 Merge "cosmetics"
fc0a02e fix the dichotomy loop
38369c0 cosmetics
8dfc4c6 factorize and unify GetAlpha() between the C and SSE2 version
6d0e66c prepare experimentation with yuv444 / 422
79cc49f add a --enable-experimental option to './configure'
d757523 sse2 version of CollectHistogram()
c1c728d add an extra #ifdef WEBP_EXPERIMENTAL_FEATURES to avoid 'unused variable' warning
60c61d2 always call VP*EncDeleteAlpha() unconditionnally, for simplicity
0f8c638 simply don't call WriteExtensions() if WEBP_EXPERIMENTAL_FEATURES is not defined
47c661d rename swap -> swap_rb
10d55bb move chunk[] declaration out of the for() loop
517cec2 fix indentation
f7d9e26 fix merge problems
8fd42b3 add a stride 'a_stride' for the alpha plane
b8dcbf2 fix alpha-plane copy and crop methods
cdef89d fix some 'unused variable' warning
fb29c26 SSE2 version of the fwd transform and the squared sum metric
2ab4b72 EXPERIMENTAL: add support for alpha channel
cfbf88a add SSE2 functions. ~2x faster encoding on average.
e7ff3f9 merge two ITransforms together when applicable and change the TTransform to return the sum directly.
ca55413 fix WebPIDecGetRGB() to accept any RGB(A) mode, not just MODE_RGB
8aa50ef fix some 'man' typos
d3f3bdd update ChangeLog (v0.1.2)
d7e9a69 update contributor list
261abb8 add a 'superclean' section
276ae82 Remove files not mean to be in git, and update .gitignore

View File

@@ -7,15 +7,26 @@ LIB_NAME_DEBUG = libwebp_a_debug
#
# Stem for DLL import libs
#
IMPLIB_NAME = libwebp
IMPLIB_NAME_DEBUG = libwepb_debug
IMPLIB_NAME = libwebp_dll
IMPLIB_NAME_DEBUG = libwebp_dll_debug
!IFNDEF DEP_PATH
DEPS_PATH = ../../deps
!ENDIF
!IFNDEF ARCH
ARCH = x86
!IF ! [ cl 2>&1 | find "x86" > NUL ]
ARCH = x86
!ELSE IF ! [ cl 2>&1 | find "x64" > NUL ]
ARCH = x64
!ELSE
!ERROR Unable to auto-detect toolchain architecture! \
If cl.exe is in your PATH rerun nmake with ARCH=<arch>.
!ENDIF
!ENDIF
!IF "$(ARCH)" == "x86"
PLATFORM_LDFLAGS = /SAFESEH
!ENDIF
#############################################################
@@ -24,14 +35,14 @@ ARCH = x86
MT = mt.exe
CCNODBG = cl.exe /nologo /O2 /DNDEBUG
CCDEBUG = cl.exe /nologo /Od /Gm /Zi /D_DEBUG /RTC1
CFLAGS = /Isrc /nologo /W3 /EHsc /DWIN32 /FD /c /GS /D_CRT_SECURE_NO_WARNINGS
LDFLAGS = /LARGEADDRESSAWARE /MANIFEST /NXCOMPAT /SAFESEH /DYNAMICBASE
CFLAGSLIB = /DLIBWEBP_STATICLIB
CFLAGS = /Isrc /nologo /W3 /EHsc /FD /c /GS
CFLAGS = $(CFLAGS) /DWIN32 /D_CRT_SECURE_NO_WARNINGS /DWIN32_LEAN_AND_MEAN
CFLAGS = $(CFLAGS) /DHAVE_WINCODEC_H /DWEBP_USE_THREAD
LDFLAGS = /LARGEADDRESSAWARE /MANIFEST /NXCOMPAT /DYNAMICBASE $(PLATFORM_LDFLAGS)
LNKDLL = link.exe /DLL
LNKLIB = link.exe /lib
LNKEXE = link.exe
LFLAGS = /nologo /machine:$(ARCH)
CFLAGS = $(CFLAGS)
CFGSET = FALSE
!IF "$(OBJDIR)" == ""
@@ -55,19 +66,35 @@ DIRLIB = $(DIRBASE)\lib
DIRINC = $(DIRBASE)\include
DIRBIN = $(DIRBASE)\bin
# release-static
# Target configuration
!IF "$(CFG)" == "release-static"
TARGET = $(LIB_NAME).lib
LNK = $(LNKLIB) /out:$(DIRLIB)\$(TARGET)
CC = $(CCNODBG) $(RTLIB) $(CFLAGSLIB)
CFGSET = TRUE
TARGET = $(LIB_NAME).lib
CC = $(CCNODBG)
STATICLIBBUILD = TRUE
!ELSE IF "$(CFG)" == "debug-static"
TARGET = $(LIB_NAME_DEBUG).lib
CC = $(CCDEBUG)
STATICLIBBUILD = TRUE
!ELSE IF "$(CFG)" == "release-dynamic"
TARGETDLL = $(LIB_NAME).dll
TARGET = $(IMPLIB_NAME).lib
CC = $(CCNODBG)
DLLBUILD = TRUE
!ELSE IF "$(CFG)" == "debug-dynamic"
TARGETDLL = $(LIB_NAME_DEBUG).dll
TARGET = $(IMPLIB_NAME_DEBUG).lib
CC = $(CCDEBUG)
DLLBUILD = TRUE
!ENDIF
# debug-static
!IF "$(CFG)" == "debug-static"
TARGET = $(LIB_NAME_DEBUG).lib
!IF "$(STATICLIBBUILD)" == "TRUE"
CC = $(CC) $(RTLIB)
LNK = $(LNKLIB) /out:$(DIRLIB)\$(TARGET)
CC = $(CCDEBUG) $(RTLIBD) $(CFLAGSLIB)
CFGSET = TRUE
!ELSE IF "$(DLLBUILD)" == "TRUE"
DLLINC = webp_dll.h
CC = $(CC) /I$(DIROBJ) /FI$(DLLINC) $(RTLIB) /DWEBP_DLL
LNK = $(LNKDLL) /out:$(DIRBIN)\$(TARGETDLL) /implib:$(DIRLIB)\$(TARGET)
CFGSET = TRUE
!ENDIF
@@ -75,13 +102,18 @@ CFGSET = TRUE
# Usage
#
!IF "$(CFGSET)" == "FALSE"
!MESSAGE Usage: nmake /f makefile.vc9 [CFG=<config>] [OBJDIR=<path>] [RTLIBCFG=<rtlib>] [<target>]
!MESSAGE Usage: nmake /f Makefile.vc [CFG=<config>] [OBJDIR=<path>] [RTLIBCFG=<rtlib>] [<target>]
!MESSAGE where <config> is one of:
!MESSAGE - release-static - release static library
!MESSAGE - debug-static - debug static library
!MESSAGE - (empty) - perform a clean
!MESSAGE - release-dynamic - release dynamic link library (DLL)
!MESSAGE - debug-dynamic - debug dynamic link library (DLL)
!MESSAGE <target> may be:
!MESSAGE - clean - perform a clean for CFG
!MESSAGE - experimental - build CFG with experimental
!MESSAGE . features enabled. Requires zlib.
!MESSAGE
!MESSAGE <rtlibcfg> controls the runtime library likage - can be 'static' or 'dynamic'.
!MESSAGE <rtlibcfg> controls the runtime library linkage - can be 'static' or 'dynamic'.
!MESSAGE <target> can be left blank in which case all is assumed
!MESSAGE <path> is the path where you like to build (obj, bins, etc.)
!MESSAGE default to ..\obj\
@@ -93,39 +125,26 @@ CFGSET = TRUE
!ENDIF
#######################
# Only the clean target can be used if a config was not provided.
# Rules
#
!IF "$(CFGSET)" == "FALSE"
!MESSAGE
!MESSAGE No configuration provided - performing a clean.
clean:
@-erase /s *.dll 2> NUL
@-erase /s *.exp 2> NUL
@-erase /s *.idb 2> NUL
@-erase /s *.lib 2> NUL
@-erase /s *.obj 2> NUL
@-erase /s *.pch 2> NUL
@-erase /s *.pdb 2> NUL
@-erase /s *.res 2> NUL
!ELSE
!IF "$(CFGSET)" == "TRUE"
# A config was provided, so the library can be built.
#
X_OBJS= \
$(DIROBJ)\dec\bits.obj \
$(DIROBJ)\dec\dsp.obj \
$(DIROBJ)\dec\frame.obj \
$(DIROBJ)\dec\quant.obj \
$(DIROBJ)\dec\tree.obj \
$(DIROBJ)\dec\vp8.obj \
$(DIROBJ)\dec\webp.obj \
$(DIROBJ)\dec\yuv.obj \
$(DIROBJ)\dec\io.obj \
$(DIROBJ)\dec\buffer.obj \
$(DIROBJ)\dec\idec.obj \
$(DIROBJ)\dec\alpha.obj \
$(DIROBJ)\dec\layer.obj \
$(DIROBJ)\enc\analysis.obj \
$(DIROBJ)\enc\bit_writer.obj \
$(DIROBJ)\enc\config.obj \
$(DIROBJ)\enc\cost.obj \
$(DIROBJ)\enc\dsp.obj \
$(DIROBJ)\enc\frame.obj \
$(DIROBJ)\enc\filter.obj \
$(DIROBJ)\enc\iterator.obj \
@@ -134,6 +153,19 @@ X_OBJS= \
$(DIROBJ)\enc\syntax.obj \
$(DIROBJ)\enc\tree.obj \
$(DIROBJ)\enc\webpenc.obj \
$(DIROBJ)\enc\alpha.obj \
$(DIROBJ)\enc\layer.obj \
$(DIROBJ)\dsp\enc.obj \
$(DIROBJ)\dsp\enc_sse2.obj \
$(DIROBJ)\dsp\upsampling.obj \
$(DIROBJ)\dsp\upsampling_sse2.obj \
$(DIROBJ)\dsp\dec.obj \
$(DIROBJ)\dsp\dec_sse2.obj \
$(DIROBJ)\dsp\cpu.obj \
$(DIROBJ)\dsp\yuv.obj \
$(DIROBJ)\utils\bit_reader.obj \
$(DIROBJ)\utils\bit_writer.obj \
$(DIROBJ)\utils\thread.obj \
$(RESOURCE)
EXAMPLES_OBJS = \
@@ -142,18 +174,25 @@ EXAMPLES_OBJS = \
all: $(DIRLIB)\$(TARGET) $(DIRBIN)\dwebp.exe $(DIRBIN)\cwebp.exe
# Additional include and library paths (for zlib) can be passed via the CL and
# LINK environment variables respectively:
# > set CL=/I\zlib\include
# > set LINK=\zlib\zlib.lib
# > nmake /f Makefile.vc CFG=release-static experimental
experimental:
$(MAKE) /f Makefile.vc \
CFG=$(CFG) CFLAGS="$(CFLAGS) /DWEBP_EXPERIMENTAL_FEATURES" /$(MAKEFLAGS)
$(DIRLIB)\$(TARGET): $(X_OBJS)
$(LNK) $(LFLAGS) $(X_OBJS)
-xcopy $(DIROBJ)\$(LIB_NAME).dll $(DIRBIN) /y
-xcopy $(DIROBJ)\$(LIB_NAME).lib $(DIRLIB) /y
-xcopy $(DIROBJ)\$(LIB_NAME_DEBUG).dll $(DIRBIN) /y
-xcopy $(DIROBJ)\$(LIB_NAME_DEBUG).lib $(DIRLIB) /y
-xcopy $(DIROBJ)\$(IMPLIB_NAME).lib $(DIRLIB) /y
-xcopy $(DIROBJ)\$(IMPLIB_NAME_DEBUG).lib $(DIRLIB) /y
-xcopy $(DIROBJ)\*.exp $(DIRLIB) /y
-xcopy $(DIROBJ)\*.pdb $(DIRLIB) /y
-xcopy $(DIROBJ)\*.pdb $(DIRLIB) /y
$(X_OBJS): $(DIROBJ)\enc $(DIROBJ)\dec $(DIRLIB) $(DIRINC) $(DIRBIN)
$(X_OBJS): $(DIROBJ)\enc $(DIROBJ)\dec $(DIROBJ)\dsp $(DIROBJ)\utils $(DIRLIB) $(DIRINC) $(DIRBIN)
!IF "$(DLLBUILD)" == "TRUE"
$(X_OBJS): $(DIROBJ)\$(DLLINC)
clean::
@-erase /s $(DIROBJ)\$(DLLINC) 2> NUL
!ENDIF
$(EXAMPLES_OBJS): $(DIROBJ)\examples $(DIRLIB)\$(TARGET)
@@ -166,6 +205,12 @@ $(DIROBJ)\examples:
$(DIROBJ)\dec:
@if not exist "$(DIROBJ)\dec" mkdir $(DIROBJ)\dec
$(DIROBJ)\dsp:
@if not exist "$(DIROBJ)\dsp" mkdir $(DIROBJ)\dsp
$(DIROBJ)\utils:
@if not exist "$(DIROBJ)\utils" mkdir $(DIROBJ)\utils
$(DIRLIB):
@if not exist "$(DIRLIB)" mkdir $(DIRLIB)
@@ -175,6 +220,13 @@ $(DIRINC):
$(DIRBIN):
@if not exist "$(DIRBIN)" mkdir $(DIRBIN)
# generate a helper include to define WEBP_EXTERN suitable for the DLL build
$(DIROBJ)\$(DLLINC):
@echo #ifndef WEBP_DLL_H_ > $@
@echo #define WEBP_DLL_H_ >> $@
@echo #define WEBP_EXTERN(type) __declspec(dllexport) type >> $@
@echo #endif /* WEBP_DLL_H_ */ >> $@
.SUFFIXES: .c .obj .res .exe
{examples}.c{$(DIROBJ)\examples}.obj:
$(CC) $(CFLAGS) /Fo"$@" $<
@@ -182,10 +234,24 @@ $(DIRBIN):
$(CC) $(CFLAGS) /Fo"$@" $<
{src\enc}.c{$(DIROBJ)\enc}.obj:
$(CC) $(CFLAGS) /Fo"$@" $<
{src\dsp}.c{$(DIROBJ)\dsp}.obj:
$(CC) $(CFLAGS) /Fo"$@" $<
{src\utils}.c{$(DIROBJ)\utils}.obj:
$(CC) $(CFLAGS) /Fo"$@" $<
{$(DIROBJ)\examples}.obj{$(DIRBIN)}.exe:
$(LNKEXE) $(LDFLAGS) /OUT:"$@" $< ole32.lib windowscodecs.lib shlwapi.lib $(DIRLIB)\$(TARGET)
$(MT) -manifest $@.manifest -outputresource:$@;1
del $@.manifest
clean::
@-erase /s $(DIROBJ)\*.dll 2> NUL
@-erase /s $(DIROBJ)\*.exp 2> NUL
@-erase /s $(DIROBJ)\*.idb 2> NUL
@-erase /s $(DIROBJ)\*.lib 2> NUL
@-erase /s $(DIROBJ)\*.obj 2> NUL
@-erase /s $(DIROBJ)\*.pch 2> NUL
@-erase /s $(DIROBJ)\*.pdb 2> NUL
@-erase /s $(DIROBJ)\*.res 2> NUL
!ENDIF # End of case where a config was provided.

9
NEWS
View File

@@ -1,3 +1,12 @@
- 9/19/11: version 0.1.3
* Advanced decoding APIs.
* On-the-fly cropping and rescaling of images.
* SSE2 instructions for decoding performance optimizations on x86 based platforms.
* Support Multi-threaded decoding.
* 40% improvement in Decoding performance.
* Add support for RGB565, RGBA4444 & ARGB image colorspace.
* Better handling of large picture encoding.
- 3/25/11: version 0.1.2
* Incremental decoding: picture can be decoded byte-by-byte if needs be.
* lot of bug-fixes, consolidation and stabilization

169
README
View File

@@ -4,12 +4,12 @@
\__\__/\____/\_____/__/ ____ ___
/ _/ / \ \ / _ \/ _/
/ \_/ / / \ \ __/ \__
\____/____/\_____/_____/____/v0.1.2
\____/____/\_____/_____/____/v0.1.3
Description:
============
WEBP codec: Library to encode and decode images in WebP format. This package
WebP codec: library to encode and decode images in WebP format. This package
contains the library that can be used in other programs to add WebP support,
as well as the command line tools 'cwebp' and 'dwebp'.
@@ -32,9 +32,11 @@ By running:
nmake /f Makefile.vc CFG=release-static RTLIBCFG=static OBJDIR=output
the directory output\release-static\x86\bin will contain the tools
cweb.exe and dweb.exe. The directory output\release-static\x86\lib will
contains the libwebp static library.
the directory output\release-static\(x64|x86)\bin will contain the tools
cwebp.exe and dwebp.exe. The directory output\release-static\(x64|x86)\lib will
contain the libwebp static library.
The target architecture (x86/x64) is detected by Makefile.vc from the Visual
Studio compiler (cl.exe) available in the system path.
Unix build using makefile.unix:
-------------------------------
@@ -56,9 +58,6 @@ Using autoconf tools:
make
make install
Note: In case './configure' step fails, try generating configure & appropriate
Makefile(s) via command 'aclocal && autoconf && automake -a -c;'.
should be all you need to have the following files
/usr/local/include/webp/decode.h
@@ -73,9 +72,37 @@ installed.
Note: The encoding and decoding libraries are compiled separately
(as src/dec/libwebpdecode.* and src/enc/libwebpencode.*). They
can be installed independently using a minor modifications in the
can be installed independently using a minor modification in the
corresponding Makefile.am configure files (see comments there).
SWIG bindings:
--------------
To generate language bindings from swig/libwebp.i swig-1.3
(http://www.swig.org) is required. 2.0 may work, but has not been tested.
Currently the following functions are mapped:
Decode:
WebPGetDecoderVersion
WebPGetInfo
WebPDecodeRGB
WebPDecodeRGBA
WebPDecodeARGB
WebPDecodeBGR
WebPDecodeBGRA
Encode:
WebPGetEncoderVersion
WebPEncodeRGB
WebPEncodeRGBA
WebPEncodeBGR
WebPEncodeBGRA
Java bindings:
To build the swig-generated JNI wrapper code at least JDK-1.5 (or equivalent)
is necessary for enum support. The output is intended to be a shared object /
DLL that can be loaded via System.loadLibrary("webp_jni").
Encoding tool:
==============
@@ -84,7 +111,7 @@ decoding (dwebp) images.
The easiest use should look like:
cwebp input.png -q 80 -o output.webp
which will convert the input PNG or JPEG file to a WebP one using a
which will convert the input PNG or JPEG file to a WebP file using a
quality factor of 80 on a 0->100 scale (0 being the lowest quality,
100 being the best; the default value is 75).
@@ -112,14 +139,21 @@ options:
-f <int> ............... filter strength (0=off..100)
-sharpness <int> ....... filter sharpness (0:most .. 7:least sharp)
-strong ................ use strong filter instead of simple.
-partition_limit <int> . limit quality to fit the 512k limit on
the first partition (0=no degradation ... 100=full)
-alpha_comp <int> ...... set the transparency-compression
-noalpha ............... discard any transparency information.
-pass <int> ............ analysis pass number (1..10)
-partitions <int> ...... number of partitions to use (0..3)
-crop <x> <y> <w> <h> .. crop picture with the given rectangle
-resize <w> <h> ........ resize picture (after any cropping)
-map <int> ............. print map of extra info.
-d <file.pgm> .......... dump the compressed output (PGM file).
-short ................. condense printed message
-quiet ................. don't print anything.
-version ............... print version number and exit.
-noasm ................. disable all assembly optimizations.
-v ..................... verbose, e.g. print encoding/decoding times
Experimental Options:
@@ -137,7 +171,7 @@ visual quality are:
-m
Namely:
* 'preset' will set up a default encoding configuration targetting a
* 'preset' will set up a default encoding configuration targeting a
particular type of input. It should appear first in the list of options,
so that subsequent options can take effect on top of this preset.
Default value is 'default'.
@@ -161,7 +195,7 @@ Namely:
Decoding tool:
==============
There is a decoding sample code as examples/dwebp.c which will take
There is a decoding sample in examples/dwebp.c which will take
a .webp file and decode it to a PNG image file (amongst other formats).
This is simply to demonstrate the use of the API. You can verify the
file test.webp decodes to exactly the same as test_ref.ppm by using:
@@ -170,9 +204,29 @@ file test.webp decodes to exactly the same as test_ref.ppm by using:
./dwebp test.webp -ppm -o test.ppm
diff test.ppm test_ref.ppm
The full list of options is available using -h:
> dwebp -h
Usage: dwebp in_file [options] [-o out_file]
Decodes the WebP image file to PNG format [Default]
Use following options to convert into alternate image formats:
-ppm ......... save the raw RGB samples as color PPM
-pgm ......... save the raw YUV samples as a grayscale PGM
file with IMC4 layout.
Other options are:
-version .... print version number and exit.
-nofancy ..... don't use the fancy YUV420 upscaler.
-nofilter .... disable in-loop filtering.
-mt .......... use multi-threading
-crop <x> <y> <w> <h> ... crop output with the given rectangle
-scale <w> <h> .......... scale the output (*after* any cropping)
-h ....... this help message.
-v ....... verbose (e.g. print encoding/decoding times)
-noasm ....... disable all assembly optimizations.
Encoding API:
===========
=============
The main encoding functions are available in the header src/webp/encode.h
The ready-to-use ones are:
@@ -188,10 +242,12 @@ size_t WebPEncodeBGRA(const uint8_t* bgra, int width, int height, int stride,
They will convert raw RGB samples to a WebP data. The only control supplied
is the quality factor.
Advanced encoding API:
----------------------
A more advanced API is based on the WebPConfig and WebPPicture structures.
WebPConfig contains the encoding settings and is not tied a to a particular
WebPConfig contains the encoding settings and is not tied to a particular
picture.
WebPPicture contains input data, on which some WebPConfig will be used for
compression.
@@ -210,7 +266,7 @@ The encoding flow looks like:
// ... additional tuning
config.sns_strength = 90;
config.filter_sharpness = 6;
config_error = WebPValidateConfig(&config); // not mandartory, but useful
config_error = WebPValidateConfig(&config); // not mandatory, but useful
// Setup the input data
WebPPicture pic;
@@ -223,14 +279,13 @@ The encoding flow looks like:
if (!WebPPictureAllocate(&pic)) {
return 0; // memory error
}
// add that point, 'pic' has been initialized as a container,
// at this point, 'pic' has been initialized as a container,
// and can receive the Y/U/V samples.
// Alternatively, one could use ready-made import functions like
// WebPPictureImportRGB(), which will take care of memory allocation.
// In any case, past this point, one will have to call
// WebPPictureFree(&pic) to reclaim memory.
// Set up a byte-output write method. WebPMemoryWriter, for instance.
WebPMemoryWriter wrt;
pic.writer = MyFileWriter;
@@ -238,14 +293,13 @@ The encoding flow looks like:
// initialize 'wrt' here...
// Compress!
int ok = WebPEncode(&config, &pic); // ok = 0 => error occured!
int ok = WebPEncode(&config, &pic); // ok = 0 => error occurred!
WebPPictureFree(&pic); // must be called independently of the 'ok' result.
// output data should have been handled by the writer at that point.
-------------------------------------- END PSEUDO EXAMPLE
Decoding API:
=============
@@ -256,9 +310,9 @@ uint8_t* WebPDecodeRGB(const uint8_t* data, uint32_t data_size,
int *width, int *height);
Please have a look at the file src/webp/decode.h for the details.
There are variants for decoding in BGR/RGBA/BGRA order, along with decoding to
raw Y'CbCr samples. One can also decode the image directly into a pre-allocated
buffer.
There are variants for decoding in BGR/RGBA/ARGB/BGRA order, along with
decoding to raw Y'CbCr samples. One can also decode the image directly into a
pre-allocated buffer.
To detect a WebP file and gather picture's dimensions, the function:
int WebPGetInfo(const uint8_t* data, uint32_t data_size,
@@ -291,14 +345,14 @@ or by just mentioning the new size of the transmitted data:
WebPIUpdate(idec, buffer, size_of_transmitted_buffer);
Note that 'buffer' can be modified between calls to WebPIUpdate, in
particular when the buffer is resized to accomodate larger data.
particular when the buffer is resized to accommodate larger data.
These functions will return the decoding status: either VP8_STATUS_SUSPENDED if
decoding is not finished yet, or VP8_STATUS_OK when decoding is done.
Any other status is an error condition.
The idec object must always be released (even upon an error condition)
by calling: WebPDelete(idec)
by calling: WebPIDelete(idec).
To retrieve partially decoded picture samples, one must use the corresponding
method: WebPIDecGetRGB or WebPIDecGetYUV.
@@ -310,6 +364,72 @@ WebPINewRGB() or WebPINewYUV().
Please have a look at the src/webp/decode.h header for further details.
Advanced Decoding API:
======================
WebP decoding supports an advanced API which provides on-the-fly cropping and
rescaling, something of great usefulness on memory-constrained environments like
mobile phones. Basically, the memory usage will scale with the output's size,
not the input's, when one only needs a quick preview or a zoomed in portion of
an otherwise too-large picture. Some CPU can be saved too, incidentally.
-------------------------------------- BEGIN PSEUDO EXAMPLE
// A) Init a configuration object
WebPDecoderConfig config;
CHECK(WebPInitDecoderConfig(&config));
// B) optional: retrieve the bitstream's features.
CHECK(WebPGetFeatures(data, data_size, &config.input) == VP8_STATUS_OK);
// C) Adjust 'config' options, if needed
config.options.no_fancy_upsampling = 1;
config.options.use_scaling = 1;
config.options.scaled_width = scaledWidth();
config.options.scaled_height = scaledHeight();
// etc.
// D) Set the 'config' output options to select the output colorspace.
// Optionally the external image decode buffer can also be specified.
config.output.colorspace = MODE_BGRA;
// Optionally, the config.output can be pointed to an external buffer as
// well for decoding the image. This externally supplied memory buffer
// should be big enough to store the decoded picture.
config.output.u.RGBA.rgba = (uint8_t*) memory_buffer;
config.output.u.RGBA.stride = scanline_stride;
config.output.u.RGBA.size = total_size_of_the_memory_buffer;
config.output.is_external_memory = 1;
// E) Decode the WebP image. There are two variants w.r.t decoding image.
// The first one (E.1) decodes the full image and the second one (E.2) is
// used to incrementally decode the image using small input buffers.
// Any one of these steps can be used to decode the WebP image.
// E.1) Decode full image.
CHECK(WebPDecode(data, data_size, &config) == VP8_STATUS_OK);
// E.2) Decode image incrementally.
WebPIDecoder* const idec = WebPIDecode(NULL, NULL, &config);
CHECK(idec != NULL);
while (bytes_remaining > 0) {
VP8StatusCode status = WebPIAppend(idec, input, bytes_read);
if (status == VP8_STATUS_OK || status == VP8_STATUS_SUSPENDED) {
bytes_remaining -= bytes_read;
} else {
break;
}
}
WebPIDelete(idec);
// F) Decoded image is now in config.output (and config.output.u.RGBA).
// It can be saved, displayed or otherwise processed.
// G) Reclaim memory allocated in config's object. It's safe to call
// this function even if the memory is external and wasn't allocated
// by WebPDecode().
WebPFreeDecBuffer(&config.output);
-------------------------------------- END PSEUDO EXAMPLE
Bugs:
=====
@@ -322,3 +442,4 @@ Discuss:
========
Email: webp-discuss@webmproject.org
Web: http://groups.google.com/a/webmproject.org/group/webp-discuss

View File

@@ -1,4 +1,7 @@
AC_INIT([webpdecode], [0.1])
AC_INIT([libwebp], [0.1.3],
[http://code.google.com/p/webp/issues],,
[http://code.google.com/speed/webp])
AC_CANONICAL_TARGET
AM_INIT_AUTOMAKE([-Wall foreign subdir-objects])
AC_PROG_LIBTOOL
AM_PROG_CC_C_O
@@ -8,6 +11,15 @@ AC_ARG_WITH([pkgconfigdir], AS_HELP_STRING([--with-pkgconfigdir=PATH],
[pkgconfigdir="$withval"], [pkgconfigdir='${libdir}/pkgconfig'])
AC_SUBST([pkgconfigdir])
dnl === Check libz is present
if test "$enable_experimental" = "yes"; then
AC_CHECK_HEADER(zlib.h,
AC_CHECK_LIB(z, gzsetparams,,AC_MSG_ERROR(zlib library not found)),
AC_MSG_ERROR(zlib not available - no zlib.h)
)
fi
dnl === check for PNG support ===
PNG_INCLUDES=""
@@ -28,6 +40,11 @@ AC_ARG_WITH(pnglibdir,
[--with-pnglibdir=DIR use PNG libraries from DIR],
[PNG_LIBS="-L$withval"])
SAVED_CPPFLAGS=$CPPFLAGS
SAVED_LIBS=$LIBS
CPPFLAGS="$PNG_INCLUDES $CPPFLAGS"
LIBS="$PNG_LIBS $LIBS"
AC_CHECK_HEADER(png.h,
AC_CHECK_LIB(png, main,
[PNG_LIBS="$PNG_LIBS -lpng"
@@ -41,6 +58,9 @@ AC_CHECK_HEADER(png.h,
AC_SUBST(PNG_LIBS)
AC_SUBST(PNG_INCLUDES)
CPPFLAGS=$SAVED_CPPFLAGS
LIBS=$SAVED_LIBS
dnl === check for JPEG support ===
JPEG_INCLUDES=""
@@ -52,6 +72,11 @@ AC_ARG_WITH(jpeglibdir,
[--with-jpeglibdir=DIR use JPEG libraries from DIR],
[JPEG_LIBS="-L$withval"])
SAVED_CPPFLAGS=$CPPFLAGS
SAVED_LIBS=$LIBS
CPPFLAGS="$JPEG_INCLUDES $CPPFLAGS"
LIBS="$JPEG_LIBS $LIBS"
AC_CHECK_HEADER(jpeglib.h,
AC_CHECK_LIB(jpeg, jpeg_set_defaults,
[JPEG_LIBS="$JPEG_LIBS -ljpeg"
@@ -65,9 +90,73 @@ AC_CHECK_HEADER(jpeglib.h,
AC_SUBST(JPEG_LIBS)
AC_SUBST(JPEG_INCLUDES)
CPPFLAGS=$SAVED_CPPFLAGS
LIBS=$SAVED_LIBS
dnl === check for WIC support ===
if test "$target_os" = "mingw32"; then
AC_CHECK_HEADERS([wincodec.h shlwapi.h windows.h])
if test "$ac_cv_header_wincodec_h" = "yes"; then
AC_MSG_CHECKING(for Windows Imaging Component support)
SAVED_LIBS=$LIBS
LIBS="-lshlwapi -lole32 $LIBS"
# match include structure from [cd]webp.c
wic_headers="
#define INITGUID
#define CINTERFACE
#define COBJMACROS
#define _WIN32_IE 0x500
#include <shlwapi.h>
#include <windows.h>
#include <wincodec.h>
"
# test for functions from each lib and the GUID is created properly
wic_main="
int main(void) {
CLSID_WICImagingFactory;
CoInitialize(NULL);
SHCreateStreamOnFile(NULL, 0, NULL);
return 0;
}
"
AC_LANG_PUSH(C)
AC_LINK_IFELSE(
[AC_LANG_SOURCE([
$wic_headers
$wic_main])],
[wic_support=yes],
[wic_support=no]
)
AC_LANG_POP
test "$wic_support" = "yes" || LIBS=$SAVED_LIBS
AC_MSG_RESULT(${wic_support-no})
fi
fi
dnl === If --enable-experimental is defined, add the flag WEBP_EXPERIMENTAL_FEATURES
USE_EXPERIMENTAL_CODE=""
AC_MSG_CHECKING(if --enable-experimental option is specified)
AC_ARG_ENABLE(experimental, [ --enable-experimental Activate experimental features])
if test "$enable_experimental" = "yes"; then
AC_DEFINE(EXPERIMENTAL,,[Enable experimental code])
USE_EXPERIMENTAL_CODE="-DWEBP_EXPERIMENTAL_FEATURES"
fi
AC_MSG_RESULT(${enable_experimental-no})
AC_SUBST(USE_EXPERIMENTAL_CODE)
dnl =========================
AC_CONFIG_MACRO_DIR([m4])
AC_CONFIG_HEADERS([config.h])
AC_CONFIG_FILES([Makefile src/Makefile man/Makefile examples/Makefile src/dec/Makefile src/enc/Makefile src/libwebp.pc])
AC_CONFIG_FILES([Makefile src/Makefile man/Makefile \
examples/Makefile src/dec/Makefile \
src/enc/Makefile src/dsp/Makefile \
src/utils/Makefile \
src/libwebp.pc])
AC_OUTPUT

View File

@@ -3,9 +3,9 @@ AM_CPPFLAGS = -I$(top_srcdir)/src
bin_PROGRAMS = dwebp cwebp
dwebp_SOURCES = dwebp.c stopwatch.h
dwebp_CPPFLAGS = $(AM_CPPFLAGS) $(PNG_INCLUDES) $(JPEG_INCLUDES)
dwebp_CPPFLAGS = $(AM_CPPFLAGS) $(PNG_INCLUDES) $(JPEG_INCLUDES) $(USE_EXPERIMENTAL_CODE)
dwebp_LDADD = ../src/libwebp.la $(PNG_LIBS) $(JPEG_LIBS)
cwebp_SOURCES = cwebp.c stopwatch.h
cwebp_CPPFLAGS = $(AM_CPPFLAGS) $(PNG_INCLUDES) $(JPEG_INCLUDES)
cwebp_CPPFLAGS = $(AM_CPPFLAGS) $(PNG_INCLUDES) $(JPEG_INCLUDES) $(USE_EXPERIMENTAL_CODE)
cwebp_LDADD = ../src/libwebp.la $(PNG_LIBS) $(JPEG_LIBS)

View File

@@ -14,6 +14,10 @@
#include <stdlib.h>
#include <string.h>
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#ifdef WEBP_HAVE_PNG
#include <png.h>
#endif
@@ -23,7 +27,10 @@
#include <jpeglib.h>
#endif
#ifdef _WIN32
#ifdef HAVE_WINCODEC_H
#ifdef __MINGW32__
#define INITGUID // Without this GUIDs are declared extern and fail to link
#endif
#define CINTERFACE
#define COBJMACROS
#define _WIN32_IE 0x500 // Workaround bug in shlwapi.h when compiling C++
@@ -31,13 +38,26 @@
#include <shlwapi.h>
#include <windows.h>
#include <wincodec.h>
#ifndef GUID_WICPixelFormat24bppRGB
// From Microsoft SDK 7.0a
DEFINE_GUID(GUID_WICPixelFormat24bppRGB,
0x6fddc324, 0x4e03, 0x4bfe, 0xb1, 0x85, 0x3d, 0x77, 0x76, 0x8d, 0xc9, 0x0d);
#endif
#ifndef GUID_WICPixelFormat32bppRGBA
DEFINE_GUID(GUID_WICPixelFormat32bppRGBA,
0xf5c7ad2d, 0x6a8d, 0x43dd, 0xa7, 0xa8, 0xa2, 0x99, 0x35, 0x26, 0x1a, 0xe9);
#endif
#endif /* HAVE_WINCODEC_H */
#include "webp/encode.h"
#include "stopwatch.h"
#ifndef WEBP_DLL
extern void* VP8GetCPUInfo; // opaque forward declaration.
#endif
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
static int verbose = 0;
@@ -68,7 +88,7 @@ static int ReadYUV(FILE* in_file, WebPPicture* const pic) {
return ok;
}
#ifdef _WIN32
#ifdef HAVE_WINCODEC_H
#define IFS(fn) \
do { \
@@ -95,7 +115,7 @@ static HRESULT OpenInputStream(const char* filename, IStream** ppStream) {
}
static HRESULT ReadPictureWithWIC(const char* filename,
WebPPicture* const pic) {
WebPPicture* const pic, int keep_alpha) {
HRESULT hr = S_OK;
IWICBitmapFrameDecode* pFrame = NULL;
IWICFormatConverter* pConverter = NULL;
@@ -105,6 +125,15 @@ static HRESULT ReadPictureWithWIC(const char* filename,
UINT frameCount = 0;
UINT width, height = 0;
BYTE* rgb = NULL;
WICPixelFormatGUID srcPixelFormat = { 0 };
GUID srcContainerFormat = { 0 };
const GUID* alphaContainers[] = {
&GUID_ContainerFormatBmp,
&GUID_ContainerFormatPng,
&GUID_ContainerFormatTiff
};
int has_alpha = 0;
int i, stride;
IFS(CoInitialize(NULL));
IFS(CoCreateInstance(MAKE_REFGUID(CLSID_WICImagingFactory), NULL,
@@ -125,28 +154,53 @@ static HRESULT ReadPictureWithWIC(const char* filename,
hr = E_FAIL;
}
IFS(IWICBitmapDecoder_GetFrame(pDecoder, 0, &pFrame));
IFS(IWICBitmapFrameDecode_GetPixelFormat(pFrame, &srcPixelFormat));
IFS(IWICBitmapDecoder_GetContainerFormat(pDecoder, &srcContainerFormat));
has_alpha = keep_alpha;
for (i = 0;
has_alpha && i < sizeof(alphaContainers)/sizeof(alphaContainers[0]);
++i) {
if (IsEqualGUID(&srcContainerFormat, alphaContainers[i])) {
has_alpha =
IsEqualGUID(&srcPixelFormat, &GUID_WICPixelFormat32bppRGBA) ||
IsEqualGUID(&srcPixelFormat, &GUID_WICPixelFormat32bppBGRA);
break;
}
}
// Prepare for pixel format conversion (if necessary).
IFS(IWICImagingFactory_CreateFormatConverter(pFactory, &pConverter));
IFS(IWICFormatConverter_Initialize(pConverter, (IWICBitmapSource*)pFrame,
MAKE_REFGUID(GUID_WICPixelFormat24bppRGB), WICBitmapDitherTypeNone,
has_alpha ? MAKE_REFGUID(GUID_WICPixelFormat32bppRGBA)
: MAKE_REFGUID(GUID_WICPixelFormat24bppRGB),
WICBitmapDitherTypeNone,
NULL, 0.0, WICBitmapPaletteTypeCustom));
// Decode.
IFS(IWICFormatConverter_GetSize(pConverter, &width, &height));
stride = (has_alpha ? 4 : 3) * width * sizeof(*rgb);
if (SUCCEEDED(hr)) {
rgb = (BYTE*)malloc(3 * width * height);
rgb = (BYTE*)malloc(stride * height);
if (rgb == NULL)
hr = E_OUTOFMEMORY;
}
IFS(IWICFormatConverter_CopyPixels(pConverter, NULL, 3 * width,
3 * width * height, rgb));
IFS(IWICFormatConverter_CopyPixels(pConverter, NULL, stride,
stride * height, rgb));
// WebP conversion.
if (SUCCEEDED(hr)) {
int ok;
#ifdef WEBP_EXPERIMENTAL_FEATURES
if (has_alpha) {
pic->colorspace |= WEBP_CSP_ALPHA_BIT;
}
#endif
pic->width = width;
pic->height = height;
if (!WebPPictureImportRGB(pic, rgb, 3 * width))
ok = has_alpha ? WebPPictureImportRGBA(pic, rgb, stride)
: WebPPictureImportRGB(pic, rgb, stride);
if (!ok)
hr = E_FAIL;
}
@@ -160,7 +214,8 @@ static HRESULT ReadPictureWithWIC(const char* filename,
return hr;
}
static int ReadPicture(const char* const filename, WebPPicture* const pic) {
static int ReadPicture(const char* const filename, WebPPicture* const pic,
int keep_alpha) {
int ok;
if (pic->width != 0 && pic->height != 0) {
// If image size is specified, infer it as YUV format.
@@ -173,7 +228,7 @@ static int ReadPicture(const char* const filename, WebPPicture* const pic) {
fclose(in_file);
} else {
// If no size specified, try to decode it using WIC.
ok = SUCCEEDED(ReadPictureWithWIC(filename, pic));
ok = SUCCEEDED(ReadPictureWithWIC(filename, pic, keep_alpha));
}
if (!ok) {
fprintf(stderr, "Error! Could not process file %s\n", filename);
@@ -181,7 +236,7 @@ static int ReadPicture(const char* const filename, WebPPicture* const pic) {
return ok;
}
#else // !_WIN32
#else // !HAVE_WINCODEC_H
#ifdef WEBP_HAVE_JPEG
struct my_error_mgr {
@@ -268,6 +323,8 @@ static int ReadJPEG(FILE* in_file, WebPPicture* const pic) {
#else
static int ReadJPEG(FILE* in_file, WebPPicture* const pic) {
(void)in_file;
(void)pic;
printf("JPEG support not compiled. Please install the libjpeg development "
"package before building.\n");
return 0;
@@ -280,10 +337,11 @@ static void PNGAPI error_function(png_structp png, png_const_charp dummy) {
longjmp(png_jmpbuf(png), 1);
}
static int ReadPNG(FILE* in_file, WebPPicture* const pic) {
static int ReadPNG(FILE* in_file, WebPPicture* const pic, int keep_alpha) {
png_structp png;
png_infop info;
int color_type, bit_depth, interlaced;
int has_alpha;
int num_passes;
int p;
int ok = 0;
@@ -324,13 +382,24 @@ static int ReadPNG(FILE* in_file, WebPPicture* const pic) {
}
if (png_get_valid(png, info, PNG_INFO_tRNS)) {
png_set_tRNS_to_alpha(png);
has_alpha = 1;
} else {
has_alpha = !!(color_type & PNG_COLOR_MASK_ALPHA);
}
// TODO(skal): Strip Alpha for now (till Alpha is supported).
png_set_strip_alpha(png);
if (!keep_alpha) {
png_set_strip_alpha(png);
has_alpha = 0;
}
#ifdef WEBP_EXPERIMENTAL_FEATURES
if (has_alpha) {
pic->colorspace |= WEBP_CSP_ALPHA_BIT;
}
#endif
num_passes = png_set_interlace_handling(png);
png_read_update_info(png, info);
stride = 3 * width * sizeof(*rgb);
stride = (has_alpha ? 4 : 3) * width * sizeof(*rgb);
rgb = (uint8_t*)malloc(stride * height);
if (rgb == NULL) goto Error;
for (p = 0; p < num_passes; ++p) {
@@ -344,14 +413,18 @@ static int ReadPNG(FILE* in_file, WebPPicture* const pic) {
pic->width = width;
pic->height = height;
ok = WebPPictureImportRGB(pic, rgb, stride);
ok = has_alpha ? WebPPictureImportRGBA(pic, rgb, stride)
: WebPPictureImportRGB(pic, rgb, stride);
free(rgb);
End:
return ok;
}
#else
static int ReadPNG(FILE* in_file, WebPPicture* const pic) {
static int ReadPNG(FILE* in_file, WebPPicture* const pic, int keep_alpha) {
(void)in_file;
(void)pic;
(void)keep_alpha;
printf("PNG support not compiled. Please install the libpng development "
"package before building.\n");
return 0;
@@ -383,7 +456,8 @@ static InputFileFormat GetImageType(FILE* in_file) {
return format;
}
static int ReadPicture(const char* const filename, WebPPicture* const pic) {
static int ReadPicture(const char* const filename, WebPPicture* const pic,
int keep_alpha) {
int ok = 0;
FILE* in_file = fopen(filename, "rb");
if (in_file == NULL) {
@@ -395,7 +469,7 @@ static int ReadPicture(const char* const filename, WebPPicture* const pic) {
// If no size specified, try to decode it as PNG/JPEG (as appropriate).
const InputFileFormat format = GetImageType(in_file);
if (format == PNG) {
ok = ReadPNG(in_file, pic);
ok = ReadPNG(in_file, pic, keep_alpha);
} else if (format == JPEG) {
ok = ReadJPEG(in_file, pic);
}
@@ -411,7 +485,7 @@ static int ReadPicture(const char* const filename, WebPPicture* const pic) {
return ok;
}
#endif // !_WIN32
#endif // !HAVE_WINCODEC_H
static void AllocExtraInfo(WebPPicture* const pic) {
const int mb_w = (pic->width + 15) / 16;
@@ -473,6 +547,14 @@ static void PrintExtraInfo(const WebPPicture* const pic, int short_output) {
100.f * stats->header_bytes[0] / stats->coded_size,
stats->header_bytes[1],
100.f * stats->header_bytes[1] / stats->coded_size);
if (stats->alpha_data_size) {
fprintf(stderr, " transparency: %6d\n",
stats->alpha_data_size);
}
if (stats->layer_data_size) {
fprintf(stderr, " enhancement: %6d\n",
stats->layer_data_size);
}
fprintf(stderr, " Residuals bytes "
"|segment 1|segment 2|segment 3"
"|segment 4| total\n");
@@ -519,7 +601,7 @@ static void PrintExtraInfo(const WebPPicture* const pic, int short_output) {
}
}
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
static int MyWriter(const uint8_t* data, size_t data_size,
const WebPPicture* const pic) {
@@ -533,7 +615,8 @@ static int DumpPicture(const WebPPicture* const picture, const char* PGM_name) {
const int uv_width = (picture->width + 1) / 2;
const int uv_height = (picture->height + 1) / 2;
const int stride = (picture->width + 1) & ~1;
const int height = picture->height + uv_height;
const int alpha_height = picture->a ? picture->height : 0;
const int height = picture->height + uv_height + alpha_height;
FILE* const f = fopen(PGM_name, "wb");
if (!f) return 0;
fprintf(f, "P5\n%d %d\n255\n", stride, height);
@@ -548,11 +631,16 @@ static int DumpPicture(const WebPPicture* const picture, const char* PGM_name) {
if (fwrite(picture->v + y * picture->uv_stride, uv_width, 1, f) != 1)
return 0;
}
for (y = 0; y < alpha_height; ++y) {
if (fwrite(picture->a + y * picture->a_stride, picture->width, 1, f) != 1)
return 0;
if (picture->width & 1) fputc(0, f); // pad
}
fclose(f);
return 1;
}
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
static void HelpShort(void) {
printf("Usage:\n\n");
@@ -567,7 +655,7 @@ static void HelpLong(void) {
printf(" cwebp [-preset <...>] [options] in_file [-o out_file]\n\n");
printf("If input size (-s) for an image is not specified, "
"it is assumed to be a PNG or JPEG file.\n");
#ifdef _WIN32
#ifdef HAVE_WINCODEC_H
printf("Windows builds can take as input any of the files handled by WIC\n");
#endif
printf("options:\n");
@@ -581,6 +669,8 @@ static void HelpLong(void) {
printf("\n");
printf(" -m <int> ............... compression method (0=fast, 6=slowest)\n");
printf(" -segments <int> ........ number of segments to use (1..4)\n");
printf(" -size <int> ............ Target size (in bytes)\n");
printf(" -psnr <float> .......... Target PSNR (in dB. typically: 42)\n");
printf("\n");
printf(" -s <int> <int> ......... Input size (width x height) for YUV\n");
printf(" -sns <int> ............. Spatial Noise Shaping (0:off, 100:max)\n");
@@ -588,26 +678,60 @@ static void HelpLong(void) {
printf(" -sharpness <int> ....... "
"filter sharpness (0:most .. 7:least sharp)\n");
printf(" -strong ................ use strong filter instead of simple.\n");
printf(" -partition_limit <int> . limit quality to fit the 512k limit on\n");
printf(" "
"the first partition (0=no degradation ... 100=full)\n");
#ifdef WEBP_EXPERIMENTAL_FEATURES
printf(" -alpha_comp <int> ...... set the transparency-compression\n");
printf(" -noalpha ............... discard any transparency information.\n");
#endif
printf(" -pass <int> ............ analysis pass number (1..10)\n");
printf(" -crop <x> <y> <w> <h> .. crop picture with the given rectangle\n");
printf(" -resize <w> <h> ........ resize picture (after any cropping)\n");
#ifdef WEBP_EXPERIMENTAL_FEATURES
printf(" -444 / -422 / -gray ..... Change colorspace\n");
#endif
printf(" -map <int> ............. print map of extra info.\n");
printf(" -d <file.pgm> .......... dump the compressed output (PGM file).\n");
printf("\n");
printf(" -short ................. condense printed message\n");
printf(" -quiet ................. don't print anything.\n");
printf(" -version ............... print version number and exit.\n");
#ifndef WEBP_DLL
printf(" -noasm ................. disable all assembly optimizations.\n");
#endif
printf(" -v ..................... verbose, e.g. print encoding/decoding "
"times\n");
printf("\n");
printf("Experimental Options:\n");
printf(" -size <int> ............ Target size (in bytes)\n");
printf(" -psnr <float> .......... Target PSNR (in dB. typically: 42)\n");
printf(" -af .................... auto-adjust filter strength.\n");
printf(" -pre <int> ............. pre-processing filter\n");
printf("\n");
}
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// Error messages
// Human-readable descriptions printed when WebPEncode() fails. The table is
// indexed directly by picture.error_code, so it must hold exactly one entry
// per error value, in order. Every entry needs its own trailing comma:
// adjacent string literals are silently concatenated into a single entry,
// which shortens the table and makes the last index read out of bounds.
static const char* const kErrorMessages[] = {
  "OK",
  "OUT_OF_MEMORY: Out of memory allocating objects",
  "BITSTREAM_OUT_OF_MEMORY: Out of memory re-allocating byte buffer",
  "NULL_PARAMETER: NULL parameter passed to function",
  "INVALID_CONFIGURATION: configuration is invalid",
  "BAD_DIMENSION: Bad picture dimension. Maximum width and height "
  "allowed is 16383 pixels.",
  "PARTITION0_OVERFLOW: Partition #0 is too big to fit 512k.\n"
  "To reduce the size of this partition, try using less segments "
  "with the -segments option, and eventually reduce the number of "
  "header bits using -partition_limit. More details are available "
  "in the manual (`man cwebp`)",
  "PARTITION_OVERFLOW: Partition is too big to fit 16M",
  "BAD_WRITE: Picture writer returned an I/O error",
  "FILE_TOO_BIG: File would be too big to fit in 4G"
};
//------------------------------------------------------------------------------
int main(int argc, const char *argv[]) {
const char *in_file = NULL, *out_file = NULL, *dump_file = NULL;
@@ -615,12 +739,18 @@ int main(int argc, const char *argv[]) {
int c;
int short_output = 0;
int quiet = 0;
int keep_alpha = 0;
int crop = 0, crop_x = 0, crop_y = 0, crop_w = 0, crop_h = 0;
int resize_w = 0, resize_h = 0;
WebPPicture picture;
WebPConfig config;
WebPAuxStats stats;
Stopwatch stop_watch;
#ifdef WEBP_EXPERIMENTAL_FEATURES
keep_alpha = 1;
#endif
if (!WebPPictureInit(&picture) || !WebPConfigInit(&config)) {
fprintf(stderr, "Error! Version mismatch!\n");
goto Error;
@@ -651,18 +781,18 @@ int main(int argc, const char *argv[]) {
} else if (!strcmp(argv[c], "-m") && c < argc - 1) {
config.method = strtol(argv[++c], NULL, 0);
} else if (!strcmp(argv[c], "-q") && c < argc - 1) {
config.quality = strtod(argv[++c], NULL);
config.quality = (float)strtod(argv[++c], NULL);
} else if (!strcmp(argv[c], "-size") && c < argc - 1) {
config.target_size = strtol(argv[++c], NULL, 0);
} else if (!strcmp(argv[c], "-psnr") && c < argc - 1) {
config.target_PSNR = strtod(argv[++c], NULL);
config.target_PSNR = (float)strtod(argv[++c], NULL);
} else if (!strcmp(argv[c], "-sns") && c < argc - 1) {
config.sns_strength = strtol(argv[++c], NULL, 0);
} else if (!strcmp(argv[c], "-f") && c < argc - 1) {
config.filter_strength = strtol(argv[++c], NULL, 0);
} else if (!strcmp(argv[c], "-af")) {
config.autofilter = 1;
} else if (!strcmp(argv[c], "-strong") && c < argc - 1) {
} else if (!strcmp(argv[c], "-strong")) {
config.filter_type = 1;
} else if (!strcmp(argv[c], "-sharpness") && c < argc - 1) {
config.filter_sharpness = strtol(argv[++c], NULL, 0);
@@ -672,14 +802,37 @@ int main(int argc, const char *argv[]) {
config.preprocessing = strtol(argv[++c], NULL, 0);
} else if (!strcmp(argv[c], "-segments") && c < argc - 1) {
config.segments = strtol(argv[++c], NULL, 0);
} else if (!strcmp(argv[c], "-partition_limit") && c < argc - 1) {
config.partition_limit = strtol(argv[++c], NULL, 0);
#ifdef WEBP_EXPERIMENTAL_FEATURES
} else if (!strcmp(argv[c], "-alpha_comp") && c < argc - 1) {
config.alpha_compression = strtol(argv[++c], NULL, 0);
} else if (!strcmp(argv[c], "-noalpha")) {
keep_alpha = 0;
#endif
} else if (!strcmp(argv[c], "-map") && c < argc - 1) {
picture.extra_info_type = strtol(argv[++c], NULL, 0);
#ifdef WEBP_EXPERIMENTAL_FEATURES
} else if (!strcmp(argv[c], "-444")) {
picture.colorspace = WEBP_YUV444;
} else if (!strcmp(argv[c], "-422")) {
picture.colorspace = WEBP_YUV422;
} else if (!strcmp(argv[c], "-gray")) {
picture.colorspace = WEBP_YUV400;
#endif
} else if (!strcmp(argv[c], "-crop") && c < argc - 4) {
crop = 1;
crop_x = strtol(argv[++c], NULL, 0);
crop_y = strtol(argv[++c], NULL, 0);
crop_w = strtol(argv[++c], NULL, 0);
crop_h = strtol(argv[++c], NULL, 0);
} else if (!strcmp(argv[c], "-resize") && c < argc - 2) {
resize_w = strtol(argv[++c], NULL, 0);
resize_h = strtol(argv[++c], NULL, 0);
#ifndef WEBP_DLL
} else if (!strcmp(argv[c], "-noasm")) {
VP8GetCPUInfo = NULL;
#endif
} else if (!strcmp(argv[c], "-version")) {
const int version = WebPGetEncoderVersion();
printf("%d.%d.%d\n",
@@ -727,9 +880,10 @@ int main(int argc, const char *argv[]) {
}
// Read the input
if (verbose)
if (verbose) {
StopwatchReadAndReset(&stop_watch);
if (!ReadPicture(in_file, &picture)) {
}
if (!ReadPicture(in_file, &picture, keep_alpha)) {
fprintf(stderr, "Error! Cannot read input picture\n");
goto Error;
}
@@ -761,15 +915,26 @@ int main(int argc, const char *argv[]) {
picture.stats = &stats;
// Compress
if (verbose)
if (verbose) {
StopwatchReadAndReset(&stop_watch);
}
if (crop != 0 && !WebPPictureCrop(&picture, crop_x, crop_y, crop_w, crop_h)) {
fprintf(stderr, "Error! Cannot crop picture\n");
goto Error;
}
if (picture.extra_info_type > 0) AllocExtraInfo(&picture);
if ((resize_w | resize_h) > 0) {
if (!WebPPictureRescale(&picture, resize_w, resize_h)) {
fprintf(stderr, "Error! Cannot resize picture\n");
goto Error;
}
}
if (picture.extra_info_type > 0) {
AllocExtraInfo(&picture);
}
if (!WebPEncode(&config, &picture)) {
fprintf(stderr, "Error! Cannot encode picture as WebP\n");
fprintf(stderr, "Error code: %d (%s)\n",
picture.error_code, kErrorMessages[picture.error_code]);
goto Error;
}
if (verbose) {
@@ -778,8 +943,12 @@ int main(int argc, const char *argv[]) {
}
// Write info
if (dump_file) DumpPicture(&picture, dump_file);
if (!quiet) PrintExtraInfo(&picture, short_output);
if (dump_file) {
DumpPicture(&picture, dump_file);
}
if (!quiet) {
PrintExtraInfo(&picture, short_output);
}
Error:
free(picture.extra_info);
@@ -791,4 +960,4 @@ int main(int argc, const char *argv[]) {
return 0;
}
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------

View File

@@ -5,8 +5,7 @@
// Additional IP Rights Grant: http://www.webmproject.org/license/additional/
// -----------------------------------------------------------------------------
//
// simple command-line example calling libwebpdecode to
// decode a WebP image into a PPM image.
// Command-line tool for decoding a WebP image
//
// Compile with: gcc -o dwebp dwebp.c -lwebpdecode
//
@@ -17,11 +16,18 @@
#include <stdlib.h>
#include <string.h>
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#ifdef WEBP_HAVE_PNG
#include <png.h>
#endif
#ifdef _WIN32
#ifdef HAVE_WINCODEC_H
#ifdef __MINGW32__
#define INITGUID // Without this GUIDs are declared extern and fail to link
#endif
#define CINTERFACE
#define COBJMACROS
#define _WIN32_IE 0x500 // Workaround bug in shlwapi.h when compiling C++
@@ -38,11 +44,22 @@
extern "C" {
#endif
//-----------------------------------------------------------------------------
static int verbose = 0;
#ifndef WEBP_DLL
extern void* VP8GetCPUInfo; // opaque forward declaration.
#endif
#ifdef _WIN32
//------------------------------------------------------------------------------
// Output types
// File formats the decoder can write. The numeric values are spelled out so
// that PNG is visibly the zero value.
typedef enum {
  PNG = 0,               // PNG image
  PPM = 1,               // color PPM holding the raw RGB samples
  PGM = 2,               // grayscale PGM holding the raw YUV samples
  ALPHA_PLANE_ONLY = 3   // alpha plane alone; this is for experimenting only
} OutputFileFormat;
#ifdef HAVE_WINCODEC_H
#define IFS(fn) \
do { \
@@ -60,7 +77,8 @@ static int verbose = 0;
#define MAKE_REFGUID(x) &(x)
#endif
static HRESULT CreateOutputStream(const char* out_file_name, IStream** ppStream) {
static HRESULT CreateOutputStream(const char* out_file_name,
IStream** ppStream) {
HRESULT hr = S_OK;
IFS(SHCreateStreamOnFileA(out_file_name, STGM_WRITE | STGM_CREATE, ppStream));
if (FAILED(hr))
@@ -70,13 +88,14 @@ static HRESULT CreateOutputStream(const char* out_file_name, IStream** ppStream)
static HRESULT WriteUsingWIC(const char* out_file_name, REFGUID container_guid,
unsigned char* rgb, int stride,
uint32_t width, uint32_t height) {
uint32_t width, uint32_t height, int has_alpha) {
HRESULT hr = S_OK;
IWICImagingFactory* pFactory = NULL;
IWICBitmapFrameEncode* pFrame = NULL;
IWICBitmapEncoder* pEncoder = NULL;
IStream* pStream = NULL;
GUID pixel_format = GUID_WICPixelFormat24bppBGR;
WICPixelFormatGUID pixel_format = has_alpha ? GUID_WICPixelFormat32bppBGRA
: GUID_WICPixelFormat24bppBGR;
IFS(CoInitialize(NULL));
IFS(CoCreateInstance(MAKE_REFGUID(CLSID_WICImagingFactory), NULL,
@@ -108,21 +127,31 @@ static HRESULT WriteUsingWIC(const char* out_file_name, REFGUID container_guid,
return hr;
}
static int WritePNG(const char* out_file_name, unsigned char* rgb, int stride,
uint32_t width, uint32_t height) {
static int WritePNG(const char* out_file_name,
const WebPDecBuffer* const buffer) {
const uint32_t width = buffer->width;
const uint32_t height = buffer->height;
unsigned char* const rgb = buffer->u.RGBA.rgba;
const int stride = buffer->u.RGBA.stride;
const int has_alpha = (buffer->colorspace == MODE_BGRA);
return SUCCEEDED(WriteUsingWIC(out_file_name,
MAKE_REFGUID(GUID_ContainerFormatPng), rgb, stride, width,
height));
height, has_alpha));
}
#elif defined(WEBP_HAVE_PNG) // !WIN32
#elif defined(WEBP_HAVE_PNG) // !HAVE_WINCODEC_H
static void PNGAPI error_function(png_structp png, png_const_charp dummy) {
(void)dummy; // remove variable-unused warning
longjmp(png_jmpbuf(png), 1);
}
static int WritePNG(FILE* out_file, unsigned char* rgb, int stride,
png_uint_32 width, png_uint_32 height) {
static int WritePNG(FILE* out_file, const WebPDecBuffer* const buffer) {
const uint32_t width = buffer->width;
const uint32_t height = buffer->height;
unsigned char* const rgb = buffer->u.RGBA.rgba;
const int stride = buffer->u.RGBA.stride;
const int has_alpha = (buffer->colorspace == MODE_RGBA);
png_structp png;
png_infop info;
png_uint_32 y;
@@ -142,7 +171,8 @@ static int WritePNG(FILE* out_file, unsigned char* rgb, int stride,
return 0;
}
png_init_io(png, out_file);
png_set_IHDR(png, info, width, height, 8, PNG_COLOR_TYPE_RGB,
png_set_IHDR(png, info, width, height, 8,
has_alpha ? PNG_COLOR_TYPE_RGBA : PNG_COLOR_TYPE_RGB,
PNG_INTERLACE_NONE, PNG_COMPRESSION_TYPE_DEFAULT,
PNG_FILTER_TYPE_DEFAULT);
png_write_info(png, info);
@@ -154,12 +184,13 @@ static int WritePNG(FILE* out_file, unsigned char* rgb, int stride,
png_destroy_write_struct(&png, &info);
return 1;
}
#else // !WIN32 && !WEBP_HAVE_PNG
#else // !HAVE_WINCODEC_H && !WEBP_HAVE_PNG
typedef uint32_t png_uint_32;
static int WritePNG(FILE* out_file, unsigned char* rgb, int stride,
png_uint_32 width, png_uint_32 height) {
static int WritePNG(FILE* out_file, const WebPDecBuffer* const buffer) {
(void)out_file;
(void)buffer;
printf("PNG support not compiled. Please install the libpng development "
"package before building.\n");
printf("You can run with -ppm flag to decode in PPM format.\n");
@@ -167,71 +198,172 @@ static int WritePNG(FILE* out_file, unsigned char* rgb, int stride,
}
#endif
static int WritePPM(FILE* fout, unsigned char* rgb,
uint32_t width, uint32_t height) {
static int WritePPM(FILE* fout, const WebPDecBuffer* const buffer) {
const uint32_t width = buffer->width;
const uint32_t height = buffer->height;
const unsigned char* const rgb = buffer->u.RGBA.rgba;
const int stride = buffer->u.RGBA.stride;
uint32_t y;
fprintf(fout, "P6\n%d %d\n255\n", width, height);
return (fwrite(rgb, width * height, 3, fout) == 3);
for (y = 0; y < height; ++y) {
if (fwrite(rgb + y * stride, width, 3, fout) != 3) {
return 0;
}
}
return 1;
}
static int WritePGM(FILE* fout,
unsigned char* y_plane, unsigned char *u, unsigned char* v,
int y_stride, int uv_stride,
uint32_t width, uint32_t height) {
// Writes the alpha plane of a decoded YUVA buffer to 'fout' as a binary PGM
// (P5) image: a text header followed by 'height' rows of 'width' bytes each.
// The alpha plane must be present (asserted).
// Returns 1 on success, or 0 as soon as one row cannot be written.
static int WriteAlphaPlane(FILE* fout, const WebPDecBuffer* const buffer) {
  const unsigned char* const alpha = buffer->u.YUVA.a;
  const int alpha_stride = buffer->u.YUVA.a_stride;
  const uint32_t num_cols = buffer->width;
  const uint32_t num_rows = buffer->height;
  uint32_t row = 0;

  assert(alpha != NULL);
  fprintf(fout, "P5\n%d %d\n255\n", num_cols, num_rows);
  while (row < num_rows) {
    // Rows are 'alpha_stride' bytes apart in memory; only the first
    // 'num_cols' bytes of each row are written out.
    const unsigned char* const src = alpha + row * alpha_stride;
    if (fwrite(src, num_cols, 1, fout) != 1) {
      return 0;
    }
    ++row;
  }
  return 1;
}
static int WritePGM(FILE* fout, const WebPDecBuffer* const buffer) {
const int width = buffer->width;
const int height = buffer->height;
const WebPYUVABuffer* const yuv = &buffer->u.YUVA;
// Save a grayscale PGM file using the IMC4 layout
// (http://www.fourcc.org/yuv.php#IMC4). This is a very
// convenient format for viewing the samples, esp. for
// odd dimensions.
int ok = 1;
unsigned int y;
const unsigned int uv_width = (width + 1) / 2;
const unsigned int uv_height = (height + 1) / 2;
const unsigned int out_stride = (width + 1) & ~1;
fprintf(fout, "P5\n%d %d\n255\n", out_stride, height + uv_height);
int y;
const int uv_width = (width + 1) / 2;
const int uv_height = (height + 1) / 2;
const int out_stride = (width + 1) & ~1;
const int a_height = yuv->a ? height : 0;
fprintf(fout, "P5\n%d %d\n255\n", out_stride, height + uv_height + a_height);
for (y = 0; ok && y < height; ++y) {
ok &= (fwrite(y_plane + y * y_stride, width, 1, fout) == 1);
ok &= (fwrite(yuv->y + y * yuv->y_stride, width, 1, fout) == 1);
if (width & 1) fputc(0, fout); // padding byte
}
for (y = 0; ok && y < uv_height; ++y) {
ok &= (fwrite(u + y * uv_stride, uv_width, 1, fout) == 1);
ok &= (fwrite(v + y * uv_stride, uv_width, 1, fout) == 1);
ok &= (fwrite(yuv->u + y * yuv->u_stride, uv_width, 1, fout) == 1);
ok &= (fwrite(yuv->v + y * yuv->v_stride, uv_width, 1, fout) == 1);
}
for (y = 0; ok && y < a_height; ++y) {
ok &= (fwrite(yuv->a + y * yuv->a_stride, width, 1, fout) == 1);
if (width & 1) fputc(0, fout); // padding byte
}
return ok;
}
typedef enum {
PNG = 0,
PPM,
PGM,
} OutputFileFormat;
// Saves the decoded 'buffer' to 'out_file' using the writer that matches
// 'format', then reports the outcome: a "Saved file" line on stdout (plus the
// elapsed write time when 'verbose' is set), or an error line on stderr.
// Nothing is returned; failures are only reported.
static void SaveOutput(const WebPDecBuffer* const buffer,
                       OutputFileFormat format, const char* const out_file) {
  Stopwatch stop_watch;
  FILE* fout = NULL;
  int ok = 1;
#ifdef HAVE_WINCODEC_H
  // In this configuration WritePNG() is handed the file name rather than a
  // FILE*, so no stream is opened here for PNG output.
  const int needs_open_file = (format != PNG);
#else
  const int needs_open_file = 1;
#endif

  if (verbose) {
    StopwatchReadAndReset(&stop_watch);
  }

  if (needs_open_file) {
    fout = fopen(out_file, "wb");
    if (fout == NULL) {
      fprintf(stderr, "Error opening output file %s\n", out_file);
      return;
    }
  }

  switch (format) {
    case PNG:
#ifdef HAVE_WINCODEC_H
      ok &= WritePNG(out_file, buffer);
#else
      ok &= WritePNG(fout, buffer);
#endif
      break;
    case PPM:
      ok &= WritePPM(fout, buffer);
      break;
    case PGM:
      ok &= WritePGM(fout, buffer);
      break;
    case ALPHA_PLANE_ONLY:
      ok &= WriteAlphaPlane(fout, buffer);
      break;
    default:
      // Unknown format: nothing is written and 'ok' is left untouched.
      break;
  }

  if (fout != NULL) {
    fclose(fout);
  }

  if (!ok) {
    fprintf(stderr, "Error writing file %s !!\n", out_file);
    return;
  }
  printf("Saved file %s\n", out_file);
  if (verbose) {
    const double time = StopwatchReadAndReset(&stop_watch);
    printf("Time to write output: %.3fs\n", time);
  }
}
static void Help(void) {
printf("Usage: dwebp "
"[in_file] [-h] [-v] [-ppm] [-pgm] [-version] [-o out_file]\n\n"
printf("Usage: dwebp in_file [options] [-o out_file]\n\n"
"Decodes the WebP image file to PNG format [Default]\n"
"Use following options to convert into alternate image formats:\n"
" -ppm: save the raw RGB samples as color PPM\n"
" -pgm: save the raw YUV samples as a grayscale PGM\n"
" file with IMC4 layout.\n"
" -version: print version number and exit.\n"
"Use -v for verbose (e.g. print encoding/decoding times)\n"
" -ppm ......... save the raw RGB samples as color PPM\n"
" -pgm ......... save the raw YUV samples as a grayscale PGM\n"
" file with IMC4 layout.\n"
" Other options are:\n"
" -version .... print version number and exit.\n"
" -nofancy ..... don't use the fancy YUV420 upscaler.\n"
" -nofilter .... disable in-loop filtering.\n"
" -mt .......... use multi-threading\n"
" -crop <x> <y> <w> <h> ... crop output with the given rectangle\n"
" -scale <w> <h> .......... scale the output (*after* any cropping)\n"
#ifdef WEBP_EXPERIMENTAL_FEATURES
" -alpha ....... only save the alpha plane.\n"
#endif
" -h ....... this help message.\n"
" -v ....... verbose (e.g. print encoding/decoding times)\n"
#ifndef WEBP_DLL
" -noasm ....... disable all assembly optimizations.\n"
#endif
);
}
// Short names printed next to the numeric decoder status when decoding fails.
// The table is indexed directly by the status value, one entry per value.
static const char* const kStatusMessages[] = {
  "OK",
  "OUT_OF_MEMORY",
  "INVALID_PARAM",
  "BITSTREAM_ERROR",
  "UNSUPPORTED_FEATURE",
  "SUSPENDED",
  "USER_ABORT",
  "NOT_ENOUGH_DATA"
};
int main(int argc, const char *argv[]) {
const char *in_file = NULL;
const char *out_file = NULL;
int width, height, stride, uv_stride;
uint8_t* out = NULL, *u = NULL, *v = NULL;
WebPDecoderConfig config;
WebPDecBuffer* const output_buffer = &config.output;
WebPBitstreamFeatures* const bitstream = &config.input;
OutputFileFormat format = PNG;
Stopwatch stop_watch;
int c;
if (!WebPInitDecoderConfig(&config)) {
fprintf(stderr, "Library version mismatch!\n");
return -1;
}
for (c = 1; c < argc; ++c) {
if (!strcmp(argv[c], "-h") || !strcmp(argv[c], "-help")) {
Help();
return 0;
} else if (!strcmp(argv[c], "-o") && c < argc - 1) {
out_file = argv[++c];
} else if (!strcmp(argv[c], "-alpha")) {
format = ALPHA_PLANE_ONLY;
} else if (!strcmp(argv[c], "-nofancy")) {
config.options.no_fancy_upsampling = 1;
} else if (!strcmp(argv[c], "-nofilter")) {
config.options.bypass_filtering = 1;
} else if (!strcmp(argv[c], "-ppm")) {
format = PPM;
} else if (!strcmp(argv[c], "-version")) {
@@ -241,8 +373,24 @@ int main(int argc, const char *argv[]) {
return 0;
} else if (!strcmp(argv[c], "-pgm")) {
format = PGM;
} else if (!strcmp(argv[c], "-mt")) {
config.options.use_threads = 1;
} else if (!strcmp(argv[c], "-crop") && c < argc - 4) {
config.options.use_cropping = 1;
config.options.crop_left = strtol(argv[++c], NULL, 0);
config.options.crop_top = strtol(argv[++c], NULL, 0);
config.options.crop_width = strtol(argv[++c], NULL, 0);
config.options.crop_height = strtol(argv[++c], NULL, 0);
} else if (!strcmp(argv[c], "-scale") && c < argc - 2) {
config.options.use_scaling = 1;
config.options.scaled_width = strtol(argv[++c], NULL, 0);
config.options.scaled_height = strtol(argv[++c], NULL, 0);
} else if (!strcmp(argv[c], "-v")) {
verbose = 1;
#ifndef WEBP_DLL
} else if (!strcmp(argv[c], "-noasm")) {
VP8GetCPUInfo = NULL;
#endif
} else if (argv[c][0] == '-') {
printf("Unknown option '%s'\n", argv[c]);
Help();
@@ -259,10 +407,13 @@ int main(int argc, const char *argv[]) {
}
{
Stopwatch stop_watch;
VP8StatusCode status = VP8_STATUS_OK;
int ok;
uint32_t data_size = 0;
void* data = NULL;
int ok;
FILE* const in = fopen(in_file, "rb");
if (!in) {
fprintf(stderr, "cannot open input file '%s'\n", in_file);
return 1;
@@ -274,97 +425,74 @@ int main(int argc, const char *argv[]) {
ok = (fread(data, data_size, 1, in) == 1);
fclose(in);
if (!ok) {
fprintf(stderr, "Could not read %d bytes of data from file %s\n",
data_size, in_file);
free(data);
return -1;
}
if (verbose)
StopwatchReadAndReset(&stop_watch);
status = WebPGetFeatures((const uint8_t*)data, data_size, bitstream);
if (status != VP8_STATUS_OK) {
goto end;
}
switch (format) {
case PNG:
#ifdef _WIN32
out = WebPDecodeBGR((const uint8_t*)data, data_size, &width, &height);
#ifdef HAVE_WINCODEC_H
output_buffer->colorspace = bitstream->has_alpha ? MODE_BGRA : MODE_BGR;
#else
out = WebPDecodeRGB((const uint8_t*)data, data_size, &width, &height);
output_buffer->colorspace = bitstream->has_alpha ? MODE_RGBA : MODE_RGB;
#endif
break;
case PPM:
out = WebPDecodeRGB((const uint8_t*)data, data_size, &width, &height);
output_buffer->colorspace = MODE_RGB; // drops alpha for PPM
break;
case PGM:
out = WebPDecodeYUV((const uint8_t*)data, data_size, &width, &height,
&u, &v, &stride, &uv_stride);
output_buffer->colorspace = bitstream->has_alpha ? MODE_YUVA : MODE_YUV;
break;
case ALPHA_PLANE_ONLY:
output_buffer->colorspace = MODE_YUVA;
break;
default:
free(data);
return -1;
}
status = WebPDecode((const uint8_t*)data, data_size, &config);
if (verbose) {
const double time = StopwatchReadAndReset(&stop_watch);
printf("Time to decode picture: %.3fs\n", time);
}
end:
free(data);
}
if (!out) {
fprintf(stderr, "Decoding of %s failed.\n", in_file);
return -1;
ok = (status == VP8_STATUS_OK);
if (!ok) {
fprintf(stderr, "Decoding of %s failed.\n", in_file);
fprintf(stderr, "Status: %d (%s)\n", status, kStatusMessages[status]);
return -1;
}
}
if (out_file) {
FILE* fout = NULL;
int needs_open_file = 0;
printf("Decoded %s. Dimensions: %d x %d. Now saving...\n", in_file, width, height);
StopwatchReadAndReset(&stop_watch);
#ifdef _WIN32
if (format != PNG) {
needs_open_file = 1;
}
#else
needs_open_file = 1;
#endif
if (needs_open_file) fout = fopen(out_file, "wb");
if (!needs_open_file || fout) {
int ok = 1;
if (format == PNG) {
#ifdef _WIN32
ok &= WritePNG(out_file, out, 3 * width, width, height);
#else
ok &= WritePNG(fout, out, 3 * width, width, height);
#endif
} else if (format == PPM) {
ok &= WritePPM(fout, out, width, height);
} else if (format == PGM) {
ok &= WritePGM(fout, out, u, v, stride, uv_stride, width, height);
}
if (fout)
fclose(fout);
if (ok) {
printf("Saved file %s\n", out_file);
if (verbose) {
const double time = StopwatchReadAndReset(&stop_watch);
printf("Time to write output: %.3fs\n", time);
}
} else {
fprintf(stderr, "Error writing file %s !!\n", out_file);
}
} else {
fprintf(stderr, "Error opening output file %s\n", out_file);
}
printf("Decoded %s. Dimensions: %d x %d%s. Now saving...\n", in_file,
output_buffer->width, output_buffer->height,
bitstream->has_alpha ? " (with alpha)" : "");
SaveOutput(output_buffer, format, out_file);
} else {
printf("File %s can be decoded (dimensions: %d x %d).\n",
in_file, width, height);
printf("File %s can be decoded (dimensions: %d x %d)%s.\n",
in_file, output_buffer->width, output_buffer->height,
bitstream->has_alpha ? " (with alpha)" : "");
printf("Nothing written; use -o flag to save the result as e.g. PNG.\n");
}
free(out);
WebPFreeDecBuffer(output_buffer);
return 0;
}
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
#if defined(__cplusplus) || defined(c_plusplus)
} // extern "C"

View File

@@ -30,7 +30,7 @@ static inline double StopwatchReadAndReset(Stopwatch* watch) {
}
#else // !_WIN32
#else /* !_WIN32 */
#include <sys/time.h>
typedef struct timeval Stopwatch;
@@ -42,6 +42,6 @@ static inline double StopwatchReadAndReset(Stopwatch* watch) {
(watch->tv_usec - old_value.tv_usec) / 1000000.0;
}
#endif // !_WIN32
#endif /* _WIN32 */
#endif // WEBP_EXAMPLES_STOPWATCH_H_
#endif /* WEBP_EXAMPLES_STOPWATCH_H_ */

View File

@@ -13,8 +13,8 @@
# These flag assume you have libpng and libjpeg installed. If not, either
# follow below install instructions or just comment out the next lines.
EXTRA_FLAGS= -DWEBP_HAVE_PNG -DWEBP_HAVE_JPEG
EXTRA_LIBS= -lpng -ljpeg
ifeq ("$(HOSTTYPE)", "intel-mac")
EXTRA_LIBS= -lpng -ljpeg -lz
ifeq ($(strip $(shell uname)), Darwin)
EXTRA_FLAGS += -I/opt/local/include
EXTRA_LIBS += -L/opt/local/lib
endif
@@ -33,56 +33,103 @@ endif
# 'make -f makefile.unix EXTRA_FLAGS=-m32' to that effect.
# EXTRA_FLAGS += -m32
# Extra flags to enable experimental features and code
# EXTRA_FLAGS += -DWEBP_EXPERIMENTAL_FEATURES
# Extra flags to enable multi-threading
EXTRA_FLAGS += -DWEBP_USE_THREAD
EXTRA_LIBS += -lpthread
# Extra flags to emulate C89 strictness with the full ANSI
EXTRA_FLAGS += -Wextra -Wold-style-definition
EXTRA_FLAGS += -Wmissing-prototypes
EXTRA_FLAGS += -Wmissing-declarations
EXTRA_FLAGS += -Wdeclaration-after-statement
# EXTRA_FLAGS += -Wvla
#### Nothing should normally be changed below this line ####
AR = ar
ARFLAGS = r
CC = gcc -Isrc/ -Iexamples/ -Wall
CFLAGS = -O3 -DNDEBUG $(EXTRA_FLAGS)
LDFLAGS = src/libwebp.a $(EXTRA_LIBS) -lm
INSTALL = install
LDFLAGS = $(EXTRA_LIBS) -lm
DEC_OBJS = src/dec/frame.o src/dec/webp.o src/dec/quant.o src/dec/tree.o \
src/dec/vp8.o src/dec/idec.o src/dec/alpha.o src/dec/layer.o \
src/dec/io.o src/dec/buffer.o
ENC_OBJS = src/enc/webpenc.o src/enc/syntax.o \
src/enc/alpha.o src/enc/layer.o \
src/enc/tree.o src/enc/config.o src/enc/frame.o \
src/enc/quant.o src/enc/iterator.o src/enc/analysis.o \
src/enc/cost.o src/enc/picture.o src/enc/filter.o
DSP_OBJS = src/dsp/cpu.o src/dsp/enc.o \
src/dsp/enc_sse2.o src/dsp/dec.o src/dsp/dec_sse2.o \
src/dsp/dec_neon.o src/dsp/upsampling.o src/dsp/upsampling_sse2.o \
src/dsp/yuv.o
UTILS_OBJS = src/utils/bit_reader.o src/utils/bit_writer.o src/utils/thread.o
OBJS = $(DEC_OBJS) $(ENC_OBJS) $(DSP_OBJS) $(UTILS_OBJS)
HDRS = src/webp/encode.h src/enc/vp8enci.h src/enc/cost.h \
src/dec/vp8i.h \
src/dsp/yuv.h src/dsp/dsp.h \
src/utils/bit_writer.h src/utils/bit_reader.h src/utils/thread.h
OBJS = src/enc/webpenc.o src/enc/bit_writer.o src/enc/syntax.o \
src/enc/dsp.o src/enc/tree.o src/enc/config.o src/enc/frame.o \
src/enc/quant.o src/enc/iterator.o src/enc/analysis.o \
src/enc/cost.o src/enc/picture.o src/enc/filter.o \
src/dec/bits.o src/dec/dsp.o src/dec/frame.o src/dec/webp.o \
src/dec/quant.o src/dec/tree.o src/dec/vp8.o src/dec/yuv.o \
src/dec/idec.o
HDRS = src/webp/encode.h src/enc/vp8enci.h src/enc/bit_writer.h \
src/enc/cost.h src/dec/bits.h src/dec/vp8i.h src/dec/yuv.h
OUTPUT = examples/cwebp examples/dwebp src/libwebp.a
all:ex
.c.o: $(HDRS)
%.o: %.c $(HDRS)
$(CC) $(CFLAGS) -c $< -o $@
libwebp.a: $(OBJS) $(HDRS)
ar r src/libwebp.a $(OBJS)
src/libwebp.a: $(OBJS)
$(AR) $(ARFLAGS) $@ $^
ex: examples/cwebp.o examples/dwebp.o libwebp.a
$(CC) -o examples/cwebp examples/cwebp.o $(LDFLAGS)
$(CC) -o examples/dwebp examples/dwebp.o $(LDFLAGS)
ex: examples/cwebp examples/dwebp
examples/cwebp: examples/cwebp.o src/libwebp.a
examples/dwebp: examples/dwebp.o src/libwebp.a
examples/cwebp examples/dwebp:
$(CC) -o $@ $^ $(LDFLAGS)
dist: DESTDIR := dist
dist: all
$(INSTALL) -m755 -d $(DESTDIR)/include/webp \
$(DESTDIR)/doc $(DESTDIR)/lib
$(INSTALL) -m755 -s examples/cwebp examples/dwebp $(DESTDIR)
$(INSTALL) -m644 src/webp/*.h $(DESTDIR)/include/webp
$(INSTALL) -m644 src/libwebp.a $(DESTDIR)/lib
umask 022; \
for m in man/[cd]webp.1; do \
basenam=$$(basename $$m .1); \
/usr/bin/groff -t -e -man -T utf8 $$m \
| col -bx >$(DESTDIR)/doc/$${basenam}.txt; \
/usr/bin/groff -t -e -man -T html $$m \
| col -bx >$(DESTDIR)/doc/$${basenam}.html; \
done
clean:
rm -f ${OUTPUT} *~ \
$(RM) ${OUTPUT} *~ \
src/enc/*.o src/enc/*~ \
src/dec/*.o src/dec/*~ \
src/dsp/*.o src/dsp/*~ \
src/utils/*.o src/utils/*~ \
examples/*.o examples/*~
superclean: clean
rm -rf .git *.log *.cache *~
rm -rf .deps */.deps */*/.deps
rm -rf .libs */.libs */*/.libs
rm -f */*.lo */*/*.lo
rm -f */*.la */*/*.la
rm -f Makefile */Makefile */*/Makefile
rm -f Makefile.in */Makefile.in */*/Makefile.in
rm -f config.log autom4te.cache libtool config.h stamp-h1
rm -f aclocal.m4 compile config.guess config.h.in config.sub config.status
rm -f configure depcomp install-sh ltmain.sh missing src/libwebp.pc
rm -f m4/*
$(RM) -r .git *.log *.cache *~
$(RM) -r .deps */.deps */*/.deps
$(RM) -r .libs */.libs */*/.libs
$(RM) */*.lo */*/*.lo
$(RM) */*.la */*/*.la
$(RM) Makefile */Makefile */*/Makefile
$(RM) Makefile.in */Makefile.in */*/Makefile.in
$(RM) config.log autom4te.cache libtool config.h stamp-h1
$(RM) aclocal.m4 compile config.guess config.h.in config.sub config.status
$(RM) configure depcomp install-sh ltmain.sh missing src/libwebp.pc
$(RM) m4/*
.PHONY: all clean dist ex superclean
.SUFFIXES:

View File

@@ -1,5 +1,5 @@
.\" Hey, EMACS: -*- nroff -*-
.TH CWEBP 1 "March 28, 2011"
.TH CWEBP 1 "September 19, 2011"
.SH NAME
cwebp \- compress an image file to a WebP file
.SH SYNOPSIS
@@ -11,7 +11,7 @@ This manual page documents the
.B cwebp
command.
.PP
\fBcwebp\fP compresses image using the WebP format.
\fBcwebp\fP compresses an image using the WebP format.
Input format can be either PNG, JPEG, or raw Y'CbCr samples.
When using PNG, the transparency information (alpha channel) is currently
discarded.
@@ -32,8 +32,8 @@ A summary of all the possible options.
Print the version number (as major.minor.revision) and exit.
.TP
.B \-q float
Specify the compression factor between 0 and 100. Small factor
produce smaller file with lower quality. Best quality is achieved
Specify the compression factor between 0 and 100. A small factor
produces a smaller file with lower quality. Best quality is achieved
using a value of 100. The default is 75.
.TP
.B \-f int
@@ -47,7 +47,7 @@ appear. Typical values are usually in the range of 20 to 50.
Specify a set of pre-defined parameters to suit a particular type of
source material. Possible values are: \fBdefault\fP, \fBphoto\fP,
\fBpicture\fP, \fBdrawing\fP, \fBicon\fP, \fBtext\fP. Since
\fB\-preset\fP overwrites the other parameter's values (except the
\fB\-preset\fP overwrites the other parameters' values (except the
\fB\-q\fP one), this option should preferably appear first in the
order of the arguments.
.TP
@@ -86,6 +86,25 @@ used thanks to the \fB\-f\fP option). Strong filtering is off by default.
Change the number of partitions to use during the segmentation of the
sns algorithm. Segments should be in range 1 to 4. Default value is 4.
.TP
.B \-partition_limit int
Degrade quality by limiting the number of bits used by some macroblocks.
Range is 0 (no degradation, the default) to 100 (full degradation).
Useful values are usually around 30-70 for moderately large images.
In the VP8 format, the so-called control partition has a limit of 512k and
is used to store the following information: whether the macroblock is skipped,
which segment it belongs to, whether it is coded as intra 4x4 or intra 16x16
mode, and finally the prediction modes to use for each of the sub-blocks.
For a very large image, 512k only leaves room for a few bits per 16x16 macroblock.
The absolute minimum is 4 bits per macroblock. Skip, segment, and mode
information can use up almost all these 4 bits (although the case is unlikely),
which is problematic for very large images. The partition_limit factor controls
how frequently the most bit-costly mode (intra 4x4) will be used. This is
useful in case the 512k limit is reached and the following message is displayed:
\fIError code: 6 (PARTITION0_OVERFLOW: Partition #0 is too big to fit 512k)\fP.
If using \fB-partition_limit\fP is not enough to meet the 512k constraint, one
should use fewer segments in order to save more header bits per macroblock.
See the \fB-segments\fP option.
.TP
.B \-size int
Specify a target size (in bytes) to try and reach for the compressed output.
Compressor will make several passes of partial encoding in order to get as
@@ -102,8 +121,9 @@ options \fB\-size\fP or \fB\-psnr\fP. Maximum value is 10.
.TP
.B \-crop x_position y_position width height
Crop the source to a rectangle with top-left corner at coordinates
(x_position, y_position) and size width x height. This cropping area must
be fully contained within the source rectangle.
(\fBx_position\fP, \fBy_position\fP) and size \fBwidth\fP x \fBheight\fP.
This cropping area must be fully contained within the source rectangle.
.TP
.B \-s width height
Specify that the input file actually consists of raw Y'CbCr samples following
the ITU-R BT.601 recommendation, in 4:2:0 linear format.
@@ -117,6 +137,9 @@ range from 1 to 6. This is only meant to help debugging.
Specify a pre-processing filter. This option is a placeholder
and has currently no effect.
.TP
.B \-noasm
Disable all assembly optimizations.
.TP
.B \-v
Print extra information (encoding time in particular).
.TP

View File

@@ -1,7 +1,7 @@
.\" Hey, EMACS: -*- nroff -*-
.TH DWEBP 1 "March 28, 2011"
.TH DWEBP 1 "September 19, 2011"
.SH NAME
dwebp \- compress a WebP file to an image file
dwebp \- decompress a WebP file to an image file
.SH SYNOPSIS
.B dwebp
.RI [ options ] " input_file.webp
@@ -11,7 +11,7 @@ This manual page documents the
.B dwebp
command.
.PP
\fBdwebp\fP decompresses WebP files into PNG or PPM images.
\fBdwebp\fP decompresses WebP files into PNG, PPM or PGM images.
.SH OPTIONS
The basic options are:
.TP
@@ -32,8 +32,37 @@ Change the output format to PGM. The output consist of luma/chroma
samples instead of RGB, using the ICM4 layout. This option is mainly
for verification and debugging purpose.
.TP
.B \-nofancy
Don't use the fancy upscaler for YUV420. This may lead to jaggy
edges (especially the red ones), but should be faster.
.TP
.B \-nofilter
Don't use the in-loop filtering process even if it is required by
the bitstream. This may produce visible blocks on the non-compliant output,
but will make the decoding faster.
.TP
.B \-mt
Use multi-threading for decoding, if possible.
.TP
.B \-crop x_position y_position width height
Crop the decoded picture to a rectangle with top-left corner at coordinates
(\fBx_position\fP, \fBy_position\fP) and size \fBwidth\fP x \fBheight\fP.
This cropping area must be fully contained within the source rectangle.
The top-left corner will be snapped to even coordinates if needed.
This option is meant to reduce the memory needed for cropping large images.
Note: the cropping is applied \fIbefore\fP any scaling.
.TP
.B \-scale width height
Rescale the decoded picture to dimension \fBwidth\fP x \fBheight\fP. This option is
mostly intended to reduce the memory needed to decode large images,
when only a small version is needed (thumbnail, preview, etc.).
Note: scaling is applied \fIafter\fP cropping.
.TP
.B \-v
Print extra information (decoding time in particular).
.TP
.B \-noasm
Disable all assembly optimizations.
.SH Examples:
dwebp picture.webp -o output.png

View File

@@ -1,12 +1,14 @@
SUBDIRS = dec enc
SUBDIRS = dec enc dsp utils
AM_CPPFLAGS = -I$(top_srcdir)/src
lib_LTLIBRARIES = libwebp.la
libwebp_la_SOURCES =
libwebp_la_LIBADD = dec/libwebpdecode.la \
enc/libwebpencode.la
libwebp_la_LDFLAGS = -version-info 0:0:0
enc/libwebpencode.la \
utils/libwebputils.la \
dsp/libwebpdsp.la
libwebp_la_LDFLAGS = -version-info 2:0:0
libwebpinclude_HEADERS = webp/types.h webp/decode.h webp/decode_vp8.h \
webp/encode.h
libwebpincludedir = $(includedir)/webp

View File

@@ -1,14 +1,13 @@
AM_CPPFLAGS = -I$(top_srcdir)/src
libwebpdecode_la_SOURCES = bits.h vp8i.h yuv.h bits.c dsp.c frame.c \
quant.c tree.c vp8.c webp.c yuv.c idec.c
libwebpdecode_la_LDFLAGS = -version-info 0:0:0
libwebpdecode_la_SOURCES = vp8i.h webpi.h \
frame.c quant.c tree.c vp8.c webp.c \
idec.c alpha.c layer.c io.c buffer.c
libwebpdecode_la_LDFLAGS = -version-info 2:0:0
libwebpdecode_la_CPPFLAGS = $(USE_EXPERIMENTAL_CODE)
libwebpdecodeinclude_HEADERS = ../webp/decode.h ../webp/decode_vp8.h ../webp/types.h
libwebpdecodeincludedir = $(includedir)/webp
noinst_HEADERS = bits.h vp8i.h webpi.h yuv.h
noinst_HEADERS = vp8i.h webpi.h
noinst_LTLIBRARIES = libwebpdecode.la
# uncomment the following line (and comment the above) if you want
# to install libwebpdecode library.
#lib_LTLIBRARIES = libwebpdecode.la

69
src/dec/alpha.c Normal file
View File

@@ -0,0 +1,69 @@
// Copyright 2011 Google Inc.
//
// This code is licensed under the same terms as WebM:
// Software License Agreement: http://www.webmproject.org/license/software/
// Additional IP Rights Grant: http://www.webmproject.org/license/additional/
// -----------------------------------------------------------------------------
//
// Alpha-plane decompression.
//
// Author: Skal (pascal.massimino@gmail.com)
#include <stdlib.h>
#include "vp8i.h"
#ifdef WEBP_EXPERIMENTAL_FEATURES
#include "zlib.h"
#if defined(__cplusplus) || defined(c_plusplus)
extern "C" {
#endif
//------------------------------------------------------------------------------
// Returns a pointer to the decompressed alpha samples starting at row 'row',
// or NULL on error (invalid row range, missing alpha plane, zlib failure).
//
// The whole alpha plane is inflated into dec->alpha_plane_ during the call
// made with row == 0; calls with row > 0 only compute the row address and
// therefore rely on that first call having succeeded.
// The plane is laid out with a stride of dec->pic_hdr_.width_ bytes.
const uint8_t* VP8DecompressAlphaRows(VP8Decoder* const dec,
                                      int row, int num_rows) {
  uint8_t* const output = dec->alpha_plane_;
  const int stride = dec->pic_hdr_.width_;
  const int height = dec->pic_hdr_.height_;

  // Sanity checks. The range test is written as 'num_rows > height - row'
  // so that it cannot overflow the way 'row + num_rows' could.
  if (output == NULL || row < 0 || num_rows > height - row) {
    return NULL;
  }

  if (row == 0) {
    // TODO(skal): for now, we just decompress everything during the first call.
    //             Later, we'll decode progressively, but we need to store the
    //             z_stream state.
    const uint8_t* const data = dec->alpha_data_;
    const size_t data_size = dec->alpha_data_size_;
    const size_t output_size = (size_t)stride * height;
    int ret;
    z_stream strm;

    memset(&strm, 0, sizeof(strm));
    if (inflateInit(&strm) != Z_OK) {
      return NULL;
    }

    // Single-shot inflate: the output buffer is expected to hold the whole
    // plane, so Z_FINISH is used and anything but Z_STREAM_END is an error.
    // This rejects streams that expand to more than 'output_size' bytes
    // (instead of wrapping around and overwriting the plane) and cannot
    // loop forever when no progress is possible.
    strm.next_in = (unsigned char*)data;
    strm.avail_in = (uInt)data_size;
    strm.next_out = output;
    strm.avail_out = (uInt)output_size;
    ret = inflate(&strm, Z_FINISH);
    inflateEnd(&strm);

    if (ret != Z_STREAM_END) {
      return NULL;    // error (corrupted, truncated or oversized stream)
    }
  }
  return output + row * stride;
}
#if defined(__cplusplus) || defined(c_plusplus)
} // extern "C"
#endif
#endif // WEBP_EXPERIMENTAL_FEATURES

198
src/dec/buffer.c Normal file
View File

@@ -0,0 +1,198 @@
// Copyright 2011 Google Inc.
//
// This code is licensed under the same terms as WebM:
// Software License Agreement: http://www.webmproject.org/license/software/
// Additional IP Rights Grant: http://www.webmproject.org/license/additional/
// -----------------------------------------------------------------------------
//
// Everything about WebPDecBuffer
//
// Author: Skal (pascal.massimino@gmail.com)
#include <stdlib.h>
#include "vp8i.h"
#include "webpi.h"
#if defined(__cplusplus) || defined(c_plusplus)
extern "C" {
#endif
//------------------------------------------------------------------------------
// WebPDecBuffer
// Number of bytes per pixel for the different color-spaces.
// The table has MODE_LAST entries and is indexed by a WEBP_CSP_MODE value
// (see CheckDecBuffer() and AllocateBuffer()).
// NOTE(review): the entry order presumably mirrors the declaration order of
// the WEBP_CSP_MODE enum -- confirm against webp/decode.h when adding a mode.
static const int kModeBpp[MODE_LAST] = { 3, 4, 3, 4, 4, 2, 2, 1, 1 };
static VP8StatusCode CheckDecBuffer(const WebPDecBuffer* const buffer) {
int ok = 1;
WEBP_CSP_MODE mode = buffer->colorspace;
const int width = buffer->width;
const int height = buffer->height;
if (mode >= MODE_YUV) { // YUV checks
const WebPYUVABuffer* const buf = &buffer->u.YUVA;
const int size = buf->y_stride * height;
const int u_size = buf->u_stride * ((height + 1) / 2);
const int v_size = buf->v_stride * ((height + 1) / 2);
const int a_size = buf->a_stride * height;
ok &= (size <= buf->y_size);
ok &= (u_size <= buf->u_size);
ok &= (v_size <= buf->v_size);
ok &= (a_size <= buf->a_size);
ok &= (buf->y_stride >= width);
ok &= (buf->u_stride >= (width + 1) / 2);
ok &= (buf->v_stride >= (width + 1) / 2);
if (buf->a) {
ok &= (buf->a_stride >= width);
}
} else { // RGB checks
const WebPRGBABuffer* const buf = &buffer->u.RGBA;
ok &= (buf->stride * height <= buf->size);
ok &= (buf->stride >= width * kModeBpp[mode]);
}
return ok ? VP8_STATUS_OK : VP8_STATUS_INVALID_PARAM;
}
// Makes sure 'buffer' has memory attached for a picture of
// buffer->width x buffer->height samples in buffer->colorspace.
// If the memory is neither external nor already allocated, a single block is
// malloc()'ed (owned through buffer->private_memory) and the plane pointers,
// strides and sizes are set up inside it. In every case the resulting layout
// is validated with CheckDecBuffer().
// Returns VP8_STATUS_OK, VP8_STATUS_INVALID_PARAM (bad dimensions, unknown
// colorspace, size overflow or inconsistent layout) or
// VP8_STATUS_OUT_OF_MEMORY.
static VP8StatusCode AllocateBuffer(WebPDecBuffer* const buffer) {
  const int w = buffer->width;
  const int h = buffer->height;
  const WEBP_CSP_MODE mode = buffer->colorspace;

  if (w <= 0 || h <= 0) {
    return VP8_STATUS_INVALID_PARAM;
  }
  // kModeBpp[] is indexed by 'mode' below: reject unknown colorspaces first.
  if ((int)mode < 0 || (int)mode >= MODE_LAST) {
    return VP8_STATUS_INVALID_PARAM;
  }

  if (!buffer->is_external_memory && buffer->private_memory == NULL) {
    uint8_t* output;
    int uv_stride = 0, a_stride = 0;
    // Sizes are kept in 64 bits all along, so that nothing is truncated
    // before the security check below.
    uint64_t uv_size = 0, a_size = 0, total_size;
    // We need memory and it hasn't been allocated yet.
    // => initialize output buffer, now that dimensions are known.
    const uint64_t stride = (uint64_t)w * kModeBpp[mode];
    const uint64_t size = stride * h;

    // The stride is stored as an 'int' below: make sure it fits.
    if (stride >= (1ULL << 31)) {
      return VP8_STATUS_INVALID_PARAM;
    }

    if (mode >= MODE_YUV) {
      uv_stride = w / 2 + (w & 1);    // == (w + 1) / 2, without overflow
      uv_size = (uint64_t)uv_stride * (h / 2 + (h & 1));
      if (mode == MODE_YUVA) {
        a_stride = w;
        a_size = (uint64_t)a_stride * h;
      }
    }
    total_size = size + 2 * uv_size + a_size;

    // Security/sanity checks
    if (((size_t)total_size != total_size) || (total_size >= (1ULL << 40))) {
      return VP8_STATUS_INVALID_PARAM;
    }

    buffer->private_memory = output = (uint8_t*)malloc((size_t)total_size);
    if (output == NULL) {
      return VP8_STATUS_OUT_OF_MEMORY;
    }

    if (mode >= MODE_YUV) {   // YUVA initialization
      // Planes are stored back to back: Y, then U, then V, then (optional) A.
      WebPYUVABuffer* const buf = &buffer->u.YUVA;
      buf->y = output;
      buf->y_stride = (int)stride;
      buf->y_size = size;
      buf->u = output + size;
      buf->u_stride = uv_stride;
      buf->u_size = uv_size;
      buf->v = output + size + uv_size;
      buf->v_stride = uv_stride;
      buf->v_size = uv_size;
      if (mode == MODE_YUVA) {
        buf->a = output + size + 2 * uv_size;
      }
      buf->a_size = a_size;
      buf->a_stride = a_stride;
    } else {  // RGBA initialization
      WebPRGBABuffer* const buf = &buffer->u.RGBA;
      buf->rgba = output;
      buf->stride = (int)stride;
      buf->size = size;
    }
  }
  return CheckDecBuffer(buffer);
}
// Computes the final output dimensions from the source dimensions 'w' x 'h'
// and the (optional) cropping / scaling 'options', stores them in 'out' and
// makes sure 'out' has suitable memory attached (see AllocateBuffer()).
// Cropping is applied first, then scaling. The crop's top-left corner is
// snapped down to even coordinates before being validated.
// Returns VP8_STATUS_OK on success, VP8_STATUS_INVALID_PARAM for a NULL
// 'out', non-positive dimensions or a crop window outside of the picture,
// or any error reported by AllocateBuffer().
VP8StatusCode WebPAllocateDecBuffer(int w, int h,
                                    const WebPDecoderOptions* const options,
                                    WebPDecBuffer* const out) {
  if (out == NULL || w <= 0 || h <= 0) {
    return VP8_STATUS_INVALID_PARAM;
  }
  if (options != NULL) {    // First, apply options if there is any.
    if (options->use_cropping) {
      const int cw = options->crop_width;
      const int ch = options->crop_height;
      const int x = options->crop_left & ~1;
      const int y = options->crop_top & ~1;
      // The boundary tests are written as 'cw > w - x' rather than
      // 'x + cw > w': crop values come straight from the user and the sum
      // could overflow. 'w - x' cannot, since x >= 0 is checked first.
      if (x < 0 || y < 0 || cw <= 0 || ch <= 0 || cw > w - x || ch > h - y) {
        return VP8_STATUS_INVALID_PARAM;  // out of frame boundary.
      }
      w = cw;
      h = ch;
    }
    if (options->use_scaling) {
      if (options->scaled_width <= 0 || options->scaled_height <= 0) {
        return VP8_STATUS_INVALID_PARAM;
      }
      w = options->scaled_width;
      h = options->scaled_height;
    }
  }
  out->width = w;
  out->height = h;

  // Then, allocate buffer for real
  return AllocateBuffer(out);
}
//------------------------------------------------------------------------------
// constructors / destructors
// Resets every field of 'buffer' to zero.
// Returns 1 on success. Returns 0, leaving 'buffer' untouched, if 'version'
// differs from WEBP_DECODER_ABI_VERSION or if 'buffer' is NULL.
int WebPInitDecBufferInternal(WebPDecBuffer* const buffer, int version) {
  const int abi_matches = (version == WEBP_DECODER_ABI_VERSION);
  if (!abi_matches || buffer == NULL) {
    return 0;
  }
  memset(buffer, 0, sizeof(*buffer));
  return 1;
}
// Releases the memory owned by 'buffer', if any. A NULL 'buffer' is ignored.
// Memory flagged as external is not freed, but the private_memory pointer is
// cleared in all cases.
void WebPFreeDecBuffer(WebPDecBuffer* const buffer) {
  if (buffer == NULL) {
    return;
  }
  if (!buffer->is_external_memory) {
    free(buffer->private_memory);
  }
  buffer->private_memory = NULL;
}
// Makes 'dst' a shallow copy of 'src'. Nothing is done if either is NULL.
// If 'src' owns its memory, the copy is flagged as using external memory so
// that only 'src' remains responsible for freeing it.
void WebPCopyDecBuffer(const WebPDecBuffer* const src,
                       WebPDecBuffer* const dst) {
  if (src == NULL || dst == NULL) {
    return;
  }
  *dst = *src;
  if (src->private_memory != NULL) {
    dst->is_external_memory = 1;   // dst buffer doesn't own the memory.
    dst->private_memory = NULL;
  }
}
// Copy and transfer ownership from src to dst (beware of parameter order!)
// Nothing is done if either pointer is NULL. After the call, 'src' is flagged
// as using external memory and no longer references the private allocation,
// which 'dst' received through the struct copy.
void WebPGrabDecBuffer(WebPDecBuffer* const src, WebPDecBuffer* const dst) {
  if (src == NULL || dst == NULL) {
    return;
  }
  *dst = *src;
  if (src->private_memory != NULL) {
    src->is_external_memory = 1;   // src relinquishes ownership
    src->private_memory = NULL;
  }
}
//------------------------------------------------------------------------------
#if defined(__cplusplus) || defined(c_plusplus)
} // extern "C"
#endif

View File

@@ -10,7 +10,7 @@
// Author: Skal (pascal.massimino@gmail.com)
#include <stdlib.h>
#include "vp8i.h"
#include "./vp8i.h"
#if defined(__cplusplus) || defined(c_plusplus)
extern "C" {
@@ -18,25 +18,84 @@ extern "C" {
#define ALIGN_MASK (32 - 1)
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// For multi-threaded decoding we need to use 3 rows of 16 pixels as delay line.
//
// Reason is: the deblocking filter cannot deblock the bottom horizontal edges
// immediately, and needs to wait for first few rows of the next macroblock to
// be decoded. Hence, deblocking is lagging behind by 4 or 8 pixels (depending
// on strength).
// With two threads, the vertical positions of the rows being decoded are:
// Decode: [ 0..15][16..31][32..47][48..63][64..79][...
// Deblock: [ 0..11][12..27][28..43][44..59][...
// If we use two threads and two caches of 16 pixels, the sequence would be:
// Decode: [ 0..15][16..31][ 0..15!!][16..31][ 0..15][...
// Deblock: [ 0..11][12..27!!][-4..11][12..27][...
// The problem occurs for rows [12..15!!], which both the decoding and
// deblocking threads would be writing simultaneously.
// With 3 cache lines, one gets a safe write pattern:
// Decode: [ 0..15][16..31][32..47][ 0..15][16..31][32..47][0..
// Deblock: [ 0..11][12..27][28..43][-4..11][12..27][28...
// Note that multi-threaded output _without_ deblocking can make use of two
// cache lines of 16 pixels only, since there's no lagging behind. The decoding
// and output process have non-concurrent writing:
// Decode: [ 0..15][16..31][ 0..15][16..31][...
// io->put: [ 0..15][16..31][ 0..15][...
#define MT_CACHE_LINES 3
#define ST_CACHE_LINES 1 // 1 cache row only for single-threaded case
// Initialize multi/single-thread worker.
// Resets dec->cache_id_ and sets dec->num_caches_. When threading is enabled,
// the worker is reset and hooked up to VP8FinishRow() with the decoder and
// the thread context's io as arguments.
// Returns 1 on success; on worker failure, returns the result of
// VP8SetError() after recording VP8_STATUS_OUT_OF_MEMORY.
static int InitThreadContext(VP8Decoder* const dec) {
  WebPWorker* const worker = &dec->worker_;

  dec->cache_id_ = 0;
  if (!dec->use_threads_) {
    dec->num_caches_ = ST_CACHE_LINES;
    return 1;
  }

  if (!WebPWorkerReset(worker)) {
    return VP8SetError(dec, VP8_STATUS_OUT_OF_MEMORY,
                       "thread initialization failed.");
  }
  worker->data1 = dec;
  worker->data2 = (void*)&dec->thread_ctx_.io_;
  worker->hook = (WebPWorkerHook)VP8FinishRow;
  // One cache line less is used when there is no filtering.
  if (dec->filter_type_ > 0) {
    dec->num_caches_ = MT_CACHE_LINES;
  } else {
    dec->num_caches_ = MT_CACHE_LINES - 1;
  }
  return 1;
}
//------------------------------------------------------------------------------
// Memory setup
// how many extra luma lines are needed for caching, given a filtering level
static const uint8_t kFilterExtraRows[3] = { 0, 4, 8 };
// kFilterExtraRows[] = How many extra lines are needed on the MB boundary
// for caching, given a filtering level.
// Simple filter: up to 2 luma samples are read and 1 is written.
// Complex filter: up to 4 luma samples are read and 3 are written. Same for
// U/V, so it's 8 samples total (because of the 2x upsampling).
static const uint8_t kFilterExtraRows[3] = { 0, 2, 8 };
int VP8InitFrame(VP8Decoder* const dec, VP8Io* io) {
static int AllocateMemory(VP8Decoder* const dec) {
const int num_caches = dec->num_caches_;
const int mb_w = dec->mb_w_;
const int intra_pred_mode_size = 4 * mb_w * sizeof(uint8_t);
const int top_size = (16 + 8 + 8) * mb_w;
const int info_size = (mb_w + 1) * sizeof(VP8MB);
const int mb_info_size = (mb_w + 1) * sizeof(VP8MB);
const int f_info_size =
(dec->filter_type_ > 0) ?
mb_w * (dec->use_threads_ ? 2 : 1) * sizeof(VP8FInfo)
: 0;
const int yuv_size = YUV_SIZE * sizeof(*dec->yuv_b_);
const int coeffs_size = 384 * sizeof(*dec->coeffs_);
const int cache_height = (16 + kFilterExtraRows[dec->filter_type_]) * 3 / 2;
const int cache_height = (16 * num_caches
+ kFilterExtraRows[dec->filter_type_]) * 3 / 2;
const int cache_size = top_size * cache_height;
const int alpha_size =
dec->alpha_data_ ? (dec->pic_hdr_.width_ * dec->pic_hdr_.height_) : 0;
const int needed = intra_pred_mode_size
+ top_size + info_size
+ top_size + mb_info_size + f_info_size
+ yuv_size + coeffs_size
+ cache_size + ALIGN_MASK;
+ cache_size + alpha_size + ALIGN_MASK;
uint8_t* mem;
if (needed > dec->mem_size_) {
@@ -62,7 +121,18 @@ int VP8InitFrame(VP8Decoder* const dec, VP8Io* io) {
mem += 8 * mb_w;
dec->mb_info_ = ((VP8MB*)mem) + 1;
mem += info_size;
mem += mb_info_size;
dec->f_info_ = f_info_size ? (VP8FInfo*)mem : NULL;
mem += f_info_size;
dec->thread_ctx_.id_ = 0;
dec->thread_ctx_.f_info_ = dec->f_info_;
if (dec->use_threads_) {
// Secondary cache line: the deblocking process needs the filtering
// strengths from the previous macroblock row while the new ones are being
// decoded in parallel. We'll just swap the pointers.
dec->thread_ctx_.f_info_ += mb_w;
}
mem = (uint8_t*)((uintptr_t)(mem + ALIGN_MASK) & ~ALIGN_MASK);
assert((yuv_size & ALIGN_MASK) == 0);
@@ -79,36 +149,48 @@ int VP8InitFrame(VP8Decoder* const dec, VP8Io* io) {
const int extra_y = extra_rows * dec->cache_y_stride_;
const int extra_uv = (extra_rows / 2) * dec->cache_uv_stride_;
dec->cache_y_ = ((uint8_t*)mem) + extra_y;
dec->cache_u_ = dec->cache_y_ + 16 * dec->cache_y_stride_ + extra_uv;
dec->cache_v_ = dec->cache_u_ + 8 * dec->cache_uv_stride_ + extra_uv;
dec->cache_u_ = dec->cache_y_
+ 16 * num_caches * dec->cache_y_stride_ + extra_uv;
dec->cache_v_ = dec->cache_u_
+ 8 * num_caches * dec->cache_uv_stride_ + extra_uv;
dec->cache_id_ = 0;
}
mem += cache_size;
// alpha plane
dec->alpha_plane_ = alpha_size ? (uint8_t*)mem : NULL;
mem += alpha_size;
// note: left-info is initialized once for all.
memset(dec->mb_info_ - 1, 0, (mb_w + 1) * sizeof(*dec->mb_info_));
memset(dec->mb_info_ - 1, 0, mb_info_size);
// initialize top
memset(dec->intra_t_, B_DC_PRED, intra_pred_mode_size);
return 1;
}
static void InitIo(VP8Decoder* const dec, VP8Io* io) {
// prepare 'io'
io->width = dec->pic_hdr_.width_;
io->height = dec->pic_hdr_.height_;
io->mb_y = 0;
io->y = dec->cache_y_;
io->u = dec->cache_u_;
io->v = dec->cache_v_;
io->y_stride = dec->cache_y_stride_;
io->uv_stride = dec->cache_uv_stride_;
io->fancy_upscaling = 0; // default
// Init critical function pointers and look-up tables.
VP8DspInitTables();
VP8DspInit();
io->fancy_upsampling = 0; // default
io->a = NULL;
}
// Prepares 'dec' and 'io' for decoding a frame: thread context, working
// memory, io fields and DSP function pointers, in that order.
// Returns 1 on success, 0 if the thread or memory setup failed (in which
// case 'io' is left untouched).
int VP8InitFrame(VP8Decoder* const dec, VP8Io* io) {
  // The thread context must be set up first: it sets dec->num_caches_, which
  // AllocateMemory() reads. The '&&' keeps AllocateMemory() from running if
  // the thread setup failed.
  const int ok = InitThreadContext(dec) && AllocateMemory(dec);
  if (ok) {
    InitIo(dec, io);
    VP8DspInit();  // Init critical function pointers and look-up tables.
  }
  return ok;
}
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// Filtering
static inline int hev_thresh_from_level(int level, int keyframe) {
@@ -119,12 +201,13 @@ static inline int hev_thresh_from_level(int level, int keyframe) {
}
}
static void DoFilter(VP8Decoder* const dec, int mb_x, int mb_y) {
VP8MB* const mb = dec->mb_info_ + mb_x;
uint8_t* const y_dst = dec->cache_y_ + mb_x * 16;
static void DoFilter(const VP8Decoder* const dec, int mb_x, int mb_y) {
const VP8ThreadContext* const ctx = &dec->thread_ctx_;
const int y_bps = dec->cache_y_stride_;
const int level = mb->f_level_;
const int ilevel = mb->f_ilevel_;
VP8FInfo* const f_info = ctx->f_info_ + mb_x;
uint8_t* const y_dst = dec->cache_y_ + ctx->id_ * 16 * y_bps + mb_x * 16;
const int level = f_info->f_level_;
const int ilevel = f_info->f_ilevel_;
const int limit = 2 * level + ilevel;
if (level == 0) {
return;
@@ -133,26 +216,26 @@ static void DoFilter(VP8Decoder* const dec, int mb_x, int mb_y) {
if (mb_x > 0) {
VP8SimpleHFilter16(y_dst, y_bps, limit + 4);
}
if (mb->f_inner_) {
if (f_info->f_inner_) {
VP8SimpleHFilter16i(y_dst, y_bps, limit);
}
if (mb_y > 0) {
VP8SimpleVFilter16(y_dst, y_bps, limit + 4);
}
if (mb->f_inner_) {
if (f_info->f_inner_) {
VP8SimpleVFilter16i(y_dst, y_bps, limit);
}
} else { // complex
uint8_t* const u_dst = dec->cache_u_ + mb_x * 8;
uint8_t* const v_dst = dec->cache_v_ + mb_x * 8;
const int uv_bps = dec->cache_uv_stride_;
uint8_t* const u_dst = dec->cache_u_ + ctx->id_ * 8 * uv_bps + mb_x * 8;
uint8_t* const v_dst = dec->cache_v_ + ctx->id_ * 8 * uv_bps + mb_x * 8;
const int hev_thresh =
hev_thresh_from_level(level, dec->frm_hdr_.key_frame_);
if (mb_x > 0) {
VP8HFilter16(y_dst, y_bps, limit + 4, ilevel, hev_thresh);
VP8HFilter8(u_dst, v_dst, uv_bps, limit + 4, ilevel, hev_thresh);
}
if (mb->f_inner_) {
if (f_info->f_inner_) {
VP8HFilter16i(y_dst, y_bps, limit, ilevel, hev_thresh);
VP8HFilter8i(u_dst, v_dst, uv_bps, limit, ilevel, hev_thresh);
}
@@ -160,16 +243,29 @@ static void DoFilter(VP8Decoder* const dec, int mb_x, int mb_y) {
VP8VFilter16(y_dst, y_bps, limit + 4, ilevel, hev_thresh);
VP8VFilter8(u_dst, v_dst, uv_bps, limit + 4, ilevel, hev_thresh);
}
if (mb->f_inner_) {
if (f_info->f_inner_) {
VP8VFilter16i(y_dst, y_bps, limit, ilevel, hev_thresh);
VP8VFilter8i(u_dst, v_dst, uv_bps, limit, ilevel, hev_thresh);
}
}
}
// Filter the decoded macroblock row (if needed).
// Runs DoFilter() on every macroblock column in [tl_mb_x_, br_mb_x_) of the
// row recorded in the thread context. The caller must only invoke this when
// thread_ctx_.filter_row_ is set (asserted).
static void FilterRow(const VP8Decoder* const dec) {
  const int row = dec->thread_ctx_.mb_y_;
  int col = dec->tl_mb_x_;
  assert(dec->thread_ctx_.filter_row_);
  while (col < dec->br_mb_x_) {
    DoFilter(dec, col, row);
    ++col;
  }
}
//------------------------------------------------------------------------------
void VP8StoreBlock(VP8Decoder* const dec) {
if (dec->filter_type_ > 0) {
VP8MB* const info = dec->mb_info_ + dec->mb_x_;
VP8FInfo* const info = dec->f_info_ + dec->mb_x_;
const int skip = dec->mb_info_[dec->mb_x_].skip_;
int level = dec->filter_levels_[dec->segment_];
if (dec->filter_hdr_.use_lf_delta_) {
// TODO(skal): only CURRENT is handled for now.
@@ -193,14 +289,16 @@ void VP8StoreBlock(VP8Decoder* const dec) {
}
info->f_ilevel_ = (level < 1) ? 1 : level;
info->f_inner_ = (!info->skip_ || dec->is_i4x4_);
info->f_inner_ = (!skip || dec->is_i4x4_);
}
{
// Transfer samples to row cache
int y;
uint8_t* const ydst = dec->cache_y_ + dec->mb_x_ * 16;
uint8_t* const udst = dec->cache_u_ + dec->mb_x_ * 8;
uint8_t* const vdst = dec->cache_v_ + dec->mb_x_ * 8;
const int y_offset = dec->cache_id_ * 16 * dec->cache_y_stride_;
const int uv_offset = dec->cache_id_ * 8 * dec->cache_uv_stride_;
uint8_t* const ydst = dec->cache_y_ + dec->mb_x_ * 16 + y_offset;
uint8_t* const udst = dec->cache_u_ + dec->mb_x_ * 8 + uv_offset;
uint8_t* const vdst = dec->cache_v_ + dec->mb_x_ * 8 + uv_offset;
for (y = 0; y < 16; ++y) {
memcpy(ydst + y * dec->cache_y_stride_,
dec->yuv_b_ + Y_OFF + y * BPS, 16);
@@ -214,56 +312,205 @@ void VP8StoreBlock(VP8Decoder* const dec) {
}
}
//------------------------------------------------------------------------------
// This function is called after a row of macroblocks is finished decoding.
// It also takes into account the following restrictions:
// * In case of in-loop filtering, we must hold off sending some of the bottom
// pixels as they are yet unfiltered. They will be when the next macroblock
// row is decoded. Meanwhile, we must preserve them by rotating them in the
// cache area. This doesn't hold for the very bottom row of the uncropped
// picture of course.
// * we must clip the remaining pixels against the cropping area. The VP8Io
// struct must have the following fields set correctly before calling put():
#define MACROBLOCK_VPOS(mb_y) ((mb_y) * 16) // vertical position of a MB
// Finalize and transmit a complete row. Return false in case of user-abort.
int VP8FinishRow(VP8Decoder* const dec, VP8Io* io) {
int ok = 1;
const VP8ThreadContext* const ctx = &dec->thread_ctx_;
const int extra_y_rows = kFilterExtraRows[dec->filter_type_];
const int ysize = extra_y_rows * dec->cache_y_stride_;
const int uvsize = (extra_y_rows / 2) * dec->cache_uv_stride_;
const int first_row = (dec->mb_y_ == 0);
const int last_row = (dec->mb_y_ >= dec->mb_h_ - 1);
uint8_t* const ydst = dec->cache_y_ - ysize;
uint8_t* const udst = dec->cache_u_ - uvsize;
uint8_t* const vdst = dec->cache_v_ - uvsize;
if (dec->filter_type_ > 0) {
int mb_x;
for (mb_x = 0; mb_x < dec->mb_w_; ++mb_x) {
DoFilter(dec, mb_x, dec->mb_y_);
}
const int y_offset = ctx->id_ * 16 * dec->cache_y_stride_;
const int uv_offset = ctx->id_ * 8 * dec->cache_uv_stride_;
uint8_t* const ydst = dec->cache_y_ - ysize + y_offset;
uint8_t* const udst = dec->cache_u_ - uvsize + uv_offset;
uint8_t* const vdst = dec->cache_v_ - uvsize + uv_offset;
const int first_row = (ctx->mb_y_ == 0);
const int last_row = (ctx->mb_y_ >= dec->br_mb_y_ - 1);
int y_start = MACROBLOCK_VPOS(ctx->mb_y_);
int y_end = MACROBLOCK_VPOS(ctx->mb_y_ + 1);
if (ctx->filter_row_) {
FilterRow(dec);
}
if (io->put) {
int y_start = dec->mb_y_ * 16;
int y_end = y_start + 16;
if (!first_row) {
y_start -= extra_y_rows;
io->y = ydst;
io->u = udst;
io->v = vdst;
} else {
io->y = dec->cache_y_;
io->u = dec->cache_u_;
io->v = dec->cache_v_;
io->y = dec->cache_y_ + y_offset;
io->u = dec->cache_u_ + uv_offset;
io->v = dec->cache_v_ + uv_offset;
}
if (!last_row) {
y_end -= extra_y_rows;
}
if (y_end > io->height) {
y_end = io->height;
if (y_end > io->crop_bottom) {
y_end = io->crop_bottom; // make sure we don't overflow on last row.
}
io->mb_y = y_start;
io->mb_h = y_end - y_start;
if (!io->put(io)) {
return 0;
io->a = NULL;
#ifdef WEBP_EXPERIMENTAL_FEATURES
if (dec->alpha_data_) {
io->a = VP8DecompressAlphaRows(dec, y_start, y_end - y_start);
if (io->a == NULL) {
return VP8SetError(dec, VP8_STATUS_BITSTREAM_ERROR,
"Could not decode alpha data.");
}
}
#endif
if (y_start < io->crop_top) {
const int delta_y = io->crop_top - y_start;
y_start = io->crop_top;
assert(!(delta_y & 1));
io->y += dec->cache_y_stride_ * delta_y;
io->u += dec->cache_uv_stride_ * (delta_y >> 1);
io->v += dec->cache_uv_stride_ * (delta_y >> 1);
if (io->a) {
io->a += io->width * delta_y;
}
}
if (y_start < y_end) {
io->y += io->crop_left;
io->u += io->crop_left >> 1;
io->v += io->crop_left >> 1;
if (io->a) {
io->a += io->crop_left;
}
io->mb_y = y_start - io->crop_top;
io->mb_w = io->crop_right - io->crop_left;
io->mb_h = y_end - y_start;
ok = io->put(io);
}
}
// rotate top samples
if (!last_row) {
memcpy(ydst, ydst + 16 * dec->cache_y_stride_, ysize);
memcpy(udst, udst + 8 * dec->cache_uv_stride_, uvsize);
memcpy(vdst, vdst + 8 * dec->cache_uv_stride_, uvsize);
// rotate top samples if needed
if (ctx->id_ + 1 == dec->num_caches_) {
if (!last_row) {
memcpy(dec->cache_y_ - ysize, ydst + 16 * dec->cache_y_stride_, ysize);
memcpy(dec->cache_u_ - uvsize, udst + 8 * dec->cache_uv_stride_, uvsize);
memcpy(dec->cache_v_ - uvsize, vdst + 8 * dec->cache_uv_stride_, uvsize);
}
}
return 1;
return ok;
}
//-----------------------------------------------------------------------------
#undef MACROBLOCK_VPOS
//------------------------------------------------------------------------------
// Passes the macroblock row that was just decoded to the filtering/output
// stage.
//  * Without threads, the thread context is updated and VP8FinishRow() is
//    called directly; its result is returned.
//  * With threads, the previous job is synchronized first. If that succeeded,
//    the context is filled in for the new row, the job is launched on the
//    worker and the cache index is advanced (wrapping at num_caches_).
//    The synchronization result is returned.
int VP8ProcessRow(VP8Decoder* const dec, VP8Io* const io) {
  VP8ThreadContext* const ctx = &dec->thread_ctx_;

  if (dec->use_threads_) {
    WebPWorker* const worker = &dec->worker_;
    // Finish previous job *before* updating context
    const int ok = 1 & WebPWorkerSync(worker);
    assert(worker->status_ == OK);
    if (!ok) {
      return ok;
    }
    // Describe and spawn a new deblocking/output job.
    ctx->io_ = *io;
    ctx->id_ = dec->cache_id_;
    ctx->mb_y_ = dec->mb_y_;
    ctx->filter_row_ = dec->filter_row_;
    if (ctx->filter_row_) {
      // Just swap the filter info between decoder and context.
      VP8FInfo* const previous = ctx->f_info_;
      ctx->f_info_ = dec->f_info_;
      dec->f_info_ = previous;
    }
    WebPWorkerLaunch(worker);
    dec->cache_id_ += 1;
    if (dec->cache_id_ == dec->num_caches_) {
      dec->cache_id_ = 0;
    }
    return ok;
  }

  // ctx->id_ and ctx->f_info_ are already set
  ctx->mb_y_ = dec->mb_y_;
  ctx->filter_row_ = dec->filter_row_;
  return VP8FinishRow(dec, io);
}
//------------------------------------------------------------------------------
// Finish setting up the decoding parameters once the user's setup() hook has
// been called.
//  - calls io->setup() (if any); on failure, records VP8_STATUS_USER_ABORT
//    through VP8SetError() and returns dec->status_.
//  - disables in-loop filtering when io->bypass_filtering is set.
//  - computes the macroblock range [tl_mb_x_, br_mb_x_) x [tl_mb_y_, br_mb_y_)
//    that needs to be filtered, given the cropping window stored in 'io'.
// Returns VP8_STATUS_OK on success.
VP8StatusCode VP8EnterCritical(VP8Decoder* const dec, VP8Io* const io) {
  // Call setup() first. This may trigger additional decoding features on 'io'.
  // Note: Afterward, we must call teardown() no matter what.
  if (io->setup && !io->setup(io)) {
    VP8SetError(dec, VP8_STATUS_USER_ABORT, "Frame setup failed");
    return dec->status_;
  }

  // Disable filtering per user request
  if (io->bypass_filtering) {
    dec->filter_type_ = 0;
  }
  // TODO(skal): filter type / strength / sharpness forcing

  // Define the area where we can skip in-loop filtering, in case of cropping.
  //
  // 'Simple' filter reads two luma samples outside of the macroblock
  // and filters one. It doesn't filter the chroma samples. Hence, we can
  // avoid doing the in-loop filtering before crop_top/crop_left position.
  // For the 'Complex' filter, 3 samples are read and up to 3 are filtered.
  // Means: there's a dependency chain that goes all the way up to the
  // top-left corner of the picture (MB #0). We must filter all the previous
  // macroblocks.
  // TODO(skal): add an 'approximate_decoding' option, that won't produce
  // a 1:1 bit-exactness for complex filtering?
  {
    const int extra_pixels = kFilterExtraRows[dec->filter_type_];
    if (dec->filter_type_ == 2) {
      // For complex filter, we need to preserve the dependency chain.
      dec->tl_mb_x_ = 0;
      dec->tl_mb_y_ = 0;
    } else {
      // For simple filter, we can filter only the cropped region.
      // We include 'extra_pixels' on the other side of the boundary, since
      // vertical or horizontal filtering of the previous macroblock can
      // modify some abutting pixels that the first visible macroblock reads.
      // The offsets are clamped before shifting so that a negative value is
      // never right-shifted.
      const int left = io->crop_left - extra_pixels;
      const int top = io->crop_top - extra_pixels;
      dec->tl_mb_x_ = (left > 0) ? (left >> 4) : 0;
      dec->tl_mb_y_ = (top > 0) ? (top >> 4) : 0;
    }
    // We need some 'extra' pixels on the right/bottom.
    dec->br_mb_y_ = (io->crop_bottom + 15 + extra_pixels) >> 4;
    dec->br_mb_x_ = (io->crop_right + 15 + extra_pixels) >> 4;
    if (dec->br_mb_x_ > dec->mb_w_) {
      dec->br_mb_x_ = dec->mb_w_;
    }
    if (dec->br_mb_y_ > dec->mb_h_) {
      dec->br_mb_y_ = dec->mb_h_;
    }
  }
  return VP8_STATUS_OK;
}
// Leaves the critical decoding section: synchronizes with the worker when
// threads are in use, then invokes the user's teardown() hook (if any).
// Returns the synchronization result, or 1 when no thread is used.
int VP8ExitCritical(VP8Decoder* const dec, VP8Io* const io) {
  const int ok = dec->use_threads_ ? WebPWorkerSync(&dec->worker_) : 1;
  if (io->teardown != NULL) {
    io->teardown(io);
  }
  return ok;
}
//------------------------------------------------------------------------------
// Main reconstruction function.
static const int kScan[16] = {
@@ -358,7 +605,7 @@ void VP8ReconstructBlock(VP8Decoder* const dec) {
uint8_t* const dst = y_dst + kScan[n];
VP8PredLuma4[dec->imodes_[n]](dst);
if (dec->non_zero_ac_ & (1 << n)) {
VP8Transform(coeffs + n * 16, dst);
VP8Transform(coeffs + n * 16, dst, 0);
} else if (dec->non_zero_ & (1 << n)) { // only DC is present
VP8TransformDC(coeffs + n * 16, dst);
}
@@ -370,7 +617,7 @@ void VP8ReconstructBlock(VP8Decoder* const dec) {
for (n = 0; n < 16; n++) {
uint8_t* const dst = y_dst + kScan[n];
if (dec->non_zero_ac_ & (1 << n)) {
VP8Transform(coeffs + n * 16, dst);
VP8Transform(coeffs + n * 16, dst, 0);
} else if (dec->non_zero_ & (1 << n)) { // only DC is present
VP8TransformDC(coeffs + n * 16, dst);
}
@@ -410,7 +657,7 @@ void VP8ReconstructBlock(VP8Decoder* const dec) {
}
}
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
#if defined(__cplusplus) || defined(c_plusplus)
} // extern "C"

View File

@@ -15,15 +15,11 @@
#include "webpi.h"
#include "vp8i.h"
#include "yuv.h"
#if defined(__cplusplus) || defined(c_plusplus)
extern "C" {
#endif
#define RIFF_HEADER_SIZE 20
#define VP8_HEADER_SIZE 10
#define WEBP_HEADER_SIZE (RIFF_HEADER_SIZE + VP8_HEADER_SIZE)
#define CHUNK_SIZE 4096
#define MAX_MB_SIZE 4096
@@ -32,14 +28,20 @@ extern "C" {
// Decoding states. State normally flows like HEADER->PARTS0->DATA->DONE.
// If there is any error the decoder goes into state ERROR.
typedef enum { STATE_HEADER = 0, STATE_PARTS0 = 1,
STATE_DATA = 2, STATE_DONE = 3,
STATE_ERROR = 4
typedef enum {
STATE_PRE_VP8, // All data before that of the first VP8 chunk.
STATE_VP8_FRAME_HEADER, // For VP8 Frame header (within VP8 chunk).
STATE_VP8_PARTS0,
STATE_VP8_DATA,
STATE_DONE,
STATE_ERROR
} DecState;
// Operating state for the MemBuffer
typedef enum { MEM_MODE_NONE = 0,
MEM_MODE_APPEND, MEM_MODE_MAP
typedef enum {
MEM_MODE_NONE = 0,
MEM_MODE_APPEND,
MEM_MODE_MAP
} MemBufferMode;
// storage for partition #0 and partial data (in a rolling fashion)
@@ -56,12 +58,13 @@ typedef struct {
struct WebPIDecoder {
DecState state_; // current decoding state
int w_, h_; // width and height
WebPDecParams params_; // Params to store output info
VP8Decoder* dec_;
VP8Io io_;
MemBuffer mem_; // memory buffer
MemBuffer mem_; // input memory buffer.
WebPDecBuffer output_; // output buffer (when no external one is supplied)
uint32_t vp8_size_; // VP8 size extracted from VP8 Header.
};
// MB context to restore in case VP8DecodeMB() fails
@@ -229,43 +232,63 @@ static void RestoreContext(const MBContext* context, VP8Decoder* const dec,
//------------------------------------------------------------------------------
static VP8StatusCode IDecError(WebPIDecoder* idec, VP8StatusCode error) {
static VP8StatusCode IDecError(WebPIDecoder* const idec, VP8StatusCode error) {
if (idec->state_ == STATE_VP8_DATA) {
VP8Io* const io = &idec->io_;
if (io->teardown) {
io->teardown(io);
}
}
idec->state_ = STATE_ERROR;
return error;
}
// Header
static VP8StatusCode DecodeHeader(WebPIDecoder* const idec) {
int width, height;
uint32_t curr_size, riff_header_size, bits;
WebPDecParams* params = &idec->params_;
const uint8_t* data = idec->mem_.buf_ + idec->mem_.start_;
// Moves the incremental decoder to 'new_state' and advances the start of the
// input memory buffer by 'consumed_bytes'.
static void ChangeState(WebPIDecoder* const idec, DecState new_state,
                        uint32_t consumed_bytes) {
  MemBuffer* const mem = &idec->mem_;
  mem->start_ += consumed_bytes;
  assert(mem->start_ <= mem->end_);
  idec->state_ = new_state;
}
if (MemDataSize(&idec->mem_) < WEBP_HEADER_SIZE) {
// Headers
// Parses the data currently available in the input buffer with
// WebPParseHeaders() and maps its status:
//  - VP8_STATUS_NOT_ENOUGH_DATA: returns VP8_STATUS_SUSPENDED.
//  - VP8_STATUS_OK: stores the VP8 size, skips the parsed bytes and switches
//    to STATE_VP8_FRAME_HEADER.
//  - anything else: puts the decoder in error state via IDecError().
static VP8StatusCode DecodeWebPHeaders(WebPIDecoder* const idec) {
  const uint8_t* data = idec->mem_.buf_ + idec->mem_.start_;
  uint32_t curr_size = MemDataSize(&idec->mem_);
  uint32_t vp8_size;
  uint32_t bytes_skipped;
  const VP8StatusCode status =
      WebPParseHeaders(&data, &curr_size, &vp8_size, &bytes_skipped);

  switch (status) {
    case VP8_STATUS_NOT_ENOUGH_DATA:
      return VP8_STATUS_SUSPENDED;  // We haven't found a VP8 chunk yet.
    case VP8_STATUS_OK:
      idec->vp8_size_ = vp8_size;
      ChangeState(idec, STATE_VP8_FRAME_HEADER, bytes_skipped);
      return VP8_STATUS_OK;  // We have skipped all pre-VP8 chunks.
    default:
      return IDecError(idec, status);
  }
}
static VP8StatusCode DecodeVP8FrameHeader(WebPIDecoder* const idec) {
const uint8_t* data = idec->mem_.buf_ + idec->mem_.start_;
const uint32_t curr_size = MemDataSize(&idec->mem_);
uint32_t bits;
if (curr_size < VP8_FRAME_HEADER_SIZE) {
// Not enough data bytes to extract VP8 Frame Header.
return VP8_STATUS_SUSPENDED;
}
if (!WebPInitDecParams(data, idec->mem_.end_, &width, &height, params)) {
if (!VP8GetInfo(data, curr_size, idec->vp8_size_, NULL, NULL, NULL)) {
return IDecError(idec, VP8_STATUS_BITSTREAM_ERROR);
}
// Validate and Skip over RIFF header
curr_size = MemDataSize(&idec->mem_);
if (!WebPCheckRIFFHeader(&data, &curr_size)) {
return IDecError(idec, VP8_STATUS_BITSTREAM_ERROR);
}
riff_header_size = idec->mem_.end_ - curr_size;
bits = data[0] | (data[1] << 8) | (data[2] << 16);
idec->mem_.part0_size_ = (bits >> 5) + VP8_FRAME_HEADER_SIZE;
idec->mem_.part0_size_ = (bits >> 5) + VP8_HEADER_SIZE;
idec->mem_.start_ += riff_header_size;
assert(idec->mem_.start_ <= idec->mem_.end_);
idec->w_ = width;
idec->h_ = height;
idec->io_.data_size -= riff_header_size;
idec->io_.data_size = curr_size;
idec->io_.data = data;
idec->state_ = STATE_PARTS0;
idec->state_ = STATE_VP8_PARTS0;
return VP8_STATUS_OK;
}
@@ -298,14 +321,13 @@ static VP8StatusCode DecodePartition0(WebPIDecoder* const idec) {
VP8Decoder* const dec = idec->dec_;
VP8Io* const io = &idec->io_;
const WebPDecParams* const params = &idec->params_;
const WEBP_CSP_MODE mode = params->mode;
WebPDecBuffer* const output = params->output;
// Wait till we have enough data for the whole partition #0
if (MemDataSize(&idec->mem_) < idec->mem_.part0_size_) {
return VP8_STATUS_SUSPENDED;
}
io->opaque = &idec->params_;
if (!VP8GetHeaders(dec, io)) {
const VP8StatusCode status = dec->status_;
if (status == VP8_STATUS_SUSPENDED ||
@@ -316,36 +338,35 @@ static VP8StatusCode DecodePartition0(WebPIDecoder* const idec) {
return IDecError(idec, status);
}
if (!WebPCheckDecParams(io, params)) {
return IDecError(idec, VP8_STATUS_INVALID_PARAM);
// Allocate/Verify output buffer now
dec->status_ = WebPAllocateDecBuffer(io->width, io->height, params->options,
output);
if (dec->status_ != VP8_STATUS_OK) {
return IDecError(idec, dec->status_);
}
if (mode != MODE_YUV) {
VP8YUVInit();
}
// allocate memory and prepare everything.
if (!VP8InitFrame(dec, io)) {
return IDecError(idec, VP8_STATUS_OUT_OF_MEMORY);
}
if (io->setup && !io->setup(io)) {
return IDecError(idec, VP8_STATUS_USER_ABORT);
}
// disable filtering per user request (_after_ setup() is called)
if (io->bypass_filtering) dec->filter_type_ = 0;
if (!CopyParts0Data(idec)) {
return IDecError(idec, VP8_STATUS_OUT_OF_MEMORY);
}
idec->state_ = STATE_DATA;
// Finish setting up the decoding parameters. Will call io->setup().
if (VP8EnterCritical(dec, io) != VP8_STATUS_OK) {
return IDecError(idec, dec->status_);
}
// Note: past this point, teardown() must always be called
// in case of error.
idec->state_ = STATE_VP8_DATA;
// Allocate memory and prepare everything.
if (!VP8InitFrame(dec, io)) {
return IDecError(idec, dec->status_);
}
return VP8_STATUS_OK;
}
// Remaining partitions
static VP8StatusCode DecodeRemaining(WebPIDecoder* const idec) {
VP8BitReader* br;
VP8BitReader* br;
VP8Decoder* const dec = idec->dec_;
VP8Io* const io = &idec->io_;
@@ -355,12 +376,8 @@ static VP8StatusCode DecodeRemaining(WebPIDecoder* const idec) {
for (; dec->mb_y_ < dec->mb_h_; ++dec->mb_y_) {
VP8BitReader* token_br = &dec->parts_[dec->mb_y_ & (dec->num_parts_ - 1)];
if (dec->mb_x_ == 0) {
VP8MB* const left = dec->mb_info_ - 1;
left->nz_ = 0;
left->dc_nz_ = 0;
memset(dec->intra_l_, B_DC_PRED, sizeof(dec->intra_l_));
VP8InitScanline(dec);
}
for (; dec->mb_x_ < dec->mb_w_; dec->mb_x_++) {
MBContext context;
SaveContext(dec, token_br, &context);
@@ -383,14 +400,14 @@ static VP8StatusCode DecodeRemaining(WebPIDecoder* const idec) {
assert(idec->mem_.start_ <= idec->mem_.end_);
}
}
if (!VP8FinishRow(dec, io)) {
if (!VP8ProcessRow(dec, io)) {
return IDecError(idec, VP8_STATUS_USER_ABORT);
}
dec->mb_x_ = 0;
}
if (io->teardown) {
io->teardown(io);
// Synchronize the thread and check for errors.
if (!VP8ExitCritical(dec, io)) {
return IDecError(idec, VP8_STATUS_USER_ABORT);
}
dec->ready_ = 0;
idec->state_ = STATE_DONE;
@@ -403,14 +420,17 @@ static VP8StatusCode IDecode(WebPIDecoder* idec) {
VP8StatusCode status = VP8_STATUS_SUSPENDED;
assert(idec->dec_);
if (idec->state_ == STATE_HEADER) {
status = DecodeHeader(idec);
if (idec->state_ == STATE_PRE_VP8) {
status = DecodeWebPHeaders(idec);
}
if (idec->state_ == STATE_PARTS0) {
if (idec->state_ == STATE_VP8_FRAME_HEADER) {
status = DecodeVP8FrameHeader(idec);
}
if (idec->state_ == STATE_VP8_PARTS0) {
status = DecodePartition0(idec);
}
if (idec->state_ == STATE_DATA) {
return DecodeRemaining(idec);
if (idec->state_ == STATE_VP8_DATA) {
status = DecodeRemaining(idec);
}
return status;
}
@@ -418,9 +438,11 @@ static VP8StatusCode IDecode(WebPIDecoder* idec) {
//------------------------------------------------------------------------------
// Public functions
WebPIDecoder* WebPINew(WEBP_CSP_MODE mode) {
WebPIDecoder* WebPINewDecoder(WebPDecBuffer* const output_buffer) {
WebPIDecoder* idec = (WebPIDecoder*)calloc(1, sizeof(WebPIDecoder));
if (!idec) return NULL;
if (idec == NULL) {
return NULL;
}
idec->dec_ = VP8New();
if (idec->dec_ == NULL) {
@@ -428,53 +450,97 @@ WebPIDecoder* WebPINew(WEBP_CSP_MODE mode) {
return NULL;
}
idec->state_ = STATE_HEADER;
idec->params_.mode = mode;
idec->state_ = STATE_PRE_VP8;
InitMemBuffer(&idec->mem_);
WebPInitDecBuffer(&idec->output_);
VP8InitIo(&idec->io_);
WebPInitCustomIo(&idec->io_);
WebPResetDecParams(&idec->params_);
idec->params_.output = output_buffer ? output_buffer : &idec->output_;
WebPInitCustomIo(&idec->params_, &idec->io_); // Plug the I/O functions.
#ifdef WEBP_USE_THREAD
idec->dec_->use_threads_ = idec->params_.options &&
(idec->params_.options->use_threads > 0);
#else
idec->dec_->use_threads_ = 0;
#endif
idec->vp8_size_ = 0;
return idec;
}
// Creates an incremental decoder from an optional configuration.
// When both input data and 'config' are supplied, the bitstream features are
// first extracted into config->input; NULL is returned if that fails.
// The decoder outputs into config->output when 'config' is non-NULL and keeps
// a pointer to config->options. Returns NULL if the decoder can't be created.
WebPIDecoder* WebPIDecode(const uint8_t* data, uint32_t data_size,
                          WebPDecoderConfig* const config) {
  const int has_config = (config != NULL);
  WebPIDecoder* idec;

  // Parse the bitstream's features, if requested:
  if (has_config && data != NULL && data_size > 0 &&
      WebPGetFeatures(data, data_size, &config->input) != VP8_STATUS_OK) {
    return NULL;
  }

  // Create an instance of the incremental decoder
  idec = WebPINewDecoder(has_config ? &config->output : NULL);
  if (idec == NULL) {
    return NULL;
  }

  // Finish initialization
  if (has_config) {
    idec->params_.options = &config->options;
  }
  return idec;
}
void WebPIDelete(WebPIDecoder* const idec) {
if (!idec) return;
VP8Delete(idec->dec_);
WebPClearDecParams(&idec->params_);
ClearMemBuffer(&idec->mem_);
WebPFreeDecBuffer(&idec->output_);
free(idec);
}
//------------------------------------------------------------------------------
// Wrapper toward WebPINewDecoder: creates a decoder without external output
// buffer and records 'mode' as the colorspace of its internal output buffer.
// Returns NULL if the decoder could not be created.
WebPIDecoder* WebPINew(WEBP_CSP_MODE mode) {
  WebPIDecoder* const idec = WebPINewDecoder(NULL);
  if (idec != NULL) {
    idec->output_.colorspace = mode;
  }
  return idec;
}
WebPIDecoder* WebPINewRGB(WEBP_CSP_MODE mode, uint8_t* output_buffer,
int output_buffer_size, int output_stride) {
WebPIDecoder* idec;
if (mode == MODE_YUV) return NULL;
idec = WebPINew(mode);
if (idec == NULL) return NULL;
idec->params_.output = output_buffer;
idec->params_.stride = output_stride;
idec->params_.output_size = output_buffer_size;
idec->params_.external_buffer = 1;
if (mode >= MODE_YUV) return NULL;
idec = WebPINewDecoder(NULL);
if (!idec) return NULL;
idec->output_.colorspace = mode;
idec->output_.is_external_memory = 1;
idec->output_.u.RGBA.rgba = output_buffer;
idec->output_.u.RGBA.stride = output_stride;
idec->output_.u.RGBA.size = output_buffer_size;
return idec;
}
WebPIDecoder* WebPINewYUV(uint8_t* luma, int luma_size, int luma_stride,
uint8_t* u, int u_size, int u_stride,
uint8_t* v, int v_size, int v_stride) {
WebPIDecoder* idec = WebPINew(MODE_YUV);
if (idec == NULL) return NULL;
idec->params_.output = luma;
idec->params_.stride = luma_stride;
idec->params_.output_size = luma_size;
idec->params_.u = u;
idec->params_.u_stride = u_stride;
idec->params_.output_u_size = u_size;
idec->params_.v = v;
idec->params_.v_stride = v_stride;
idec->params_.output_v_size = v_size;
idec->params_.external_buffer = 1;
WebPIDecoder* const idec = WebPINewDecoder(NULL);
if (!idec) return NULL;
idec->output_.colorspace = MODE_YUV;
idec->output_.is_external_memory = 1;
idec->output_.u.YUVA.y = luma;
idec->output_.u.YUVA.y_stride = luma_stride;
idec->output_.u.YUVA.y_size = luma_size;
idec->output_.u.YUVA.u = u;
idec->output_.u.YUVA.u_stride = u_stride;
idec->output_.u.YUVA.u_size = u_size;
idec->output_.u.YUVA.v = v;
idec->output_.u.YUVA.v_stride = v_stride;
idec->output_.u.YUVA.v_size = v_size;
return idec;
}
@@ -538,38 +604,81 @@ VP8StatusCode WebPIUpdate(WebPIDecoder* const idec, const uint8_t* data,
//------------------------------------------------------------------------------
uint8_t* WebPIDecGetRGB(const WebPIDecoder* const idec, int *last_y,
int* width, int* height, int* stride) {
if (!idec || !idec->dec_ || idec->params_.mode != MODE_RGB ||
idec->state_ <= STATE_PARTS0) {
static const WebPDecBuffer* GetOutputBuffer(const WebPIDecoder* const idec) {
if (!idec || !idec->dec_ || idec->state_ <= STATE_VP8_PARTS0) {
return NULL;
}
if (last_y) *last_y = idec->params_.last_y;
if (width) *width = idec->w_;
if (height) *height = idec->h_;
if (stride) *stride = idec->params_.stride;
return idec->params_.output;
}
uint8_t* WebPIDecGetYUV(const WebPIDecoder* const idec, int *last_y,
uint8_t** u, uint8_t** v, int* width, int* height,
int *stride, int* uv_stride) {
if (!idec || !idec->dec_ || idec->params_.mode != MODE_YUV ||
idec->state_ <= STATE_PARTS0) {
// Reports the area decoded so far. 'left' and 'top' are always set to 0.
// When an output buffer is available, 'width' receives the buffer's width and
// 'height' receives params_.last_y; otherwise both are set to 0.
// Any of the output pointers may be NULL. Returns the output buffer, or NULL.
const WebPDecBuffer* WebPIDecodedArea(const WebPIDecoder* const idec,
                                      int* const left, int* const top,
                                      int* const width, int* const height) {
  const WebPDecBuffer* const src = GetOutputBuffer(idec);
  const int has_buffer = (src != NULL);

  if (left != NULL) *left = 0;
  if (top != NULL) *top = 0;
  // TODO(skal): later include handling of rotations.
  if (width != NULL) {
    *width = has_buffer ? src->width : 0;
  }
  if (height != NULL) {
    *height = has_buffer ? idec->params_.last_y : 0;
  }
  return src;
}
uint8_t* WebPIDecGetRGB(const WebPIDecoder* const idec, int* last_y,
int* width, int* height, int* stride) {
const WebPDecBuffer* const src = GetOutputBuffer(idec);
if (!src) return NULL;
if (src->colorspace >= MODE_YUV) {
return NULL;
}
if (last_y) *last_y = idec->params_.last_y;
if (u) *u = idec->params_.u;
if (v) *v = idec->params_.v;
if (width) *width = idec->w_;
if (height) *height = idec->h_;
if (stride) *stride = idec->params_.stride;
if (uv_stride) *uv_stride = idec->params_.u_stride;
if (width) *width = src->width;
if (height) *height = src->height;
if (stride) *stride = src->u.RGBA.stride;
return idec->params_.output;
return src->u.RGBA.rgba;
}
// Returns the luma plane of the output buffer, or NULL if no output buffer is
// available or if its colorspace is below MODE_YUV.
// Each non-NULL output pointer is filled in: 'last_y' from params_.last_y,
// 'u'/'v' with the chroma planes, 'width'/'height' with the buffer dimensions,
// 'stride'/'uv_stride' with the y and u strides.
uint8_t* WebPIDecGetYUV(const WebPIDecoder* const idec, int* last_y,
                        uint8_t** u, uint8_t** v,
                        int* width, int* height, int *stride, int* uv_stride) {
  const WebPDecBuffer* const buffer = GetOutputBuffer(idec);
  if (buffer == NULL || buffer->colorspace < MODE_YUV) {
    return NULL;
  }

  if (last_y != NULL) *last_y = idec->params_.last_y;
  if (u != NULL) *u = buffer->u.YUVA.u;
  if (v != NULL) *v = buffer->u.YUVA.v;
  if (width != NULL) *width = buffer->width;
  if (height != NULL) *height = buffer->height;
  if (stride != NULL) *stride = buffer->u.YUVA.y_stride;
  if (uv_stride != NULL) *uv_stride = buffer->u.YUVA.u_stride;

  return buffer->u.YUVA.y;
}
// Installs user-supplied I/O hooks and their opaque user data on the decoder.
// Only allowed while the decoder has not progressed past STATE_PRE_VP8.
// Returns 1 on success, 0 if 'idec' (or its VP8 decoder) is NULL or if the
// decoder state is already past STATE_PRE_VP8.
int WebPISetIOHooks(WebPIDecoder* const idec,
                    VP8IoPutHook put,
                    VP8IoSetupHook setup,
                    VP8IoTeardownHook teardown,
                    void* user_data) {
  VP8Io* io;
  if (idec == NULL || idec->dec_ == NULL) {
    return 0;
  }
  if (idec->state_ > STATE_PRE_VP8) {
    return 0;
  }

  io = &idec->io_;
  io->put = put;
  io->setup = setup;
  io->teardown = teardown;
  io->opaque = user_data;
  return 1;
}
#if defined(__cplusplus) || defined(c_plusplus)

668
src/dec/io.c Normal file
View File

@@ -0,0 +1,668 @@
// Copyright 2011 Google Inc.
//
// This code is licensed under the same terms as WebM:
// Software License Agreement: http://www.webmproject.org/license/software/
// Additional IP Rights Grant: http://www.webmproject.org/license/additional/
// -----------------------------------------------------------------------------
//
// functions for sample output.
//
// Author: Skal (pascal.massimino@gmail.com)
#include <assert.h>
#include <stdlib.h>
#include "../dec/vp8i.h"
#include "./webpi.h"
#include "../dsp/dsp.h"
#include "../dsp/yuv.h"
#if defined(__cplusplus) || defined(c_plusplus)
extern "C" {
#endif
//------------------------------------------------------------------------------
// Main YUV<->RGB conversion functions
// Copies the rows delivered in 'io' straight into the YUVA output buffer:
// mb_h luma rows of mb_w bytes, and (mb_h + 1) / 2 rows of (mb_w + 1) / 2
// bytes for each of the U and V planes. Returns io->mb_h.
static int EmitYUV(const VP8Io* const io, WebPDecParams* const p) {
  const WebPYUVABuffer* const buf = &p->output->u.YUVA;
  const int width = io->mb_w;
  const int height = io->mb_h;
  const int uv_width = (width + 1) / 2;
  const int uv_height = (height + 1) / 2;
  const int uv_row = io->mb_y >> 1;
  uint8_t* const y_out = buf->y + io->mb_y * buf->y_stride;
  uint8_t* const u_out = buf->u + uv_row * buf->u_stride;
  uint8_t* const v_out = buf->v + uv_row * buf->v_stride;
  int row;

  // Luma plane.
  row = 0;
  while (row < height) {
    memcpy(y_out + row * buf->y_stride, io->y + row * io->y_stride, width);
    ++row;
  }
  // Chroma planes.
  row = 0;
  while (row < uv_height) {
    memcpy(u_out + row * buf->u_stride, io->u + row * io->uv_stride, uv_width);
    memcpy(v_out + row * buf->v_stride, io->v + row * io->uv_stride, uv_width);
    ++row;
  }
  return io->mb_h;
}
// Point-sampling U/V sampler.
// Converts the rows delivered in 'io' two at a time with the line-pair sampler
// selected by the output colorspace; each pair of luma rows shares one row of
// u/v samples. A trailing unpaired row is passed twice to the sampler.
// Returns io->mb_h.
static int EmitSampledRGB(const VP8Io* const io, WebPDecParams* const p) {
  WebPDecBuffer* const output = p->output;
  const WebPRGBABuffer* const buf = &output->u.RGBA;
  const WebPSampleLinePairFunc sample = WebPSamplers[output->colorspace];
  const int width = io->mb_w;
  const int last = io->mb_h - 1;
  const uint8_t* src_y = io->y;
  const uint8_t* src_u = io->u;
  const uint8_t* src_v = io->v;
  uint8_t* out = buf->rgba + io->mb_y * buf->stride;
  int j = 0;

  while (j < last) {
    sample(src_y, src_y + io->y_stride, src_u, src_v,
           out, out + buf->stride, width);
    src_y += 2 * io->y_stride;
    src_u += io->uv_stride;
    src_v += io->uv_stride;
    out += 2 * buf->stride;
    j += 2;
  }
  if (j == last) {    // Just do the last line twice
    sample(src_y, src_y, src_u, src_v, out, out, width);
  }
  return io->mb_h;
}
//------------------------------------------------------------------------------
// YUV444 -> RGB conversion
#if 0 // TODO(skal): this is for future rescaling.
// Converts mb_h rows to RGB one row at a time, using the YUV444 converter
// selected by the output colorspace. The y, u and v sources all advance by
// one row per output row. Returns io->mb_h.
static int EmitRGB(const VP8Io* const io, WebPDecParams* const p) {
  WebPDecBuffer* const output = p->output;
  const WebPRGBABuffer* const buf = &output->u.RGBA;
  const WebPYUV444Converter convert = WebPYUV444Converters[output->colorspace];
  const int width = io->mb_w;
  const int num_rows = io->mb_h;
  const uint8_t* src_y = io->y;
  const uint8_t* src_u = io->u;
  const uint8_t* src_v = io->v;
  uint8_t* out = buf->rgba + io->mb_y * buf->stride;
  int row = 0;

  while (row < num_rows) {
    convert(src_y, src_u, src_v, out, width);
    src_y += io->y_stride;
    src_u += io->uv_stride;
    src_v += io->uv_stride;
    out += buf->stride;
    ++row;
  }
  return io->mb_h;
}
#endif
//------------------------------------------------------------------------------
// Fancy upsampling
#ifdef FANCY_UPSAMPLING
// Converts the rows delivered in 'io' to RGB with the line-pair upsampler
// selected by the output colorspace (the 'KeepAlpha' table is used when io->a
// is non-NULL). Each upsample() call takes two luma rows, two pairs of u/v
// rows and two destination rows; NULL is passed for a missing row.
// Unless this is the end of the cropped picture, the last luma/u/v rows are
// saved in p->tmp_y / p->tmp_u / p->tmp_v and completed by the next call.
// Returns the number of output lines accounted for by this call.
static int EmitFancyRGB(const VP8Io* const io, WebPDecParams* const p) {
int num_lines_out = io->mb_h; // a priori guess
const WebPRGBABuffer* const buf = &p->output->u.RGBA;
uint8_t* dst = buf->rgba + io->mb_y * buf->stride;
const WebPUpsampleLinePairFunc upsample =
io->a ? WebPUpsamplersKeepAlpha[p->output->colorspace]
: WebPUpsamplers[p->output->colorspace];
const uint8_t* cur_y = io->y;
const uint8_t* cur_u = io->u;
const uint8_t* cur_v = io->v;
// The 'top' chroma rows initially point to the rows saved by a previous call.
const uint8_t* top_u = p->tmp_u;
const uint8_t* top_v = p->tmp_v;
int y = io->mb_y;
int y_end = io->mb_y + io->mb_h;
const int mb_w = io->mb_w;
const int uv_w = (mb_w + 1) / 2;
if (y == 0) {
// First line is special cased. We mirror the u/v samples at boundary.
upsample(NULL, cur_y, cur_u, cur_v, cur_u, cur_v, NULL, dst, mb_w);
} else {
// We can finish the left-over line from previous call.
// Warning! Don't overwrite the alpha values (if any), as they
// are not lagging one line behind but are already written.
upsample(p->tmp_y, cur_y, top_u, top_v, cur_u, cur_v,
dst - buf->stride, dst, mb_w);
num_lines_out++;
}
// Loop over each output pair of rows.
for (; y + 2 < y_end; y += 2) {
top_u = cur_u;
top_v = cur_v;
cur_u += io->uv_stride;
cur_v += io->uv_stride;
dst += 2 * buf->stride;
cur_y += 2 * io->y_stride;
upsample(cur_y - io->y_stride, cur_y,
top_u, top_v, cur_u, cur_v,
dst - buf->stride, dst, mb_w);
}
// move to last row
cur_y += io->y_stride;
if (io->crop_top + y_end < io->crop_bottom) {
// Save the unfinished samples for next call (as we're not done yet).
memcpy(p->tmp_y, cur_y, mb_w * sizeof(*p->tmp_y));
memcpy(p->tmp_u, cur_u, uv_w * sizeof(*p->tmp_u));
memcpy(p->tmp_v, cur_v, uv_w * sizeof(*p->tmp_v));
// The fancy upsampler leaves a row unfinished behind
// (except for the very last row)
num_lines_out--;
} else {
// Process the very last row of even-sized picture
if (!(y_end & 1)) {
upsample(cur_y, NULL, cur_u, cur_v, cur_u, cur_v,
dst + buf->stride, NULL, mb_w);
}
}
return num_lines_out;
}
#endif /* FANCY_UPSAMPLING */
//------------------------------------------------------------------------------
#ifdef WEBP_EXPERIMENTAL_FEATURES
// Copies the alpha rows delivered in io->a (if any) into the alpha plane of
// the YUVA output buffer: mb_h rows of mb_w bytes. The source advances by
// io->width per row, the destination by a_stride. Always returns 0.
static int EmitAlphaYUV(const VP8Io* const io, WebPDecParams* const p) {
  const uint8_t* src = io->a;
  if (src != NULL) {
    const WebPYUVABuffer* const buf = &p->output->u.YUVA;
    const int width = io->mb_w;
    uint8_t* out = buf->a + io->mb_y * buf->a_stride;
    int rows_left = io->mb_h;
    for (; rows_left > 0; --rows_left) {
      memcpy(out, src, width * sizeof(*out));
      src += io->width;
      out += buf->a_stride;
    }
  }
  return 0;
}
// Writes the alpha rows delivered in io->a (if any) into the RGBA output
// buffer, storing each alpha sample at byte offset 4 * x + 3 of its row.
// The source advances by io->width per row, the destination by the buffer's
// stride. Always returns 0.
// NOTE(review): presumably only installed for 4-bytes-per-pixel output
// modes -- confirm against the caller.
static int EmitAlphaRGB(const VP8Io* const io, WebPDecParams* const p) {
  const uint8_t* src = io->a;
  if (src != NULL) {
    const WebPRGBABuffer* const buf = &p->output->u.RGBA;
    const int width = io->mb_w;
    uint8_t* out = buf->rgba + io->mb_y * buf->stride;
    int rows_left = io->mb_h;
    for (; rows_left > 0; --rows_left) {
      int x;
      for (x = 0; x < width; ++x) {
        out[4 * x + 3] = src[x];
      }
      src += io->width;
      out += buf->stride;
    }
  }
  return 0;
}
#endif /* WEBP_EXPERIMENTAL_FEATURES */
//------------------------------------------------------------------------------
// Simple picture rescaler
// TODO(skal): start a common library for encoder and decoder, and factorize
// this code in.
#define RFIX 30
#define MULT(x,y) (((int64_t)(x) * (y) + (1 << (RFIX - 1))) >> RFIX)
// Fills in the rescaler state 'wrk' from the source/destination geometry, the
// horizontal (x_add, x_sub) and vertical (y_add, y_sub) increments, and the
// scratch area 'work'. 'work' is split in two: irow takes the first dst_width
// entries and frow starts right after them.
static void InitRescaler(WebPRescaler* const wrk,
                         int src_width, int src_height,
                         uint8_t* dst,
                         int dst_width, int dst_height, int dst_stride,
                         int x_add, int x_sub, int y_add, int y_sub,
                         int32_t* work) {
  const int expanding = (src_width < dst_width);

  // Geometry and destination.
  wrk->x_expand = expanding;
  wrk->src_width = src_width;
  wrk->src_height = src_height;
  wrk->dst_width = dst_width;
  wrk->dst_height = dst_height;
  wrk->dst = dst;
  wrk->dst_stride = dst_stride;

  // Horizontal increments and the combined x/y scale.
  if (expanding) {
    // for 'x_expand', we use bilinear interpolation
    wrk->x_add = x_sub - 1;
    wrk->x_sub = x_add - 1;
    wrk->fxy_scale = ((int64_t)dst_height << RFIX) / (x_sub * src_height);
  } else {
    wrk->x_add = x_add - x_sub;
    wrk->x_sub = x_sub;
    wrk->fxy_scale = ((int64_t)dst_height << RFIX) / (x_add * src_height);
  }

  // Vertical increments.
  wrk->y_accum = y_add;
  wrk->y_add = y_add;
  wrk->y_sub = y_sub;

  // Fixed-point reciprocals of the original 'x_sub' and 'y_sub' arguments.
  wrk->fx_scale = (1 << RFIX) / x_sub;
  wrk->fy_scale = (1 << RFIX) / y_sub;

  // Work rows.
  wrk->irow = work;
  wrk->frow = work + dst_width;
}
// Imports one source row into the rescaler: 'src' is resampled horizontally
// into wrk->frow (wrk->dst_width entries), and that row is then added to the
// vertical accumulator wrk->irow.
static inline void ImportRow(const uint8_t* const src,
WebPRescaler* const wrk) {
int x_in = 0;
int x_out;
int accum = 0;
if (!wrk->x_expand) {
// Horizontal shrink: source samples are summed while 'accum' stays
// positive. The sample read after that loop is split: 'frac' is the part
// removed from the current output pixel and carried into the next one.
int sum = 0;
for (x_out = 0; x_out < wrk->dst_width; ++x_out) {
accum += wrk->x_add;
for (; accum > 0; accum -= wrk->x_sub) {
sum += src[x_in++];
}
{ // Emit next horizontal pixel.
const int32_t base = src[x_in++];
const int32_t frac = base * (-accum);
wrk->frow[x_out] = (sum + base) * wrk->x_sub - frac;
// fresh fractional start for next pixel
sum = MULT(frac, wrk->fx_scale);
}
}
} else { // simple bilinear interpolation
// 'left' and 'right' are the two source samples being blended; a new
// source sample is fetched each time 'accum' drops below zero.
int left = src[0], right = src[0];
for (x_out = 0; x_out < wrk->dst_width; ++x_out) {
if (accum < 0) {
left = right;
right = src[++x_in];
accum += wrk->x_add;
}
wrk->frow[x_out] = right * wrk->x_add + (left - right) * accum;
accum -= wrk->x_sub;
}
}
// Accumulate the new row's contribution
for (x_out = 0; x_out < wrk->dst_width; ++x_out) {
wrk->irow[x_out] += wrk->frow[x_out];
}
}
// Emits one finished output row at wrk->dst from the vertical accumulator
// wrk->irow. Must only be called when wrk->y_accum <= 0 (asserted). For each
// column, the share of the last imported row (wrk->frow) that belongs to the
// next output row is removed from the accumulator, the remainder is scaled by
// wrk->fxy_scale and clamped to [0, 255]. Afterwards wrk->y_accum is advanced
// by wrk->y_add and wrk->dst by wrk->dst_stride.
static void ExportRow(WebPRescaler* const wrk) {
int x_out;
// Fixed-point weight of the part of the last row that overshoots this
// output row (-y_accum is non-negative here).
const int yscale = wrk->fy_scale * (-wrk->y_accum);
assert(wrk->y_accum <= 0);
for (x_out = 0; x_out < wrk->dst_width; ++x_out) {
const int frac = MULT(wrk->frow[x_out], yscale);
const int v = (int)MULT(wrk->irow[x_out] - frac, wrk->fxy_scale);
// Values already within 0..255 pass through; others are clamped.
wrk->dst[x_out] = (!(v & ~0xff)) ? v : (v < 0) ? 0 : 255;
wrk->irow[x_out] = frac; // new fractional start
}
wrk->y_accum += wrk->y_add;
wrk->dst += wrk->dst_stride;
}
// The fixed-point helpers are not used past this point.
#undef MULT
#undef RFIX
//------------------------------------------------------------------------------
// YUV rescaling (no final RGB conversion needed)
// Feeds 'new_lines' source rows (starting at 'src', 'src_stride' bytes apart)
// to the rescaler and emits every output row that becomes ready along the way.
// Returns the number of output rows emitted.
static int Rescale(const uint8_t* src, int src_stride,
                   int new_lines, WebPRescaler* const wrk) {
  int rows_emitted = 0;
  int row;
  for (row = 0; row < new_lines; ++row) {
    // Import the contribution of one source row.
    ImportRow(src, wrk);
    src += src_stride;
    wrk->y_accum -= wrk->y_sub;
    // Emit the output row(s) that are now complete.
    for (; wrk->y_accum <= 0; ++rows_emitted) {
      ExportRow(wrk);
    }
  }
  return rows_emitted;
}
// Rescales the Y, U and V rows of the current band through their respective
// rescalers. The chroma planes get (mb_h + 1) / 2 rows. Returns the number of
// luma rows that were output.
static int EmitRescaledYUV(const VP8Io* const io, WebPDecParams* const p) {
  const int luma_rows = io->mb_h;
  const int chroma_rows = (luma_rows + 1) >> 1;
  int luma_rows_out;

  luma_rows_out = Rescale(io->y, io->y_stride, luma_rows, &p->scaler_y);
  (void)Rescale(io->u, io->uv_stride, chroma_rows, &p->scaler_u);
  (void)Rescale(io->v, io->uv_stride, chroma_rows, &p->scaler_v);
  return luma_rows_out;
}
// Rescales the alpha rows of the current band (rows are io->width bytes
// apart) through the alpha rescaler. Does nothing when io->a is NULL.
// Always returns 0.
static int EmitRescaledAlphaYUV(const VP8Io* const io, WebPDecParams* const p) {
  if (io->a == NULL) {
    return 0;
  }
  (void)Rescale(io->a, io->width, io->mb_h, &p->scaler_a);
  return 0;
}
// Returns 1 if 'mode' is one of the colorspaces listed below (those carrying
// an alpha channel), 0 otherwise.
static int IsAlphaMode(WEBP_CSP_MODE mode) {
  switch (mode) {
    case MODE_RGBA:
    case MODE_BGRA:
    case MODE_ARGB:
    case MODE_RGBA_4444:
    case MODE_YUVA:
      return 1;
    default:
      return 0;
  }
}
// Sets up the Y, U, V (and, for alpha colorspaces, A) rescalers for YUV(A)
// output at io->scaled_width x io->scaled_height, allocating their scratch
// memory in p->memory and installing the rescaling emit functions on 'p'.
// Returns 1 on success and 0 on memory error (including a scratch size that
// cannot be represented in size_t).
static int InitYUVRescaler(const VP8Io* const io, WebPDecParams* const p) {
  const int has_alpha = IsAlphaMode(p->output->colorspace);
  const WebPYUVABuffer* const buf = &p->output->u.YUVA;
  const int out_width  = io->scaled_width;
  const int out_height = io->scaled_height;
  const int uv_out_width  = (out_width + 1) >> 1;
  const int uv_out_height = (out_height + 1) >> 1;
  const int uv_in_width  = (io->mb_w + 1) >> 1;
  const int uv_in_height = (io->mb_h + 1) >> 1;
  // The operands are widened before multiplying so that '2 * width' is not
  // evaluated in 'int'.
  const size_t work_size = 2 * (size_t)out_width;   // scratch memory for luma
  const size_t uv_work_size = 2 * (size_t)uv_out_width;  // and for each u/v
  uint64_t total_size;   // total scratch size, first in entries then in bytes
  int32_t* work;

  total_size = 2 * (uint64_t)out_width + 4 * (uint64_t)uv_out_width;
  if (has_alpha) {
    total_size += 2 * (uint64_t)out_width;
  }
  total_size *= sizeof(*work);
  if (total_size != (size_t)total_size) {
    return 0;   // size does not fit in size_t: treat as memory error
  }
  p->memory = calloc(1, (size_t)total_size);
  if (p->memory == NULL) {
    return 0;   // memory error
  }
  work = (int32_t*)p->memory;
  InitRescaler(&p->scaler_y, io->mb_w, io->mb_h,
               buf->y, out_width, out_height, buf->y_stride,
               io->mb_w, out_width, io->mb_h, out_height,
               work);
  InitRescaler(&p->scaler_u, uv_in_width, uv_in_height,
               buf->u, uv_out_width, uv_out_height, buf->u_stride,
               uv_in_width, uv_out_width,
               uv_in_height, uv_out_height,
               work + work_size);
  InitRescaler(&p->scaler_v, uv_in_width, uv_in_height,
               buf->v, uv_out_width, uv_out_height, buf->v_stride,
               uv_in_width, uv_out_width,
               uv_in_height, uv_out_height,
               work + work_size + uv_work_size);
  p->emit = EmitRescaledYUV;
  if (has_alpha) {
    // The alpha plane is rescaled with the same geometry as luma.
    InitRescaler(&p->scaler_a, io->mb_w, io->mb_h,
                 buf->a, out_width, out_height, buf->a_stride,
                 io->mb_w, out_width, io->mb_h, out_height,
                 work + work_size + 2 * uv_work_size);
    p->emit_alpha = EmitRescaledAlphaYUV;
  }
  return 1;
}
//------------------------------------------------------------------------------
// RGBA rescaling
// Imports source rows (starting at 'src', 'src_stride' bytes apart) until
// either an output row is ready (wrk->y_accum <= 0) or all 'new_lines' rows
// have been consumed. Returns the number of rows imported.
static int Import(const uint8_t* src, int src_stride,
                  int new_lines, WebPRescaler* const wrk) {
  int rows_consumed;
  for (rows_consumed = 0; rows_consumed < new_lines; ++rows_consumed) {
    if (wrk->y_accum <= 0) {
      break;   // an output row is pending: stop importing
    }
    ImportRow(src, wrk);
    src += src_stride;
    wrk->y_accum -= wrk->y_sub;
  }
  return rows_consumed;
}
// Exports every output row for which the Y and U rescalers both have data
// ready: the Y, U and V rows are emitted into their temporary buffers and
// converted to the output colorspace, starting at output row
// p->last_y + y_pos. Returns the number of rows written.
static int ExportRGB(WebPDecParams* const p, int y_pos) {
  const WebPRGBABuffer* const out = &p->output->u.RGBA;
  const WebPYUV444Converter to_rgb =
      WebPYUV444Converters[p->output->colorspace];
  uint8_t* rgb_row = out->rgba + (p->last_y + y_pos) * out->stride;
  int rows_written = 0;

  for (;;) {
    // For RGB rescaling, because of the YUV420, current scan position
    // U/V can be +1/-1 line from the Y one. Hence the double test.
    if (p->scaler_y.y_accum > 0 || p->scaler_u.y_accum > 0) {
      break;
    }
    assert(p->last_y + y_pos + rows_written < p->output->height);
    assert(p->scaler_u.y_accum == p->scaler_v.y_accum);
    ExportRow(&p->scaler_y);
    ExportRow(&p->scaler_u);
    ExportRow(&p->scaler_v);
    to_rgb(p->scaler_y.dst, p->scaler_u.dst, p->scaler_v.dst,
           rgb_row, p->scaler_y.dst_width);
    rgb_row += out->stride;
    ++rows_written;
  }
  return rows_written;
}
// Rescales the current band to RGB: alternates between importing Y/U/V source
// rows and exporting converted output rows until all io->mb_h luma rows have
// been consumed. Returns the number of output rows written.
static int EmitRescaledRGB(const VP8Io* const io, WebPDecParams* const p) {
  const int luma_rows = io->mb_h;
  const int chroma_rows = (luma_rows + 1) >> 1;
  int luma_done = 0;
  int chroma_done = 0;
  int rows_out = 0;

  while (luma_done < luma_rows) {
    const uint8_t* const y_src = io->y + luma_done * io->y_stride;
    const uint8_t* const u_src = io->u + chroma_done * io->uv_stride;
    const uint8_t* const v_src = io->v + chroma_done * io->uv_stride;
    const int chroma_left = chroma_rows - chroma_done;
    const int y_lines_in =
        Import(y_src, io->y_stride, luma_rows - luma_done, &p->scaler_y);
    const int u_lines_in =
        Import(u_src, io->uv_stride, chroma_left, &p->scaler_u);
    const int v_lines_in =
        Import(v_src, io->uv_stride, chroma_left, &p->scaler_v);
    (void)v_lines_in;   // remove a gcc warning
    assert(u_lines_in == v_lines_in);
    luma_done += y_lines_in;
    chroma_done += u_lines_in;
    rows_out += ExportRGB(p, rows_out);
  }
  return rows_out;
}
// Exports every alpha row the alpha rescaler has ready, storing each sample
// in the byte at offset 3 of the corresponding 4-byte output pixel, starting
// at output row p->last_y + y_pos. Returns the number of rows written.
static int ExportAlpha(WebPDecParams* const p, int y_pos) {
  const WebPRGBABuffer* const out = &p->output->u.RGBA;
  uint8_t* rgba_row = out->rgba + (p->last_y + y_pos) * out->stride;
  int rows_written;

  for (rows_written = 0; p->scaler_a.y_accum <= 0; ++rows_written) {
    int col;
    assert(p->last_y + y_pos + rows_written < p->output->height);
    ExportRow(&p->scaler_a);
    for (col = 0; col < p->scaler_a.dst_width; ++col) {
      rgba_row[4 * col + 3] = p->scaler_a.dst[col];
    }
    rgba_row += out->stride;
  }
  return rows_written;
}
// Rescales the alpha rows of the current band into the RGBA output,
// alternating between importing source rows (io->width bytes apart) and
// exporting finished rows. Does nothing when io->a is NULL. Always returns 0.
static int EmitRescaledAlphaRGB(const VP8Io* const io, WebPDecParams* const p) {
  int rows_in = 0;
  int rows_out = 0;

  if (io->a == NULL) {
    return 0;
  }
  while (rows_in < io->mb_h) {
    const uint8_t* const src = io->a + rows_in * io->width;
    rows_in += Import(src, io->width, io->mb_h - rows_in, &p->scaler_a);
    rows_out += ExportAlpha(p, rows_out);
  }
  return 0;
}
// Sets up the Y, U, V (and, for alpha colorspaces, A) rescalers for RGB(A)
// output at io->scaled_width x io->scaled_height. A single allocation
// (p->memory) holds the rescalers' work areas followed by one temporary row
// per plane, where rescaled samples are stored before conversion. Installs
// the rescaling emit functions on 'p'.
// Returns 1 on success and 0 on memory error (including a scratch size that
// cannot be represented in size_t).
static int InitRGBRescaler(const VP8Io* const io, WebPDecParams* const p) {
  const int has_alpha = IsAlphaMode(p->output->colorspace);
  const int num_planes = has_alpha ? 4 : 3;
  const int out_width  = io->scaled_width;
  const int out_height = io->scaled_height;
  const int uv_in_width  = (io->mb_w + 1) >> 1;
  const int uv_in_height = (io->mb_h + 1) >> 1;
  // The operand is widened before multiplying so that '2 * out_width' is not
  // evaluated in 'int'.
  const size_t work_size = 2 * (size_t)out_width;   // scratch for one rescaler
  const size_t tmp_size1 = num_planes * work_size;  // number of work entries
  int32_t* work;  // rescalers work area
  uint8_t* tmp;   // tmp storage for scaled YUV444 samples before RGB conversion
  uint64_t total_size;   // in bytes

  // Per output column and per plane: two work entries plus one sample byte.
  total_size = (uint64_t)num_planes * (uint64_t)out_width *
               (2 * sizeof(*work) + sizeof(*tmp));
  if (total_size != (size_t)total_size) {
    return 0;   // size does not fit in size_t: treat as memory error
  }
  p->memory = calloc(1, (size_t)total_size);
  if (p->memory == NULL) {
    return 0;   // memory error
  }
  work = (int32_t*)p->memory;
  tmp = (uint8_t*)(work + tmp_size1);
  InitRescaler(&p->scaler_y, io->mb_w, io->mb_h,
               tmp + 0 * out_width, out_width, out_height, 0,
               io->mb_w, out_width, io->mb_h, out_height,
               work + 0 * work_size);
  // U and V are brought to the full output resolution, hence the doubled
  // x_sub / y_sub relative to luma.
  InitRescaler(&p->scaler_u, uv_in_width, uv_in_height,
               tmp + 1 * out_width, out_width, out_height, 0,
               io->mb_w, 2 * out_width, io->mb_h, 2 * out_height,
               work + 1 * work_size);
  InitRescaler(&p->scaler_v, uv_in_width, uv_in_height,
               tmp + 2 * out_width, out_width, out_height, 0,
               io->mb_w, 2 * out_width, io->mb_h, 2 * out_height,
               work + 2 * work_size);
  p->emit = EmitRescaledRGB;
  if (has_alpha) {
    InitRescaler(&p->scaler_a, io->mb_w, io->mb_h,
                 tmp + 3 * out_width, out_width, out_height, 0,
                 io->mb_w, out_width, io->mb_h, out_height,
                 work + 3 * work_size);
    p->emit_alpha = EmitRescaledAlphaRGB;
  }
  return 1;
}
//------------------------------------------------------------------------------
// Default custom functions
// Sets up the crop_xxx fields, mb_w / mb_h, and the scaling, filtering and
// upsampling flags of 'io' from 'options' ('options' may be NULL, in which
// case the full io->width x io->height frame is used with no scaling).
// Returns 0 if the crop window is invalid or falls outside the frame, or if
// a requested scaled dimension is not positive; returns 1 otherwise.
static int InitFromOptions(const WebPDecoderOptions* const options,
                           VP8Io* const io) {
  const int W = io->width;
  const int H = io->height;
  int x = 0, y = 0, w = W, h = H;

  // Cropping
  io->use_cropping = (options != NULL) && (options->use_cropping > 0);
  if (io->use_cropping) {
    w = options->crop_width;
    h = options->crop_height;
    // TODO(skal): take colorspace into account. Don't assume YUV420.
    x = options->crop_left & ~1;   // snap the origin to even coordinates
    y = options->crop_top  & ~1;
    // 'w > W - x' is 'x + w > W' written so that the sum of two
    // caller-supplied values is never formed (it could overflow 'int').
    if (x < 0 || y < 0 || w <= 0 || h <= 0 || w > W - x || h > H - y) {
      return 0;  // out of frame boundary error
    }
  }
  io->crop_left   = x;
  io->crop_top    = y;
  io->crop_right  = x + w;
  io->crop_bottom = y + h;
  io->mb_w = w;
  io->mb_h = h;

  // Scaling
  io->use_scaling = (options != NULL) && (options->use_scaling > 0);
  if (io->use_scaling) {
    if (options->scaled_width <= 0 || options->scaled_height <= 0) {
      return 0;
    }
    io->scaled_width = options->scaled_width;
    io->scaled_height = options->scaled_height;
  }

  // Filter
  io->bypass_filtering = options && options->bypass_filtering;

  // Fancy upsampler
#ifdef FANCY_UPSAMPLING
  io->fancy_upsampling = (options == NULL) || (!options->no_fancy_upsampling);
#endif

  if (io->use_scaling) {
    // disable filter (only for large downscaling ratio). An explicit request
    // from the caller to bypass filtering is kept as-is.
    if ((io->scaled_width < W * 3 / 4) && (io->scaled_height < H * 3 / 4)) {
      io->bypass_filtering = 1;
    }
    io->fancy_upsampling = 0;
  }
  return 1;
}
// io->setup() hook: resets the emit state of the WebPDecParams stored in
// io->opaque, applies the decoder options to 'io', then selects the emit
// functions (and allocates any scratch memory) matching the output colorspace
// and the scaling / upsampling settings.
// Returns 0 on invalid options or memory error, 1 otherwise.
static int CustomSetup(VP8Io* io) {
  WebPDecParams* const p = (WebPDecParams*)io->opaque;
  const int is_rgb = (p->output->colorspace < MODE_YUV);

  p->memory = NULL;
  p->emit = NULL;
  p->emit_alpha = NULL;
  if (!InitFromOptions(p->options, io)) {
    return 0;
  }

  if (io->use_scaling) {
    int ok;
    if (is_rgb) {
      ok = InitRGBRescaler(io, p);
    } else {
      ok = InitYUVRescaler(io, p);
    }
    if (!ok) {
      return 0;    // memory error
    }
  } else {
    if (!is_rgb) {
      p->emit = EmitYUV;
    } else {
      p->emit = EmitSampledRGB;   // default
#ifdef FANCY_UPSAMPLING
      if (io->fancy_upsampling) {
        // One temporary row each for y, u and v.
        const int uv_width = (io->mb_w + 1) >> 1;
        p->memory = malloc(io->mb_w + 2 * uv_width);
        if (p->memory == NULL) {
          return 0;   // memory error.
        }
        p->tmp_y = (uint8_t*)p->memory;
        p->tmp_u = p->tmp_y + io->mb_w;
        p->tmp_v = p->tmp_u + uv_width;
        p->emit = EmitFancyRGB;
        WebPInitUpsamplers();
      }
#endif
    }
#ifdef WEBP_EXPERIMENTAL_FEATURES
    if (IsAlphaMode(p->output->colorspace)) {
      // We need transparency output
      if (is_rgb) {
        p->emit_alpha = EmitAlphaRGB;
      } else {
        p->emit_alpha = EmitAlphaYUV;
      }
    }
#endif
  }

  if (is_rgb) {
    VP8YUVInit();
  }
  return 1;
}
//------------------------------------------------------------------------------
// io->put() hook: emits the current band through p->emit and, when set,
// p->emit_alpha, then advances p->last_y by the number of rows p->emit
// reported. Returns 0 if the band has a non-positive width or height,
// 1 otherwise.
static int CustomPut(const VP8Io* io) {
  WebPDecParams* const p = (WebPDecParams*)io->opaque;
  int rows_emitted;

  assert(!(io->mb_y & 1));
  if (io->mb_w <= 0 || io->mb_h <= 0) {
    return 0;
  }
  rows_emitted = p->emit(io, p);
  if (p->emit_alpha) {
    p->emit_alpha(io, p);
  }
  // last_y is only advanced once both emitters have run for this band.
  p->last_y += rows_emitted;
  return 1;
}
//------------------------------------------------------------------------------
// io->teardown() hook: frees the scratch memory held by the WebPDecParams
// stored in io->opaque and clears the pointer.
static void CustomTeardown(const VP8Io* io) {
  WebPDecParams* const params = (WebPDecParams*)io->opaque;

  free(params->memory);
  params->memory = NULL;
}
//------------------------------------------------------------------------------
// Main entry point
// Installs the default setup / put / teardown hooks on 'io' and stores
// 'params' as the opaque pointer those hooks read.
void WebPInitCustomIo(WebPDecParams* const params, VP8Io* const io) {
  io->opaque   = params;
  io->setup    = CustomSetup;
  io->put      = CustomPut;
  io->teardown = CustomTeardown;
}
//------------------------------------------------------------------------------
#if defined(__cplusplus) || defined(c_plusplus)
} // extern "C"
#endif

34
src/dec/layer.c Normal file
View File

@@ -0,0 +1,34 @@
// Copyright 2011 Google Inc.
//
// This code is licensed under the same terms as WebM:
// Software License Agreement: http://www.webmproject.org/license/software/
// Additional IP Rights Grant: http://www.webmproject.org/license/additional/
// -----------------------------------------------------------------------------
//
// Enhancement layer (for YUV444/422)
//
// Author: Skal (pascal.massimino@gmail.com)
#include <assert.h>
#include <stdlib.h>
#include "vp8i.h"
#if defined(__cplusplus) || defined(c_plusplus)
extern "C" {
#endif
//------------------------------------------------------------------------------
// Placeholder for enhancement-layer decoding. Only checks (via assert) that
// 'dec' is non-NULL and has layer data; always returns 1.
int VP8DecodeLayer(VP8Decoder* const dec) {
  assert(dec != NULL);
  assert(dec->layer_data_size_ > 0);
  (void)dec;   // keeps 'dec' referenced outside of the asserts

  // TODO: handle enhancement layer here.

  return 1;
}
#if defined(__cplusplus) || defined(c_plusplus)
} // extern "C"
#endif

View File

@@ -58,7 +58,7 @@ static const uint16_t kAcTable[128] = {
249, 254, 259, 264, 269, 274, 279, 284
};
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// Paragraph 9.6
void VP8ParseQuant(VP8Decoder* const dec) {
@@ -104,7 +104,7 @@ void VP8ParseQuant(VP8Decoder* const dec) {
}
}
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
#if defined(__cplusplus) || defined(c_plusplus)
} // extern "C"

View File

@@ -65,7 +65,7 @@ static const int8_t kMVRef4[6] = {
};
#endif
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// Default probabilities
// Inter
@@ -385,7 +385,7 @@ void VP8ParseIntraMode(VP8BitReader* const br, VP8Decoder* const dec) {
: VP8GetBit(br, 183) ? TM_PRED : H_PRED;
}
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// Paragraph 13
static const uint8_t

View File

@@ -11,18 +11,19 @@
#include <stdlib.h>
#include "vp8i.h"
#include "webpi.h"
#if defined(__cplusplus) || defined(c_plusplus)
extern "C" {
#endif
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
int WebPGetDecoderVersion(void) {
return (DEC_MAJ_VERSION << 16) | (DEC_MIN_VERSION << 8) | DEC_REV_VERSION;
}
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// VP8Decoder
static void SetOk(VP8Decoder* const dec) {
@@ -43,6 +44,7 @@ VP8Decoder* VP8New(void) {
VP8Decoder* dec = (VP8Decoder*)calloc(1, sizeof(VP8Decoder));
if (dec) {
SetOk(dec);
WebPWorkerInit(&dec->worker_);
dec->ready_ = 0;
}
return dec;
@@ -74,7 +76,56 @@ int VP8SetError(VP8Decoder* const dec,
return 0;
}
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// Validates the start of a VP8 frame and extracts basic information from it.
//   data, data_size: bytes available so far (at least 10 are required).
//   chunk_size: value the partition length read from the header must stay
//               strictly below.
//   width, height, has_alpha: optional outputs; each may be NULL.
// Returns 1 if the header is a visible key frame with a known profile and a
// consistent partition length, 0 otherwise. *width / *height are only written
// on success; *has_alpha is written as soon as the signature has been
// verified, even if a later check fails.
int VP8GetInfo(const uint8_t* data, uint32_t data_size, uint32_t chunk_size,
               int* width, int* height, int* has_alpha) {
  uint32_t frame_tag;

  if (data_size < 10) {
    return 0;         // not enough data
  }
  // check signature
  if (!(data[3] == 0x9d && data[4] == 0x01 && data[5] == 0x2a)) {
    return 0;         // Wrong signature.
  }
  frame_tag = data[0] | (data[1] << 8) | (data[2] << 16);

  if (has_alpha != NULL) {
#ifdef WEBP_EXPERIMENTAL_FEATURES
    if (data_size < 11) return 0;
    *has_alpha = !!(data[10] & 0x80);    // the colorspace_ bit
#else
    *has_alpha = 0;
#endif
  }

  if (frame_tag & 1) {
    return 0;         // Not a keyframe.
  }
  if (((frame_tag >> 1) & 7) > 3) {
    return 0;         // unknown profile
  }
  if (!((frame_tag >> 4) & 1)) {
    return 0;         // first frame is invisible!
  }
  if ((frame_tag >> 5) >= chunk_size) {  // partition_length
    return 0;         // inconsistent size information.
  }

  // Dimensions are 14-bit little-endian values.
  if (width != NULL) {
    *width = ((data[7] << 8) | data[6]) & 0x3fff;
  }
  if (height != NULL) {
    *height = ((data[9] << 8) | data[8]) & 0x3fff;
  }
  return 1;
}
//------------------------------------------------------------------------------
// Header parsing
static void ResetSegmentHeader(VP8SegmentHeader* const hdr) {
@@ -194,14 +245,12 @@ static int ParseFilterHeader(VP8BitReader* br, VP8Decoder* const dec) {
return !br->eof_;
}
static inline uint32_t get_le32(const uint8_t* const data) {
return data[0] | (data[1] << 8) | (data[2] << 16) | (data[3] << 24);
}
// Topmost call
int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io) {
uint8_t* buf;
const uint8_t* buf;
uint32_t buf_size;
uint32_t vp8_chunk_size;
uint32_t bytes_skipped;
VP8FrameHeader* frm_hdr;
VP8PictureHeader* pic_hdr;
VP8BitReader* br;
@@ -216,41 +265,19 @@ int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io) {
"null VP8Io passed to VP8GetHeaders()");
}
buf = (uint8_t *)io->data;
buf = io->data;
buf_size = io->data_size;
if (buf == NULL || buf_size <= 4) {
return VP8SetError(dec, VP8_STATUS_NOT_ENOUGH_DATA,
"Not enough data to parse frame header");
// Process Pre-VP8 chunks.
status = WebPParseHeaders(&buf, &buf_size, &vp8_chunk_size, &bytes_skipped);
if (status != VP8_STATUS_OK) {
return VP8SetError(dec, status, "Incorrect/incomplete header.");
}
// Skip over valid RIFF headers
if (!memcmp(buf, "RIFF", 4)) {
uint32_t riff_size;
uint32_t chunk_size;
if (buf_size < 20 + 4) {
return VP8SetError(dec, VP8_STATUS_NOT_ENOUGH_DATA,
"RIFF: Truncated header.");
}
if (memcmp(buf + 8, "WEBP", 4)) { // wrong image file signature
return VP8SetError(dec, VP8_STATUS_BITSTREAM_ERROR,
"RIFF: WEBP signature not found.");
}
riff_size = get_le32(buf + 4);
if (riff_size < 12) {
return VP8SetError(dec, VP8_STATUS_NOT_ENOUGH_DATA,
"RIFF: Truncated header.");
}
if (memcmp(buf + 12, "VP8 ", 4)) {
return VP8SetError(dec, VP8_STATUS_BITSTREAM_ERROR,
"RIFF: Invalid compression format.");
}
chunk_size = get_le32(buf + 16);
if (chunk_size > riff_size - 12) {
return VP8SetError(dec, VP8_STATUS_BITSTREAM_ERROR,
"RIFF: Inconsistent size information.");
}
buf += 20;
buf_size -= 20;
// Process the VP8 frame header.
if (buf_size < 4) {
return VP8SetError(dec, VP8_STATUS_NOT_ENOUGH_DATA,
"Truncated header.");
}
// Paragraph 9.1
@@ -291,8 +318,17 @@ int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io) {
dec->mb_w_ = (pic_hdr->width_ + 15) >> 4;
dec->mb_h_ = (pic_hdr->height_ + 15) >> 4;
// Setup default output area (can be later modified during io->setup())
io->width = pic_hdr->width_;
io->height = pic_hdr->height_;
io->use_scaling = 0;
io->use_cropping = 0;
io->crop_top = 0;
io->crop_left = 0;
io->crop_right = io->width;
io->crop_bottom = io->height;
io->mb_w = io->width; // sanity check
io->mb_h = io->height; // ditto
VP8ResetProba(&dec->proba_);
ResetSegmentHeader(&dec->segment_hdr_);
@@ -305,6 +341,10 @@ int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io) {
return VP8SetError(dec, VP8_STATUS_NOT_ENOUGH_DATA,
"bad partition length");
}
dec->alpha_data_ = NULL;
dec->alpha_data_size_ = 0;
br = &dec->br_;
VP8InitBitReader(br, buf, buf + frm_hdr->partition_length_);
buf += frm_hdr->partition_length_;
@@ -368,12 +408,42 @@ int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io) {
VP8ParseProba(br, dec);
#ifdef WEBP_EXPERIMENTAL_FEATURES
// Extensions
if (dec->pic_hdr_.colorspace_) {
const size_t kTrailerSize = 8;
const uint8_t kTrailerMarker = 0x01;
const uint8_t* ext_buf = buf - kTrailerSize;
size_t size;
if (frm_hdr->partition_length_ < kTrailerSize ||
ext_buf[kTrailerSize - 1] != kTrailerMarker) {
Error:
return VP8SetError(dec, VP8_STATUS_BITSTREAM_ERROR,
"RIFF: Inconsistent extra information.");
}
// Alpha
size = (ext_buf[4] << 0) | (ext_buf[5] << 8) | (ext_buf[6] << 16);
if (frm_hdr->partition_length_ < size + kTrailerSize) {
goto Error;
}
dec->alpha_data_ = (size > 0) ? ext_buf - size : NULL;
dec->alpha_data_size_ = size;
// Layer
size = (ext_buf[0] << 0) | (ext_buf[1] << 8) | (ext_buf[2] << 16);
dec->layer_data_size_ = size;
dec->layer_data_ = NULL; // will be set later
dec->layer_colorspace_ = ext_buf[3];
}
#endif
// sanitized state
dec->ready_ = 1;
return 1;
}
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// Residual decoding (Paragraph 13.2 / 13.3)
static const uint8_t kBands[16 + 1] = {
@@ -386,7 +456,7 @@ static const uint8_t kCat4[] = { 176, 155, 140, 135, 0 };
static const uint8_t kCat5[] = { 180, 157, 141, 134, 130, 0 };
static const uint8_t kCat6[] =
{ 254, 254, 243, 230, 196, 177, 153, 140, 133, 130, 129, 0 };
static const uint8_t * const kCat3456[] = { kCat3, kCat4, kCat5, kCat6 };
static const uint8_t* const kCat3456[] = { kCat3, kCat4, kCat5, kCat6 };
static const uint8_t kZigzag[16] = {
0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
};
@@ -422,7 +492,8 @@ static int GetCoeffs(VP8BitReader* const br, ProbaArray prob,
if (!VP8GetBit(br, p[7])) {
v = 5 + VP8GetBit(br, 159);
} else {
v = 7 + 2 * VP8GetBit(br, 165) + VP8GetBit(br, 145);
v = 7 + 2 * VP8GetBit(br, 165);
v += VP8GetBit(br, 145);
}
} else {
const uint8_t* tab;
@@ -551,7 +622,7 @@ static void ParseResiduals(VP8Decoder* const dec,
}
#undef PACK
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// Main loop
int VP8DecodeMB(VP8Decoder* const dec, VP8BitReader* const token_br) {
@@ -588,16 +659,21 @@ int VP8DecodeMB(VP8Decoder* const dec, VP8BitReader* const token_br) {
return (!token_br->eof_);
}
// Prepares the decoder for a new macroblock row: clears the non-zero flags of
// the entry just before dec->mb_info_ (the left context), resets the left
// intra modes to B_DC_PRED, and sets dec->filter_row_ when filtering is
// enabled and dec->mb_y_ lies within [tl_mb_y_, br_mb_y_].
void VP8InitScanline(VP8Decoder* const dec) {
  VP8MB* const left_ctx = dec->mb_info_ - 1;
  const int filtering_on = (dec->filter_type_ > 0);
  const int row_in_range =
      (dec->mb_y_ >= dec->tl_mb_y_) && (dec->mb_y_ <= dec->br_mb_y_);

  left_ctx->nz_ = 0;
  left_ctx->dc_nz_ = 0;
  memset(dec->intra_l_, B_DC_PRED, sizeof(dec->intra_l_));
  dec->filter_row_ = filtering_on && row_in_range;
}
static int ParseFrame(VP8Decoder* const dec, VP8Io* io) {
for (dec->mb_y_ = 0; dec->mb_y_ < dec->mb_h_; ++dec->mb_y_) {
VP8MB* const left = dec->mb_info_ - 1;
for (dec->mb_y_ = 0; dec->mb_y_ < dec->br_mb_y_; ++dec->mb_y_) {
VP8BitReader* const token_br =
&dec->parts_[dec->mb_y_ & (dec->num_parts_ - 1)];
left->nz_ = 0;
left->dc_nz_ = 0;
memset(dec->intra_l_, B_DC_PRED, sizeof(dec->intra_l_));
VP8InitScanline(dec);
for (dec->mb_x_ = 0; dec->mb_x_ < dec->mb_w_; dec->mb_x_++) {
if (!VP8DecodeMB(dec, token_br)) {
return VP8SetError(dec, VP8_STATUS_NOT_ENOUGH_DATA,
@@ -608,11 +684,13 @@ static int ParseFrame(VP8Decoder* const dec, VP8Io* io) {
// Store data and save block's filtering params
VP8StoreBlock(dec);
}
if (!VP8FinishRow(dec, io)) {
return VP8SetError(dec, VP8_STATUS_USER_ABORT,
"Output aborted.");
if (!VP8ProcessRow(dec, io)) {
return VP8SetError(dec, VP8_STATUS_USER_ABORT, "Output aborted.");
}
}
if (dec->use_threads_ && !WebPWorkerSync(&dec->worker_)) {
return 0;
}
// Finish
#ifndef ONLY_KEYFRAME_CODE
@@ -621,11 +699,20 @@ static int ParseFrame(VP8Decoder* const dec, VP8Io* io) {
}
#endif
#ifdef WEBP_EXPERIMENTAL_FEATURES
if (dec->layer_data_size_ > 0) {
if (!VP8DecodeLayer(dec)) {
return 0;
}
}
#endif
return 1;
}
// Main entry point
int VP8Decode(VP8Decoder* const dec, VP8Io* const io) {
int ok = 0;
if (dec == NULL) {
return 0;
}
@@ -641,32 +728,22 @@ int VP8Decode(VP8Decoder* const dec, VP8Io* const io) {
}
assert(dec->ready_);
// will allocate memory and prepare everything.
if (!VP8InitFrame(dec, io)) {
VP8Clear(dec);
return VP8SetError(dec, VP8_STATUS_OUT_OF_MEMORY,
"Allocation failed");
// Finish setting up the decoding parameter. Will call io->setup().
ok = (VP8EnterCritical(dec, io) == VP8_STATUS_OK);
if (ok) { // good to go.
// Will allocate memory and prepare everything.
if (ok) ok = VP8InitFrame(dec, io);
// Main decoding loop
if (ok) ok = ParseFrame(dec, io);
// Exit.
ok &= VP8ExitCritical(dec, io);
}
if (io->setup && !io->setup(io)) {
if (!ok) {
VP8Clear(dec);
return VP8SetError(dec, VP8_STATUS_USER_ABORT,
"Frame setup failed");
}
// Disable filtering per user request (_after_ setup() is called)
if (io->bypass_filtering) dec->filter_type_ = 0;
// Main decoding loop
{
const int ret = ParseFrame(dec, io);
if (io->teardown) {
io->teardown(io);
}
if (!ret) {
VP8Clear(dec);
return 0;
}
return 0;
}
dec->ready_ = 0;
@@ -677,6 +754,9 @@ void VP8Clear(VP8Decoder* const dec) {
if (dec == NULL) {
return;
}
if (dec->use_threads_) {
WebPWorkerEnd(&dec->worker_);
}
if (dec->mem_) {
free(dec->mem_);
}
@@ -686,7 +766,7 @@ void VP8Clear(VP8Decoder* const dec) {
dec->ready_ = 0;
}
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
#if defined(__cplusplus) || defined(c_plusplus)
} // extern "C"

View File

@@ -13,19 +13,21 @@
#define WEBP_DEC_VP8I_H_
#include <string.h> // for memcpy()
#include "bits.h"
#include "../utils/bit_reader.h"
#include "../utils/thread.h"
#include "../dsp/dsp.h"
#if defined(__cplusplus) || defined(c_plusplus)
extern "C" {
#endif
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// Various defines and enums
// version numbers
#define DEC_MAJ_VERSION 0
#define DEC_MIN_VERSION 1
#define DEC_REV_VERSION 2
#define DEC_REV_VERSION 3
#define ONLY_KEYFRAME_CODE // to remove any code related to P-Frames
@@ -95,7 +97,7 @@ enum { MB_FEATURE_TREE_PROBS = 3,
#define U_OFF (Y_OFF + BPS * 16 + BPS)
#define V_OFF (U_OFF + 16)
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// Headers
typedef struct {
@@ -144,19 +146,19 @@ typedef struct {
int mode_lf_delta_[NUM_MODE_LF_DELTAS];
} VP8FilterHeader;
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// Informations about the macroblocks.
typedef struct {
// block type
uint8_t skip_:1;
// filter specs
uint8_t f_level_:6; // filter strength: 0..63
uint8_t f_ilevel_:6; // inner limit: 1..63
uint8_t f_inner_:1; // do inner filtering?
// cbp
uint8_t nz_; // non-zero AC/DC coeffs
uint8_t dc_nz_; // non-zero DC coeffs
typedef struct { // filter specs
unsigned int f_level_:6; // filter strength: 0..63
unsigned int f_ilevel_:6; // inner limit: 1..63
unsigned int f_inner_:1; // do inner filtering?
} VP8FInfo;
typedef struct { // used for syntax-parsing
unsigned int nz_; // non-zero AC/DC coeffs
unsigned int dc_nz_:1; // non-zero DC coeffs
unsigned int skip_:1; // block type
} VP8MB;
// Dequantization matrices
@@ -164,7 +166,16 @@ typedef struct {
uint16_t y1_mat_[2], y2_mat_[2], uv_mat_[2]; // [DC / AC]
} VP8QuantMatrix;
//-----------------------------------------------------------------------------
// Persistent information needed by the parallel processing
typedef struct {
int id_; // cache row to process (in [0..2])
int mb_y_; // macroblock position of the row
int filter_row_; // true if row-filtering is needed
VP8FInfo* f_info_; // filter strengths
VP8Io io_; // copy of the VP8Io to pass to put()
} VP8ThreadContext;
//------------------------------------------------------------------------------
// VP8Decoder: the main opaque structure handed over to user
struct VP8Decoder {
@@ -181,9 +192,20 @@ struct VP8Decoder {
VP8FilterHeader filter_hdr_;
VP8SegmentHeader segment_hdr_;
// Worker
WebPWorker worker_;
int use_threads_; // use multi-thread
int cache_id_; // current cache row
int num_caches_; // number of cached rows of 16 pixels (1, 2 or 3)
VP8ThreadContext thread_ctx_; // Thread context
// dimension, in macroblock units.
int mb_w_, mb_h_;
// Macroblock to process/filter, depending on cropping and filter_type.
int tl_mb_x_, tl_mb_y_; // top-left MB that must be in-loop filtered
int br_mb_x_, br_mb_y_; // last bottom-right MB that must be decoded
// number of partitions.
int num_parts_;
// per-partition boolean decoders.
@@ -212,10 +234,11 @@ struct VP8Decoder {
// Boundary data cache and persistent buffers.
uint8_t* intra_t_; // top intra modes values: 4 * mb_w_
uint8_t intra_l_[4]; // left intra modes values
uint8_t *y_t_; // top luma samples: 16 * mb_w_
uint8_t *u_t_, *v_t_; // top u/v samples: 8 * mb_w_ each
uint8_t* y_t_; // top luma samples: 16 * mb_w_
uint8_t* u_t_, *v_t_; // top u/v samples: 8 * mb_w_ each
VP8MB* mb_info_; // contextual macroblock infos (mb_w_ + 1)
VP8MB* mb_info_; // contextual macroblock info (mb_w_ + 1)
VP8FInfo* f_info_; // filter strength info
uint8_t* yuv_b_; // main block for Y/U/V (size = YUV_SIZE)
int16_t* coeffs_; // 384 coeffs = (16+8+8) * 4*4
@@ -244,17 +267,35 @@ struct VP8Decoder {
uint32_t non_zero_ac_;
// Filtering side-info
int filter_type_; // 0=off, 1=simple, 2=complex
int filter_type_; // 0=off, 1=simple, 2=complex
int filter_row_; // per-row flag
uint8_t filter_levels_[NUM_MB_SEGMENTS]; // precalculated per-segment
// extensions
const uint8_t* alpha_data_; // compressed alpha data (if present)
size_t alpha_data_size_;
uint8_t* alpha_plane_; // output
int layer_colorspace_;
const uint8_t* layer_data_; // compressed layer data (if present)
size_t layer_data_size_;
};
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// internal functions. Not public.
// in vp8.c
int VP8SetError(VP8Decoder* const dec,
VP8StatusCode error, const char * const msg);
// Validates the VP8 data header and retrieves basic header information, viz.
// width and height. Returns 0 in case of formatting error. The width, height
// and has_alpha pointers may each be passed as NULL.
int VP8GetInfo(const uint8_t* data,
uint32_t data_size, // data available so far
uint32_t chunk_size, // total data size expect in the chunk
int *width, int *height, int *has_alpha);
// in tree.c
void VP8ResetProba(VP8Proba* const proba);
void VP8ParseProba(VP8BitReader* const br, VP8Decoder* const dec);
@@ -267,59 +308,38 @@ void VP8ParseQuant(VP8Decoder* const dec);
int VP8InitFrame(VP8Decoder* const dec, VP8Io* io);
// Predict a block and add residual
void VP8ReconstructBlock(VP8Decoder* const dec);
// Call io->setup() and finish setting up scan parameters.
// After this call returns, one must always call VP8ExitCritical() with the
// same parameters. Both functions should be used in pair. Returns VP8_STATUS_OK
// if ok, otherwise sets and returns the error status on *dec.
VP8StatusCode VP8EnterCritical(VP8Decoder* const dec, VP8Io* const io);
// Must always be called in pair with VP8EnterCritical().
// Returns false in case of error.
int VP8ExitCritical(VP8Decoder* const dec, VP8Io* const io);
// Filter the decoded macroblock row (if needed)
int VP8FinishRow(VP8Decoder* const dec, VP8Io* io); // multi threaded call
// Process the last decoded row (filtering + output)
int VP8ProcessRow(VP8Decoder* const dec, VP8Io* const io);
// Store a block, along with filtering params
void VP8StoreBlock(VP8Decoder* const dec);
// Finalize and transmit a complete row. Return false in case of user-abort.
int VP8FinishRow(VP8Decoder* const dec, VP8Io* io);
int VP8FinishRow(VP8Decoder* const dec, VP8Io* const io);
// To be called at the start of a new scanline, to initialize predictors.
void VP8InitScanline(VP8Decoder* const dec);
// Decode one macroblock. Returns false if there is not enough data.
int VP8DecodeMB(VP8Decoder* const dec, VP8BitReader* const token_br);
// in dsp.c
typedef void (*VP8Idct)(const int16_t* coeffs, uint8_t* dst);
extern VP8Idct VP8Transform;
extern VP8Idct VP8TransformUV;
extern VP8Idct VP8TransformDC;
extern VP8Idct VP8TransformDCUV;
extern void (*VP8TransformWHT)(const int16_t* in, int16_t* out);
// in alpha.c
const uint8_t* VP8DecompressAlphaRows(VP8Decoder* const dec,
int row, int num_rows);
// *dst is the destination block, with stride BPS. Boundary samples are
// assumed accessible when needed.
typedef void (*VP8PredFunc)(uint8_t *dst);
extern VP8PredFunc VP8PredLuma16[NUM_B_DC_MODES];
extern VP8PredFunc VP8PredChroma8[NUM_B_DC_MODES];
extern VP8PredFunc VP8PredLuma4[NUM_BMODES];
// in layer.c
int VP8DecodeLayer(VP8Decoder* const dec);
void VP8DspInit(void); // must be called before anything using the above
void VP8DspInitTables(void); // needs to be called no matter what.
// simple filter (only for luma)
typedef void (*VP8SimpleFilterFunc)(uint8_t* p, int stride, int thresh);
extern VP8SimpleFilterFunc VP8SimpleVFilter16;
extern VP8SimpleFilterFunc VP8SimpleHFilter16;
extern VP8SimpleFilterFunc VP8SimpleVFilter16i; // filter 3 inner edges
extern VP8SimpleFilterFunc VP8SimpleHFilter16i;
// regular filter (on both macroblock edges and inner edges)
typedef void (*VP8LumaFilterFunc)(uint8_t* luma, int stride,
int thresh, int ithresh, int hev_t);
typedef void (*VP8ChromaFilterFunc)(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_t);
// on outer edge
extern VP8LumaFilterFunc VP8VFilter16;
extern VP8LumaFilterFunc VP8HFilter16;
extern VP8ChromaFilterFunc VP8VFilter8;
extern VP8ChromaFilterFunc VP8HFilter8;
// on inner edge
extern VP8LumaFilterFunc VP8VFilter16i; // filtering 3 inner edges altogether
extern VP8LumaFilterFunc VP8HFilter16i;
extern VP8ChromaFilterFunc VP8VFilter8i; // filtering u and v altogether
extern VP8ChromaFilterFunc VP8HFilter8i;
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
#if defined(__cplusplus) || defined(c_plusplus)
} // extern "C"
#endif
#endif // WEBP_DEC_VP8I_H_
#endif /* WEBP_DEC_VP8I_H_ */

File diff suppressed because it is too large Load Diff

View File

@@ -9,55 +9,155 @@
//
// Author: somnath@google.com (Somnath Banerjee)
#ifndef WEBP_DEC_WEBPI_H
#define WEBP_DEC_WEBPI_H
#ifndef WEBP_DEC_WEBPI_H_
#define WEBP_DEC_WEBPI_H_
#if defined(__cplusplus) || defined(c_plusplus)
extern "C" {
#endif
#include "webp/decode_vp8.h"
#include "../webp/decode_vp8.h"
// Decoding output parameters.
//------------------------------------------------------------------------------
// WebPDecParams: Decoding output parameters. Transient internal object.
typedef struct WebPDecParams WebPDecParams;
typedef int (*OutputFunc)(const VP8Io* const io, WebPDecParams* const p);
// Structure used for on-the-fly rescaling
typedef struct {
uint8_t* output; // rgb(a) or luma
uint8_t *u, *v; // chroma u/v
uint8_t *top_y, *top_u, *top_v; // cache for the fancy upscaler
int stride; // rgb(a) stride or luma stride
int u_stride; // chroma-u stride
int v_stride; // chroma-v stride
WEBP_CSP_MODE mode; // rgb(a) or yuv
int last_y; // coordinate of the line that was last output
int output_size; // size of 'output' buffer
int output_u_size; // size of 'u' buffer
int output_v_size; // size of 'v' buffer
int external_buffer; // If true, the output buffers are externally owned
} WebPDecParams;
int x_expand; // true if we're expanding in the x direction
int fy_scale, fx_scale; // fixed-point scaling factor
int64_t fxy_scale; // ''
// we need hpel-precise add/sub increments, for the downsampled U/V planes.
int y_accum; // vertical accumulator
int y_add, y_sub; // vertical increments (add ~= src, sub ~= dst)
int x_add, x_sub; // horizontal increments (add ~= src, sub ~= dst)
int src_width, src_height; // source dimensions
int dst_width, dst_height; // destination dimensions
uint8_t* dst;
int dst_stride;
int32_t* irow, *frow; // work buffer
} WebPRescaler;
// If a RIFF container is detected, validate it and skip over it. Returns
// VP8 bit-stream size if RIFF header is valid else returns 0
uint32_t WebPCheckRIFFHeader(const uint8_t** data_ptr,
uint32_t *data_size_ptr);
struct WebPDecParams {
WebPDecBuffer* output; // output buffer.
uint8_t* tmp_y, *tmp_u, *tmp_v; // cache for the fancy upsampler
// or used for tmp rescaling
// Initializes VP8Io with custom setup, io and teardown functions
void WebPInitCustomIo(VP8Io* const io);
int last_y; // coordinate of the line that was last output
const WebPDecoderOptions* options; // if not NULL, use alt decoding features
// rescalers
WebPRescaler scaler_y, scaler_u, scaler_v, scaler_a;
void* memory; // overall scratch memory for the output work.
OutputFunc emit; // output RGB or YUV samples
OutputFunc emit_alpha; // output alpha channel
};
// Initializes params_out by allocating output buffer and setting the
// stride information. It also outputs width and height information of
// the WebP image. Returns 1 if succeeds.
int WebPInitDecParams(const uint8_t* data, uint32_t data_size, int* width,
int* height, WebPDecParams* const params_out);
// Should be called first, before any use of the WebPDecParams object.
void WebPResetDecParams(WebPDecParams* const params);
// Verifies various size configurations (e.g stride >= width, specified
// output size <= stride * height etc.). Returns 0 if checks fail.
int WebPCheckDecParams(const VP8Io* io, const WebPDecParams* params);
//------------------------------------------------------------------------------
// Header parsing helpers
// Deallocate memory allocated by WebPInitDecParams() and reset the
// WebPDecParams object.
void WebPClearDecParams(WebPDecParams* params);
#define TAG_SIZE 4
#define CHUNK_HEADER_SIZE 8
#define RIFF_HEADER_SIZE 12
#define FRAME_CHUNK_SIZE 20
#define LOOP_CHUNK_SIZE 4
#define TILE_CHUNK_SIZE 8
#define VP8X_CHUNK_SIZE 12
#define VP8_FRAME_HEADER_SIZE 10 // Size of the frame header within VP8 data.
// Validates the RIFF container (if detected) and skips over it.
// If a RIFF container is detected, returns VP8_STATUS_BITSTREAM_ERROR for an
// invalid header, and VP8_STATUS_OK otherwise.
// In case there are not enough bytes (partial RIFF container), returns 0 for
// riff_size. Otherwise returns the riff_size extracted from the header.
VP8StatusCode WebPParseRIFF(const uint8_t** data, uint32_t* data_size,
uint32_t* riff_size);
// Validates the VP8X Header and skips over it.
// Returns VP8_STATUS_BITSTREAM_ERROR for invalid VP8X header,
// VP8_STATUS_NOT_ENOUGH_DATA in case of insufficient data, and
// VP8_STATUS_OK otherwise.
// If a VP8 chunk is found, bytes_skipped is set to the total number of bytes
// that are skipped; also Width, Height & Flags are set to the corresponding
// fields extracted from the VP8X chunk.
VP8StatusCode WebPParseVP8X(const uint8_t** data, uint32_t* data_size,
uint32_t* bytes_skipped,
int* width, int* height, uint32_t* flags);
// Skips to the next VP8 chunk header in the data given the size of the RIFF
// chunk 'riff_size'.
// Returns VP8_STATUS_BITSTREAM_ERROR if any invalid chunk size is encountered,
// VP8_STATUS_NOT_ENOUGH_DATA in case of insufficient data, and
// VP8_STATUS_OK otherwise.
// If a VP8 chunk is found, bytes_skipped is set to the total number of bytes
// that are skipped.
VP8StatusCode WebPParseOptionalChunks(const uint8_t** data, uint32_t* data_size,
uint32_t riff_size,
uint32_t* bytes_skipped);
// Validates the VP8 Header ("VP8 nnnn") and skips over it.
// Returns VP8_STATUS_BITSTREAM_ERROR for invalid (vp8_chunk_size greater than
// riff_size) VP8 header,
// VP8_STATUS_NOT_ENOUGH_DATA in case of insufficient data, and
// VP8_STATUS_OK otherwise.
// If a VP8 chunk is found, bytes_skipped is set to the total number of bytes
// that are skipped and vp8_chunk_size is set to the corresponding size
// extracted from the VP8 chunk header.
// For a partial VP8 chunk, vp8_chunk_size is set to 0.
VP8StatusCode WebPParseVP8Header(const uint8_t** data, uint32_t* data_size,
uint32_t riff_size, uint32_t* bytes_skipped,
uint32_t* vp8_chunk_size);
// Skips over all valid chunks prior to the first VP8 frame header.
// Returns VP8_STATUS_OK on success,
// VP8_STATUS_BITSTREAM_ERROR if an invalid header/chunk is found, and
// VP8_STATUS_NOT_ENOUGH_DATA in case of insufficient data.
// Also, data, data_size, vp8_size & bytes_skipped are updated appropriately
// on success, where
// vp8_size is the size of VP8 chunk data (extracted from VP8 chunk header) and
// bytes_skipped is set to the total number of bytes that are skipped.
VP8StatusCode WebPParseHeaders(const uint8_t** data, uint32_t* data_size,
uint32_t* vp8_size, uint32_t* bytes_skipped);
//------------------------------------------------------------------------------
// Misc utils
// Initializes VP8Io with custom setup, io and teardown functions. The default
// hooks will use the supplied 'params' as io->opaque handle.
void WebPInitCustomIo(WebPDecParams* const params, VP8Io* const io);
//------------------------------------------------------------------------------
// Internal functions regarding WebPDecBuffer memory (in buffer.c).
// Don't really need to be externally visible for now.
// Prepare 'buffer' with the requested initial dimensions width/height.
// If no external storage is supplied, initializes buffer by allocating output
// memory and setting up the stride information. Validate the parameters. Return
// an error code in case of problem (no memory, or invalid stride / size /
// dimension / etc.). If *options is not NULL, also verify that the options'
// parameters are valid and apply them to the width/height dimensions of the
// output buffer. This takes cropping / scaling / rotation into account.
VP8StatusCode WebPAllocateDecBuffer(int width, int height,
const WebPDecoderOptions* const options,
WebPDecBuffer* const buffer);
// Copy 'src' into 'dst' buffer, making sure 'dst' is not marked as owner of the
// memory (still held by 'src').
void WebPCopyDecBuffer(const WebPDecBuffer* const src,
WebPDecBuffer* const dst);
// Copy and transfer ownership from src to dst (beware of parameter order!)
void WebPGrabDecBuffer(WebPDecBuffer* const src, WebPDecBuffer* const dst);
//------------------------------------------------------------------------------
#if defined(__cplusplus) || defined(c_plusplus)
} // extern "C"
#endif
#endif // WEBP_DEC_WEBPI_H
#endif /* WEBP_DEC_WEBPI_H_ */

View File

@@ -1,66 +0,0 @@
// Copyright 2010 Google Inc.
//
// This code is licensed under the same terms as WebM:
// Software License Agreement: http://www.webmproject.org/license/software/
// Additional IP Rights Grant: http://www.webmproject.org/license/additional/
// -----------------------------------------------------------------------------
//
// inline YUV->RGB conversion function
//
// Author: Skal (pascal.massimino@gmail.com)
#ifndef WEBP_DEC_YUV_H_
#define WEBP_DEC_YUV_H_
#include "webp/decode_vp8.h"
#if defined(__cplusplus) || defined(c_plusplus)
extern "C" {
#endif
enum { YUV_FIX = 16, // fixed-point precision
YUV_RANGE_MIN = -227, // min value of r/g/b output
YUV_RANGE_MAX = 256 + 226 // max value of r/g/b output
};
extern int16_t VP8kVToR[256], VP8kUToB[256];
extern int32_t VP8kVToG[256], VP8kUToG[256];
extern uint8_t VP8kClip[YUV_RANGE_MAX - YUV_RANGE_MIN];
// Converts one (y, u, v) sample to three bytes written as rgb[0] = R,
// rgb[1] = G, rgb[2] = B. The per-channel offsets come from the VP8kVToR /
// VP8kVToG / VP8kUToG / VP8kUToB lookup tables and the final value is clamped
// through VP8kClip, which is indexed relative to YUV_RANGE_MIN.
inline static void VP8YuvToRgb(uint8_t y, uint8_t u, uint8_t v,
                               uint8_t* const rgb) {
  // The green offset is the only one stored with YUV_FIX fractional bits.
  const int green = (VP8kVToG[v] + VP8kUToG[u]) >> YUV_FIX;
  const int red = VP8kVToR[v];
  const int blue = VP8kUToB[u];
  rgb[0] = VP8kClip[y + red - YUV_RANGE_MIN];
  rgb[1] = VP8kClip[y + green - YUV_RANGE_MIN];
  rgb[2] = VP8kClip[y + blue - YUV_RANGE_MIN];
}
// Same as VP8YuvToRgb(), plus a fourth byte rgba[3] set to 0xff.
inline static void VP8YuvToRgba(int y, int u, int v, uint8_t* const rgba) {
  uint8_t* const alpha = rgba + 3;
  VP8YuvToRgb(y, u, v, rgba);   // fills rgba[0..2]
  *alpha = 0xff;
}
// Converts one (y, u, v) sample to three bytes written as bgr[0] = B,
// bgr[1] = G, bgr[2] = R, i.e. the same values as VP8YuvToRgb() with the
// first and last byte swapped.
inline static void VP8YuvToBgr(uint8_t y, uint8_t u, uint8_t v,
                               uint8_t* const bgr) {
  // The green offset is the only one stored with YUV_FIX fractional bits.
  const int green = (VP8kVToG[v] + VP8kUToG[u]) >> YUV_FIX;
  const int red = VP8kVToR[v];
  const int blue = VP8kUToB[u];
  bgr[0] = VP8kClip[y + blue - YUV_RANGE_MIN];
  bgr[1] = VP8kClip[y + green - YUV_RANGE_MIN];
  bgr[2] = VP8kClip[y + red - YUV_RANGE_MIN];
}
// Same as VP8YuvToBgr(), plus a fourth byte bgra[3] set to 0xff.
inline static void VP8YuvToBgra(int y, int u, int v, uint8_t* const bgra) {
  uint8_t* const alpha = bgra + 3;
  VP8YuvToBgr(y, u, v, bgra);   // fills bgra[0..2]
  *alpha = 0xff;
}
// Must be called before everything, to initialize the tables.
void VP8YUVInit(void);
#if defined(__cplusplus) || defined(c_plusplus)
} // extern "C"
#endif
#endif // WEBP_DEC_YUV_H_

14
src/dsp/Makefile.am Normal file
View File

@@ -0,0 +1,14 @@
AM_CPPFLAGS = -I$(top_srcdir)/src
libwebpdsp_la_SOURCES = dsp.h cpu.c \
enc.c enc_sse2.c \
dec.c dec_sse2.c dec_neon.c \
upsampling.c upsampling_sse2.c \
yuv.h yuv.c
libwebpdsp_la_LDFLAGS = -version-info 0:0:0 -lm
libwebpdsp_la_CPPFLAGS = $(USE_EXPERIMENTAL_CODE)
libwebpdspinclude_HEADERS = ../webp/types.h
libwebpdspincludedir = $(includedir)/webp
noinst_HEADERS = dsp.h yuv.h
noinst_LTLIBRARIES = libwebpdsp.la

70
src/dsp/cpu.c Normal file
View File

@@ -0,0 +1,70 @@
// Copyright 2011 Google Inc.
//
// This code is licensed under the same terms as WebM:
// Software License Agreement: http://www.webmproject.org/license/software/
// Additional IP Rights Grant: http://www.webmproject.org/license/additional/
// -----------------------------------------------------------------------------
//
// CPU detection
//
// Author: Christian Duvivier (cduvivier@google.com)
#include <stddef.h> // for NULL
#include "./dsp.h"
#if defined(__cplusplus) || defined(c_plusplus)
extern "C" {
#endif
//------------------------------------------------------------------------------
// SSE2 detection.
//
#if defined(__pic__) && defined(__i386__)
// Executes the CPUID instruction with 'info_type' in eax and stores the
// resulting eax / ebx / ecx / edx values in cpu_info[0..3].
// This variant is compiled when both __pic__ and __i386__ are defined: ebx is
// copied to edi before 'cpuid' and exchanged back afterwards, so the value
// CPUID produced in ebx is returned through edi while ebx itself ends up
// unchanged.
// NOTE(review): presumably needed because ebx is reserved (GOT pointer) in
// i386 PIC code and cannot be used as an asm output -- confirm.
static inline void GetCPUInfo(int cpu_info[4], int info_type) {
__asm__ volatile (
"mov %%ebx, %%edi\n"    // save ebx
"cpuid\n"
"xchg %%edi, %%ebx\n"   // edi <- CPUID's ebx result, ebx <- saved value
: "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
: "a"(info_type));
}
#elif defined(__i386__) || defined(__x86_64__)
// Executes the CPUID instruction with 'info_type' in eax and stores the
// resulting eax / ebx / ecx / edx values in cpu_info[0..3].
// Variant used on x86 / x86-64 when the i386 PIC variant does not apply; the
// four registers are bound directly as asm outputs.
static inline void GetCPUInfo(int cpu_info[4], int info_type) {
__asm__ volatile (
"cpuid\n"
: "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
: "a"(info_type));
}
#elif defined(_MSC_VER) // Visual C++
#define GetCPUInfo __cpuid
#endif
#if defined(__i386__) || defined(__x86_64__) || defined(_MSC_VER)
// Queries CPUID (info type 1) and reports whether 'feature' is available.
// Returns 1 if the corresponding feature bit is set, 0 if it is clear or if
// 'feature' is neither kSSE2 nor kSSE3.
static int x86CPUInfo(CPUFeature feature) {
  int regs[4];         // filled by GetCPUInfo()
  int supported = 0;
  GetCPUInfo(regs, 1);
  if (feature == kSSE2) {
    supported = (regs[3] & 0x04000000) != 0;   // bit 26 of the 4th word
  } else if (feature == kSSE3) {
    supported = (regs[2] & 0x00000001) != 0;   // bit 0 of the 3rd word
  }
  return supported;
}
VP8CPUInfo VP8GetCPUInfo = x86CPUInfo;
#elif defined(__ARM_NEON__)
// Dummy feature probe for builds where __ARM_NEON__ is defined: it reports
// every queried feature as available (always returns 1).
// Having a function here (rather than a NULL pointer) makes it possible to
// turn NEON off at runtime by setting VP8GetCPUInfo = NULL.
static int armCPUInfo(CPUFeature feature) {
  (void)feature;   // the answer does not depend on which feature is asked for
  return 1;
}
VP8CPUInfo VP8GetCPUInfo = armCPUInfo;
#else
VP8CPUInfo VP8GetCPUInfo = NULL;
#endif
#if defined(__cplusplus) || defined(c_plusplus)
} // extern "C"
#endif

View File

@@ -5,21 +5,18 @@
// Additional IP Rights Grant: http://www.webmproject.org/license/additional/
// -----------------------------------------------------------------------------
//
// speed-critical functions.
// Speed-critical decoding functions.
//
// Author: Skal (pascal.massimino@gmail.com)
#include "vp8i.h"
#if defined(__SSE2__)
#include <emmintrin.h>
#endif
#include "./dsp.h"
#include "../dec/vp8i.h"
#if defined(__cplusplus) || defined(c_plusplus)
extern "C" {
#endif
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// run-time tables (~4k)
static uint8_t abs0[255 + 255 + 1]; // abs(i)
@@ -32,7 +29,7 @@ static uint8_t clip1[255 + 510 + 1]; // clips [-255,510] to [0,255]
// and make sure it's set to true _last_ (so as to be thread-safe)
static volatile int tables_ok = 0;
void VP8DspInitTables(void) {
static void DspInitTables(void) {
if (!tables_ok) {
int i;
for (i = -255; i <= 255; ++i) {
@@ -56,7 +53,7 @@ static inline uint8_t clip_8b(int v) {
return (!(v & ~0xff)) ? v : (v < 0) ? 0 : 255;
}
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// Transforms (Paragraph 14.4)
#define STORE(x, y, v) \
@@ -66,7 +63,7 @@ static const int kC1 = 20091 + (1 << 16);
static const int kC2 = 35468;
#define MUL(a, b) (((a) * (b)) >> 16)
static void Transform(const int16_t* in, uint8_t* dst) {
static void TransformOne(const int16_t* in, uint8_t* dst) {
int C[4 * 4], *tmp;
int i;
tmp = C;
@@ -106,11 +103,16 @@ static void Transform(const int16_t* in, uint8_t* dst) {
}
#undef MUL
// Applies TransformOne() to the block at 'in' / 'dst' and, when 'do_two' is
// non-zero, also to the next block: 16 coefficients further in 'in' and
// 4 bytes further in 'dst'.
static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
  const int num_blocks = do_two ? 2 : 1;
  int n;
  for (n = 0; n < num_blocks; ++n) {
    TransformOne(in + 16 * n, dst + 4 * n);
  }
}
// Inverse-transforms the four consecutive 16-coefficient blocks stored in
// 'in' and adds them into 'dst' (stride BPS).
// Each VP8Transform() call is made with do_two = 1, so it handles a pair of
// blocks: blocks 0 and 1 at 'dst', then blocks 2 and 3 at 'dst + 4 * BPS'.
// The earlier per-block Transform() calls are dropped: they referred to the
// pre-refactoring name and would have applied every block a second time.
static void TransformUV(const int16_t* in, uint8_t* dst) {
  VP8Transform(in + 0 * 16, dst, 1);
  VP8Transform(in + 2 * 16, dst + 4 * BPS, 1);
}
static void TransformDC(const int16_t *in, uint8_t* dst) {
@@ -132,13 +134,7 @@ static void TransformDCUV(const int16_t* in, uint8_t* dst) {
#undef STORE
// default C implementations:
VP8Idct VP8Transform = Transform;
VP8Idct VP8TransformUV = TransformUV;
VP8Idct VP8TransformDC = TransformDC;
VP8Idct VP8TransformDCUV = TransformDCUV;
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// Paragraph 14.3
static void TransformWHT(const int16_t* in, int16_t* out) {
@@ -170,10 +166,10 @@ static void TransformWHT(const int16_t* in, int16_t* out) {
void (*VP8TransformWHT)(const int16_t* in, int16_t* out) = TransformWHT;
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// Intra predictions
#define OUT(x, y) dst[(x) + (y) * BPS]
#define DST(x, y) dst[(x) + (y) * BPS]
static inline void TrueMotion(uint8_t *dst, int size) {
const uint8_t* top = dst - BPS;
@@ -192,7 +188,7 @@ static void TM4(uint8_t *dst) { TrueMotion(dst, 4); }
static void TM8uv(uint8_t *dst) { TrueMotion(dst, 8); }
static void TM16(uint8_t *dst) { TrueMotion(dst, 16); }
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// 16x16
static void VE16(uint8_t *dst) { // vertical
@@ -248,7 +244,7 @@ static void DC16NoTopLeft(uint8_t *dst) { // DC with no top and left samples
Put16(0x80, dst);
}
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// 4x4
#define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2)
@@ -298,13 +294,13 @@ static void RD4(uint8_t *dst) { // Down-right
const int B = dst[1 - BPS];
const int C = dst[2 - BPS];
const int D = dst[3 - BPS];
OUT(0, 3) = AVG3(J, K, L);
OUT(0, 2) = OUT(1, 3) = AVG3(I, J, K);
OUT(0, 1) = OUT(1, 2) = OUT(2, 3) = AVG3(X, I, J);
OUT(0, 0) = OUT(1, 1) = OUT(2, 2) = OUT(3, 3) = AVG3(A, X, I);
OUT(1, 0) = OUT(2, 1) = OUT(3, 2) = AVG3(B, A, X);
OUT(2, 0) = OUT(3, 1) = AVG3(C, B, A);
OUT(3, 0) = AVG3(D, C, B);
DST(0, 3) = AVG3(J, K, L);
DST(0, 2) = DST(1, 3) = AVG3(I, J, K);
DST(0, 1) = DST(1, 2) = DST(2, 3) = AVG3(X, I, J);
DST(0, 0) = DST(1, 1) = DST(2, 2) = DST(3, 3) = AVG3(A, X, I);
DST(1, 0) = DST(2, 1) = DST(3, 2) = AVG3(B, A, X);
DST(2, 0) = DST(3, 1) = AVG3(C, B, A);
DST(3, 0) = AVG3(D, C, B);
}
static void LD4(uint8_t *dst) { // Down-Left
@@ -316,13 +312,13 @@ static void LD4(uint8_t *dst) { // Down-Left
const int F = dst[5 - BPS];
const int G = dst[6 - BPS];
const int H = dst[7 - BPS];
OUT(0, 0) = AVG3(A, B, C);
OUT(1, 0) = OUT(0, 1) = AVG3(B, C, D);
OUT(2, 0) = OUT(1, 1) = OUT(0, 2) = AVG3(C, D, E);
OUT(3, 0) = OUT(2, 1) = OUT(1, 2) = OUT(0, 3) = AVG3(D, E, F);
OUT(3, 1) = OUT(2, 2) = OUT(1, 3) = AVG3(E, F, G);
OUT(3, 2) = OUT(2, 3) = AVG3(F, G, H);
OUT(3, 3) = AVG3(G, H, H);
DST(0, 0) = AVG3(A, B, C);
DST(1, 0) = DST(0, 1) = AVG3(B, C, D);
DST(2, 0) = DST(1, 1) = DST(0, 2) = AVG3(C, D, E);
DST(3, 0) = DST(2, 1) = DST(1, 2) = DST(0, 3) = AVG3(D, E, F);
DST(3, 1) = DST(2, 2) = DST(1, 3) = AVG3(E, F, G);
DST(3, 2) = DST(2, 3) = AVG3(F, G, H);
DST(3, 3) = AVG3(G, H, H);
}
static void VR4(uint8_t *dst) { // Vertical-Right
@@ -334,17 +330,17 @@ static void VR4(uint8_t *dst) { // Vertical-Right
const int B = dst[1 - BPS];
const int C = dst[2 - BPS];
const int D = dst[3 - BPS];
OUT(0, 0) = OUT(1, 2) = AVG2(X, A);
OUT(1, 0) = OUT(2, 2) = AVG2(A, B);
OUT(2, 0) = OUT(3, 2) = AVG2(B, C);
OUT(3, 0) = AVG2(C, D);
DST(0, 0) = DST(1, 2) = AVG2(X, A);
DST(1, 0) = DST(2, 2) = AVG2(A, B);
DST(2, 0) = DST(3, 2) = AVG2(B, C);
DST(3, 0) = AVG2(C, D);
OUT(0, 3) = AVG3(K, J, I);
OUT(0, 2) = AVG3(J, I, X);
OUT(0, 1) = OUT(1, 3) = AVG3(I, X, A);
OUT(1, 1) = OUT(2, 3) = AVG3(X, A, B);
OUT(2, 1) = OUT(3, 3) = AVG3(A, B, C);
OUT(3, 1) = AVG3(B, C, D);
DST(0, 3) = AVG3(K, J, I);
DST(0, 2) = AVG3(J, I, X);
DST(0, 1) = DST(1, 3) = AVG3(I, X, A);
DST(1, 1) = DST(2, 3) = AVG3(X, A, B);
DST(2, 1) = DST(3, 3) = AVG3(A, B, C);
DST(3, 1) = AVG3(B, C, D);
}
static void VL4(uint8_t *dst) { // Vertical-Left
@@ -356,17 +352,17 @@ static void VL4(uint8_t *dst) { // Vertical-Left
const int F = dst[5 - BPS];
const int G = dst[6 - BPS];
const int H = dst[7 - BPS];
OUT(0, 0) = AVG2(A, B);
OUT(1, 0) = OUT(0, 2) = AVG2(B, C);
OUT(2, 0) = OUT(1, 2) = AVG2(C, D);
OUT(3, 0) = OUT(2, 2) = AVG2(D, E);
DST(0, 0) = AVG2(A, B);
DST(1, 0) = DST(0, 2) = AVG2(B, C);
DST(2, 0) = DST(1, 2) = AVG2(C, D);
DST(3, 0) = DST(2, 2) = AVG2(D, E);
OUT(0, 1) = AVG3(A, B, C);
OUT(1, 1) = OUT(0, 3) = AVG3(B, C, D);
OUT(2, 1) = OUT(1, 3) = AVG3(C, D, E);
OUT(3, 1) = OUT(2, 3) = AVG3(D, E, F);
OUT(3, 2) = AVG3(E, F, G);
OUT(3, 3) = AVG3(F, G, H);
DST(0, 1) = AVG3(A, B, C);
DST(1, 1) = DST(0, 3) = AVG3(B, C, D);
DST(2, 1) = DST(1, 3) = AVG3(C, D, E);
DST(3, 1) = DST(2, 3) = AVG3(D, E, F);
DST(3, 2) = AVG3(E, F, G);
DST(3, 3) = AVG3(F, G, H);
}
static void HU4(uint8_t *dst) { // Horizontal-Up
@@ -374,14 +370,14 @@ static void HU4(uint8_t *dst) { // Horizontal-Up
const int J = dst[-1 + 1 * BPS];
const int K = dst[-1 + 2 * BPS];
const int L = dst[-1 + 3 * BPS];
OUT(0, 0) = AVG2(I, J);
OUT(2, 0) = OUT(0, 1) = AVG2(J, K);
OUT(2, 1) = OUT(0, 2) = AVG2(K, L);
OUT(1, 0) = AVG3(I, J, K);
OUT(3, 0) = OUT(1, 1) = AVG3(J, K, L);
OUT(3, 1) = OUT(1, 2) = AVG3(K, L, L);
OUT(3, 2) = OUT(2, 2) =
OUT(0, 3) = OUT(1, 3) = OUT(2, 3) = OUT(3, 3) = L;
DST(0, 0) = AVG2(I, J);
DST(2, 0) = DST(0, 1) = AVG2(J, K);
DST(2, 1) = DST(0, 2) = AVG2(K, L);
DST(1, 0) = AVG3(I, J, K);
DST(3, 0) = DST(1, 1) = AVG3(J, K, L);
DST(3, 1) = DST(1, 2) = AVG3(K, L, L);
DST(3, 2) = DST(2, 2) =
DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
}
static void HD4(uint8_t *dst) { // Horizontal-Down
@@ -394,23 +390,24 @@ static void HD4(uint8_t *dst) { // Horizontal-Down
const int B = dst[1 - BPS];
const int C = dst[2 - BPS];
OUT(0, 0) = OUT(2, 1) = AVG2(I, X);
OUT(0, 1) = OUT(2, 2) = AVG2(J, I);
OUT(0, 2) = OUT(2, 3) = AVG2(K, J);
OUT(0, 3) = AVG2(L, K);
DST(0, 0) = DST(2, 1) = AVG2(I, X);
DST(0, 1) = DST(2, 2) = AVG2(J, I);
DST(0, 2) = DST(2, 3) = AVG2(K, J);
DST(0, 3) = AVG2(L, K);
OUT(3, 0) = AVG3(A, B, C);
OUT(2, 0) = AVG3(X, A, B);
OUT(1, 0) = OUT(3, 1) = AVG3(I, X, A);
OUT(1, 1) = OUT(3, 2) = AVG3(J, I, X);
OUT(1, 2) = OUT(3, 3) = AVG3(K, J, I);
OUT(1, 3) = AVG3(L, K, J);
DST(3, 0) = AVG3(A, B, C);
DST(2, 0) = AVG3(X, A, B);
DST(1, 0) = DST(3, 1) = AVG3(I, X, A);
DST(1, 1) = DST(3, 2) = AVG3(J, I, X);
DST(1, 2) = DST(3, 3) = AVG3(K, J, I);
DST(1, 3) = AVG3(L, K, J);
}
#undef DST
#undef AVG3
#undef AVG2
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// Chroma
static void VE8uv(uint8_t *dst) { // vertical
@@ -467,24 +464,24 @@ static void DC8uvNoTopLeft(uint8_t *dst) { // DC with nothing
Put8x8uv(0x8080808080808080ULL, dst);
}
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// default C implementations
VP8PredFunc VP8PredLuma4[NUM_BMODES] = {
VP8PredFunc VP8PredLuma4[/* NUM_BMODES */] = {
DC4, TM4, VE4, HE4, RD4, VR4, LD4, VL4, HD4, HU4
};
VP8PredFunc VP8PredLuma16[NUM_B_DC_MODES] = {
VP8PredFunc VP8PredLuma16[/*NUM_B_DC_MODES */] = {
DC16, TM16, VE16, HE16,
DC16NoTop, DC16NoLeft, DC16NoTopLeft
};
VP8PredFunc VP8PredChroma8[NUM_B_DC_MODES] = {
VP8PredFunc VP8PredChroma8[/*NUM_B_DC_MODES */] = {
DC8uv, TM8uv, VE8uv, HE8uv,
DC8uvNoTop, DC8uvNoLeft, DC8uvNoTopLeft
};
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// Edge filtering functions
// 4 pixels in, 2 pixels out
@@ -546,7 +543,7 @@ static inline int needs_filter2(const uint8_t* p, int step, int t, int it) {
abs0[255 + q2 - q1] <= it && abs0[255 + q1 - q0] <= it;
}
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// Simple In-loop filtering (Paragraph 15.2)
static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
@@ -583,7 +580,7 @@ static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
}
}
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// Complex In-loop filtering (Paragraph 15.3)
static inline void FilterLoop26(uint8_t* p, int hstride, int vstride, int size,
@@ -669,26 +666,62 @@ static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
FilterLoop24(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
}
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
void (*VP8VFilter16)(uint8_t*, int, int, int, int) = VFilter16;
void (*VP8HFilter16)(uint8_t*, int, int, int, int) = HFilter16;
void (*VP8VFilter8)(uint8_t*, uint8_t*, int, int, int, int) = VFilter8;
void (*VP8HFilter8)(uint8_t*, uint8_t*, int, int, int, int) = HFilter8;
void (*VP8VFilter16i)(uint8_t*, int, int, int, int) = VFilter16i;
void (*VP8HFilter16i)(uint8_t*, int, int, int, int) = HFilter16i;
void (*VP8VFilter8i)(uint8_t*, uint8_t*, int, int, int, int) = VFilter8i;
void (*VP8HFilter8i)(uint8_t*, uint8_t*, int, int, int, int) = HFilter8i;
VP8DecIdct2 VP8Transform;
VP8DecIdct VP8TransformUV;
VP8DecIdct VP8TransformDC;
VP8DecIdct VP8TransformDCUV;
void (*VP8SimpleVFilter16)(uint8_t*, int, int) = SimpleVFilter16;
void (*VP8SimpleHFilter16)(uint8_t*, int, int) = SimpleHFilter16;
void (*VP8SimpleVFilter16i)(uint8_t*, int, int) = SimpleVFilter16i;
void (*VP8SimpleHFilter16i)(uint8_t*, int, int) = SimpleHFilter16i;
VP8LumaFilterFunc VP8VFilter16;
VP8LumaFilterFunc VP8HFilter16;
VP8ChromaFilterFunc VP8VFilter8;
VP8ChromaFilterFunc VP8HFilter8;
VP8LumaFilterFunc VP8VFilter16i;
VP8LumaFilterFunc VP8HFilter16i;
VP8ChromaFilterFunc VP8VFilter8i;
VP8ChromaFilterFunc VP8HFilter8i;
VP8SimpleFilterFunc VP8SimpleVFilter16;
VP8SimpleFilterFunc VP8SimpleHFilter16;
VP8SimpleFilterFunc VP8SimpleVFilter16i;
VP8SimpleFilterFunc VP8SimpleHFilter16i;
//-----------------------------------------------------------------------------
extern void VP8DspInitSSE2(void);
extern void VP8DspInitNEON(void);
// Initializes the decoder DSP layer: builds the run-time lookup tables, points
// every VP8* function pointer at its plain-C implementation, and then lets a
// CPU-specific initializer override some of them when VP8GetCPUInfo is set and
// reports the matching feature.
void VP8DspInit(void) {
  DspInitTables();

  // Inverse transforms.
  VP8TransformDCUV = TransformDCUV;
  VP8TransformDC = TransformDC;
  VP8TransformUV = TransformUV;
  VP8Transform = TransformTwo;

  // Simple in-loop filter (luma only).
  VP8SimpleHFilter16i = SimpleHFilter16i;
  VP8SimpleVFilter16i = SimpleVFilter16i;
  VP8SimpleHFilter16 = SimpleHFilter16;
  VP8SimpleVFilter16 = SimpleVFilter16;

  // Regular in-loop filter: macroblock edges, then inner edges.
  VP8HFilter8 = HFilter8;
  VP8VFilter8 = VFilter8;
  VP8HFilter16 = HFilter16;
  VP8VFilter16 = VFilter16;
  VP8HFilter8i = HFilter8i;
  VP8VFilter8i = VFilter8i;
  VP8HFilter16i = HFilter16i;
  VP8VFilter16i = VFilter16i;

  // Without a CPU-info hook there is nothing to override.
  if (!VP8GetCPUInfo) {
    return;
  }
#if defined(__SSE2__) || defined(_MSC_VER)
  if (VP8GetCPUInfo(kSSE2)) {
    VP8DspInitSSE2();
  }
#elif defined(__GNUC__) && defined(__ARM_NEON__)
  if (VP8GetCPUInfo(kNEON)) {
    VP8DspInitNEON();
  }
#endif
}
#if defined(__cplusplus) || defined(c_plusplus)

168
src/dsp/dec_neon.c Normal file
View File

@@ -0,0 +1,168 @@
// Copyright 2011 Google Inc.
//
// This code is licensed under the same terms as WebM:
// Software License Agreement: http://www.webmproject.org/license/software/
// Additional IP Rights Grant: http://www.webmproject.org/license/additional/
// -----------------------------------------------------------------------------
//
// ARM NEON version of dsp functions and loop filtering.
//
// Author: somnath@google.com (Somnath Banerjee)
#if defined(__GNUC__) && defined(__ARM_NEON__)
#include "../dec/vp8i.h"
#if defined(__cplusplus) || defined(c_plusplus)
extern "C" {
#endif
// List of all sixteen NEON quad registers (q0..q15), written as string
// literals so it can be dropped into the clobber list of an asm statement.
#define QRegs "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", \
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
// Emits asm text that XORs registers 'a' and 'b', in place, with register 's'.
// (No trailing line continuation after the last line: a dangling backslash
// would splice whatever follows into the macro body.)
#define FLIP_SIGN_BIT2(a, b, s) \
  "veor " #a "," #a "," #s " \n" \
  "veor " #b "," #b "," #s " \n"

// Same as FLIP_SIGN_BIT2, applied to the four registers 'a', 'b', 'c', 'd'.
#define FLIP_SIGN_BIT4(a, b, c, d, s) \
  FLIP_SIGN_BIT2(a, b, s) \
  FLIP_SIGN_BIT2(c, d, s)
// Emits asm text computing the per-lane filter mask:
//   mask = (2 * abs(p0 - q0) + abs(p1 - q1) / 2 <= thresh)
// 'p1', 'p0', 'q0', 'q1' and 'mask' are NEON registers; 'thresh' is a core
// register whose low byte is broadcast to every lane. q14 and q15 are used as
// scratch registers and are overwritten.
#define NEEDS_FILTER(p1, p0, q0, q1, thresh, mask) \
"vabd.u8 q15," #p0 "," #q0 " \n" /* abs(p0 - q0) */ \
"vabd.u8 q14," #p1 "," #q1 " \n" /* abs(p1 - q1) */ \
"vqadd.u8 q15, q15, q15 \n" /* abs(p0 - q0) * 2 */ \
"vshr.u8 q14, q14, #1 \n" /* abs(p1 - q1) / 2 */ \
"vqadd.u8 q15, q15, q14 \n" /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */ \
"vdup.8 q14, " #thresh " \n" /* broadcast thresh to all lanes */ \
"vcge.u8 " #mask ", q14, q15 \n" /* mask = (thresh >= sum) */
// Emits asm text computing, with signed saturating arithmetic,
//   o = (p1 - q1) + 3 * (q0 - p0)
// q15 is used as a scratch register and is overwritten.
#define GET_BASE_DELTA(p1, p0, q0, q1, o) \
"vqsub.s8 q15," #q0 "," #p0 " \n" /* (q0 - p0) */ \
"vqsub.s8 " #o "," #p1 "," #q1 " \n" /* (p1 - q1) */ \
"vqadd.s8 " #o "," #o ", q15 \n" /* (p1 - q1) + 1 * (q0 - p0) */ \
"vqadd.s8 " #o "," #o ", q15 \n" /* (p1 - q1) + 2 * (q0 - p0) */ \
"vqadd.s8 " #o "," #o ", q15 \n" /* (p1 - q1) + 3 * (q0 - p0) */
// Emits asm text applying the filter value 'fl' to the two pixels next to the
// edge, with signed saturating arithmetic:
//   p0 += (fl + 3) >> 3
//   q0 -= (fl + 4) >> 3
// q15 is used as a scratch register and is overwritten.
#define DO_SIMPLE_FILTER(p0, q0, fl) \
"vmov.i8 q15, #0x03 \n" \
"vqadd.s8 q15, q15, " #fl " \n" /* filter1 = filter + 3 */ \
"vshr.s8 q15, q15, #3 \n" /* filter1 >> 3 */ \
"vqadd.s8 " #p0 "," #p0 ", q15 \n" /* p0 += filter1 */ \
\
"vmov.i8 q15, #0x04 \n" \
"vqadd.s8 q15, q15, " #fl " \n" /* filter2 = filter + 4 */ \
"vshr.s8 q15, q15, #3 \n" /* filter2 >> 3 */ \
"vqsub.s8 " #q0 "," #q0 ", q15 \n" /* q0 -= filter2 */
// Applies filter on 2 pixels (p0 and q0)
// Sequence: build the filter mask (q9), flip the sign bit of all four inputs
// with 0x80 (q10), compute the base delta (q11), zero it where the mask is
// clear, apply it to p0 / q0, then flip the sign bit of p0 / q0 back.
// Besides q9..q11, the sub-macros also overwrite q14 and q15; p1 and q1 are
// left with their sign bit flipped.
#define DO_FILTER2(p1, p0, q0, q1, thresh) \
NEEDS_FILTER(p1, p0, q0, q1, thresh, q9) /* filter mask in q9 */ \
"vmov.i8 q10, #0x80 \n" /* sign bit */ \
FLIP_SIGN_BIT4(p1, p0, q0, q1, q10) /* convert to signed value */ \
GET_BASE_DELTA(p1, p0, q0, q1, q11) /* get filter level */ \
"vand q9, q9, q11 \n" /* apply filter mask */ \
DO_SIMPLE_FILTER(p0, q0, q9) /* apply filter */ \
FLIP_SIGN_BIT2(p0, q0, q10) /* convert p0 / q0 back to unsigned */
// Load/Store vertical edge
// LOAD8x4 emits eight 'vld4.8' single-lane loads filling lanes 0..7 of the
// registers c1..c4. Loads alternate between the address operands 'b1' (even
// lanes) and 'b2' (odd lanes), and each load post-increments its address
// operand by 'stride'.
#define LOAD8x4(c1, c2, c3, c4, b1, b2, stride) \
"vld4.8 {" #c1"[0], " #c2"[0], " #c3"[0], " #c4"[0]}," #b1 "," #stride"\n" \
"vld4.8 {" #c1"[1], " #c2"[1], " #c3"[1], " #c4"[1]}," #b2 "," #stride"\n" \
"vld4.8 {" #c1"[2], " #c2"[2], " #c3"[2], " #c4"[2]}," #b1 "," #stride"\n" \
"vld4.8 {" #c1"[3], " #c2"[3], " #c3"[3], " #c4"[3]}," #b2 "," #stride"\n" \
"vld4.8 {" #c1"[4], " #c2"[4], " #c3"[4], " #c4"[4]}," #b1 "," #stride"\n" \
"vld4.8 {" #c1"[5], " #c2"[5], " #c3"[5], " #c4"[5]}," #b2 "," #stride"\n" \
"vld4.8 {" #c1"[6], " #c2"[6], " #c3"[6], " #c4"[6]}," #b1 "," #stride"\n" \
"vld4.8 {" #c1"[7], " #c2"[7], " #c3"[7], " #c4"[7]}," #b2 "," #stride"\n"
// STORE8x2 emits eight 'vst2.8' single-lane stores writing lanes 0..7 of the
// registers c1 and c2 through the address operand 'p', which is
// post-incremented by 'stride' after every store.
#define STORE8x2(c1, c2, p,stride) \
"vst2.8 {" #c1"[0], " #c2"[0]}," #p "," #stride " \n" \
"vst2.8 {" #c1"[1], " #c2"[1]}," #p "," #stride " \n" \
"vst2.8 {" #c1"[2], " #c2"[2]}," #p "," #stride " \n" \
"vst2.8 {" #c1"[3], " #c2"[3]}," #p "," #stride " \n" \
"vst2.8 {" #c1"[4], " #c2"[4]}," #p "," #stride " \n" \
"vst2.8 {" #c1"[5], " #c2"[5]}," #p "," #stride " \n" \
"vst2.8 {" #c1"[6], " #c2"[6]}," #p "," #stride " \n" \
"vst2.8 {" #c1"[7], " #c2"[7]}," #p "," #stride " \n"
//-----------------------------------------------------------------------------
// Simple In-loop filtering (Paragraph 15.2)
// Simple filter across a horizontal edge, 16 pixels wide.
// 'p' points at the q0 row; rows p - 2 * stride (p1), p - stride (p0),
// p (q0) and p + stride (q1) are read, and only the p0 and q0 rows are
// written back.
static void SimpleVFilter16NEON(uint8_t* p, int stride, int thresh) {
  __asm__ volatile (
    "sub %[p], %[p], %[stride], lsl #1 \n" // p -= 2 * stride
    "vld1.u8 {q1}, [%[p]], %[stride] \n" // p1
    "vld1.u8 {q2}, [%[p]], %[stride] \n" // p0
    "vld1.u8 {q3}, [%[p]], %[stride] \n" // q0
    "vld1.u8 {q4}, [%[p]] \n" // q1
    DO_FILTER2(q1, q2, q3, q4, %[thresh])
    // p was left on the q1 row by the loads: step back to the p0 row.
    "sub %[p], %[p], %[stride], lsl #1 \n" // p -= 2 * stride
    "vst1.u8 {q2}, [%[p]], %[stride] \n" // store op0
    "vst1.u8 {q3}, [%[p]] \n" // store oq0
    : [p] "+r"(p)
    : [stride] "r"(stride), [thresh] "r"(thresh)
    : "memory", QRegs
  );
}
// Simple filter across a vertical edge, 16 rows tall.
// 'p' points at the q0 column of the first row; columns p[-2] (p1), p[-1]
// (p0), p[0] (q0) and p[1] (q1) are read for each row, and only the p0 and q0
// columns are written back. r4, r5 and r6 are used as scratch registers.
static void SimpleHFilter16NEON(uint8_t* p, int stride, int thresh) {
  __asm__ volatile (
    "sub r4, %[p], #2 \n" // base1 = p - 2
    "lsl r6, %[stride], #1 \n" // r6 = 2 * stride
    "add r5, r4, %[stride] \n" // base2 = base1 + stride
    // Rows 0..7 land in d2..d5 and rows 8..15 in d6..d9, one column each.
    LOAD8x4(d2, d3, d4, d5, [r4], [r5], r6)
    LOAD8x4(d6, d7, d8, d9, [r4], [r5], r6)
    // Regroup the d-halves so that every q register holds one full column.
    "vswp d3, d6 \n" // p1:q1 p0:q3
    "vswp d5, d8 \n" // q0:q2 q1:q4
    "vswp q2, q3 \n" // p1:q1 p0:q2 q0:q3 q1:q4
    DO_FILTER2(q1, q2, q3, q4, %[thresh])
    "sub %[p], %[p], #1 \n" // p - 1
    // Pair p0/q0 per group of rows: d4:d5 = rows 0..7, d6:d7 = rows 8..15.
    "vswp d5, d6 \n"
    STORE8x2(d4, d5, [%[p]], %[stride])
    STORE8x2(d6, d7, [%[p]], %[stride])
    : [p] "+r"(p)
    : [stride] "r"(stride), [thresh] "r"(thresh)
    : "memory", "r4", "r5", "r6", QRegs
  );
}
// Runs the simple vertical filter on the three inner horizontal edges,
// located 4, 8 and 12 rows below 'p'.
static void SimpleVFilter16iNEON(uint8_t* p, int stride, int thresh) {
  const int edge_step = 4 * stride;
  int edge;
  for (edge = 0; edge < 3; ++edge) {
    p += edge_step;
    SimpleVFilter16NEON(p, stride, thresh);
  }
}
// Runs the simple horizontal filter on the three inner vertical edges,
// located 4, 8 and 12 pixels to the right of 'p'.
static void SimpleHFilter16iNEON(uint8_t* p, int stride, int thresh) {
  int edge;
  for (edge = 0; edge < 3; ++edge) {
    p += 4;
    SimpleHFilter16NEON(p, stride, thresh);
  }
}
extern void VP8DspInitNEON(void);

// Points the simple loop-filter function pointers at the NEON versions.
void VP8DspInitNEON(void) {
  // macroblock edges
  VP8SimpleVFilter16 = SimpleVFilter16NEON;
  VP8SimpleHFilter16 = SimpleHFilter16NEON;
  // inner edges
  VP8SimpleVFilter16i = SimpleVFilter16iNEON;
  VP8SimpleHFilter16i = SimpleHFilter16iNEON;
}
#if defined(__cplusplus) || defined(c_plusplus)
} // extern "C"
#endif
#endif // __GNUC__ && __ARM_NEON__

898
src/dsp/dec_sse2.c Normal file
View File

@@ -0,0 +1,898 @@
// Copyright 2011 Google Inc.
//
// This code is licensed under the same terms as WebM:
// Software License Agreement: http://www.webmproject.org/license/software/
// Additional IP Rights Grant: http://www.webmproject.org/license/additional/
// -----------------------------------------------------------------------------
//
// SSE2 version of some decoding functions (idct, loop filtering).
//
// Author: somnath@google.com (Somnath Banerjee)
// cduvivier@google.com (Christian Duvivier)
#if defined(__SSE2__) || defined(_MSC_VER)
#include <emmintrin.h>
#include <string.h>  // memcpy
#include "../dec/vp8i.h"
#if defined(__cplusplus) || defined(c_plusplus)
extern "C" {
#endif
//------------------------------------------------------------------------------
// Transforms (Paragraph 14.4)
// Inverse transform of one or two 4x4 blocks of coefficients; the result is
// added to the pixels already present in 'dst' and clamped to [0, 255].
//   in:     16 coefficients per block. When 'do_two' is set, the second
//           block is read from in + 16.
//   dst:    destination pixels, with rows BPS bytes apart. When 'do_two' is
//           set, the second block covers the 4 pixels to the right of the
//           first one.
//   do_two: non-zero to process two blocks at once.
static void TransformSSE2(const int16_t* in, uint8_t* dst, int do_two) {
  // This implementation makes use of 16-bit fixed point versions of two
  // multiply constants:
  //    K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16
  //    K2 = sqrt(2) * sin (pi/8) ~= 35468 / 2^16
  //
  // To be able to use signed 16-bit integers, we use the following trick to
  // have constants within range:
  // - Associated constants are obtained by subtracting the 16-bit fixed point
  //   version of one:
  //      k = K - (1 << 16)  =>  K = k + (1 << 16)
  //      K1 = 85627  =>  k1 =  20091
  //      K2 = 35468  =>  k2 = -30068
  // - The multiplication of a variable by a constant becomes the sum of the
  //   variable and the multiplication of that variable by the associated
  //   constant:
  //      (x * K) >> 16 = (x * (k + (1 << 16))) >> 16 = ((x * k ) >> 16) + x
  const __m128i k1 = _mm_set1_epi16(20091);
  const __m128i k2 = _mm_set1_epi16(-30068);
  __m128i T0, T1, T2, T3;

  // Load and concatenate the transform coefficients (we'll do two transforms
  // in parallel). In the case of only one transform, the second half of the
  // vectors will just contain random value we'll never use nor store.
  __m128i in0, in1, in2, in3;
  {
    in0 = _mm_loadl_epi64((const __m128i*)&in[0]);
    in1 = _mm_loadl_epi64((const __m128i*)&in[4]);
    in2 = _mm_loadl_epi64((const __m128i*)&in[8]);
    in3 = _mm_loadl_epi64((const __m128i*)&in[12]);
    // a00 a10 a20 a30   x x x x
    // a01 a11 a21 a31   x x x x
    // a02 a12 a22 a32   x x x x
    // a03 a13 a23 a33   x x x x
    if (do_two) {
      const __m128i inB0 = _mm_loadl_epi64((const __m128i*)&in[16]);
      const __m128i inB1 = _mm_loadl_epi64((const __m128i*)&in[20]);
      const __m128i inB2 = _mm_loadl_epi64((const __m128i*)&in[24]);
      const __m128i inB3 = _mm_loadl_epi64((const __m128i*)&in[28]);
      in0 = _mm_unpacklo_epi64(in0, inB0);
      in1 = _mm_unpacklo_epi64(in1, inB1);
      in2 = _mm_unpacklo_epi64(in2, inB2);
      in3 = _mm_unpacklo_epi64(in3, inB3);
      // a00 a10 a20 a30   b00 b10 b20 b30
      // a01 a11 a21 a31   b01 b11 b21 b31
      // a02 a12 a22 a32   b02 b12 b22 b32
      // a03 a13 a23 a33   b03 b13 b23 b33
    }
  }

  // Vertical pass and subsequent transpose.
  {
    // First pass, c and d calculations are longer because of the "trick"
    // multiplications.
    const __m128i a = _mm_add_epi16(in0, in2);
    const __m128i b = _mm_sub_epi16(in0, in2);
    // c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3
    const __m128i c1 = _mm_mulhi_epi16(in1, k2);
    const __m128i c2 = _mm_mulhi_epi16(in3, k1);
    const __m128i c3 = _mm_sub_epi16(in1, in3);
    const __m128i c4 = _mm_sub_epi16(c1, c2);
    const __m128i c = _mm_add_epi16(c3, c4);
    // d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3
    const __m128i d1 = _mm_mulhi_epi16(in1, k1);
    const __m128i d2 = _mm_mulhi_epi16(in3, k2);
    const __m128i d3 = _mm_add_epi16(in1, in3);
    const __m128i d4 = _mm_add_epi16(d1, d2);
    const __m128i d = _mm_add_epi16(d3, d4);

    // Second pass.
    const __m128i tmp0 = _mm_add_epi16(a, d);
    const __m128i tmp1 = _mm_add_epi16(b, c);
    const __m128i tmp2 = _mm_sub_epi16(b, c);
    const __m128i tmp3 = _mm_sub_epi16(a, d);

    // Transpose the two 4x4.
    // a00 a01 a02 a03   b00 b01 b02 b03
    // a10 a11 a12 a13   b10 b11 b12 b13
    // a20 a21 a22 a23   b20 b21 b22 b23
    // a30 a31 a32 a33   b30 b31 b32 b33
    const __m128i transpose0_0 = _mm_unpacklo_epi16(tmp0, tmp1);
    const __m128i transpose0_1 = _mm_unpacklo_epi16(tmp2, tmp3);
    const __m128i transpose0_2 = _mm_unpackhi_epi16(tmp0, tmp1);
    const __m128i transpose0_3 = _mm_unpackhi_epi16(tmp2, tmp3);
    // a00 a10 a01 a11   a02 a12 a03 a13
    // a20 a30 a21 a31   a22 a32 a23 a33
    // b00 b10 b01 b11   b02 b12 b03 b13
    // b20 b30 b21 b31   b22 b32 b23 b33
    const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
    const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3);
    const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
    const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3);
    // a00 a10 a20 a30 a01 a11 a21 a31
    // b00 b10 b20 b30 b01 b11 b21 b31
    // a02 a12 a22 a32 a03 a13 a23 a33
    // b02 b12 b22 b32 b03 b13 b23 b33
    T0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1);
    T1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1);
    T2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3);
    T3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3);
    // a00 a10 a20 a30   b00 b10 b20 b30
    // a01 a11 a21 a31   b01 b11 b21 b31
    // a02 a12 a22 a32   b02 b12 b22 b32
    // a03 a13 a23 a33   b03 b13 b23 b33
  }

  // Horizontal pass and subsequent transpose.
  {
    // First pass, c and d calculations are longer because of the "trick"
    // multiplications.
    const __m128i four = _mm_set1_epi16(4);
    const __m128i dc = _mm_add_epi16(T0, four);
    const __m128i a = _mm_add_epi16(dc, T2);
    const __m128i b = _mm_sub_epi16(dc, T2);
    // c = MUL(T1, K2) - MUL(T3, K1) = MUL(T1, k2) - MUL(T3, k1) + T1 - T3
    const __m128i c1 = _mm_mulhi_epi16(T1, k2);
    const __m128i c2 = _mm_mulhi_epi16(T3, k1);
    const __m128i c3 = _mm_sub_epi16(T1, T3);
    const __m128i c4 = _mm_sub_epi16(c1, c2);
    const __m128i c = _mm_add_epi16(c3, c4);
    // d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3
    const __m128i d1 = _mm_mulhi_epi16(T1, k1);
    const __m128i d2 = _mm_mulhi_epi16(T3, k2);
    const __m128i d3 = _mm_add_epi16(T1, T3);
    const __m128i d4 = _mm_add_epi16(d1, d2);
    const __m128i d = _mm_add_epi16(d3, d4);

    // Second pass.
    const __m128i tmp0 = _mm_add_epi16(a, d);
    const __m128i tmp1 = _mm_add_epi16(b, c);
    const __m128i tmp2 = _mm_sub_epi16(b, c);
    const __m128i tmp3 = _mm_sub_epi16(a, d);
    const __m128i shifted0 = _mm_srai_epi16(tmp0, 3);
    const __m128i shifted1 = _mm_srai_epi16(tmp1, 3);
    const __m128i shifted2 = _mm_srai_epi16(tmp2, 3);
    const __m128i shifted3 = _mm_srai_epi16(tmp3, 3);

    // Transpose the two 4x4.
    // a00 a01 a02 a03   b00 b01 b02 b03
    // a10 a11 a12 a13   b10 b11 b12 b13
    // a20 a21 a22 a23   b20 b21 b22 b23
    // a30 a31 a32 a33   b30 b31 b32 b33
    const __m128i transpose0_0 = _mm_unpacklo_epi16(shifted0, shifted1);
    const __m128i transpose0_1 = _mm_unpacklo_epi16(shifted2, shifted3);
    const __m128i transpose0_2 = _mm_unpackhi_epi16(shifted0, shifted1);
    const __m128i transpose0_3 = _mm_unpackhi_epi16(shifted2, shifted3);
    // a00 a10 a01 a11   a02 a12 a03 a13
    // a20 a30 a21 a31   a22 a32 a23 a33
    // b00 b10 b01 b11   b02 b12 b03 b13
    // b20 b30 b21 b31   b22 b32 b23 b33
    const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
    const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3);
    const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
    const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3);
    // a00 a10 a20 a30 a01 a11 a21 a31
    // b00 b10 b20 b30 b01 b11 b21 b31
    // a02 a12 a22 a32 a03 a13 a23 a33
    // b02 b12 b22 b32 b03 b13 b23 b33
    T0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1);
    T1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1);
    T2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3);
    T3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3);
    // a00 a10 a20 a30   b00 b10 b20 b30
    // a01 a11 a21 a31   b01 b11 b21 b31
    // a02 a12 a22 a32   b02 b12 b22 b32
    // a03 a13 a23 a33   b03 b13 b23 b33
  }

  // Add inverse transform to 'dst' and store.
  {
    const __m128i zero = _mm_set1_epi16(0);
    // Load the reference(s).
    __m128i dst0, dst1, dst2, dst3;
    if (do_two) {
      // Load eight bytes/pixels per line.
      dst0 = _mm_loadl_epi64((__m128i*)&dst[0 * BPS]);
      dst1 = _mm_loadl_epi64((__m128i*)&dst[1 * BPS]);
      dst2 = _mm_loadl_epi64((__m128i*)&dst[2 * BPS]);
      dst3 = _mm_loadl_epi64((__m128i*)&dst[3 * BPS]);
    } else {
      // Load four bytes/pixels per line. memcpy() is used instead of a pointer
      // cast: 'dst' is not guaranteed to be 4-byte aligned, and reading
      // uint8_t data through an int lvalue violates strict aliasing.
      int32_t row0, row1, row2, row3;
      memcpy(&row0, &dst[0 * BPS], sizeof(row0));
      memcpy(&row1, &dst[1 * BPS], sizeof(row1));
      memcpy(&row2, &dst[2 * BPS], sizeof(row2));
      memcpy(&row3, &dst[3 * BPS], sizeof(row3));
      dst0 = _mm_cvtsi32_si128(row0);
      dst1 = _mm_cvtsi32_si128(row1);
      dst2 = _mm_cvtsi32_si128(row2);
      dst3 = _mm_cvtsi32_si128(row3);
    }
    // Convert to 16b.
    dst0 = _mm_unpacklo_epi8(dst0, zero);
    dst1 = _mm_unpacklo_epi8(dst1, zero);
    dst2 = _mm_unpacklo_epi8(dst2, zero);
    dst3 = _mm_unpacklo_epi8(dst3, zero);
    // Add the inverse transform(s).
    dst0 = _mm_add_epi16(dst0, T0);
    dst1 = _mm_add_epi16(dst1, T1);
    dst2 = _mm_add_epi16(dst2, T2);
    dst3 = _mm_add_epi16(dst3, T3);
    // Unsigned saturate to 8b.
    dst0 = _mm_packus_epi16(dst0, dst0);
    dst1 = _mm_packus_epi16(dst1, dst1);
    dst2 = _mm_packus_epi16(dst2, dst2);
    dst3 = _mm_packus_epi16(dst3, dst3);
    // Store the results.
    if (do_two) {
      // Store eight bytes/pixels per line.
      _mm_storel_epi64((__m128i*)&dst[0 * BPS], dst0);
      _mm_storel_epi64((__m128i*)&dst[1 * BPS], dst1);
      _mm_storel_epi64((__m128i*)&dst[2 * BPS], dst2);
      _mm_storel_epi64((__m128i*)&dst[3 * BPS], dst3);
    } else {
      // Store four bytes/pixels per line (memcpy() for the same alignment and
      // aliasing reasons as the loads above).
      const int32_t out0 = _mm_cvtsi128_si32(dst0);
      const int32_t out1 = _mm_cvtsi128_si32(dst1);
      const int32_t out2 = _mm_cvtsi128_si32(dst2);
      const int32_t out3 = _mm_cvtsi128_si32(dst3);
      memcpy(&dst[0 * BPS], &out0, sizeof(out0));
      memcpy(&dst[1 * BPS], &out1, sizeof(out1));
      memcpy(&dst[2 * BPS], &out2, sizeof(out2));
      memcpy(&dst[3 * BPS], &out3, sizeof(out3));
    }
  }
}
//------------------------------------------------------------------------------
// Loop Filter (Paragraph 15)
// Per-byte abs(p - q) for unsigned bytes: of the two saturating differences
// (p - q) and (q - p), one is zero and the other is the magnitude, so OR-ing
// them gives the absolute difference.
#define MM_ABS(p, q) \
  _mm_or_si128(_mm_subs_epu8((p), (q)), _mm_subs_epu8((q), (p)))
// Arithmetic (sign-preserving) right shift by N bits of every byte of "a".
//
// The shifts used here operate on 16-bit lanes, so the even and the odd
// bytes of each lane are shifted separately and then merged back together.
#define SIGNED_SHIFT_N(a, N) {                                                 \
  /* even bytes: move up to the top of the lane, shift, move back down */      \
  const __m128i even_bytes_ =                                                  \
      _mm_srli_epi16(_mm_srai_epi16(_mm_slli_epi16(a, 8), N), 8);              \
  /* odd bytes: shift all the way down, then back up into place */             \
  const __m128i odd_bytes_ = _mm_slli_epi16(_mm_srai_epi16(a, N + 8), 8);      \
  a = _mm_or_si128(even_bytes_, odd_bytes_);                                   \
}
// XOR the given registers in place with 'sign_bit', a __m128i that must be in
// scope where the macro is used (the callers in this file set it to 0x80 in
// every byte, which converts between unsigned and signed byte values).
#define FLIP_SIGN_BIT2(a, b) {                                                 \
  a = _mm_xor_si128(a, sign_bit);                                              \
  b = _mm_xor_si128(b, sign_bit);                                              \
}
// Same as FLIP_SIGN_BIT2, for four registers.
#define FLIP_SIGN_BIT4(a, b, c, d) {                                           \
  a = _mm_xor_si128(a, sign_bit);                                              \
  b = _mm_xor_si128(b, sign_bit);                                              \
  c = _mm_xor_si128(c, sign_bit);                                              \
  d = _mm_xor_si128(d, sign_bit);                                              \
}
// Sets 'not_hev' to 0xff in every byte position where both abs(p1 - p0) and
// abs(q1 - q0) are <= hev_thresh, and to 0x00 everywhere else.
#define GET_NOTHEV(p1, p0, q0, q1, hev_thresh, not_hev) {                      \
  const __m128i hev_limit_ = _mm_set1_epi8(hev_thresh);                        \
  /* the saturating subtraction is zero exactly where diff <= hev_thresh */    \
  const __m128i over_p_ = _mm_subs_epu8(MM_ABS(p1, p0), hev_limit_);           \
  const __m128i over_q_ = _mm_subs_epu8(MM_ABS(q1, q0), hev_limit_);           \
  not_hev = _mm_cmpeq_epi8(_mm_or_si128(over_p_, over_q_),                     \
                           _mm_setzero_si128());                               \
}
// Computes o = (p1 - q1) + 3 * (q0 - p0) on signed bytes, saturating after
// every individual subtraction and addition.
#define GET_BASE_DELTA(p1, p0, q0, q1, o) {                                    \
  const __m128i q0_p0_ = _mm_subs_epi8(q0, p0);         /* q0 - p0 */          \
  const __m128i p1_q1_ = _mm_subs_epi8(p1, q1);         /* p1 - q1 */          \
  const __m128i sum1_ = _mm_adds_epi8(p1_q1_, q0_p0_);  /* + 1 * (q0 - p0) */  \
  const __m128i sum2_ = _mm_adds_epi8(sum1_, q0_p0_);   /* + 2 * (q0 - p0) */  \
  o = _mm_adds_epi8(sum2_, q0_p0_);                     /* + 3 * (q0 - p0) */  \
}
// Applies the filter value 'fl' to the two pixels next to the edge, on signed
// bytes with saturation:
//   q0 -= (fl + 4) >> 3   and   p0 += (fl + 3) >> 3
#define DO_SIMPLE_FILTER(p0, q0, fl) {                                         \
  __m128i q_step_ = _mm_adds_epi8(fl, _mm_set1_epi8(4));                       \
  __m128i p_step_ = _mm_adds_epi8(fl, _mm_set1_epi8(3));                       \
  SIGNED_SHIFT_N(q_step_, 3);            /* (fl + 4) >> 3 */                   \
  SIGNED_SHIFT_N(p_step_, 3);            /* (fl + 3) >> 3 */                   \
  q0 = _mm_subs_epi8(q0, q_step_);                                             \
  p0 = _mm_adds_epi8(p0, p_step_);                                             \
}
// Updates one pair of pixels (pi, qi) during complex filtering at MB edges.
// a_lo and a_hi hold 16-bit filter values for the lower and upper eight
// pixels; they are shifted right by 7, packed to signed bytes (saturating)
// and the result is added to pi and subtracted from qi.
#define UPDATE_2PIXELS(pi, qi, a_lo, a_hi) {                                   \
  const __m128i delta_ = _mm_packs_epi16(_mm_srai_epi16(a_lo, 7),              \
                                         _mm_srai_epi16(a_hi, 7));             \
  pi = _mm_adds_epi8(pi, delta_);                                              \
  qi = _mm_subs_epi8(qi, delta_);                                              \
}
// Sets '*mask' to 0xff in every byte position where
//   2 * abs(p0 - q0) + abs(p1 - q1) / 2 <= thresh
// (unsigned, saturating arithmetic) and to 0x00 elsewhere.
static void NeedsFilter(const __m128i* p1, const __m128i* p0, const __m128i* q0,
                        const __m128i* q1, int thresh, __m128i *mask) {
  const __m128i lsb_off = _mm_set1_epi8((char)0xFE);
  const __m128i limit = _mm_set1_epi8(thresh);
  const __m128i diff_outer = MM_ABS(*p1, *q1);   // abs(p1 - q1)
  const __m128i diff_inner = MM_ABS(*p0, *q0);   // abs(p0 - q0)
  // Halve each byte with a 16-bit shift: the lsb is cleared first so that
  // nothing leaks into the neighbouring byte.
  const __m128i half_outer =
      _mm_srli_epi16(_mm_and_si128(diff_outer, lsb_off), 1);
  const __m128i twice_inner = _mm_adds_epu8(diff_inner, diff_inner);
  const __m128i total = _mm_adds_epu8(twice_inner, half_outer);
  // total - thresh saturates to zero exactly where total <= thresh
  *mask = _mm_cmpeq_epi8(_mm_subs_epu8(total, limit), _mm_setzero_si128());
}
//------------------------------------------------------------------------------
// Edge filtering functions
// Simple filter: modifies the two pixels next to the edge (p0 and q0) in
// place, in the positions where NeedsFilter() allows it for 'thresh'.
// p1 and q1 are only read.
static inline void DoFilter2(const __m128i* p1, __m128i* p0, __m128i* q0,
                             const __m128i* q1, int thresh) {
  __m128i filter_mask, delta;
  const __m128i sign_bit = _mm_set1_epi8((char)0x80);
  // signed copies of the read-only outer pixels
  const __m128i p1_signed = _mm_xor_si128(*p1, sign_bit);
  const __m128i q1_signed = _mm_xor_si128(*q1, sign_bit);

  // the mask is computed on the original (unsigned) values
  NeedsFilter(p1, p0, q0, q1, thresh, &filter_mask);

  // switch p0/q0 to signed values
  FLIP_SIGN_BIT2(*p0, *q0);

  GET_BASE_DELTA(p1_signed, *p0, *q0, q1_signed, delta);
  delta = _mm_and_si128(delta, filter_mask);   // zero where no filtering
  DO_SIMPLE_FILTER(*p0, *q0, delta);

  // back to unsigned values
  FLIP_SIGN_BIT2(*p0, *q0);
}
// Inner-edge filter: modifies up to 4 pixels (p1, p0, q0 and q1) in place.
// '*mask' selects the byte positions to filter. Where the edge has high
// variance (hev) only p0/q0 change; elsewhere p1/q1 are adjusted as well.
static inline void DoFilter4(__m128i* p1, __m128i *p0, __m128i* q0, __m128i* q1,
                             const __m128i* mask, int hev_thresh) {
  __m128i not_hev;
  __m128i base, inner_diff, step_q0, step_p0, step_outer;
  const __m128i sign_bit = _mm_set1_epi8((char)0x80);

  // hev mask, computed on the unsigned values
  GET_NOTHEV(*p1, *p0, *q0, *q1, hev_thresh, not_hev);

  // switch to signed values
  FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1);

  // base = hev(p1 - q1) + 3 * (q0 - p0), restricted to the filtered positions
  base = _mm_andnot_si128(not_hev, _mm_subs_epi8(*p1, *q1));
  inner_diff = _mm_subs_epi8(*q0, *p0);
  base = _mm_adds_epi8(base, inner_diff);
  base = _mm_adds_epi8(base, inner_diff);
  base = _mm_adds_epi8(base, inner_diff);
  base = _mm_and_si128(base, *mask);

  // q0 -= (base + 4) >> 3
  step_q0 = _mm_adds_epi8(base, _mm_set1_epi8(4));
  SIGNED_SHIFT_N(step_q0, 3);
  *q0 = _mm_subs_epi8(*q0, step_q0);

  // p0 += (base + 3) >> 3
  step_p0 = _mm_adds_epi8(base, _mm_set1_epi8(3));
  SIGNED_SHIFT_N(step_p0, 3);
  *p0 = _mm_adds_epi8(*p0, step_p0);

  // outer pixels: (((base + 4) >> 3) + 1) >> 1, only where !hev
  step_outer = _mm_adds_epi8(step_q0, _mm_set1_epi8(1));
  SIGNED_SHIFT_N(step_outer, 1);
  step_outer = _mm_and_si128(not_hev, step_outer);
  *q1 = _mm_subs_epi8(*q1, step_outer);
  *p1 = _mm_adds_epi8(*p1, step_outer);

  // back to unsigned values
  FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1);
}
// Applies filter on 6 pixels (p2, p1, p0, q0, q1 and q2)
//
// All six registers are modified in place. '*mask' selects the byte positions
// to filter at all. Within those, positions with high edge variance (hev) get
// the simple filter on p0/q0 only, the others get the strong filter on all
// three pixel pairs.
static inline void DoFilter6(__m128i *p2, __m128i* p1, __m128i *p0,
                             __m128i* q0, __m128i* q1, __m128i *q2,
                             const __m128i* mask, int hev_thresh) {
  __m128i a, not_hev;
  const __m128i sign_bit = _mm_set1_epi8(0x80);
  // compute hev mask
  GET_NOTHEV(*p1, *p0, *q0, *q1, hev_thresh, not_hev);
  // convert to signed values
  FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1);
  FLIP_SIGN_BIT2(*p2, *q2);
  // a = (p1 - q1) + 3 * (q0 - p0), shared by both filters below
  GET_BASE_DELTA(*p1, *p0, *q0, *q1, a);
  { // do simple filter on pixels with hev
    const __m128i m = _mm_andnot_si128(not_hev, *mask);
    const __m128i f = _mm_and_si128(a, m);
    DO_SIMPLE_FILTER(*p0, *q0, f);
  }
  { // do strong filter on pixels with not hev
    const __m128i zero = _mm_setzero_si128();
    // 0x0900 is 9 << 8. Together with the filter value placed in the upper
    // byte of each 16-bit lane (f << 8), the high half of the product is
    // ((f << 8) * (9 << 8)) >> 16 = f * 9, as a signed 16-bit value.
    const __m128i nine = _mm_set1_epi16(0x0900);
    const __m128i sixty_three = _mm_set1_epi16(63);
    const __m128i m = _mm_and_si128(not_hev, *mask);
    const __m128i f = _mm_and_si128(a, m);
    // interleaving with zero first puts each byte of f in the upper byte
    const __m128i f_lo = _mm_unpacklo_epi8(zero, f);
    const __m128i f_hi = _mm_unpackhi_epi8(zero, f);
    const __m128i f9_lo = _mm_mulhi_epi16(f_lo, nine); // Filter (lo) * 9
    const __m128i f9_hi = _mm_mulhi_epi16(f_hi, nine); // Filter (hi) * 9
    const __m128i f18_lo = _mm_add_epi16(f9_lo, f9_lo); // Filter (lo) * 18
    const __m128i f18_hi = _mm_add_epi16(f9_hi, f9_hi); // Filter (hi) * 18
    const __m128i a2_lo = _mm_add_epi16(f9_lo, sixty_three); // Filter * 9 + 63
    const __m128i a2_hi = _mm_add_epi16(f9_hi, sixty_three); // Filter * 9 + 63
    const __m128i a1_lo = _mm_add_epi16(f18_lo, sixty_three); // F... * 18 + 63
    const __m128i a1_hi = _mm_add_epi16(f18_hi, sixty_three); // F... * 18 + 63
    const __m128i a0_lo = _mm_add_epi16(f18_lo, a2_lo); // Filter * 27 + 63
    const __m128i a0_hi = _mm_add_epi16(f18_hi, a2_hi); // Filter * 27 + 63
    // each pair gets (aN >> 7): added on the p side, subtracted on the q side
    UPDATE_2PIXELS(*p2, *q2, a2_lo, a2_hi);
    UPDATE_2PIXELS(*p1, *q1, a1_lo, a1_hi);
    UPDATE_2PIXELS(*p0, *q0, a0_lo, a0_hi);
  }
  // unoffset
  FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1);
  FLIP_SIGN_BIT2(*p2, *q2);
}
// Reads 8 rows of 4 pixels across a vertical edge and transposes them.
//   b:      address of the first of the 4 pixels in row 0.
//   stride: distance in bytes between two consecutive rows.
//   p:      receives column 0 (rows 0..7) in its lower 8 bytes and column 1
//           in its upper 8 bytes.
//   q:      receives column 2 in its lower 8 bytes and column 3 in its upper
//           8 bytes.
// Each row is fetched with memcpy() rather than through an int pointer cast:
// 'b' has no alignment guarantee, and reading uint8_t data through an int
// lvalue violates strict aliasing.
//
// TODO(somnath): Investigate _mm_shuffle* also see if it can be broken into
// two Load4x4() to avoid code duplication.
static inline void Load8x4(const uint8_t* b, int stride,
                           __m128i* p, __m128i* q) {
  __m128i t1, t2;
  __m128i r0, r1, r4, r5;
  int32_t w0, w1, w4, w5;

  // Load 0th, 1st, 4th and 5th rows
  memcpy(&w0, &b[0 * stride], sizeof(w0));
  memcpy(&w1, &b[1 * stride], sizeof(w1));
  memcpy(&w4, &b[4 * stride], sizeof(w4));
  memcpy(&w5, &b[5 * stride], sizeof(w5));
  r0 = _mm_cvtsi32_si128(w0);  // 03 02 01 00
  r1 = _mm_cvtsi32_si128(w1);  // 13 12 11 10
  r4 = _mm_cvtsi32_si128(w4);  // 43 42 41 40
  r5 = _mm_cvtsi32_si128(w5);  // 53 52 51 50

  r0 = _mm_unpacklo_epi32(r0, r4);  // 43 42 41 40 03 02 01 00
  r1 = _mm_unpacklo_epi32(r1, r5);  // 53 52 51 50 13 12 11 10

  // t1 = 53 43 52 42 51 41 50 40 13 03 12 02 11 01 10 00
  t1 = _mm_unpacklo_epi8(r0, r1);

  // Load 2nd, 3rd, 6th and 7th rows
  memcpy(&w0, &b[2 * stride], sizeof(w0));
  memcpy(&w1, &b[3 * stride], sizeof(w1));
  memcpy(&w4, &b[6 * stride], sizeof(w4));
  memcpy(&w5, &b[7 * stride], sizeof(w5));
  r0 = _mm_cvtsi32_si128(w0);  // 23 22 21 20
  r1 = _mm_cvtsi32_si128(w1);  // 33 32 31 30
  r4 = _mm_cvtsi32_si128(w4);  // 63 62 61 60
  r5 = _mm_cvtsi32_si128(w5);  // 73 72 71 70

  r0 = _mm_unpacklo_epi32(r0, r4);  // 63 62 61 60 23 22 21 20
  r1 = _mm_unpacklo_epi32(r1, r5);  // 73 72 71 70 33 32 31 30

  // t2 = 73 63 72 62 71 61 70 60 33 23 32 22 31 21 30 20
  t2 = _mm_unpacklo_epi8(r0, r1);

  // t1 = 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
  // t2 = 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40
  r0 = t1;
  t1 = _mm_unpacklo_epi16(t1, t2);
  t2 = _mm_unpackhi_epi16(r0, t2);

  // *p = 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
  // *q = 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
  *p = _mm_unpacklo_epi32(t1, t2);
  *q = _mm_unpackhi_epi32(t1, t2);
}
// Reads 16 rows of 4 pixels across a vertical edge and transposes them so
// that each output register holds one full column of 16 pixels.
// Rows 0..7 are read starting at r0 and rows 8..15 starting at r8, both with
// the same 'stride'. On return *p1, *p0, *q0 and *q1 hold columns 0, 1, 2 and
// 3 respectively, with row 0 in the lowest byte.
static inline void Load16x4(const uint8_t* r0, const uint8_t* r8, int stride,
                            __m128i* p1, __m128i* p0,
                            __m128i* q0, __m128i* q1) {
  __m128i t1, t2;
  // Assume the pixels around the edge (|) are numbered as follows
  // 00 01 | 02 03
  // 10 11 | 12 13
  // ... | ...
  // e0 e1 | e2 e3
  // f0 f1 | f2 f3
  //
  // r0 is pointing to the 0th row (00)
  // r8 is pointing to the 8th row (80)
  // Load
  // p1 = 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
  // q0 = 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
  // p0 = f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
  // q1 = f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
  Load8x4(r0, stride, p1, q0);
  Load8x4(r8, stride, p0, q1);
  // Keep copies of the upper halves: *p1 and *q0 are overwritten below before
  // their second use.
  t1 = *p1;
  t2 = *q0;
  // p1 = f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
  // p0 = f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
  // q0 = f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
  // q1 = f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
  *p1 = _mm_unpacklo_epi64(t1, *p0);
  *p0 = _mm_unpackhi_epi64(t1, *p0);
  *q0 = _mm_unpacklo_epi64(t2, *q1);
  *q1 = _mm_unpackhi_epi64(t2, *q1);
}
// Writes the 16 bytes of '*x' as 4 rows of 4 bytes: bytes 0..3 go to dst,
// bytes 4..7 to dst + stride, and so on.
// '*x' is consumed: it is shifted right by 4 bytes after each row, so it
// holds zero on return.
// The rows are written with memcpy() rather than through an int32_t pointer
// cast, since 'dst' has no alignment guarantee and writing uint8_t data
// through an int32_t lvalue violates strict aliasing.
static inline void Store4x4(__m128i* x, uint8_t* dst, int stride) {
  int i;
  for (i = 0; i < 4; ++i, dst += stride) {
    const int32_t row = _mm_cvtsi128_si32(*x);
    memcpy(dst, &row, sizeof(row));
    *x = _mm_srli_si128(*x, 4);
  }
}
// Transpose back and store
//
// Inverse of Load16x4(): *p1, *p0, *q0 and *q1 each hold one column of 16
// pixels. They are transposed into 16 rows of 4 pixels; rows 0..7 are written
// starting at r0 and rows 8..15 starting at r8, 'stride' bytes apart.
// The four registers are used as scratch and do not keep their input values.
static inline void Store16x4(uint8_t* r0, uint8_t* r8, int stride, __m128i* p1,
                             __m128i* p0, __m128i* q0, __m128i* q1) {
  __m128i t1;
  // p0 = 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
  // p1 = f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
  t1 = *p0;
  *p0 = _mm_unpacklo_epi8(*p1, t1);
  *p1 = _mm_unpackhi_epi8(*p1, t1);
  // q0 = 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
  // q1 = f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
  t1 = *q0;
  *q0 = _mm_unpacklo_epi8(t1, *q1);
  *q1 = _mm_unpackhi_epi8(t1, *q1);
  // p0 = 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
  // q0 = 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
  t1 = *p0;
  *p0 = _mm_unpacklo_epi16(t1, *q0);
  *q0 = _mm_unpackhi_epi16(t1, *q0);
  // p1 = b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
  // q1 = f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
  t1 = *p1;
  *p1 = _mm_unpacklo_epi16(t1, *q1);
  *q1 = _mm_unpackhi_epi16(t1, *q1);
  // rows 0..3 and 4..7
  Store4x4(p0, r0, stride);
  r0 += 4 * stride;
  Store4x4(q0, r0, stride);
  // rows 8..11 and 12..15
  Store4x4(p1, r8, stride);
  r8 += 4 * stride;
  Store4x4(q1, r8, stride);
}
//------------------------------------------------------------------------------
// Simple In-loop filtering (Paragraph 15.2)
// Simple filter across a horizontal edge, 16 pixels wide. 'p' points at the
// row just below the edge (q0); the two rows on each side are read and the
// p0 and q0 rows are written back.
static void SimpleVFilter16SSE2(uint8_t* p, int stride, int thresh) {
  uint8_t* const row_p0 = p - stride;
  __m128i p1 = _mm_loadu_si128((__m128i*)(p - 2 * stride));
  __m128i p0 = _mm_loadu_si128((__m128i*)row_p0);
  __m128i q0 = _mm_loadu_si128((__m128i*)p);
  __m128i q1 = _mm_loadu_si128((__m128i*)(p + stride));

  DoFilter2(&p1, &p0, &q0, &q1, thresh);

  // only the two rows next to the edge are modified
  _mm_storeu_si128((__m128i*)row_p0, p0);
  _mm_storeu_si128((__m128i*)p, q0);
}
// Simple filter across a vertical edge, 16 rows tall. 'p' points at the pixel
// just right of the edge (q0) in the first row. The 4 pixels around the edge
// are loaded transposed, filtered, and written back for every row.
static void SimpleHFilter16SSE2(uint8_t* p, int stride, int thresh) {
  uint8_t* const top = p - 2;                  // p1 of row 0
  uint8_t* const bottom = top + 8 * stride;    // p1 of row 8
  __m128i p1, p0, q0, q1;

  Load16x4(top, bottom, stride, &p1, &p0, &q0, &q1);
  DoFilter2(&p1, &p0, &q0, &q1, thresh);
  Store16x4(top, bottom, stride, &p1, &p0, &q0, &q1);
}
// Runs the simple vertical filter on the three inner horizontal edges,
// located 4, 8 and 12 rows below 'p'.
static void SimpleVFilter16iSSE2(uint8_t* p, int stride, int thresh) {
  const int edge_step = 4 * stride;
  int edge;
  for (edge = 0; edge < 3; ++edge) {
    p += edge_step;
    SimpleVFilter16SSE2(p, stride, thresh);
  }
}
// Runs the simple horizontal filter on the three inner vertical edges,
// located 4, 8 and 12 pixels to the right of 'p'.
static void SimpleHFilter16iSSE2(uint8_t* p, int stride, int thresh) {
  int edge;
  for (edge = 0; edge < 3; ++edge) {
    p += 4;
    SimpleHFilter16SSE2(p, stride, thresh);
  }
}
//------------------------------------------------------------------------------
// Complex In-loop filtering (Paragraph 15.3)
// Sets m to the per-byte maximum of abs(p3 - p2), abs(p2 - p1) and
// abs(p1 - p0). Any previous value of m is discarded.
#define MAX_DIFF1(p3, p2, p1, p0, m) {                                         \
  m = _mm_max_epu8(_mm_max_epu8(MM_ABS(p3, p2), MM_ABS(p2, p1)),               \
                   MM_ABS(p1, p0));                                            \
}
// Like MAX_DIFF1, but accumulates: m becomes the per-byte maximum of its
// previous value and abs(p3 - p2), abs(p2 - p1) and abs(p1 - p0).
#define MAX_DIFF2(p3, p2, p1, p0, m) {                                         \
  m = _mm_max_epu8(                                                            \
      _mm_max_epu8(_mm_max_epu8(m, MM_ABS(p3, p2)), MM_ABS(p2, p1)),           \
      MM_ABS(p1, p0));                                                         \
}
// Loads four consecutive rows of 16 pixels (unaligned) into e1..e4:
// e1 from p, e2 from p + stride, e3 from p + 2 * stride and e4 from
// p + 3 * stride. 'stride' is parenthesized so that an expression can be
// passed safely.
#define LOAD_H_EDGES4(p, stride, e1, e2, e3, e4) {                             \
  e1 = _mm_loadu_si128((__m128i*)&(p)[0 * (stride)]);                          \
  e2 = _mm_loadu_si128((__m128i*)&(p)[1 * (stride)]);                          \
  e3 = _mm_loadu_si128((__m128i*)&(p)[2 * (stride)]);                          \
  e4 = _mm_loadu_si128((__m128i*)&(p)[3 * (stride)]);                          \
}
// Loads one row of chroma into p: 8 bytes from u + stride into the lower
// half and 8 bytes from v + stride into the upper half.
#define LOADUV_H_EDGE(p, u, v, stride) {                                       \
  p = _mm_loadl_epi64((__m128i*)&(u)[(stride)]);                               \
  p = _mm_unpacklo_epi64(p, _mm_loadl_epi64((__m128i*)&(v)[(stride)]));        \
}
// Loads four consecutive chroma rows (u in the lower half, v in the upper
// half) into e1..e4, starting at u / v and 'stride' bytes apart. 'stride' is
// parenthesized so that an expression can be passed safely.
#define LOADUV_H_EDGES4(u, v, stride, e1, e2, e3, e4) {                        \
  LOADUV_H_EDGE(e1, u, v, 0 * (stride));                                       \
  LOADUV_H_EDGE(e2, u, v, 1 * (stride));                                       \
  LOADUV_H_EDGE(e3, u, v, 2 * (stride));                                       \
  LOADUV_H_EDGE(e4, u, v, 3 * (stride));                                       \
}
// Stores one row of chroma: the lower 8 bytes of p go to u + stride and the
// upper 8 bytes to v + stride.
// Note that p is modified: it is shifted right by 8 bytes in the process.
// 'u' and 'v' are parenthesized so that pointer expressions can be passed
// safely.
#define STOREUV(p, u, v, stride) {                                             \
  _mm_storel_epi64((__m128i*)&(u)[(stride)], p);                               \
  p = _mm_srli_si128(p, 8);                                                    \
  _mm_storel_epi64((__m128i*)&(v)[(stride)], p);                               \
}
// Finalizes the filter mask for complex filtering. On entry 'mask' holds the
// maximum neighbour difference (see MAX_DIFF1/MAX_DIFF2); on exit it is 0xff
// in every byte position where that maximum is <= ithresh and NeedsFilter()
// passes for 'thresh', and 0x00 elsewhere.
#define COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask) {               \
  __m128i edge_ok_;                                                            \
  const __m128i interior_limit_ = _mm_set1_epi8(ithresh);                      \
  NeedsFilter(&p1, &p0, &q0, &q1, thresh, &edge_ok_);                          \
  mask = _mm_cmpeq_epi8(_mm_subs_epu8(mask, interior_limit_),                  \
                        _mm_setzero_si128());                                  \
  mask = _mm_and_si128(mask, edge_ok_);                                        \
}
// on macroblock edges
// Complex filter across a horizontal macroblock edge, 16 pixels wide.
// 'p' points at the row just below the edge (q0). Four rows on each side are
// read; the three rows nearest to the edge on each side are written back.
static void VFilter16SSE2(uint8_t* p, int stride,
                          int thresh, int ithresh, int hev_thresh) {
  __m128i outer;   // p3, then q3: only needed to build the mask
  __m128i mask;
  __m128i p2, p1, p0, q0, q1, q2;

  // rows above the edge: p3, p2, p1, p0
  LOAD_H_EDGES4(p - 4 * stride, stride, outer, p2, p1, p0);
  MAX_DIFF1(outer, p2, p1, p0, mask);

  // rows below the edge: q0, q1, q2, q3
  LOAD_H_EDGES4(p, stride, q0, q1, q2, outer);
  MAX_DIFF2(outer, q2, q1, q0, mask);

  COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask);
  DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);

  // write back the six filtered rows
  _mm_storeu_si128((__m128i*)(p - 3 * stride), p2);
  _mm_storeu_si128((__m128i*)(p - 2 * stride), p1);
  _mm_storeu_si128((__m128i*)(p - 1 * stride), p0);
  _mm_storeu_si128((__m128i*)(p + 0 * stride), q0);
  _mm_storeu_si128((__m128i*)(p + 1 * stride), q1);
  _mm_storeu_si128((__m128i*)(p + 2 * stride), q2);
}
// Complex filter across a vertical macroblock edge, 16 rows tall.
// 'p' points at the pixel just right of the edge (q0) in the first row. The 4
// columns on each side are loaded transposed, filtered, and all 8 columns are
// written back.
static void HFilter16SSE2(uint8_t* p, int stride,
                          int thresh, int ithresh, int hev_thresh) {
  uint8_t* const left = p - 4;                  // p3 of row 0
  uint8_t* const left8 = left + 8 * stride;     // p3 of row 8
  uint8_t* const right8 = p + 8 * stride;       // q0 of row 8
  __m128i mask;
  __m128i p3, p2, p1, p0, q0, q1, q2, q3;

  Load16x4(left, left8, stride, &p3, &p2, &p1, &p0);
  MAX_DIFF1(p3, p2, p1, p0, mask);

  Load16x4(p, right8, stride, &q0, &q1, &q2, &q3);
  MAX_DIFF2(q3, q2, q1, q0, mask);

  COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask);
  DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);

  Store16x4(left, left8, stride, &p3, &p2, &p1, &p0);
  Store16x4(p, right8, stride, &q0, &q1, &q2, &q3);
}
// on three inner edges
// Complex filter on the three inner horizontal edges of a 16-pixel wide
// block. 'p' points at the first row of the block; the edges sit 4, 8 and 12
// rows below it. Two rows on each side of every edge are modified.
static void VFilter16iSSE2(uint8_t* p, int stride,
                           int thresh, int ithresh, int hev_thresh) {
  int edge;
  __m128i mask;
  __m128i far_px, near_px;   // p3/q3 and p2/q2: only used for the mask
  __m128i p1, p0, q0, q1;

  for (edge = 0; edge < 3; ++edge) {
    // the four rows above the edge: p3, p2, p1, p0
    LOAD_H_EDGES4(p, stride, far_px, near_px, p1, p0);
    MAX_DIFF1(far_px, near_px, p1, p0, mask);

    // move to the edge, then the four rows below it: q0, q1, q2, q3
    p += 4 * stride;
    LOAD_H_EDGES4(p, stride, q0, q1, near_px, far_px);
    MAX_DIFF2(far_px, near_px, q1, q0, mask);

    COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask);
    DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh);

    // write back the four filtered rows
    _mm_storeu_si128((__m128i*)(p - 2 * stride), p1);
    _mm_storeu_si128((__m128i*)(p - 1 * stride), p0);
    _mm_storeu_si128((__m128i*)(p + 0 * stride), q0);
    _mm_storeu_si128((__m128i*)(p + 1 * stride), q1);
  }
}
// Complex filter on the three inner vertical edges of a 16-row tall block.
// 'p' points at the first pixel of the block; the edges sit 4, 8 and 12
// pixels to its right. Two columns on each side of every edge are modified.
static void HFilter16iSSE2(uint8_t* p, int stride,
                           int thresh, int ithresh, int hev_thresh) {
  int edge;
  __m128i mask;
  __m128i far_px, near_px;   // p3/q3 and p2/q2: only used for the mask
  __m128i p1, p0, q0, q1;

  for (edge = 0; edge < 3; ++edge, p += 4) {
    uint8_t* const q_cols = p + 4;     // beginning of q0
    uint8_t* const out_cols = p + 2;   // beginning of p1

    // p3, p2, p1, p0
    Load16x4(p, p + 8 * stride, stride, &far_px, &near_px, &p1, &p0);
    MAX_DIFF1(far_px, near_px, p1, p0, mask);

    // q0, q1, q2, q3
    Load16x4(q_cols, q_cols + 8 * stride, stride,
             &q0, &q1, &near_px, &far_px);
    MAX_DIFF2(far_px, near_px, q1, q0, mask);

    COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask);
    DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh);

    Store16x4(out_cols, out_cols + 8 * stride, stride, &p1, &p0, &q0, &q1);
  }
}
// 8-pixels wide variant, for chroma filtering
// Complex filter across a horizontal macroblock edge of the two 8-pixel wide
// chroma planes, processed together (u in the lower half of each register,
// v in the upper half). 'u' and 'v' point at the row just below the edge.
static void VFilter8SSE2(uint8_t* u, uint8_t* v, int stride,
                         int thresh, int ithresh, int hev_thresh) {
  __m128i mask;
  __m128i outer;   // p3, then q3: only needed to build the mask
  __m128i p2, p1, p0, q0, q1, q2;

  // rows above the edge: p3, p2, p1, p0
  LOADUV_H_EDGES4(u - 4 * stride, v - 4 * stride, stride, outer, p2, p1, p0);
  MAX_DIFF1(outer, p2, p1, p0, mask);

  // rows below the edge: q0, q1, q2, q3
  LOADUV_H_EDGES4(u, v, stride, q0, q1, q2, outer);
  MAX_DIFF2(outer, q2, q1, q0, mask);

  COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask);
  DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);

  // write back the six filtered rows of both planes
  STOREUV(p2, u, v, -3 * stride);
  STOREUV(p1, u, v, -2 * stride);
  STOREUV(p0, u, v, -1 * stride);
  STOREUV(q0, u, v, 0 * stride);
  STOREUV(q1, u, v, 1 * stride);
  STOREUV(q2, u, v, 2 * stride);
}
// Complex filter across a vertical macroblock edge of the two 8-row tall
// chroma planes, processed together (u supplies rows 0..7 of each register,
// v rows 8..15). 'u' and 'v' point at the pixel just right of the edge.
static void HFilter8SSE2(uint8_t* u, uint8_t* v, int stride,
                         int thresh, int ithresh, int hev_thresh) {
  uint8_t* const u_left = u - 4;   // p3 column of the u plane
  uint8_t* const v_left = v - 4;   // p3 column of the v plane
  __m128i mask;
  __m128i p3, p2, p1, p0, q0, q1, q2, q3;

  Load16x4(u_left, v_left, stride, &p3, &p2, &p1, &p0);
  MAX_DIFF1(p3, p2, p1, p0, mask);

  Load16x4(u, v, stride, &q0, &q1, &q2, &q3);
  MAX_DIFF2(q3, q2, q1, q0, mask);

  COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask);
  DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);

  Store16x4(u_left, v_left, stride, &p3, &p2, &p1, &p0);
  Store16x4(u, v, stride, &q0, &q1, &q2, &q3);
}
// Complex filter on the inner horizontal edge of the two chroma planes,
// processed together. 'u' and 'v' point at the first row of the block; the
// edge sits 4 rows below. Two rows on each side of the edge are modified.
static void VFilter8iSSE2(uint8_t* u, uint8_t* v, int stride,
                          int thresh, int ithresh, int hev_thresh) {
  const int edge_offset = 4 * stride;
  __m128i mask;
  __m128i far_px, near_px;   // p3/q3 and p2/q2: only used for the mask
  __m128i p1, p0, q0, q1;

  // the four rows above the edge: p3, p2, p1, p0
  LOADUV_H_EDGES4(u, v, stride, far_px, near_px, p1, p0);
  MAX_DIFF1(far_px, near_px, p1, p0, mask);

  // move to the edge, then the four rows below it: q0, q1, q2, q3
  u += edge_offset;
  v += edge_offset;
  LOADUV_H_EDGES4(u, v, stride, q0, q1, near_px, far_px);
  MAX_DIFF2(far_px, near_px, q1, q0, mask);

  COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask);
  DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh);

  // write back the four filtered rows of both planes
  STOREUV(p1, u, v, -2 * stride);
  STOREUV(p0, u, v, -1 * stride);
  STOREUV(q0, u, v, 0 * stride);
  STOREUV(q1, u, v, 1 * stride);
}
// Complex filter on the inner vertical edge of the two chroma planes,
// processed together. 'u' and 'v' point at the first pixel of the block; the
// edge sits 4 pixels to the right. Two columns on each side are modified.
static void HFilter8iSSE2(uint8_t* u, uint8_t* v, int stride,
                          int thresh, int ithresh, int hev_thresh) {
  __m128i mask;
  __m128i far_px, near_px;   // p3/q3 and p2/q2: only used for the mask
  __m128i p1, p0, q0, q1;

  // p3, p2, p1, p0
  Load16x4(u, v, stride, &far_px, &near_px, &p1, &p0);
  MAX_DIFF1(far_px, near_px, p1, p0, mask);

  // q0, q1, q2, q3 start 4 pixels further right
  Load16x4(u + 4, v + 4, stride, &q0, &q1, &near_px, &far_px);
  MAX_DIFF2(far_px, near_px, q1, q0, mask);

  COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask);
  DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh);

  // p1 starts 2 pixels right of the block start
  Store16x4(u + 2, v + 2, stride, &p1, &p0, &q0, &q1);
}
extern void VP8DspInitSSE2(void);

// Points the decoder's function pointers at the SSE2 implementations above.
void VP8DspInitSSE2(void) {
  // inverse transform
  VP8Transform = TransformSSE2;

  // simple in-loop filters
  VP8SimpleVFilter16 = SimpleVFilter16SSE2;
  VP8SimpleHFilter16 = SimpleHFilter16SSE2;
  VP8SimpleVFilter16i = SimpleVFilter16iSSE2;
  VP8SimpleHFilter16i = SimpleHFilter16iSSE2;

  // complex in-loop filters, luma
  VP8VFilter16 = VFilter16SSE2;
  VP8HFilter16 = HFilter16SSE2;
  VP8VFilter16i = VFilter16iSSE2;
  VP8HFilter16i = HFilter16iSSE2;

  // complex in-loop filters, chroma
  VP8VFilter8 = VFilter8SSE2;
  VP8HFilter8 = HFilter8SSE2;
  VP8VFilter8i = VFilter8iSSE2;
  VP8HFilter8i = HFilter8iSSE2;
}
#if defined(__cplusplus) || defined(c_plusplus)
} // extern "C"
#endif
#endif //__SSE2__ || _MSC_VER

175
src/dsp/dsp.h Normal file
View File

@@ -0,0 +1,175 @@
// Copyright 2011 Google Inc.
//
// This code is licensed under the same terms as WebM:
// Software License Agreement: http://www.webmproject.org/license/software/
// Additional IP Rights Grant: http://www.webmproject.org/license/additional/
// -----------------------------------------------------------------------------
//
// Speed-critical functions.
//
// Author: Skal (pascal.massimino@gmail.com)
#ifndef WEBP_DSP_DSP_H_
#define WEBP_DSP_DSP_H_
#include "../webp/types.h"
#if defined(__cplusplus) || defined(c_plusplus)
extern "C" {
#endif
//------------------------------------------------------------------------------
// CPU detection
// Instruction-set extensions that can be queried through VP8GetCPUInfo.
typedef enum {
kSSE2,
kSSE3,
kNEON
} CPUFeature;
// returns true if the CPU supports the feature.
typedef int (*VP8CPUInfo)(CPUFeature feature);
// Run-time CPU query hook. It may be NULL: VP8EncDspInit() tests the pointer
// before calling it.
extern VP8CPUInfo VP8GetCPUInfo;
//------------------------------------------------------------------------------
// Encoding
// Converts a histogram of binned coefficient magnitudes into a score that is
// clipped to the [0..255] range.
int VP8GetAlpha(const int histo[]);
// Transforms
// VP8Idct: Does one or two inverse transforms. If do_two is set, the transforms
// will be done for (ref, in, dst) and (ref + 4, in + 16, dst + 4).
typedef void (*VP8Idct)(const uint8_t* ref, const int16_t* in, uint8_t* dst,
int do_two);
// VP8Fdct: forward transform of the 4x4 difference 'src - ref'; 16 coefficients
// are written to 'out'.
typedef void (*VP8Fdct)(const uint8_t* src, const uint8_t* ref, int16_t* out);
// VP8WHT: 4x4 transform used by the *WHT entry points below (presumably
// Walsh-Hadamard). The default implementations read/write one side with a
// stride of 16 int16_t between samples.
typedef void (*VP8WHT)(const int16_t* in, int16_t* out);
extern VP8Idct VP8ITransform;
extern VP8Fdct VP8FTransform;
extern VP8WHT VP8ITransformWHT;
extern VP8WHT VP8FTransformWHT;
// Predictions
// *dst is the destination block. *top and *left can be NULL.
typedef void (*VP8IntraPreds)(uint8_t *dst, const uint8_t* left,
const uint8_t* top);
// 4x4 variant: the default implementation dereferences 'top' unconditionally
// and reads the left / top-left samples at negative offsets from it.
typedef void (*VP8Intra4Preds)(uint8_t *dst, const uint8_t* top);
extern VP8Intra4Preds VP8EncPredLuma4;
extern VP8IntraPreds VP8EncPredLuma16;
extern VP8IntraPreds VP8EncPredChroma8;
// Distortion between 'pix' and 'ref'; the default implementations return the
// sum of squared differences over the block size given in the name.
typedef int (*VP8Metric)(const uint8_t* pix, const uint8_t* ref);
extern VP8Metric VP8SSE16x16, VP8SSE16x8, VP8SSE8x8, VP8SSE4x4;
// Weighted texture distortion between 'pix' and 'ref'; 'weights' holds 16
// per-coefficient weights.
typedef int (*VP8WMetric)(const uint8_t* pix, const uint8_t* ref,
const uint16_t* const weights);
extern VP8WMetric VP8TDisto4x4, VP8TDisto16x16;
// Copies a square block from 'src' to 'dst' (both use row stride BPS in the
// default implementations).
typedef void (*VP8BlockCopy)(const uint8_t* src, uint8_t* dst);
extern VP8BlockCopy VP8Copy4x4;
extern VP8BlockCopy VP8Copy8x8;
extern VP8BlockCopy VP8Copy16x16;
// Quantization
struct VP8Matrix; // forward declaration
// Quantizes in[] (starting at scan position 'n') into out[] and overwrites
// in[] with the dequantized values. The default implementation returns
// non-zero if at least one quantized coefficient is non-zero.
typedef int (*VP8QuantizeBlock)(int16_t in[16], int16_t out[16],
int n, const struct VP8Matrix* const mtx);
extern VP8QuantizeBlock VP8EncQuantizeBlock;
// Compute susceptibility based on DCT-coeff histograms:
// the higher, the "easier" the macroblock is to compress.
typedef int (*VP8CHisto)(const uint8_t* ref, const uint8_t* pred,
int start_block, int end_block);
// Offsets of the 4x4 sub-blocks (16 luma, 4 U, 4 V) inside a BPS-stride buffer.
extern const int VP8DspScan[16 + 4 + 4];
extern VP8CHisto VP8CollectHistogram;
void VP8EncDspInit(void); // must be called before using any of the above
//------------------------------------------------------------------------------
// Decoding
// Decoder-side inverse transform: reads coefficients, updates 'dst' in place.
typedef void (*VP8DecIdct)(const int16_t* coeffs, uint8_t* dst);
// when doing two transforms, coeffs is actually int16_t[2][16].
typedef void (*VP8DecIdct2)(const int16_t* coeffs, uint8_t* dst, int do_two);
extern VP8DecIdct2 VP8Transform;
extern VP8DecIdct VP8TransformUV;
extern VP8DecIdct VP8TransformDC;
extern VP8DecIdct VP8TransformDCUV;
extern void (*VP8TransformWHT)(const int16_t* in, int16_t* out);
// *dst is the destination block, with stride BPS. Boundary samples are
// assumed accessible when needed.
typedef void (*VP8PredFunc)(uint8_t* dst);
// Prediction tables, indexed by prediction mode (array sizes are given in the
// bracketed comments).
extern VP8PredFunc VP8PredLuma16[/* NUM_B_DC_MODES */];
extern VP8PredFunc VP8PredChroma8[/* NUM_B_DC_MODES */];
extern VP8PredFunc VP8PredLuma4[/* NUM_BMODES */];
// simple filter (only for luma)
typedef void (*VP8SimpleFilterFunc)(uint8_t* p, int stride, int thresh);
extern VP8SimpleFilterFunc VP8SimpleVFilter16;
extern VP8SimpleFilterFunc VP8SimpleHFilter16;
extern VP8SimpleFilterFunc VP8SimpleVFilter16i; // filter 3 inner edges
extern VP8SimpleFilterFunc VP8SimpleHFilter16i;
// regular filter (on both macroblock edges and inner edges)
typedef void (*VP8LumaFilterFunc)(uint8_t* luma, int stride,
int thresh, int ithresh, int hev_t);
// Chroma variant: the u and v planes are filtered in the same call.
typedef void (*VP8ChromaFilterFunc)(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_t);
// on outer edge
extern VP8LumaFilterFunc VP8VFilter16;
extern VP8LumaFilterFunc VP8HFilter16;
extern VP8ChromaFilterFunc VP8VFilter8;
extern VP8ChromaFilterFunc VP8HFilter8;
// on inner edge
extern VP8LumaFilterFunc VP8VFilter16i; // filtering 3 inner edges altogether
extern VP8LumaFilterFunc VP8HFilter16i;
extern VP8ChromaFilterFunc VP8VFilter8i; // filtering u and v altogether
extern VP8ChromaFilterFunc VP8HFilter8i;
// must be called before anything using the above
extern void VP8DspInit(void);
//------------------------------------------------------------------------------
// WebP I/O
#define FANCY_UPSAMPLING // undefined to remove fancy upsampling support
#ifdef FANCY_UPSAMPLING
// Converts a pair of luma rows (top_y / bottom_y) to two output rows, using
// two rows of chroma samples (top_u/top_v and cur_u/cur_v). 'len' is the row
// length; its unit (pixels) is presumed from the luma pointers -- confirm
// against the implementation.
typedef void (*WebPUpsampleLinePairFunc)(
const uint8_t* top_y, const uint8_t* bottom_y,
const uint8_t* top_u, const uint8_t* top_v,
const uint8_t* cur_u, const uint8_t* cur_v,
uint8_t* top_dst, uint8_t* bottom_dst, int len);
// Fancy upsampling functions to convert YUV to RGB(A) modes
extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];
extern WebPUpsampleLinePairFunc WebPUpsamplersKeepAlpha[/* MODE_LAST */];
// Initializes SSE2 version of the fancy upsamplers.
void WebPInitUpsamplersSSE2(void);
#endif // FANCY_UPSAMPLING
// Point-sampling methods.
// Same row-pair layout as above, but with a single row of u/v samples shared
// by both output rows.
typedef void (*WebPSampleLinePairFunc)(
const uint8_t* top_y, const uint8_t* bottom_y,
const uint8_t* u, const uint8_t* v,
uint8_t* top_dst, uint8_t* bottom_dst, int len);
extern const WebPSampleLinePairFunc WebPSamplers[/* MODE_LAST */];
// YUV444->RGB converters
// One y, u and v sample row in, one output row out.
typedef void (*WebPYUV444Converter)(const uint8_t* y,
const uint8_t* u, const uint8_t* v,
uint8_t* dst, int len);
extern const WebPYUV444Converter WebPYUV444Converters[/* MODE_LAST */];
// Main function to be called
void WebPInitUpsamplers(void);
//------------------------------------------------------------------------------
#if defined(__cplusplus) || defined(c_plusplus)
} // extern "C"
#endif
#endif /* WEBP_DSP_DSP_H_ */

744
src/dsp/enc.c Normal file
View File

@@ -0,0 +1,744 @@
// Copyright 2011 Google Inc.
//
// This code is licensed under the same terms as WebM:
// Software License Agreement: http://www.webmproject.org/license/software/
// Additional IP Rights Grant: http://www.webmproject.org/license/additional/
// -----------------------------------------------------------------------------
//
// Speed-critical encoding functions.
//
// Author: Skal (pascal.massimino@gmail.com)
#include "../enc/vp8enci.h"
#if defined(__cplusplus) || defined(c_plusplus)
extern "C" {
#endif
//------------------------------------------------------------------------------
// Compute susceptibility based on DCT-coeff histograms:
// the higher, the "easier" the macroblock is to compress.
// Clamps 'alpha' to the [0, 255] range.
static int ClipAlpha(int alpha) {
  if (alpha < 0) return 0;
  if (alpha > 255) return 255;
  return alpha;
}
// Turns a histogram of binned coefficient magnitudes into a compressibility
// score in [0..255]. Bin 0 is ignored; every non-empty bin b >= 1 adds the
// running population (bins 1..b) times b to the numerator and b*b to the
// denominator.
int VP8GetAlpha(const int histo[MAX_COEFF_THRESH + 1]) {
  int weighted_sum = 0;
  int norm = 0;
  int running_count = 0;
  int alpha = 0;
  int k;
  // note: changing this loop to avoid the numerous "k + 1" slows things down.
  for (k = 0; k < MAX_COEFF_THRESH; ++k) {
    if (!histo[k + 1]) continue;
    running_count += histo[k + 1];
    weighted_sum += running_count * (k + 1);
    norm += (k + 1) * (k + 1);
  }
  // we scale the value to a usable [0..255] range
  if (norm) {
    alpha = 10 * weighted_sum / norm - 5;
  }
  return ClipAlpha(alpha);
}
// Offset of the top-left sample of each 4x4 sub-block inside a buffer whose
// row stride is BPS: 16 luma blocks in raster order, followed by 4 U blocks
// and 4 V blocks. The V offsets start at column 8, i.e. to the right of the
// U block in the same rows.
const int VP8DspScan[16 + 4 + 4] = {
// Luma
0 + 0 * BPS, 4 + 0 * BPS, 8 + 0 * BPS, 12 + 0 * BPS,
0 + 4 * BPS, 4 + 4 * BPS, 8 + 4 * BPS, 12 + 4 * BPS,
0 + 8 * BPS, 4 + 8 * BPS, 8 + 8 * BPS, 12 + 8 * BPS,
0 + 12 * BPS, 4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS,
0 + 0 * BPS, 4 + 0 * BPS, 0 + 4 * BPS, 4 + 4 * BPS, // U
8 + 0 * BPS, 12 + 0 * BPS, 8 + 4 * BPS, 12 + 4 * BPS // V
};
// Forward-transforms the residual (ref - pred) of every 4x4 block in
// [start_block, end_block), bins the coefficient magnitudes (|coeff| >> 2,
// capped at MAX_COEFF_THRESH) and returns VP8GetAlpha() of the histogram.
static int CollectHistogram(const uint8_t* ref, const uint8_t* pred,
                            int start_block, int end_block) {
  int histo[MAX_COEFF_THRESH + 1] = { 0 };
  int block;
  for (block = start_block; block < end_block; ++block) {
    const int offset = VP8DspScan[block];
    int16_t coeffs[16];
    int i;
    VP8FTransform(ref + offset, pred + offset, coeffs);
    for (i = 0; i < 16; ++i) {
      int bin = abs(coeffs[i]) >> 2;
      if (bin > MAX_COEFF_THRESH) bin = MAX_COEFF_THRESH;
      ++histo[bin];
    }
  }
  return VP8GetAlpha(histo);
}
//------------------------------------------------------------------------------
// run-time tables (clip1: 766 bytes)
// Lookup table clamping any value in [-255, 510] to [0, 255]: the entry for
// value 'x' is stored at index 255 + x.
static uint8_t clip1[255 + 510 + 1];

// Set to 1 only after clip1[] has been completely filled. It is declared
// 'volatile' with the intent of keeping that store last.
// NOTE(review): 'volatile' is not a memory barrier; concurrent first calls
// would write identical values, which presumably makes the race benign --
// confirm before relying on it.
static volatile int tables_ok = 0;

// Fills clip1[] on the first call; later calls return immediately.
static void InitTables(void) {
  int value;
  if (tables_ok) return;
  for (value = -255; value <= 255 + 255; ++value) {
    uint8_t clipped;
    if (value < 0) {
      clipped = 0;
    } else if (value > 255) {
      clipped = 255;
    } else {
      clipped = (uint8_t)value;
    }
    clip1[255 + value] = clipped;
  }
  tables_ok = 1;
}
// Clamps 'v' to [0, 255]. Values that already fit in 8 bits (no bit set
// outside 0xff) are returned unchanged.
static inline uint8_t clip_8b(int v) {
  if ((v & ~0xff) == 0) {
    return (uint8_t)v;
  }
  return (v < 0) ? 0 : 255;
}
//------------------------------------------------------------------------------
// Transforms (Paragraph 14.4)
// STORE(x, y, v): writes the reconstructed sample at column x, row y of 'dst'
// as the clipped sum of the matching 'ref' sample and the residual (v >> 3).
// Relies on variables named 'dst' and 'ref' being in scope at the call site.
#define STORE(x, y, v) \
dst[(x) + (y) * BPS] = clip_8b(ref[(x) + (y) * BPS] + ((v) >> 3))
// Transform multipliers in 16-bit fixed point (both are used with MUL below,
// which shifts the product right by 16).
static const int kC1 = 20091 + (1 << 16);
static const int kC2 = 35468;
// MUL(a, b): fixed-point product, (a * b) >> 16.
#define MUL(a, b) (((a) * (b)) >> 16)
// Inverse-transforms one 4x4 block of coefficients 'in' and stores the
// clipped sum ref + residual into 'dst' (ref and dst both use stride BPS).
static inline void ITransformOne(const uint8_t* ref, const int16_t* in,
                                 uint8_t* dst) {
  int scratch[4 * 4];
  int i;
  // vertical pass: iteration i reads in[i], in[4+i], in[8+i], in[12+i] and
  // fills scratch[4*i .. 4*i+3].
  for (i = 0; i < 4; ++i) {
    const int in0 = in[0 + i];
    const int in1 = in[4 + i];
    const int in2 = in[8 + i];
    const int in3 = in[12 + i];
    const int a = in0 + in2;
    const int b = in0 - in2;
    const int c = MUL(in1, kC2) - MUL(in3, kC1);
    const int d = MUL(in1, kC1) + MUL(in3, kC2);
    int* const t = scratch + 4 * i;
    t[0] = a + d;
    t[1] = b + c;
    t[2] = b - c;
    t[3] = a - d;
  }
  // horizontal pass: iteration i produces output row i. The +4 is the rounder
  // for the >> 3 performed inside STORE().
  for (i = 0; i < 4; ++i) {
    const int* const t = scratch + i;
    const int dc = t[0] + 4;
    const int a = dc + t[8];
    const int b = dc - t[8];
    const int c = MUL(t[4], kC2) - MUL(t[12], kC1);
    const int d = MUL(t[4], kC1) + MUL(t[12], kC2);
    STORE(0, i, a + d);
    STORE(1, i, b + c);
    STORE(2, i, b - c);
    STORE(3, i, a - d);
  }
}
// Runs one inverse transform on (ref, in, dst) and, when 'do_two' is non-zero,
// a second one on (ref + 4, in + 16, dst + 4).
static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst,
                       int do_two) {
  const int num_blocks = do_two ? 2 : 1;
  int n;
  for (n = 0; n < num_blocks; ++n) {
    ITransformOne(ref + 4 * n, in + 16 * n, dst + 4 * n);
  }
}
// Forward 4x4 transform of the residual 'src - ref'.
//   src, ref: 4x4 sample blocks, both with row stride BPS.
//   out:      receives the 16 coefficients; each iteration of the second
//             pass writes out[i], out[4 + i], out[8 + i] and out[12 + i].
static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
  int i;
  int tmp[16];
  // First pass: one row of the residual per iteration.
  for (i = 0; i < 4; ++i, src += BPS, ref += BPS) {
    const int d0 = src[0] - ref[0];
    const int d1 = src[1] - ref[1];
    const int d2 = src[2] - ref[2];
    const int d3 = src[3] - ref[3];
    // The sums/differences can be negative, and left-shifting a negative
    // value is undefined behavior in C, so scale with a multiplication.
    const int a0 = (d0 + d3) * 8;
    const int a1 = (d1 + d2) * 8;
    const int a2 = (d1 - d2) * 8;
    const int a3 = (d0 - d3) * 8;
    tmp[0 + i * 4] = (a0 + a1);
    tmp[1 + i * 4] = (a2 * 2217 + a3 * 5352 + 14500) >> 12;
    tmp[2 + i * 4] = (a0 - a1);
    tmp[3 + i * 4] = (a3 * 2217 - a2 * 5352 + 7500) >> 12;
  }
  // Second pass: one column of tmp[] per iteration.
  for (i = 0; i < 4; ++i) {
    const int a0 = (tmp[0 + i] + tmp[12 + i]);
    const int a1 = (tmp[4 + i] + tmp[ 8 + i]);
    const int a2 = (tmp[4 + i] - tmp[ 8 + i]);
    const int a3 = (tmp[0 + i] - tmp[12 + i]);
    out[0 + i] = (a0 + a1 + 7) >> 4;
    out[4 + i] = ((a2 * 2217 + a3 * 5352 + 12000) >> 16) + (a3 != 0);
    out[8 + i] = (a0 - a1 + 7) >> 4;
    out[12+ i] = ((a3 * 2217 - a2 * 5352 + 51000) >> 16);
  }
}
// Inverse 4x4 WHT.
//   in:  16 contiguous coefficients.
//   out: the 16 results are scattered with a stride of 16 int16_t, i.e.
//        result (row, col) is written to out[64 * row + 16 * col].
static void ITransformWHT(const int16_t* in, int16_t* out) {
  int tmp[16];
  int col, row;
  // First pass: one column of in[] per iteration.
  for (col = 0; col < 4; ++col) {
    const int sum_outer = in[0 + col] + in[12 + col];
    const int sum_inner = in[4 + col] + in[ 8 + col];
    const int dif_inner = in[4 + col] - in[ 8 + col];
    const int dif_outer = in[0 + col] - in[12 + col];
    tmp[0 + col] = sum_outer + sum_inner;
    tmp[4 + col] = dif_outer + dif_inner;
    tmp[8 + col] = sum_outer - sum_inner;
    tmp[12 + col] = dif_outer - dif_inner;
  }
  // Second pass: one row of tmp[] per iteration, with rounding (+3) and >> 3.
  for (row = 0; row < 4; ++row) {
    const int* const t = tmp + 4 * row;
    int16_t* const dst = out + 64 * row;
    const int dc = t[0] + 3;  // w/ rounder
    const int a0 = dc + t[3];
    const int a1 = t[1] + t[2];
    const int a2 = t[1] - t[2];
    const int a3 = dc - t[3];
    dst[ 0] = (a0 + a1) >> 3;
    dst[16] = (a3 + a2) >> 3;
    dst[32] = (a0 - a1) >> 3;
    dst[48] = (a3 - a2) >> 3;
  }
}
// Forward 4x4 WHT.
//   in:  16 input samples read with a stride of 16 int16_t: sample
//        (row, col) is in[64 * row + 16 * col].
//   out: 16 contiguous coefficients.
static void FTransformWHT(const int16_t* in, int16_t* out) {
  int tmp[16];
  int i;
  // First pass: one group of four strided inputs per iteration.
  for (i = 0; i < 4; ++i, in += 64) {
    // The inputs are signed, so the sums/differences may be negative.
    // Left-shifting a negative value is undefined behavior in C; scale with
    // a multiplication instead.
    const int a0 = (in[0 * 16] + in[2 * 16]) * 4;
    const int a1 = (in[1 * 16] + in[3 * 16]) * 4;
    const int a2 = (in[1 * 16] - in[3 * 16]) * 4;
    const int a3 = (in[0 * 16] - in[2 * 16]) * 4;
    tmp[0 + i * 4] = (a0 + a1) + (a0 != 0);
    tmp[1 + i * 4] = a3 + a2;
    tmp[2 + i * 4] = a3 - a2;
    tmp[3 + i * 4] = a0 - a1;
  }
  // Second pass: one column of tmp[] per iteration, followed by a rounded
  // division by 8 (positive values get an extra +1 before the shift).
  for (i = 0; i < 4; ++i) {
    const int a0 = (tmp[0 + i] + tmp[8 + i]);
    const int a1 = (tmp[4 + i] + tmp[12+ i]);
    const int a2 = (tmp[4 + i] - tmp[12+ i]);
    const int a3 = (tmp[0 + i] - tmp[8 + i]);
    const int b0 = a0 + a1;
    const int b1 = a3 + a2;
    const int b2 = a3 - a2;
    const int b3 = a0 - a1;
    out[ 0 + i] = (b0 + (b0 > 0) + 3) >> 3;
    out[ 4 + i] = (b1 + (b1 > 0) + 3) >> 3;
    out[ 8 + i] = (b2 + (b2 > 0) + 3) >> 3;
    out[12 + i] = (b3 + (b3 > 0) + 3) >> 3;
  }
}
#undef MUL
#undef STORE
//------------------------------------------------------------------------------
// Intra predictions
#define DST(x, y) dst[(x) + (y) * BPS]

// Sets every sample of the size x size block at 'dst' (row stride BPS) to
// 'value'.
static inline void Fill(uint8_t* dst, int value, int size) {
  uint8_t* row = dst;
  int rows_left = size;
  while (rows_left-- > 0) {
    memset(row, value, size);
    row += BPS;
  }
}
// Vertical prediction: every row of the size x size block is a copy of the
// 'top' row. Without top samples the block is filled with 127.
static inline void VerticalPred(uint8_t* dst, const uint8_t* top, int size) {
  int row;
  if (!top) {
    Fill(dst, 127, size);
    return;
  }
  for (row = 0; row < size; ++row) {
    memcpy(dst + row * BPS, top, size);
  }
}
// Horizontal prediction: row j of the size x size block is filled with
// left[j]. Without left samples the block is filled with 129.
static inline void HorizontalPred(uint8_t* dst, const uint8_t* left, int size) {
  int row;
  if (!left) {
    Fill(dst, 129, size);
    return;
  }
  for (row = 0; row < size; ++row) {
    memset(dst + row * BPS, left[row], size);
  }
}
static inline void TrueMotion(uint8_t* dst, const uint8_t* left,
const uint8_t* top, int size) {
int y;
if (left) {
if (top) {
const uint8_t* const clip = clip1 + 255 - left[-1];
for (y = 0; y < size; ++y) {
const uint8_t* const clip_table = clip + left[y];
int x;
for (x = 0; x < size; ++x) {
dst[x] = clip_table[top[x]];
}
dst += BPS;
}
} else {
HorizontalPred(dst, left, size);
}
} else {
// true motion without left samples (hence: with default 129 value)
// is equivalent to VE prediction where you just copy the top samples.
// Note that if top samples are not available, the default value is
// then 129, and not 127 as in the VerticalPred case.
if (top) {
VerticalPred(dst, top, size);
} else {
Fill(dst, 129, size);
}
}
}
// DC prediction: fills the size x size block with the rounded average of the
// available border samples. When only one border (top or left) is present its
// sum is counted twice; with no border at all the block is filled with 0x80.
// 'round' and 'shift' implement the rounded division by 2 * size.
static inline void DCMode(uint8_t* dst, const uint8_t* left,
                          const uint8_t* top,
                          int size, int round, int shift) {
  int total = 0;
  int j;
  if (!top && !left) {  // no top, no left, nothing.
    Fill(dst, 0x80, size);
    return;
  }
  if (top) {
    for (j = 0; j < size; ++j) total += top[j];
  }
  if (left) {
    for (j = 0; j < size; ++j) total += left[j];
  }
  if (!top || !left) {  // a single border: count it twice
    total += total;
  }
  Fill(dst, (total + round) >> shift, size);
}
//------------------------------------------------------------------------------
// Chroma 8x8 prediction (paragraph 12.2)
// Computes the DC, vertical, horizontal and TrueMotion 8x8 predictions for
// the U plane, then for the V plane. For V, the destination is shifted by 8
// columns, the top samples by 8 and the left samples by 16.
static void IntraChromaPreds(uint8_t* dst, const uint8_t* left,
                             const uint8_t* top) {
  int plane;  // 0: U block, 1: V block
  for (plane = 0; plane < 2; ++plane) {
    uint8_t* const out = dst + 8 * plane;
    const uint8_t* const plane_top = top ? top + 8 * plane : top;
    const uint8_t* const plane_left = left ? left + 16 * plane : left;
    DCMode(C8DC8 + out, plane_left, plane_top, 8, 8, 4);
    VerticalPred(C8VE8 + out, plane_top, 8);
    HorizontalPred(C8HE8 + out, plane_left, 8);
    TrueMotion(C8TM8 + out, plane_left, plane_top, 8);
  }
}
//------------------------------------------------------------------------------
// luma 16x16 prediction (paragraph 12.3)
// Computes the four 16x16 luma predictions (DC, vertical, horizontal,
// TrueMotion), each written at its own offset from 'dst'.
static void Intra16Preds(uint8_t* dst,
                         const uint8_t* left, const uint8_t* top) {
  const int size = 16;
  uint8_t* const dc_dst = I16DC16 + dst;
  uint8_t* const ve_dst = I16VE16 + dst;
  uint8_t* const he_dst = I16HE16 + dst;
  uint8_t* const tm_dst = I16TM16 + dst;
  DCMode(dc_dst, left, top, size, 16, 5);  // (sum of 32 samples + 16) >> 5
  VerticalPred(ve_dst, top, size);
  HorizontalPred(he_dst, left, size);
  TrueMotion(tm_dst, left, top, size);
}
//------------------------------------------------------------------------------
// luma 4x4 prediction
// AVG3: rounded 3-tap average with weights (1, 2, 1).
#define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2)
// AVG2: rounded average of two samples.
#define AVG2(a, b) (((a) + (b) + 1) >> 1)
// Vertical 4x4 prediction: each row is the 3-tap smoothed top row, computed
// from top[-1] .. top[4].
static void VE4(uint8_t* dst, const uint8_t* top) {  // vertical
  uint8_t smoothed[4];
  int i;
  for (i = 0; i < 4; ++i) {
    smoothed[i] = AVG3(top[i - 1], top[i], top[i + 1]);
  }
  for (i = 0; i < 4; ++i) {
    memcpy(dst + i * BPS, smoothed, 4);
  }
}
// Horizontal 4x4 prediction: each row is filled with the 3-tap smoothed left
// sample for that row. The left samples are read from top[-2] (row 0) down to
// top[-5] (row 3); top[-1] is the top-left sample.
static void HE4(uint8_t* dst, const uint8_t* top) {  // horizontal
  const int X = top[-1];
  const int I = top[-2];
  const int J = top[-3];
  const int K = top[-4];
  const int L = top[-5];
  // Fill each row with memset() rather than storing a replicated 32-bit word
  // through a casted pointer: the cast violates C's aliasing rules and
  // assumes 4-byte alignment of 'dst'. The bytes written are the same.
  memset(dst + 0 * BPS, AVG3(X, I, J), 4);
  memset(dst + 1 * BPS, AVG3(I, J, K), 4);
  memset(dst + 2 * BPS, AVG3(J, K, L), 4);
  memset(dst + 3 * BPS, AVG3(K, L, L), 4);
}
// DC 4x4 prediction: fills the block with the rounded average of the four
// top samples (top[0..3]) and the four left samples (top[-5..-2]).
static void DC4(uint8_t* dst, const uint8_t* top) {
  const uint8_t* const left = top - 5;
  uint32_t sum = 4;  // rounder for the >> 3 below
  int i;
  for (i = 0; i < 4; ++i) {
    sum += top[i] + left[i];
  }
  Fill(dst, sum >> 3, 4);
}
// Down-right diagonal 4x4 prediction. The border samples top[-5] .. top[3]
// (L, K, J, I, X, A, B, C, D) are smoothed with AVG3, and every sample on the
// diagonal x - y == d receives the same smoothed value.
static void RD4(uint8_t* dst, const uint8_t* top) {
  const uint8_t* const edge = top - 5;  // edge[0..8] = L K J I X A B C D
  uint8_t diag[7];                      // diag[d + 3] for d = x - y
  int k, x, y;
  for (k = 0; k < 7; ++k) {
    diag[k] = AVG3(edge[k], edge[k + 1], edge[k + 2]);
  }
  for (y = 0; y < 4; ++y) {
    for (x = 0; x < 4; ++x) {
      DST(x, y) = diag[3 + x - y];
    }
  }
}
// Down-left diagonal 4x4 prediction. The samples top[0] .. top[7] (A .. H)
// are smoothed with AVG3, and every sample on the anti-diagonal x + y == s
// receives the same value. The last tap repeats H.
static void LD4(uint8_t* dst, const uint8_t* top) {
  uint8_t diag[7];  // diag[s] for s = x + y
  int s, x, y;
  for (s = 0; s < 6; ++s) {
    diag[s] = AVG3(top[s], top[s + 1], top[s + 2]);
  }
  diag[6] = AVG3(top[6], top[7], top[7]);
  for (y = 0; y < 4; ++y) {
    for (x = 0; x < 4; ++x) {
      DST(x, y) = diag[x + y];
    }
  }
}
// Vertical-right 4x4 prediction, built from the top-left sample X, the left
// samples I, J, K (top[-2..-4]) and the top samples A..D (top[0..3]).
static void VR4(uint8_t* dst, const uint8_t* top) {
  const int X = top[-1];
  const int I = top[-2];
  const int J = top[-3];
  const int K = top[-4];
  const int A = top[0];
  const int B = top[1];
  const int C = top[2];
  const int D = top[3];
  // 2-tap averages
  const int xa = AVG2(X, A);
  const int ab = AVG2(A, B);
  const int bc = AVG2(B, C);
  const int cd = AVG2(C, D);
  // 3-tap averages
  const int kji = AVG3(K, J, I);
  const int jix = AVG3(J, I, X);
  const int ixa = AVG3(I, X, A);
  const int xab = AVG3(X, A, B);
  const int abc = AVG3(A, B, C);
  const int bcd = AVG3(B, C, D);
  // row 0
  DST(0, 0) = xa;  DST(1, 0) = ab;  DST(2, 0) = bc;  DST(3, 0) = cd;
  // row 1
  DST(0, 1) = ixa; DST(1, 1) = xab; DST(2, 1) = abc; DST(3, 1) = bcd;
  // row 2
  DST(0, 2) = jix; DST(1, 2) = xa;  DST(2, 2) = ab;  DST(3, 2) = bc;
  // row 3
  DST(0, 3) = kji; DST(1, 3) = ixa; DST(2, 3) = xab; DST(3, 3) = abc;
}
// Vertical-left 4x4 prediction, built from the top and top-right samples
// A..H (top[0..7]).
static void VL4(uint8_t* dst, const uint8_t* top) {
  const int A = top[0];
  const int B = top[1];
  const int C = top[2];
  const int D = top[3];
  const int E = top[4];
  const int F = top[5];
  const int G = top[6];
  const int H = top[7];
  // 2-tap averages
  const int ab = AVG2(A, B);
  const int bc = AVG2(B, C);
  const int cd = AVG2(C, D);
  const int de = AVG2(D, E);
  // 3-tap averages
  const int abc = AVG3(A, B, C);
  const int bcd = AVG3(B, C, D);
  const int cde = AVG3(C, D, E);
  const int def = AVG3(D, E, F);
  const int efg = AVG3(E, F, G);
  const int fgh = AVG3(F, G, H);
  // row 0
  DST(0, 0) = ab;  DST(1, 0) = bc;  DST(2, 0) = cd;  DST(3, 0) = de;
  // row 1
  DST(0, 1) = abc; DST(1, 1) = bcd; DST(2, 1) = cde; DST(3, 1) = def;
  // row 2
  DST(0, 2) = bc;  DST(1, 2) = cd;  DST(2, 2) = de;  DST(3, 2) = efg;
  // row 3
  DST(0, 3) = bcd; DST(1, 3) = cde; DST(2, 3) = def; DST(3, 3) = fgh;
}
// Horizontal-up 4x4 prediction, built from the left samples I, J, K, L
// (top[-2] .. top[-5]). Positions past the last left sample replicate L.
static void HU4(uint8_t* dst, const uint8_t* top) {
  const int I = top[-2];
  const int J = top[-3];
  const int K = top[-4];
  const int L = top[-5];
  // 2-tap averages
  const int ij = AVG2(I, J);
  const int jk = AVG2(J, K);
  const int kl = AVG2(K, L);
  // 3-tap averages
  const int ijk = AVG3(I, J, K);
  const int jkl = AVG3(J, K, L);
  const int kll = AVG3(K, L, L);
  // row 0
  DST(0, 0) = ij; DST(1, 0) = ijk; DST(2, 0) = jk; DST(3, 0) = jkl;
  // row 1
  DST(0, 1) = jk; DST(1, 1) = jkl; DST(2, 1) = kl; DST(3, 1) = kll;
  // row 2
  DST(0, 2) = kl; DST(1, 2) = kll; DST(2, 2) = L;  DST(3, 2) = L;
  // row 3
  DST(0, 3) = L;  DST(1, 3) = L;   DST(2, 3) = L;  DST(3, 3) = L;
}
// Horizontal-down 4x4 prediction, built from the left samples I..L
// (top[-2] .. top[-5]), the top-left sample X and the top samples A, B, C.
static void HD4(uint8_t* dst, const uint8_t* top) {
  const int X = top[-1];
  const int I = top[-2];
  const int J = top[-3];
  const int K = top[-4];
  const int L = top[-5];
  const int A = top[0];
  const int B = top[1];
  const int C = top[2];
  // 2-tap averages
  const int ix = AVG2(I, X);
  const int ji = AVG2(J, I);
  const int kj = AVG2(K, J);
  const int lk = AVG2(L, K);
  // 3-tap averages
  const int abc = AVG3(A, B, C);
  const int xab = AVG3(X, A, B);
  const int ixa = AVG3(I, X, A);
  const int jix = AVG3(J, I, X);
  const int kji = AVG3(K, J, I);
  const int lkj = AVG3(L, K, J);
  // row 0
  DST(0, 0) = ix; DST(1, 0) = ixa; DST(2, 0) = xab; DST(3, 0) = abc;
  // row 1
  DST(0, 1) = ji; DST(1, 1) = jix; DST(2, 1) = ix;  DST(3, 1) = ixa;
  // row 2
  DST(0, 2) = kj; DST(1, 2) = kji; DST(2, 2) = ji;  DST(3, 2) = jix;
  // row 3
  DST(0, 3) = lk; DST(1, 3) = lkj; DST(2, 3) = kj;  DST(3, 3) = kji;
}
static void TM4(uint8_t* dst, const uint8_t* top) {
int x, y;
const uint8_t* const clip = clip1 + 255 - top[-1];
for (y = 0; y < 4; ++y) {
const uint8_t* const clip_table = clip + top[-2 - y];
for (x = 0; x < 4; ++x) {
dst[x] = clip_table[top[x]];
}
dst += BPS;
}
}
#undef DST
#undef AVG3
#undef AVG2
// Computes all ten 4x4 luma predictions, each written at its own offset from
// 'dst'. Border layout relative to 'top': the left samples are
// top[-5 .. -2], the top-left sample is top[-1], the top samples are
// top[0..3] and the top-right samples are top[4..7].
static void Intra4Preds(uint8_t* dst, const uint8_t* top) {
  uint8_t* const dc_dst = I4DC4 + dst;
  uint8_t* const tm_dst = I4TM4 + dst;
  uint8_t* const ve_dst = I4VE4 + dst;
  uint8_t* const he_dst = I4HE4 + dst;
  uint8_t* const rd_dst = I4RD4 + dst;
  uint8_t* const vr_dst = I4VR4 + dst;
  uint8_t* const ld_dst = I4LD4 + dst;
  uint8_t* const vl_dst = I4VL4 + dst;
  uint8_t* const hd_dst = I4HD4 + dst;
  uint8_t* const hu_dst = I4HU4 + dst;
  DC4(dc_dst, top);
  TM4(tm_dst, top);
  VE4(ve_dst, top);
  HE4(he_dst, top);
  RD4(rd_dst, top);
  VR4(vr_dst, top);
  LD4(ld_dst, top);
  VL4(vl_dst, top);
  HD4(hd_dst, top);
  HU4(hu_dst, top);
}
//------------------------------------------------------------------------------
// Metric
// Returns the sum of squared differences between the w x h blocks at 'a' and
// 'b' (both with row stride BPS).
static inline int GetSSE(const uint8_t* a, const uint8_t* b, int w, int h) {
  int total = 0;
  int row;
  for (row = 0; row < h; ++row) {
    const uint8_t* const pa = a + row * BPS;
    const uint8_t* const pb = b + row * BPS;
    int col;
    for (col = 0; col < w; ++col) {
      const int delta = (int)pa[col] - pb[col];
      total += delta * delta;
    }
  }
  return total;
}
// Sum of squared differences over a 16 (wide) x 16 (high) block.
static int SSE16x16(const uint8_t* a, const uint8_t* b) {
  const int width = 16;
  const int height = 16;
  return GetSSE(a, b, width, height);
}
// Sum of squared differences over a 16 (wide) x 8 (high) block.
static int SSE16x8(const uint8_t* a, const uint8_t* b) {
  const int width = 16;
  const int height = 8;
  return GetSSE(a, b, width, height);
}
// Sum of squared differences over an 8 x 8 block.
static int SSE8x8(const uint8_t* a, const uint8_t* b) {
  const int width = 8;
  const int height = 8;
  return GetSSE(a, b, width, height);
}
// Sum of squared differences over a 4 x 4 block.
static int SSE4x4(const uint8_t* a, const uint8_t* b) {
  const int width = 4;
  const int height = 4;
  return GetSSE(a, b, width, height);
}
//------------------------------------------------------------------------------
// Texture distortion
//
// We try to match the spectral content (weighted) between source and
// reconstructed samples.
// Hadamard transform
// Returns the weighted sum of the absolute value of transformed coefficients.
// Applies a 4x4 Hadamard transform to the block at 'in' (row stride BPS) and
// returns the sum of the absolute transformed coefficients, each scaled down
// by 8 with rounding and multiplied by its weight from w[0..15].
static int TTransform(const uint8_t* in, const uint16_t* w) {
  int sum = 0;
  int tmp[16];
  int i;
  // horizontal pass
  for (i = 0; i < 4; ++i, in += BPS) {
    // The differences can be negative, and left-shifting a negative value is
    // undefined behavior in C, so scale with a multiplication.
    const int a0 = (in[0] + in[2]) * 4;
    const int a1 = (in[1] + in[3]) * 4;
    const int a2 = (in[1] - in[3]) * 4;
    const int a3 = (in[0] - in[2]) * 4;
    tmp[0 + i * 4] = a0 + a1 + (a0 != 0);
    tmp[1 + i * 4] = a3 + a2;
    tmp[2 + i * 4] = a3 - a2;
    tmp[3 + i * 4] = a0 - a1;
  }
  // vertical pass
  for (i = 0; i < 4; ++i, ++w) {
    const int a0 = (tmp[0 + i] + tmp[8 + i]);
    const int a1 = (tmp[4 + i] + tmp[12+ i]);
    const int a2 = (tmp[4 + i] - tmp[12+ i]);
    const int a3 = (tmp[0 + i] - tmp[8 + i]);
    const int b0 = a0 + a1;
    const int b1 = a3 + a2;
    const int b2 = a3 - a2;
    const int b3 = a0 - a1;
    // abs((b + (b<0) + 3) >> 3) = (abs(b) + 3) >> 3
    sum += w[ 0] * ((abs(b0) + 3) >> 3);
    sum += w[ 4] * ((abs(b1) + 3) >> 3);
    sum += w[ 8] * ((abs(b2) + 3) >> 3);
    sum += w[12] * ((abs(b3) + 3) >> 3);
  }
  return sum;
}
// Texture distortion between two 4x4 blocks: the absolute difference of
// their weighted transform sums, divided by 16 with rounding.
static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
                    const uint16_t* const w) {
  const int energy_a = TTransform(a, w);
  const int energy_b = TTransform(b, w);
  const int delta = abs(energy_b - energy_a);
  return (delta + 8) >> 4;
}
// Texture distortion of a 16x16 block: the sum of Disto4x4() over its sixteen
// 4x4 sub-blocks, visited in raster order. The same weights 'w' are used for
// every sub-block.
static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
                      const uint16_t* const w) {
  int total = 0;
  int bx, by;
  for (by = 0; by < 4; ++by) {
    for (bx = 0; bx < 4; ++bx) {
      const int offset = 4 * bx + 4 * by * BPS;
      total += Disto4x4(a + offset, b + offset, w);
    }
  }
  return total;
}
//------------------------------------------------------------------------------
// Quantization
//
// Maps a scan position n (0..15) to the index of the corresponding
// coefficient inside a 4x4 block (used as 'j = kZigzag[n]' by QuantizeBlock).
static const uint8_t kZigzag[16] = {
0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
};
// Simple quantization
// Quantizes the coefficients of 'in' in zigzag order, starting at scan
// position 'n'.
//   out[n]: receives the signed quantized level for scan position n.
//   in[j]:  is overwritten with the dequantized value (level * q), or 0.
// Magnitudes are sharpened, capped at 2047 and zeroed when not above the
// per-coefficient threshold.
// Returns non-zero if at least one quantized level is non-zero.
static int QuantizeBlock(int16_t in[16], int16_t out[16],
                         int n, const VP8Matrix* const mtx) {
  int last_nz = -1;
  while (n < 16) {
    const int j = kZigzag[n];
    const int is_negative = (in[j] < 0);
    int magnitude = (is_negative ? -in[j] : in[j]) + mtx->sharpen_[j];
    if (magnitude > 2047) magnitude = 2047;
    if (magnitude > mtx->zthresh_[j]) {
      const int Q = mtx->q_[j];
      const int iQ = mtx->iq_[j];
      const int B = mtx->bias_[j];
      out[n] = QUANTDIV(magnitude, iQ, B);
      if (is_negative) out[n] = -out[n];
      in[j] = out[n] * Q;
      if (out[n]) last_nz = n;
    } else {
      out[n] = 0;
      in[j] = 0;
    }
    ++n;
  }
  return (last_nz >= 0);
}
//------------------------------------------------------------------------------
// Block copy
// Copies a size x size block from 'src' to 'dst'; both use row stride BPS.
static inline void Copy(const uint8_t* src, uint8_t* dst, int size) {
  int rows_left = size;
  while (rows_left-- > 0) {
    memcpy(dst, src, size);
    dst += BPS;
    src += BPS;
  }
}
// Copies a 4x4 block (row stride BPS on both sides).
static void Copy4x4(const uint8_t* src, uint8_t* dst) {
  Copy(src, dst, 4);
}
// Copies an 8x8 block (row stride BPS on both sides).
static void Copy8x8(const uint8_t* src, uint8_t* dst) {
  Copy(src, dst, 8);
}
// Copies a 16x16 block (row stride BPS on both sides).
static void Copy16x16(const uint8_t* src, uint8_t* dst) {
  Copy(src, dst, 16);
}
//------------------------------------------------------------------------------
// Initialization
// Speed-critical function pointers. We have to initialize them to the default
// implementations within VP8EncDspInit().
// They are defined here without an initializer, so they hold NULL until
// VP8EncDspInit() has run.
// Histogram collection and transforms.
VP8CHisto VP8CollectHistogram;
VP8Idct VP8ITransform;
VP8Fdct VP8FTransform;
VP8WHT VP8ITransformWHT;
VP8WHT VP8FTransformWHT;
// Intra predictions.
VP8Intra4Preds VP8EncPredLuma4;
VP8IntraPreds VP8EncPredLuma16;
VP8IntraPreds VP8EncPredChroma8;
// Distortion metrics.
VP8Metric VP8SSE16x16;
VP8Metric VP8SSE8x8;
VP8Metric VP8SSE16x8;
VP8Metric VP8SSE4x4;
VP8WMetric VP8TDisto4x4;
VP8WMetric VP8TDisto16x16;
// Quantization and block copy.
VP8QuantizeBlock VP8EncQuantizeBlock;
VP8BlockCopy VP8Copy4x4;
VP8BlockCopy VP8Copy8x8;
VP8BlockCopy VP8Copy16x16;
extern void VP8EncDspInitSSE2(void);

// Builds the run-time tables and installs the plain-C implementations of all
// encoder DSP function pointers. On SSE2-capable builds, VP8EncDspInitSSE2()
// is then called if a CPU-info hook is installed and reports SSE2 support.
void VP8EncDspInit(void) {
  InitTables();

  // default C implementations
  // Histogram and transforms.
  VP8CollectHistogram = CollectHistogram;
  VP8ITransform = ITransform;
  VP8FTransform = FTransform;
  VP8ITransformWHT = ITransformWHT;
  VP8FTransformWHT = FTransformWHT;

  // Intra predictions.
  VP8EncPredLuma4 = Intra4Preds;
  VP8EncPredLuma16 = Intra16Preds;
  VP8EncPredChroma8 = IntraChromaPreds;

  // Distortion metrics.
  VP8SSE16x16 = SSE16x16;
  VP8SSE16x8 = SSE16x8;
  VP8SSE8x8 = SSE8x8;
  VP8SSE4x4 = SSE4x4;
  VP8TDisto4x4 = Disto4x4;
  VP8TDisto16x16 = Disto16x16;

  // Quantization and block copy.
  VP8EncQuantizeBlock = QuantizeBlock;
  VP8Copy4x4 = Copy4x4;
  VP8Copy8x8 = Copy8x8;
  VP8Copy16x16 = Copy16x16;

  // If defined, use CPUInfo() to overwrite some pointers with faster versions.
#if defined(__SSE2__) || defined(_MSC_VER)
  if (VP8GetCPUInfo && VP8GetCPUInfo(kSSE2)) {
    VP8EncDspInitSSE2();
  }
#endif
}
#if defined(__cplusplus) || defined(c_plusplus)
} // extern "C"
#endif

834
src/dsp/enc_sse2.c Normal file
View File

@@ -0,0 +1,834 @@
// Copyright 2011 Google Inc.
//
// This code is licensed under the same terms as WebM:
// Software License Agreement: http://www.webmproject.org/license/software/
// Additional IP Rights Grant: http://www.webmproject.org/license/additional/
// -----------------------------------------------------------------------------
//
// SSE2 version of speed-critical encoding functions.
//
// Author: Christian Duvivier (cduvivier@google.com)
#if defined(__SSE2__) || defined(_MSC_VER)
#include <emmintrin.h>
#include "../enc/vp8enci.h"
#if defined(__cplusplus) || defined(c_plusplus)
extern "C" {
#endif
//------------------------------------------------------------------------------
// Compute susceptibility based on DCT-coeff histograms:
// the higher, the "easier" the macroblock is to compress.
// SSE2 counterpart of the C histogram collection: for every 4x4 block in
// [start_block, end_block), forward-transforms the residual (ref - pred),
// converts the 16 coefficients to bins (|coeff| >> 2, capped at
// MAX_COEFF_THRESH) eight lanes at a time, then updates the histogram with
// scalar code. Returns VP8GetAlpha() of the histogram.
static int CollectHistogramSSE2(const uint8_t* ref, const uint8_t* pred,
int start_block, int end_block) {
int histo[MAX_COEFF_THRESH + 1] = { 0 };
int16_t out[16];
int j, k;
const __m128i max_coeff_thresh = _mm_set1_epi16(MAX_COEFF_THRESH);
for (j = start_block; j < end_block; ++j) {
VP8FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
// Convert coefficients to bin (within out[]).
{
// Load.
// (unaligned loads/stores are used: out[] has no alignment guarantee)
const __m128i out0 = _mm_loadu_si128((__m128i*)&out[0]);
const __m128i out1 = _mm_loadu_si128((__m128i*)&out[8]);
// sign(out) = out >> 15 (0x0000 if positive, 0xffff if negative)
const __m128i sign0 = _mm_srai_epi16(out0, 15);
const __m128i sign1 = _mm_srai_epi16(out1, 15);
// abs(out) = (out ^ sign) - sign
// NOTE(review): in 16-bit lanes this leaves -32768 unchanged, so the bin
// computed below would be negative for that input; presumably the
// transform never produces it -- confirm.
const __m128i xor0 = _mm_xor_si128(out0, sign0);
const __m128i xor1 = _mm_xor_si128(out1, sign1);
const __m128i abs0 = _mm_sub_epi16(xor0, sign0);
const __m128i abs1 = _mm_sub_epi16(xor1, sign1);
// v = abs(out) >> 2
const __m128i v0 = _mm_srai_epi16(abs0, 2);
const __m128i v1 = _mm_srai_epi16(abs1, 2);
// bin = min(v, MAX_COEFF_THRESH)
const __m128i bin0 = _mm_min_epi16(v0, max_coeff_thresh);
const __m128i bin1 = _mm_min_epi16(v1, max_coeff_thresh);
// Store.
_mm_storeu_si128((__m128i*)&out[0], bin0);
_mm_storeu_si128((__m128i*)&out[8], bin1);
}
// Use bin to update histogram.
// (scalar loop: each out[k] now holds a bin index)
for (k = 0; k < 16; ++k) {
histo[out[k]]++;
}
}
return VP8GetAlpha(histo);
}
//------------------------------------------------------------------------------
// Transforms (Paragraph 14.4)
// Does one or two inverse transforms.
static void ITransformSSE2(const uint8_t* ref, const int16_t* in, uint8_t* dst,
int do_two) {
// This implementation makes use of 16-bit fixed point versions of two
// multiply constants:
// K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16
// K2 = sqrt(2) * sin (pi/8) ~= 35468 / 2^16
//
// To be able to use signed 16-bit integers, we use the following trick to
// have constants within range:
// - Associated constants are obtained by subtracting the 16-bit fixed point
// version of one:
// k = K - (1 << 16) => K = k + (1 << 16)
// K1 = 85627 => k1 = 20091
// K2 = 35468 => k2 = -30068
// - The multiplication of a variable by a constant become the sum of the
// variable and the multiplication of that variable by the associated
// constant:
// (x * K) >> 16 = (x * (k + (1 << 16))) >> 16 = ((x * k ) >> 16) + x
const __m128i k1 = _mm_set1_epi16(20091);
const __m128i k2 = _mm_set1_epi16(-30068);
__m128i T0, T1, T2, T3;
// Load and concatenate the transform coefficients (we'll do two inverse
// transforms in parallel). In the case of only one inverse transform, the
// second half of the vectors will just contain random value we'll never
// use nor store.
__m128i in0, in1, in2, in3;
{
in0 = _mm_loadl_epi64((__m128i*)&in[0]);
in1 = _mm_loadl_epi64((__m128i*)&in[4]);
in2 = _mm_loadl_epi64((__m128i*)&in[8]);
in3 = _mm_loadl_epi64((__m128i*)&in[12]);
// a00 a10 a20 a30 x x x x
// a01 a11 a21 a31 x x x x
// a02 a12 a22 a32 x x x x
// a03 a13 a23 a33 x x x x
if (do_two) {
const __m128i inB0 = _mm_loadl_epi64((__m128i*)&in[16]);
const __m128i inB1 = _mm_loadl_epi64((__m128i*)&in[20]);
const __m128i inB2 = _mm_loadl_epi64((__m128i*)&in[24]);
const __m128i inB3 = _mm_loadl_epi64((__m128i*)&in[28]);
in0 = _mm_unpacklo_epi64(in0, inB0);
in1 = _mm_unpacklo_epi64(in1, inB1);
in2 = _mm_unpacklo_epi64(in2, inB2);
in3 = _mm_unpacklo_epi64(in3, inB3);
// a00 a10 a20 a30 b00 b10 b20 b30
// a01 a11 a21 a31 b01 b11 b21 b31
// a02 a12 a22 a32 b02 b12 b22 b32
// a03 a13 a23 a33 b03 b13 b23 b33
}
}
// Vertical pass and subsequent transpose.
{
// First pass, c and d calculations are longer because of the "trick"
// multiplications.
const __m128i a = _mm_add_epi16(in0, in2);
const __m128i b = _mm_sub_epi16(in0, in2);
// c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3
const __m128i c1 = _mm_mulhi_epi16(in1, k2);
const __m128i c2 = _mm_mulhi_epi16(in3, k1);
const __m128i c3 = _mm_sub_epi16(in1, in3);
const __m128i c4 = _mm_sub_epi16(c1, c2);
const __m128i c = _mm_add_epi16(c3, c4);
// d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3
const __m128i d1 = _mm_mulhi_epi16(in1, k1);
const __m128i d2 = _mm_mulhi_epi16(in3, k2);
const __m128i d3 = _mm_add_epi16(in1, in3);
const __m128i d4 = _mm_add_epi16(d1, d2);
const __m128i d = _mm_add_epi16(d3, d4);
// Second pass.
const __m128i tmp0 = _mm_add_epi16(a, d);
const __m128i tmp1 = _mm_add_epi16(b, c);
const __m128i tmp2 = _mm_sub_epi16(b, c);
const __m128i tmp3 = _mm_sub_epi16(a, d);
// Transpose the two 4x4.
// a00 a01 a02 a03 b00 b01 b02 b03
// a10 a11 a12 a13 b10 b11 b12 b13
// a20 a21 a22 a23 b20 b21 b22 b23
// a30 a31 a32 a33 b30 b31 b32 b33
const __m128i transpose0_0 = _mm_unpacklo_epi16(tmp0, tmp1);
const __m128i transpose0_1 = _mm_unpacklo_epi16(tmp2, tmp3);
const __m128i transpose0_2 = _mm_unpackhi_epi16(tmp0, tmp1);
const __m128i transpose0_3 = _mm_unpackhi_epi16(tmp2, tmp3);
// a00 a10 a01 a11 a02 a12 a03 a13
// a20 a30 a21 a31 a22 a32 a23 a33
// b00 b10 b01 b11 b02 b12 b03 b13
// b20 b30 b21 b31 b22 b32 b23 b33
const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3);
const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3);
// a00 a10 a20 a30 a01 a11 a21 a31
// b00 b10 b20 b30 b01 b11 b21 b31
// a02 a12 a22 a32 a03 a13 a23 a33
// b02 b12 b22 b32 b03 b13 b23 b33
T0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1);
T1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1);
T2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3);
T3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3);
// a00 a10 a20 a30 b00 b10 b20 b30
// a01 a11 a21 a31 b01 b11 b21 b31
// a02 a12 a22 a32 b02 b12 b22 b32
// a03 a13 a23 a33 b03 b13 b23 b33
}
// Horizontal pass and subsequent transpose.
{
// First pass, c and d calculations are longer because of the "trick"
// multiplications.
const __m128i four = _mm_set1_epi16(4);
const __m128i dc = _mm_add_epi16(T0, four);
const __m128i a = _mm_add_epi16(dc, T2);
const __m128i b = _mm_sub_epi16(dc, T2);
// c = MUL(T1, K2) - MUL(T3, K1) = MUL(T1, k2) - MUL(T3, k1) + T1 - T3
const __m128i c1 = _mm_mulhi_epi16(T1, k2);
const __m128i c2 = _mm_mulhi_epi16(T3, k1);
const __m128i c3 = _mm_sub_epi16(T1, T3);
const __m128i c4 = _mm_sub_epi16(c1, c2);
const __m128i c = _mm_add_epi16(c3, c4);
// d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3
const __m128i d1 = _mm_mulhi_epi16(T1, k1);
const __m128i d2 = _mm_mulhi_epi16(T3, k2);
const __m128i d3 = _mm_add_epi16(T1, T3);
const __m128i d4 = _mm_add_epi16(d1, d2);
const __m128i d = _mm_add_epi16(d3, d4);
// Second pass.
const __m128i tmp0 = _mm_add_epi16(a, d);
const __m128i tmp1 = _mm_add_epi16(b, c);
const __m128i tmp2 = _mm_sub_epi16(b, c);
const __m128i tmp3 = _mm_sub_epi16(a, d);
const __m128i shifted0 = _mm_srai_epi16(tmp0, 3);
const __m128i shifted1 = _mm_srai_epi16(tmp1, 3);
const __m128i shifted2 = _mm_srai_epi16(tmp2, 3);
const __m128i shifted3 = _mm_srai_epi16(tmp3, 3);
// Transpose the two 4x4.
// a00 a01 a02 a03 b00 b01 b02 b03
// a10 a11 a12 a13 b10 b11 b12 b13
// a20 a21 a22 a23 b20 b21 b22 b23
// a30 a31 a32 a33 b30 b31 b32 b33
const __m128i transpose0_0 = _mm_unpacklo_epi16(shifted0, shifted1);
const __m128i transpose0_1 = _mm_unpacklo_epi16(shifted2, shifted3);
const __m128i transpose0_2 = _mm_unpackhi_epi16(shifted0, shifted1);
const __m128i transpose0_3 = _mm_unpackhi_epi16(shifted2, shifted3);
// a00 a10 a01 a11 a02 a12 a03 a13
// a20 a30 a21 a31 a22 a32 a23 a33
// b00 b10 b01 b11 b02 b12 b03 b13
// b20 b30 b21 b31 b22 b32 b23 b33
const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3);
const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3);
// a00 a10 a20 a30 a01 a11 a21 a31
// b00 b10 b20 b30 b01 b11 b21 b31
// a02 a12 a22 a32 a03 a13 a23 a33
// b02 b12 b22 b32 b03 b13 b23 b33
T0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1);
T1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1);
T2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3);
T3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3);
// a00 a10 a20 a30 b00 b10 b20 b30
// a01 a11 a21 a31 b01 b11 b21 b31
// a02 a12 a22 a32 b02 b12 b22 b32
// a03 a13 a23 a33 b03 b13 b23 b33
}
// Add inverse transform to 'ref' and store.
{
const __m128i zero = _mm_set1_epi16(0);
// Load the reference(s).
__m128i ref0, ref1, ref2, ref3;
if (do_two) {
// Load eight bytes/pixels per line.
ref0 = _mm_loadl_epi64((__m128i*)&ref[0 * BPS]);
ref1 = _mm_loadl_epi64((__m128i*)&ref[1 * BPS]);
ref2 = _mm_loadl_epi64((__m128i*)&ref[2 * BPS]);
ref3 = _mm_loadl_epi64((__m128i*)&ref[3 * BPS]);
} else {
// Load four bytes/pixels per line.
ref0 = _mm_cvtsi32_si128(*(int*)&ref[0 * BPS]);
ref1 = _mm_cvtsi32_si128(*(int*)&ref[1 * BPS]);
ref2 = _mm_cvtsi32_si128(*(int*)&ref[2 * BPS]);
ref3 = _mm_cvtsi32_si128(*(int*)&ref[3 * BPS]);
}
// Convert to 16b.
ref0 = _mm_unpacklo_epi8(ref0, zero);
ref1 = _mm_unpacklo_epi8(ref1, zero);
ref2 = _mm_unpacklo_epi8(ref2, zero);
ref3 = _mm_unpacklo_epi8(ref3, zero);
// Add the inverse transform(s).
ref0 = _mm_add_epi16(ref0, T0);
ref1 = _mm_add_epi16(ref1, T1);
ref2 = _mm_add_epi16(ref2, T2);
ref3 = _mm_add_epi16(ref3, T3);
// Unsigned saturate to 8b.
ref0 = _mm_packus_epi16(ref0, ref0);
ref1 = _mm_packus_epi16(ref1, ref1);
ref2 = _mm_packus_epi16(ref2, ref2);
ref3 = _mm_packus_epi16(ref3, ref3);
// Store the results.
if (do_two) {
// Store eight bytes/pixels per line.
_mm_storel_epi64((__m128i*)&dst[0 * BPS], ref0);
_mm_storel_epi64((__m128i*)&dst[1 * BPS], ref1);
_mm_storel_epi64((__m128i*)&dst[2 * BPS], ref2);
_mm_storel_epi64((__m128i*)&dst[3 * BPS], ref3);
} else {
// Store four bytes/pixels per line.
*((int32_t *)&dst[0 * BPS]) = _mm_cvtsi128_si32(ref0);
*((int32_t *)&dst[1 * BPS]) = _mm_cvtsi128_si32(ref1);
*((int32_t *)&dst[2 * BPS]) = _mm_cvtsi128_si32(ref2);
*((int32_t *)&dst[3 * BPS]) = _mm_cvtsi128_si32(ref3);
}
}
}
// Forward 4x4 transform of the residual (src - ref), SSE2 version.
//   src, ref: top-left samples of the source and reference blocks. Rows are
//             BPS bytes apart. Eight bytes are loaded from each of the four
//             rows, although only the first four of each row are used.
//   out:      receives the 16 transformed coefficients, written as four
//             groups of four int16_t at out[0], out[4], out[8] and out[12].
static void FTransformSSE2(const uint8_t* src, const uint8_t* ref,
                           int16_t* out) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i seven = _mm_set1_epi16(7);
  const __m128i k7500 = _mm_set1_epi32(7500);
  const __m128i k14500 = _mm_set1_epi32(14500);
  const __m128i k51000 = _mm_set1_epi32(51000);
  // The extra (1 << 16) pre-adds one to the final (>> 16) result; see the
  // computation of g1 below.
  const __m128i k12000_plus_one = _mm_set1_epi32(12000 + (1 << 16));
  // _mm_set_epi16() lists elements from the highest lane to the lowest.
  const __m128i k5352_2217 = _mm_set_epi16(5352, 2217, 5352, 2217,
                                           5352, 2217, 5352, 2217);
  const __m128i k2217_5352 = _mm_set_epi16(2217, -5352, 2217, -5352,
                                           2217, -5352, 2217, -5352);
  __m128i v01, v32;

  // Difference between src and ref and initial transpose.
  {
    // Load src and convert to 16b.
    const __m128i src0 = _mm_loadl_epi64((const __m128i*)&src[0 * BPS]);
    const __m128i src1 = _mm_loadl_epi64((const __m128i*)&src[1 * BPS]);
    const __m128i src2 = _mm_loadl_epi64((const __m128i*)&src[2 * BPS]);
    const __m128i src3 = _mm_loadl_epi64((const __m128i*)&src[3 * BPS]);
    const __m128i src_0 = _mm_unpacklo_epi8(src0, zero);
    const __m128i src_1 = _mm_unpacklo_epi8(src1, zero);
    const __m128i src_2 = _mm_unpacklo_epi8(src2, zero);
    const __m128i src_3 = _mm_unpacklo_epi8(src3, zero);
    // Load ref and convert to 16b.
    const __m128i ref0 = _mm_loadl_epi64((const __m128i*)&ref[0 * BPS]);
    const __m128i ref1 = _mm_loadl_epi64((const __m128i*)&ref[1 * BPS]);
    const __m128i ref2 = _mm_loadl_epi64((const __m128i*)&ref[2 * BPS]);
    const __m128i ref3 = _mm_loadl_epi64((const __m128i*)&ref[3 * BPS]);
    const __m128i ref_0 = _mm_unpacklo_epi8(ref0, zero);
    const __m128i ref_1 = _mm_unpacklo_epi8(ref1, zero);
    const __m128i ref_2 = _mm_unpacklo_epi8(ref2, zero);
    const __m128i ref_3 = _mm_unpacklo_epi8(ref3, zero);
    // Compute difference.
    const __m128i diff0 = _mm_sub_epi16(src_0, ref_0);
    const __m128i diff1 = _mm_sub_epi16(src_1, ref_1);
    const __m128i diff2 = _mm_sub_epi16(src_2, ref_2);
    const __m128i diff3 = _mm_sub_epi16(src_3, ref_3);

    // Transpose. Only the four low 16b lanes of each row are consumed by the
    // unpacklo operations below.
    // 00 01 02 03   (4 unused lanes)
    // 10 11 12 13   (4 unused lanes)
    // 20 21 22 23   (4 unused lanes)
    // 30 31 32 33   (4 unused lanes)
    const __m128i transpose0_0 = _mm_unpacklo_epi16(diff0, diff1);
    const __m128i transpose0_1 = _mm_unpacklo_epi16(diff2, diff3);
    // 00 10 01 11   02 12 03 13
    // 20 30 21 31   22 32 23 33
    const __m128i v23 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
    v01 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
    v32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2));
    // v23: a02 a12 a22 a32   a03 a13 a23 a33
    // v01: a00 a10 a20 a30   a01 a11 a21 a31
    // v32: a03 a13 a23 a33   a02 a12 a22 a32
  }

  // First pass and subsequent transpose.
  {
    // Same operations are done on the (0,3) and (1,2) pairs.
    // b0 = (a0 + a3) << 3
    // b1 = (a1 + a2) << 3
    // b3 = (a0 - a3) << 3
    // b2 = (a1 - a2) << 3
    const __m128i a01 = _mm_add_epi16(v01, v32);
    const __m128i a32 = _mm_sub_epi16(v01, v32);
    const __m128i b01 = _mm_slli_epi16(a01, 3);
    const __m128i b32 = _mm_slli_epi16(a32, 3);
    const __m128i b11 = _mm_unpackhi_epi64(b01, b01);
    const __m128i b22 = _mm_unpackhi_epi64(b32, b32);

    // e0 = b0 + b1
    // e2 = b0 - b1
    const __m128i e0 = _mm_add_epi16(b01, b11);
    const __m128i e2 = _mm_sub_epi16(b01, b11);
    const __m128i e02 = _mm_unpacklo_epi64(e0, e2);

    // e1 = (b3 * 5352 + b2 * 2217 + 14500) >> 12
    // e3 = (b3 * 2217 - b2 * 5352 +  7500) >> 12
    // b2 and b3 are interleaved so that one madd computes each sum.
    const __m128i b23 = _mm_unpacklo_epi16(b22, b32);
    const __m128i c1 = _mm_madd_epi16(b23, k5352_2217);
    const __m128i c3 = _mm_madd_epi16(b23, k2217_5352);
    const __m128i d1 = _mm_add_epi32(c1, k14500);
    const __m128i d3 = _mm_add_epi32(c3, k7500);
    const __m128i e1 = _mm_srai_epi32(d1, 12);
    const __m128i e3 = _mm_srai_epi32(d3, 12);
    const __m128i e13 = _mm_packs_epi32(e1, e3);

    // Transpose.
    // 00 01 02 03  20 21 22 23
    // 10 11 12 13  30 31 32 33
    const __m128i transpose0_0 = _mm_unpacklo_epi16(e02, e13);
    const __m128i transpose0_1 = _mm_unpackhi_epi16(e02, e13);
    // 00 10 01 11   02 12 03 13
    // 20 30 21 31   22 32 23 33
    const __m128i v23 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
    v01 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
    v32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2));
    // v23: 02 12 22 32   03 13 23 33
    // v01: 00 10 20 30   01 11 21 31
    // v32: 03 13 23 33   02 12 22 32
  }

  // Second pass
  {
    // Same operations are done on the (0,3) and (1,2) pairs.
    // a0 = v0 + v3
    // a1 = v1 + v2
    // a3 = v0 - v3
    // a2 = v1 - v2
    const __m128i a01 = _mm_add_epi16(v01, v32);
    const __m128i a32 = _mm_sub_epi16(v01, v32);
    const __m128i a11 = _mm_unpackhi_epi64(a01, a01);
    const __m128i a22 = _mm_unpackhi_epi64(a32, a32);

    // d0 = (a0 + a1 + 7) >> 4;
    // d2 = (a0 - a1 + 7) >> 4;
    const __m128i b0 = _mm_add_epi16(a01, a11);
    const __m128i b2 = _mm_sub_epi16(a01, a11);
    const __m128i c0 = _mm_add_epi16(b0, seven);
    const __m128i c2 = _mm_add_epi16(b2, seven);
    const __m128i d0 = _mm_srai_epi16(c0, 4);
    const __m128i d2 = _mm_srai_epi16(c2, 4);

    // f1 = ((a3 * 5352 + a2 * 2217 + 12000) >> 16)
    // f3 = ((a3 * 2217 - a2 * 5352 + 51000) >> 16)
    // (f1 as computed here is one too large; this is compensated in g1.)
    const __m128i b23 = _mm_unpacklo_epi16(a22, a32);
    const __m128i c1 = _mm_madd_epi16(b23, k5352_2217);
    const __m128i c3 = _mm_madd_epi16(b23, k2217_5352);
    const __m128i d1 = _mm_add_epi32(c1, k12000_plus_one);
    const __m128i d3 = _mm_add_epi32(c3, k51000);
    const __m128i e1 = _mm_srai_epi32(d1, 16);
    const __m128i e3 = _mm_srai_epi32(d3, 16);
    const __m128i f1 = _mm_packs_epi32(e1, e1);
    const __m128i f3 = _mm_packs_epi32(e3, e3);
    // g1 = f1 + (a3 != 0);
    // The compare will return (0xffff, 0) for (==0, !=0). To turn that into the
    // desired (0, 1), we add one earlier through k12000_plus_one.
    const __m128i g1 = _mm_add_epi16(f1, _mm_cmpeq_epi16(a32, zero));

    _mm_storel_epi64((__m128i*)&out[ 0], d0);
    _mm_storel_epi64((__m128i*)&out[ 4], g1);
    _mm_storel_epi64((__m128i*)&out[ 8], d2);
    _mm_storel_epi64((__m128i*)&out[12], f3);
  }
}
//------------------------------------------------------------------------------
// Metric
// Returns the sum of squared differences between two 4x4 blocks of 8-bit
// samples. Rows are BPS bytes apart. Eight bytes are loaded from each row,
// but only the first four of each row contribute to the result.
static int SSE4x4SSE2(const uint8_t* a, const uint8_t* b) {
  const __m128i zero = _mm_setzero_si128();

  // Load values.
  const __m128i a0 = _mm_loadl_epi64((const __m128i*)&a[BPS * 0]);
  const __m128i a1 = _mm_loadl_epi64((const __m128i*)&a[BPS * 1]);
  const __m128i a2 = _mm_loadl_epi64((const __m128i*)&a[BPS * 2]);
  const __m128i a3 = _mm_loadl_epi64((const __m128i*)&a[BPS * 3]);
  const __m128i b0 = _mm_loadl_epi64((const __m128i*)&b[BPS * 0]);
  const __m128i b1 = _mm_loadl_epi64((const __m128i*)&b[BPS * 1]);
  const __m128i b2 = _mm_loadl_epi64((const __m128i*)&b[BPS * 2]);
  const __m128i b3 = _mm_loadl_epi64((const __m128i*)&b[BPS * 3]);

  // Combine pair of lines and convert to 16b.
  const __m128i a01 = _mm_unpacklo_epi32(a0, a1);
  const __m128i a23 = _mm_unpacklo_epi32(a2, a3);
  const __m128i b01 = _mm_unpacklo_epi32(b0, b1);
  const __m128i b23 = _mm_unpacklo_epi32(b2, b3);
  const __m128i a01s = _mm_unpacklo_epi8(a01, zero);
  const __m128i a23s = _mm_unpacklo_epi8(a23, zero);
  const __m128i b01s = _mm_unpacklo_epi8(b01, zero);
  const __m128i b23s = _mm_unpacklo_epi8(b23, zero);

  // Compute differences; (a-b)^2 = (abs(a-b))^2 = (sat8(a-b) + sat8(b-a))^2
  // For each sample one of the two saturated differences is zero, so the sum
  // of both squares equals the squared difference.
  // TODO(cduvivier): Disassemble and figure out why this is fastest. We don't
  //                  need absolute values, there is no need to do calculation
  //                  in 8bit as we are already in 16bit, ... Yet this is what
  //                  benchmarks the fastest!
  const __m128i d0 = _mm_subs_epu8(a01s, b01s);
  const __m128i d1 = _mm_subs_epu8(b01s, a01s);
  const __m128i d2 = _mm_subs_epu8(a23s, b23s);
  const __m128i d3 = _mm_subs_epu8(b23s, a23s);

  // Square and add them all together.
  const __m128i madd0 = _mm_madd_epi16(d0, d0);
  const __m128i madd1 = _mm_madd_epi16(d1, d1);
  const __m128i madd2 = _mm_madd_epi16(d2, d2);
  const __m128i madd3 = _mm_madd_epi16(d3, d3);
  const __m128i sum0 = _mm_add_epi32(madd0, madd1);
  const __m128i sum1 = _mm_add_epi32(madd2, madd3);
  const __m128i sum2 = _mm_add_epi32(sum0, sum1);

  // Horizontal add of the four 32b partial sums.
  int32_t tmp[4];
  _mm_storeu_si128((__m128i*)tmp, sum2);
  return (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
}
//------------------------------------------------------------------------------
// Texture distortion
//
// We try to match the spectral content (weighted) between source and
// reconstructed samples.
// Hadamard transform
// Returns the difference between the weighted sum of the absolute value of
// transformed coefficients.
// Applies a 4x4 Hadamard transform to both 'inA' and 'inB' (in parallel) and
// returns sum(w[i] * |coeffA[i]|) - sum(w[i] * |coeffB[i]|), where the
// coefficients are scaled as (abs(b) + 3) >> 3 before weighting.
//   inA, inB: top-left samples of the two 4x4 blocks. Rows are BPS bytes
//             apart; eight bytes are loaded from each row, the last four are
//             discarded.
//   w:        16 weights, read with two unaligned 128-bit loads.
static int TTransformSSE2(const uint8_t* inA, const uint8_t* inB,
                          const uint16_t* const w) {
  int32_t sum[4];
  __m128i tmp_0, tmp_1, tmp_2, tmp_3;
  const __m128i zero = _mm_setzero_si128();
  const __m128i one = _mm_set1_epi16(1);
  const __m128i three = _mm_set1_epi16(3);

  // Load, combine and transpose inputs.
  {
    const __m128i inA_0 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 0]);
    const __m128i inA_1 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 1]);
    const __m128i inA_2 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 2]);
    const __m128i inA_3 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 3]);
    const __m128i inB_0 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 0]);
    const __m128i inB_1 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 1]);
    const __m128i inB_2 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 2]);
    const __m128i inB_3 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 3]);

    // Combine inA and inB (we'll do two transforms in parallel).
    const __m128i inAB_0 = _mm_unpacklo_epi8(inA_0, inB_0);
    const __m128i inAB_1 = _mm_unpacklo_epi8(inA_1, inB_1);
    const __m128i inAB_2 = _mm_unpacklo_epi8(inA_2, inB_2);
    const __m128i inAB_3 = _mm_unpacklo_epi8(inA_3, inB_3);
    // Low eight bytes of each row (the high eight are not used):
    // a00 b00 a01 b01 a02 b02 a03 b03
    // a10 b10 a11 b11 a12 b12 a13 b13
    // a20 b20 a21 b21 a22 b22 a23 b23
    // a30 b30 a31 b31 a32 b32 a33 b33

    // Transpose the two 4x4, discarding the unused high bytes.
    const __m128i transpose0_0 = _mm_unpacklo_epi8(inAB_0, inAB_2);
    const __m128i transpose0_1 = _mm_unpacklo_epi8(inAB_1, inAB_3);
    // a00 a20  b00 b20  a01 a21  b01 b21  a02 a22  b02 b22  a03 a23  b03 b23
    // a10 a30  b10 b30  a11 a31  b11 b31  a12 a32  b12 b32  a13 a33  b13 b33
    const __m128i transpose1_0 = _mm_unpacklo_epi8(transpose0_0, transpose0_1);
    const __m128i transpose1_1 = _mm_unpackhi_epi8(transpose0_0, transpose0_1);
    // a00 a10 a20 a30  b00 b10 b20 b30  a01 a11 a21 a31  b01 b11 b21 b31
    // a02 a12 a22 a32  b02 b12 b22 b32  a03 a13 a23 a33  b03 b13 b23 b33

    // Convert to 16b.
    tmp_0 = _mm_unpacklo_epi8(transpose1_0, zero);
    tmp_1 = _mm_unpackhi_epi8(transpose1_0, zero);
    tmp_2 = _mm_unpacklo_epi8(transpose1_1, zero);
    tmp_3 = _mm_unpackhi_epi8(transpose1_1, zero);
    // a00 a10 a20 a30   b00 b10 b20 b30
    // a01 a11 a21 a31   b01 b11 b21 b31
    // a02 a12 a22 a32   b02 b12 b22 b32
    // a03 a13 a23 a33   b03 b13 b23 b33
  }

  // Horizontal pass and subsequent transpose.
  {
    // Calculate a and b (two 4x4 at once).
    const __m128i a0 = _mm_slli_epi16(_mm_add_epi16(tmp_0, tmp_2), 2);
    const __m128i a1 = _mm_slli_epi16(_mm_add_epi16(tmp_1, tmp_3), 2);
    const __m128i a2 = _mm_slli_epi16(_mm_sub_epi16(tmp_1, tmp_3), 2);
    const __m128i a3 = _mm_slli_epi16(_mm_sub_epi16(tmp_0, tmp_2), 2);
    // b0_extra = (a0 != 0);
    const __m128i b0_extra = _mm_andnot_si128(_mm_cmpeq_epi16(a0, zero), one);
    const __m128i b0_base = _mm_add_epi16(a0, a1);
    const __m128i b1 = _mm_add_epi16(a3, a2);
    const __m128i b2 = _mm_sub_epi16(a3, a2);
    const __m128i b3 = _mm_sub_epi16(a0, a1);
    const __m128i b0 = _mm_add_epi16(b0_base, b0_extra);
    // a00 a01 a02 a03   b00 b01 b02 b03
    // a10 a11 a12 a13   b10 b11 b12 b13
    // a20 a21 a22 a23   b20 b21 b22 b23
    // a30 a31 a32 a33   b30 b31 b32 b33

    // Transpose the two 4x4.
    const __m128i transpose0_0 = _mm_unpacklo_epi16(b0, b1);
    const __m128i transpose0_1 = _mm_unpacklo_epi16(b2, b3);
    const __m128i transpose0_2 = _mm_unpackhi_epi16(b0, b1);
    const __m128i transpose0_3 = _mm_unpackhi_epi16(b2, b3);
    // a00 a10 a01 a11   a02 a12 a03 a13
    // a20 a30 a21 a31   a22 a32 a23 a33
    // b00 b10 b01 b11   b02 b12 b03 b13
    // b20 b30 b21 b31   b22 b32 b23 b33
    const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
    const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3);
    const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
    const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3);
    // a00 a10 a20 a30 a01 a11 a21 a31
    // b00 b10 b20 b30 b01 b11 b21 b31
    // a02 a12 a22 a32 a03 a13 a23 a33
    // b02 b12 b22 b32 b03 b13 b23 b33
    tmp_0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1);
    tmp_1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1);
    tmp_2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3);
    tmp_3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3);
    // a00 a10 a20 a30   b00 b10 b20 b30
    // a01 a11 a21 a31   b01 b11 b21 b31
    // a02 a12 a22 a32   b02 b12 b22 b32
    // a03 a13 a23 a33   b03 b13 b23 b33
  }

  // Vertical pass and difference of weighted sums.
  {
    // Load all inputs.
    // TODO(cduvivier): Make variable declarations and allocations aligned so
    //                  we can use _mm_load_si128 instead of _mm_loadu_si128.
    const __m128i w_0 = _mm_loadu_si128((const __m128i*)&w[0]);
    const __m128i w_8 = _mm_loadu_si128((const __m128i*)&w[8]);

    // Calculate a and b (two 4x4 at once).
    const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2);
    const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3);
    const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3);
    const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2);
    const __m128i b0 = _mm_add_epi16(a0, a1);
    const __m128i b1 = _mm_add_epi16(a3, a2);
    const __m128i b2 = _mm_sub_epi16(a3, a2);
    const __m128i b3 = _mm_sub_epi16(a0, a1);

    // Separate the transforms of inA and inB.
    __m128i A_b0 = _mm_unpacklo_epi64(b0, b1);
    __m128i A_b2 = _mm_unpacklo_epi64(b2, b3);
    __m128i B_b0 = _mm_unpackhi_epi64(b0, b1);
    __m128i B_b2 = _mm_unpackhi_epi64(b2, b3);

    {
      // sign(b) = b >> 15  (0x0000 if positive, 0xffff if negative)
      const __m128i sign_A_b0 = _mm_srai_epi16(A_b0, 15);
      const __m128i sign_A_b2 = _mm_srai_epi16(A_b2, 15);
      const __m128i sign_B_b0 = _mm_srai_epi16(B_b0, 15);
      const __m128i sign_B_b2 = _mm_srai_epi16(B_b2, 15);

      // b = abs(b) = (b ^ sign) - sign
      A_b0 = _mm_xor_si128(A_b0, sign_A_b0);
      A_b2 = _mm_xor_si128(A_b2, sign_A_b2);
      B_b0 = _mm_xor_si128(B_b0, sign_B_b0);
      B_b2 = _mm_xor_si128(B_b2, sign_B_b2);
      A_b0 = _mm_sub_epi16(A_b0, sign_A_b0);
      A_b2 = _mm_sub_epi16(A_b2, sign_A_b2);
      B_b0 = _mm_sub_epi16(B_b0, sign_B_b0);
      B_b2 = _mm_sub_epi16(B_b2, sign_B_b2);
    }

    // b = abs(b) + 3
    A_b0 = _mm_add_epi16(A_b0, three);
    A_b2 = _mm_add_epi16(A_b2, three);
    B_b0 = _mm_add_epi16(B_b0, three);
    B_b2 = _mm_add_epi16(B_b2, three);

    // abs((b + (b<0) + 3) >> 3) = (abs(b) + 3) >> 3
    // b = (abs(b) + 3) >> 3
    A_b0 = _mm_srai_epi16(A_b0, 3);
    A_b2 = _mm_srai_epi16(A_b2, 3);
    B_b0 = _mm_srai_epi16(B_b0, 3);
    B_b2 = _mm_srai_epi16(B_b2, 3);

    // weighted sums
    A_b0 = _mm_madd_epi16(A_b0, w_0);
    A_b2 = _mm_madd_epi16(A_b2, w_8);
    B_b0 = _mm_madd_epi16(B_b0, w_0);
    B_b2 = _mm_madd_epi16(B_b2, w_8);
    A_b0 = _mm_add_epi32(A_b0, A_b2);
    B_b0 = _mm_add_epi32(B_b0, B_b2);

    // difference of weighted sums
    A_b0 = _mm_sub_epi32(A_b0, B_b0);
    _mm_storeu_si128((__m128i*)&sum[0], A_b0);
  }
  return sum[0] + sum[1] + sum[2] + sum[3];
}
// Texture distortion between the 4x4 blocks 'a' and 'b': the magnitude of the
// weighted spectral difference computed by TTransformSSE2(), divided by 16
// with rounding.
static int Disto4x4SSE2(const uint8_t* const a, const uint8_t* const b,
                        const uint16_t* const w) {
  const int magnitude = abs(TTransformSSE2(a, b, w));
  return (magnitude + 8) >> 4;
}
// Texture distortion over a 16x16 area: accumulates Disto4x4SSE2() over the
// sixteen 4x4 sub-blocks, visited row of blocks by row of blocks. Sample rows
// are BPS bytes apart.
static int Disto16x16SSE2(const uint8_t* const a, const uint8_t* const b,
                          const uint16_t* const w) {
  int total = 0;
  int block_row, block_col;
  for (block_row = 0; block_row < 4; ++block_row) {
    // Each row of blocks starts four sample rows below the previous one.
    const uint8_t* const a_row = a + block_row * 4 * BPS;
    const uint8_t* const b_row = b + block_row * 4 * BPS;
    for (block_col = 0; block_col < 4; ++block_col) {
      total += Disto4x4SSE2(a_row + 4 * block_col, b_row + 4 * block_col, w);
    }
  }
  return total;
}
//------------------------------------------------------------------------------
// Quantization
//
// Simple quantization
// Quantizes the 16 coefficients of 'in' using the tables in 'mtx'.
//   in:  input coefficients. Overwritten with the dequantized values
//        (quantized level * mtx->q_), or zero where the coefficient did not
//        exceed mtx->zthresh_.
//   out: receives the 16 quantized levels, in zigzag order.
//   n:   when non-zero, the first coefficient is ignored when computing the
//        return value. All 16 coefficients are still processed and written.
//   mtx: quantization tables (sharpen_, iq_, bias_, q_, zthresh_).
// Returns 1 if at least one of the considered levels is non-zero, else 0.
static int QuantizeBlockSSE2(int16_t in[16], int16_t out[16],
                             int n, const VP8Matrix* const mtx) {
  const __m128i max_coeff_2047 = _mm_set1_epi16(2047);
  const __m128i zero = _mm_setzero_si128();
  __m128i sign0, sign8;
  __m128i coeff0, coeff8;
  __m128i out0, out8;
  __m128i packed_out;

  // Load all inputs.
  // TODO(cduvivier): Make variable declarations and allocations aligned so that
  //                  we can use _mm_load_si128 instead of _mm_loadu_si128.
  __m128i in0 = _mm_loadu_si128((const __m128i*)&in[0]);
  __m128i in8 = _mm_loadu_si128((const __m128i*)&in[8]);
  const __m128i sharpen0 = _mm_loadu_si128((const __m128i*)&mtx->sharpen_[0]);
  const __m128i sharpen8 = _mm_loadu_si128((const __m128i*)&mtx->sharpen_[8]);
  const __m128i iq0 = _mm_loadu_si128((const __m128i*)&mtx->iq_[0]);
  const __m128i iq8 = _mm_loadu_si128((const __m128i*)&mtx->iq_[8]);
  const __m128i bias0 = _mm_loadu_si128((const __m128i*)&mtx->bias_[0]);
  const __m128i bias8 = _mm_loadu_si128((const __m128i*)&mtx->bias_[8]);
  const __m128i q0 = _mm_loadu_si128((const __m128i*)&mtx->q_[0]);
  const __m128i q8 = _mm_loadu_si128((const __m128i*)&mtx->q_[8]);
  const __m128i zthresh0 = _mm_loadu_si128((const __m128i*)&mtx->zthresh_[0]);
  const __m128i zthresh8 = _mm_loadu_si128((const __m128i*)&mtx->zthresh_[8]);

  // sign(in) = in >> 15  (0x0000 if positive, 0xffff if negative)
  sign0 = _mm_srai_epi16(in0, 15);
  sign8 = _mm_srai_epi16(in8, 15);

  // coeff = abs(in) = (in ^ sign) - sign
  coeff0 = _mm_xor_si128(in0, sign0);
  coeff8 = _mm_xor_si128(in8, sign8);
  coeff0 = _mm_sub_epi16(coeff0, sign0);
  coeff8 = _mm_sub_epi16(coeff8, sign8);

  // coeff = abs(in) + sharpen
  coeff0 = _mm_add_epi16(coeff0, sharpen0);
  coeff8 = _mm_add_epi16(coeff8, sharpen8);

  // if (coeff > 2047) coeff = 2047
  coeff0 = _mm_min_epi16(coeff0, max_coeff_2047);
  coeff8 = _mm_min_epi16(coeff8, max_coeff_2047);

  // out = (coeff * iQ + B) >> QFIX;
  {
    // doing calculations with 32b precision (QFIX=17)
    // out = (coeff * iQ)
    // The high and low 16b halves of each product are computed separately
    // and interleaved to form the 32b results.
    const __m128i coeff_iQ0H = _mm_mulhi_epu16(coeff0, iq0);
    const __m128i coeff_iQ0L = _mm_mullo_epi16(coeff0, iq0);
    const __m128i coeff_iQ8H = _mm_mulhi_epu16(coeff8, iq8);
    const __m128i coeff_iQ8L = _mm_mullo_epi16(coeff8, iq8);
    __m128i out_00 = _mm_unpacklo_epi16(coeff_iQ0L, coeff_iQ0H);
    __m128i out_04 = _mm_unpackhi_epi16(coeff_iQ0L, coeff_iQ0H);
    __m128i out_08 = _mm_unpacklo_epi16(coeff_iQ8L, coeff_iQ8H);
    __m128i out_12 = _mm_unpackhi_epi16(coeff_iQ8L, coeff_iQ8H);
    // expand bias from 16b to 32b
    const __m128i bias_00 = _mm_unpacklo_epi16(bias0, zero);
    const __m128i bias_04 = _mm_unpackhi_epi16(bias0, zero);
    const __m128i bias_08 = _mm_unpacklo_epi16(bias8, zero);
    const __m128i bias_12 = _mm_unpackhi_epi16(bias8, zero);
    // out = (coeff * iQ + B)
    out_00 = _mm_add_epi32(out_00, bias_00);
    out_04 = _mm_add_epi32(out_04, bias_04);
    out_08 = _mm_add_epi32(out_08, bias_08);
    out_12 = _mm_add_epi32(out_12, bias_12);
    // out = (coeff * iQ + B) >> QFIX;
    out_00 = _mm_srai_epi32(out_00, QFIX);
    out_04 = _mm_srai_epi32(out_04, QFIX);
    out_08 = _mm_srai_epi32(out_08, QFIX);
    out_12 = _mm_srai_epi32(out_12, QFIX);
    // pack result as 16b
    out0 = _mm_packs_epi32(out_00, out_04);
    out8 = _mm_packs_epi32(out_08, out_12);
  }

  // get sign back (if (sign[j]) out_n = -out_n)
  out0 = _mm_xor_si128(out0, sign0);
  out8 = _mm_xor_si128(out8, sign8);
  out0 = _mm_sub_epi16(out0, sign0);
  out8 = _mm_sub_epi16(out8, sign8);

  // in = out * Q
  in0 = _mm_mullo_epi16(out0, q0);
  in8 = _mm_mullo_epi16(out8, q8);

  // if (coeff <= mtx->zthresh_) {in=0; out=0;}
  {
    const __m128i cmp0 = _mm_cmpgt_epi16(coeff0, zthresh0);
    const __m128i cmp8 = _mm_cmpgt_epi16(coeff8, zthresh8);
    in0 = _mm_and_si128(in0, cmp0);
    in8 = _mm_and_si128(in8, cmp8);
    _mm_storeu_si128((__m128i*)&in[0], in0);
    _mm_storeu_si128((__m128i*)&in[8], in8);
    out0 = _mm_and_si128(out0, cmp0);
    out8 = _mm_and_si128(out8, cmp8);
  }

  // zigzag the output before storing it.
  //
  // The zigzag pattern can almost be reproduced with a small sequence of
  // shuffles. After it, we only need to swap the 7th (ending up in third
  // position instead of twelfth) and 8th values.
  {
    __m128i outZ0, outZ8;
    outZ0 = _mm_shufflehi_epi16(out0,  _MM_SHUFFLE(2, 1, 3, 0));
    outZ0 = _mm_shuffle_epi32(outZ0, _MM_SHUFFLE(3, 1, 2, 0));
    outZ0 = _mm_shufflehi_epi16(outZ0, _MM_SHUFFLE(3, 1, 0, 2));
    outZ8 = _mm_shufflelo_epi16(out8,  _MM_SHUFFLE(3, 0, 2, 1));
    outZ8 = _mm_shuffle_epi32(outZ8, _MM_SHUFFLE(3, 1, 2, 0));
    outZ8 = _mm_shufflelo_epi16(outZ8, _MM_SHUFFLE(1, 3, 2, 0));
    _mm_storeu_si128((__m128i*)&out[0], outZ0);
    _mm_storeu_si128((__m128i*)&out[8], outZ8);
    // Saturating pack to 8b: one byte per level, non-zero iff the level is
    // non-zero. Taken before the swap below, which does not affect the test.
    packed_out = _mm_packs_epi16(outZ0, outZ8);
  }
  {
    // Finish the zigzag by exchanging the two misplaced values.
    const int16_t outZ_12 = out[12];
    const int16_t outZ_3 = out[3];
    out[3] = outZ_12;
    out[12] = outZ_3;
  }

  // detect if all 'out' values are zeroes or not
  {
    int32_t tmp[4];
    _mm_storeu_si128((__m128i*)tmp, packed_out);
    if (n) {
      // Clear the byte holding the first level so that it is ignored.
      tmp[0] &= ~0xff;
    }
    return (tmp[3] || tmp[2] || tmp[1] || tmp[0]);
  }
}
extern void VP8EncDspInitSSE2(void);

// Points the encoder DSP function pointers at the SSE2 implementations
// defined in this file.
void VP8EncDspInitSSE2(void) {
  // Transforms.
  VP8ITransform = ITransformSSE2;
  VP8FTransform = FTransformSSE2;

  // Quantization and statistics.
  VP8EncQuantizeBlock = QuantizeBlockSSE2;
  VP8CollectHistogram = CollectHistogramSSE2;

  // Distortion metrics.
  VP8SSE4x4 = SSE4x4SSE2;
  VP8TDisto4x4 = Disto4x4SSE2;
  VP8TDisto16x16 = Disto16x16SSE2;
}
#if defined(__cplusplus) || defined(c_plusplus)
} // extern "C"
#endif
#endif //__SSE2__

226
src/dsp/upsampling.c Normal file
View File

@@ -0,0 +1,226 @@
// Copyright 2011 Google Inc.
//
// This code is licensed under the same terms as WebM:
// Software License Agreement: http://www.webmproject.org/license/software/
// Additional IP Rights Grant: http://www.webmproject.org/license/additional/
// -----------------------------------------------------------------------------
//
// YUV to RGB upsampling functions.
//
// Author: somnath@google.com (Somnath Banerjee)
#include "./dsp.h"
#include "./yuv.h"
#include "../dec/webpi.h"
#if defined(__cplusplus) || defined(c_plusplus)
extern "C" {
#endif
//------------------------------------------------------------------------------
// Fancy upsampler
#ifdef FANCY_UPSAMPLING
// Fancy upsampling functions to convert YUV to RGB
// Both tables are indexed by output mode (MODE_RGB, MODE_RGBA, ...) and are
// filled in at runtime by WebPInitUpsamplers().
// WebPUpsamplersKeepAlpha[] is the variant table whose 4-byte and 4444 modes
// use converters that leave the destination alpha untouched.
WebPUpsampleLinePairFunc WebPUpsamplers[MODE_LAST];
WebPUpsampleLinePairFunc WebPUpsamplersKeepAlpha[MODE_LAST];
// Given samples laid out in a square as:
// [a b]
// [c d]
// we interpolate u/v as:
// ([9*a + 3*b + 3*c +   d      3*a + 9*b +   c + 3*d] + [8 8]) / 16
// ([3*a +   b + 9*c + 3*d        a + 3*b + 3*c + 9*d] + [8 8]) / 16
// We process u and v together stashed into 32bit (16bit each).
// Packs one u and one v sample into a single 32b word (u in the low 16 bits,
// v in the high 16 bits) so that both can be filtered with the same
// arithmetic.
#define LOAD_UV(u,v) ((u) | ((v) << 16))

// Defines a static function FUNC_NAME that upsamples the chroma of a pair of
// lines and converts them to the output colorspace.
//   FUNC:  per-pixel converter, invoked as FUNC(y, u, v, dst).
//   XSTEP: number of bytes per output pixel.
// Arguments of the generated function:
//   top_y, bottom_y:     luma of the two lines. Either one may be NULL, in
//                        which case the corresponding line is not written.
//   top_u, top_v:        chroma samples of the row above.
//   cur_u, cur_v:        chroma samples of the current row.
//   top_dst, bottom_dst: output buffers for the two lines.
//   len:                 number of pixels per line.
// The first pixel, and the last one when 'len' is even, only have one
// horizontal chroma neighbour and are interpolated vertically only.
#define UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP) \
static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \
                      const uint8_t* top_u, const uint8_t* top_v, \
                      const uint8_t* cur_u, const uint8_t* cur_v, \
                      uint8_t* top_dst, uint8_t* bottom_dst, int len) { \
  int x; \
  const int last_pixel_pair = (len - 1) >> 1; \
  uint32_t tl_uv = LOAD_UV(top_u[0], top_v[0]);   /* top-left sample */ \
  uint32_t l_uv  = LOAD_UV(cur_u[0], cur_v[0]);   /* left-sample */ \
  if (top_y) { \
    const uint32_t uv0 = (3 * tl_uv + l_uv + 0x00020002u) >> 2; \
    FUNC(top_y[0], uv0 & 0xff, (uv0 >> 16), top_dst); \
  } \
  if (bottom_y) { \
    const uint32_t uv0 = (3 * l_uv + tl_uv + 0x00020002u) >> 2; \
    FUNC(bottom_y[0], uv0 & 0xff, (uv0 >> 16), bottom_dst); \
  } \
  for (x = 1; x <= last_pixel_pair; ++x) { \
    const uint32_t t_uv = LOAD_UV(top_u[x], top_v[x]);  /* top sample */ \
    const uint32_t uv   = LOAD_UV(cur_u[x], cur_v[x]);  /* sample */ \
    /* precompute invariant values associated with first and second */ \
    /* diagonals */ \
    const uint32_t avg = tl_uv + t_uv + l_uv + uv + 0x00080008u; \
    const uint32_t diag_12 = (avg + 2 * (t_uv + l_uv)) >> 3; \
    const uint32_t diag_03 = (avg + 2 * (tl_uv + uv)) >> 3; \
    if (top_y) { \
      const uint32_t uv0 = (diag_12 + tl_uv) >> 1; \
      const uint32_t uv1 = (diag_03 + t_uv) >> 1; \
      FUNC(top_y[2 * x - 1], uv0 & 0xff, (uv0 >> 16), \
           top_dst + (2 * x - 1) * (XSTEP)); \
      FUNC(top_y[2 * x - 0], uv1 & 0xff, (uv1 >> 16), \
           top_dst + (2 * x - 0) * (XSTEP)); \
    } \
    if (bottom_y) { \
      const uint32_t uv0 = (diag_03 + l_uv) >> 1; \
      const uint32_t uv1 = (diag_12 + uv) >> 1; \
      FUNC(bottom_y[2 * x - 1], uv0 & 0xff, (uv0 >> 16), \
           bottom_dst + (2 * x - 1) * (XSTEP)); \
      FUNC(bottom_y[2 * x + 0], uv1 & 0xff, (uv1 >> 16), \
           bottom_dst + (2 * x + 0) * (XSTEP)); \
    } \
    tl_uv = t_uv; \
    l_uv = uv; \
  } \
  if (!(len & 1)) { \
    if (top_y) { \
      const uint32_t uv0 = (3 * tl_uv + l_uv + 0x00020002u) >> 2; \
      FUNC(top_y[len - 1], uv0 & 0xff, (uv0 >> 16), \
           top_dst + (len - 1) * (XSTEP)); \
    } \
    if (bottom_y) { \
      const uint32_t uv0 = (3 * l_uv + tl_uv + 0x00020002u) >> 2; \
      FUNC(bottom_y[len - 1], uv0 & 0xff, (uv0 >> 16), \
           bottom_dst + (len - 1) * (XSTEP)); \
    } \
  } \
}

// All variants implemented.
UPSAMPLE_FUNC(UpsampleRgbLinePair,      VP8YuvToRgb,      3)
UPSAMPLE_FUNC(UpsampleBgrLinePair,      VP8YuvToBgr,      3)
UPSAMPLE_FUNC(UpsampleRgbaLinePair,     VP8YuvToRgba,     4)
UPSAMPLE_FUNC(UpsampleBgraLinePair,     VP8YuvToBgra,     4)
UPSAMPLE_FUNC(UpsampleArgbLinePair,     VP8YuvToArgb,     4)
UPSAMPLE_FUNC(UpsampleRgba4444LinePair, VP8YuvToRgba4444, 2)
UPSAMPLE_FUNC(UpsampleRgb565LinePair,   VP8YuvToRgb565,   2)
// These variants don't erase the alpha value. The first two reuse the 3-byte
// converters with a 4-byte pixel step, so the fourth byte is never written.
UPSAMPLE_FUNC(UpsampleRgbKeepAlphaLinePair, VP8YuvToRgb, 4)
UPSAMPLE_FUNC(UpsampleBgrKeepAlphaLinePair, VP8YuvToBgr, 4)
UPSAMPLE_FUNC(UpsampleArgbKeepAlphaLinePair, VP8YuvToArgbKeepA, 4)
UPSAMPLE_FUNC(UpsampleRgba4444KeepAlphaLinePair, VP8YuvToRgba4444KeepA, 2)

#undef LOAD_UV
#undef UPSAMPLE_FUNC
#endif // FANCY_UPSAMPLING
//------------------------------------------------------------------------------
// simple point-sampling
// Defines a static function FUNC_NAME that converts a pair of lines to the
// output colorspace using point-sampled chroma: each u/v sample is shared by
// two horizontally adjacent pixels on both lines.
//   FUNC:  per-pixel converter, invoked as FUNC(y, u, v, dst).
//   XSTEP: number of bytes per output pixel.
// Unlike the fancy upsamplers, both top_y and bottom_y are always read.
// When 'len' is odd, the last pixel of each line is handled separately.
#define SAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP) \
static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \
                      const uint8_t* u, const uint8_t* v, \
                      uint8_t* top_dst, uint8_t* bottom_dst, int len) { \
  int i; \
  for (i = 0; i < len - 1; i += 2) { \
    FUNC(top_y[0], u[0], v[0], top_dst); \
    FUNC(top_y[1], u[0], v[0], top_dst + (XSTEP)); \
    FUNC(bottom_y[0], u[0], v[0], bottom_dst); \
    FUNC(bottom_y[1], u[0], v[0], bottom_dst + (XSTEP)); \
    top_y += 2; \
    bottom_y += 2; \
    u++; \
    v++; \
    top_dst += 2 * (XSTEP); \
    bottom_dst += 2 * (XSTEP); \
  } \
  if (i == len - 1) {    /* last one */ \
    FUNC(top_y[0], u[0], v[0], top_dst); \
    FUNC(bottom_y[0], u[0], v[0], bottom_dst); \
  } \
}

// All variants implemented.
SAMPLE_FUNC(SampleRgbLinePair,      VP8YuvToRgb,      3)
SAMPLE_FUNC(SampleBgrLinePair,      VP8YuvToBgr,      3)
SAMPLE_FUNC(SampleRgbaLinePair,     VP8YuvToRgba,     4)
SAMPLE_FUNC(SampleBgraLinePair,     VP8YuvToBgra,     4)
SAMPLE_FUNC(SampleArgbLinePair,     VP8YuvToArgb,     4)
SAMPLE_FUNC(SampleRgba4444LinePair, VP8YuvToRgba4444, 2)
SAMPLE_FUNC(SampleRgb565LinePair,   VP8YuvToRgb565,   2)

#undef SAMPLE_FUNC
// Table of the point-sampling line-pair converters defined above, one entry
// per output mode.
// NOTE(review): the entry order presumably has to match the numeric values of
// the MODE_* enum named in the trailing comments -- confirm against its
// declaration before reordering or adding modes.
const WebPSampleLinePairFunc WebPSamplers[MODE_LAST] = {
SampleRgbLinePair, // MODE_RGB
SampleRgbaLinePair, // MODE_RGBA
SampleBgrLinePair, // MODE_BGR
SampleBgraLinePair, // MODE_BGRA
SampleArgbLinePair, // MODE_ARGB
SampleRgba4444LinePair, // MODE_RGBA_4444
SampleRgb565LinePair // MODE_RGB_565
};
//------------------------------------------------------------------------------
// YUV444 converter
// Defines a static function FUNC_NAME that converts 'len' pixels for which
// one y, one u and one v sample are available per pixel (no chroma
// subsampling).
//   FUNC:  per-pixel converter, invoked as FUNC(y, u, v, dst).
//   XSTEP: number of bytes per output pixel.
#define YUV444_FUNC(FUNC_NAME, FUNC, XSTEP) \
static void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v, \
                      uint8_t* dst, int len) { \
  int i; \
  for (i = 0; i < len; ++i) FUNC(y[i], u[i], v[i], &dst[i * (XSTEP)]); \
}

YUV444_FUNC(Yuv444ToRgb,      VP8YuvToRgb,      3)
YUV444_FUNC(Yuv444ToBgr,      VP8YuvToBgr,      3)
YUV444_FUNC(Yuv444ToRgba,     VP8YuvToRgba,     4)
YUV444_FUNC(Yuv444ToBgra,     VP8YuvToBgra,     4)
YUV444_FUNC(Yuv444ToArgb,     VP8YuvToArgb,     4)
YUV444_FUNC(Yuv444ToRgba4444, VP8YuvToRgba4444, 2)
YUV444_FUNC(Yuv444ToRgb565,   VP8YuvToRgb565,   2)

#undef YUV444_FUNC
// Table of the YUV444 converters defined above, one entry per output mode.
// NOTE(review): the entry order presumably has to match the numeric values of
// the MODE_* enum named in the trailing comments -- confirm against its
// declaration before reordering or adding modes.
const WebPYUV444Converter WebPYUV444Converters[MODE_LAST] = {
Yuv444ToRgb, // MODE_RGB
Yuv444ToRgba, // MODE_RGBA
Yuv444ToBgr, // MODE_BGR
Yuv444ToBgra, // MODE_BGRA
Yuv444ToArgb, // MODE_ARGB
Yuv444ToRgba4444, // MODE_RGBA_4444
Yuv444ToRgb565 // MODE_RGB_565
};
//------------------------------------------------------------------------------
// Main call
// Fills the WebPUpsamplers[] and WebPUpsamplersKeepAlpha[] tables with the
// plain-C implementations, then lets the SSE2 code override entries when the
// CPU reports support for it. Does nothing unless FANCY_UPSAMPLING is
// defined.
void WebPInitUpsamplers(void) {
#ifdef FANCY_UPSAMPLING
  // Modes without an alpha channel: both tables share the same function.
  WebPUpsamplers[MODE_RGB]              = UpsampleRgbLinePair;
  WebPUpsamplersKeepAlpha[MODE_RGB]     = UpsampleRgbLinePair;
  WebPUpsamplers[MODE_BGR]              = UpsampleBgrLinePair;
  WebPUpsamplersKeepAlpha[MODE_BGR]     = UpsampleBgrLinePair;
  WebPUpsamplers[MODE_RGB_565]          = UpsampleRgb565LinePair;
  WebPUpsamplersKeepAlpha[MODE_RGB_565] = UpsampleRgb565LinePair;

  // Modes with an alpha channel: the KeepAlpha table gets the variants that
  // leave the destination alpha untouched.
  WebPUpsamplers[MODE_RGBA]               = UpsampleRgbaLinePair;
  WebPUpsamplersKeepAlpha[MODE_RGBA]      = UpsampleRgbKeepAlphaLinePair;
  WebPUpsamplers[MODE_BGRA]               = UpsampleBgraLinePair;
  WebPUpsamplersKeepAlpha[MODE_BGRA]      = UpsampleBgrKeepAlphaLinePair;
  WebPUpsamplers[MODE_ARGB]               = UpsampleArgbLinePair;
  WebPUpsamplersKeepAlpha[MODE_ARGB]      = UpsampleArgbKeepAlphaLinePair;
  WebPUpsamplers[MODE_RGBA_4444]          = UpsampleRgba4444LinePair;
  WebPUpsamplersKeepAlpha[MODE_RGBA_4444] = UpsampleRgba4444KeepAlphaLinePair;

  // If a CPU-info hook is installed, use it to overwrite some pointers with
  // faster versions.
#if defined(__SSE2__) || defined(_MSC_VER)
  if (VP8GetCPUInfo && VP8GetCPUInfo(kSSE2)) {
    WebPInitUpsamplersSSE2();
  }
#endif
#endif  // FANCY_UPSAMPLING
}
#if defined(__cplusplus) || defined(c_plusplus)
} // extern "C"
#endif

215
src/dsp/upsampling_sse2.c Normal file
View File

@@ -0,0 +1,215 @@
// Copyright 2011 Google Inc.
//
// This code is licensed under the same terms as WebM:
// Software License Agreement: http://www.webmproject.org/license/software/
// Additional IP Rights Grant: http://www.webmproject.org/license/additional/
// -----------------------------------------------------------------------------
//
// SSE2 version of YUV to RGB upsampling functions.
//
// Author: somnath@google.com (Somnath Banerjee)
#if defined(__SSE2__) || defined(_MSC_VER)
#include <assert.h>
#include <emmintrin.h>
#include <string.h>
#include "./dsp.h"
#include "./yuv.h"
#include "../dec/webpi.h"
#if defined(__cplusplus) || defined(c_plusplus)
extern "C" {
#endif
#ifdef FANCY_UPSAMPLING
// We compute (9*a + 3*b + 3*c + d + 8) / 16 as follows
// u = (9*a + 3*b + 3*c + d + 8) / 16
// = (a + (a + 3*b + 3*c + d) / 8 + 1) / 2
// = (a + m + 1) / 2
// where m = (a + 3*b + 3*c + d) / 8
// = ((a + b + c + d) / 2 + b + c) / 4
//
// Let's say k = (a + b + c + d) / 4.
// We can compute k as
// k = (s + t + 1) / 2 - ((a^d) | (b^c) | (s^t)) & 1
// where s = (a + d + 1) / 2 and t = (b + c + 1) / 2
//
// Then m can be written as
// m = (k + t + 1) / 2 - (((b^c) & (s^t)) | (k^t)) & 1
// Computes out = (k + in + 1) / 2 - ((ij & (s^t)) | (k^in)) & 1
// i.e. the rounded-up byte average of 'k' and 'in' minus a 0/1 per-byte
// correction term.
// Besides its arguments, the expansion reads three __m128i values that must
// already exist in the enclosing scope: 'k', 'st' (= s^t) and 'one' (all
// bytes equal to 1). 'out' must be an assignable __m128i.
#define GET_M(ij, in, out) do { \
const __m128i tmp0 = _mm_avg_epu8(k, (in)); /* (k + in + 1) / 2 */ \
const __m128i tmp1 = _mm_and_si128((ij), st); /* (ij) & (s^t) */ \
const __m128i tmp2 = _mm_xor_si128(k, (in)); /* (k^in) */ \
const __m128i tmp3 = _mm_or_si128(tmp1, tmp2); /* ((ij) & (s^t)) | (k^in) */\
const __m128i tmp4 = _mm_and_si128(tmp3, one); /* & 1 -> lsb_correction */ \
(out) = _mm_sub_epi8(tmp0, tmp4); /* (k + in + 1) / 2 - lsb_correction */ \
} while (0)
// Pack and store two alternating pixel rows.
// Averages 'a' with 'da' and 'b' with 'db' (rounded up, per byte), then
// interleaves the two 16-byte results byte by byte (ta0, tb0, ta1, tb1, ...)
// and writes the resulting 32 bytes to 'out'.
// The stores are aligned stores (_mm_store_si128), so 'out' must be 16-byte
// aligned.
#define PACK_AND_STORE(a, b, da, db, out) do { \
const __m128i ta = _mm_avg_epu8(a, da); /* (9a + 3b + 3c + d + 8) / 16 */ \
const __m128i tb = _mm_avg_epu8(b, db); /* (3a + 9b + c + 3d + 8) / 16 */ \
const __m128i t1 = _mm_unpacklo_epi8(ta, tb); \
const __m128i t2 = _mm_unpackhi_epi8(ta, tb); \
_mm_store_si128(((__m128i*)(out)) + 0, t1); \
_mm_store_si128(((__m128i*)(out)) + 1, t2); \
} while (0)
// Loads 17 pixels each from rows r1 and r2 and generates 32 pixels.
// The 17 samples come from two overlapping unaligned 16-byte loads per row
// (offsets 0 and 1), so r1[0..16] and r2[0..16] must be readable.
// Two groups of 32 bytes are written: the first at out[0..31] (built from
// a, b and the diagonals, i.e. the r1 side) and the second at out[64..95]
// (built from c, d, i.e. the r2 side). out[32..63] is left untouched.
// 'out' must be 16-byte aligned (see the aligned stores in PACK_AND_STORE).
#define UPSAMPLE_32PIXELS(r1, r2, out) { \
const __m128i one = _mm_set1_epi8(1); \
const __m128i a = _mm_loadu_si128((__m128i*)&(r1)[0]); \
const __m128i b = _mm_loadu_si128((__m128i*)&(r1)[1]); \
const __m128i c = _mm_loadu_si128((__m128i*)&(r2)[0]); \
const __m128i d = _mm_loadu_si128((__m128i*)&(r2)[1]); \
\
const __m128i s = _mm_avg_epu8(a, d); /* s = (a + d + 1) / 2 */ \
const __m128i t = _mm_avg_epu8(b, c); /* t = (b + c + 1) / 2 */ \
const __m128i st = _mm_xor_si128(s, t); /* st = s^t */ \
\
const __m128i ad = _mm_xor_si128(a, d); /* ad = a^d */ \
const __m128i bc = _mm_xor_si128(b, c); /* bc = b^c */ \
\
const __m128i t1 = _mm_or_si128(ad, bc); /* (a^d) | (b^c) */ \
const __m128i t2 = _mm_or_si128(t1, st); /* (a^d) | (b^c) | (s^t) */ \
const __m128i t3 = _mm_and_si128(t2, one); /* ((a^d) | (b^c) | (s^t)) & 1 */ \
const __m128i t4 = _mm_avg_epu8(s, t); \
const __m128i k = _mm_sub_epi8(t4, t3); /* k = (a + b + c + d) / 4 */ \
__m128i diag1, diag2; \
\
GET_M(bc, t, diag1); /* diag1 = (a + 3b + 3c + d) / 8 */ \
GET_M(ad, s, diag2); /* diag2 = (3a + b + c + 3d) / 8 */ \
\
/* pack the alternate pixels */ \
PACK_AND_STORE(a, b, diag1, diag2, &(out)[0 * 32]); \
PACK_AND_STORE(c, d, diag2, diag1, &(out)[2 * 32]); \
}
// Turn the macro into a function for reducing code-size when non-critical
// r1, r2: the two input rows; 17 bytes are read from each.
// out:    destination; must satisfy the same requirements as the 'out'
//         argument of UPSAMPLE_32PIXELS, which this simply expands.
static void Upsample32Pixels(const uint8_t r1[], const uint8_t r2[],
uint8_t* const out) {
UPSAMPLE_32PIXELS(r1, r2, out);
}
// Upsamples a final, possibly partial block.
// Copies 'num_pixels' samples from each of 'tb' and 'bb' into 17-byte local
// buffers, fills the remainder of each buffer with its last copied sample,
// and runs Upsample32Pixels() on the padded copies so that no byte past the
// end of the real rows is read.
// 'num_pixels' must be in [1, 17]: r1[(num_pixels) - 1] is read and
// 17 - (num_pixels) bytes are padded.
#define UPSAMPLE_LAST_BLOCK(tb, bb, num_pixels, out) { \
uint8_t r1[17], r2[17]; \
memcpy(r1, (tb), (num_pixels)); \
memcpy(r2, (bb), (num_pixels)); \
/* replicate last byte */ \
memset(r1 + (num_pixels), r1[(num_pixels) - 1], 17 - (num_pixels)); \
memset(r2 + (num_pixels), r2[(num_pixels) - 1], 17 - (num_pixels)); \
/* using the shared function instead of the macro saves ~3k code size */ \
Upsample32Pixels(r1, r2, out); \
}
// Converts 'num_pixels' pixels, starting at column 'cur_x', for the top
// and/or bottom row by calling FUNC once per pixel. Output pixels are XSTEP
// bytes apart. A row is skipped entirely when its luma pointer is NULL.
// Expected layout of the chroma scratch buffer 'uv', as indexed below:
//   uv[ 0 + n]: u for the top row       uv[32 + n]: v for the top row
//   uv[64 + n]: u for the bottom row    uv[96 + n]: v for the bottom row
#define CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y, uv, \
top_dst, bottom_dst, cur_x, num_pixels) { \
int n; \
if (top_y) { \
for (n = 0; n < (num_pixels); ++n) { \
FUNC(top_y[(cur_x) + n], (uv)[n], (uv)[32 + n], \
top_dst + ((cur_x) + n) * XSTEP); \
} \
} \
if (bottom_y) { \
for (n = 0; n < (num_pixels); ++n) { \
FUNC(bottom_y[(cur_x) + n], (uv)[64 + n], (uv)[64 + 32 + n], \
bottom_dst + ((cur_x) + n) * XSTEP); \
} \
} \
}
// Generates the line-pair upsampler FUNC_NAME.
//   top_y, bottom_y:     luma rows; either may be NULL, in which case that
//                        row is not converted.
//   top_u/v, cur_u/v:    the two chroma rows to interpolate between.
//   top_dst, bottom_dst: output rows, XSTEP bytes per pixel.
//   len:                 number of pixels per row, must be > 0.
// Processing order:
//   1. pixel 0 of each row is converted with scalar code;
//   2. 'num_blocks' full blocks of 32 pixels are upsampled with
//      UPSAMPLE_32PIXELS (each consumes 16 chroma samples and reads 17);
//   3. the remaining pixels go through UPSAMPLE_LAST_BLOCK, which works on
//      padded copies of the last 'leftover' (1..16) chroma samples.
// The upsampled u/v values are staged in 'r_uv', a 16-byte aligned window of
// 4 * 32 bytes inside 'uv_buf'.
#define SSE2_UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP) \
static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \
const uint8_t* top_u, const uint8_t* top_v, \
const uint8_t* cur_u, const uint8_t* cur_v, \
uint8_t* top_dst, uint8_t* bottom_dst, int len) { \
int b; \
/* 16 byte aligned array to cache reconstructed u and v */ \
uint8_t uv_buf[4 * 32 + 15]; \
uint8_t* const r_uv = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~15); \
const int uv_len = (len + 1) >> 1; \
/* 17 pixels must be read-able for each block */ \
const int num_blocks = (uv_len - 1) >> 4; \
const int leftover = uv_len - num_blocks * 16; \
const int last_pos = 1 + 32 * num_blocks; \
\
const int u_diag = ((top_u[0] + cur_u[0]) >> 1) + 1; \
const int v_diag = ((top_v[0] + cur_v[0]) >> 1) + 1; \
\
assert(len > 0); \
/* Treat the first pixel in regular way */ \
if (top_y) { \
const int u0 = (top_u[0] + u_diag) >> 1; \
const int v0 = (top_v[0] + v_diag) >> 1; \
FUNC(top_y[0], u0, v0, top_dst); \
} \
if (bottom_y) { \
const int u0 = (cur_u[0] + u_diag) >> 1; \
const int v0 = (cur_v[0] + v_diag) >> 1; \
FUNC(bottom_y[0], u0, v0, bottom_dst); \
} \
\
for (b = 0; b < num_blocks; ++b) { \
UPSAMPLE_32PIXELS(top_u, cur_u, r_uv + 0 * 32); \
UPSAMPLE_32PIXELS(top_v, cur_v, r_uv + 1 * 32); \
CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y, r_uv, top_dst, bottom_dst, \
32 * b + 1, 32) \
top_u += 16; \
cur_u += 16; \
top_v += 16; \
cur_v += 16; \
} \
\
UPSAMPLE_LAST_BLOCK(top_u, cur_u, leftover, r_uv + 0 * 32); \
UPSAMPLE_LAST_BLOCK(top_v, cur_v, leftover, r_uv + 1 * 32); \
CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y, r_uv, top_dst, bottom_dst, \
last_pos, len - last_pos); \
}
// SSE2 variants of the fancy upsampler.
// The last argument is the number of bytes per output pixel.
SSE2_UPSAMPLE_FUNC(UpsampleRgbLinePairSSE2, VP8YuvToRgb, 3)
SSE2_UPSAMPLE_FUNC(UpsampleBgrLinePairSSE2, VP8YuvToBgr, 3)
SSE2_UPSAMPLE_FUNC(UpsampleRgbaLinePairSSE2, VP8YuvToRgba, 4)
SSE2_UPSAMPLE_FUNC(UpsampleBgraLinePairSSE2, VP8YuvToBgra, 4)
// These two don't erase the alpha value: they use the 3-byte RGB/BGR writers
// with a 4-byte pixel step, so the fourth byte of each pixel is never written.
SSE2_UPSAMPLE_FUNC(UpsampleRgbKeepAlphaLinePairSSE2, VP8YuvToRgb, 4)
SSE2_UPSAMPLE_FUNC(UpsampleBgrKeepAlphaLinePairSSE2, VP8YuvToBgr, 4)
#undef GET_M
#undef PACK_AND_STORE
#undef UPSAMPLE_32PIXELS
#undef UPSAMPLE_LAST_BLOCK
#undef CONVERT2RGB
#undef SSE2_UPSAMPLE_FUNC
//------------------------------------------------------------------------------
extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];
extern WebPUpsampleLinePairFunc WebPUpsamplersKeepAlpha[/* MODE_LAST */];
#endif // FANCY_UPSAMPLING
// Replaces the RGB, RGBA, BGR and BGRA entries of WebPUpsamplers[] and
// WebPUpsamplersKeepAlpha[] with their SSE2 implementations. Entries for the
// other modes are not modified here.
// Does nothing when FANCY_UPSAMPLING is not defined.
void WebPInitUpsamplersSSE2(void) {
#ifdef FANCY_UPSAMPLING
  // 3-byte outputs: same function whether or not alpha must be preserved.
  WebPUpsamplers[MODE_RGB]          = UpsampleRgbLinePairSSE2;
  WebPUpsamplersKeepAlpha[MODE_RGB] = UpsampleRgbLinePairSSE2;
  WebPUpsamplers[MODE_BGR]          = UpsampleBgrLinePairSSE2;
  WebPUpsamplersKeepAlpha[MODE_BGR] = UpsampleBgrLinePairSSE2;

  // 4-byte outputs: the KeepAlpha table uses the variants that leave the
  // alpha byte untouched.
  WebPUpsamplers[MODE_RGBA]          = UpsampleRgbaLinePairSSE2;
  WebPUpsamplersKeepAlpha[MODE_RGBA] = UpsampleRgbKeepAlphaLinePairSSE2;
  WebPUpsamplers[MODE_BGRA]          = UpsampleBgraLinePairSSE2;
  WebPUpsamplersKeepAlpha[MODE_BGRA] = UpsampleBgrKeepAlphaLinePairSSE2;
#endif  // FANCY_UPSAMPLING
}
#if defined(__cplusplus) || defined(c_plusplus)
} // extern "C"
#endif
#endif //__SSE2__ || _MSC_VER

View File

@@ -9,7 +9,7 @@
//
// Author: Skal (pascal.massimino@gmail.com)
#include "yuv.h"
#include "./yuv.h"
#if defined(__cplusplus) || defined(c_plusplus)
extern "C" {
@@ -20,9 +20,14 @@ enum { YUV_HALF = 1 << (YUV_FIX - 1) };
int16_t VP8kVToR[256], VP8kUToB[256];
int32_t VP8kVToG[256], VP8kUToG[256];
uint8_t VP8kClip[YUV_RANGE_MAX - YUV_RANGE_MIN];
uint8_t VP8kClip4Bits[YUV_RANGE_MAX - YUV_RANGE_MIN];
static int done = 0;
// Clamps 'v' to the range [0, max_value] and returns it as a byte.
static inline uint8_t clip(int v, int max_value) {
  if (v < 0) {
    return 0;
  }
  if (v > max_value) {
    return max_value;
  }
  return v;
}
void VP8YUVInit(void) {
int i;
if (done) {
@@ -36,7 +41,8 @@ void VP8YUVInit(void) {
}
for (i = YUV_RANGE_MIN; i < YUV_RANGE_MAX; ++i) {
const int k = ((i - 16) * 76283 + YUV_HALF) >> YUV_FIX;
VP8kClip[i - YUV_RANGE_MIN] = (k < 0) ? 0 : (k > 255) ? 255 : k;
VP8kClip[i - YUV_RANGE_MIN] = clip(k, 255);
VP8kClip4Bits[i - YUV_RANGE_MIN] = clip((k + 8) >> 4, 15);
}
done = 1;
}

109
src/dsp/yuv.h Normal file
View File

@@ -0,0 +1,109 @@
// Copyright 2010 Google Inc.
//
// This code is licensed under the same terms as WebM:
// Software License Agreement: http://www.webmproject.org/license/software/
// Additional IP Rights Grant: http://www.webmproject.org/license/additional/
// -----------------------------------------------------------------------------
//
// inline YUV->RGB conversion function
//
// Author: Skal (pascal.massimino@gmail.com)
#ifndef WEBP_DSP_YUV_H_
#define WEBP_DSP_YUV_H_
#include "../webp/decode_vp8.h"
#if defined(__cplusplus) || defined(c_plusplus)
extern "C" {
#endif
enum { YUV_FIX = 16, // fixed-point precision
YUV_RANGE_MIN = -227, // min value of r/g/b output
YUV_RANGE_MAX = 256 + 226 // max value of r/g/b output
};
extern int16_t VP8kVToR[256], VP8kUToB[256];
extern int32_t VP8kVToG[256], VP8kUToG[256];
extern uint8_t VP8kClip[YUV_RANGE_MAX - YUV_RANGE_MIN];
extern uint8_t VP8kClip4Bits[YUV_RANGE_MAX - YUV_RANGE_MIN];
// Converts one (y, u, v) sample to three bytes written as R, G, B into
// rgb[0..2]. Per-channel offsets come from the VP8kVToR / VP8kVToG /
// VP8kUToG / VP8kUToB tables; the final value is looked up in VP8kClip[].
// The tables must have been initialized beforehand.
static inline void VP8YuvToRgb(uint8_t y, uint8_t u, uint8_t v,
                               uint8_t* const rgb) {
  // Index of 'y' inside VP8kClip[], before the per-channel offset is added.
  const int base = y - YUV_RANGE_MIN;
  const int green_off = (VP8kVToG[v] + VP8kUToG[u]) >> YUV_FIX;
  rgb[0] = VP8kClip[base + VP8kVToR[v]];
  rgb[1] = VP8kClip[base + green_off];
  rgb[2] = VP8kClip[base + VP8kUToB[u]];
}
// Converts one (y, u, v) sample to a 16-bit RGB565 pixel stored in two bytes:
//   rgb[0] = rrrrrggg  (5 bits of red, upper 3 bits of the 6-bit green)
//   rgb[1] = gggbbbbb  (lower 3 bits of the 6-bit green, 5 bits of blue)
static inline void VP8YuvToRgb565(uint8_t y, uint8_t u, uint8_t v,
                                  uint8_t* const rgb) {
  const int base = y - YUV_RANGE_MIN;
  const int green_off = (VP8kVToG[v] + VP8kUToG[u]) >> YUV_FIX;
  const int r = VP8kClip[base + VP8kVToR[v]];
  const int g = VP8kClip[base + green_off];
  const int b = VP8kClip[base + VP8kUToB[u]];
  rgb[0] = ((r & 0xf8) | (g >> 5));
  rgb[1] = (((g << 3) & 0xe0) | (b >> 3));
}
// Converts one (y, u, v) sample to R, G, B stored in argb[1..3].
static inline void VP8YuvToArgbKeepA(uint8_t y, uint8_t u, uint8_t v,
uint8_t* const argb) {
// Don't update Alpha (argb[0])
VP8YuvToRgb(y, u, v, argb + 1);
}
// Converts one (y, u, v) sample to a 4-byte A, R, G, B pixel. The alpha byte
// argb[0] is set to 0xff; the colour bytes are written to argb[1..3].
static inline void VP8YuvToArgb(uint8_t y, uint8_t u, uint8_t v,
uint8_t* const argb) {
argb[0] = 0xff;
VP8YuvToArgbKeepA(y, u, v, argb);
}
// Converts one (y, u, v) sample to a 16-bit RGBA4444 pixel, 4 bits per
// channel taken from VP8kClip4Bits[]:
//   argb[0] = rrrrgggg
//   argb[1] = bbbbaaaa
// The alpha nibble (low 4 bits of argb[1]) already present in the destination
// is preserved.
static inline void VP8YuvToRgba4444KeepA(uint8_t y, uint8_t u, uint8_t v,
                                         uint8_t* const argb) {
  const int base = y - YUV_RANGE_MIN;
  const int green_off = (VP8kVToG[v] + VP8kUToG[u]) >> YUV_FIX;
  const int r4 = VP8kClip4Bits[base + VP8kVToR[v]];
  const int g4 = VP8kClip4Bits[base + green_off];
  const int b4 = VP8kClip4Bits[base + VP8kUToB[u]];
  argb[0] = ((r4 << 4) | g4);
  // Don't update Alpha (last 4 bits of argb[1])
  argb[1] = (argb[1] & 0x0f) | (b4 << 4);
}
// Converts one (y, u, v) sample to a 16-bit RGBA4444 pixel with alpha at its
// maximum: the low nibble of argb[1] is preset to 0xf, then the colour
// nibbles are filled in by VP8YuvToRgba4444KeepA(), which leaves that alpha
// nibble as is.
static inline void VP8YuvToRgba4444(uint8_t y, uint8_t u, uint8_t v,
uint8_t* const argb) {
argb[1] = 0x0f;
VP8YuvToRgba4444KeepA(y, u, v, argb);
}
// Converts one (y, u, v) sample to three bytes written as B, G, R into
// bgr[0..2]; same table lookups as VP8YuvToRgb(), with the red and blue
// outputs swapped.
static inline void VP8YuvToBgr(uint8_t y, uint8_t u, uint8_t v,
                               uint8_t* const bgr) {
  const int base = y - YUV_RANGE_MIN;
  const int green_off = (VP8kVToG[v] + VP8kUToG[u]) >> YUV_FIX;
  bgr[0] = VP8kClip[base + VP8kUToB[u]];
  bgr[1] = VP8kClip[base + green_off];
  bgr[2] = VP8kClip[base + VP8kVToR[v]];
}
// Converts one (y, u, v) sample to a 4-byte B, G, R, A pixel; the alpha byte
// bgra[3] is set to 0xff.
static inline void VP8YuvToBgra(uint8_t y, uint8_t u, uint8_t v,
uint8_t* const bgra) {
VP8YuvToBgr(y, u, v, bgra);
bgra[3] = 0xff;
}
// Converts one (y, u, v) sample to a 4-byte R, G, B, A pixel; the alpha byte
// rgba[3] is set to 0xff.
static inline void VP8YuvToRgba(uint8_t y, uint8_t u, uint8_t v,
uint8_t* const rgba) {
VP8YuvToRgb(y, u, v, rgba);
rgba[3] = 0xff;
}
// Must be called before everything, to initialize the tables.
void VP8YUVInit(void);
#if defined(__cplusplus) || defined(c_plusplus)
} // extern "C"
#endif
#endif /* WEBP_DSP_YUV_H_ */

View File

@@ -1,15 +1,13 @@
AM_CPPFLAGS = -I$(top_srcdir)/src
libwebpencode_la_SOURCES = analysis.c bit_writer.c bit_writer.h \
config.c cost.c cost.h dsp.c filter.c \
frame.c iterator.c picture.c quant.c \
syntax.c tree.c vp8enci.h webpenc.c
libwebpencode_la_LDFLAGS = -version-info 0:0:0 -lm
libwebpencode_la_SOURCES = analysis.c config.c cost.c cost.h filter.c \
frame.c iterator.c picture.c quant.c \
syntax.c tree.c vp8enci.h webpenc.c alpha.c \
layer.c
libwebpencode_la_LDFLAGS = -version-info 2:0:0 -lm
libwebpencode_la_CPPFLAGS = $(USE_EXPERIMENTAL_CODE)
libwebpencodeinclude_HEADERS = ../webp/encode.h ../webp/types.h
libwebpencodeincludedir = $(includedir)/webp
noinst_HEADERS = cost.h bit_writer.h vp8enci.h
noinst_HEADERS = cost.h vp8enci.h
noinst_LTLIBRARIES = libwebpencode.la
# uncomment the following line (and comment the above) if you want
# to install libwebpencode library.
#lib_LTLIBRARIES = libwebpencode.la

114
src/enc/alpha.c Normal file
View File

@@ -0,0 +1,114 @@
// Copyright 2011 Google Inc.
//
// This code is licensed under the same terms as WebM:
// Software License Agreement: http://www.webmproject.org/license/software/
// Additional IP Rights Grant: http://www.webmproject.org/license/additional/
// -----------------------------------------------------------------------------
//
// Alpha-plane compression.
//
// Author: Skal (pascal.massimino@gmail.com)
#include <assert.h>
#include <stdlib.h>
#include "vp8enci.h"
#ifdef WEBP_EXPERIMENTAL_FEATURES
#include "zlib.h"
#endif
#if defined(__cplusplus) || defined(c_plusplus)
extern "C" {
#endif
#ifdef WEBP_EXPERIMENTAL_FEATURES
#define CHUNK_SIZE 8192
//------------------------------------------------------------------------------
// Compresses 'data' (data_size bytes) with zlib's deflate.
//   output, output_size: receive a heap buffer holding the compressed stream
//                        and its length. On success the caller owns the
//                        buffer and must free() it.
//   algo:                0 selects Z_BEST_COMPRESSION, any other value
//                        selects Z_BEST_SPEED.
// Returns 1 on success. Returns 0 on failure, in which case *output is NULL
// and *output_size is 0 (nothing is left for the caller to release).
static int CompressAlpha(const uint8_t* data, size_t data_size,
                         uint8_t** output, size_t* output_size,
                         int algo) {
  int ret = Z_OK;
  z_stream strm;
  unsigned char chunk[CHUNK_SIZE];

  *output = NULL;
  *output_size = 0;
  memset(&strm, 0, sizeof(strm));
  if (deflateInit(&strm, algo ? Z_BEST_SPEED : Z_BEST_COMPRESSION) != Z_OK) {
    return 0;
  }

  strm.next_in = (unsigned char*)data;
  strm.avail_in = data_size;
  do {
    size_t size_out;

    // Deflate into the fixed-size scratch chunk, then append whatever was
    // produced to the growing output buffer.
    strm.next_out = chunk;
    strm.avail_out = CHUNK_SIZE;
    ret = deflate(&strm, Z_FINISH);
    if (ret == Z_STREAM_ERROR) {
      break;
    }
    size_out = CHUNK_SIZE - strm.avail_out;
    if (size_out) {
      const size_t new_size = *output_size + size_out;
      // Keep *output intact until realloc() is known to have succeeded, so
      // the old buffer can still be released on failure.
      uint8_t* const new_output = realloc(*output, new_size);
      if (new_output == NULL) {
        ret = Z_MEM_ERROR;
        break;
      }
      memcpy(new_output + *output_size, chunk, size_out);
      *output_size = new_size;
      *output = new_output;
    }
  } while (ret != Z_STREAM_END || strm.avail_out == 0);

  deflateEnd(&strm);
  if (ret != Z_STREAM_END) {
    // Release the partial result and reset BOTH out-parameters: the caller
    // must not be left with a dangling pointer (it would be freed a second
    // time by a later cleanup) or with a stale, non-zero size.
    free(*output);
    *output = NULL;
    *output_size = 0;
    return 0;
  }
  return 1;
}
#endif /* WEBP_EXPERIMENTAL_FEATURES */
// Resets the encoder's alpha state: has_alpha_ is set according to whether
// the source picture has an alpha plane (pic_->a != NULL), and no compressed
// alpha data is attached yet.
void VP8EncInitAlpha(VP8Encoder* enc) {
enc->has_alpha_ = (enc->pic_->a != NULL);
enc->alpha_data_ = NULL;
enc->alpha_data_size_ = 0;
}
// Per-macroblock alpha coding hook. Currently a no-op; 'it' is unused.
void VP8EncCodeAlphaBlock(VP8EncIterator* it) {
(void)it;
// Nothing for now. We just ZLIB-compress in the end.
}
// Finalizes alpha coding. When the picture has alpha and the experimental
// features are compiled in, the whole alpha plane (width * height bytes) is
// compressed into enc->alpha_data_ / enc->alpha_data_size_.
// Returns 0 if the compression failed, 1 otherwise (including when there is
// nothing to do).
int VP8EncFinishAlpha(VP8Encoder* enc) {
  if (!enc->has_alpha_) {
    return 1;
  }
#ifdef WEBP_EXPERIMENTAL_FEATURES
  {
    const WebPPicture* const picture = enc->pic_;
    assert(picture->a);
    if (!CompressAlpha(picture->a, picture->width * picture->height,
                       &enc->alpha_data_, &enc->alpha_data_size_,
                       enc->config_->alpha_compression)) {
      return 0;
    }
  }
#endif
  return 1;
}
// Releases the compressed alpha buffer (if any) and clears all alpha-related
// fields of the encoder, so the call is safe to repeat.
void VP8EncDeleteAlpha(VP8Encoder* enc) {
free(enc->alpha_data_);
enc->alpha_data_ = NULL;
enc->alpha_data_size_ = 0;
enc->has_alpha_ = 0;
}
#if defined(__cplusplus) || defined(c_plusplus)
} // extern "C"
#endif

View File

@@ -20,52 +20,13 @@
extern "C" {
#endif
#define MAX_COEFF_THRESH 64
#define MAX_ITERS_K_MEANS 6
//-----------------------------------------------------------------------------
// Compute susceptibility based on DCT-coeff histograms:
// the higher, the "easier" the macroblock is to compress.
static int ClipAlpha(int alpha) {
return alpha < 0 ? 0 : alpha > 255 ? 255 : alpha;
}
static int GetAlpha(const int histo[MAX_COEFF_THRESH]) {
int num = 0, den = 0, val = 0;
int k;
int alpha;
for (k = 0; k < MAX_COEFF_THRESH; ++k) {
if (histo[k]) {
val += histo[k];
num += val * (k + 1);
den += (k + 1) * (k + 1);
}
}
// we scale the value to a usable [0..255] range
alpha = den ? 10 * num / den - 5 : 0;
return ClipAlpha(alpha);
}
static int CollectHistogram(const uint8_t* ref, const uint8_t* pred,
int start_block, int end_block) {
int histo[MAX_COEFF_THRESH] = { 0 };
int16_t out[16];
int j, k;
for (j = start_block; j < end_block; ++j) {
VP8FTransform(ref + VP8Scan[j], pred + VP8Scan[j], out);
for (k = 0; k < 16; ++k) {
const int v = abs(out[k]) >> 2;
if (v) {
const int bin = (v > MAX_COEFF_THRESH) ? MAX_COEFF_THRESH : v;
histo[bin - 1]++;
}
}
}
return GetAlpha(histo);
}
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// Smooth the segment map by replacing isolated block by the majority of its
// neighbours.
@@ -86,11 +47,11 @@ static void SmoothSegmentMap(VP8Encoder* const enc) {
cnt[mb[-w - 1].segment_]++; // top-left
cnt[mb[-w + 0].segment_]++; // top
cnt[mb[-w + 1].segment_]++; // top-right
cnt[mb[ - 1].segment_]++; // left
cnt[mb[ + 1].segment_]++; // right
cnt[mb[ w - 1].segment_]++; // bottom-left
cnt[mb[ w + 0].segment_]++; // bottom
cnt[mb[ w + 1].segment_]++; // bottom-right
cnt[mb[ - 1].segment_]++; // left
cnt[mb[ + 1].segment_]++; // right
cnt[mb[ w - 1].segment_]++; // bottom-left
cnt[mb[ w + 0].segment_]++; // bottom
cnt[mb[ w + 1].segment_]++; // bottom-right
for (n = 0; n < NUM_MB_SEGMENTS; ++n) {
if (cnt[n] >= majority_cnt_3_x_3_grid) {
majority_seg = n;
@@ -108,7 +69,7 @@ static void SmoothSegmentMap(VP8Encoder* const enc) {
free(tmp);
}
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// Finalize Segment probability based on the coding tree
static int GetProba(int a, int b) {
@@ -178,7 +139,7 @@ static void SetSegmentAlphas(VP8Encoder* const enc,
}
}
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// Simplified k-Means, to assign Nb segments based on alpha-histogram
static void AssignSegments(VP8Encoder* const enc, const int alphas[256]) {
@@ -259,7 +220,7 @@ static void AssignSegments(VP8Encoder* const enc, const int alphas[256]) {
SetSegmentAlphas(enc, centers, weighted_average); // pick some alphas.
}
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// Macroblock analysis: collect histogram for each mode, deduce the maximal
// susceptibility and set best modes for this macroblock.
// Segment assignment is done later.
@@ -278,9 +239,9 @@ static int MBAnalyzeBestIntra16Mode(VP8EncIterator* const it) {
VP8MakeLuma16Preds(it);
for (mode = 0; mode < max_mode; ++mode) {
const int alpha = CollectHistogram(it->yuv_in_ + Y_OFF,
it->yuv_p_ + VP8I16ModeOffsets[mode],
0, 16);
const int alpha = VP8CollectHistogram(it->yuv_in_ + Y_OFF,
it->yuv_p_ + VP8I16ModeOffsets[mode],
0, 16);
if (alpha > best_alpha) {
best_alpha = alpha;
best_mode = mode;
@@ -303,9 +264,9 @@ static int MBAnalyzeBestIntra4Mode(VP8EncIterator* const it,
VP8MakeIntra4Preds(it);
for (mode = 0; mode < max_mode; ++mode) {
const int alpha = CollectHistogram(src,
it->yuv_p_ + VP8I4ModeOffsets[mode],
0, 1);
const int alpha = VP8CollectHistogram(src,
it->yuv_p_ + VP8I4ModeOffsets[mode],
0, 1);
if (alpha > best_mode_alpha) {
best_mode_alpha = alpha;
modes[it->i4_] = mode;
@@ -329,9 +290,9 @@ static int MBAnalyzeBestUVMode(VP8EncIterator* const it) {
int mode;
VP8MakeChroma8Preds(it);
for (mode = 0; mode < max_mode; ++mode) {
const int alpha = CollectHistogram(it->yuv_in_ + U_OFF,
it->yuv_p_ + VP8UVModeOffsets[mode],
16, 16 + 4 + 4);
const int alpha = VP8CollectHistogram(it->yuv_in_ + U_OFF,
it->yuv_p_ + VP8UVModeOffsets[mode],
16, 16 + 4 + 4);
if (alpha > best_alpha) {
best_alpha = alpha;
best_mode = mode;
@@ -367,7 +328,7 @@ static void MBAnalyze(VP8EncIterator* const it,
it->mb_->alpha_ = best_alpha; // Informative only.
}
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// Main analysis loop:
// Collect all susceptibilities for each macroblock and record their
// distribution in alphas[]. Segments is assigned a-posteriori, based on

View File

@@ -10,15 +10,15 @@
// Author: Skal (pascal.massimino@gmail.com)
#include <assert.h>
#include "webp/encode.h"
#include "../webp/encode.h"
#if defined(__cplusplus) || defined(c_plusplus)
extern "C" {
#endif
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// WebPConfig
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
int WebPConfigInitInternal(WebPConfig* const config,
WebPPreset preset, float quality, int version) {
@@ -41,6 +41,8 @@ int WebPConfigInitInternal(WebPConfig* const config,
config->show_compressed = 0;
config->preprocessing = 0;
config->autofilter = 0;
config->alpha_compression = 0;
config->partition_limit = 0;
// TODO(skal): tune.
switch (preset) {
@@ -105,10 +107,14 @@ int WebPValidateConfig(const WebPConfig* const config) {
return 0;
if (config->partitions < 0 || config->partitions > 3)
return 0;
if (config->partition_limit < 0 || config->partition_limit > 100)
return 0;
if (config->alpha_compression < 0)
return 0;
return 1;
}
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
#if defined(__cplusplus) || defined(c_plusplus)
} // extern "C"

View File

@@ -17,7 +17,7 @@
extern "C" {
#endif
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// Boolean-cost cost table
const uint16_t VP8EntropyCost[256] = {
@@ -49,13 +49,13 @@ const uint16_t VP8EntropyCost[256] = {
10, 9, 7, 6, 4, 3
};
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// Level cost tables
// For each given level, the following table gives the pattern of contexts
// to use for coding it (in [][0]) as well as the bit value to use for
// each context (in [][1]).
static const uint16_t kLevelCodes[MAX_VARIABLE_LEVEL][2] = {
const uint16_t VP8LevelCodes[MAX_VARIABLE_LEVEL][2] = {
{0x001, 0x000}, {0x007, 0x001}, {0x00f, 0x005},
{0x00f, 0x00d}, {0x033, 0x003}, {0x033, 0x003}, {0x033, 0x023},
{0x033, 0x023}, {0x033, 0x023}, {0x033, 0x023}, {0x0d3, 0x013},
@@ -337,8 +337,8 @@ const uint16_t VP8LevelFixedCosts[2048] = {
};
static int VariableLevelCost(int level, const uint8_t probas[NUM_PROBAS]) {
int pattern = kLevelCodes[level - 1][0];
int bits = kLevelCodes[level - 1][1];
int pattern = VP8LevelCodes[level - 1][0];
int bits = VP8LevelCodes[level - 1][1];
int cost = 0;
int i;
for (i = 2; pattern; ++i) {
@@ -351,7 +351,7 @@ static int VariableLevelCost(int level, const uint8_t probas[NUM_PROBAS]) {
return cost;
}
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// Pre-calc level costs once for all
void VP8CalculateLevelCosts(VP8Proba* const proba) {
@@ -374,12 +374,13 @@ void VP8CalculateLevelCosts(VP8Proba* const proba) {
}
}
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// Mode cost tables.
// These are the fixed probabilities (in the coding trees) turned into bit-cost
// by calling VP8BitCost().
const uint16_t VP8FixedCostsUV[4] = { 302, 984, 439, 642 };
// note: these values include the fixed VP8BitCost(1, 145) mode selection cost.
const uint16_t VP8FixedCostsI16[4] = { 663, 919, 872, 919 };
const uint16_t VP8FixedCostsI4[NUM_BMODES][NUM_BMODES][NUM_BMODES] = {
{ { 251, 1362, 1934, 2085, 2314, 2230, 1839, 1988, 2437, 2348 },
@@ -484,7 +485,7 @@ const uint16_t VP8FixedCostsI4[NUM_BMODES][NUM_BMODES][NUM_BMODES] = {
{ 516, 1378, 1569, 1110, 1798, 1798, 1198, 2199, 1543, 712 } },
};
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
#if defined(__cplusplus) || defined(c_plusplus)
} // extern "C"

View File

@@ -27,11 +27,13 @@ static inline int VP8BitCost(int bit, uint8_t proba) {
}
// Cost of coding 'nb' 1's and 'total-nb' 0's using 'proba' probability.
static inline uint64_t VP8BranchCost(uint64_t nb, uint64_t total, uint8_t proba) {
static inline uint64_t VP8BranchCost(uint64_t nb, uint64_t total,
uint8_t proba) {
return nb * VP8BitCost(1, proba) + (total - nb) * VP8BitCost(0, proba);
}
// Level cost calculations
extern const uint16_t VP8LevelCodes[MAX_VARIABLE_LEVEL][2];
void VP8CalculateLevelCosts(VP8Proba* const proba);
static inline int VP8LevelCost(const uint16_t* const table, int level) {
return VP8LevelFixedCosts[level]
@@ -43,10 +45,10 @@ extern const uint16_t VP8FixedCostsUV[4];
extern const uint16_t VP8FixedCostsI16[4];
extern const uint16_t VP8FixedCostsI4[NUM_BMODES][NUM_BMODES][NUM_BMODES];
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
#if defined(__cplusplus) || defined(c_plusplus)
} // extern "C"
#endif
#endif // WEBP_ENC_COST_H_
#endif /* WEBP_ENC_COST_H_ */

View File

@@ -16,7 +16,55 @@
extern "C" {
#endif
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// Compute susceptibility based on DCT-coeff histograms:
// the higher, the "easier" the macroblock is to compress.
// Clamps 'alpha' to the [0, 255] range.
static int ClipAlpha(int alpha) {
  if (alpha < 0) {
    return 0;
  }
  if (alpha > 255) {
    return 255;
  }
  return alpha;
}
// Turns a histogram of coefficient bins into a single "alpha" value in
// [0, 255].
// histo[] has MAX_COEFF_THRESH + 1 entries; entry 0 is never read here, only
// bins 1..MAX_COEFF_THRESH contribute. Empty bins are skipped. For each
// non-empty bin b, 'val' is the running total of the counts seen so far
// (bin b included); 'num' accumulates val * b and 'den' accumulates b * b.
// The result is 10 * num / den - 5 clipped to [0, 255], or 0 when every
// examined bin is empty.
int VP8GetAlpha(const int histo[MAX_COEFF_THRESH + 1]) {
int num = 0, den = 0, val = 0;
int k;
int alpha;
// note: changing this loop to avoid the numerous "k + 1" slows things down.
for (k = 0; k < MAX_COEFF_THRESH; ++k) {
if (histo[k + 1]) {
val += histo[k + 1];
num += val * (k + 1);
den += (k + 1) * (k + 1);
}
}
// we scale the value to a usable [0..255] range
alpha = den ? 10 * num / den - 5 : 0;
return ClipAlpha(alpha);
}
// Builds a histogram of transformed residual coefficients and returns the
// alpha value derived from it by VP8GetAlpha().
//   ref, pred:              source and prediction samples; block j is located
//                           at offset VP8Scan[j] in both.
//   start_block, end_block: half-open range [start_block, end_block) of block
//                           indices to process.
// Each block is run through VP8FTransform(); every one of its 16 coefficients
// is mapped to the bin abs(coeff) >> 2, capped at MAX_COEFF_THRESH.
static int CollectHistogram(const uint8_t* ref, const uint8_t* pred,
int start_block, int end_block) {
int histo[MAX_COEFF_THRESH + 1] = { 0 };
int16_t out[16];
int j, k;
for (j = start_block; j < end_block; ++j) {
VP8FTransform(ref + VP8Scan[j], pred + VP8Scan[j], out);
// Convert coefficients to bin (within out[]).
for (k = 0; k < 16; ++k) {
const int v = abs(out[k]) >> 2;
out[k] = (v > MAX_COEFF_THRESH) ? MAX_COEFF_THRESH : v;
}
// Use bin to update histogram.
for (k = 0; k < 16; ++k) {
histo[out[k]]++;
}
}
return VP8GetAlpha(histo);
}
//------------------------------------------------------------------------------
// run-time tables (~4k)
static uint8_t clip1[255 + 510 + 1]; // clips [-255,510] to [0,255]
@@ -39,7 +87,7 @@ static inline uint8_t clip_8b(int v) {
return (!(v & ~0xff)) ? v : v < 0 ? 0 : 255;
}
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// Transforms (Paragraph 14.4)
#define STORE(x, y, v) \
@@ -49,7 +97,8 @@ static const int kC1 = 20091 + (1 << 16);
static const int kC2 = 35468;
#define MUL(a, b) (((a) * (b)) >> 16)
static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst) {
static inline void ITransformOne(const uint8_t* ref, const int16_t* in,
uint8_t* dst) {
int C[4 * 4], *tmp;
int i;
tmp = C;
@@ -81,6 +130,14 @@ static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst) {
}
}
// Applies the inverse transform to one block or, when 'do_two' is non-zero,
// to two blocks. The second block uses ref/dst advanced by 4 and the
// coefficients starting at in[16].
static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst,
                       int do_two) {
  const int num_blocks = do_two ? 2 : 1;
  int n;
  for (n = 0; n < num_blocks; ++n) {
    ITransformOne(ref + 4 * n, in + 16 * n, dst + 4 * n);
  }
}
static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
int i;
int tmp[16];
@@ -166,16 +223,10 @@ static void FTransformWHT(const int16_t* in, int16_t* out) {
}
}
// default C implementations:
VP8Idct VP8ITransform = ITransform;
VP8Fdct VP8FTransform = FTransform;
VP8WHT VP8ITransformWHT = ITransformWHT;
VP8WHT VP8FTransformWHT = FTransformWHT;
#undef MUL
#undef STORE
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// Intra predictions
#define OUT(x, y) dst[(x) + (y) * BPS]
@@ -260,7 +311,7 @@ static inline void DCMode(uint8_t* dst, const uint8_t* left,
Fill(dst, DC, size);
}
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// Chroma 8x8 prediction (paragraph 12.2)
static void IntraChromaPreds(uint8_t* dst, const uint8_t* left,
@@ -280,7 +331,7 @@ static void IntraChromaPreds(uint8_t* dst, const uint8_t* left,
TrueMotion(C8TM8 + dst, left, top, 8);
}
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// luma 16x16 prediction (paragraph 12.3)
static void Intra16Preds(uint8_t* dst,
@@ -291,7 +342,7 @@ static void Intra16Preds(uint8_t* dst,
TrueMotion(I16TM16 + dst, left, top, 16);
}
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// luma 4x4 prediction
#define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2)
@@ -478,12 +529,7 @@ static void Intra4Preds(uint8_t* dst, const uint8_t* top) {
HU4(I4HU4 + dst, top);
}
// default C implementations
VP8Intra4Preds VP8EncPredLuma4 = Intra4Preds;
VP8IntraPreds VP8EncPredLuma16 = Intra16Preds;
VP8IntraPreds VP8EncPredChroma8 = IntraChromaPreds;
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// Metric
static inline int GetSSE(const uint8_t* a, const uint8_t* b, int w, int h) {
@@ -513,22 +559,19 @@ static int SSE4x4(const uint8_t* a, const uint8_t* b) {
return GetSSE(a, b, 4, 4);
}
// default C implementations
VP8Metric VP8SSE16x16 = SSE16x16;
VP8Metric VP8SSE8x8 = SSE8x8;
VP8Metric VP8SSE16x8 = SSE16x8;
VP8Metric VP8SSE4x4 = SSE4x4;
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// Texture distortion
//
// We try to match the spectral content (weighted) between source and
// reconstructed samples.
// Hadamard transform
static void TTransform(const uint8_t* in, int16_t* out) {
// Returns the weighted sum of the absolute value of transformed coefficients.
static int TTransform(const uint8_t* in, const uint16_t* w) {
int sum = 0;
int tmp[16];
int i;
// horizontal pass
for (i = 0; i < 4; ++i, in += BPS) {
const int a0 = (in[0] + in[2]) << 2;
const int a1 = (in[1] + in[3]) << 2;
@@ -539,7 +582,8 @@ static void TTransform(const uint8_t* in, int16_t* out) {
tmp[2 + i * 4] = a3 - a2;
tmp[3 + i * 4] = a0 - a1;
}
for (i = 0; i < 4; ++i) {
// vertical pass
for (i = 0; i < 4; ++i, ++w) {
const int a0 = (tmp[0 + i] + tmp[8 + i]);
const int a1 = (tmp[4 + i] + tmp[12+ i]);
const int a2 = (tmp[4 + i] - tmp[12+ i]);
@@ -548,24 +592,20 @@ static void TTransform(const uint8_t* in, int16_t* out) {
const int b1 = a3 + a2;
const int b2 = a3 - a2;
const int b3 = a0 - a1;
out[ 0 + i] = (b0 + (b0 < 0) + 3) >> 3;
out[ 4 + i] = (b1 + (b1 < 0) + 3) >> 3;
out[ 8 + i] = (b2 + (b2 < 0) + 3) >> 3;
out[12 + i] = (b3 + (b3 < 0) + 3) >> 3;
// abs((b + (b<0) + 3) >> 3) = (abs(b) + 3) >> 3
sum += w[ 0] * ((abs(b0) + 3) >> 3);
sum += w[ 4] * ((abs(b1) + 3) >> 3);
sum += w[ 8] * ((abs(b2) + 3) >> 3);
sum += w[12] * ((abs(b3) + 3) >> 3);
}
return sum;
}
static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
int16_t tmp1[16], tmp2[16];
int k;
int D;
TTransform(a, tmp1);
TTransform(b, tmp2);
D = 0;
for (k = 0; k < 16; ++k)
D += w[k] * (abs(tmp2[k]) - abs(tmp1[k]));
return (abs(D) + 8) >> 4;
const int sum1 = TTransform(a, w);
const int sum2 = TTransform(b, w);
return (abs(sum2 - sum1) + 8) >> 4;
}
static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
@@ -580,10 +620,7 @@ static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
return D;
}
VP8WMetric VP8TDisto4x4 = Disto4x4;
VP8WMetric VP8TDisto16x16 = Disto16x16;
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// Quantization
//
@@ -612,10 +649,7 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16],
return (last >= 0);
}
// default C implementation
VP8QuantizeBlock VP8EncQuantizeBlock = QuantizeBlock;
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// Block copy
static inline void Copy(const uint8_t* src, uint8_t* dst, int size) {
@@ -631,15 +665,104 @@ static void Copy4x4(const uint8_t* src, uint8_t* dst) { Copy(src, dst, 4); }
static void Copy8x8(const uint8_t* src, uint8_t* dst) { Copy(src, dst, 8); }
static void Copy16x16(const uint8_t* src, uint8_t* dst) { Copy(src, dst, 16); }
// default C implementations
VP8BlockCopy VP8Copy4x4 = Copy4x4;
VP8BlockCopy VP8Copy8x8 = Copy8x8;
VP8BlockCopy VP8Copy16x16 = Copy16x16;
//------------------------------------------------------------------------------
// SSE2 detection.
//
//-----------------------------------------------------------------------------
// GetCPUInfo(cpu_info, info_type): executes the 'cpuid' instruction with
// 'info_type' in eax and stores the resulting eax, ebx, ecx, edx values in
// cpu_info[0..3] (in that order).
#if defined(__pic__) && defined(__i386__)
// Position-independent i386 variant: ebx is copied to edi before 'cpuid' and
// swapped back afterwards, so ebx is left unchanged on exit and the value
// cpuid produced in ebx is returned through edi ("=D").
// NOTE(review): presumably needed because ebx is reserved (GOT pointer) in
// 32-bit PIC builds -- confirm against the compiler documentation.
static inline void GetCPUInfo(int cpu_info[4], int info_type) {
__asm__ volatile (
"mov %%ebx, %%edi\n"
"cpuid\n"
"xchg %%edi, %%ebx\n"
: "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
: "a"(info_type));
}
#elif defined(__i386__) || defined(__x86_64__)
// Plain variant: the four registers are read directly through the
// "=a", "=b", "=c" and "=d" output constraints.
static inline void GetCPUInfo(int cpu_info[4], int info_type) {
__asm__ volatile (
"cpuid\n"
: "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
: "a"(info_type));
}
#elif defined(_MSC_VER) // Visual C++
// The compiler intrinsic is used under the same name and argument order.
#define GetCPUInfo __cpuid
#endif
#if defined(__i386__) || defined(__x86_64__) || defined(_MSC_VER)
// Runs GetCPUInfo() with info type 1 and reports whether 'feature' is
// advertised. Returns 1 when the matching bit is set, 0 otherwise (this
// includes any feature other than kSSE2 / kSSE3).
static int x86CPUInfo(CPUFeature feature) {
  int regs[4];        // filled by GetCPUInfo(): [2] and [3] hold the flag words
  int supported = 0;
  GetCPUInfo(regs, 1);
  if (feature == kSSE2) {
    supported = ((regs[3] & 0x04000000) != 0);   // bit 26 of regs[3]
  } else if (feature == kSSE3) {
    supported = ((regs[2] & 0x00000001) != 0);   // bit 0 of regs[2]
  }
  return supported;
}
VP8CPUInfo VP8EncGetCPUInfo = x86CPUInfo;
#else
VP8CPUInfo VP8EncGetCPUInfo = NULL;
#endif
// Speed-critical function pointers. We have to initialize them to the default
// implementations within VP8EncDspInit().
VP8CHisto VP8CollectHistogram;
VP8Idct VP8ITransform;
VP8Fdct VP8FTransform;
VP8WHT VP8ITransformWHT;
VP8WHT VP8FTransformWHT;
VP8Intra4Preds VP8EncPredLuma4;
VP8IntraPreds VP8EncPredLuma16;
VP8IntraPreds VP8EncPredChroma8;
VP8Metric VP8SSE16x16;
VP8Metric VP8SSE8x8;
VP8Metric VP8SSE16x8;
VP8Metric VP8SSE4x4;
VP8WMetric VP8TDisto4x4;
VP8WMetric VP8TDisto16x16;
VP8QuantizeBlock VP8EncQuantizeBlock;
VP8BlockCopy VP8Copy4x4;
VP8BlockCopy VP8Copy8x8;
VP8BlockCopy VP8Copy16x16;
extern void VP8EncDspInitSSE2(void);
// Initializes the encoder's speed-critical function pointers.
// Every pointer is first set to its plain-C implementation; afterwards, if a
// CPU-detection hook is installed, some of them may be replaced by faster
// variants.
void VP8EncDspInit(void) {
  InitTables();

  // Transforms and histogram collection.
  VP8CollectHistogram = CollectHistogram;
  VP8ITransform = ITransform;
  VP8FTransform = FTransform;
  VP8ITransformWHT = ITransformWHT;
  VP8FTransformWHT = FTransformWHT;

  // Intra predictors.
  VP8EncPredLuma4 = Intra4Preds;
  VP8EncPredLuma16 = Intra16Preds;
  VP8EncPredChroma8 = IntraChromaPreds;

  // Distortion metrics.
  VP8SSE16x16 = SSE16x16;
  VP8SSE8x8 = SSE8x8;
  VP8SSE16x8 = SSE16x8;
  VP8SSE4x4 = SSE4x4;
  VP8TDisto4x4 = Disto4x4;
  VP8TDisto16x16 = Disto16x16;

  // Quantization and block copy.
  VP8EncQuantizeBlock = QuantizeBlock;
  VP8Copy4x4 = Copy4x4;
  VP8Copy8x8 = Copy8x8;
  VP8Copy16x16 = Copy16x16;

  // Without a CPU-detection hook the C versions stay in place.
  if (!VP8EncGetCPUInfo) {
    return;
  }

  // Overwrite some pointers with faster versions when the CPU allows it.
  if (VP8EncGetCPUInfo(kSSE2)) {
#if defined(__SSE2__) || defined(_MSC_VER)
    VP8EncDspInitSSE2();
#endif
  }
  if (VP8EncGetCPUInfo(kSSE3)) {
    // later we'll plug some SSE3 variant here
  }
}
#if defined(__cplusplus) || defined(c_plusplus)

834
src/enc/enc_sse2.c Normal file
View File

@@ -0,0 +1,834 @@
// Copyright 2011 Google Inc.
//
// This code is licensed under the same terms as WebM:
// Software License Agreement: http://www.webmproject.org/license/software/
// Additional IP Rights Grant: http://www.webmproject.org/license/additional/
// -----------------------------------------------------------------------------
//
// SSE2 version of speed-critical functions.
//
// Author: Christian Duvivier (cduvivier@google.com)
#if defined(__SSE2__) || defined(_MSC_VER)
#include <emmintrin.h>
#include <string.h>
#include "vp8enci.h"
#if defined(__cplusplus) || defined(c_plusplus)
extern "C" {
#endif
//------------------------------------------------------------------------------
// Compute susceptibility based on DCT-coeff histograms:
// the higher, the "easier" the macroblock is to compress.
// For each block index in [start_block, end_block), forward-transforms the
// residual between 'ref' and 'pred' at offset VP8Scan[index], maps every
// coefficient to the bin min(abs(coeff) >> 2, MAX_COEFF_THRESH) and counts
// it. The resulting histogram is handed to VP8GetAlpha(), whose result is
// returned.
static int CollectHistogramSSE2(const uint8_t* ref, const uint8_t* pred,
                                int start_block, int end_block) {
  int histo[MAX_COEFF_THRESH + 1] = { 0 };
  int16_t coeffs[16];
  const __m128i bin_cap = _mm_set1_epi16(MAX_COEFF_THRESH);
  int block;

  for (block = start_block; block < end_block; ++block) {
    int half, idx;
    VP8FTransform(ref + VP8Scan[block], pred + VP8Scan[block], coeffs);

    // Turn the coefficients into bin indices in place, eight lanes at a time.
    for (half = 0; half < 16; half += 8) {
      __m128i* const lanes = (__m128i*)&coeffs[half];
      const __m128i raw = _mm_loadu_si128(lanes);
      // sign is 0x0000 for non-negative lanes and 0xffff for negative ones,
      // so (raw ^ sign) - sign is the absolute value.
      const __m128i sign = _mm_srai_epi16(raw, 15);
      const __m128i magnitude = _mm_sub_epi16(_mm_xor_si128(raw, sign), sign);
      const __m128i scaled = _mm_srai_epi16(magnitude, 2);
      _mm_storeu_si128(lanes, _mm_min_epi16(scaled, bin_cap));
    }

    // Accumulate the bins.
    for (idx = 0; idx < 16; ++idx) {
      ++histo[coeffs[idx]];
    }
  }
  return VP8GetAlpha(histo);
}
//------------------------------------------------------------------------------
// Transforms (Paragraph 14.4)
// Loads four consecutive bytes starting at 'src' into the low 32 bits of a
// vector (upper bits are zero). memcpy() is used instead of an int* cast so
// that no alignment or aliasing assumption is made about 'src'.
static __m128i LoadFourBytesSSE2(const uint8_t* const src) {
  int32_t bits;
  memcpy(&bits, src, sizeof(bits));
  return _mm_cvtsi32_si128(bits);
}

// Stores the low 32 bits of 'v' as four consecutive bytes starting at 'dst'.
static void StoreFourBytesSSE2(uint8_t* const dst, const __m128i v) {
  const int32_t bits = _mm_cvtsi128_si32(v);
  memcpy(dst, &bits, sizeof(bits));
}

// Does one or two inverse transforms.
//   ref:    reference pixels, rows are BPS bytes apart.
//   in:     16 coefficients (32 when 'do_two' is non-zero, the second block
//           following the first one).
//   dst:    receives ref + inverse transform, saturated to 8 bits; rows are
//           BPS bytes apart. Four pixels per row are written, or eight when
//           'do_two' is non-zero.
//   do_two: non-zero to process two horizontally adjacent 4x4 blocks.
static void ITransformSSE2(const uint8_t* ref, const int16_t* in, uint8_t* dst,
                           int do_two) {
  // This implementation makes use of 16-bit fixed point versions of two
  // multiply constants:
  //    K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16
  //    K2 = sqrt(2) * sin (pi/8) ~= 35468 / 2^16
  //
  // To be able to use signed 16-bit integers, we use the following trick to
  // have constants within range:
  // - Associated constants are obtained by subtracting the 16-bit fixed point
  //   version of one:
  //      k = K - (1 << 16)  =>  K = k + (1 << 16)
  //      K1 = 85627  =>  k1 =  20091
  //      K2 = 35468  =>  k2 = -30068
  // - The multiplication of a variable by a constant become the sum of the
  //   variable and the multiplication of that variable by the associated
  //   constant:
  //      (x * K) >> 16 = (x * (k + (1 << 16))) >> 16 = ((x * k ) >> 16) + x
  const __m128i k1 = _mm_set1_epi16(20091);
  const __m128i k2 = _mm_set1_epi16(-30068);
  __m128i T0, T1, T2, T3;

  // Load and concatenate the transform coefficients (we'll do two inverse
  // transforms in parallel). In the case of only one inverse transform, the
  // second half of the vectors is left at the zero written by
  // _mm_loadl_epi64(); the corresponding results are never stored.
  __m128i in0, in1, in2, in3;
  {
    in0 = _mm_loadl_epi64((__m128i*)&in[0]);
    in1 = _mm_loadl_epi64((__m128i*)&in[4]);
    in2 = _mm_loadl_epi64((__m128i*)&in[8]);
    in3 = _mm_loadl_epi64((__m128i*)&in[12]);
    // a00 a10 a20 a30   x x x x
    // a01 a11 a21 a31   x x x x
    // a02 a12 a22 a32   x x x x
    // a03 a13 a23 a33   x x x x
    if (do_two) {
      const __m128i inB0 = _mm_loadl_epi64((__m128i*)&in[16]);
      const __m128i inB1 = _mm_loadl_epi64((__m128i*)&in[20]);
      const __m128i inB2 = _mm_loadl_epi64((__m128i*)&in[24]);
      const __m128i inB3 = _mm_loadl_epi64((__m128i*)&in[28]);
      in0 = _mm_unpacklo_epi64(in0, inB0);
      in1 = _mm_unpacklo_epi64(in1, inB1);
      in2 = _mm_unpacklo_epi64(in2, inB2);
      in3 = _mm_unpacklo_epi64(in3, inB3);
      // a00 a10 a20 a30   b00 b10 b20 b30
      // a01 a11 a21 a31   b01 b11 b21 b31
      // a02 a12 a22 a32   b02 b12 b22 b32
      // a03 a13 a23 a33   b03 b13 b23 b33
    }
  }

  // Vertical pass and subsequent transpose.
  {
    // First pass, c and d calculations are longer because of the "trick"
    // multiplications.
    const __m128i a = _mm_add_epi16(in0, in2);
    const __m128i b = _mm_sub_epi16(in0, in2);
    // c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3
    const __m128i c1 = _mm_mulhi_epi16(in1, k2);
    const __m128i c2 = _mm_mulhi_epi16(in3, k1);
    const __m128i c3 = _mm_sub_epi16(in1, in3);
    const __m128i c4 = _mm_sub_epi16(c1, c2);
    const __m128i c = _mm_add_epi16(c3, c4);
    // d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3
    const __m128i d1 = _mm_mulhi_epi16(in1, k1);
    const __m128i d2 = _mm_mulhi_epi16(in3, k2);
    const __m128i d3 = _mm_add_epi16(in1, in3);
    const __m128i d4 = _mm_add_epi16(d1, d2);
    const __m128i d = _mm_add_epi16(d3, d4);

    // Second pass.
    const __m128i tmp0 = _mm_add_epi16(a, d);
    const __m128i tmp1 = _mm_add_epi16(b, c);
    const __m128i tmp2 = _mm_sub_epi16(b, c);
    const __m128i tmp3 = _mm_sub_epi16(a, d);

    // Transpose the two 4x4.
    // a00 a01 a02 a03   b00 b01 b02 b03
    // a10 a11 a12 a13   b10 b11 b12 b13
    // a20 a21 a22 a23   b20 b21 b22 b23
    // a30 a31 a32 a33   b30 b31 b32 b33
    const __m128i transpose0_0 = _mm_unpacklo_epi16(tmp0, tmp1);
    const __m128i transpose0_1 = _mm_unpacklo_epi16(tmp2, tmp3);
    const __m128i transpose0_2 = _mm_unpackhi_epi16(tmp0, tmp1);
    const __m128i transpose0_3 = _mm_unpackhi_epi16(tmp2, tmp3);
    // a00 a10 a01 a11   a02 a12 a03 a13
    // a20 a30 a21 a31   a22 a32 a23 a33
    // b00 b10 b01 b11   b02 b12 b03 b13
    // b20 b30 b21 b31   b22 b32 b23 b33
    const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
    const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3);
    const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
    const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3);
    // a00 a10 a20 a30 a01 a11 a21 a31
    // b00 b10 b20 b30 b01 b11 b21 b31
    // a02 a12 a22 a32 a03 a13 a23 a33
    // b02 b12 b22 b32 b03 b13 b23 b33
    T0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1);
    T1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1);
    T2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3);
    T3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3);
    // a00 a10 a20 a30   b00 b10 b20 b30
    // a01 a11 a21 a31   b01 b11 b21 b31
    // a02 a12 a22 a32   b02 b12 b22 b32
    // a03 a13 a23 a33   b03 b13 b23 b33
  }

  // Horizontal pass and subsequent transpose.
  {
    // First pass, c and d calculations are longer because of the "trick"
    // multiplications.
    // The rounding constant for the final '>> 3' is folded into the first
    // input, from where it propagates to all four outputs through a and b.
    const __m128i four = _mm_set1_epi16(4);
    const __m128i dc = _mm_add_epi16(T0, four);
    const __m128i a = _mm_add_epi16(dc, T2);
    const __m128i b = _mm_sub_epi16(dc, T2);
    // c = MUL(T1, K2) - MUL(T3, K1) = MUL(T1, k2) - MUL(T3, k1) + T1 - T3
    const __m128i c1 = _mm_mulhi_epi16(T1, k2);
    const __m128i c2 = _mm_mulhi_epi16(T3, k1);
    const __m128i c3 = _mm_sub_epi16(T1, T3);
    const __m128i c4 = _mm_sub_epi16(c1, c2);
    const __m128i c = _mm_add_epi16(c3, c4);
    // d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3
    const __m128i d1 = _mm_mulhi_epi16(T1, k1);
    const __m128i d2 = _mm_mulhi_epi16(T3, k2);
    const __m128i d3 = _mm_add_epi16(T1, T3);
    const __m128i d4 = _mm_add_epi16(d1, d2);
    const __m128i d = _mm_add_epi16(d3, d4);

    // Second pass.
    const __m128i tmp0 = _mm_add_epi16(a, d);
    const __m128i tmp1 = _mm_add_epi16(b, c);
    const __m128i tmp2 = _mm_sub_epi16(b, c);
    const __m128i tmp3 = _mm_sub_epi16(a, d);
    const __m128i shifted0 = _mm_srai_epi16(tmp0, 3);
    const __m128i shifted1 = _mm_srai_epi16(tmp1, 3);
    const __m128i shifted2 = _mm_srai_epi16(tmp2, 3);
    const __m128i shifted3 = _mm_srai_epi16(tmp3, 3);

    // Transpose the two 4x4.
    // a00 a01 a02 a03   b00 b01 b02 b03
    // a10 a11 a12 a13   b10 b11 b12 b13
    // a20 a21 a22 a23   b20 b21 b22 b23
    // a30 a31 a32 a33   b30 b31 b32 b33
    const __m128i transpose0_0 = _mm_unpacklo_epi16(shifted0, shifted1);
    const __m128i transpose0_1 = _mm_unpacklo_epi16(shifted2, shifted3);
    const __m128i transpose0_2 = _mm_unpackhi_epi16(shifted0, shifted1);
    const __m128i transpose0_3 = _mm_unpackhi_epi16(shifted2, shifted3);
    // a00 a10 a01 a11   a02 a12 a03 a13
    // a20 a30 a21 a31   a22 a32 a23 a33
    // b00 b10 b01 b11   b02 b12 b03 b13
    // b20 b30 b21 b31   b22 b32 b23 b33
    const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
    const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3);
    const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
    const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3);
    // a00 a10 a20 a30 a01 a11 a21 a31
    // b00 b10 b20 b30 b01 b11 b21 b31
    // a02 a12 a22 a32 a03 a13 a23 a33
    // b02 b12 b22 b32 b03 b13 b23 b33
    T0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1);
    T1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1);
    T2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3);
    T3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3);
    // a00 a10 a20 a30   b00 b10 b20 b30
    // a01 a11 a21 a31   b01 b11 b21 b31
    // a02 a12 a22 a32   b02 b12 b22 b32
    // a03 a13 a23 a33   b03 b13 b23 b33
  }

  // Add inverse transform to 'ref' and store.
  {
    const __m128i zero = _mm_set1_epi16(0);
    // Load the reference(s).
    __m128i ref0, ref1, ref2, ref3;
    if (do_two) {
      // Load eight bytes/pixels per line.
      ref0 = _mm_loadl_epi64((__m128i*)&ref[0 * BPS]);
      ref1 = _mm_loadl_epi64((__m128i*)&ref[1 * BPS]);
      ref2 = _mm_loadl_epi64((__m128i*)&ref[2 * BPS]);
      ref3 = _mm_loadl_epi64((__m128i*)&ref[3 * BPS]);
    } else {
      // Load four bytes/pixels per line.
      ref0 = LoadFourBytesSSE2(&ref[0 * BPS]);
      ref1 = LoadFourBytesSSE2(&ref[1 * BPS]);
      ref2 = LoadFourBytesSSE2(&ref[2 * BPS]);
      ref3 = LoadFourBytesSSE2(&ref[3 * BPS]);
    }
    // Convert to 16b.
    ref0 = _mm_unpacklo_epi8(ref0, zero);
    ref1 = _mm_unpacklo_epi8(ref1, zero);
    ref2 = _mm_unpacklo_epi8(ref2, zero);
    ref3 = _mm_unpacklo_epi8(ref3, zero);
    // Add the inverse transform(s).
    ref0 = _mm_add_epi16(ref0, T0);
    ref1 = _mm_add_epi16(ref1, T1);
    ref2 = _mm_add_epi16(ref2, T2);
    ref3 = _mm_add_epi16(ref3, T3);
    // Unsigned saturate to 8b.
    ref0 = _mm_packus_epi16(ref0, ref0);
    ref1 = _mm_packus_epi16(ref1, ref1);
    ref2 = _mm_packus_epi16(ref2, ref2);
    ref3 = _mm_packus_epi16(ref3, ref3);
    // Store the results.
    if (do_two) {
      // Store eight bytes/pixels per line.
      _mm_storel_epi64((__m128i*)&dst[0 * BPS], ref0);
      _mm_storel_epi64((__m128i*)&dst[1 * BPS], ref1);
      _mm_storel_epi64((__m128i*)&dst[2 * BPS], ref2);
      _mm_storel_epi64((__m128i*)&dst[3 * BPS], ref3);
    } else {
      // Store four bytes/pixels per line.
      StoreFourBytesSSE2(&dst[0 * BPS], ref0);
      StoreFourBytesSSE2(&dst[1 * BPS], ref1);
      StoreFourBytesSSE2(&dst[2 * BPS], ref2);
      StoreFourBytesSSE2(&dst[3 * BPS], ref3);
    }
  }
}
// Forward transform of the 4x4 residual (src - ref).
//   src, ref: pixel blocks whose rows are BPS bytes apart.
//   out:      receives the 16 resulting coefficients.
// Note: every row is fetched with an 8-byte load although only the first
// four pixels are used, so four bytes past the end of each row are read.
static void FTransformSSE2(const uint8_t* src, const uint8_t* ref,
int16_t* out) {
const __m128i zero = _mm_setzero_si128();
const __m128i seven = _mm_set1_epi16(7);
const __m128i k7500 = _mm_set1_epi32(7500);
const __m128i k14500 = _mm_set1_epi32(14500);
const __m128i k51000 = _mm_set1_epi32(51000);
// 12000 is the rounding constant; the extra (1 << 16) becomes a '+1' after
// the '>> 16' below and is cancelled by the compare result when needed.
const __m128i k12000_plus_one = _mm_set1_epi32(12000 + (1 << 16));
// _mm_set_epi16() takes its arguments from the highest lane to the lowest,
// so each 32-bit pair below is (low, high) = (2217, 5352) and (-5352, 2217).
const __m128i k5352_2217 = _mm_set_epi16(5352, 2217, 5352, 2217,
5352, 2217, 5352, 2217);
const __m128i k2217_5352 = _mm_set_epi16(2217, -5352, 2217, -5352,
2217, -5352, 2217, -5352);
__m128i v01, v32;
// Difference between src and ref and initial transpose.
{
// Load src and convert to 16b.
const __m128i src0 = _mm_loadl_epi64((__m128i*)&src[0 * BPS]);
const __m128i src1 = _mm_loadl_epi64((__m128i*)&src[1 * BPS]);
const __m128i src2 = _mm_loadl_epi64((__m128i*)&src[2 * BPS]);
const __m128i src3 = _mm_loadl_epi64((__m128i*)&src[3 * BPS]);
const __m128i src_0 = _mm_unpacklo_epi8(src0, zero);
const __m128i src_1 = _mm_unpacklo_epi8(src1, zero);
const __m128i src_2 = _mm_unpacklo_epi8(src2, zero);
const __m128i src_3 = _mm_unpacklo_epi8(src3, zero);
// Load ref and convert to 16b.
const __m128i ref0 = _mm_loadl_epi64((__m128i*)&ref[0 * BPS]);
const __m128i ref1 = _mm_loadl_epi64((__m128i*)&ref[1 * BPS]);
const __m128i ref2 = _mm_loadl_epi64((__m128i*)&ref[2 * BPS]);
const __m128i ref3 = _mm_loadl_epi64((__m128i*)&ref[3 * BPS]);
const __m128i ref_0 = _mm_unpacklo_epi8(ref0, zero);
const __m128i ref_1 = _mm_unpacklo_epi8(ref1, zero);
const __m128i ref_2 = _mm_unpacklo_epi8(ref2, zero);
const __m128i ref_3 = _mm_unpacklo_epi8(ref3, zero);
// Compute difference.
const __m128i diff0 = _mm_sub_epi16(src_0, ref_0);
const __m128i diff1 = _mm_sub_epi16(src_1, ref_1);
const __m128i diff2 = _mm_sub_epi16(src_2, ref_2);
const __m128i diff3 = _mm_sub_epi16(src_3, ref_3);
// Transpose.
// (the four upper lanes, marked 'x', come from the extra loaded bytes and
// are dropped by the _mm_unpacklo_epi16() calls below)
// 00 01 02 03 x x x x
// 10 11 12 13 x x x x
// 20 21 22 23 x x x x
// 30 31 32 33 x x x x
const __m128i transpose0_0 = _mm_unpacklo_epi16(diff0, diff1);
const __m128i transpose0_1 = _mm_unpacklo_epi16(diff2, diff3);
// 00 10 01 11 02 12 03 13
// 20 30 21 31 22 32 23 33
const __m128i v23 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
v01 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
// Swap the two 64-bit halves of v23 so that column 3 faces column 0 and
// column 2 faces column 1 in the add/sub below.
v32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2));
// v23: a02 a12 a22 a32 a03 a13 a23 a33
// v01: a00 a10 a20 a30 a01 a11 a21 a31
// v32: a03 a13 a23 a33 a02 a12 a22 a32
}
// First pass and subsequent transpose.
{
// Same operations are done on the (0,3) and (1,2) pairs.
// b0 = (a0 + a3) << 3
// b1 = (a1 + a2) << 3
// b3 = (a0 - a3) << 3
// b2 = (a1 - a2) << 3
const __m128i a01 = _mm_add_epi16(v01, v32);
const __m128i a32 = _mm_sub_epi16(v01, v32);
const __m128i b01 = _mm_slli_epi16(a01, 3);
const __m128i b32 = _mm_slli_epi16(a32, 3);
// Bring the upper halves (b1, b2) down to the lower lanes.
const __m128i b11 = _mm_unpackhi_epi64(b01, b01);
const __m128i b22 = _mm_unpackhi_epi64(b32, b32);
// e0 = b0 + b1
// e2 = b0 - b1
const __m128i e0 = _mm_add_epi16(b01, b11);
const __m128i e2 = _mm_sub_epi16(b01, b11);
const __m128i e02 = _mm_unpacklo_epi64(e0, e2);
// e1 = (b3 * 5352 + b2 * 2217 + 14500) >> 12
// e3 = (b3 * 2217 - b2 * 5352 + 7500) >> 12
// b23 interleaves (b2, b3) so that one _mm_madd_epi16() yields each sum.
const __m128i b23 = _mm_unpacklo_epi16(b22, b32);
const __m128i c1 = _mm_madd_epi16(b23, k5352_2217);
const __m128i c3 = _mm_madd_epi16(b23, k2217_5352);
const __m128i d1 = _mm_add_epi32(c1, k14500);
const __m128i d3 = _mm_add_epi32(c3, k7500);
const __m128i e1 = _mm_srai_epi32(d1, 12);
const __m128i e3 = _mm_srai_epi32(d3, 12);
const __m128i e13 = _mm_packs_epi32(e1, e3);
// Transpose.
// 00 01 02 03 20 21 22 23
// 10 11 12 13 30 31 32 33
const __m128i transpose0_0 = _mm_unpacklo_epi16(e02, e13);
const __m128i transpose0_1 = _mm_unpackhi_epi16(e02, e13);
// 00 10 01 11 02 12 03 13
// 20 30 21 31 22 32 23 33
const __m128i v23 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
v01 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
v32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2));
// v23: 02 12 22 32 03 13 23 33
// v01: 00 10 20 30 01 11 21 31
// v32: 03 13 23 33 02 12 22 32
}
// Second pass
{
// Same operations are done on the (0,3) and (1,2) pairs.
// a0 = v0 + v3
// a1 = v1 + v2
// a3 = v0 - v3
// a2 = v1 - v2
const __m128i a01 = _mm_add_epi16(v01, v32);
const __m128i a32 = _mm_sub_epi16(v01, v32);
const __m128i a11 = _mm_unpackhi_epi64(a01, a01);
const __m128i a22 = _mm_unpackhi_epi64(a32, a32);
// d0 = (a0 + a1 + 7) >> 4;
// d2 = (a0 - a1 + 7) >> 4;
const __m128i b0 = _mm_add_epi16(a01, a11);
const __m128i b2 = _mm_sub_epi16(a01, a11);
const __m128i c0 = _mm_add_epi16(b0, seven);
const __m128i c2 = _mm_add_epi16(b2, seven);
const __m128i d0 = _mm_srai_epi16(c0, 4);
const __m128i d2 = _mm_srai_epi16(c2, 4);
// f1 = ((a3 * 5352 + a2 * 2217 + 12000) >> 16)   (plus one, see below)
// f3 = ((a3 * 2217 - a2 * 5352 + 51000) >> 16)
const __m128i b23 = _mm_unpacklo_epi16(a22, a32);
const __m128i c1 = _mm_madd_epi16(b23, k5352_2217);
const __m128i c3 = _mm_madd_epi16(b23, k2217_5352);
const __m128i d1 = _mm_add_epi32(c1, k12000_plus_one);
const __m128i d3 = _mm_add_epi32(c3, k51000);
const __m128i e1 = _mm_srai_epi32(d1, 16);
const __m128i e3 = _mm_srai_epi32(d3, 16);
const __m128i f1 = _mm_packs_epi32(e1, e1);
const __m128i f3 = _mm_packs_epi32(e3, e3);
// f1 = f1 + (a3 != 0);
// The compare will return (0xffff, 0) for (==0, !=0). To turn that into the
// desired (0, 1), we add one earlier through k12000_plus_one.
const __m128i g1 = _mm_add_epi16(f1, _mm_cmpeq_epi16(a32, zero));
// Only the low four lanes of each vector hold results; write one output
// row (four coefficients) per store.
_mm_storel_epi64((__m128i*)&out[ 0], d0);
_mm_storel_epi64((__m128i*)&out[ 4], g1);
_mm_storel_epi64((__m128i*)&out[ 8], d2);
_mm_storel_epi64((__m128i*)&out[12], f3);
}
}
//------------------------------------------------------------------------------
// Metric
// Returns the sum of squared differences between the 4x4 pixel blocks 'a'
// and 'b' (rows are BPS bytes apart). Each row is fetched with an 8-byte
// load; only its first four pixels take part in the result.
static int SSE4x4SSE2(const uint8_t* a, const uint8_t* b) {
  const __m128i zero = _mm_setzero_si128();
  int32_t partial[4];

  // Pack rows (0,1) and rows (2,3) of each block side by side, then widen
  // the resulting eight pixels to 16 bits.
  const __m128i a_top = _mm_unpacklo_epi8(
      _mm_unpacklo_epi32(_mm_loadl_epi64((__m128i*)&a[BPS * 0]),
                         _mm_loadl_epi64((__m128i*)&a[BPS * 1])),
      zero);
  const __m128i a_bottom = _mm_unpacklo_epi8(
      _mm_unpacklo_epi32(_mm_loadl_epi64((__m128i*)&a[BPS * 2]),
                         _mm_loadl_epi64((__m128i*)&a[BPS * 3])),
      zero);
  const __m128i b_top = _mm_unpacklo_epi8(
      _mm_unpacklo_epi32(_mm_loadl_epi64((__m128i*)&b[BPS * 0]),
                         _mm_loadl_epi64((__m128i*)&b[BPS * 1])),
      zero);
  const __m128i b_bottom = _mm_unpacklo_epi8(
      _mm_unpacklo_epi32(_mm_loadl_epi64((__m128i*)&b[BPS * 2]),
                         _mm_loadl_epi64((__m128i*)&b[BPS * 3])),
      zero);

  // (a-b)^2 = (abs(a-b))^2 = (sat8(a-b) + sat8(b-a))^2: for every pixel one
  // of the two saturated differences is zero, so squaring them separately
  // and adding gives the same value.
  // TODO(cduvivier): Dissassemble and figure out why this is fastest. We don't
  //                  need absolute values, there is no need to do calculation
  //                  in 8bit as we are already in 16bit, ... Yet this is what
  //                  benchmarks the fastest!
  const __m128i top_pos = _mm_subs_epu8(a_top, b_top);
  const __m128i top_neg = _mm_subs_epu8(b_top, a_top);
  const __m128i bottom_pos = _mm_subs_epu8(a_bottom, b_bottom);
  const __m128i bottom_neg = _mm_subs_epu8(b_bottom, a_bottom);

  // Square every lane and reduce to four 32-bit partial sums.
  const __m128i top_sq = _mm_add_epi32(_mm_madd_epi16(top_pos, top_pos),
                                       _mm_madd_epi16(top_neg, top_neg));
  const __m128i bottom_sq =
      _mm_add_epi32(_mm_madd_epi16(bottom_pos, bottom_pos),
                    _mm_madd_epi16(bottom_neg, bottom_neg));
  const __m128i total = _mm_add_epi32(top_sq, bottom_sq);

  _mm_storeu_si128((__m128i*)partial, total);
  return (partial[3] + partial[2] + partial[1] + partial[0]);
}
//------------------------------------------------------------------------------
// Texture distortion
//
// We try to match the spectral content (weighted) between source and
// reconstructed samples.
// Hadamard transform
// Applies the same 4x4 transform to 'inA' and 'inB' in parallel and returns
// the difference (A minus B) between the weighted sums of the absolute
// values of their transformed coefficients.
//   inA, inB: pixel blocks whose rows are BPS bytes apart. Each row is
//             fetched with an 8-byte load; only the first four pixels are
//             used.
//   w:        16 weights, one per coefficient.
static int TTransformSSE2(const uint8_t* inA, const uint8_t* inB,
const uint16_t* const w) {
int32_t sum[4];
__m128i tmp_0, tmp_1, tmp_2, tmp_3;
const __m128i zero = _mm_setzero_si128();
const __m128i one = _mm_set1_epi16(1);
const __m128i three = _mm_set1_epi16(3);
// Load, combine and transpose inputs.
{
const __m128i inA_0 = _mm_loadl_epi64((__m128i*)&inA[BPS * 0]);
const __m128i inA_1 = _mm_loadl_epi64((__m128i*)&inA[BPS * 1]);
const __m128i inA_2 = _mm_loadl_epi64((__m128i*)&inA[BPS * 2]);
const __m128i inA_3 = _mm_loadl_epi64((__m128i*)&inA[BPS * 3]);
const __m128i inB_0 = _mm_loadl_epi64((__m128i*)&inB[BPS * 0]);
const __m128i inB_1 = _mm_loadl_epi64((__m128i*)&inB[BPS * 1]);
const __m128i inB_2 = _mm_loadl_epi64((__m128i*)&inB[BPS * 2]);
const __m128i inB_3 = _mm_loadl_epi64((__m128i*)&inB[BPS * 3]);
// Combine inA and inB (we'll do two transforms in parallel).
const __m128i inAB_0 = _mm_unpacklo_epi8(inA_0, inB_0);
const __m128i inAB_1 = _mm_unpacklo_epi8(inA_1, inB_1);
const __m128i inAB_2 = _mm_unpacklo_epi8(inA_2, inB_2);
const __m128i inAB_3 = _mm_unpacklo_epi8(inA_3, inB_3);
// ('x' marks the bytes coming from the four extra pixels loaded per row)
// a00 b00 a01 b01 a02 b02 a03 b03 x x x x x x x x
// a10 b10 a11 b11 a12 b12 a13 b13 x x x x x x x x
// a20 b20 a21 b21 a22 b22 a23 b23 x x x x x x x x
// a30 b30 a31 b31 a32 b32 a33 b33 x x x x x x x x
// Transpose the two 4x4, discarding the extra upper bytes.
const __m128i transpose0_0 = _mm_unpacklo_epi8(inAB_0, inAB_2);
const __m128i transpose0_1 = _mm_unpacklo_epi8(inAB_1, inAB_3);
// a00 a20 b00 b20 a01 a21 b01 b21 a02 a22 b02 b22 a03 a23 b03 b23
// a10 a30 b10 b30 a11 a31 b11 b31 a12 a32 b12 b32 a13 a33 b13 b33
const __m128i transpose1_0 = _mm_unpacklo_epi8(transpose0_0, transpose0_1);
const __m128i transpose1_1 = _mm_unpackhi_epi8(transpose0_0, transpose0_1);
// a00 a10 a20 a30 b00 b10 b20 b30 a01 a11 a21 a31 b01 b11 b21 b31
// a02 a12 a22 a32 b02 b12 b22 b32 a03 a13 a23 a33 b03 b13 b23 b33
// Convert to 16b.
tmp_0 = _mm_unpacklo_epi8(transpose1_0, zero);
tmp_1 = _mm_unpackhi_epi8(transpose1_0, zero);
tmp_2 = _mm_unpacklo_epi8(transpose1_1, zero);
tmp_3 = _mm_unpackhi_epi8(transpose1_1, zero);
// a00 a10 a20 a30 b00 b10 b20 b30
// a01 a11 a21 a31 b01 b11 b21 b31
// a02 a12 a22 a32 b02 b12 b22 b32
// a03 a13 a23 a33 b03 b13 b23 b33
}
// Horizontal pass and subsequent transpose.
{
// Calculate a and b (two 4x4 at once).
const __m128i a0 = _mm_slli_epi16(_mm_add_epi16(tmp_0, tmp_2), 2);
const __m128i a1 = _mm_slli_epi16(_mm_add_epi16(tmp_1, tmp_3), 2);
const __m128i a2 = _mm_slli_epi16(_mm_sub_epi16(tmp_1, tmp_3), 2);
const __m128i a3 = _mm_slli_epi16(_mm_sub_epi16(tmp_0, tmp_2), 2);
// b0_extra = (a0 != 0);
// (the compare yields 0xffff where a0 == 0; and-not with 'one' turns that
// into 0 for those lanes and 1 for all others)
const __m128i b0_extra = _mm_andnot_si128(_mm_cmpeq_epi16 (a0, zero), one);
const __m128i b0_base = _mm_add_epi16(a0, a1);
const __m128i b1 = _mm_add_epi16(a3, a2);
const __m128i b2 = _mm_sub_epi16(a3, a2);
const __m128i b3 = _mm_sub_epi16(a0, a1);
const __m128i b0 = _mm_add_epi16(b0_base, b0_extra);
// a00 a01 a02 a03 b00 b01 b02 b03
// a10 a11 a12 a13 b10 b11 b12 b13
// a20 a21 a22 a23 b20 b21 b22 b23
// a30 a31 a32 a33 b30 b31 b32 b33
// Transpose the two 4x4.
const __m128i transpose0_0 = _mm_unpacklo_epi16(b0, b1);
const __m128i transpose0_1 = _mm_unpacklo_epi16(b2, b3);
const __m128i transpose0_2 = _mm_unpackhi_epi16(b0, b1);
const __m128i transpose0_3 = _mm_unpackhi_epi16(b2, b3);
// a00 a10 a01 a11 a02 a12 a03 a13
// a20 a30 a21 a31 a22 a32 a23 a33
// b00 b10 b01 b11 b02 b12 b03 b13
// b20 b30 b21 b31 b22 b32 b23 b33
const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3);
const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3);
// a00 a10 a20 a30 a01 a11 a21 a31
// b00 b10 b20 b30 b01 b11 b21 b31
// a02 a12 a22 a32 a03 a13 a23 a33
// b02 b12 b22 b32 b03 b13 b23 b33
tmp_0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1);
tmp_1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1);
tmp_2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3);
tmp_3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3);
// a00 a10 a20 a30 b00 b10 b20 b30
// a01 a11 a21 a31 b01 b11 b21 b31
// a02 a12 a22 a32 b02 b12 b22 b32
// a03 a13 a23 a33 b03 b13 b23 b33
}
// Vertical pass and difference of weighted sums.
{
// Load all inputs.
// TODO(cduvivier): Make variable declarations and allocations aligned so
// we can use _mm_load_si128 instead of _mm_loadu_si128.
const __m128i w_0 = _mm_loadu_si128((__m128i*)&w[0]);
const __m128i w_8 = _mm_loadu_si128((__m128i*)&w[8]);
// Calculate a and b (two 4x4 at once).
const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2);
const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3);
const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3);
const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2);
const __m128i b0 = _mm_add_epi16(a0, a1);
const __m128i b1 = _mm_add_epi16(a3, a2);
const __m128i b2 = _mm_sub_epi16(a3, a2);
const __m128i b3 = _mm_sub_epi16(a0, a1);
// Separate the transforms of inA and inB.
// (low halves hold block A, high halves hold block B; each resulting vector
// carries two rows of four coefficients, matching w[0..7] and w[8..15])
__m128i A_b0 = _mm_unpacklo_epi64(b0, b1);
__m128i A_b2 = _mm_unpacklo_epi64(b2, b3);
__m128i B_b0 = _mm_unpackhi_epi64(b0, b1);
__m128i B_b2 = _mm_unpackhi_epi64(b2, b3);
{
// sign(b) = b >> 15 (0x0000 if positive, 0xffff if negative)
const __m128i sign_A_b0 = _mm_srai_epi16(A_b0, 15);
const __m128i sign_A_b2 = _mm_srai_epi16(A_b2, 15);
const __m128i sign_B_b0 = _mm_srai_epi16(B_b0, 15);
const __m128i sign_B_b2 = _mm_srai_epi16(B_b2, 15);
// b = abs(b) = (b ^ sign) - sign
A_b0 = _mm_xor_si128(A_b0, sign_A_b0);
A_b2 = _mm_xor_si128(A_b2, sign_A_b2);
B_b0 = _mm_xor_si128(B_b0, sign_B_b0);
B_b2 = _mm_xor_si128(B_b2, sign_B_b2);
A_b0 = _mm_sub_epi16(A_b0, sign_A_b0);
A_b2 = _mm_sub_epi16(A_b2, sign_A_b2);
B_b0 = _mm_sub_epi16(B_b0, sign_B_b0);
B_b2 = _mm_sub_epi16(B_b2, sign_B_b2);
}
// b = abs(b) + 3
A_b0 = _mm_add_epi16(A_b0, three);
A_b2 = _mm_add_epi16(A_b2, three);
B_b0 = _mm_add_epi16(B_b0, three);
B_b2 = _mm_add_epi16(B_b2, three);
// abs((b + (b<0) + 3) >> 3) = (abs(b) + 3) >> 3
// b = (abs(b) + 3) >> 3
A_b0 = _mm_srai_epi16(A_b0, 3);
A_b2 = _mm_srai_epi16(A_b2, 3);
B_b0 = _mm_srai_epi16(B_b0, 3);
B_b2 = _mm_srai_epi16(B_b2, 3);
// weighted sums
// (_mm_madd_epi16() multiplies lane by lane and adds adjacent pairs into
// 32-bit lanes)
A_b0 = _mm_madd_epi16(A_b0, w_0);
A_b2 = _mm_madd_epi16(A_b2, w_8);
B_b0 = _mm_madd_epi16(B_b0, w_0);
B_b2 = _mm_madd_epi16(B_b2, w_8);
A_b0 = _mm_add_epi32(A_b0, A_b2);
B_b0 = _mm_add_epi32(B_b0, B_b2);
// difference of weighted sums
A_b0 = _mm_sub_epi32(A_b0, B_b0);
_mm_storeu_si128((__m128i*)&sum[0], A_b0);
}
// Final horizontal reduction of the four 32-bit partial differences.
return sum[0] + sum[1] + sum[2] + sum[3];
}
// Texture distortion between the 4x4 blocks 'a' and 'b' using weights 'w':
// the magnitude of the value returned by TTransformSSE2(), divided by 16
// with rounding.
static int Disto4x4SSE2(const uint8_t* const a, const uint8_t* const b,
                        const uint16_t* const w) {
  int magnitude = TTransformSSE2(a, b, w);
  if (magnitude < 0) {
    magnitude = -magnitude;
  }
  return (magnitude + 8) >> 4;
}
// Texture distortion of a 16x16 area: the sum of Disto4x4SSE2() over its
// sixteen 4x4 sub-blocks, visited row by row. The same weights 'w' are used
// for every sub-block. Rows of 'a' and 'b' are BPS bytes apart.
static int Disto16x16SSE2(const uint8_t* const a, const uint8_t* const b,
                          const uint16_t* const w) {
  int total = 0;
  int block_row, block_col;
  for (block_row = 0; block_row < 4; ++block_row) {
    const int row_offset = block_row * 4 * BPS;
    for (block_col = 0; block_col < 4; ++block_col) {
      const int offset = row_offset + 4 * block_col;
      total += Disto4x4SSE2(a + offset, b + offset, w);
    }
  }
  return total;
}
//------------------------------------------------------------------------------
// Quantization
//
// Simple quantization
//
// Quantizes the 16 coefficients of 'in' using the per-coefficient tables of
// 'mtx' (sharpen_, iq_, bias_, q_, zthresh_), all 16 lanes at once.
//   in[]  : on input, the coefficients in natural order. On output it is
//           overwritten with the dequantized values (out * q_), or 0 for
//           coefficients whose sharpened magnitude is <= zthresh_.
//   out[] : receives the quantized levels, re-ordered in zigzag order.
//   n     : only tested for zero/non-zero. When non-zero, the first output
//           value (out[0]) is ignored when computing the return value. All
//           16 values of in[] and out[] are written regardless of 'n'.
// Returns non-zero if at least one of the considered output levels is
// non-zero, 0 otherwise.
static int QuantizeBlockSSE2(int16_t in[16], int16_t out[16],
int n, const VP8Matrix* const mtx) {
const __m128i max_coeff_2047 = _mm_set1_epi16(2047);
const __m128i zero = _mm_set1_epi16(0);
__m128i sign0, sign8;
__m128i coeff0, coeff8;
__m128i out0, out8;
__m128i packed_out;
// Load all inputs.
// TODO(cduvivier): Make variable declarations and allocations aligned so that
// we can use _mm_load_si128 instead of _mm_loadu_si128.
__m128i in0 = _mm_loadu_si128((__m128i*)&in[0]);
__m128i in8 = _mm_loadu_si128((__m128i*)&in[8]);
const __m128i sharpen0 = _mm_loadu_si128((__m128i*)&mtx->sharpen_[0]);
const __m128i sharpen8 = _mm_loadu_si128((__m128i*)&mtx->sharpen_[8]);
const __m128i iq0 = _mm_loadu_si128((__m128i*)&mtx->iq_[0]);
const __m128i iq8 = _mm_loadu_si128((__m128i*)&mtx->iq_[8]);
const __m128i bias0 = _mm_loadu_si128((__m128i*)&mtx->bias_[0]);
const __m128i bias8 = _mm_loadu_si128((__m128i*)&mtx->bias_[8]);
const __m128i q0 = _mm_loadu_si128((__m128i*)&mtx->q_[0]);
const __m128i q8 = _mm_loadu_si128((__m128i*)&mtx->q_[8]);
const __m128i zthresh0 = _mm_loadu_si128((__m128i*)&mtx->zthresh_[0]);
const __m128i zthresh8 = _mm_loadu_si128((__m128i*)&mtx->zthresh_[8]);
// sign(in) = in >> 15 (0x0000 if positive, 0xffff if negative)
sign0 = _mm_srai_epi16(in0, 15);
sign8 = _mm_srai_epi16(in8, 15);
// coeff = abs(in) = (in ^ sign) - sign
coeff0 = _mm_xor_si128(in0, sign0);
coeff8 = _mm_xor_si128(in8, sign8);
coeff0 = _mm_sub_epi16(coeff0, sign0);
coeff8 = _mm_sub_epi16(coeff8, sign8);
// coeff = abs(in) + sharpen
coeff0 = _mm_add_epi16(coeff0, sharpen0);
coeff8 = _mm_add_epi16(coeff8, sharpen8);
// if (coeff > 2047) coeff = 2047
coeff0 = _mm_min_epi16(coeff0, max_coeff_2047);
coeff8 = _mm_min_epi16(coeff8, max_coeff_2047);
// out = (coeff * iQ + B) >> QFIX;
{
// doing calculations with 32b precision (QFIX=17)
// out = (coeff * iQ)
// The 16x16 -> 32b products are built from their high halves (mulhi) and
// low halves (mullo), which the unpack operations below interleave back
// into four vectors of four 32b values each.
__m128i coeff_iQ0H = _mm_mulhi_epu16(coeff0, iq0);
__m128i coeff_iQ0L = _mm_mullo_epi16(coeff0, iq0);
__m128i coeff_iQ8H = _mm_mulhi_epu16(coeff8, iq8);
__m128i coeff_iQ8L = _mm_mullo_epi16(coeff8, iq8);
__m128i out_00 = _mm_unpacklo_epi16(coeff_iQ0L, coeff_iQ0H);
__m128i out_04 = _mm_unpackhi_epi16(coeff_iQ0L, coeff_iQ0H);
__m128i out_08 = _mm_unpacklo_epi16(coeff_iQ8L, coeff_iQ8H);
__m128i out_12 = _mm_unpackhi_epi16(coeff_iQ8L, coeff_iQ8H);
// expand bias from 16b to 32b
__m128i bias_00 = _mm_unpacklo_epi16(bias0, zero);
__m128i bias_04 = _mm_unpackhi_epi16(bias0, zero);
__m128i bias_08 = _mm_unpacklo_epi16(bias8, zero);
__m128i bias_12 = _mm_unpackhi_epi16(bias8, zero);
// out = (coeff * iQ + B)
out_00 = _mm_add_epi32(out_00, bias_00);
out_04 = _mm_add_epi32(out_04, bias_04);
out_08 = _mm_add_epi32(out_08, bias_08);
out_12 = _mm_add_epi32(out_12, bias_12);
// out = (coeff * iQ + B) >> QFIX;
out_00 = _mm_srai_epi32(out_00, QFIX);
out_04 = _mm_srai_epi32(out_04, QFIX);
out_08 = _mm_srai_epi32(out_08, QFIX);
out_12 = _mm_srai_epi32(out_12, QFIX);
// pack result as 16b
out0 = _mm_packs_epi32(out_00, out_04);
out8 = _mm_packs_epi32(out_08, out_12);
}
// get sign back (if (sign[j]) out_n = -out_n)
out0 = _mm_xor_si128(out0, sign0);
out8 = _mm_xor_si128(out8, sign8);
out0 = _mm_sub_epi16(out0, sign0);
out8 = _mm_sub_epi16(out8, sign8);
// in = out * Q
in0 = _mm_mullo_epi16(out0, q0);
in8 = _mm_mullo_epi16(out8, q8);
// if (coeff <= mtx->zthresh_) {in=0; out=0;}
{
// cmp is all-ones in the lanes to keep and all-zeros in the lanes to clear,
// so AND-ing with it zeroes the below-threshold coefficients.
__m128i cmp0 = _mm_cmpgt_epi16(coeff0, zthresh0);
__m128i cmp8 = _mm_cmpgt_epi16(coeff8, zthresh8);
in0 = _mm_and_si128(in0, cmp0);
in8 = _mm_and_si128(in8, cmp8);
// 'in' is written back in natural (non-zigzag) order.
_mm_storeu_si128((__m128i*)&in[0], in0);
_mm_storeu_si128((__m128i*)&in[8], in8);
out0 = _mm_and_si128(out0, cmp0);
out8 = _mm_and_si128(out8, cmp8);
}
// zigzag the output before storing it.
//
// The zigzag pattern can almost be reproduced with a small sequence of
// shuffles. After it, we only need to swap the 7th (ending up in third
// position instead of twelfth) and 8th values.
{
__m128i outZ0, outZ8;
outZ0 = _mm_shufflehi_epi16(out0, _MM_SHUFFLE(2, 1, 3, 0));
outZ0 = _mm_shuffle_epi32 (outZ0, _MM_SHUFFLE(3, 1, 2, 0));
outZ0 = _mm_shufflehi_epi16(outZ0, _MM_SHUFFLE(3, 1, 0, 2));
outZ8 = _mm_shufflelo_epi16(out8, _MM_SHUFFLE(3, 0, 2, 1));
outZ8 = _mm_shuffle_epi32 (outZ8, _MM_SHUFFLE(3, 1, 2, 0));
outZ8 = _mm_shufflelo_epi16(outZ8, _MM_SHUFFLE(1, 3, 2, 0));
_mm_storeu_si128((__m128i*)&out[0], outZ0);
_mm_storeu_si128((__m128i*)&out[8], outZ8);
// Saturating pack of the 16 levels down to 16 bytes. It is only used below
// to detect non-zero levels (saturation keeps a non-zero value non-zero).
packed_out = _mm_packs_epi16(outZ0, outZ8);
}
// Finish the zigzag: exchange the two entries (indices 3 and 12) that the
// shuffles above left in each other's place. 'packed_out' was taken before
// this swap, which does not matter for the zero test below.
{
const int16_t outZ_12 = out[12];
const int16_t outZ_3 = out[3];
out[3] = outZ_12;
out[12] = outZ_3;
}
// detect if all 'out' values are zeroes or not
{
int32_t tmp[4];
_mm_storeu_si128((__m128i*)tmp, packed_out);
// With a non-zero 'n', clear the lowest byte of tmp[0], i.e. the byte that
// was packed from out[0], so that this first value is not taken into account.
if (n) {
tmp[0] &= ~0xff;
}
return (tmp[3] || tmp[2] || tmp[1] || tmp[0]);
}
}
extern void VP8EncDspInitSSE2(void);

// Points the encoder's DSP function pointers at the SSE2 implementations
// defined in this file.
void VP8EncDspInitSSE2(void) {
  // Forward / inverse transforms.
  VP8FTransform = FTransformSSE2;
  VP8ITransform = ITransformSSE2;
  // Quantization and histogram collection.
  VP8EncQuantizeBlock = QuantizeBlockSSE2;
  VP8CollectHistogram = CollectHistogramSSE2;
  // Error / distortion metrics.
  VP8SSE4x4 = SSE4x4SSE2;
  VP8TDisto4x4 = Disto4x4SSE2;
  VP8TDisto16x16 = Disto16x16SSE2;
}
#if defined(__cplusplus) || defined(c_plusplus)
} // extern "C"
#endif
#endif //__SSE2__

View File

@@ -45,7 +45,7 @@ static void InitTables(void) {
}
}
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// Edge filtering functions
// 4 pixels in, 2 pixels out
@@ -92,7 +92,7 @@ static inline int needs_filter2(const uint8_t* p, int step, int t, int it) {
abs0[255 + q2 - q1] <= it && abs0[255 + q1 - q0] <= it;
}
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// Simple In-loop filtering (Paragraph 15.2)
static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
@@ -129,7 +129,7 @@ static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
}
}
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// Complex In-loop filtering (Paragraph 15.3)
static inline void FilterLoop24(uint8_t* p, int hstride, int vstride, int size,
@@ -177,7 +177,7 @@ static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
FilterLoop24(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
}
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
void (*VP8EncVFilter16i)(uint8_t*, int, int, int, int) = VFilter16i;
void (*VP8EncHFilter16i)(uint8_t*, int, int, int, int) = HFilter16i;
@@ -187,7 +187,7 @@ void (*VP8EncHFilter8i)(uint8_t*, uint8_t*, int, int, int, int) = HFilter8i;
void (*VP8EncSimpleVFilter16i)(uint8_t*, int, int) = SimpleVFilter16i;
void (*VP8EncSimpleHFilter16i)(uint8_t*, int, int) = SimpleHFilter16i;
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// Paragraph 15.4: compute the inner-edge filtering strength
static int GetILevel(int sharpness, int level) {
@@ -229,7 +229,7 @@ static void DoFilter(const VP8EncIterator* const it, int level) {
}
}
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// SSIM metric
enum { KERNEL = 3 };
@@ -302,7 +302,7 @@ static double GetMBSSIM(const uint8_t* yuv1, const uint8_t* yuv2) {
return GetSSIM(&s);
}
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// Exposed APIs: Encoder should call the following 3 functions to adjust
// loop filter strength

View File

@@ -37,7 +37,7 @@ typedef struct {
CostArray* cost;
} VP8Residual;
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// Tables for level coding
const uint8_t VP8EncBands[16 + 1] = {
@@ -51,18 +51,16 @@ static const uint8_t kCat5[] = { 180, 157, 141, 134, 130 };
static const uint8_t kCat6[] =
{ 254, 254, 243, 230, 196, 177, 153, 140, 133, 130, 129 };
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// Reset the statistics about: number of skips, token proba, level cost,...
static void ResetStats(VP8Encoder* const enc, int precalc_cost) {
VP8Proba* const proba = &enc->proba_;
if (precalc_cost) VP8CalculateLevelCosts(proba);
proba->nb_skip_ = 0;
proba->nb_i4_ = 0;
proba->nb_i16_ = 0;
}
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// Skip decision probability
static int CalcSkipProba(uint64_t nb, uint64_t total) {
@@ -86,7 +84,7 @@ static int FinalizeSkipProba(VP8Encoder* const enc) {
return size;
}
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// Recording of token probabilities.
static void ResetTokenStats(VP8Encoder* const enc) {
@@ -101,6 +99,9 @@ static int Record(int bit, uint64_t* const stats) {
return bit;
}
// We keep the table free variant around for reference, in case.
#define USE_LEVEL_CODE_TABLE
// Simulate block coding, but only record statistics.
// Note: no need to record the fixed probas.
static int RecordCoeffs(int ctx, VP8Residual* res) {
@@ -111,14 +112,16 @@ static int RecordCoeffs(int ctx, VP8Residual* res) {
}
while (1) {
const int v = abs(res->coeffs[n++]);
int v = res->coeffs[n++];
if (!Record(v != 0, s[1])) {
s = res->stats[VP8EncBands[n]][0];
continue;
}
if (!Record(v > 1, s[2])) {
if (!Record(2u < (unsigned int)(v + 1), s[2])) { // v = -1 or 1
s = res->stats[VP8EncBands[n]][1];
} else {
v = abs(v);
#if !defined(USE_LEVEL_CODE_TABLE)
if (!Record(v > 4, s[3])) {
if (Record(v != 2, s[4]))
Record(v == 4, s[5]);
@@ -129,6 +132,20 @@ static int RecordCoeffs(int ctx, VP8Residual* res) {
} else {
Record((v >= 3 + (8 << 3)), s[10]);
}
#else
if (v > MAX_VARIABLE_LEVEL)
v = MAX_VARIABLE_LEVEL;
{
const int bits = VP8LevelCodes[v - 1][1];
int pattern = VP8LevelCodes[v - 1][0];
int i;
for (i = 0; (pattern >>= 1) != 0; ++i) {
const int mask = 2 << i;
if (pattern & 1) Record(!!(bits & mask), s[3 + i]);
}
}
#endif
s = res->stats[VP8EncBands[n]][2];
}
if (n == 16 || !Record(n <= res->last, s[0])) {
@@ -174,7 +191,7 @@ static int FinalizeTokenProbas(VP8Encoder* const enc) {
return size;
}
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// helper functions for residuals struct VP8Residual.
static void InitResidual(int first, int coeff_type,
@@ -199,7 +216,7 @@ static void SetResidualCoeffs(const int16_t* const coeffs,
res->coeffs = coeffs;
}
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// Mode costs
static int GetResidualCost(int ctx, const VP8Residual* const res) {
@@ -213,16 +230,18 @@ static int GetResidualCost(int ctx, const VP8Residual* const res) {
return cost;
}
while (n <= res->last) {
const int v = abs(res->coeffs[n++]);
cost += VP8LevelCost(t, v);
const int v = res->coeffs[n++];
if (v == 0) {
cost += VP8LevelCost(t, 0);
p = res->prob[VP8EncBands[n]][0];
t = res->cost[VP8EncBands[n]][0];
continue;
} else if (v == 1) {
} else if (2u >= (unsigned int)(v + 1)) { // v = -1 or 1
cost += VP8LevelCost(t, 1);
p = res->prob[VP8EncBands[n]][1];
t = res->cost[VP8EncBands[n]][1];
} else {
cost += VP8LevelCost(t, abs(v));
p = res->prob[VP8EncBands[n]][2];
t = res->cost[VP8EncBands[n]][2];
}
@@ -292,7 +311,7 @@ int VP8GetCostUV(VP8EncIterator* const it, const VP8ModeScore* const rd) {
return R;
}
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// Coefficient coding
static int PutCoeffs(VP8BitWriter* const bw, int ctx, const VP8Residual* res) {
@@ -462,7 +481,7 @@ static void RecordResiduals(VP8EncIterator* const it,
VP8IteratorBytesToNz(it);
}
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// ExtraInfo map / Debug function
#if SEGMENT_VISU
@@ -525,7 +544,7 @@ static void StoreSideInfo(const VP8EncIterator* const it) {
#endif
}
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// Main loops
//
// VP8EncLoop(): does the final bitstream coding.
@@ -568,6 +587,14 @@ int VP8EncLoop(VP8Encoder* const enc) {
} else { // reset predictors after a skip
ResetAfterSkip(&it);
}
#ifdef WEBP_EXPERIMENTAL_FEATURES
if (enc->has_alpha_) {
VP8EncCodeAlphaBlock(&it);
}
if (enc->use_layer_) {
VP8EncCodeLayerBlock(&it);
}
#endif
StoreSideInfo(&it);
VP8StoreFilterStats(&it);
VP8IteratorExport(&it);
@@ -589,7 +616,7 @@ int VP8EncLoop(VP8Encoder* const enc) {
return 1;
}
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// VP8StatLoop(): only collect statistics (number of skips, token usage, ...)
// This is used for deciding optimal probabilities. It also
// modifies the quantizer value if some target (size, PSNR)
@@ -664,7 +691,7 @@ int VP8StatLoop(VP8Encoder* const enc) {
}
// binary search for a size close to target
for (pass = 0; pass < enc->config_->pass || (dqs[pass] > 0); ++pass) {
for (pass = 0; pass < enc->config_->pass && (dqs[pass] > 0); ++pass) {
const int rd_opt = 1;
float PSNR;
int criterion;
@@ -688,7 +715,7 @@ int VP8StatLoop(VP8Encoder* const enc) {
return 1;
}
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
#if defined(__cplusplus) || defined(c_plusplus)
} // extern "C"

View File

@@ -17,9 +17,9 @@
extern "C" {
#endif
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// VP8Iterator
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
static void InitLeft(VP8EncIterator* const it) {
const VP8Encoder* const enc = it->enc_;
@@ -68,7 +68,7 @@ void VP8IteratorInit(VP8Encoder* const enc, VP8EncIterator* const it) {
VP8IteratorReset(it);
}
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// Import the source samples into the cache. Takes care of replicating
// boundary pixels if necessary.
@@ -122,7 +122,7 @@ void VP8IteratorImport(const VP8EncIterator* const it) {
}
}
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// Copy back the compressed samples into user space if requested.
void VP8IteratorExport(const VP8EncIterator* const it) {
@@ -148,16 +148,18 @@ void VP8IteratorExport(const VP8EncIterator* const it) {
memcpy(ydst + i * pic->y_stride, ysrc + i * BPS, w);
}
// U/V plane
w = (w + 1) / 2;
h = (h + 1) / 2;
for (i = 0; i < h; ++i) {
memcpy(udst + i * pic->uv_stride, usrc + i * BPS, w);
memcpy(vdst + i * pic->uv_stride, vsrc + i * BPS, w);
{
const int uv_w = (w + 1) / 2;
const int uv_h = (h + 1) / 2;
for (i = 0; i < uv_h; ++i) {
memcpy(udst + i * pic->uv_stride, usrc + i * BPS, uv_w);
memcpy(vdst + i * pic->uv_stride, vsrc + i * BPS, uv_w);
}
}
}
}
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// Non-zero contexts setup/teardown
// Nz bits:
@@ -214,7 +216,8 @@ void VP8IteratorBytesToNz(VP8EncIterator* const it) {
nz |= (it->top_nz_[6] << 22) | (it->top_nz_[7] << 23);
nz |= (it->top_nz_[8] << 24); // we propagate the _top_ bit, esp. for intra4
// left
nz |= (it->left_nz_[0] << 3) | (it->left_nz_[1] << 7) | (it->left_nz_[2] << 11);
nz |= (it->left_nz_[0] << 3) | (it->left_nz_[1] << 7);
nz |= (it->left_nz_[2] << 11);
nz |= (it->left_nz_[4] << 17) | (it->left_nz_[6] << 21);
*it->nz_ = nz;
@@ -222,7 +225,7 @@ void VP8IteratorBytesToNz(VP8EncIterator* const it) {
#undef BIT
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// Advance to the next position, doing the bookkeeping.
int VP8IteratorNext(VP8EncIterator* const it,
@@ -267,7 +270,7 @@ int VP8IteratorNext(VP8EncIterator* const it,
return (0 < --it->done_);
}
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// Helper function to set mode properties
void VP8SetIntra16Mode(const VP8EncIterator* const it, int mode) {
@@ -304,7 +307,7 @@ void VP8SetSegment(const VP8EncIterator* const it, int segment) {
it->mb_->segment_ = segment;
}
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// Intra4x4 sub-blocks iteration
//
// We store and update the boundary samples into an array of 37 pixels. They
@@ -399,7 +402,7 @@ int VP8IteratorRotateI4(VP8EncIterator* const it,
return 1;
}
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
#if defined(__cplusplus) || defined(c_plusplus)
} // extern "C"

55
src/enc/layer.c Normal file
View File

@@ -0,0 +1,55 @@
// Copyright 2011 Google Inc.
//
// This code is licensed under the same terms as WebM:
// Software License Agreement: http://www.webmproject.org/license/software/
// Additional IP Rights Grant: http://www.webmproject.org/license/additional/
// -----------------------------------------------------------------------------
//
// Enhancement layer (for YUV444/422)
//
// Author: Skal (pascal.massimino@gmail.com)
#include <assert.h>
#include <stdlib.h>
#include "vp8enci.h"
#if defined(__cplusplus) || defined(c_plusplus)
extern "C" {
#endif
#ifdef WEBP_EXPERIMENTAL_FEATURES
#endif /* WEBP_EXPERIMENTAL_FEATURES */
//------------------------------------------------------------------------------
// Prepares the enhancement-layer state of 'enc'. The layer is enabled only
// when the source picture carries a 'u0' plane; in that case the layer's
// bit-writer is initialized with an expected size of 3 bytes per macroblock.
void VP8EncInitLayer(VP8Encoder* const enc) {
  const int has_layer = (enc->pic_->u0 != NULL);
  enc->layer_data_ = NULL;
  enc->layer_data_size_ = 0;
  enc->use_layer_ = has_layer;
  if (has_layer) {
    VP8BitWriterInit(&enc->layer_bw_, enc->mb_w_ * enc->mb_h_ * 3);
  }
}
// Per-macroblock hook for coding the enhancement layer. The body is
// currently empty, with or without WEBP_EXPERIMENTAL_FEATURES.
void VP8EncCodeLayerBlock(VP8EncIterator* it) {
#ifdef WEBP_EXPERIMENTAL_FEATURES
#endif /* WEBP_EXPERIMENTAL_FEATURES */
  (void)it;  // parameter is not used yet: silence the compiler warning
}
// Finalizes the enhancement layer: when the layer is in use, closes its
// bit-writer and records the resulting buffer and size in 'enc'.
// Always returns 1.
int VP8EncFinishLayer(VP8Encoder* const enc) {
  if (!enc->use_layer_) {
    return 1;  // nothing to finalize
  }
  enc->layer_data_ = VP8BitWriterFinish(&enc->layer_bw_);
  enc->layer_data_size_ = VP8BitWriterSize(&enc->layer_bw_);
  return 1;
}
// Releases the enhancement-layer buffer held by 'enc'.
// The pointer and size are reset afterwards so that the encoder never keeps
// a dangling 'layer_data_' and calling this function twice is harmless
// (free(NULL) is a no-op).
void VP8EncDeleteLayer(VP8Encoder* enc) {
  free(enc->layer_data_);
  enc->layer_data_ = NULL;
  enc->layer_data_size_ = 0;
}
#if defined(__cplusplus) || defined(c_plusplus)
} // extern "C"
#endif

View File

@@ -9,6 +9,7 @@
//
// Author: Skal (pascal.massimino@gmail.com)
#include <assert.h>
#include <stdlib.h>
#include "vp8enci.h"
@@ -16,54 +17,122 @@
extern "C" {
#endif
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// WebPPicture
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
int WebPPictureAlloc(WebPPicture* const picture) {
if (picture) {
const WebPEncCSP uv_csp = picture->colorspace & WEBP_CSP_UV_MASK;
const int has_alpha = picture->colorspace & WEBP_CSP_ALPHA_BIT;
const int width = picture->width;
const int height = picture->height;
const int y_stride = width;
const int uv_width = (width + 1) / 2;
const int uv_height = (height + 1) / 2;
const uint64_t y_size = (uint64_t)width * height;
const uint64_t uv_size = (uint64_t)uv_width * uv_height;
const uint64_t total_size = y_size + 2 * uv_size;
const int uv_stride = uv_width;
int uv0_stride = 0;
int a_width, a_stride;
uint64_t y_size, uv_size, uv0_size, a_size, total_size;
uint8_t* mem;
// U/V
switch (uv_csp) {
case WEBP_YUV420:
break;
#ifdef WEBP_EXPERIMENTAL_FEATURES
case WEBP_YUV400: // for now, we'll just reset the U/V samples
break;
case WEBP_YUV422:
uv0_stride = uv_width;
break;
case WEBP_YUV444:
uv0_stride = width;
break;
#endif
default:
return 0;
}
uv0_size = height * uv0_stride;
// alpha
a_width = has_alpha ? width : 0;
a_stride = a_width;
y_size = (uint64_t)y_stride * height;
uv_size = (uint64_t)uv_stride * uv_height;
a_size = (uint64_t)a_stride * height;
total_size = y_size + a_size + 2 * uv_size + 2 * uv0_size;
// Security and validation checks
if (uv_width <= 0 || uv_height <= 0 || // check param error
if (width <= 0 || height <= 0 || // check for luma/alpha param error
uv_width < 0 || uv_height < 0 || // check for u/v param error
y_size >= (1ULL << 40) || // check for reasonable global size
(size_t)total_size != total_size) { // check for overflow on 32bit
return 0;
}
picture->y_stride = width;
picture->uv_stride = uv_width;
picture->y_stride = y_stride;
picture->uv_stride = uv_stride;
picture->a_stride = a_stride;
picture->uv0_stride = uv0_stride;
WebPPictureFree(picture); // erase previous buffer
picture->y = (uint8_t*)malloc((size_t)total_size);
if (picture->y == NULL) return 0;
picture->u = picture->y + y_size;
picture->v = picture->u + uv_size;
mem = (uint8_t*)malloc((size_t)total_size);
if (mem == NULL) return 0;
picture->y = mem;
mem += y_size;
picture->u = mem;
mem += uv_size;
picture->v = mem;
mem += uv_size;
if (a_size) {
picture->a = mem;
mem += a_size;
}
if (uv0_size) {
picture->u0 = mem;
mem += uv0_size;
picture->v0 = mem;
mem += uv0_size;
}
}
return 1;
}
// Copies every field of 'src' into 'dst' (skipped when 'src' is NULL), then
// clears all plane pointers of 'dst' (y, u, v, u0, v0, a) so that 'dst' is
// marked as not owning any sample memory.
static void WebPPictureGrabSpecs(const WebPPicture* const src,
                                 WebPPicture* const dst) {
  if (src != NULL) {
    *dst = *src;
  }
  // main planes
  dst->y = NULL;
  dst->u = NULL;
  dst->v = NULL;
  // alpha plane
  dst->a = NULL;
  // un-subsampled chroma planes
  dst->u0 = NULL;
  dst->v0 = NULL;
}
// Release memory owned by 'picture'.
void WebPPictureFree(WebPPicture* const picture) {
if (picture) {
free(picture->y);
picture->y = picture->u = picture->v = NULL;
WebPPictureGrabSpecs(NULL, picture);
}
}
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// Picture copying
int WebPPictureCopy(const WebPPicture* const src, WebPPicture* const dst) {
int y;
if (src == NULL || dst == NULL) return 0;
if (src == dst) return 1;
*dst = *src;
dst->y = NULL;
WebPPictureGrabSpecs(src, dst);
if (!WebPPictureAlloc(dst)) return 0;
for (y = 0; y < dst->height; ++y) {
memcpy(dst->y + y * dst->y_stride, src->y + y * src->y_stride, src->width);
memcpy(dst->y + y * dst->y_stride,
src->y + y * src->y_stride, src->width);
}
for (y = 0; y < (dst->height + 1) / 2; ++y) {
memcpy(dst->u + y * dst->uv_stride,
@@ -71,9 +140,32 @@ int WebPPictureCopy(const WebPPicture* const src, WebPPicture* const dst) {
memcpy(dst->v + y * dst->uv_stride,
src->v + y * src->uv_stride, (src->width + 1) / 2);
}
#ifdef WEBP_EXPERIMENTAL_FEATURES
if (dst->a != NULL) {
for (y = 0; y < dst->height; ++y) {
memcpy(dst->a + y * dst->a_stride,
src->a + y * src->a_stride, src->width);
}
}
if (dst->u0 != NULL) {
int uv0_width = src->width;
if ((dst->colorspace & WEBP_CSP_UV_MASK) == WEBP_YUV422) {
uv0_width = (uv0_width + 1) / 2;
}
for (y = 0; y < dst->height; ++y) {
memcpy(dst->u0 + y * dst->uv0_stride,
src->u0 + y * src->uv0_stride, uv0_width);
memcpy(dst->v0 + y * dst->uv0_stride,
src->v0 + y * src->uv0_stride, uv0_width);
}
}
#endif
return 1;
}
//------------------------------------------------------------------------------
// Picture cropping
int WebPPictureCrop(WebPPicture* const pic,
int left, int top, int width, int height) {
WebPPicture tmp;
@@ -84,8 +176,7 @@ int WebPPictureCrop(WebPPicture* const pic,
if (left < 0 || ((left + width + 1) & ~1) > pic->width) return 0;
if (top < 0 || ((top + height + 1) & ~1) > pic->height) return 0;
tmp = *pic;
tmp.y = NULL;
WebPPictureGrabSpecs(pic, &tmp);
tmp.width = width;
tmp.height = height;
if (!WebPPictureAlloc(&tmp)) return 0;
@@ -99,12 +190,189 @@ int WebPPictureCrop(WebPPicture* const pic,
memcpy(tmp.u + y * tmp.uv_stride, pic->u + offset, (width + 1) / 2);
memcpy(tmp.v + y * tmp.uv_stride, pic->v + offset, (width + 1) / 2);
}
#ifdef WEBP_EXPERIMENTAL_FEATURES
if (tmp.a) {
for (y = 0; y < height; ++y) {
memcpy(tmp.a + y * tmp.a_stride,
pic->a + (top + y) * pic->a_stride + left, width);
}
}
if (tmp.u0) {
int w = width;
int l = left;
if (tmp.colorspace == WEBP_YUV422) {
w = (w + 1) / 2;
l = (l + 1) / 2;
}
for (y = 0; y < height; ++y) {
memcpy(tmp.u0 + y * tmp.uv0_stride,
pic->u0 + (top + y) * pic->uv0_stride + l, w);
memcpy(tmp.v0 + y * tmp.uv0_stride,
pic->v0 + (top + y) * pic->uv0_stride + l, w);
}
}
#endif
WebPPictureFree(pic);
*pic = tmp;
return 1;
}
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// Simple picture rescaler
#define RFIX 30   // fixed-point precision (number of fractional bits)
// Fixed-point multiply of 'x' by the RFIX-bit fraction 'y', rounded.
#define MULT(x,y) (((int64_t)(x) * (y) + (1 << (RFIX - 1))) >> RFIX)

// Horizontally resamples one source row 'src' (src_width samples) into
// 'frow' (dst_width values), then adds 'frow' to the accumulator 'irow'.
// The values stored in 'frow' are not normalized:
//  - when shrinking (src_width >= dst_width), each output is a sum of source
//    samples weighted by dst_width, with the sample straddling two outputs
//    split between them;
//  - when expanding, each output is a blend of two neighbouring samples with
//    weights 'x_accum' and 'dst_width - 1 - x_accum', i.e. a total weight of
//    (dst_width - 1).
// The normalization is applied later, in ExportRow().
static inline void ImportRow(const uint8_t* src, int src_width,
                             int32_t* frow, int32_t* irow, int dst_width) {
  const int x_expand = (src_width < dst_width);
  const int fx_scale = (1 << RFIX) / dst_width;
  int x_in = 0;
  int x_out;
  int x_accum = 0;
  if (!x_expand) {
    int sum = 0;
    for (x_out = 0; x_out < dst_width; ++x_out) {
      x_accum += src_width - dst_width;
      for (; x_accum > 0; x_accum -= dst_width) {
        sum += src[x_in++];
      }
      {        // Emit next horizontal pixel.
        const int32_t base = src[x_in++];
        const int32_t frac = base * (-x_accum);
        frow[x_out] = (sum + base) * dst_width - frac;
        sum = MULT(frac, fx_scale);    // fresh fractional start for next pixel
      }
    }
  } else {        // simple bilinear interpolation
    int left = src[0], right = src[0];
    for (x_out = 0; x_out < dst_width; ++x_out) {
      if (x_accum < 0) {
        left = right;
        right = src[++x_in];
        x_accum += dst_width - 1;
      }
      frow[x_out] = right * (dst_width - 1) + (left - right) * x_accum;
      x_accum -= src_width - 1;
    }
  }
  // Accumulate the new row's contribution
  for (x_out = 0; x_out < dst_width; ++x_out) {
    irow[x_out] += frow[x_out];
  }
}

// Emits one output row into 'dst' from the accumulator 'irow'.
// 'yscale' is the fraction of the last imported row ('frow') that belongs to
// the next output row: it is removed from the current output and kept in
// 'irow' as the starting value for the next one. 'fxy_scale' is the global
// normalization factor. Results are clipped to [0, 255].
static void ExportRow(int32_t* frow, int32_t* irow, uint8_t* dst, int dst_width,
                      const int yscale, const int64_t fxy_scale) {
  int x_out;
  for (x_out = 0; x_out < dst_width; ++x_out) {
    const int frac = MULT(frow[x_out], yscale);
    const int v = (int)(MULT(irow[x_out] - frac, fxy_scale));
    dst[x_out] = (!(v & ~0xff)) ? v : (v < 0) ? 0 : 255;
    irow[x_out] = frac;   // new fractional start
  }
}

// Rescales the src_width x src_height plane 'src' (rows 'src_stride' bytes
// apart) into the dst_width x dst_height plane 'dst' (rows 'dst_stride' bytes
// apart). 'work' is scratch memory for at least 2 * dst_width int32_t values.
// All dimensions must be strictly positive.
static void RescalePlane(const uint8_t* src,
                         int src_width, int src_height, int src_stride,
                         uint8_t* dst,
                         int dst_width, int dst_height, int dst_stride,
                         int32_t* const work) {
  const int x_expand = (src_width < dst_width);
  const int fy_scale = (1 << RFIX) / dst_height;
  // Total horizontal weight carried by each value produced by ImportRow():
  // (dst_width - 1) in the bilinear/expanding case, src_width otherwise.
  // Using dst_width in the expanding case would darken the output by a
  // factor (dst_width - 1) / dst_width. Note that x_expand implies
  // dst_width >= 2, so the weight is never zero.
  const int x_weight = x_expand ? (dst_width - 1) : src_width;
  // The divisor is computed with 64 bits to avoid overflowing 'int' on
  // large planes.
  const int64_t fxy_scale =
      ((int64_t)dst_height << RFIX) / ((int64_t)x_weight * src_height);
  int y_accum = src_height;
  int y;
  int32_t* irow = work;              // integral contribution
  int32_t* frow = work + dst_width;  // fractional contribution

  memset(work, 0, 2 * dst_width * sizeof(*work));
  for (y = 0; y < src_height; ++y) {
    // import new contribution of one source row.
    ImportRow(src, src_width, frow, irow, dst_width);
    src += src_stride;
    // emit output row(s)
    y_accum -= dst_height;
    for (; y_accum <= 0; y_accum += src_height) {
      const int yscale = fy_scale * (-y_accum);
      ExportRow(frow, irow, dst, dst_width, yscale, fxy_scale);
      dst += dst_stride;
    }
  }
}

#undef MULT
#undef RFIX
// Rescales 'pic' in place to width x height. If either 'width' or 'height'
// is 0, it is derived from the other one so as to preserve the aspect ratio
// of the original picture.
// On success, the previous buffers of 'pic' are released, 'pic' takes
// ownership of the freshly allocated rescaled planes, and 1 is returned.
// Returns 0 (leaving 'pic' untouched) if 'pic' is NULL, if the source or
// target dimensions are not strictly positive, or on allocation failure.
int WebPPictureRescale(WebPPicture* const pic, int width, int height) {
  WebPPicture tmp;
  int prev_width, prev_height;
  int32_t* work;

  if (pic == NULL) return 0;
  prev_width = pic->width;
  prev_height = pic->height;
  // The source dimensions are used as divisors below (and by the plane
  // rescaler): reject empty or invalid pictures up-front.
  if (prev_width <= 0 || prev_height <= 0) return 0;
  // if width is unspecified, scale original proportionally to height ratio.
  if (width == 0) {
    width = (prev_width * height + prev_height / 2) / prev_height;
  }
  // if height is unspecified, scale original proportionally to width ratio.
  if (height == 0) {
    height = (prev_height * width + prev_width / 2) / prev_width;
  }
  // Check if the overall dimensions still make sense.
  if (width <= 0 || height <= 0) return 0;

  WebPPictureGrabSpecs(pic, &tmp);
  tmp.width = width;
  tmp.height = height;
  if (!WebPPictureAlloc(&tmp)) return 0;

  // Scratch area for the rescaler: two rows of 'width' 32-bit accumulators.
  // The size is computed as size_t, and the result is cast like the other
  // allocations of this file so that it also builds as C++.
  work = (int32_t*)malloc(2 * (size_t)width * sizeof(*work));
  if (work == NULL) {
    WebPPictureFree(&tmp);
    return 0;
  }

  RescalePlane(pic->y, prev_width, prev_height, pic->y_stride,
               tmp.y, width, height, tmp.y_stride, work);
  RescalePlane(pic->u,
               (prev_width + 1) / 2, (prev_height + 1) / 2, pic->uv_stride,
               tmp.u,
               (width + 1) / 2, (height + 1) / 2, tmp.uv_stride, work);
  RescalePlane(pic->v,
               (prev_width + 1) / 2, (prev_height + 1) / 2, pic->uv_stride,
               tmp.v,
               (width + 1) / 2, (height + 1) / 2, tmp.uv_stride, work);

#ifdef WEBP_EXPERIMENTAL_FEATURES
  if (tmp.a) {
    RescalePlane(pic->a, prev_width, prev_height, pic->a_stride,
                 tmp.a, width, height, tmp.a_stride, work);
  }
  if (tmp.u0) {
    // Horizontal sub-sampling factor of the u0/v0 planes: 2 for YUV422,
    // 1 otherwise.
    int s = 1;
    if ((tmp.colorspace & WEBP_CSP_UV_MASK) == WEBP_YUV422) {
      s = 2;
    }
    RescalePlane(
        pic->u0, (prev_width + s / 2) / s, prev_height, pic->uv0_stride,
        tmp.u0, (width + s / 2) / s, height, tmp.uv0_stride, work);
    RescalePlane(
        pic->v0, (prev_width + s / 2) / s, prev_height, pic->uv0_stride,
        tmp.v0, (width + s / 2) / s, height, tmp.uv0_stride, work);
  }
#endif

  WebPPictureFree(pic);
  free(work);
  *pic = tmp;
  return 1;
}
//------------------------------------------------------------------------------
// Write-to-memory
typedef struct {
@@ -150,7 +418,7 @@ static int WebPMemoryWrite(const uint8_t* data, size_t data_size,
return 1;
}
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// RGB -> YUV conversion
// The exact naming is Y'CbCr, following the ITU-R BT.601 standard.
// More information at: http://en.wikipedia.org/wiki/YCbCr
@@ -196,36 +464,98 @@ static inline int rgb_to_v(int r, int g, int b) {
picture->v[dst] = rgb_to_v(r, g, b); \
}
// Converts one RGB sample group to U/V and stores the result in the u0/v0
// planes of 'picture'.
//   x_in, y : position of the source sample; its offset in the RGB buffer is
//             step * x_in + y * rgb_stride.
//   x_out   : destination column; the destination offset is
//             x_out + y * picture->uv0_stride.
//   SUM     : macro applied to the r/g/b source pointers to gather the
//             sample value(s) passed to rgb_to_u() / rgb_to_v().
// The expansion site must have 'picture', 'step', 'rgb_stride', 'r_ptr',
// 'g_ptr' and 'b_ptr' in scope.
#define RGB_TO_UV0(x_in, x_out, y, SUM) { \
const int src = (step * (x_in) + (y) * rgb_stride); \
const int dst = (x_out) + (y) * picture->uv0_stride; \
const int r = SUM(r_ptr + src); \
const int g = SUM(g_ptr + src); \
const int b = SUM(b_ptr + src); \
picture->u0[dst] = rgb_to_u(r, g, b); \
picture->v0[dst] = rgb_to_v(r, g, b); \
}
// Fills the U and V planes of 'picture' with the constant value 128, over
// ((width + 1) / 2) x ((height + 1) / 2) samples.
static void MakeGray(WebPPicture* const picture) {
  const int uv_width = (picture->width + 1) >> 1;
  const int uv_height = (picture->height + 1) >> 1;
  uint8_t* u_row = picture->u;
  uint8_t* v_row = picture->v;
  int row;
  for (row = 0; row < uv_height; ++row) {
    memset(u_row, 128, uv_width);
    memset(v_row, 128, uv_width);
    u_row += picture->uv_stride;
    v_row += picture->uv_stride;
  }
}
static int Import(WebPPicture* const picture,
const uint8_t* const rgb, int rgb_stride,
int step, int swap) {
int step, int swap_rb, int import_alpha) {
const WebPEncCSP uv_csp = picture->colorspace & WEBP_CSP_UV_MASK;
int x, y;
const uint8_t* const r_ptr = rgb + (swap ? 2 : 0);
const uint8_t* const r_ptr = rgb + (swap_rb ? 2 : 0);
const uint8_t* const g_ptr = rgb + 1;
const uint8_t* const b_ptr = rgb + (swap ? 0 : 2);
const uint8_t* const b_ptr = rgb + (swap_rb ? 0 : 2);
const int width = picture->width;
const int height = picture->height;
for (y = 0; y < picture->height; ++y) {
for (x = 0; x < picture->width; ++x) {
// Import luma plane
for (y = 0; y < height; ++y) {
for (x = 0; x < width; ++x) {
const int offset = step * x + y * rgb_stride;
picture->y[x + y * picture->y_stride] =
rgb_to_y(r_ptr[offset], g_ptr[offset], b_ptr[offset]);
}
}
for (y = 0; y < (picture->height >> 1); ++y) {
for (x = 0; x < (picture->width >> 1); ++x) {
RGB_TO_UV(x, y, SUM4);
// Downsample U/V plane
if (uv_csp != WEBP_YUV400) {
for (y = 0; y < (height >> 1); ++y) {
for (x = 0; x < (width >> 1); ++x) {
RGB_TO_UV(x, y, SUM4);
}
if (picture->width & 1) {
RGB_TO_UV(x, y, SUM2V);
}
}
if (picture->width & 1) {
RGB_TO_UV(x, y, SUM2V);
if (height & 1) {
for (x = 0; x < (width >> 1); ++x) {
RGB_TO_UV(x, y, SUM2H);
}
if (width & 1) {
RGB_TO_UV(x, y, SUM1);
}
}
#ifdef WEBP_EXPERIMENTAL_FEATURES
// Store original U/V samples too
if (uv_csp == WEBP_YUV422) {
for (y = 0; y < height; ++y) {
for (x = 0; x < (width >> 1); ++x) {
RGB_TO_UV0(2 * x, x, y, SUM2H);
}
if (width & 1) {
RGB_TO_UV0(2 * x, x, y, SUM1);
}
}
} else if (uv_csp == WEBP_YUV444) {
for (y = 0; y < height; ++y) {
for (x = 0; x < width; ++x) {
RGB_TO_UV0(x, x, y, SUM1);
}
}
}
#endif
} else {
MakeGray(picture);
}
if (picture->height & 1) {
for (x = 0; x < (picture->width >> 1); ++x) {
RGB_TO_UV(x, y, SUM2H);
}
if (picture->width & 1) {
RGB_TO_UV(x, y, SUM1);
if (import_alpha) {
#ifdef WEBP_EXPERIMENTAL_FEATURES
const uint8_t* const a_ptr = rgb + 3;
assert(step >= 4);
for (y = 0; y < height; ++y) {
for (x = 0; x < width; ++x) {
picture->a[x + y * picture->a_stride] =
a_ptr[step * x + y * rgb_stride];
}
}
#endif
}
return 1;
}
@@ -237,34 +567,38 @@ static int Import(WebPPicture* const picture,
int WebPPictureImportRGB(WebPPicture* const picture,
const uint8_t* const rgb, int rgb_stride) {
picture->colorspace &= ~WEBP_CSP_ALPHA_BIT;
if (!WebPPictureAlloc(picture)) return 0;
return Import(picture, rgb, rgb_stride, 3, 0);
return Import(picture, rgb, rgb_stride, 3, 0, 0);
}
int WebPPictureImportBGR(WebPPicture* const picture,
const uint8_t* const rgb, int rgb_stride) {
picture->colorspace &= ~WEBP_CSP_ALPHA_BIT;
if (!WebPPictureAlloc(picture)) return 0;
return Import(picture, rgb, rgb_stride, 3, 1);
return Import(picture, rgb, rgb_stride, 3, 1, 0);
}
int WebPPictureImportRGBA(WebPPicture* const picture,
const uint8_t* const rgba, int rgba_stride) {
picture->colorspace |= WEBP_CSP_ALPHA_BIT;
if (!WebPPictureAlloc(picture)) return 0;
return Import(picture, rgba, rgba_stride, 4, 0);
return Import(picture, rgba, rgba_stride, 4, 0, 1);
}
int WebPPictureImportBGRA(WebPPicture* const picture,
const uint8_t* const rgba, int rgba_stride) {
picture->colorspace |= WEBP_CSP_ALPHA_BIT;
if (!WebPPictureAlloc(picture)) return 0;
return Import(picture, rgba, rgba_stride, 4, 1);
return Import(picture, rgba, rgba_stride, 4, 1, 1);
}
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// Simplest call:
typedef int (*Importer)(WebPPicture* const, const uint8_t* const, int);
static size_t Encode(const uint8_t* rgb, int width, int height, int stride,
static size_t Encode(const uint8_t* rgba, int width, int height, int stride,
Importer import, float quality_factor, uint8_t** output) {
size_t output_size = 0;
WebPPicture pic;
@@ -286,7 +620,7 @@ static size_t Encode(const uint8_t* rgb, int width, int height, int stride,
wrt.size = &output_size;
InitMemoryWriter(&wrt);
ok = import(&pic, rgb, stride) && WebPEncode(&config, &pic);
ok = import(&pic, rgba, stride) && WebPEncode(&config, &pic);
WebPPictureFree(&pic);
if (!ok) {
free(*output);
@@ -309,7 +643,7 @@ ENCODE_FUNC(WebPEncodeBGRA, WebPPictureImportBGRA);
#undef ENCODE_FUNC
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
#if defined(__cplusplus) || defined(c_plusplus)
} // extern "C"

View File

@@ -33,13 +33,13 @@
extern "C" {
#endif
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// Clamps 'v' to the range [m, M]: returns m if v < m, M if v > M, v otherwise.
// The lower bound is tested first.
static inline int clip(int v, int m, int M) {
  if (v < m) return m;
  if (v > M) return M;
  return v;
}
const uint8_t VP8Zigzag[16] = {
static const uint8_t kZigzag[16] = {
0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
};
@@ -132,7 +132,7 @@ static const uint8_t kFreqSharpening[16] = {
90, 90, 90, 90
};
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// Initialize quantization parameters in VP8Matrix
// Returns the average quantizer
@@ -143,7 +143,7 @@ static int ExpandMatrix(VP8Matrix* const m, int type) {
m->q_[i] = m->q_[1];
}
for (i = 0; i < 16; ++i) {
const int j = VP8Zigzag[i];
const int j = kZigzag[i];
const int bias = kBiasMatrices[type][j];
m->iq_[j] = (1 << QFIX) / m->q_[j];
m->bias_[j] = BIAS(bias);
@@ -192,7 +192,7 @@ static void SetupMatrices(VP8Encoder* enc) {
}
}
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// Initialize filtering parameters
// Very small filter-strength values have close to no visual effect. So we can
@@ -214,7 +214,7 @@ static void SetupFilterStrength(VP8Encoder* const enc) {
enc->filter_hdr_.sharpness_ = enc->config_->filter_sharpness;
}
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// Note: if you change the values below, remember that the max range
// allowed by the syntax for DQ_UV is [-16,16].
@@ -286,7 +286,7 @@ void VP8SetSegmentParams(VP8Encoder* const enc, float quality) {
SetupFilterStrength(enc); // initialize segments' filtering, eventually
}
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// Form the predictions in cache
// Must be ordered using {DC_PRED, TM_PRED, V_PRED, H_PRED} as index
@@ -316,7 +316,7 @@ void VP8MakeIntra4Preds(const VP8EncIterator* const it) {
VP8EncPredLuma4(it->yuv_p_, it->i4_top_);
}
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// Quantize
// Layout:
@@ -341,7 +341,7 @@ const int VP8Scan[16 + 4 + 4] = {
8 + 0 * BPS, 12 + 0 * BPS, 8 + 4 * BPS, 12 + 4 * BPS // V
};
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// Distortion measurement
static const uint16_t kWeightY[16] = {
@@ -384,7 +384,7 @@ static void AddScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
dst->score += src->score;
}
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// Performs trellis-optimized quantization.
// Trellis
@@ -440,7 +440,7 @@ static int TrellisQuantizeBlock(const VP8EncIterator* const it,
// compute maximal distortion.
max_error = 0;
for (n = first; n < 16; ++n) {
const int j = VP8Zigzag[n];
const int j = kZigzag[n];
const int err = in[j] * in[j];
max_error += kWeightTrellis[j] * err;
if (err > thresh) last = n;
@@ -464,7 +464,7 @@ static int TrellisQuantizeBlock(const VP8EncIterator* const it,
// traverse trellis.
for (n = first; n <= last; ++n) {
const int j = VP8Zigzag[n];
const int j = kZigzag[n];
const int Q = mtx->q_[j];
const int iQ = mtx->iq_[j];
const int B = BIAS(0x00); // neutral bias
@@ -560,7 +560,7 @@ static int TrellisQuantizeBlock(const VP8EncIterator* const it,
for (; n >= first; --n) {
const Node* const node = &NODE(n, best_node);
const int j = VP8Zigzag[n];
const int j = kZigzag[n];
out[n] = node->sign ? -node->level : node->level;
nz |= (node->level != 0);
in[j] = out[n] * mtx->q_[j];
@@ -571,7 +571,7 @@ static int TrellisQuantizeBlock(const VP8EncIterator* const it,
#undef NODE
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// Performs: difference, transform, quantize, back-transform, add
// all at once. Output is the reconstructed block in *yuv_out, and the
// quantized levels in *levels.
@@ -615,8 +615,8 @@ static int ReconstructIntra16(VP8EncIterator* const it,
// Transform back
VP8ITransformWHT(dc_tmp, tmp[0]);
for (n = 0; n < 16; ++n) {
VP8ITransform(ref + VP8Scan[n], tmp[n], yuv_out + VP8Scan[n]);
for (n = 0; n < 16; n += 2) {
VP8ITransform(ref + VP8Scan[n], tmp[n], yuv_out + VP8Scan[n], 1);
}
return nz;
@@ -642,7 +642,7 @@ static int ReconstructIntra4(VP8EncIterator* const it,
} else {
nz = VP8EncQuantizeBlock(tmp, levels, 0, &dqm->y1_);
}
VP8ITransform(ref, tmp, yuv_out);
VP8ITransform(ref, tmp, yuv_out, 0);
return nz;
}
@@ -666,8 +666,8 @@ static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd,
for (x = 0; x < 2; ++x, ++n) {
const int ctx = it->top_nz_[4 + ch + x] + it->left_nz_[4 + ch + y];
const int non_zero =
TrellisQuantizeBlock(it, tmp[n], rd->uv_levels[n], ctx, 2, &dqm->uv_,
dqm->lambda_trellis_uv_);
TrellisQuantizeBlock(it, tmp[n], rd->uv_levels[n], ctx, 2,
&dqm->uv_, dqm->lambda_trellis_uv_);
it->top_nz_[4 + ch + x] = it->left_nz_[4 + ch + y] = non_zero;
nz |= non_zero << n;
}
@@ -679,13 +679,13 @@ static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd,
}
}
for (n = 0; n < 8; ++n) {
VP8ITransform(ref + VP8Scan[16 + n], tmp[n], yuv_out + VP8Scan[16 + n]);
for (n = 0; n < 8; n += 2) {
VP8ITransform(ref + VP8Scan[16 + n], tmp[n], yuv_out + VP8Scan[16 + n], 1);
}
return (nz << 16);
}
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// RD-opt decision. Reconstruct each mode, evaluate distortion and bit-cost.
// Pick the mode with the lowest RD-cost = Rate + lambda * Distortion.
@@ -738,7 +738,7 @@ static void PickBestIntra16(VP8EncIterator* const it, VP8ModeScore* const rd) {
VP8SetIntra16Mode(it, rd->mode_i16);
}
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// return the cost array corresponding to the surrounding prediction modes.
static const uint16_t* GetCostModeI4(VP8EncIterator* const it,
@@ -757,10 +757,15 @@ static int PickBestIntra4(VP8EncIterator* const it, VP8ModeScore* const rd) {
const int tlambda = dqm->tlambda_;
const uint8_t* const src0 = it->yuv_in_ + Y_OFF;
uint8_t* const best_blocks = it->yuv_out2_ + Y_OFF;
int total_header_bits = 0;
VP8ModeScore rd_best;
if (enc->max_i4_header_bits_ == 0) {
return 0;
}
InitScore(&rd_best);
rd_best.score = 0;
rd_best.score = 211; // '211' is the value of VP8BitCost(0, 145)
VP8IteratorStartI4(it);
do {
VP8ModeScore rd_i4;
@@ -799,7 +804,9 @@ static int PickBestIntra4(VP8EncIterator* const it, VP8ModeScore* const rd) {
}
SetRDScore(dqm->lambda_mode_, &rd_i4);
AddScore(&rd_best, &rd_i4);
if (rd_best.score >= rd->score) {
total_header_bits += mode_costs[best_mode];
if (rd_best.score >= rd->score ||
total_header_bits > enc->max_i4_header_bits_) {
return 0;
}
// Copy selected samples if not in the right place already.
@@ -817,7 +824,7 @@ static int PickBestIntra4(VP8EncIterator* const it, VP8ModeScore* const rd) {
return 1; // select intra4x4 over intra16x16
}
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {
VP8Encoder* const enc = it->enc_;
@@ -855,7 +862,7 @@ static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {
AddScore(rd, &rd_best);
}
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// Final reconstruction and quantization.
static void SimpleQuantize(VP8EncIterator* const it, VP8ModeScore* const rd) {
@@ -882,7 +889,7 @@ static void SimpleQuantize(VP8EncIterator* const it, VP8ModeScore* const rd) {
rd->nz = nz;
}
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// Entry point
int VP8Decimate(VP8EncIterator* const it, VP8ModeScore* const rd, int rd_opt) {

View File

@@ -26,7 +26,7 @@ extern "C" {
#define MAX_PARTITION0_SIZE (1 << 19) // max size of mode partition
#define MAX_PARTITION_SIZE (1 << 24) // max size for token partition
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// Writers for header's various pieces (in order of appearance)
// Main keyframe header
@@ -39,26 +39,31 @@ static void PutLE32(uint8_t* const data, uint32_t val) {
}
static int PutHeader(int profile, size_t size0, size_t total_size,
const WebPPicture* const pic) {
WebPPicture* const pic) {
uint8_t buf[KHEADER_SIZE];
uint8_t RIFF[KRIFF_SIZE] = {
'R', 'I', 'F', 'F', 0, 0, 0, 0, 'W', 'E', 'B', 'P', 'V', 'P', '8', ' '
};
uint32_t bits;
if (size0 >= MAX_PARTITION0_SIZE) {
return 0; // partition #0 is too big to fit
if (size0 >= MAX_PARTITION0_SIZE) { // partition #0 is too big to fit
return WebPEncodingSetError(pic, VP8_ENC_ERROR_PARTITION0_OVERFLOW);
}
PutLE32(RIFF + 4, total_size + KSIZE_OFFSET);
PutLE32(RIFF + 16, total_size);
if (!pic->writer(RIFF, sizeof(RIFF), pic))
return 0;
if (total_size > 0xfffffffeU - KRIFF_SIZE) {
return WebPEncodingSetError(pic, VP8_ENC_ERROR_FILE_TOO_BIG);
}
bits = 0 // keyframe (1b)
| (profile << 1) // profile (3b)
| (1 << 4) // visible (1b)
| (size0 << 5); // partition length (19b)
PutLE32(RIFF + 4, (uint32_t)(total_size + KSIZE_OFFSET));
PutLE32(RIFF + 16, (uint32_t)total_size);
if (!pic->writer(RIFF, sizeof(RIFF), pic)) {
return WebPEncodingSetError(pic, VP8_ENC_ERROR_BAD_WRITE);
}
bits = 0 // keyframe (1b)
| (profile << 1) // profile (3b)
| (1 << 4) // visible (1b)
| ((uint32_t)size0 << 5); // partition length (19b)
buf[0] = bits & 0xff;
buf[1] = (bits >> 8) & 0xff;
buf[2] = (bits >> 16) & 0xff;
@@ -138,13 +143,13 @@ static void PutQuant(VP8BitWriter* const bw,
// Partition sizes
static int EmitPartitionsSize(const VP8Encoder* const enc,
const WebPPicture* const pic) {
WebPPicture* const pic) {
uint8_t buf[3 * (MAX_NUM_PARTITIONS - 1)];
int p;
for (p = 0; p < enc->num_parts_ - 1; ++p) {
const size_t part_size = VP8BitWriterSize(enc->parts_ + p);
if (part_size >= MAX_PARTITION_SIZE) {
return 0; // partition is too big to fit
return WebPEncodingSetError(pic, VP8_ENC_ERROR_PARTITION_OVERFLOW);
}
buf[3 * p + 0] = (part_size >> 0) & 0xff;
buf[3 * p + 1] = (part_size >> 8) & 0xff;
@@ -153,16 +158,69 @@ static int EmitPartitionsSize(const VP8Encoder* const enc,
return p ? pic->writer(buf, 3 * p, pic) : 1;
}
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
#ifdef WEBP_EXPERIMENTAL_FEATURES
#define KTRAILER_SIZE 8
// Writes the low 24 bits of 'value' into buf[0..2], least-significant byte
// first. Any bits above bit 23 are dropped.
static void PutLE24(uint8_t* buf, size_t value) {
  int i;
  for (i = 0; i < 3; ++i) {
    buf[i] = (value >> (8 * i)) & 0xff;
  }
}
// Appends the extension payloads and a KTRAILER_SIZE-byte trailer to the
// encoder's bit-writers.
// Trailer layout, as filled in below:
//   bytes 0..2 : layer data size (24 bits, little-endian)
//   byte  3    : picture colorspace masked with WEBP_CSP_UV_MASK
//   bytes 4..6 : alpha data size (24 bits, little-endian)
//   byte  7    : marker (0x01)
// Layer data, when present, is appended to the last entry of enc->parts_;
// alpha data, when present, is appended to enc->bw_ just before the trailer.
// Returns 1 on success. If any append fails, the error code
// VP8_ENC_ERROR_BITSTREAM_OUT_OF_MEMORY is recorded on the picture through
// WebPEncodingSetError() and that call's result is returned.
// NOTE(review): the sizes are stored on 24 bits only; nothing here rejects a
// layer or alpha blob of 2^24 bytes or more -- confirm the producers bound it.
static int WriteExtensions(VP8Encoder* const enc) {
uint8_t buffer[KTRAILER_SIZE];
VP8BitWriter* const bw = &enc->bw_;
WebPPicture* const pic = enc->pic_;
// Layer (bytes 0..3)
PutLE24(buffer + 0, enc->layer_data_size_);
buffer[3] = enc->pic_->colorspace & WEBP_CSP_UV_MASK;
if (enc->layer_data_size_ > 0) {
assert(enc->use_layer_);
// append layer data to last partition
if (!VP8BitWriterAppend(&enc->parts_[enc->num_parts_ - 1],
enc->layer_data_, enc->layer_data_size_)) {
return WebPEncodingSetError(pic, VP8_ENC_ERROR_BITSTREAM_OUT_OF_MEMORY);
}
}
// Alpha (bytes 4..6)
PutLE24(buffer + 4, enc->alpha_data_size_);
if (enc->alpha_data_size_ > 0) {
assert(enc->has_alpha_);
// alpha data goes into the same writer as the trailer, ahead of it
if (!VP8BitWriterAppend(bw, enc->alpha_data_, enc->alpha_data_size_)) {
return WebPEncodingSetError(pic, VP8_ENC_ERROR_BITSTREAM_OUT_OF_MEMORY);
}
}
buffer[KTRAILER_SIZE - 1] = 0x01; // marker
// the trailer itself is appended last
if (!VP8BitWriterAppend(bw, buffer, KTRAILER_SIZE)) {
return WebPEncodingSetError(pic, VP8_ENC_ERROR_BITSTREAM_OUT_OF_MEMORY);
}
return 1;
}
#endif /* WEBP_EXPERIMENTAL_FEATURES */
//------------------------------------------------------------------------------
static size_t GeneratePartition0(VP8Encoder* const enc) {
VP8BitWriter* const bw = &enc->bw_;
const int mb_size = enc->mb_w_ * enc->mb_h_;
uint64_t pos1, pos2, pos3;
#ifdef WEBP_EXPERIMENTAL_FEATURES
const int need_extensions = enc->has_alpha_ || enc->use_layer_;
#endif
pos1 = VP8BitWriterPos(bw);
VP8BitWriterInit(bw, mb_size * 7 / 8); // ~7 bits per macroblock
#ifdef WEBP_EXPERIMENTAL_FEATURES
VP8PutBitUniform(bw, need_extensions); // extensions
#else
VP8PutBitUniform(bw, 0); // colorspace
#endif
VP8PutBitUniform(bw, 0); // clamp type
PutSegmentHeader(bw, enc);
@@ -174,11 +232,20 @@ static size_t GeneratePartition0(VP8Encoder* const enc) {
pos2 = VP8BitWriterPos(bw);
VP8CodeIntraModes(enc);
VP8BitWriterFinish(bw);
#ifdef WEBP_EXPERIMENTAL_FEATURES
if (need_extensions && !WriteExtensions(enc)) {
return 0;
}
#endif
pos3 = VP8BitWriterPos(bw);
if (enc->pic_->stats) {
enc->pic_->stats->header_bytes[0] = (int)((pos2 - pos1 + 7) >> 3);
enc->pic_->stats->header_bytes[1] = (int)((pos3 - pos2 + 7) >> 3);
enc->pic_->stats->alpha_data_size = (int)enc->alpha_data_size_;
enc->pic_->stats->layer_data_size = (int)enc->layer_data_size_;
}
return !bw->error_;
}
@@ -191,7 +258,7 @@ int VP8EncWrite(VP8Encoder* const enc) {
int p;
// Partition #0 with header and partition sizes
ok = GeneratePartition0(enc);
ok = !!GeneratePartition0(enc);
// Compute total size (for the RIFF header)
coded_size = KHEADER_SIZE + VP8BitWriterSize(bw) + 3 * (enc->num_parts_ - 1);
@@ -226,11 +293,11 @@ int VP8EncWrite(VP8Encoder* const enc) {
ok = pic->writer(pad_byte, 1, pic);
}
enc->coded_size_ = coded_size + KRIFF_SIZE;
enc->coded_size_ = (int)coded_size + KRIFF_SIZE;
return ok;
}
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
#if defined(__cplusplus) || defined(c_plusplus)
} // extern "C"

View File

@@ -15,7 +15,7 @@
extern "C" {
#endif
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// Default probabilities
// Paragraph 13.5
@@ -343,7 +343,7 @@ void VP8CodeIntraModes(VP8Encoder* const enc) {
} while (VP8IteratorNext(&it, 0));
}
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// Paragraph 13
const uint8_t

View File

@@ -13,20 +13,24 @@
#define WEBP_ENC_VP8ENCI_H_
#include "string.h" // for memcpy()
#include "webp/encode.h"
#include "bit_writer.h"
#include "../webp/encode.h"
#include "../dsp/dsp.h"
#include "../utils/bit_writer.h"
#if defined(__cplusplus) || defined(c_plusplus)
extern "C" {
#endif
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// Various defines and enums
// version numbers
#define ENC_MAJ_VERSION 0
#define ENC_MIN_VERSION 1
#define ENC_REV_VERSION 2
#define ENC_REV_VERSION 3
// size of histogram used by CollectHistogram.
#define MAX_COEFF_THRESH 64
// intra prediction modes
enum { B_DC_PRED = 0, // 4x4 modes
@@ -158,7 +162,7 @@ static inline int QUANTDIV(int n, int iQ, int B) {
}
extern const uint8_t VP8Zigzag[16];
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// Headers
typedef uint8_t ProbaArray[NUM_CTX][NUM_PROBAS];
@@ -184,7 +188,7 @@ typedef struct {
StatsArray stats_[NUM_TYPES][NUM_BANDS]; // 7.4k
CostArray level_cost_[NUM_TYPES][NUM_BANDS]; // 11.4k
int use_skip_proba_; // Note: we always use skip_proba for now.
int nb_skip_, nb_i4_, nb_i16_; // block type counters
int nb_skip_; // number of skipped blocks
} VP8Proba;
// Filter parameters. Not actually used in the code (we don't perform
@@ -196,19 +200,19 @@ typedef struct {
int i4x4_lf_delta_; // delta filter level for i4x4 relative to i16x16
} VP8FilterHeader;
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// Information about the macroblocks.
typedef struct {
// block type
uint8_t type_:2; // 0=i4x4, 1=i16x16
uint8_t uv_mode_:2;
uint8_t skip_:1;
uint8_t segment_:2;
unsigned int type_:2; // 0=i4x4, 1=i16x16
unsigned int uv_mode_:2;
unsigned int skip_:1;
unsigned int segment_:2;
uint8_t alpha_; // quantization-susceptibility
} VP8MBInfo;
typedef struct {
typedef struct VP8Matrix {
uint16_t q_[16]; // quantizer steps
uint16_t iq_[16]; // reciprocals, fixed point.
uint16_t bias_[16]; // rounding bias
@@ -258,7 +262,7 @@ typedef struct {
uint8_t* preds_; // intra mode predictors (4x4 blocks)
uint32_t* nz_; // non-zero pattern
uint8_t i4_boundary_[37]; // 32+5 boundary samples needed by intra4x4
uint8_t* i4_top_; // pointer to the current *top boundary sample
uint8_t* i4_top_; // pointer to the current top boundary sample
int i4_; // current intra4x4 mode being tested
int top_nz_[9]; // top-non-zero context.
int left_nz_[9]; // left-non-zero. left_nz[8] is independent.
@@ -302,7 +306,7 @@ void VP8SetSkip(const VP8EncIterator* const it, int skip);
void VP8SetSegment(const VP8EncIterator* const it, int segment);
void VP8IteratorResetCosts(VP8EncIterator* const it);
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// VP8Encoder
struct VP8Encoder {
@@ -326,6 +330,17 @@ struct VP8Encoder {
VP8BitWriter bw_; // part0
VP8BitWriter parts_[MAX_NUM_PARTITIONS]; // token partitions
// transparency blob
int has_alpha_;
uint8_t* alpha_data_; // non-NULL if transparency is present
size_t alpha_data_size_;
// enhancement layer
int use_layer_;
VP8BitWriter layer_bw_;
uint8_t* layer_data_;
size_t layer_data_size_;
// quantization info (one set of DC/AC dequant factor per segment)
VP8SegmentInfo dqm_[NUM_MB_SEGMENTS];
int base_quant_; // nominal quantizer value. Only used
@@ -345,8 +360,9 @@ struct VP8Encoder {
int block_count_[3];
// quality/speed settings
int method_; // 0=fastest, 6=best/slowest.
int rd_opt_level_; // Deduced from method_.
int method_; // 0=fastest, 6=best/slowest.
int rd_opt_level_; // Deduced from method_.
int max_i4_header_bits_; // partition #0 safeness factor
// Memory
VP8MBInfo* mb_info_; // contextual macroblock infos (mb_w_ + 1)
@@ -366,7 +382,7 @@ struct VP8Encoder {
LFStats *lf_stats_; // autofilter stats (if NULL, autofilter is off)
};
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// internal functions. Not public.
// in tree.c
@@ -403,6 +419,10 @@ int VP8GetCostUV(VP8EncIterator* const it, const VP8ModeScore* const rd);
int VP8EncLoop(VP8Encoder* const enc);
int VP8StatLoop(VP8Encoder* const enc);
// in webpenc.c
// Assign an error code to a picture. Return false for convenience.
int WebPEncodingSetError(WebPPicture* const pic, WebPEncodingError error);
// in analysis.c
// Main analysis loop. Decides the segmentations and complexity.
// Assigns a first guess for Intra16 and uvmode_ prediction modes.
@@ -414,58 +434,27 @@ void VP8SetSegmentParams(VP8Encoder* const enc, float quality);
// Pick best modes and fills the levels. Returns true if skipped.
int VP8Decimate(VP8EncIterator* const it, VP8ModeScore* const rd, int rd_opt);
// in dsp.c
// Transforms
typedef void (*VP8Idct)(const uint8_t* ref, const int16_t* in, uint8_t* dst);
typedef void (*VP8Fdct)(const uint8_t* src, const uint8_t* ref, int16_t* out);
typedef void (*VP8WHT)(const int16_t* in, int16_t* out);
extern VP8Idct VP8ITransform;
extern VP8Fdct VP8FTransform;
extern VP8WHT VP8ITransformWHT;
extern VP8WHT VP8FTransformWHT;
// Predictions
// *dst is the destination block. *top, *top_right and *left can be NULL.
typedef void (*VP8IntraPreds)(uint8_t *dst, const uint8_t* left,
const uint8_t* top);
typedef void (*VP8Intra4Preds)(uint8_t *dst, const uint8_t* top);
extern VP8Intra4Preds VP8EncPredLuma4;
extern VP8IntraPreds VP8EncPredLuma16;
extern VP8IntraPreds VP8EncPredChroma8;
// in alpha.c
void VP8EncInitAlpha(VP8Encoder* enc); // initialize alpha compression
void VP8EncCodeAlphaBlock(VP8EncIterator* it); // analyze or code a macroblock
int VP8EncFinishAlpha(VP8Encoder* enc); // finalize compressed data
void VP8EncDeleteAlpha(VP8Encoder* enc); // delete compressed data
typedef int (*VP8Metric)(const uint8_t* pix, const uint8_t* ref);
extern VP8Metric VP8SSE16x16, VP8SSE16x8, VP8SSE8x8, VP8SSE4x4;
typedef int (*VP8WMetric)(const uint8_t* pix, const uint8_t* ref,
const uint16_t* const weights);
extern VP8WMetric VP8TDisto4x4, VP8TDisto16x16;
typedef void (*VP8BlockCopy)(const uint8_t* src, uint8_t* dst);
extern VP8BlockCopy VP8Copy4x4;
extern VP8BlockCopy VP8Copy8x8;
extern VP8BlockCopy VP8Copy16x16;
// Quantization
typedef int (*VP8QuantizeBlock)(int16_t in[16], int16_t out[16],
int n, const VP8Matrix* const mtx);
extern VP8QuantizeBlock VP8EncQuantizeBlock;
typedef enum {
kSSE2,
kSSE3
} CPUFeature;
// returns true if the CPU supports the feature.
typedef int (*VP8CPUInfo)(CPUFeature feature);
extern VP8CPUInfo CPUInfo;
void VP8EncDspInit(void); // must be called before using any of the above
// in layer.c
void VP8EncInitLayer(VP8Encoder* const enc); // init everything
void VP8EncCodeLayerBlock(VP8EncIterator* it); // code one more macroblock
int VP8EncFinishLayer(VP8Encoder* const enc); // finalize coding
void VP8EncDeleteLayer(VP8Encoder* enc); // reclaim memory
// in filter.c
extern void VP8InitFilter(VP8EncIterator* const it);
extern void VP8StoreFilterStats(VP8EncIterator* const it);
extern void VP8AdjustFilterStrength(VP8EncIterator* const it);
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
#if defined(__cplusplus) || defined(c_plusplus)
} // extern "C"
#endif
#endif // WEBP_ENC_VP8ENCI_H_
#endif /* WEBP_ENC_VP8ENCI_H_ */

View File

@@ -9,6 +9,7 @@
//
// Author: Skal (pascal.massimino@gmail.com)
#include <assert.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
@@ -25,17 +26,15 @@ extern "C" {
#include <stdio.h>
#endif
#define MAX_DIMENSION 16384 // maximum width/height allowed by the spec
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
int WebPGetEncoderVersion(void) {
return (ENC_MAJ_VERSION << 16) | (ENC_MIN_VERSION << 8) | ENC_REV_VERSION;
}
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// WebPPicture
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
static int DummyWriter(const uint8_t* data, size_t data_size,
const WebPPicture* const picture) {
@@ -53,13 +52,14 @@ int WebPPictureInitInternal(WebPPicture* const picture, int version) {
if (picture) {
memset(picture, 0, sizeof(*picture));
picture->writer = DummyWriter;
WebPEncodingSetError(picture, VP8_ENC_OK);
}
return 1;
}
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// VP8Encoder
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
static void ResetSegmentHeader(VP8Encoder* const enc) {
VP8SegmentHeader* const hdr = &enc->segment_hdr_;
@@ -110,11 +110,15 @@ static void ResetBoundaryPredictions(VP8Encoder* const enc) {
// Translates the user configuration into the encoder's internal settings:
//  - method_ is copied from config_->method,
//  - rd_opt_level_ is 3, 2, 1 or 0 for method >= 6, >= 5, >= 3 or below,
//  - max_i4_header_bits_ is scaled by the square of
//    (100 - config_->partition_limit).
static void MapConfigToTools(VP8Encoder* const enc) {
  const int method = enc->config_->method;
  const int limit = 100 - enc->config_->partition_limit;
  int rd_opt_level;

  if (method >= 6) {
    rd_opt_level = 3;
  } else if (method >= 5) {
    rd_opt_level = 2;
  } else if (method >= 3) {
    rd_opt_level = 1;
  } else {
    rd_opt_level = 0;
  }

  enc->method_ = method;
  enc->rd_opt_level_ = rd_opt_level;
  enc->max_i4_header_bits_ =
      256 * 16 * 16 *                 // upper bound: up to 16bit per 4x4 block
      (limit * limit) / (100 * 100);  // ... modulated with a quadratic curve.
}
// Memory scaling with dimensions:
@@ -155,7 +159,8 @@ static VP8Encoder* InitEncoder(const WebPConfig* const config,
16 + 16 + 16 + 8 + 1 + // left y/u/v
2 * ALIGN_CST) // align all
* sizeof(uint8_t);
const size_t lf_stats_size = config->autofilter ? sizeof(LFStats) : 0;
const size_t lf_stats_size =
config->autofilter ? sizeof(LFStats) + ALIGN_CST : 0;
VP8Encoder* enc;
uint8_t* mem;
size_t size = sizeof(VP8Encoder) + ALIGN_CST // main struct
@@ -193,7 +198,10 @@ static VP8Encoder* InitEncoder(const WebPConfig* const config,
printf("===================================\n");
#endif
mem = (uint8_t*)malloc(size);
if (mem == NULL) return NULL;
if (mem == NULL) {
WebPEncodingSetError(picture, VP8_ENC_ERROR_OUT_OF_MEMORY);
return NULL;
}
enc = (VP8Encoder*)mem;
mem = (uint8_t*)DO_ALIGN(mem + sizeof(*enc));
memset(enc, 0, sizeof(*enc));
@@ -215,7 +223,7 @@ static VP8Encoder* InitEncoder(const WebPConfig* const config,
mem += preds_w * preds_h * sizeof(uint8_t);
enc->nz_ = 1 + (uint32_t*)mem;
mem += nz_size;
enc->lf_stats_ = lf_stats_size ? (LFStats*)mem : NULL;
enc->lf_stats_ = lf_stats_size ? (LFStats*)DO_ALIGN(mem) : NULL;
mem += lf_stats_size;
// top samples (all 16-aligned)
@@ -242,14 +250,25 @@ static VP8Encoder* InitEncoder(const WebPConfig* const config,
ResetFilterHeader(enc);
ResetBoundaryPredictions(enc);
#ifdef WEBP_EXPERIMENTAL_FEATURES
VP8EncInitAlpha(enc);
VP8EncInitLayer(enc);
#endif
return enc;
}
static void DeleteEncoder(VP8Encoder* enc) {
free(enc);
if (enc) {
#ifdef WEBP_EXPERIMENTAL_FEATURES
VP8EncDeleteAlpha(enc);
VP8EncDeleteLayer(enc);
#endif
free(enc);
}
}
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
static double GetPSNR(uint64_t err, uint64_t size) {
return err ? 10. * log10(255. * 255. * size / err) : 99.;
@@ -284,31 +303,46 @@ static void StoreStats(VP8Encoder* const enc) {
}
}
//-----------------------------------------------------------------------------
// Stores 'error' in pic->error_code and returns 0, so that a failing caller
// can write 'return WebPEncodingSetError(pic, code);' in one statement.
// 'pic' is dereferenced without a NULL check; callers must pass a valid
// picture.
int WebPEncodingSetError(WebPPicture* const pic, WebPEncodingError error) {
// Debug-only range check: 'error' must lie within
// [VP8_ENC_OK, VP8_ENC_ERROR_BAD_WRITE].
assert((int)error <= VP8_ENC_ERROR_BAD_WRITE);
assert((int)error >= VP8_ENC_OK);
pic->error_code = error;
return 0;
}
//------------------------------------------------------------------------------
int WebPEncode(const WebPConfig* const config, WebPPicture* const pic) {
VP8Encoder* enc;
int ok;
if (config == NULL || pic == NULL)
return 0; // bad params
if (pic == NULL)
return 0;
WebPEncodingSetError(pic, VP8_ENC_OK); // all ok so far
if (config == NULL) // bad params
return WebPEncodingSetError(pic, VP8_ENC_ERROR_NULL_PARAMETER);
if (!WebPValidateConfig(config))
return 0; // invalid config.
return WebPEncodingSetError(pic, VP8_ENC_ERROR_INVALID_CONFIGURATION);
if (pic->width <= 0 || pic->height <= 0)
return 0; // invalid parameters
return WebPEncodingSetError(pic, VP8_ENC_ERROR_BAD_DIMENSION);
if (pic->y == NULL || pic->u == NULL || pic->v == NULL)
return 0; // invalid parameters
if (pic->width >= MAX_DIMENSION || pic->height >= MAX_DIMENSION)
return 0; // image is too big
return WebPEncodingSetError(pic, VP8_ENC_ERROR_NULL_PARAMETER);
if (pic->width > WEBP_MAX_DIMENSION || pic->height > WEBP_MAX_DIMENSION)
return WebPEncodingSetError(pic, VP8_ENC_ERROR_BAD_DIMENSION);
enc = InitEncoder(config, pic);
if (enc == NULL) return 0;
if (enc == NULL) return 0; // pic->error is already set.
ok = VP8EncAnalyze(enc)
&& VP8StatLoop(enc)
&& VP8EncLoop(enc)
#ifdef WEBP_EXPERIMENTAL_FEATURES
&& VP8EncFinishAlpha(enc)
&& VP8EncFinishLayer(enc)
#endif
&& VP8EncWrite(enc);
StoreStats(enc);
DeleteEncoder(enc);
return ok;
}

13
src/utils/Makefile.am Normal file
View File

@@ -0,0 +1,13 @@
AM_CPPFLAGS = -I$(top_srcdir)/src
libwebputils_la_SOURCES = bit_reader.h bit_reader.c \
bit_writer.h bit_writer.c \
thread.h thread.c
libwebputils_la_LDFLAGS = -version-info 0:0:0
libwebputils_la_CPPFLAGS = $(USE_EXPERIMENTAL_CODE)
libwebputilsinclude_HEADERS = ../webp/types.h
libwebputilsincludedir = $(includedir)/webp
noinst_HEADERS = bit_reader.h bit_writer.h thread.h
noinst_LTLIBRARIES = libwebputils.la

View File

@@ -9,13 +9,13 @@
//
// Author: Skal (pascal.massimino@gmail.com)
#include "bits.h"
#include "./bit_reader.h"
#if defined(__cplusplus) || defined(c_plusplus)
extern "C" {
#endif
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// VP8BitReader
void VP8InitBitReader(VP8BitReader* const br,
@@ -56,7 +56,7 @@ const uint8_t kVP8NewRange[128] = {
241, 243, 245, 247, 249, 251, 253, 127
};
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// Higher-level calls
uint32_t VP8GetValue(VP8BitReader* const br, int bits) {
@@ -72,7 +72,7 @@ int32_t VP8GetSignedValue(VP8BitReader* const br, int bits) {
return VP8Get(br) ? -value : value;
}
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
#if defined(__cplusplus) || defined(c_plusplus)
} // extern "C"

View File

@@ -9,20 +9,21 @@
//
// Author: Skal (pascal.massimino@gmail.com)
#ifndef WEBP_DEC_BITS_H_
#define WEBP_DEC_BITS_H_
#ifndef WEBP_UTILS_BIT_READER_H_
#define WEBP_UTILS_BIT_READER_H_
#include <assert.h>
#include "webp/decode_vp8.h"
#include "../webp/decode_vp8.h"
#if defined(__cplusplus) || defined(c_plusplus)
extern "C" {
#endif
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// Bitreader and code-tree reader
typedef struct {
typedef struct VP8BitReader VP8BitReader;
struct VP8BitReader {
const uint8_t* buf_; // next byte to be read
const uint8_t* buf_end_; // end of read buffer
int eof_; // true if input is exhausted
@@ -31,7 +32,7 @@ typedef struct {
uint32_t range_; // current range minus 1. In [127, 254] interval.
uint32_t value_; // current value
int missing_; // number of missing bits in value_ (8bit)
} VP8BitReader;
};
// Initialize the bit reader and the boolean decoder.
void VP8InitBitReader(VP8BitReader* const br,
@@ -61,15 +62,16 @@ static inline uint32_t VP8GetByte(VP8BitReader* const br) {
static inline uint32_t VP8BitUpdate(VP8BitReader* const br, uint32_t split) {
uint32_t bit;
const uint32_t value_split = (split + 1) << 8;
// Make sure we have at least 8 bits in 'value_'
if (br->missing_ > 0) {
br->value_ |= VP8GetByte(br) << br->missing_;
br->missing_ -= 8;
}
bit = ((br->value_ >> 8) > split);
bit = (br->value_ >= value_split);
if (bit) {
br->range_ -= split + 1;
br->value_ -= (split + 1) << 8;
br->value_ -= value_split;
} else {
br->range_ = split;
}
@@ -104,4 +106,4 @@ static inline int VP8GetSigned(VP8BitReader* const br, int v) {
} // extern "C"
#endif
#endif // WEBP_DEC_BITS_H_
#endif /* WEBP_UTILS_BIT_READER_H_ */

View File

@@ -10,14 +10,15 @@
// Author: Skal (pascal.massimino@gmail.com)
#include <assert.h>
#include <string.h> // for memcpy()
#include <stdlib.h>
#include "vp8enci.h"
#include "./bit_writer.h"
#if defined(__cplusplus) || defined(c_plusplus)
extern "C" {
#endif
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// VP8BitWriter
static int BitWriterResize(VP8BitWriter* const bw, size_t extra_size) {
@@ -68,7 +69,7 @@ static void kFlush(VP8BitWriter* const bw) {
}
}
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// renormalization
static const uint8_t kNorm[128] = { // renorm_sizes[i] = 8 - log2(i)
@@ -84,7 +85,7 @@ static const uint8_t kNorm[128] = { // renorm_sizes[i] = 8 - log2(i)
};
// range = ((range + 1) << kVP8Log2Range[range]) - 1
const uint8_t kNewRange[128] = {
static const uint8_t kNewRange[128] = {
127, 127, 191, 127, 159, 191, 223, 127, 143, 159, 175, 191, 207, 223, 239,
127, 135, 143, 151, 159, 167, 175, 183, 191, 199, 207, 215, 223, 231, 239,
247, 127, 131, 135, 139, 143, 147, 151, 155, 159, 163, 167, 171, 175, 179,
@@ -147,7 +148,7 @@ void VP8PutSignedValue(VP8BitWriter* const bw, int value, int nb_bits) {
}
}
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
int VP8BitWriterInit(VP8BitWriter* const bw, size_t expected_size) {
bw->range_ = 255 - 1;
@@ -168,7 +169,17 @@ uint8_t* VP8BitWriterFinish(VP8BitWriter* const bw) {
return bw->buf_;
}
//-----------------------------------------------------------------------------
// Appends 'size' raw bytes taken from 'data' at the current write position.
// Returns 1 on success. Returns 0 if the writer still holds pending bits
// (nb_bits_ != -8) or if BitWriterResize() fails.
int VP8BitWriterAppend(VP8BitWriter* const bw,
                       const uint8_t* data, size_t size) {
  assert(data);
  // kFlush() must have been called
  if (bw->nb_bits_ != -8) {
    return 0;
  }
  if (!BitWriterResize(bw, size)) {
    return 0;
  }
  memcpy(bw->buf_ + bw->pos_, data, size);
  bw->pos_ += size;
  return 1;
}
//------------------------------------------------------------------------------
#if defined(__cplusplus) || defined(c_plusplus)
} // extern "C"

View File

@@ -9,16 +9,16 @@
//
// Author: Skal (pascal.massimino@gmail.com)
#ifndef WEBP_ENC_BIT_WRITER_H_
#define WEBP_ENC_BIT_WRITER_H_
#ifndef WEBP_UTILS_BIT_WRITER_H_
#define WEBP_UTILS_BIT_WRITER_H_
#include "vp8enci.h"
#include "../webp/types.h"
#if defined(__cplusplus) || defined(c_plusplus)
extern "C" {
#endif
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// Bit-writing
typedef struct VP8BitWriter VP8BitWriter;
@@ -39,6 +39,8 @@ int VP8PutBit(VP8BitWriter* const bw, int bit, int prob);
int VP8PutBitUniform(VP8BitWriter* const bw, int bit);
void VP8PutValue(VP8BitWriter* const bw, int value, int nb_bits);
void VP8PutSignedValue(VP8BitWriter* const bw, int value, int nb_bits);
int VP8BitWriterAppend(VP8BitWriter* const bw,
const uint8_t* data, size_t size);
// return approximate write position (in bits)
static inline uint64_t VP8BitWriterPos(const VP8BitWriter* const bw) {
@@ -52,10 +54,10 @@ static inline size_t VP8BitWriterSize(const VP8BitWriter* const bw) {
return bw->pos_;
}
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
#if defined(__cplusplus) || defined(c_plusplus)
} // extern "C"
#endif
#endif // WEBP_ENC_BIT_WRITER_H_
#endif /* WEBP_UTILS_BIT_WRITER_H_ */

243
src/utils/thread.c Normal file
View File

@@ -0,0 +1,243 @@
// Copyright 2011 Google Inc.
//
// This code is licensed under the same terms as WebM:
// Software License Agreement: http://www.webmproject.org/license/software/
// Additional IP Rights Grant: http://www.webmproject.org/license/additional/
// -----------------------------------------------------------------------------
//
// Multi-threaded worker
//
// Author: skal@google.com (Pascal Massimino)
#include <assert.h>
#include <string.h> // for memset()
#include "./thread.h"
#if defined(__cplusplus) || defined(c_plusplus)
extern "C" {
#endif
#ifdef WEBP_USE_THREAD
#if defined(_WIN32)
//------------------------------------------------------------------------------
// simplistic pthread emulation layer
#include <process.h>
// _beginthreadex requires __stdcall
#define THREADFN unsigned int __stdcall
#define THREAD_RETURN(val) (unsigned int)((DWORD_PTR)val)
// Win32 stand-in for pthread_create(): starts 'start(arg)' in a new thread
// through _beginthreadex() and stores the resulting handle in '*thread'.
// 'attr' is ignored. Returns 0 on success, 1 if the thread was not created.
static int pthread_create(pthread_t* const thread, const void* attr,
                          unsigned int (__stdcall *start)(void*), void* arg) {
  const pthread_t handle =
      (pthread_t)_beginthreadex(NULL,   /* void *security */
                                0,      /* unsigned stack_size */
                                start,
                                arg,
                                0,      /* unsigned initflag */
                                NULL);  /* unsigned *thrdaddr */
  (void)attr;
  *thread = handle;
  if (handle == NULL) {
    return 1;
  }
  SetThreadPriority(handle, THREAD_PRIORITY_ABOVE_NORMAL);
  return 0;
}
// Win32 stand-in for pthread_join(): blocks until 'thread' terminates, then
// closes its handle. 'value_ptr' is ignored.
// Returns 0 on success, 1 if either the wait or CloseHandle() failed.
static int pthread_join(pthread_t thread, void** value_ptr) {
  (void)value_ptr;
  if (WaitForSingleObject(thread, INFINITE) != WAIT_OBJECT_0) {
    return 1;  // the handle is left open, as CloseHandle() is not reached
  }
  return (CloseHandle(thread) == 0);
}
// Mutex
// Win32 stand-ins for the pthread mutex calls, backed by a CRITICAL_SECTION.
// Each wrapper forwards to the matching critical-section call and returns 0.

// Initializes 'mutex'. 'mutexattr' is ignored.
static int pthread_mutex_init(pthread_mutex_t* const mutex, void* mutexattr) {
  (void)mutexattr;
  InitializeCriticalSection(mutex);
  return 0;
}

// Acquires 'mutex'.
static int pthread_mutex_lock(pthread_mutex_t* const mutex) {
  EnterCriticalSection(mutex);
  return 0;
}

// Releases 'mutex'.
static int pthread_mutex_unlock(pthread_mutex_t* const mutex) {
  LeaveCriticalSection(mutex);
  return 0;
}

// Destroys 'mutex'.
static int pthread_mutex_destroy(pthread_mutex_t* const mutex) {
  DeleteCriticalSection(mutex);
  return 0;
}
// Condition
// Closes the three handles backing 'condition'. All three CloseHandle() calls
// are always attempted. Returns 0 if every one succeeded, 1 otherwise.
static int pthread_cond_destroy(pthread_cond_t* const condition) {
  const int waiting_closed = (CloseHandle(condition->waiting_sem_) != 0);
  const int received_closed = (CloseHandle(condition->received_sem_) != 0);
  const int event_closed = (CloseHandle(condition->signal_event_) != 0);
  return !(waiting_closed & received_closed & event_closed);
}
// Creates the two semaphores and the auto-reset event backing 'condition'.
// 'cond_attr' is ignored. Returns 0 on success. If any object could not be
// created, pthread_cond_destroy() is called on 'condition' and 1 is returned.
static int pthread_cond_init(pthread_cond_t* const condition, void* cond_attr) {
  (void)cond_attr;
  condition->waiting_sem_ = CreateSemaphore(NULL, 0, 1, NULL);
  condition->received_sem_ = CreateSemaphore(NULL, 0, 1, NULL);
  condition->signal_event_ = CreateEvent(NULL, FALSE, FALSE, NULL);

  if (condition->waiting_sem_ != NULL &&
      condition->received_sem_ != NULL &&
      condition->signal_event_ != NULL) {
    return 0;
  }
  pthread_cond_destroy(condition);
  return 1;
}
// Win32 stand-in for pthread_cond_signal().
// If a thread has registered itself in pthread_cond_wait() (waiting_sem_ can
// be taken without blocking), the event is set and this call then blocks
// until the waiter acknowledges through received_sem_. If nobody is waiting,
// nothing is done.
// Returns 0 on success (including the "no waiter" case), 1 on failure.
static int pthread_cond_signal(pthread_cond_t* const condition) {
  int ok = 1;
  if (WaitForSingleObject(condition->waiting_sem_, 0) == WAIT_OBJECT_0) {
    // a thread is waiting in pthread_cond_wait: allow it to be notified.
    // SetEvent() returns a BOOL; normalize it to 0/1 before combining.
    ok = (SetEvent(condition->signal_event_) != 0);
    // wait until the event is consumed so the signaler cannot consume
    // the event via its own pthread_cond_wait.
    // The wait succeeded when it returns WAIT_OBJECT_0: testing for '!=' here
    // would report an error on every successful hand-off.
    ok &= (WaitForSingleObject(condition->received_sem_, INFINITE) ==
           WAIT_OBJECT_0);
  }
  return !ok;
}
// Win32 stand-in for pthread_cond_wait().
// Registers the caller as a waiter (waiting_sem_), releases 'mutex', blocks on
// signal_event_, acknowledges the signal through received_sem_ and finally
// re-acquires 'mutex'.
// Returns 0 on success, 1 on failure. On the early failure path (the waiter
// could not be registered) 'mutex' has not been released, so it is still held
// by the caller on return in every case.
static int pthread_cond_wait(pthread_cond_t* const condition,
pthread_mutex_t* const mutex) {
int ok;
// note that there is a consumer available so the signal isn't dropped in
// pthread_cond_signal
if (!ReleaseSemaphore(condition->waiting_sem_, 1, NULL))
return 1;
// now unlock the mutex so pthread_cond_signal may be issued
pthread_mutex_unlock(mutex);
ok = (WaitForSingleObject(condition->signal_event_, INFINITE) ==
WAIT_OBJECT_0);
// tell the signaler (blocked on received_sem_) that the event was consumed
ok &= ReleaseSemaphore(condition->received_sem_, 1, NULL);
pthread_mutex_lock(mutex);
return !ok;
}
#else // _WIN32
# define THREADFN void*
# define THREAD_RETURN(val) val
#endif
//------------------------------------------------------------------------------
// Entry point of the worker thread; 'ptr' is the WebPWorker being serviced.
// Each iteration, under worker->mutex_:
//  - sleeps on condition_ for as long as status_ is OK (nothing to do),
//  - on WORK: calls hook(data1, data2) if a hook is set, ORs a failure into
//    had_error and puts status_ back to OK,
//  - on NOT_OK: leaves the loop so the thread terminates,
// then signals condition_ so the thread waiting in WebPWorkerChangeState()
// can proceed.
static THREADFN WebPWorkerThreadLoop(void *ptr) { // thread loop
WebPWorker* const worker = (WebPWorker*)ptr;
int done = 0;
while (!done) {
pthread_mutex_lock(&worker->mutex_);
while (worker->status_ == OK) { // wait in idling mode
pthread_cond_wait(&worker->condition_, &worker->mutex_);
}
if (worker->status_ == WORK) {
if (worker->hook) {
// had_error is sticky: it is only ever set here, never cleared
worker->had_error |= !worker->hook(worker->data1, worker->data2);
}
worker->status_ = OK;
} else if (worker->status_ == NOT_OK) { // finish the worker
done = 1;
}
// signal to the main thread that we're done (for Sync())
pthread_cond_signal(&worker->condition_);
pthread_mutex_unlock(&worker->mutex_);
}
return THREAD_RETURN(NULL); // Thread is finished
}
// main thread state control
// Called from the controlling thread to move 'worker' to 'new_status'.
// Blocks until the worker is idle (status_ == OK). If 'new_status' is not OK,
// it is then stored in status_ and the worker thread is signalled.
// With new_status == OK this is a pure "wait until idle" operation.
static void WebPWorkerChangeState(WebPWorker* const worker,
WebPWorkerStatus new_status) {
// no-op when attempting to change state on a thread that didn't come up
// (status_ is read here before the mutex is taken)
if (worker->status_ < OK) return;
pthread_mutex_lock(&worker->mutex_);
// wait for the worker to finish
while (worker->status_ != OK) {
pthread_cond_wait(&worker->condition_, &worker->mutex_);
}
// assign new status and release the working thread if needed
if (new_status != OK) {
worker->status_ = new_status;
pthread_cond_signal(&worker->condition_);
}
pthread_mutex_unlock(&worker->mutex_);
}
#endif
//------------------------------------------------------------------------------
// Puts 'worker' in its initial state: every byte is zeroed and status_ is
// set to NOT_OK. No thread or synchronization object is created here.
void WebPWorkerInit(WebPWorker* const worker) {
  memset(worker, 0, sizeof(WebPWorker));
  worker->status_ = NOT_OK;
}
// Waits for the worker to become idle (threaded builds only) and reports
// whether the error flag is clear: returns 1 if had_error is 0, else 0.
int WebPWorkerSync(WebPWorker* const worker) {
#ifdef WEBP_USE_THREAD
  // requesting the OK state only blocks until the pending job has finished
  WebPWorkerChangeState(worker, OK);
#endif
  assert(worker->status_ <= OK);
  return (worker->had_error == 0);
}
// Prepares 'worker' for a new job and clears had_error.
//  - If the worker is not up yet (status_ < OK): in threaded builds the mutex
//    and condition are created and the thread is spawned; otherwise status_
//    is simply set to OK.
//  - If the worker is busy (status_ > OK): waits for it via WebPWorkerSync().
// Returns 1 on success, 0 on failure. On failure every synchronization object
// created by this call is destroyed again and status_ is left unchanged.
int WebPWorkerReset(WebPWorker* const worker) {
  int ok = 1;
  worker->had_error = 0;
  if (worker->status_ < OK) {
#ifdef WEBP_USE_THREAD
    if (pthread_mutex_init(&worker->mutex_, NULL)) {
      return 0;
    }
    if (pthread_cond_init(&worker->condition_, NULL)) {
      // don't leak the mutex that was just created
      pthread_mutex_destroy(&worker->mutex_);
      return 0;
    }
    // Hold the mutex while spawning, so the new thread cannot inspect
    // status_ before it has been set to OK.
    pthread_mutex_lock(&worker->mutex_);
    ok = !pthread_create(&worker->thread_, NULL, WebPWorkerThreadLoop, worker);
    if (ok) worker->status_ = OK;
    pthread_mutex_unlock(&worker->mutex_);
    if (!ok) {
      // status_ is still NOT_OK, so WebPWorkerEnd() will not release these:
      // they must be destroyed here.
      pthread_mutex_destroy(&worker->mutex_);
      pthread_cond_destroy(&worker->condition_);
    }
#else
    worker->status_ = OK;
#endif
  } else if (worker->status_ > OK) {
    ok = WebPWorkerSync(worker);
  }
  assert(!ok || (worker->status_ == OK));
  return ok;
}
// Starts the job described by hook/data1/data2.
// Threaded builds hand it over to the worker thread by requesting the WORK
// state; other builds run the hook right here, in the calling thread, and OR
// a failure (hook returning 0) into had_error.
void WebPWorkerLaunch(WebPWorker* const worker) {
#ifdef WEBP_USE_THREAD
  WebPWorkerChangeState(worker, WORK);
#else
  if (worker->hook) {
    const int hook_ok = worker->hook(worker->data1, worker->data2);
    worker->had_error |= !hook_ok;
  }
#endif
}
// Shuts the worker down. Does nothing if it never came up (status_ < OK).
// Threaded builds request the NOT_OK state, join the thread and destroy the
// mutex and condition; other builds only set status_ to NOT_OK.
void WebPWorkerEnd(WebPWorker* const worker) {
  if (worker->status_ >= OK) {
#ifdef WEBP_USE_THREAD
    // ask the thread loop to exit, then reclaim everything it was using
    WebPWorkerChangeState(worker, NOT_OK);
    pthread_join(worker->thread_, NULL);
    pthread_mutex_destroy(&worker->mutex_);
    pthread_cond_destroy(&worker->condition_);
#else
    worker->status_ = NOT_OK;
#endif
  }
  assert(worker->status_ == NOT_OK);
}
//------------------------------------------------------------------------------
#if defined(__cplusplus) || defined(c_plusplus)
} // extern "C"
#endif

86
src/utils/thread.h Normal file
View File

@@ -0,0 +1,86 @@
// Copyright 2011 Google Inc.
//
// This code is licensed under the same terms as WebM:
// Software License Agreement: http://www.webmproject.org/license/software/
// Additional IP Rights Grant: http://www.webmproject.org/license/additional/
// -----------------------------------------------------------------------------
//
// Multi-threaded worker
//
// Author: skal@google.com (Pascal Massimino)
#ifndef WEBP_UTILS_THREAD_H_
#define WEBP_UTILS_THREAD_H_
#if defined(__cplusplus) || defined(c_plusplus)
extern "C" {
#endif
// Threading support is selected with '#ifdef WEBP_USE_THREAD', the same test
// thread.c uses, so that header and implementation always agree on whether
// the synchronization members exist.
#ifdef WEBP_USE_THREAD

#if defined(_WIN32)

#include <windows.h>
// Minimal pthread-like types for the Win32 emulation layer in thread.c.
typedef HANDLE pthread_t;
typedef CRITICAL_SECTION pthread_mutex_t;
typedef struct {
  HANDLE waiting_sem_;    // released by a thread entering pthread_cond_wait
  HANDLE received_sem_;   // released by the waiter once it consumed the event
  HANDLE signal_event_;   // set by pthread_cond_signal to wake the waiter
} pthread_cond_t;

#else

#include <pthread.h>

#endif    /* _WIN32 */
#endif    /* WEBP_USE_THREAD */

// State of the worker thread object
typedef enum {
  NOT_OK = 0,   // object is unusable
  OK,           // ready to work
  WORK          // busy finishing the current task
} WebPWorkerStatus;

// Function to be called by the worker thread. Takes two opaque pointers as
// arguments (data1 and data2), and should return false in case of error.
typedef int (*WebPWorkerHook)(void*, void*);

// Synchronize object used to launch job in the worker thread
typedef struct {
#ifdef WEBP_USE_THREAD
  pthread_mutex_t mutex_;
  pthread_cond_t  condition_;
  pthread_t       thread_;
#endif
  WebPWorkerStatus status_;
  WebPWorkerHook hook;    // hook to call
  void* data1;            // first argument passed to 'hook'
  void* data2;            // second argument passed to 'hook'
  int had_error;          // set to non-zero as soon as one call to 'hook'
                          // returned false; cleared by WebPWorkerReset()
} WebPWorker;
// Must be called first, before any other method.
void WebPWorkerInit(WebPWorker* const worker);
// Must be called to initialize the object and spawn the thread. Re-entrant.
// Will potentially launch the thread. Returns false in case of error.
int WebPWorkerReset(WebPWorker* const worker);
// Make sure the previous work is finished. Returns true if worker->had_error
// was not set and no error condition was triggered by the working thread.
int WebPWorkerSync(WebPWorker* const worker);
// Trigger the thread to call hook() with data1 and data2 argument. These
// hook/data1/data2 can be changed at any time before calling this function,
// but not be changed afterward until the next call to WebPWorkerSync().
void WebPWorkerLaunch(WebPWorker* const worker);
// Kill the thread and terminate the object. To use the object again, one
// must call WebPWorkerReset() again.
void WebPWorkerEnd(WebPWorker* const worker);
//------------------------------------------------------------------------------
#if defined(__cplusplus) || defined(c_plusplus)
} // extern "C"
#endif
#endif /* WEBP_UTILS_THREAD_H_ */

View File

@@ -12,40 +12,46 @@
#ifndef WEBP_WEBP_DECODE_H_
#define WEBP_WEBP_DECODE_H_
#include "webp/types.h"
#include "./types.h"
#if defined(__cplusplus) || defined(c_plusplus)
extern "C" {
#endif
#define WEBP_DECODER_ABI_VERSION 0x0002
// Return the decoder's version number, packed in hexadecimal using 8bits for
// each of major/minor/revision. E.g: v2.5.7 is 0x020507.
int WebPGetDecoderVersion(void);
WEBP_EXTERN(int) WebPGetDecoderVersion(void);
// Retrieve basic header information: width, height.
// This function will also validate the header and return 0 in
// case of formatting error.
// Pointers *width/*height can be passed NULL if deemed irrelevant.
int WebPGetInfo(const uint8_t* data, uint32_t data_size,
int *width, int *height);
WEBP_EXTERN(int) WebPGetInfo(const uint8_t* data, uint32_t data_size,
int* width, int* height);
// Decodes WEBP images pointed to by *data and returns RGB samples, along
// with the dimensions in *width and *height.
// The returned pointer should be deleted calling free().
// Returns NULL in case of error.
uint8_t* WebPDecodeRGB(const uint8_t* data, uint32_t data_size,
int *width, int *height);
WEBP_EXTERN(uint8_t*) WebPDecodeRGB(const uint8_t* data, uint32_t data_size,
int* width, int* height);
// Same as WebPDecodeRGB, but returning RGBA data.
uint8_t* WebPDecodeRGBA(const uint8_t* data, uint32_t data_size,
int *width, int *height);
WEBP_EXTERN(uint8_t*) WebPDecodeRGBA(const uint8_t* data, uint32_t data_size,
int* width, int* height);
// Same as WebPDecodeRGBA, but returning ARGB data.
WEBP_EXTERN(uint8_t*) WebPDecodeARGB(const uint8_t* data, uint32_t data_size,
int* width, int* height);
// This variant decodes to BGR instead of RGB.
uint8_t* WebPDecodeBGR(const uint8_t* data, uint32_t data_size,
int *width, int *height);
WEBP_EXTERN(uint8_t*) WebPDecodeBGR(const uint8_t* data, uint32_t data_size,
int* width, int* height);
// This variant decodes to BGRA instead of RGBA.
uint8_t* WebPDecodeBGRA(const uint8_t* data, uint32_t data_size,
int *width, int *height);
WEBP_EXTERN(uint8_t*) WebPDecodeBGRA(const uint8_t* data, uint32_t data_size,
int* width, int* height);
// Decode WEBP images stored in *data in Y'UV format(*). The pointer returned is
// the Y samples buffer. Upon return, *u and *v will point to the U and V
@@ -56,11 +62,12 @@ uint8_t* WebPDecodeBGRA(const uint8_t* data, uint32_t data_size,
// have a common stride returned as '*uv_stride'.
// Return NULL in case of error.
// (*) Also named Y'CbCr. See: http://en.wikipedia.org/wiki/YCbCr
uint8_t* WebPDecodeYUV(const uint8_t* data, uint32_t data_size,
int *width, int *height, uint8_t** u, uint8_t** v,
int *stride, int* uv_stride);
WEBP_EXTERN(uint8_t*) WebPDecodeYUV(const uint8_t* data, uint32_t data_size,
int* width, int* height,
uint8_t** u, uint8_t** v,
int* stride, int* uv_stride);
// These three functions are variants of the above ones, that decode the image
// These five functions are variants of the above ones, that decode the image
// directly into a pre-allocated buffer 'output_buffer'. The maximum storage
// available in this buffer is indicated by 'output_buffer_size'. If this
// storage is not sufficient (or an error occurred), NULL is returned.
@@ -68,19 +75,22 @@ uint8_t* WebPDecodeYUV(const uint8_t* data, uint32_t data_size,
// The parameter 'output_stride' specifies the distance (in bytes)
// between scanlines. Hence, output_buffer_size is expected to be at least
// output_stride x picture-height.
uint8_t* WebPDecodeRGBInto(const uint8_t* data, uint32_t data_size,
uint8_t* output_buffer, int output_buffer_size,
int output_stride);
uint8_t* WebPDecodeRGBAInto(const uint8_t* data, uint32_t data_size,
uint8_t* output_buffer, int output_buffer_size,
int output_stride);
WEBP_EXTERN(uint8_t*) WebPDecodeRGBInto(
const uint8_t* data, uint32_t data_size,
uint8_t* output_buffer, int output_buffer_size, int output_stride);
WEBP_EXTERN(uint8_t*) WebPDecodeRGBAInto(
const uint8_t* data, uint32_t data_size,
uint8_t* output_buffer, int output_buffer_size, int output_stride);
WEBP_EXTERN(uint8_t*) WebPDecodeARGBInto(
const uint8_t* data, uint32_t data_size,
uint8_t* output_buffer, int output_buffer_size, int output_stride);
// BGR variants
uint8_t* WebPDecodeBGRInto(const uint8_t* data, uint32_t data_size,
uint8_t* output_buffer, int output_buffer_size,
int output_stride);
uint8_t* WebPDecodeBGRAInto(const uint8_t* data, uint32_t data_size,
uint8_t* output_buffer, int output_buffer_size,
int output_stride);
WEBP_EXTERN(uint8_t*) WebPDecodeBGRInto(
const uint8_t* data, uint32_t data_size,
uint8_t* output_buffer, int output_buffer_size, int output_stride);
WEBP_EXTERN(uint8_t*) WebPDecodeBGRAInto(
const uint8_t* data, uint32_t data_size,
uint8_t* output_buffer, int output_buffer_size, int output_stride);
// WebPDecodeYUVInto() is a variant of WebPDecodeYUV() that operates directly
// into pre-allocated luma/chroma plane buffers. This function requires the
@@ -89,19 +99,72 @@ uint8_t* WebPDecodeBGRAInto(const uint8_t* data, uint32_t data_size,
// 'u_size' and 'v_size' respectively.
// Pointer to the luma plane ('*luma') is returned or NULL if an error occurred
// during decoding (or because some buffers were found to be too small).
uint8_t* WebPDecodeYUVInto(const uint8_t* data, uint32_t data_size,
uint8_t* luma, int luma_size, int luma_stride,
uint8_t* u, int u_size, int u_stride,
uint8_t* v, int v_size, int v_stride);
WEBP_EXTERN(uint8_t*) WebPDecodeYUVInto(
const uint8_t* data, uint32_t data_size,
uint8_t* luma, int luma_size, int luma_stride,
uint8_t* u, int u_size, int u_stride,
uint8_t* v, int v_size, int v_stride);
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// Output colorspaces and buffer
// Output colorspaces
// Colorspaces
typedef enum { MODE_RGB = 0, MODE_RGBA = 1,
MODE_BGR = 2, MODE_BGRA = 3,
MODE_YUV = 4 } WEBP_CSP_MODE;
MODE_ARGB = 4, MODE_RGBA_4444 = 5,
MODE_RGB_565 = 6,
// YUV modes must come after RGB ones.
MODE_YUV = 7, MODE_YUVA = 8, // yuv 4:2:0
MODE_LAST = 9
} WEBP_CSP_MODE;
// Generic structure for describing the sample buffer.
typedef struct { // view as RGBA
uint8_t* rgba; // pointer to RGBA samples
int stride; // stride in bytes from one scanline to the next.
int size; // total size of the *rgba buffer.
} WebPRGBABuffer;
typedef struct { // view as YUVA
uint8_t* y, *u, *v, *a; // pointer to luma, chroma U/V, alpha samples
int y_stride; // luma stride
int u_stride, v_stride; // chroma strides
int a_stride; // alpha stride
int y_size; // luma plane size
int u_size, v_size; // chroma planes size
int a_size; // alpha-plane size
} WebPYUVABuffer;
// Output buffer. Must be set up with WebPInitDecBuffer() before any other use
// and released with WebPFreeDecBuffer().
typedef struct {
WEBP_CSP_MODE colorspace; // Colorspace.
int width, height; // Dimensions.
int is_external_memory; // If true, 'private_memory' pointer is not used.
union {
WebPRGBABuffer RGBA;
WebPYUVABuffer YUVA;
} u; // Union of buffer parameters: RGBA view or YUVA view.
uint8_t* private_memory; // Internally allocated memory (only when
// is_external_memory is false). Should not be used
// externally, but accessed via the buffer union.
} WebPDecBuffer;
// Internal, version-checked, entry point
WEBP_EXTERN(int) WebPInitDecBufferInternal(WebPDecBuffer* const, int);
// Initialize the structure as empty. Must be called before any other use.
// Returns false in case of version mismatch
static inline int WebPInitDecBuffer(WebPDecBuffer* const buffer) {
  // Pass along the ABI version this header was compiled with, so that the
  // library can detect a header/library mismatch.
  const int abi_version = WEBP_DECODER_ABI_VERSION;
  return WebPInitDecBufferInternal(buffer, abi_version);
}
// Free any memory associated with the buffer. Must always be called last.
// Note: doesn't free the 'buffer' structure itself.
WEBP_EXTERN(void) WebPFreeDecBuffer(WebPDecBuffer* const buffer);
//------------------------------------------------------------------------------
// Enumeration of the status codes
typedef enum {
VP8_STATUS_OK = 0,
VP8_STATUS_OUT_OF_MEMORY,
@@ -113,11 +176,11 @@ typedef enum {
VP8_STATUS_NOT_ENOUGH_DATA
} VP8StatusCode;
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// Incremental decoding
//
// This API allows streamlined decoding of partial data.
// Picture can be incrementally decoded as data become available thanks to the
// This API allows streamlined decoding of partial data.
// Picture can be incrementally decoded as data become available thanks to the
// WebPIDecoder object. This object can be left in a SUSPENDED state if the
// picture is only partially decoded, pending additional input.
// Code example:
@@ -138,16 +201,26 @@ typedef enum {
typedef struct WebPIDecoder WebPIDecoder;
// Creates a new incremental decoder with the supplied buffer parameter.
// This output_buffer can be passed NULL, in which case a default output buffer
// is used (with MODE_RGB). Otherwise, an internal reference to 'output_buffer'
// is kept, which means that the lifespan of 'output_buffer' must be larger than
// that of the returned WebPIDecoder object.
// Returns NULL if the allocation failed.
WEBP_EXTERN(WebPIDecoder*) WebPINewDecoder(WebPDecBuffer* const output_buffer);
// Creates a WebPIDecoder object. Returns NULL in case of failure.
WebPIDecoder* WebPINew(WEBP_CSP_MODE mode);
// TODO(skal): DEPRECATED. Prefer using WebPINewDecoder().
WEBP_EXTERN(WebPIDecoder*) WebPINew(WEBP_CSP_MODE mode);
// This function allocates and initializes an incremental-decoder object, which
// will output the r/g/b(/a) samples specified by 'mode' into a preallocated
// buffer 'output_buffer'. The size of this buffer is at least
// 'output_buffer_size' and the stride (distance in bytes between two scanlines)
// is specified by 'output_stride'. Returns NULL if the allocation failed.
WebPIDecoder* WebPINewRGB(WEBP_CSP_MODE mode, uint8_t* output_buffer,
int output_buffer_size, int output_stride);
WEBP_EXTERN(WebPIDecoder*) WebPINewRGB(
WEBP_CSP_MODE mode,
uint8_t* output_buffer, int output_buffer_size, int output_stride);
// This function allocates and initializes an incremental-decoder object, which
// will output the raw luma/chroma samples into a preallocated planes. The luma
@@ -156,41 +229,165 @@ WebPIDecoder* WebPINewRGB(WEBP_CSP_MODE mode, uint8_t* output_buffer,
// 'u_size' and 'u_stride' parameters, and the chroma-v plane by 'v', 'v_size'
// and 'v_stride'.
// Returns NULL if the allocation failed.
WebPIDecoder* WebPINewYUV(uint8_t* luma, int luma_size, int luma_stride,
uint8_t* u, int u_size, int u_stride,
uint8_t* v, int v_size, int v_stride);
WEBP_EXTERN(WebPIDecoder*) WebPINewYUV(
uint8_t* luma, int luma_size, int luma_stride,
uint8_t* u, int u_size, int u_stride,
uint8_t* v, int v_size, int v_stride);
// Deletes the WebpBuffer object and associated memory. Must always be called
// Deletes the WebPIDecoder object and associated memory. Must always be called
// if WebPINew, WebPINewRGB or WebPINewYUV succeeded.
void WebPIDelete(WebPIDecoder* const idec);
WEBP_EXTERN(void) WebPIDelete(WebPIDecoder* const idec);
// Copies and decodes the next available data. Returns VP8_STATUS_OK when
// the image is successfully decoded. Returns VP8_STATUS_SUSPENDED when more
// data is expected. Returns error in other cases.
VP8StatusCode WebPIAppend(WebPIDecoder* const idec, const uint8_t* data,
uint32_t data_size);
WEBP_EXTERN(VP8StatusCode) WebPIAppend(
WebPIDecoder* const idec, const uint8_t* data, uint32_t data_size);
// A variant of the above function to be used when data buffer contains
// partial data from the beginning. In this case data buffer is not copied
// to the internal memory.
// Note that the value of the 'data' pointer can change between calls to
// WebPIUpdate, for instance when the data buffer is resized to fit larger data.
VP8StatusCode WebPIUpdate(WebPIDecoder* const idec, const uint8_t* data,
uint32_t data_size);
WEBP_EXTERN(VP8StatusCode) WebPIUpdate(
WebPIDecoder* const idec, const uint8_t* data, uint32_t data_size);
// Returns the RGB image decoded so far. Returns NULL if output params are not
// initialized yet. *last_y is the index of last decoded row in raster scan
// order. Some pointers (*last_y, *width etc.) can be NULL if corresponding
// information is not needed.
uint8_t* WebPIDecGetRGB(const WebPIDecoder* const idec, int *last_y,
int* width, int* height, int* stride);
// Returns the r/g/b/(a) image decoded so far. Returns NULL if output params
// are not initialized yet. The r/g/b/(a) output type corresponds to the mode
// specified in WebPINew()/WebPINewRGB(). *last_y is the index of last decoded
// row in raster scan order. Some pointers (*last_y, *width etc.) can be NULL if
// corresponding information is not needed.
WEBP_EXTERN(uint8_t*) WebPIDecGetRGB(
const WebPIDecoder* const idec, int* last_y,
int* width, int* height, int* stride);
// Same as above function to get YUV image. Returns pointer to the luma plane
// or NULL in case of error.
uint8_t* WebPIDecGetYUV(const WebPIDecoder* const idec, int* last_y,
uint8_t** u, uint8_t** v,
int* width, int* height, int* stride, int* uv_stride);
WEBP_EXTERN(uint8_t*) WebPIDecGetYUV(
const WebPIDecoder* const idec, int* last_y,
uint8_t** u, uint8_t** v,
int* width, int* height, int* stride, int* uv_stride);
// Generic call to retrieve information about the displayable area.
// If non NULL, the left/top/width/height pointers are filled with the visible
// rectangular area so far.
// Returns NULL in case the incremental decoder object is in an invalid state.
// Otherwise returns the pointer to the internal representation. This structure
// is read-only, tied to WebPIDecoder's lifespan and should not be modified.
WEBP_EXTERN(const WebPDecBuffer*) WebPIDecodedArea(
const WebPIDecoder* const idec,
int* const left, int* const top,
int* const width, int* const height);
//------------------------------------------------------------------------------
// Advanced decoding parametrization
//
// Code sample for using the advanced decoding API
/*
// A) Init a configuration object
WebPDecoderConfig config;
CHECK(WebPInitDecoderConfig(&config));
// B) optional: retrieve the bitstream's features.
CHECK(WebPGetFeatures(data, data_size, &config.input) == VP8_STATUS_OK);
// C) Adjust 'config', if needed
config.options.no_fancy_upsampling = 1;
config.output.colorspace = MODE_BGRA;
// etc.
// Note that you can also make config.output point to an externally
// supplied memory buffer, provided it's big enough to store the decoded
// picture. Otherwise, config.output will just be used to allocate memory
// and store the decoded picture.
// D) Decode!
CHECK(WebPDecode(data, data_size, &config) == VP8_STATUS_OK);
// E) Decoded image is now in config.output (and config.output.u.RGBA)
// F) Reclaim memory allocated in config's object. It's safe to call
// this function even if the memory is external and wasn't allocated
// by WebPDecode().
WebPFreeDecBuffer(&config.output);
*/
// Features gathered from the bitstream
// Filled in by WebPGetFeatures(); also embedded in WebPDecoderConfig as the
// 'input' member.
typedef struct {
int width; // the original width, as read from the bitstream
int height; // the original height, as read from the bitstream
int has_alpha; // true if bitstream contains an alpha channel
int no_incremental_decoding; // if true, using incremental decoding is not
// recommended.
int rotate; // TODO(later)
int uv_sampling; // should be 0 for now. TODO(later)
int bitstream_version; // should be 0 for now. TODO(later)
} WebPBitstreamFeatures;
// Internal, version-checked, entry point
WEBP_EXTERN(VP8StatusCode) WebPGetFeaturesInternal(
const uint8_t*, uint32_t, WebPBitstreamFeatures* const, int);
// Retrieve features from the bitstream. The *features structure is filled
// with information gathered from the bitstream.
// Returns VP8_STATUS_OK on success, or another VP8StatusCode value in case of
// error or version mismatch.
static inline
VP8StatusCode WebPGetFeatures(const uint8_t* data, uint32_t data_size,
                              WebPBitstreamFeatures* const features) {
  // Forward to the version-checked entry point, stamping the call with the
  // ABI version this header was compiled against.
  const VP8StatusCode status = WebPGetFeaturesInternal(
      data, data_size, features, WEBP_DECODER_ABI_VERSION);
  return status;
}
// Decoding options
// Embedded as the 'options' member of WebPDecoderConfig.
// The geometric transforms are applied in a fixed order: cropping first,
// then scaling, then the forced rotation last.
typedef struct {
int bypass_filtering; // if true, skip the in-loop filtering
int no_fancy_upsampling; // if true, use faster pointwise upsampler
int use_cropping; // if true, cropping is applied _first_
int crop_left, crop_top; // top-left position for cropping.
// Will be snapped to even values.
int crop_width, crop_height; // dimension of the cropping area
int use_scaling; // if true, scaling is applied _afterward_
int scaled_width, scaled_height; // final resolution
int force_rotation; // forced rotation (to be applied _last_)
int no_enhancement; // if true, discard enhancement layer
int use_threads; // if true, use multi-threaded decoding
} WebPDecoderOptions;
// Main object storing the configuration for advanced decoding.
// Groups the three pieces of state used by the advanced API: the bitstream
// features, the output buffer and the decoding options. A pointer to this
// object is what WebPInitDecoderConfig(), WebPDecode() and WebPIDecode() take.
typedef struct {
WebPBitstreamFeatures input; // Immutable bitstream features (optional)
WebPDecBuffer output; // Output buffer (can point to external mem)
WebPDecoderOptions options; // Decoding options
} WebPDecoderConfig;
// Internal, version-checked, entry point
WEBP_EXTERN(int) WebPInitDecoderConfigInternal(WebPDecoderConfig* const, int);
// Initialize the configuration as empty. This function must always be
// called first, unless WebPGetFeatures() is to be called.
// Returns false in case of mismatched version.
static inline int WebPInitDecoderConfig(WebPDecoderConfig* const config) {
  // The ABI version compiled into the caller is handed to the library so the
  // version-checked entry point can compare it.
  const int abi_version = WEBP_DECODER_ABI_VERSION;
  return WebPInitDecoderConfigInternal(config, abi_version);
}
// Instantiate a new incremental decoder object with requested configuration.
// The bitstream can be passed using *data and data_size parameter,
// in which case the features will be parsed and stored into config->input.
// Otherwise, 'data' can be NULL and no parsing will occur.
// Note that 'config' can be NULL too, in which case a default configuration is
// used.
// The returned WebPIDecoder object must always be deleted with WebPIDelete().
// Returns NULL in case of error (and config->status will then reflect
// the error condition).
WEBP_EXTERN(WebPIDecoder*) WebPIDecode(const uint8_t* data, uint32_t data_size,
WebPDecoderConfig* const config);
// Non-incremental version. This version decodes the full data at once, taking
// 'config' into account. Return decoding status (VP8_STATUS_OK if decoding
// was successful).
WEBP_EXTERN(VP8StatusCode) WebPDecode(const uint8_t* data, uint32_t data_size,
WebPDecoderConfig* const config);
#if defined(__cplusplus) || defined(c_plusplus)
} // extern "C"

View File

@@ -12,18 +12,16 @@
#ifndef WEBP_WEBP_DECODE_VP8_H_
#define WEBP_WEBP_DECODE_VP8_H_
#include "webp/decode.h"
#include "./decode.h"
#if defined(__cplusplus) || defined(c_plusplus)
extern "C" {
#endif
#define WEBP_DECODER_ABI_VERSION 0x0001
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// Lower-level API
//
// Thes functions provide fine-grained control of the decoding process.
// These functions provide fine-grained control of the decoding process.
// The call flow should resemble:
//
// VP8Io io;
@@ -40,14 +38,22 @@ extern "C" {
// Input / Output
typedef struct VP8Io VP8Io;
typedef int (*VP8IoPutHook)(const VP8Io* io);
typedef int (*VP8IoSetupHook)(VP8Io* io);
typedef void (*VP8IoTeardownHook)(const VP8Io* io);
struct VP8Io {
// set by VP8GetHeaders()
int width, height; // picture dimensions, in pixels
int width, height; // picture dimensions, in pixels (invariable).
// These are the original, uncropped dimensions.
// The actual area passed to put() is stored
// in mb_w / mb_h fields.
// set before calling put()
int mb_y; // position of the current rows (in pixels)
int mb_w; // number of columns in the sample
int mb_h; // number of rows in the sample
const uint8_t *y, *u, *v; // rows to copy (in yuv420 format)
const uint8_t* y, *u, *v; // rows to copy (in yuv420 format)
int y_stride; // row stride for luma
int uv_stride; // row stride for chroma
@@ -56,20 +62,24 @@ struct VP8Io {
// called when fresh samples are available. Currently, samples are in
// YUV420 format, and can be up to width x 24 in size (depending on the
// in-loop filtering level, e.g.). Should return false in case of error
// or abort request.
int (*put)(const VP8Io* io);
// or abort request. The actual size of the area to update is mb_w x mb_h
// in size, taking cropping into account.
VP8IoPutHook put;
// called just before starting to decode the blocks.
// Should returns 0 in case of error.
int (*setup)(VP8Io* io);
// Must return false in case of setup error, true otherwise. If false is
// returned, teardown() will NOT be called. But if the setup succeeded
// and true is returned, then teardown() will always be called afterward.
VP8IoSetupHook setup;
// called just after block decoding is finished (or when an error occurred).
void (*teardown)(const VP8Io* io);
// Called just after block decoding is finished (or when an error occurred
// during put()). Is NOT called if setup() failed.
VP8IoTeardownHook teardown;
// this is a recommendation for the user-side yuv->rgb converter. This flag
// is set when calling setup() hook and can be overwritten by it. It then
// can be taken into consideration during the put() method.
int fancy_upscaling;
int fancy_upsampling;
// Input buffer.
uint32_t data_size;
@@ -80,16 +90,36 @@ struct VP8Io {
// of more visible blocking. Note that output will also be non-compliant
// with the VP8 specifications.
int bypass_filtering;
// Cropping parameters.
int use_cropping;
int crop_left, crop_right, crop_top, crop_bottom;
// Scaling parameters.
int use_scaling;
int scaled_width, scaled_height;
// pointer to the alpha data (if present) corresponding to the rows
const uint8_t* a;
};
// Internal, version-checked, entry point
int VP8InitIoInternal(VP8Io* const, int);
WEBP_EXTERN(int) VP8InitIoInternal(VP8Io* const, int);
// Set the custom IO function pointers and user-data. The setter for IO hooks
// should be called before initiating incremental decoding. Returns true if
// WebPIDecoder object is successfully modified, false otherwise.
WEBP_EXTERN(int) WebPISetIOHooks(WebPIDecoder* const idec,
VP8IoPutHook put,
VP8IoSetupHook setup,
VP8IoTeardownHook teardown,
void* user_data);
// Main decoding object. This is an opaque structure.
typedef struct VP8Decoder VP8Decoder;
// Create a new decoder object.
VP8Decoder* VP8New(void);
WEBP_EXTERN(VP8Decoder*) VP8New(void);
// Must be called to make sure 'io' is initialized properly.
// Returns false in case of version mismatch. Upon such failure, no other
@@ -99,26 +129,26 @@ static inline int VP8InitIo(VP8Io* const io) {
}
// Start decoding a new picture. Returns true if ok.
int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io);
WEBP_EXTERN(int) VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io);
// Decode a picture. Will call VP8GetHeaders() if it wasn't done already.
// Returns false in case of error.
int VP8Decode(VP8Decoder* const dec, VP8Io* const io);
WEBP_EXTERN(int) VP8Decode(VP8Decoder* const dec, VP8Io* const io);
// Return current status of the decoder:
VP8StatusCode VP8Status(VP8Decoder* const dec);
WEBP_EXTERN(VP8StatusCode) VP8Status(VP8Decoder* const dec);
// return readable string corresponding to the last status.
const char* VP8StatusMessage(VP8Decoder* const dec);
WEBP_EXTERN(const char*) VP8StatusMessage(VP8Decoder* const dec);
// Resets the decoder in its initial state, reclaiming memory.
// Not a mandatory call between calls to VP8Decode().
void VP8Clear(VP8Decoder* const dec);
WEBP_EXTERN(void) VP8Clear(VP8Decoder* const dec);
// Destroy the decoder object.
void VP8Delete(VP8Decoder* const dec);
WEBP_EXTERN(void) VP8Delete(VP8Decoder* const dec);
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
#if defined(__cplusplus) || defined(c_plusplus)
} // extern "C"

View File

@@ -14,35 +14,38 @@
#include <stdlib.h>
#include "webp/types.h"
#include "./types.h"
#if defined(__cplusplus) || defined(c_plusplus)
extern "C" {
#endif
#define WEBP_ENCODER_ABI_VERSION 0x0001
#define WEBP_ENCODER_ABI_VERSION 0x0002
// Return the encoder's version number, packed in hexadecimal using 8bits for
// each of major/minor/revision. E.g: v2.5.7 is 0x020507.
int WebPGetEncoderVersion(void);
WEBP_EXTERN(int) WebPGetEncoderVersion(void);
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// One-stop-shop call! No questions asked:
// Returns the size of the compressed data (pointed to by *output), or 0 if
// an error occurred. The compressed data must be released by the caller
// using the call 'free(*output)'.
// Currently, alpha values are discarded.
size_t WebPEncodeRGB(const uint8_t* rgb, int width, int height, int stride,
float quality_factor, uint8_t** output);
size_t WebPEncodeBGR(const uint8_t* bgr, int width, int height, int stride,
float quality_factor, uint8_t** output);
size_t WebPEncodeRGBA(const uint8_t* rgba, int width, int height, int stride,
float quality_factor, uint8_t** output);
size_t WebPEncodeBGRA(const uint8_t* bgra, int width, int height, int stride,
float quality_factor, uint8_t** output);
WEBP_EXTERN(size_t) WebPEncodeRGB(const uint8_t* rgb,
int width, int height, int stride,
float quality_factor, uint8_t** output);
WEBP_EXTERN(size_t) WebPEncodeBGR(const uint8_t* bgr,
int width, int height, int stride,
float quality_factor, uint8_t** output);
WEBP_EXTERN(size_t) WebPEncodeRGBA(const uint8_t* rgba,
int width, int height, int stride,
float quality_factor, uint8_t** output);
WEBP_EXTERN(size_t) WebPEncodeBGRA(const uint8_t* bgra,
int width, int height, int stride,
float quality_factor, uint8_t** output);
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// Coding parameters
typedef struct {
@@ -66,6 +69,9 @@ typedef struct {
int preprocessing; // preprocessing filter (0=none, 1=segment-smooth)
int partitions; // log2(number of token partitions) in [0..3]
// Default is set to 0 for easier progressive decoding.
int partition_limit; // quality degradation allowed to fit the 512k limit on
// prediction modes coding (0=no degradation, 100=full)
int alpha_compression; // Algorithm for optimizing the alpha plane (0 = none)
} WebPConfig;
// Enumerate some predefined settings for WebPConfig, depending on the type
@@ -80,7 +86,8 @@ typedef enum {
} WebPPreset;
// Internal, version-checked, entry point
int WebPConfigInitInternal(WebPConfig* const, WebPPreset, float, int);
WEBP_EXTERN(int) WebPConfigInitInternal(
WebPConfig* const, WebPPreset, float, int);
// Should always be called, to initialize a fresh WebPConfig structure before
// modification. Returns 0 in case of version mismatch. WebPConfigInit() must
@@ -101,25 +108,28 @@ static inline int WebPConfigPreset(WebPConfig* const config,
}
// Returns 1 if all parameters are in valid range and the configuration is OK.
int WebPValidateConfig(const WebPConfig* const config);
WEBP_EXTERN(int) WebPValidateConfig(const WebPConfig* const config);
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// Input / Output
typedef struct WebPPicture WebPPicture; // main structure for I/O
// non-essential structure for storing auxilliary statistics
// non-essential structure for storing auxiliary statistics
typedef struct {
float PSNR[4]; // peak-signal-to-noise ratio for Y/U/V/All
int coded_size; // final size
int block_count[3]; // number of intra4/intra16/skipped macroblocks
int header_bytes[2]; // approximative number of bytes spent for header
int header_bytes[2]; // approximate number of bytes spent for header
// and mode-partition #0
int residual_bytes[3][4]; // approximative number of bytes spent for
int residual_bytes[3][4]; // approximate number of bytes spent for
// DC/AC/uv coefficients for each (0..3) segments.
int segment_size[4]; // number of macroblocks in each segments
int segment_quant[4]; // quantizer values for each segments
int segment_level[4]; // filtering strength for each segments [0..63]
int alpha_data_size; // size of the transparency data
int layer_data_size; // size of the enhancement layer data
} WebPAuxStats;
// Signature for output function. Should return 1 if writing was successful.
@@ -128,13 +138,46 @@ typedef struct {
typedef int (*WebPWriterFunction)(const uint8_t* data, size_t data_size,
const WebPPicture* const picture);
// Colorspace of a picture's samples. The two low bits (WEBP_CSP_UV_MASK)
// select the chroma sampling; WEBP_CSP_ALPHA_BIT is set on top of them when
// alpha is present.
typedef enum {
  // chroma sampling
  WEBP_YUV420 = 0,         // 4:2:0
  WEBP_YUV422 = 1,         // 4:2:2
  WEBP_YUV444 = 2,         // 4:4:4
  WEBP_YUV400 = 3,         // grayscale
  // bit fields
  WEBP_CSP_UV_MASK = 3,    // bit-mask to get the UV sampling factors
  WEBP_CSP_ALPHA_BIT = 4,  // bit that is set if alpha is present
  // alpha channel variants: same sampling, with the alpha bit set
  WEBP_YUV420A = WEBP_YUV420 | WEBP_CSP_ALPHA_BIT,
  WEBP_YUV422A = WEBP_YUV422 | WEBP_CSP_ALPHA_BIT,
  WEBP_YUV444A = WEBP_YUV444 | WEBP_CSP_ALPHA_BIT,
  WEBP_YUV400A = WEBP_YUV400 | WEBP_CSP_ALPHA_BIT   // grayscale + alpha
} WebPEncCSP;
// Encoding error conditions.
// Error codes reported by the encoder (stored in WebPPicture::error_code).
// VP8_ENC_OK is 0; every other value denotes a failure.
// Note: no trailing comma after the last enumerator, so the header stays
// valid for C89 and C++98/03 compilers as well.
typedef enum {
  VP8_ENC_OK = 0,
  VP8_ENC_ERROR_OUT_OF_MEMORY,            // memory error allocating objects
  VP8_ENC_ERROR_BITSTREAM_OUT_OF_MEMORY,  // memory error while flushing bits
  VP8_ENC_ERROR_NULL_PARAMETER,           // a pointer parameter is NULL
  VP8_ENC_ERROR_INVALID_CONFIGURATION,    // configuration is invalid
  VP8_ENC_ERROR_BAD_DIMENSION,            // picture has invalid width/height
  VP8_ENC_ERROR_PARTITION0_OVERFLOW,      // partition is bigger than 512k
  VP8_ENC_ERROR_PARTITION_OVERFLOW,       // partition is bigger than 16M
  VP8_ENC_ERROR_BAD_WRITE,                // error while flushing bytes
  VP8_ENC_ERROR_FILE_TOO_BIG              // file is bigger than 4G
} WebPEncodingError;
// maximum width/height allowed (inclusive), in pixels
#define WEBP_MAX_DIMENSION 16383
struct WebPPicture {
// input
int colorspace; // colorspace: should be 0 for now (=Y'CbCr).
int width, height; // dimensions.
WebPEncCSP colorspace; // colorspace: should be YUV420 for now (=Y'CbCr).
int width, height; // dimensions (less or equal to WEBP_MAX_DIMENSION)
uint8_t *y, *u, *v; // pointers to luma/chroma planes.
int y_stride, uv_stride; // luma/chroma strides.
uint8_t *a; // pointer to the alpha plane (unused for now).
uint8_t *a; // pointer to the alpha plane
int a_stride; // stride of the alpha plane
// output
WebPWriterFunction writer; // can be NULL
@@ -152,10 +195,16 @@ struct WebPPicture {
// where to store statistics, if not NULL:
WebPAuxStats* stats;
// original samples (for non-YUV420 modes)
uint8_t *u0, *v0;
int uv0_stride;
WebPEncodingError error_code; // error code in case of problem.
};
// Internal, version-checked, entry point
int WebPPictureInitInternal(WebPPicture* const, int);
WEBP_EXTERN(int) WebPPictureInitInternal(WebPPicture* const, int);
// Should always be called, to initialize the structure. Returns 0 in case of
// version mismatch. WebPPictureInit() must have succeeded before using the
@@ -164,54 +213,64 @@ static inline int WebPPictureInit(WebPPicture* const picture) {
return WebPPictureInitInternal(picture, WEBP_ENCODER_ABI_VERSION);
}
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// WebPPicture utils
// Convenience allocation / deallocation based on picture->width/height:
// Allocate y/u/v buffers as per width/height specification.
// Allocate y/u/v buffers as per colorspace/width/height specification.
// Note! This function will free the previous buffer if needed.
// Returns 0 in case of memory error.
int WebPPictureAlloc(WebPPicture* const picture);
WEBP_EXTERN(int) WebPPictureAlloc(WebPPicture* const picture);
// Release memory allocated by WebPPictureAlloc() or WebPPictureImport*()
// Note that this function does _not_ free the memory pointed to by 'picture'.
void WebPPictureFree(WebPPicture* const picture);
WEBP_EXTERN(void) WebPPictureFree(WebPPicture* const picture);
// Copy the pixels of *src into *dst, using WebPPictureAlloc.
// Returns 0 in case of memory allocation error.
int WebPPictureCopy(const WebPPicture* const src, WebPPicture* const dst);
WEBP_EXTERN(int) WebPPictureCopy(const WebPPicture* const src,
WebPPicture* const dst);
// self-crops a picture to the rectangle defined by top/left/width/height.
// Returns 0 in case of memory allocation error, or if the rectangle is
// outside of the source picture.
int WebPPictureCrop(WebPPicture* const picture,
int left, int top, int width, int height);
WEBP_EXTERN(int) WebPPictureCrop(WebPPicture* const picture,
int left, int top, int width, int height);
// Colorspace conversion function. Previous buffer will be free'd, if any.
// Rescale a picture to new dimension width x height.
// No gamma correction is applied.
// Returns false in case of error (invalid parameter or insufficient memory).
WEBP_EXTERN(int) WebPPictureRescale(WebPPicture* const pic,
int width, int height);
// Colorspace conversion function to import RGB samples.
// Previous buffer will be free'd, if any.
// *rgb buffer should have a size of at least height * rgb_stride.
// Returns 0 in case of memory error.
int WebPPictureImportRGB(WebPPicture* const picture,
const uint8_t* const rgb, int rgb_stride);
// Same, but for RGBA buffer. Alpha information is ignored.
int WebPPictureImportRGBA(WebPPicture* const picture,
const uint8_t* const rgba, int rgba_stride);
WEBP_EXTERN(int) WebPPictureImportRGB(
WebPPicture* const picture, const uint8_t* const rgb, int rgb_stride);
// Same, but for RGBA buffer
WEBP_EXTERN(int) WebPPictureImportRGBA(
WebPPicture* const picture, const uint8_t* const rgba, int rgba_stride);
// Variant of the above, but taking BGR input:
int WebPPictureImportBGR(WebPPicture* const picture,
const uint8_t* const bgr, int bgr_stride);
int WebPPictureImportBGRA(WebPPicture* const picture,
const uint8_t* const bgra, int bgra_stride);
// Variant of the above, but taking BGR(A) input:
WEBP_EXTERN(int) WebPPictureImportBGR(
WebPPicture* const picture, const uint8_t* const bgr, int bgr_stride);
WEBP_EXTERN(int) WebPPictureImportBGRA(
WebPPicture* const picture, const uint8_t* const bgra, int bgra_stride);
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
// Main call
// Main encoding call, after config and picture have been initialiazed.
// 'picture' must be less than 16384x16384 in dimension, and the 'config' object
// must be a valid one.
// Main encoding call, after config and picture have been initialized.
// 'picture' must be less than 16384x16384 in dimension (cf WEBP_MAX_DIMENSION),
// and the 'config' object must be a valid one.
// Returns false in case of error, true otherwise.
int WebPEncode(const WebPConfig* const config, WebPPicture* const picture);
// In case of error, picture->error_code is updated accordingly.
WEBP_EXTERN(int) WebPEncode(
const WebPConfig* const config, WebPPicture* const picture);
//-----------------------------------------------------------------------------
//------------------------------------------------------------------------------
#if defined(__cplusplus) || defined(c_plusplus)
} // extern "C"

View File

@@ -29,4 +29,10 @@ typedef long long int int64_t;
#define inline __forceinline
#endif /* _MSC_VER */
#ifndef WEBP_EXTERN
// This explicitly marks library functions and allows for changing the
// signature for e.g., Windows DLL builds.
#define WEBP_EXTERN(type) extern type
#endif /* WEBP_EXTERN */
#endif /* WEBP_WEBP_TYPES_H_ */

39
swig/README Normal file
View File

@@ -0,0 +1,39 @@
Building:
=========
JNI SWIG bindings:
------------------
$ gcc -shared -fPIC -fno-strict-aliasing -O2 \
-I/path/to/your/jdk/includes \
libwebp_java_wrap.c \
-lwebp \
-o libwebp_jni.so
-------------------------------------- BEGIN PSEUDO EXAMPLE
import com.google.webp.libwebp;
import java.lang.reflect.Method;
public class libwebp_jni_example {
static {
System.loadLibrary("webp_jni");
}
/**
* usage: java -cp libwebp.jar:. libwebp_jni_example
*/
public static void main(String argv[]) {
final int version = libwebp.WebPGetDecoderVersion();
System.out.println("libwebp version: " + Integer.toHexString(version));
System.out.println("libwebp methods:");
final Method[] libwebpMethods = libwebp.class.getDeclaredMethods();
for (int i = 0; i < libwebpMethods.length; i++) {
System.out.println(libwebpMethods[i]);
}
}
}
-------------------------------------- END PSEUDO EXAMPLE
$ javac -cp libwebp.jar libwebp_jni_example.java
$ java -Djava.library.path=. -cp libwebp.jar:. libwebp_jni_example

232
swig/libwebp.i Normal file
View File

@@ -0,0 +1,232 @@
// Copyright 2011 Google Inc.
//
// This code is licensed under the same terms as WebM:
// Software License Agreement: http://www.webmproject.org/license/software/
// Additional IP Rights Grant: http://www.webmproject.org/license/additional/
// -----------------------------------------------------------------------------
//
// libwebp swig interface definition
//
// Author: James Zern (jzern@google.com)
//
// For java bindings compile with:
// $ mkdir -p java/com/google/webp
// $ swig -ignoremissing -I../src \
// -java \
// -package com.google.webp \
// -outdir java/com/google/webp \
// -o libwebp_java_wrap.c libwebp.i
%module libwebp
%include "constraints.i"
%include "typemaps.i"
#ifdef SWIGJAVA
%include "arrays_java.i";
%include "enums.swg" /*NB: requires JDK-1.5+
See: http://www.swig.org/Doc1.3/Java.html#enumerations */
// map uint8_t* such that a byte[] is used
// this will generate a few spurious warnings in the wrapper code
%apply signed char[] { uint8_t * }
#endif /* SWIGJAVA */
//------------------------------------------------------------------------------
// Decoder specific
%apply int *OUTPUT { int *width, int *height }
%apply int { uint32_t data_size }
%apply Number NONNEGATIVE { uint32_t data_size }
// free the buffer returned by these functions after copying into
// the native type
%newobject WebPDecodeRGB;
%newobject WebPDecodeRGBA;
%newobject WebPDecodeARGB;
%newobject WebPDecodeBGR;
%newobject WebPDecodeBGRA;
%typemap(newfree) uint8_t* "free($1);"
int WebPGetDecoderVersion(void);
int WebPGetInfo(const uint8_t* data, uint32_t data_size,
int *width, int *height);
uint8_t* WebPDecodeRGB(const uint8_t* data, uint32_t data_size,
int *width, int *height);
uint8_t* WebPDecodeRGBA(const uint8_t* data, uint32_t data_size,
int *width, int *height);
uint8_t* WebPDecodeARGB(const uint8_t* data, uint32_t data_size,
int* width, int* height);
uint8_t* WebPDecodeBGR(const uint8_t* data, uint32_t data_size,
int *width, int *height);
uint8_t* WebPDecodeBGRA(const uint8_t* data, uint32_t data_size,
int *width, int *height);
//------------------------------------------------------------------------------
// Encoder specific
int WebPGetEncoderVersion(void);
//------------------------------------------------------------------------------
// Wrapper code additions
%{
#include "webp/decode.h"
#include "webp/encode.h"
#define FillMeInAsSizeCannotBeDeterminedAutomatically \
(result ? returned_buffer_size(__FUNCTION__, arg3, arg4) : 0)
// Computes the number of bytes in the buffer returned by the JNI wrapper named
// 'function': (*width) * (*height) * a per-function multiplier taken from the
// table below. Returns -1 if 'function' is not listed in the table.
static jint returned_buffer_size(
    const char *function, int *width, int *height) {
  static const struct sizemap {
    const char *function;
    int size_multiplier;
  } size_map[] = {
    { "Java_com_google_webp_libwebpJNI_WebPDecodeRGB", 3 },
    { "Java_com_google_webp_libwebpJNI_WebPDecodeRGBA", 4 },
    { "Java_com_google_webp_libwebpJNI_WebPDecodeARGB", 4 },
    { "Java_com_google_webp_libwebpJNI_WebPDecodeBGR", 3 },
    { "Java_com_google_webp_libwebpJNI_WebPDecodeBGRA", 4 },
    { "Java_com_google_webp_libwebpJNI_wrap_1WebPEncodeRGB", 1 },
    { "Java_com_google_webp_libwebpJNI_wrap_1WebPEncodeBGR", 1 },
    { "Java_com_google_webp_libwebpJNI_wrap_1WebPEncodeRGBA", 1 },
    { "Java_com_google_webp_libwebpJNI_wrap_1WebPEncodeBGRA", 1 },
    { NULL, 0 }
  };
  const struct sizemap *entry = size_map;

  // The table is terminated by an entry whose name is NULL.
  while (entry->function != NULL) {
    if (strcmp(function, entry->function) == 0) {
      return *width * *height * entry->size_multiplier;
    }
    ++entry;
  }
  return -1;
}
typedef size_t (*WebPEncodeFunction)(const uint8_t* rgb,
int width, int height, int stride,
float quality_factor, uint8_t** output);
// Shared body of the wrap_WebPEncode*() functions: runs 'encfn' on the input
// samples and returns the compressed buffer, or NULL when 'encfn' reports a
// size of 0. The compressed size is written to *output_size.
static uint8_t* encode(const uint8_t* rgb,
                       int width, int height, int stride,
                       float quality_factor,
                       WebPEncodeFunction encfn,
                       int* output_size, int* unused) {
  uint8_t* encoded = NULL;
  const size_t encoded_size =
      encfn(rgb, width, height, stride, quality_factor, &encoded);
  // returned_buffer_size() reads these two values as 'width' and 'height', so
  // storing (size, 1) makes its product equal to the compressed size.
  *output_size = encoded_size;
  *unused = 1;
  if (encoded_size == 0) {
    return NULL;
  }
  return encoded;
}
%}
//------------------------------------------------------------------------------
// libwebp/encode wrapper functions
%apply int *INPUT { int *unused1, int *unused2 }
%apply int *OUTPUT { int *output_size }
// free the buffer returned by these functions after copying into
// the native type
%newobject wrap_WebPEncodeRGB;
%newobject wrap_WebPEncodeBGR;
%newobject wrap_WebPEncodeRGBA;
%newobject wrap_WebPEncodeBGRA;
#ifdef SWIGJAVA
// There's no reason to call these directly
%javamethodmodifiers wrap_WebPEncodeRGB "private";
%javamethodmodifiers wrap_WebPEncodeBGR "private";
%javamethodmodifiers wrap_WebPEncodeRGBA "private";
%javamethodmodifiers wrap_WebPEncodeBGRA "private";
#endif /* SWIGJAVA */
%inline %{
// Changes the return type of WebPEncode* to more closely match Decode*.
// This also makes it easier to wrap the output buffer in a native type rather
// than dealing with the return pointer.
// The additional parameters are to allow reuse of returned_buffer_size(),
// unused2 and output_size will be used in this case.
// RGB variant: delegates to encode() with WebPEncodeRGB as the encoder.
// 'unused1' is never read; 'unused2' and 'output_size' are handed to encode(),
// which writes 1 and the compressed size through them respectively.
static uint8_t* wrap_WebPEncodeRGB(
const uint8_t* rgb, int* unused1, int* unused2, int* output_size,
int width, int height, int stride, float quality_factor) {
return encode(rgb, width, height, stride, quality_factor,
WebPEncodeRGB, output_size, unused2);
}
// BGR variant: delegates to encode() with WebPEncodeBGR as the encoder.
// 'unused1' is never read; 'unused2' and 'output_size' are handed to encode(),
// which writes 1 and the compressed size through them respectively.
static uint8_t* wrap_WebPEncodeBGR(
const uint8_t* bgr, int* unused1, int* unused2, int* output_size,
int width, int height, int stride, float quality_factor) {
return encode(bgr, width, height, stride, quality_factor,
WebPEncodeBGR, output_size, unused2);
}
// RGBA variant: delegates to encode() with WebPEncodeRGBA as the encoder.
// 'unused1' is never read; 'unused2' and 'output_size' are handed to encode(),
// which writes 1 and the compressed size through them respectively.
static uint8_t* wrap_WebPEncodeRGBA(
const uint8_t* rgba, int* unused1, int* unused2, int* output_size,
int width, int height, int stride, float quality_factor) {
return encode(rgba, width, height, stride, quality_factor,
WebPEncodeRGBA, output_size, unused2);
}
// BGRA variant: delegates to encode() with WebPEncodeBGRA as the encoder.
// 'unused1' is never read; 'unused2' and 'output_size' are handed to encode(),
// which writes 1 and the compressed size through them respectively.
static uint8_t* wrap_WebPEncodeBGRA(
const uint8_t* bgra, int* unused1, int* unused2, int* output_size,
int width, int height, int stride, float quality_factor) {
return encode(bgra, width, height, stride, quality_factor,
WebPEncodeBGRA, output_size, unused2);
}
%}
//------------------------------------------------------------------------------
// Language specific
#ifdef SWIGJAVA
%{
/* Work around broken gcj jni.h */
#ifdef __GCJ_JNI_H__
# undef JNIEXPORT
# define JNIEXPORT
# undef JNICALL
# define JNICALL
#endif
%}
%pragma(java) modulecode=%{
// Placeholder passed for the wrappers' 'unused1' / 'unused2' parameters.
private static final int UNUSED = 1;
// Receives the wrappers' 'output_size' output parameter; none of the methods
// below read it back.
// NOTE(review): this array is static and shared by all four methods, so
// concurrent callers write to the same element -- confirm this is acceptable.
private static int outputSize[] = { 0 };
// Public entry point for RGB input: forwards to the private
// wrap_WebPEncodeRGB() and returns its byte[] result.
public static byte[] WebPEncodeRGB(byte[] rgb,
int width, int height, int stride,
float quality_factor) {
return wrap_WebPEncodeRGB(
rgb, UNUSED, UNUSED, outputSize, width, height, stride, quality_factor);
}
// Same as WebPEncodeRGB(), for BGR input.
public static byte[] WebPEncodeBGR(byte[] bgr,
int width, int height, int stride,
float quality_factor) {
return wrap_WebPEncodeBGR(
bgr, UNUSED, UNUSED, outputSize, width, height, stride, quality_factor);
}
// Same as WebPEncodeRGB(), for RGBA input.
public static byte[] WebPEncodeRGBA(byte[] rgba,
int width, int height, int stride,
float quality_factor) {
return wrap_WebPEncodeRGBA(
rgba, UNUSED, UNUSED, outputSize, width, height, stride, quality_factor);
}
// Same as WebPEncodeRGB(), for BGRA input.
public static byte[] WebPEncodeBGRA(byte[] bgra,
int width, int height, int stride,
float quality_factor) {
return wrap_WebPEncodeBGRA(
bgra, UNUSED, UNUSED, outputSize, width, height, stride, quality_factor);
}
%}
#endif /* SWIGJAVA */

BIN
swig/libwebp.jar Normal file

Binary file not shown.

1518
swig/libwebp_java_wrap.c Normal file

File diff suppressed because it is too large Load Diff