Compare commits

...

144 Commits

Author SHA1 Message Date
Peter de Rivaz
fd05fb0c21 Corrected optimization of 8x8 DCT code
The 8x8 DCT uses a fast version whenever possible.
There was a mistake in the checking code which
meant sometimes the fast version was used when it
was not safe to do so.
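
A minimal C sketch of the kind of guard this message describes; the helper name and the bound are assumptions, not libvpx's actual code:

#include <stdint.h>

/* The fast 8x8 forward DCT is only safe when every input residual is small
 * enough that the intermediate arithmetic cannot overflow; the bug was in
 * a check of this shape, letting the fast path run on unsafe inputs. */
static int fdct8x8_fast_is_safe(const int16_t *input, int stride,
                                int max_abs) {
  int r, c;
  for (r = 0; r < 8; ++r) {
    for (c = 0; c < 8; ++c) {
      const int v = input[r * stride + c];
      if (v > max_abs || v < -max_abs)
        return 0;  /* potential overflow: caller must use the safe path */
    }
  }
  return 1;  /* every residual fits: the fast version may be used */
}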

Change-Id: I154c84c9e2d836764768a11082947ca30f4b5ab7
2014-12-11 15:54:23 +00:00
Debargha Mukherjee
39da55a49f Merge "Added tests for high bitdepth variance sse2 functions" into highbitdepth 2014-11-10 14:42:26 -08:00
Peter de Rivaz
6d741e4d76 Added tests for high bitdepth variance sse2 functions
Change-Id: I72735e2e07464a0f7e44968fb14a195c84a58992
2014-11-10 20:42:24 +00:00
Peter de Rivaz
db7192e0b0 Added highbitdepth sse2 acceleration for quantize and block error
Change-Id: Idef18f90b111a0d0c9546543d3347e551908fd78
2014-11-10 10:47:39 -08:00
Peter de Rivaz
08d2f54800 Fixed idct16x16_10 highbitdepth transform
In the case when there are only non-zero coefficients
in the first 4x4 block a special routine is called.
The highbitdepth optimized version of this routine
examined the wrong positions when deciding whether
to call an assembler or C inverse transform.
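
A hedged sketch of the decision being fixed; the bound, the stride handling and the names are illustrative:

#include <stdint.h>

typedef int32_t tran_low_t;  /* high bitdepth coefficient type */

/* With nonzero coefficients confined to the first 4x4 corner of the 16x16
 * array, only those positions need checking before taking the assembler
 * path; examining the wrong positions lets unsafe inputs through. */
static void idct16x16_10_add_checked(
    const tran_low_t *input, uint16_t *dest, int stride, int bd, int bound,
    void (*asm_version)(const tran_low_t *, uint16_t *, int, int),
    void (*c_version)(const tran_low_t *, uint16_t *, int, int)) {
  int r, c, safe = 1;
  for (r = 0; r < 4 && safe; ++r) {    /* rows 0..3 of the 16-wide array */
    for (c = 0; c < 4; ++c) {          /* columns 0..3 */
      const tran_low_t v = input[r * 16 + c];
      if (v > bound || v < -bound) {
        safe = 0;
        break;
      }
    }
  }
  if (safe)
    asm_version(input, dest, stride, bd);
  else
    c_version(input, dest, stride, bd);
}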

Change-Id: I62da663ca11775dadb66e402e42f4a1cb1927893
2014-11-10 16:17:49 +00:00
Deb Mukherjee
a1b726117f Iadst transforms to use internal low precision
Change-Id: I266777d40c300bc53b45b205144520b85b0d6e58
2014-11-06 13:57:04 -08:00
Peter de Rivaz
005d80cd05 Added high bitdepth sse2 transform functions
Change-Id: If359f0e9a71bca9c2ba685a87a355873536bb282
2014-11-06 11:50:47 -08:00
Peter de Rivaz
d7422b2b1e Added sse2 acceleration for highbitdepth variance
Change-Id: I446bdf3a405e4e9d2aa633d6281d66ea0cdfd79f
2014-11-04 10:06:06 -08:00
Peter de Rivaz
454342d4e7 Refactored idct routines and headers
This change is made in preparation for a
subsequent patch which adds acceleration
for the highbitdepth transform functions.

The highbitdepth transform functions attempt
to use 16/32bit sse instructions where possible,
but fall back to the C implementations if
potential overflow is detected.  For this reason
the dct routines are made global so they can be
called from the acceleration functions in the
subsequent patch.
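
A sketch of that try-SIMD-else-C pattern, with assumed helper names and an assumed magnitude bound:

#include <stdint.h>

typedef int32_t tran_low_t;

static int max_abs_coeff(const tran_low_t *coeff, int n) {
  int i, m = 0;
  for (i = 0; i < n; ++i) {
    const int a = coeff[i] < 0 ? -coeff[i] : coeff[i];
    if (a > m) m = a;
  }
  return m;
}

/* Run the 16/32-bit SSE version only when coefficient magnitudes rule out
 * intermediate overflow; otherwise call the exact C implementation. */
static void idct_add_checked(
    const tran_low_t *coeff, uint16_t *dst, int stride, int bd, int n,
    int sse_bound,
    void (*sse_version)(const tran_low_t *, uint16_t *, int, int),
    void (*c_version)(const tran_low_t *, uint16_t *, int, int)) {
  if (max_abs_coeff(coeff, n) < sse_bound)
    sse_version(coeff, dst, stride, bd);
  else
    c_version(coeff, dst, stride, bd);  /* potential overflow: stay exact */
}

The inverse-transform test thresholds quoted in the diffs below (3155 for 16x16, 6201 for 8x8) are bounds of exactly this kind.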

Change-Id: Ia921f191bf6936ccba4f13e8461624b120c1f665
2014-10-24 08:37:39 +01:00
Debargha Mukherjee
cda2ad0121 Merge "Fixed calling of highbd transform." into highbitdepth 2014-10-22 12:57:43 -07:00
Debargha Mukherjee
6b378b8868 Merge "Tidy up of highbitdepth loopfilter and convolution" into highbitdepth 2014-10-22 09:58:29 -07:00
Peter de Rivaz
123f29d1d7 Tidy up of highbitdepth loopfilter and convolution
Change-Id: I65531cc55d3d6949e164e2e26f92ee44a1921f7e
2014-10-22 09:43:03 +01:00
Peter de Rivaz
4230c2306c Fixed calling of highbd transform.
This patch does not change behaviour because
vp9_fwht4x4 is identical to vp9_highbd_fwht4x4,
but it becomes important when accelerations are made
to vp9_highbd_fwht4x4 in a later patch.

Change-Id: I2b790316cdd498727c4951a9e591edb291de3ac8
2014-10-22 09:26:39 +01:00
Alex Converse
5b76018057 Merge "Added highbitdepth sse2 SAD acceleration and tests" into highbitdepth 2014-10-20 09:55:41 -07:00
Peter de Rivaz
b1a6f6b9cb Added highbitdepth sse2 SAD acceleration and tests
Change-Id: I9f09e404e3136951e5cc15bf40b915c1fe10b620
2014-10-20 09:51:01 +01:00
Debargha Mukherjee
a92f987a6b Merge "Add highbitdepth function for vp9_avg_8x8" into highbitdepth 2014-10-16 14:39:14 -07:00
Peter de Rivaz
1bf87dc353 Add highbitdepth function for vp9_avg_8x8
Change-Id: I6903e4e4cb57d90590725c8a1c64c23da7ae65e8
2014-10-16 14:36:07 +01:00
Deb Mukherjee
b84bf3323b Fix in bit-shift operation for highbitdepth decode
Fixes a bug introduced in a previous refactoring patch.

Change-Id: I243e74637cfd7a997c7a1fef03b06c290dd0dee6
2014-10-15 10:28:30 -07:00
Deb Mukherjee
563aeba901 Merge 'origin/master' into highbitdepth
Conflicts:
	examples/vp9_spatial_svc_encoder.c
	examples/vpx_temporal_svc_encoder.c
	test/convolve_test.cc
	test/dct16x16_test.cc
	test/dct32x32_test.cc
	test/fdct4x4_test.cc
	test/fdct8x8_test.cc
	test/lpf_8_test.cc
	test/partial_idct_test.cc
	test/test.mk
	test/vp9_intrapred_test.cc
	tools_common.c
	vp8/vp8_cx_iface.c
	vp9/common/vp9_alloccommon.c
	vp9/common/vp9_common.h
	vp9/common/vp9_convolve.c
	vp9/common/vp9_convolve.h
	vp9/common/vp9_entropy.c
	vp9/common/vp9_entropy.h
	vp9/common/vp9_idct.c
	vp9/common/vp9_idct.h
	vp9/common/vp9_loopfilter.c
	vp9/common/vp9_loopfilter_filters.c
	vp9/common/vp9_onyxc_int.h
	vp9/common/vp9_postproc.c
	vp9/common/vp9_quant_common.c
	vp9/common/vp9_quant_common.h
	vp9/common/vp9_reconinter.c
	vp9/common/vp9_reconinter.h
	vp9/common/vp9_reconintra.c
	vp9/common/vp9_rtcd_defs.pl
	vp9/common/vp9_scale.c
	vp9/common/vp9_scale.h
	vp9/common/x86/vp9_asm_stubs.c
	vp9/common/x86/vp9_high_intrapred_sse2.asm
	vp9/common/x86/vp9_high_loopfilter_intrin_sse2.c
	vp9/common/x86/vp9_high_subpixel_8t_sse2.asm
	vp9/common/x86/vp9_high_subpixel_bilinear_sse2.asm
	vp9/decoder/vp9_decodeframe.c
	vp9/decoder/vp9_detokenize.c
	vp9/encoder/vp9_aq_complexity.c
	vp9/encoder/vp9_aq_cyclicrefresh.c
	vp9/encoder/vp9_aq_variance.c
	vp9/encoder/vp9_bitstream.c
	vp9/encoder/vp9_block.h
	vp9/encoder/vp9_context_tree.h
	vp9/encoder/vp9_dct.c
	vp9/encoder/vp9_encodeframe.c
	vp9/encoder/vp9_encodemb.c
	vp9/encoder/vp9_encoder.c
	vp9/encoder/vp9_encoder.h
	vp9/encoder/vp9_extend.c
	vp9/encoder/vp9_firstpass.c
	vp9/encoder/vp9_lookahead.c
	vp9/encoder/vp9_lookahead.h
	vp9/encoder/vp9_mcomp.c
	vp9/encoder/vp9_picklpf.c
	vp9/encoder/vp9_pickmode.c
	vp9/encoder/vp9_quantize.c
	vp9/encoder/vp9_quantize.h
	vp9/encoder/vp9_ratectrl.c
	vp9/encoder/vp9_rd.c
	vp9/encoder/vp9_rdopt.c
	vp9/encoder/vp9_resize.c
	vp9/encoder/vp9_resize.h
	vp9/encoder/vp9_sad.c
	vp9/encoder/vp9_ssim.c
	vp9/encoder/vp9_ssim.h
	vp9/encoder/vp9_temporal_filter.c
	vp9/encoder/vp9_tokenize.c
	vp9/encoder/vp9_tokenize.h
	vp9/encoder/vp9_variance.c
	vp9/encoder/vp9_variance.h
	vp9/vp9_common.mk
	vp9/vp9_cx_iface.c
	vp9/vp9_dx_iface.c
	vp9/vp9_iface_common.h
	vpx/src/svc_encodeframe.c
	vpx/src/vpx_image.c
	vpx/vp8dx.h
	vpx/vpx_codec.h
	vpx/vpx_encoder.h
	vpx/vpx_image.h
	vpx_mem/vpx_mem.c
	vpx_mem/vpx_mem.h
	vpx_scale/generic/yv12config.c
	vpx_scale/generic/yv12extend.c
	vpx_scale/yv12config.h
	vpxdec.c
	vpxenc.c

Change-Id: I699c833f7da96569d2581af7e045ce523bf72d3b
2014-10-14 10:08:15 -07:00
Debargha Mukherjee
93657ee6ec Merge "Add bit_depth to internal image structure" into highbitdepth 2014-09-22 10:03:09 -07:00
Peter de Rivaz
3df0e78eae Add bit_depth to internal image structure
This change is made to allow show_existing_frame
to display an image of a different bit depth to
the frames currently being decoded.

Change-Id: Ia150bde2a4ed56fe3892a53fd7b5b9f5182ad59f
2014-09-22 16:02:30 +01:00
Peter de Rivaz
e27a93fe25 Fix for when profile switches back to 8bit
Change-Id: I820027d8aebef67cb8380c47475ee59e19d7a97b
2014-09-22 10:02:55 +01:00
Deb Mukherjee
159247f30c Merge "Add missing sse code for some functions" into highbitdepth 2014-09-16 16:54:25 -07:00
Deb Mukherjee
06cfc9f3e1 Add missing sse code for some functions
Change-Id: I9830217108a74f546e707ab82a15e278319f4360
2014-09-16 11:37:37 -07:00
Deb Mukherjee
85dba225a8 Merge "Fix 2x scaling bug" into highbitdepth 2014-09-16 09:39:36 -07:00
Deb Mukherjee
53ef87e0c7 Fix asserts in high_convolve
Change-Id: I89fb190ef22d8edf7622dea5ef1c0e47faa95d23
2014-09-15 11:33:27 -07:00
Deb Mukherjee
09faceb706 Fix 2x scaling bug
Reported by Alexander Voronov

Change-Id: I463ba323cc9f4e345b7e6f759565725737793086
2014-09-15 09:48:35 -07:00
Deb Mukherjee
75c8fc2412 Deprecate vpx_bit_depth_to_bps function
Function not needed.

Change-Id: Icaad4cbd218ddd8e3720a7d9aad6b69aeffa369b
2014-09-12 16:40:26 -07:00
Deb Mukherjee
635bb7aed4 Change macros to be compatible with master branch
Changes image fmt and codec capability macro names.

Change-Id: Ic07fbd8de307cb1bfcf9ed4b4bc3feaa5767c53b
2014-09-12 11:47:39 -07:00
Deb Mukherjee
dd0a5ecd2c Minor Refactor of variance functions
Change-Id: If27a938cefeb6e8439f897900e17e2e85a2239d1
2014-09-11 23:10:49 -07:00
Deb Mukherjee
3d170a834a change data type in vp9_idct.c
Change-Id: I2efb51e0e94fd4d42334d86c7dcee08d1a1cc672
2014-09-11 20:14:35 -07:00
Debargha Mukherjee
1db0b0ff1b Merge "Reinitialize dequantizer when switching from 10/12 bit to 8 bit." into highbitdepth 2014-09-11 20:05:33 -07:00
Alexander Voronov
902529c595 Reinitialize dequantizer when switching from 10/12 bit to 8 bit.
Change-Id: Id294cf8d314a3f8aaf4ca2a6b3da052cc898a78c
2014-09-11 21:00:51 +04:00
Alexander Voronov
7dc3cdba0c Make --output-bit-depth option work with variable resolution.
The buffer for bit-depth conversion was allocated only on the first frame,
so a subsequent frame with a resolution different from the first one led
to a decoding error. With this change the decoder reallocates the buffer
in that case.
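
A minimal sketch of the fix's shape; the struct and function names are illustrative:

#include <stdint.h>
#include <stdlib.h>

typedef struct {
  uint16_t *data;
  int width, height;
} ConversionBuffer;

/* Reallocate whenever a frame arrives with a different resolution, instead
 * of allocating once for the first frame.  Returns 0 on success. */
static int conversion_buffer_ensure(ConversionBuffer *cb, int w, int h) {
  if (cb->data == NULL || cb->width != w || cb->height != h) {
    free(cb->data);
    cb->data = (uint16_t *)malloc((size_t)w * h * sizeof(*cb->data));
    if (cb->data == NULL) return -1;
    cb->width = w;
    cb->height = h;
  }
  return 0;
}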

Change-Id: I3a701ca8df53a60246354876856624e70efe81aa
2014-09-10 16:30:30 +04:00
Alexander Voronov
f9b4008020 Fix chroma plane size in bit-depth conversion.
Change-Id: Ie75ef8f934d02b57b543ca308ae304458d3962a8
2014-09-10 15:03:50 +04:00
Alexander Voronov
64fca22b4d Fix Visual Studio build.
Change-Id: I6fc8cf7af19b884366ec45e0b8f1e500015b38b8
2014-09-09 11:35:12 +04:00
Peter de Rivaz
b94a475d7b In profile 3 we need 2 bytes to show a previous frame
Change-Id: I208632dd94dfc45ab78312e26fee569270ce0ba8
2014-09-01 16:09:07 +01:00
Deb Mukherjee
325fe4a430 Merge "Replace CONVERT_TO_SHORTPTR/BYTEPTR" into highbitdepth 2014-08-29 12:24:07 -07:00
Deb Mukherjee
484a7f31a6 Replace CONVERT_TO_SHORTPTR/BYTEPTR
Uses uintptr_t instead of uint32_t or uint64_t

Change-Id: Id21204969d2723e6c64c29a5d39ec85bfbba9ea8
2014-08-29 10:05:50 -07:00
Deb Mukherjee
059a721d92 Change vpx bit depth enum type to be more natural
Also moves bit_depth_to_bps function to the vpx level.

Change-Id: I7c24f80b2b5bd2fcc6552b61aacee4c7954cc54b
2014-08-28 23:57:57 -07:00
Deb Mukherjee
23b5c58174 Set scaled image format correctly
Change-Id: Ic4ced4208375ca31f8adb73a5ee9ddd7da50dfac
2014-08-22 23:33:29 -07:00
Deb Mukherjee
02118dcb3b Makes high_quant enabled by default
With this change, when --enable-vp9-high is used, high quantization is
used by default, unless it is turned off with --disable-high-quant.

Change-Id: I8a127428181d5dd726a4f3f225ea80f3215201ba
2014-08-19 14:51:00 -07:00
James Hutchinson
3489c19d2b Merge branch 'master' into highbitdepth
Conflicts:
	configure
	test/convolve_test.cc
	test/dct16x16_test.cc
	test/dct32x32_test.cc
	test/fdct4x4_test.cc
	test/fdct8x8_test.cc
	test/partial_idct_test.cc
	third_party/libyuv/README.libvpx
	vp9/common/vp9_enums.h
	vp9/common/vp9_idct.h
	vp9/common/vp9_rtcd_defs.pl
	vp9/decoder/vp9_decodeframe.c
	vp9/encoder/vp9_bitstream.c
	vp9/encoder/vp9_encodeframe.c
	vp9/encoder/vp9_encoder.c
	vp9/encoder/vp9_encoder.h
	vp9/encoder/vp9_extend.c
	vp9/encoder/vp9_quantize.c
	vp9/encoder/vp9_rd.c
	vp9/encoder/vp9_rdopt.c
	vp9/vp9_cx_iface.c
	vp9/vp9_dx_iface.c
	vp9/vp9_iface_common.h
	vpx/vp8cx.h
	vpx_scale/generic/yv12config.c
	vpxdec.c
	vpxenc.c

Change-Id: If4104c5a7cd0a29dd0bed7c3804837ba40ba7e0c
2014-08-19 10:58:02 -07:00
Deb Mukherjee
9595633751 Merge "Hdr change for profiles > 1 for intra-only frames" into highbitdepth 2014-08-07 11:19:55 -07:00
Deb Mukherjee
cea11ebb1e Hdr change for profiles > 1 for intra-only frames
Adds bitdepth, color sampling and color space information to the header
for intra-only frames in profiles > 0.
Also enforces exclusive use of profiles 1 and 3 for non-420 color
sampling.

Change-Id: I92b0630d5193fdbc6e71fa909d684521c12b5d99
2014-08-07 09:55:19 -07:00
Yaowu Xu
60ee54352e fix build errors in vs2012
Macro expansion in VS2012 does not work with the existing code; the change
in this commit helps VS2012 properly expand the macros and avoids build
errors in the test_libvpx project.

Change-Id: I642921462ee4869ea07f90795d7de0f6eaf7655b
2014-08-06 14:47:50 -07:00
Debargha Mukherjee
4b11ea5e32 Merge "Merged vp9_high and high_transforms options" into highbitdepth 2014-08-04 11:47:05 -07:00
Peter de Rivaz
27dc02de95 Remove warnings about uninitialized variables
Change-Id: I25c8b393239e9a5ba5a0f5a5c5897bb38915f33e
2014-07-29 16:21:43 +01:00
Peter de Rivaz
b7327fd3ea Merged vp9_high and high_transforms options
Using --enable-vp9-high now automatically uses
the high precision transform code.

(Before you needed to specify both
--enable-vp9-high and --enable-high-transforms)

Change-Id: I742d5b82601bc38eb81c95d7ecd3f78b9ff0df57
2014-07-29 15:59:06 +01:00
Peter de Rivaz
6791a9b1d5 Added high bitdepth to SVC example
Change-Id: I24498bace25b01f796f530699b58aa1e460f5ebc
2014-07-22 10:59:59 +01:00
Peter de Rivaz
3edc408011 Added high bitdepth support to temporal example
Change-Id: I80f802ecf82ecf18952f025643a56e1986def887
2014-07-22 10:53:50 +01:00
James Hutchinson
41c8641b6b Merge branch 'master' into highbitdepth
Conflicts:
    configure
    test/convolve_test.cc
    test/dct16x16_test.cc
    test/dct32x32_test.cc
    test/fdct4x4_test.cc
    test/fdct8x8_test.cc
    vp9/common/vp9_alloccommon.c
    vp9/common/vp9_entropy.c
    vp9/common/vp9_enums.h
    vp9/common/vp9_quant_common.c
    vp9/common/vp9_quant_common.h
    vp9/common/vp9_rtcd_defs.pl
    vp9/common/vp9_scale.c
    vp9/decoder/vp9_decodeframe.c
    vp9/decoder/vp9_decodeframe.h
    vp9/decoder/vp9_detokenize.c
    vp9/encoder/vp9_aq_complexity.c
    vp9/encoder/vp9_bitstream.c
    vp9/encoder/vp9_encodeframe.c
    vp9/encoder/vp9_encodemb.c
    vp9/encoder/vp9_encoder.c
    vp9/encoder/vp9_firstpass.c
    vp9/encoder/vp9_mcomp.c
    vp9/encoder/vp9_pickmode.c
    vp9/encoder/vp9_quantize.c
    vp9/encoder/vp9_ratectrl.c
    vp9/encoder/vp9_rd.c
    vp9/encoder/vp9_rdopt.c
    vp9/encoder/vp9_temporal_filter.c
    vp9/encoder/vp9_tokenize.c
    vp9/vp9_cx_iface.c
    vp9/vp9_dx_iface.c
    vpx/vpx_codec.h
    vpx/vpx_image.h
    vpx_scale/generic/yv12config.c
    vpxdec.c
    vpxenc.c
    vpxenc.h
    y4menc.c
    y4minput.c

Change-Id: I53b19ea1d9818a4440481920065d70164348d02e
2014-07-18 14:58:26 -07:00
Deb Mukherjee
0a904797f6 Merge "Support for raw yuv input in 422/444 sampling" into highbitdepth 2014-07-16 10:59:14 -07:00
Deb Mukherjee
cc3e6d68df Support for raw yuv input in 422/444 sampling
Adds options --i422 and --i444 which, along with --input-bit-depth,
will allow raw video input in 422 and 444 sampling at the given
bit-depth. For the decoder, the new option --rawvideo allows
decoding to raw yuv at the color sampling of the decoded bit-stream.

Change-Id: I5b3979be66c0dd2672391400850c97260cc8e1e8
2014-07-16 09:21:16 -07:00
Deb Mukherjee
aba709a5ec Change extra bit prob tables for HBD
Improves coding performance a little.

Change-Id: I9cb3279cec201274ecaa972f270156677e1bdf98
2014-07-16 03:53:39 -07:00
Deb Mukherjee
b0bcd57d12 Generalize read_yuv_frame
Change-Id: I52288ff1a01e6b68fc7eb85425e750b909e56d0c
2014-07-12 01:37:51 -07:00
Deb Mukherjee
78df9ead21 Fix for raw yuv input
Sets fmt correctly for raw yuv inputs, and reads frames
correctly.

Change-Id: I05405b2265545d5c6d1cf7b086db9548cbe5d8ba
2014-07-12 00:19:57 -07:00
Deb Mukherjee
cba05a9ecb Merge "Tokenization separated for 10 and 12 bit" into highbitdepth 2014-07-11 10:50:17 -07:00
Deb Mukherjee
87d40bbafc Tokenization separated for 10 and 12 bit
Enables tuning the probabilities separately, and also
reduces the number of bool decodes for 10-bit cat 6
tokens.

Change-Id: Ib237ad07983f43048ba47245efe8fa53c0d25819
2014-07-11 07:00:14 -07:00
Deb Mukherjee
da82f64342 Set default bit_depth to 8 for vp8
Also adds some cleanups

Change-Id: I86af7590397e5a281b3808a36031642c1705a17a
2014-07-11 06:22:43 -07:00
Peter de Rivaz
02c3221593 Allow more combinations for output bitdepth
Change-Id: I1e5de85320d825ffdf90be53720e581f72a5f701
2014-07-10 11:50:25 +01:00
Deb Mukherjee
139ac07ae7 Changes quantization tables to improve quality
Changes the quantization tables so that the scale factor relative to the
8-bit version is 1 at the low-q end and increases to 4 (16) at the
high-q end for the 10-bit (12-bit) version, using a logarithmic-like
function.

This change narrows the quality gap against the no-high-quant version
for both 10-bit and 12-bit. 10-bit is virtually identical in performance
to the no-high-quant version, while the gap for 12-bit is reduced to
the 0.3% range.
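
A purely illustrative curve with the properties described above; the real tables are precomputed constants, not this formula:

#include <math.h>

/* Scale factor of the 10/12-bit quantizer relative to the 8-bit table:
 * 1.0 at the lowest qindex, rising on a logarithmic-like ramp to 4
 * (10-bit) or 16 (12-bit) at the highest. */
static double hbd_quant_scale(int qindex, int bit_depth) {
  const double max_scale = (bit_depth == 10) ? 4.0 : 16.0;
  return exp(log(max_scale) * qindex / 255.0);
}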

Change-Id: I08cc329a77aeb19cc232f8e4458f188c6ae65363
2014-07-09 16:20:15 -07:00
Debargha Mukherjee
209885c785 Merge "Added SSE2 high bitdepth loopfilter" into highbitdepth 2014-07-08 09:49:47 -07:00
Debargha Mukherjee
000ea5e446 Merge "Fixes for highbitdepth on 32bit x86" into highbitdepth 2014-07-08 09:34:20 -07:00
Peter de Rivaz
c62b82d7c9 Added SSE2 high bitdepth loopfilter
Change-Id: I4e5ecaaf956d30bd2e5301d57f02e277100ceb7f
2014-07-08 16:49:59 +01:00
Peter de Rivaz
7e802a5bba Fixes for highbitdepth on 32bit x86
Some of the assembler routines only work on 64-bit
architectures. This patch makes such routines fall
back to the C implementations.

Change-Id: Ia1e59d9ce5856eca0d56ab59fbc9436fa2838745
2014-07-08 16:12:25 +01:00
Deb Mukherjee
8a50b60c9c Change quantizer to start from 4
Fixes lossless quantizer

Change-Id: I2b9d00afc65c5a374f7b988c5082738d4b1a0bea
2014-07-08 08:10:22 -07:00
Deb Mukherjee
d5f6cd1b38 Changes quantization values for 10/12 bit
Improves performance by eliminating some values at the very low
end of the quantizer range, where entropy coding is not very
efficient.

Change-Id: I3cacc7352dc2e58cfe8448d89a693e992ef93ee7
2014-07-07 12:30:43 -07:00
Peter de Rivaz
91c222491e Alternate high bitdepth quantizer changes
In this proposal, the qindex range is kept at 0 to 255
but the values are remapped to cover an extended range of
quantizer values.

This simplifies the code and bitstream compared to the 8-bit version.

Change-Id: I0dda61388cef41e21a0d5c34d817c862de637580
2014-07-03 13:53:02 +01:00
Debargha Mukherjee
2cf1232bd6 Merge "Modify initial value for avg_frame_qindex" into highbitdepth 2014-07-02 14:30:32 -07:00
Peter de Rivaz
3cd669c548 Modify initial value for avg_frame_qindex
In high bitdepth mode there is an extended quantizer range.
This means that it takes longer for the avg_frame_qindex
to ramp up and this results in coding loss for short
low bitrate sequences.

This patch adds a boost in high bitdepth mode to compensate
for this effect.

This helps with the bowing_cif.y4m sequence at 50kbps.

Change-Id: Ie1575d88e8de4f0297cf86da50eb36dfc5442c70
2014-07-02 20:50:21 +01:00
Peter de Rivaz
56f2cb9478 Added config-enable-hw-high
In the inverse transform code, overflow results in
implementation-defined behaviour.

Hardware implementations are expected to wrap on
overflow in order to use the smallest data path
possible.  The new config option makes the C reference code
use wrap on overflow.

Encoders are recommended to not generate bitstreams that
result in overflow in the inverse transform.
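
A sketch of what "wrap on overflow" means for the C reference; the width parameter and helper name are assumptions:

#include <stdint.h>

/* Truncate an intermediate sum to a fixed signed width, as a minimal
 * hardware data path would, instead of letting it grow. */
static int32_t wrap_signed(int64_t v, int width) {
  const int64_t mask = ((int64_t)1 << width) - 1;
  int64_t w = v & mask;
  if (w & ((int64_t)1 << (width - 1)))
    w -= (int64_t)1 << width;  /* sign-extend back into range */
  return (int32_t)w;
}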

Change-Id: I73836ccfb925b4dbfb29c257a3784fa7733db877
2014-07-02 20:20:08 +01:00
Peter de Rivaz
6dd6bb6602 Added SSE2 intra prediction high bitdepth
Change-Id: I2c02a6b42378e130f693aadbe92e85727b28ec7c
2014-07-02 15:07:16 +01:00
Deb Mukherjee
225c60848b Adds comment why avx2 removed for high transforms
Change-Id: I87efe103017e80414082830987343a053e1e8cd8
2014-07-02 06:05:12 -07:00
Peter de Rivaz
67dd2eaff7 Added SSE2 high bitdepth convolution
Change-Id: I525cda28b273aa7ef4fcb2c8f690273cb719562c
2014-07-02 12:11:35 +01:00
Debargha Mukherjee
d61fbfd760 Merge "AVX2 optimization is not valid with HIGH_TRANSFORMS" into highbitdepth 2014-07-01 09:49:21 -07:00
Peter de Rivaz
6c9426809f AVX2 optimization is not valid with HIGH_TRANSFORMS
With CONFIG_HIGH_TRANSFORMS, the transform coefficients
are held in 32bit values, so the assembler code for
vp9_block_error can no longer be used.

Change-Id: I2c7d8a81a32fe4dfcac045588ea1c9e89a267f1f
2014-07-01 15:55:28 +01:00
Deb Mukherjee
7f0eaadadc Merge "Extending y4m writing" into highbitdepth 2014-06-30 16:49:18 -07:00
Deb Mukherjee
7051f38e5a Extending y4m writing
Also adds some cleanups.

Change-Id: I0b35c6645236c7ca3f05e3ed9eae4571fd8249d2
2014-06-27 15:35:36 -07:00
Peter de Rivaz
d21a89895f Use high_precision_mv for highbitdepth
The use of high precision motion vectors depends
on the qindex level.  For highbitdepth it improves PSNR
to use high precision motion vectors for more values of
qindex.
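
A sketch of the policy; the cutoff values are invented for illustration, not the tuned ones:

/* 1/8-pel motion vectors are enabled below a qindex threshold; in high
 * bitdepth mode the threshold is raised so more frames qualify. */
static int use_high_precision_mv(int qindex, int is_highbitdepth) {
  const int cutoff = is_highbitdepth ? 180 : 120;  /* illustrative numbers */
  return qindex < cutoff;
}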

Change-Id: I1dc5a4cc38dc0ac71a2a37dfd478f37d7c361b19
2014-06-24 12:25:33 -07:00
Debargha Mukherjee
149c891dac Merge "Merge branch 'master' into highbitdepth" into highbitdepth 2014-06-24 11:28:56 -07:00
James Hutchinson
939f871ccc Merge branch 'master' into highbitdepth
Conflicts:
	configure
	test/dct16x16_test.cc
	test/dct32x32_test.cc
	test/fdct4x4_test.cc
	test/fdct8x8_test.cc
	test/partial_idct_test.cc
	vp9/common/vp9_blockd.h
	vp9/common/vp9_idct.h
	vp9/common/vp9_rtcd_defs.pl
	vp9/decoder/vp9_decodeframe.c
	vp9/encoder/vp9_aq_complexity.c
	vp9/encoder/vp9_block.h
	vp9/encoder/vp9_dct.c
	vp9/encoder/vp9_encodeframe.c
	vp9/encoder/vp9_encodemb.c
	vp9/encoder/vp9_encoder.c
	vp9/encoder/vp9_firstpass.c
	vp9/encoder/vp9_pickmode.c
	vp9/encoder/vp9_quantize.c
	vp9/encoder/vp9_quantize.h
	vp9/encoder/vp9_ratectrl.c
	vp9/vp9_cx_iface.c

Change-Id: I402e1e91c6207c41a5bc1508ccfceec62196772b
2014-06-24 09:58:17 -07:00
Peter de Rivaz
6eb5e8ded6 Fix fps messages for 8bit mode
The addition of the high bitdepth image shifting
broke the fps measurement for the 8bit encoder.

Change-Id: I70144fe5c9a821b582d461091c616b0a3f666cbe
2014-06-24 14:20:27 +01:00
Peter de Rivaz
a13f2137fa Corrected check for valid upshifts
When converting 8-bit input streams to 10 or 12 bitdepth
an error message was incorrectly triggering.

Change-Id: I6252e2a6c8304863cc71b8764830c713ccee2ff2
2014-06-19 09:12:03 +01:00
Debargha Mukherjee
8ba0eeba1b Merge "Extended quantizer range for high bitdepth" into highbitdepth 2014-06-19 00:00:55 -07:00
Peter de Rivaz
8ca39ede47 Extended quantizer range for high bitdepth
These changes allow 10 and 12 bit depth streams
to encode at higher quality by using a finer
quantizer.  Category 6 tokens now transmit 18
extra bits instead of 14 in order to be able to
encode the greater range of output coefficients.

The extended quantizer range is only used when
configured with the following options:
--enable-vp9-high
--enable-high-transforms
--enable-high-quant
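
The token change above, reduced to its essential constant; the flag name is illustrative:

/* Category-6 tokens carry the residue of very large coefficients in raw
 * extra bits, so the wider coefficient range needs more of them. */
static int cat6_extra_bits(int extended_quant_range) {
  return extended_quant_range ? 18 : 14;  /* per this commit */
}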

Change-Id: I58d2981676d67b65cc022e98cf443603d38ba6ff
2014-06-18 13:36:36 +01:00
Debargha Mukherjee
db55558e0b Merge "Allow encoding 10 bit input at 12 bitdepth" into highbitdepth 2014-06-17 09:43:37 -07:00
Peter de Rivaz
4c017c00e6 Allow encoding 10 bit input at 12 bitdepth
Change-Id: I85c3a08e97e738237527762d323ce22522ded304
2014-06-16 10:58:37 +01:00
Deb Mukherjee
20e745e8e4 Cosmetic cleanups
Use bit_depth_to_bps() inline function for consistency

Change-Id: Id79c8a82d40eab8fd87526a165837f7618e33993
2014-06-12 17:13:24 -07:00
Debargha Mukherjee
e0305995f3 Merge "Corrected highbitdepth aq variance" into highbitdepth 2014-06-12 08:31:32 -07:00
Peter de Rivaz
7ea0334f9a Corrected highbitdepth aq variance
Change-Id: Idfa10a8f92e2a4bd4c75cda68f3b800f119a4b1e
2014-06-12 13:44:23 +01:00
Peter de Rivaz
65d2615208 Fixed overflow in highbitdepth deblocking
Change-Id: Ia9f592e9ca3a1b8414c6bb39541a7fbdada4702a
2014-06-12 13:35:10 +01:00
Deb Mukherjee
9b33e1088f Adding error checking for 422/444 inputs.
For 422 and 444 inputs, adds checks for profile.

Change-Id: I1d8e1120d4214101ba9c27b81d4381dc61b22de5
2014-06-11 11:43:58 -07:00
Deb Mukherjee
8b72b71c1c Merge "Reworks high-bit-depth profiles" into highbitdepth 2014-06-11 06:04:14 -07:00
Deb Mukherjee
093a32ffd7 Reworks high-bit-depth profiles
Splits profile 2 into profiles 2 and 3, where profile 2
only supports 420 sampling, while profile 3 adds 422/444 and
alpha. Keeps room for further expansion.

Also makes some minor changes in the decoder parameters,
replacing --convert-to-8bit with --output-bit-depth.
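
A sketch of the resulting profile mapping, consistent with this message and the header-change commit above; the function name is illustrative:

/* Profiles 2 and 3 are the high bitdepth counterparts of 0 and 1; the
 * odd-numbered profiles carry the non-420 samplings. */
static int vp9_profile_for(int bit_depth, int subsampling_x,
                           int subsampling_y) {
  const int is_420 = (subsampling_x == 1 && subsampling_y == 1);
  if (bit_depth > 8) return is_420 ? 2 : 3;
  return is_420 ? 0 : 1;
}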

Change-Id: I713525880512de6c36698d212795db1543c1d0dd
2014-06-10 17:30:45 -07:00
Peter de Rivaz
321bd42060 Improved highbitdepth RDO
Change-Id: I5bc6e94d9f3f64b2467f357da0d097347ad5f0c6
2014-06-10 16:52:31 -07:00
Deb Mukherjee
f08489e609 Better block error computation
Shift is applied after high-precision arithmetic rather than
before.
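
A sketch of the reordering, shaped like the vp9_highbd_block_error functions exercised by the new error_block_test.cc later on this page; the rounding detail is an assumption:

#include <stdint.h>

typedef int32_t tran_low_t;

/* Accumulate squared differences at 64-bit precision first, then apply
 * the bit-depth downshift once at the end, instead of shifting each term
 * before summation. */
static int64_t highbd_block_error_sketch(const tran_low_t *coeff,
                                         const tran_low_t *dqcoeff,
                                         intptr_t block_size, int64_t *ssz,
                                         int bd) {
  const int shift = 2 * (bd - 8);
  const int64_t rounding = shift > 0 ? (int64_t)1 << (shift - 1) : 0;
  int64_t error = 0, sqcoeff = 0;
  intptr_t i;
  for (i = 0; i < block_size; ++i) {
    const int64_t diff = coeff[i] - dqcoeff[i];
    error += diff * diff;
    sqcoeff += (int64_t)coeff[i] * coeff[i];
  }
  *ssz = (sqcoeff + rounding) >> shift;  /* shift after the full-precision sum */
  return (error + rounding) >> shift;
}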

Change-Id: Ibd178fe8d10600935f6d5e790e89f3b2f8b4afcf
2014-06-09 17:41:42 -07:00
Peter de Rivaz
4320ac26ee Fixed overflow in high transforms
Added extremal forward and inverse tests
for the 8x8 DCT.

Change-Id: I5445c6449b0a9bda1359072617b915446510db69
2014-06-09 09:53:45 +01:00
Deb Mukherjee
e91d29dea3 Alternate rounding
Improves performance on derf by 0.89% for 10-bit internal
and by 0.55% for 12-bit internal, both for 8-bit sources.

Change-Id: I181fd9fb10e2259233d67cdd7933fb3cae334afc
2014-06-06 05:10:31 -07:00
Peter de Rivaz
091829d376 Scaled deblocking limits for highbitdepth
Slightly improves PSNR for some sequences.
Also fixes a bug.

Change-Id: Ibc4c2f5f5c280470c99dad642d153bd91cd90798
2014-06-05 03:22:28 -07:00
Peter de Rivaz
666fd1300c Added high precision transforms
The high precision transforms are only used if
configured with --enable-high-transforms.

It gives greater precision in the transform.
This gives PSNR improvements when encoding
true 10 and 12 bit streams.

At the moment, the quantizer used is shifted
up by 2/4 for 10/12 bits so that the quantized
coefficients fit in the current token range.
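
The shift described above, as a one-line helper (name illustrative):

static int shifted_quant(int q8, int bit_depth) {
  return q8 << (bit_depth - 8);  /* << 2 for 10-bit, << 4 for 12-bit */
}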

Change-Id: Ia9c19a417cf030b8a7a889fcb3f5788bfca8215f
2014-06-05 01:09:32 -07:00
Debargha Mukherjee
47354ee2f4 Merge "Fixed C postproc implementation" into highbitdepth 2014-06-04 11:23:40 -07:00
Debargha Mukherjee
0c95fcc25c Merge "Corrected highbitdepth temporal filter" into highbitdepth 2014-06-04 11:22:45 -07:00
Peter de Rivaz
3224fd3c66 Fixed C postproc implementation
Destination stride was not used correctly.

Change-Id: I503d037608fdcde3f433a87e103426ad4a6b9ef4
2014-06-04 17:28:24 +01:00
Peter de Rivaz
822c27cd42 Corrected highbitdepth temporal filter
Change-Id: Ic9aa3672ce0fe257133938b8e3a93f28d3cbb877
2014-06-04 16:07:24 +01:00
Peter de Rivaz
42a1a3e3ba Fix temporal filter for 422
Change-Id: I8f2fb8b8860010f0460a200c31edec4a17c773d8
2014-06-04 13:17:46 +01:00
Deb Mukherjee
51790ab228 Some code cleanups
Removes duplicate enums and other cosmetic changes.

Change-Id: Ic8b47534ac3b2b554a79ff1437fbe5f0503a5732
2014-06-03 16:20:23 -07:00
Peter de Rivaz
1ff621ec99 Added support for 10/12bit y4m
vpxenc now accepts high bitdepth y4m files
vpxdec now produces high bitdepth y4m files
(Only if the bitstream is high bitdepth)
Can force 8bit output via --convert-to-8bit

Change-Id: Ife9fc40772aceed32ba47d7ca81024ed09563721
2014-06-03 17:46:06 +01:00
Peter de Rivaz
b7649e15c2 Improved highbitdepth temporal filter
Increasing the strength when filtering
frames for highbitdepth gives a PSNR gain.

Change-Id: Iee2f776c1ae7b8da2ca4d5e58bb11fd0be2483f4
2014-06-03 14:22:43 +01:00
Peter de Rivaz
a79f06696d Changed rounding in high bitdepth
Doing the rounding before the subtraction
gives PSNR improvements.

Change-Id: Ic6fd16a9df9b9fdc2a4fa4140e8fa0994bd261a0
2014-06-03 14:07:10 +01:00
Peter de Rivaz
2baec56312 Tidied high bitdepth variance
Refactored code to match current style on master.
Also fixed a bug where some sse results were
not being shifted for high bitdepth.
Also increased internal bitdepth for variance to
avoid saturation.
Also added rounding for variance when computing
shifted results.
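
A sketch of the shift-with-rounding part of this tidy-up; the helper name is an assumption:

#include <stdint.h>

/* sse measured on b-bit samples scales by 2^(2*(b-8)) relative to 8-bit,
 * so the 8-bit-equivalent result is shifted down with rounding; the raw
 * accumulation itself is kept wide (uint64_t) to avoid saturation. */
static uint32_t sse_to_8bit_scale(uint64_t sse, int bd) {
  const int shift = 2 * (bd - 8);
  if (shift == 0) return (uint32_t)sse;
  return (uint32_t)((sse + ((uint64_t)1 << (shift - 1))) >> shift);
}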

Change-Id: I322bbc1b9abe82c8ef72ab97991720240ddf755c
2014-06-03 12:47:17 +01:00
Deb Mukherjee
eb863b46f3 Reworks PSNR/SSIM to work on source bit-depth
Change-Id: Ifcd31186b67a57d57abd112d64d163c7b76728e9
2014-05-29 17:52:28 -07:00
Deb Mukherjee
5a2a78117f Adds input bit depth parameter and refactoring
Adds a placeholder input bit depth parameter. Also
implements the bit-depth parameter and the new parameter
directly using config parameters rather than controls,
which makes them more convenient to handle.

Change-Id: Ie5bdc2d8eb5627d7a5f520b3d241aac5395dcf3d
2014-05-27 22:34:30 -07:00
Peter de Rivaz
cab30216a5 Merge branch 'master' into highbitdepth
Merges from master (May 26) to highbitdepth.

Change-Id: I553888a7b169b48e7bea07325d1127627a8f944e
2014-05-27 09:53:35 -07:00
Peter de Rivaz
efd115c415 Merge commit '9e7b09bc' into highbitdepth
Change-Id: I0376c867e7abfa7713ac6e7a4e604c8384fff58b
2014-05-23 14:06:31 -07:00
Deb Mukherjee
edd1fa0487 Rename test-high-internal to test-16bit-internal
More intuitive parameter naming.

Change-Id: Ie99ed54f5e832aa4c3893612c396b2b78722e275
2014-05-18 05:38:26 -07:00
Deb Mukherjee
747f0e3b8e Deprecates --input-shift parameter
Deprecates the --input-shift parameter and instead derives it from
bit-depth, assuming an 8-bit input source. Eventually this needs to be
derived from the input-bit-depth parameter once we support high
bit depth input. Another parameter, --test-high-internal, is added
to force the use of 16-bit internal buffers for testing purposes in
profiles 0 and 1.

Also --bit-depth parameter now uses values 8/10/12 which is more
intuitive than 0/1/2.

Also includes some cleanups.
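
The derivation described above, as a sketch (function name illustrative):

static int derived_input_shift(int bit_depth) {
  /* assuming an 8-bit source: 0 for 8-bit, 2 for 10-bit, 4 for 12-bit */
  return bit_depth - 8;
}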

Change-Id: I0bdd6d9caae8bb339d217551bb35a001057805ec
2014-05-16 22:32:22 -07:00
Debargha Mukherjee
094f0024c3 Merge changes Ida70ca48,Ieb2945bb into highbitdepth
* changes:
  Added rounding when using --output-shift
  Changed --output-shift option
2014-05-15 14:36:17 -07:00
Debargha Mukherjee
018173cf91 Merge "Fixed temporal filter rounding" into highbitdepth 2014-05-15 14:35:50 -07:00
Peter de Rivaz
87a571a34e Fixed temporal filter rounding
The temporal filter goes wrong
when strength is equal to 0.

See Issue 787.

Change-Id: I983c5983c34359ca78743a2434fb536c3a9b3e72
2014-05-15 16:23:55 +01:00
Peter de Rivaz
e037f0fa0c Fixed --test-decode for high bitdepth
Change-Id: I02b72dafad80a22ff0546a64c4a0fe757ac66861
2014-05-15 16:13:11 +01:00
Peter de Rivaz
4cba43ac3d Added rounding when using --output-shift
PSNR values are slightly improved by using
rounding when shifting images back to 8bit.
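
A sketch of the per-sample conversion with the rounding added; the clip and the name are assumptions:

#include <stdint.h>

/* Add half a step before shifting instead of truncating (shift >= 1);
 * rounding can reach 256, so the result is clipped to 8 bits. */
static uint8_t downshift_with_rounding(uint16_t v, int shift) {
  const unsigned out = (v + (1u << (shift - 1))) >> shift;
  return (uint8_t)(out > 255 ? 255 : out);
}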

Change-Id: Ida70ca48588f933a92f906cd1ebc8e88134c07f5
2014-05-14 13:41:13 -07:00
Peter de Rivaz
aa2d8ca7e2 Changed --output-shift option
--output-shift now infers the amount to shift
from the bitdepth of the decoded file.

Option has been renamed to
--convert-to-8bit

Change-Id: Ieb2945bb282bb81c52a0c4f1b691c82bec5ec18c
2014-05-14 21:32:35 +01:00
Peter de Rivaz
c4a5ef1ced Added rounding to highbitdepth subtraction
This improves encoding performance for high
bitrate sequences when using 10 or 12bit.

Change-Id: I358d30a69251d58589c075b7d52c0d9ae76b26ee
2014-05-12 11:54:09 +01:00
Peter de Rivaz
99e1518a16 Added high bitdepth decoder scaling support
Change-Id: Id88da51d7f200ff347658140be3b5f6fe2d78121
2014-05-12 10:54:47 +01:00
Peter de Rivaz
83e566029c Changed debug function to also work with 16bit YUV
Change-Id: I14cd322d6e360dcd6499e0dc9cfdf44d9f6336e8
2014-05-12 10:52:48 +01:00
Peter de Rivaz
2535202902 Decoder can now output 16bit YUV
Change-Id: I9c390030571388fe4e9a463a3ee959364c9c7386
2014-05-12 10:49:36 +01:00
Peter de Rivaz
a9ed7c4434 Added high bitdepth support for decoder postproc
Change-Id: Id1fdf5c3c75b6b92a144f06284a7e0051345e60b
2014-05-12 10:46:44 +01:00
Peter de Rivaz
6956f43a90 Fixed bug in scale_and_extend_frame for 16bit
When configured in high bitdepth mode,
scale_and_extend_frame was always calling the
16bit code instead of dynamically switching.

Change-Id: I0542398d214a091d0740615689c786026aacedd6
2014-05-08 15:09:07 +01:00
Peter de Rivaz
9d427884e9 Fixed libyuv image copy for high bitdepth
When scaling and extending frames in high bitdepth,
there is a special case when no scaling is required.
This triggers a code path that calls CopyPlane16.
In this case the high bitdepth code was not
copying enough bytes.
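
A sketch of the corrected copy; strides here are in 16-bit elements, and the name is illustrative:

#include <stdint.h>
#include <string.h>

/* A plane of 16-bit samples needs width * 2 bytes per row; the bug copied
 * only width bytes. */
static void copy_plane_16(const uint16_t *src, int src_stride,
                          uint16_t *dst, int dst_stride,
                          int width, int height) {
  int y;
  for (y = 0; y < height; ++y) {
    memcpy(dst, src, (size_t)width * sizeof(*src));
    src += src_stride;
    dst += dst_stride;
  }
}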

Change-Id: I0fe2dc667ca8d7b0d03c0290a5716d53309c8198
2014-05-08 15:04:51 +01:00
Peter de Rivaz
ed352156c9 Added missing 8-bit image formats to vpx_image
This is needed for the high bitdepth work because
we use vpx_image to allocate an 8-bit output buffer.

Change-Id: I5404a628e9d207bf0a3a94bdb514611c836b68bd
2014-05-06 05:12:10 -07:00
Peter de Rivaz
df64b3d04a Added high bitdepth resize plane function.
Change-Id: I9b9d95fbed19f711a54a22d3181aca109a69f8f3
2014-05-06 05:12:00 -07:00
Peter de Rivaz
11f75a26c1 Use 16bit scaling function in vpxenc.
Change-Id: Ib8b25b558c0a0c1bc87863f592ea043cf65cc792
2014-05-06 05:11:51 -07:00
Peter de Rivaz
81758337d5 Added high bitdepth ssim functions.
Change-Id: Ib52d6882e1b9e58cd41d2771258a7a8c959730ec
2014-05-06 13:10:19 +01:00
Peter de Rivaz
4e016f6e21 Added call to high bitdepth sse in partition fn.
Change-Id: I647522cf1a633ec9a9db12f163c75f572081ecb7
2014-05-06 02:13:27 -07:00
Peter de Rivaz
22fbe77710 Corrected block_variance computation for high bitdepth.
Change-Id: I84ffe759b2a32ba208f915684a7a75f7f78ffa0b
2014-05-06 02:13:19 -07:00
Peter de Rivaz
d26ae35646 Fixed PSNR calculation for high bitdepth.
Change-Id: I7b60bd8b1c7a67aabf152f2f6f0c1ff5a7fbb43c
2014-05-06 02:12:48 -07:00
Peter de Rivaz
0b8ae49a05 Add control to read bit_depth
This is needed to calculate high bitdepth PSNR in vpxenc.c.

Change-Id: I60cea0e0a263e33ee1a8706517131c4b9fa1aafd
2014-05-06 02:12:21 -07:00
Peter de Rivaz
d868780eb9 Added high bitdepth support to postprocessing deblock and denoise.
Change-Id: I68d5521349dde2bc1832562cfd6f879966b8fcf1
2014-05-06 10:10:47 +01:00
Peter de Rivaz
bc0bc688c2 Added 16bit scaling functions to libyuv
This is an unoptimized C implementation of a 16bit
scaling function.

Change-Id: I4241442dde3cbf347988c555776a5cdd0189bb4d
2014-05-01 12:05:39 +01:00
Peter de Rivaz
a242def2aa Adding sad function generation macros
Tidying up the high bitdepth SAD implementation to match
the corresponding change 69783 on master.

Change-Id: I9c415a996a3ff237b2d25c57ad874284a45793fc
2014-05-01 11:49:44 +01:00
Peter de Rivaz
a0c772e381 Fix SAD adjustment to stop overrunning array
Change-Id: Ib98f5688abc80ebbaa6512f9b052b40640507744
2014-05-01 11:34:14 +01:00
Peter de Rivaz
d3e62b846b Template macros to generate subpix variance functions.
Tidying up the high bitdepth variance implementation to match
the corresponding change 69840 on master.

Change-Id: I213d28950e63cef7b9664639bc266f6a6a99c5f5
2014-05-01 10:23:12 +01:00
Peter de Rivaz
bdd7f74c3f Added initial support for 16-bit framebuffers.
Changes in this patch are only enabled if configured with
--enable-experimental --enable-vp9_high

Using an encoder command-line argument of --input-shift=0 tells the coder
to work with 16-bit framebuffers.
The output should be identical to before. Some features (such as input
image resizing) are not yet supported in 16bit mode.

Specifically, the behavior of the input-shift parameter is as follows:
* No argument : Behaviour as before, using 8bit frame buffers
* --experimental-bitstream --profile=2 --input-shift=0: Uses
  16bit frame buffers to store 8-bit data, should give identical output
  to before.
* --experimental-bitstream --profile=2 --input-shift=2 --bit-depth=1: Uses
  16bit frame buffers to store 10-bit data, encodes a version 2 stream
  with bitdepth 10
* --experimental-bitstream --profile=2 --input-shift=4 --bit-depth=2: Uses
  16bit frame buffers to store 12-bit data, encodes a version 2 stream
  with bitdepth 12

The decoder has an --output-shift argument which should be used when
decoding profile 2 streams.

So far support for the following has been added:
Intra filtering
Deblocking
Motion compensation
Variance calculation
Sad calculation
Transform

Change-Id: If345c88234aafdd40caea0d88935b1f07aaebe22
2014-04-28 22:11:19 -07:00
35 changed files with 9424 additions and 2644 deletions

test/dct16x16_test.cc

@@ -264,6 +264,8 @@ typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
typedef std::tr1::tuple<FdctFunc, IdctFunc, int, vpx_bit_depth_t> Dct16x16Param;
typedef std::tr1::tuple<FhtFunc, IhtFunc, int, vpx_bit_depth_t> Ht16x16Param;
typedef std::tr1::tuple<IdctFunc, IdctFunc, int, vpx_bit_depth_t>
Idct16x16Param;
void fdct16x16_ref(const int16_t *in, tran_low_t *out, int stride,
int /*tx_type*/) {
@@ -311,6 +313,32 @@ void iht16x16_10(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
void iht16x16_12(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
vp9_highbd_iht16x16_256_add_c(in, out, stride, tx_type, 12);
}
void idct16x16_10_add_10_c(const tran_low_t *in, uint8_t *out, int stride) {
vp9_highbd_idct16x16_10_add_c(in, out, stride, 10);
}
void idct16x16_10_add_12_c(const tran_low_t *in, uint8_t *out, int stride) {
vp9_highbd_idct16x16_10_add_c(in, out, stride, 12);
}
#if HAVE_SSE2
void idct16x16_256_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) {
vp9_highbd_idct16x16_256_add_sse2(in, out, stride, 10);
}
void idct16x16_256_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) {
vp9_highbd_idct16x16_256_add_sse2(in, out, stride, 12);
}
void idct16x16_10_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) {
vp9_highbd_idct16x16_10_add_sse2(in, out, stride, 10);
}
void idct16x16_10_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) {
vp9_highbd_idct16x16_10_add_sse2(in, out, stride, 12);
}
#endif
#endif
class Trans16x16TestBase {
@@ -540,7 +568,7 @@ class Trans16x16TestBase {
reference_16x16_dct_2d(in, out_r);
for (int j = 0; j < kNumCoeffs; ++j)
coeff[j] = round(out_r[j]);
coeff[j] = static_cast<tran_low_t>(round(out_r[j]));
if (bit_depth_ == VPX_BITS_8) {
ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, 16));
@@ -565,6 +593,62 @@ class Trans16x16TestBase {
}
}
}
void CompareInvReference(IdctFunc ref_txfm, int thresh) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
const int count_test_block = 10000;
const int eob = 10;
const int16_t *scan = vp9_default_scan_orders[TX_16X16].scan;
DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff, kNumCoeffs);
DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
DECLARE_ALIGNED_ARRAY(16, uint8_t, ref, kNumCoeffs);
#if CONFIG_VP9_HIGHBITDEPTH
DECLARE_ALIGNED_ARRAY(16, uint16_t, dst16, kNumCoeffs);
DECLARE_ALIGNED_ARRAY(16, uint16_t, ref16, kNumCoeffs);
#endif
for (int i = 0; i < count_test_block; ++i) {
for (int j = 0; j < kNumCoeffs; ++j) {
if (j < eob) {
// Random values less than the threshold, either positive or negative
coeff[scan[j]] = rnd(thresh) * (1-2*(i%2));
} else {
coeff[scan[j]] = 0;
}
if (bit_depth_ == VPX_BITS_8) {
dst[j] = 0;
ref[j] = 0;
#if CONFIG_VP9_HIGHBITDEPTH
} else {
dst16[j] = 0;
ref16[j] = 0;
#endif
}
}
if (bit_depth_ == VPX_BITS_8) {
ref_txfm(coeff, ref, pitch_);
ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch_));
} else {
#if CONFIG_VP9_HIGHBITDEPTH
ref_txfm(coeff, CONVERT_TO_BYTEPTR(ref16), pitch_);
ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16),
pitch_));
#endif
}
for (int j = 0; j < kNumCoeffs; ++j) {
#if CONFIG_VP9_HIGHBITDEPTH
const uint32_t diff =
bit_depth_ == VPX_BITS_8 ? dst[j] - ref[j] : dst16[j] - ref16[j];
#else
const uint32_t diff = dst[j] - ref[j];
#endif
const uint32_t error = diff * diff;
EXPECT_EQ(0u, error)
<< "Error: 16x16 IDCT Comparison has error " << error
<< " at index " << j;
}
}
}
int pitch_;
int tx_type_;
vpx_bit_depth_t bit_depth_;
@@ -590,10 +674,10 @@ class Trans16x16DCT
mask_ = (1 << bit_depth_) - 1;
#if CONFIG_VP9_HIGHBITDEPTH
switch (bit_depth_) {
case 10:
case VPX_BITS_10:
inv_txfm_ref = idct16x16_10_ref;
break;
case 12:
case VPX_BITS_12:
inv_txfm_ref = idct16x16_12_ref;
break;
default:
@@ -703,6 +787,37 @@ TEST_P(Trans16x16HT, QuantCheck) {
RunQuantCheck(429, 729);
}
class InvTrans16x16DCT
: public Trans16x16TestBase,
public ::testing::TestWithParam<Idct16x16Param> {
public:
virtual ~InvTrans16x16DCT() {}
virtual void SetUp() {
ref_txfm_ = GET_PARAM(0);
inv_txfm_ = GET_PARAM(1);
thresh_ = GET_PARAM(2);
bit_depth_ = GET_PARAM(3);
pitch_ = 16;
mask_ = (1 << bit_depth_) - 1;
}
virtual void TearDown() { libvpx_test::ClearSystemState(); }
protected:
void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) {}
void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) {
inv_txfm_(out, dst, stride);
}
IdctFunc ref_txfm_;
IdctFunc inv_txfm_;
int thresh_;
};
TEST_P(InvTrans16x16DCT, CompareReference) {
CompareInvReference(ref_txfm_, thresh_);
}
using std::tr1::make_tuple;
#if CONFIG_VP9_HIGHBITDEPTH
@@ -772,6 +887,51 @@ INSTANTIATE_TEST_CASE_P(
VPX_BITS_8)));
#endif
#if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
INSTANTIATE_TEST_CASE_P(
SSE2, Trans16x16DCT,
::testing::Values(
make_tuple(&vp9_highbd_fdct16x16_sse2,
&idct16x16_10, 0, VPX_BITS_10),
make_tuple(&vp9_highbd_fdct16x16_c,
&idct16x16_256_add_10_sse2, 0, VPX_BITS_10),
make_tuple(&vp9_highbd_fdct16x16_sse2,
&idct16x16_12, 0, VPX_BITS_12),
make_tuple(&vp9_highbd_fdct16x16_c,
&idct16x16_256_add_12_sse2, 0, VPX_BITS_12),
make_tuple(&vp9_fdct16x16_sse2,
&vp9_idct16x16_256_add_c, 0, VPX_BITS_8)));
INSTANTIATE_TEST_CASE_P(
SSE2, Trans16x16HT,
::testing::Values(
make_tuple(&vp9_highbd_fht16x16_sse2, &iht16x16_10, 0, VPX_BITS_10),
make_tuple(&vp9_highbd_fht16x16_sse2, &iht16x16_10, 1, VPX_BITS_10),
make_tuple(&vp9_highbd_fht16x16_sse2, &iht16x16_10, 2, VPX_BITS_10),
make_tuple(&vp9_highbd_fht16x16_sse2, &iht16x16_10, 3, VPX_BITS_10),
make_tuple(&vp9_highbd_fht16x16_sse2, &iht16x16_12, 0, VPX_BITS_12),
make_tuple(&vp9_highbd_fht16x16_sse2, &iht16x16_12, 1, VPX_BITS_12),
make_tuple(&vp9_highbd_fht16x16_sse2, &iht16x16_12, 2, VPX_BITS_12),
make_tuple(&vp9_highbd_fht16x16_sse2, &iht16x16_12, 3, VPX_BITS_12),
make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_c, 0, VPX_BITS_8),
make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_c, 1, VPX_BITS_8),
make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_c, 2, VPX_BITS_8),
make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_c, 3,
VPX_BITS_8)));
// Optimizations take effect at a threshold of 3155, so we use a value close to
// that to test both branches.
INSTANTIATE_TEST_CASE_P(
SSE2, InvTrans16x16DCT,
::testing::Values(
make_tuple(&idct16x16_10_add_10_c,
&idct16x16_10_add_10_sse2, 3167, VPX_BITS_10),
make_tuple(&idct16x16_10,
&idct16x16_256_add_10_sse2, 3167, VPX_BITS_10),
make_tuple(&idct16x16_10_add_12_c,
&idct16x16_10_add_12_sse2, 3167, VPX_BITS_12),
make_tuple(&idct16x16_12,
&idct16x16_256_add_12_sse2, 3167, VPX_BITS_12)));
#endif
#if HAVE_SSSE3 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
INSTANTIATE_TEST_CASE_P(
SSSE3, Trans16x16DCT,

test/dct32x32_test.cc

@@ -79,6 +79,10 @@ typedef std::tr1::tuple<FwdTxfmFunc, InvTxfmFunc, int, vpx_bit_depth_t>
Trans32x32Param;
#if CONFIG_VP9_HIGHBITDEPTH
void idct32x32_8(const tran_low_t *in, uint8_t *out, int stride) {
vp9_highbd_idct32x32_1024_add_c(in, out, stride, 8);
}
void idct32x32_10(const tran_low_t *in, uint8_t *out, int stride) {
vp9_highbd_idct32x32_1024_add_c(in, out, stride, 10);
}
@@ -114,7 +118,7 @@ TEST_P(Trans32x32Test, AccuracyCheck) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
uint32_t max_error = 0;
int64_t total_error = 0;
const int count_test_block = 1000;
const int count_test_block = 10000;
DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, kNumCoeffs);
DECLARE_ALIGNED_ARRAY(16, tran_low_t, test_temp_block, kNumCoeffs);
DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
@@ -127,7 +131,7 @@ TEST_P(Trans32x32Test, AccuracyCheck) {
for (int i = 0; i < count_test_block; ++i) {
// Initialize a test block with input range [-mask_, mask_].
for (int j = 0; j < kNumCoeffs; ++j) {
if (bit_depth_ == 8) {
if (bit_depth_ == VPX_BITS_8) {
src[j] = rnd.Rand8();
dst[j] = rnd.Rand8();
test_input_block[j] = src[j] - dst[j];
@@ -282,7 +286,7 @@ TEST_P(Trans32x32Test, InverseAccuracy) {
reference_32x32_dct_2d(in, out_r);
for (int j = 0; j < kNumCoeffs; ++j)
coeff[j] = round(out_r[j]);
coeff[j] = static_cast<tran_low_t>(round(out_r[j]));
if (bit_depth_ == VPX_BITS_8) {
ASM_REGISTER_STATE_CHECK(inv_txfm_(coeff, dst, 32));
#if CONFIG_VP9_HIGHBITDEPTH
@@ -353,6 +357,22 @@ INSTANTIATE_TEST_CASE_P(
&vp9_idct32x32_1024_add_sse2, 1, VPX_BITS_8)));
#endif
#if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
INSTANTIATE_TEST_CASE_P(
SSE2, Trans32x32Test,
::testing::Values(
make_tuple(&vp9_highbd_fdct32x32_sse2, &idct32x32_10, 0, VPX_BITS_10),
make_tuple(&vp9_highbd_fdct32x32_rd_sse2, &idct32x32_10, 1,
VPX_BITS_10),
make_tuple(&vp9_highbd_fdct32x32_sse2, &idct32x32_12, 0, VPX_BITS_12),
make_tuple(&vp9_highbd_fdct32x32_rd_sse2, &idct32x32_12, 1,
VPX_BITS_12),
make_tuple(&vp9_fdct32x32_sse2, &vp9_idct32x32_1024_add_c, 0,
VPX_BITS_8),
make_tuple(&vp9_fdct32x32_rd_sse2, &vp9_idct32x32_1024_add_c, 1,
VPX_BITS_8)));
#endif
#if HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
INSTANTIATE_TEST_CASE_P(
AVX2, Trans32x32Test,

test/error_block_test.cc (new file, 146 lines)

@@ -0,0 +1,146 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <cmath>
#include <cstdlib>
#include <string>
#include "third_party/googletest/src/include/gtest/gtest.h"
#include "test/acm_random.h"
#include "test/clear_system_state.h"
#include "test/register_state_check.h"
#include "test/util.h"
#include "./vpx_config.h"
#include "./vp9_rtcd.h"
#include "vp9/common/vp9_entropy.h"
#include "vpx/vpx_integer.h"
using libvpx_test::ACMRandom;
namespace {
#if CONFIG_VP9_HIGHBITDEPTH
const int number_of_iterations = 1000;
typedef int64_t (*ErrorBlockFunc)(const tran_low_t *coeff,
const tran_low_t *dqcoeff, intptr_t block_size,
int64_t *ssz, int bps);
typedef std::tr1::tuple<ErrorBlockFunc, ErrorBlockFunc, vpx_bit_depth_t>
ErrorBlockParam;
class ErrorBlockTest
: public ::testing::TestWithParam<ErrorBlockParam> {
public:
virtual ~ErrorBlockTest() {}
virtual void SetUp() {
error_block_op_ = GET_PARAM(0);
ref_error_block_op_ = GET_PARAM(1);
bit_depth_ = GET_PARAM(2);
}
virtual void TearDown() { libvpx_test::ClearSystemState(); }
protected:
vpx_bit_depth_t bit_depth_;
ErrorBlockFunc error_block_op_;
ErrorBlockFunc ref_error_block_op_;
};
TEST_P(ErrorBlockTest, OperationCheck) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff, 4096);
DECLARE_ALIGNED_ARRAY(16, tran_low_t, dqcoeff, 4096);
int err_count_total = 0;
int first_failure = -1;
intptr_t block_size;
int64_t ssz;
int64_t ret;
int64_t ref_ssz;
int64_t ref_ret;
for (int i = 0; i < number_of_iterations; ++i) {
int err_count = 0;
block_size = 16 << (i % 9); // All block sizes from 4x4, 8x4 ..64x64
for (int j = 0; j < block_size; j++) {
coeff[j] = rnd(2<<20)-(1<<20);
dqcoeff[j] = rnd(2<<20)-(1<<20);
}
ref_ret = ref_error_block_op_(coeff, dqcoeff, block_size, &ref_ssz,
bit_depth_);
ASM_REGISTER_STATE_CHECK(ret = error_block_op_(coeff, dqcoeff, block_size,
&ssz, bit_depth_));
err_count += (ref_ret != ret) | (ref_ssz != ssz);
if (err_count && !err_count_total) {
first_failure = i;
}
err_count_total += err_count;
}
EXPECT_EQ(0, err_count_total)
<< "Error: Error Block Test, C output doesn't match SSE2 output. "
<< "First failed at test case " << first_failure;
}
TEST_P(ErrorBlockTest, ExtremeValues) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff, 4096);
DECLARE_ALIGNED_ARRAY(16, tran_low_t, dqcoeff, 4096);
int err_count_total = 0;
int first_failure = -1;
intptr_t block_size;
int64_t ssz;
int64_t ret;
int64_t ref_ssz;
int64_t ref_ret;
int max_val = ((1<<20)-1);
for (int i = 0; i < number_of_iterations; ++i) {
int err_count = 0;
int k = (i / 9) % 5;
// Change the maximum coeff value, to test different bit boundaries
if ( k == 4 && (i % 9) == 0 ) {
max_val >>= 1;
}
block_size = 16 << (i % 9); // All block sizes from 4x4, 8x4 ..64x64
for (int j = 0; j < block_size; j++) {
if (k < 4) { // Test at maximum values
coeff[j] = k % 2 ? max_val : -max_val;
dqcoeff[j] = (k >> 1) % 2 ? max_val : -max_val;
} else {
coeff[j] = rnd(2 << 14) - (1 << 14);
dqcoeff[j] = rnd(2 << 14) - (1 << 14);
}
}
ref_ret = ref_error_block_op_(coeff, dqcoeff, block_size, &ref_ssz,
bit_depth_);
ASM_REGISTER_STATE_CHECK(ret = error_block_op_(coeff, dqcoeff, block_size,
&ssz, bit_depth_));
err_count += (ref_ret != ret) | (ref_ssz != ssz);
if (err_count && !err_count_total) {
first_failure = i;
}
err_count_total += err_count;
}
EXPECT_EQ(0, err_count_total)
<< "Error: Error Block Test, C output doesn't match SSE2 output. "
<< "First failed at test case " << first_failure;
}
using std::tr1::make_tuple;
#if HAVE_SSE2
INSTANTIATE_TEST_CASE_P(
SSE2_C_COMPARE, ErrorBlockTest,
::testing::Values(
make_tuple(&vp9_highbd_block_error_sse2,
&vp9_highbd_block_error_c, VPX_BITS_10),
make_tuple(&vp9_highbd_block_error_sse2,
&vp9_highbd_block_error_c, VPX_BITS_12),
make_tuple(&vp9_highbd_block_error_sse2,
&vp9_highbd_block_error_c, VPX_BITS_8)));
#endif // HAVE_SSE2
#endif // CONFIG_VP9_HIGHBITDEPTH
} // namespace

test/fdct4x4_test.cc

@@ -75,6 +75,16 @@ void iwht4x4_10(const tran_low_t *in, uint8_t *out, int stride) {
void iwht4x4_12(const tran_low_t *in, uint8_t *out, int stride) {
vp9_highbd_iwht4x4_16_add_c(in, out, stride, 12);
}
#if HAVE_SSE2
void idct4x4_10_sse2(const tran_low_t *in, uint8_t *out, int stride) {
vp9_highbd_idct4x4_16_add_sse2(in, out, stride, 10);
}
void idct4x4_12_sse2(const tran_low_t *in, uint8_t *out, int stride) {
vp9_highbd_idct4x4_16_add_sse2(in, out, stride, 12);
}
#endif
#endif
class Trans4x4TestBase {
@@ -496,4 +506,31 @@ INSTANTIATE_TEST_CASE_P(
make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 3, VPX_BITS_8)));
#endif
#if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
INSTANTIATE_TEST_CASE_P(
SSE2, Trans4x4DCT,
::testing::Values(
make_tuple(&vp9_highbd_fdct4x4_c, &idct4x4_10_sse2, 0, VPX_BITS_10),
make_tuple(&vp9_highbd_fdct4x4_sse2, &idct4x4_10_sse2, 0, VPX_BITS_10),
make_tuple(&vp9_highbd_fdct4x4_c, &idct4x4_12_sse2, 0, VPX_BITS_12),
make_tuple(&vp9_highbd_fdct4x4_sse2, &idct4x4_12_sse2, 0, VPX_BITS_12),
make_tuple(&vp9_fdct4x4_sse2, &vp9_idct4x4_16_add_c, 0,
VPX_BITS_8)));
INSTANTIATE_TEST_CASE_P(
SSE2, Trans4x4HT,
::testing::Values(
make_tuple(&vp9_highbd_fht4x4_sse2, &iht4x4_10, 0, VPX_BITS_10),
make_tuple(&vp9_highbd_fht4x4_sse2, &iht4x4_10, 1, VPX_BITS_10),
make_tuple(&vp9_highbd_fht4x4_sse2, &iht4x4_10, 2, VPX_BITS_10),
make_tuple(&vp9_highbd_fht4x4_sse2, &iht4x4_10, 3, VPX_BITS_10),
make_tuple(&vp9_highbd_fht4x4_sse2, &iht4x4_12, 0, VPX_BITS_12),
make_tuple(&vp9_highbd_fht4x4_sse2, &iht4x4_12, 1, VPX_BITS_12),
make_tuple(&vp9_highbd_fht4x4_sse2, &iht4x4_12, 2, VPX_BITS_12),
make_tuple(&vp9_highbd_fht4x4_sse2, &iht4x4_12, 3, VPX_BITS_12),
make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 0, VPX_BITS_8),
make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 1, VPX_BITS_8),
make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 2, VPX_BITS_8),
make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 3, VPX_BITS_8)));
#endif
} // namespace

test/fdct8x8_test.cc

@@ -71,6 +71,7 @@ typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
typedef std::tr1::tuple<FdctFunc, IdctFunc, int, vpx_bit_depth_t> Dct8x8Param;
typedef std::tr1::tuple<FhtFunc, IhtFunc, int, vpx_bit_depth_t> Ht8x8Param;
typedef std::tr1::tuple<IdctFunc, IdctFunc, int, vpx_bit_depth_t> Idct8x8Param;
void fdct8x8_ref(const int16_t *in, tran_low_t *out, int stride, int tx_type) {
vp9_fdct8x8_c(in, out, stride);
@@ -96,6 +97,32 @@ void iht8x8_10(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
void iht8x8_12(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
vp9_highbd_iht8x8_64_add_c(in, out, stride, tx_type, 12);
}
void idct8x8_10_add_10_c(const tran_low_t *in, uint8_t *out, int stride) {
vp9_highbd_idct8x8_10_add_c(in, out, stride, 10);
}
void idct8x8_10_add_12_c(const tran_low_t *in, uint8_t *out, int stride) {
vp9_highbd_idct8x8_10_add_c(in, out, stride, 12);
}
#if HAVE_SSE2
void idct8x8_10_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) {
vp9_highbd_idct8x8_10_add_sse2(in, out, stride, 10);
}
void idct8x8_10_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) {
vp9_highbd_idct8x8_10_add_sse2(in, out, stride, 12);
}
void idct8x8_64_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) {
vp9_highbd_idct8x8_64_add_sse2(in, out, stride, 10);
}
void idct8x8_64_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) {
vp9_highbd_idct8x8_64_add_sse2(in, out, stride, 12);
}
#endif
#endif
class FwdTrans8x8TestBase {
@@ -146,9 +173,10 @@ class FwdTrans8x8TestBase {
memset(count_sign_block, 0, sizeof(count_sign_block));
for (int i = 0; i < count_test_block; ++i) {
// Initialize a test block with input range [-15, 15].
// Initialize a test block with input range [-mask_/16, mask_/16].
for (int j = 0; j < 64; ++j)
test_input_block[j] = (rnd.Rand8() >> 4) - (rnd.Rand8() >> 4);
test_input_block[j] = ((rnd.Rand16() & mask_) >> 4) -
((rnd.Rand16() & mask_) >> 4);
ASM_REGISTER_STATE_CHECK(
RunFwdTxfm(test_input_block, test_output_block, pitch_));
@@ -188,7 +216,7 @@ class FwdTrans8x8TestBase {
#endif
for (int i = 0; i < count_test_block; ++i) {
// Initialize a test block with input range [-255, 255].
// Initialize a test block with input range [-mask_, mask_].
for (int j = 0; j < 64; ++j) {
if (bit_depth_ == VPX_BITS_8) {
src[j] = rnd.Rand8();
@@ -427,6 +455,63 @@ class FwdTrans8x8TestBase {
}
}
}
void CompareInvReference(IdctFunc ref_txfm, int thresh) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
const int count_test_block = 10000;
const int eob = 12;
DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff, kNumCoeffs);
DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
DECLARE_ALIGNED_ARRAY(16, uint8_t, ref, kNumCoeffs);
#if CONFIG_VP9_HIGHBITDEPTH
DECLARE_ALIGNED_ARRAY(16, uint16_t, dst16, kNumCoeffs);
DECLARE_ALIGNED_ARRAY(16, uint16_t, ref16, kNumCoeffs);
#endif
const int16_t *scan = vp9_default_scan_orders[TX_8X8].scan;
for (int i = 0; i < count_test_block; ++i) {
for (int j = 0; j < kNumCoeffs; ++j) {
if (j < eob) {
// Random values less than the threshold, either positive or negative
coeff[scan[j]] = rnd(thresh) * (1-2*(i%2));
} else {
coeff[scan[j]] = 0;
}
if (bit_depth_ == VPX_BITS_8) {
dst[j] = 0;
ref[j] = 0;
#if CONFIG_VP9_HIGHBITDEPTH
} else {
dst16[j] = 0;
ref16[j] = 0;
#endif
}
}
if (bit_depth_ == VPX_BITS_8) {
ref_txfm(coeff, ref, pitch_);
ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch_));
#if CONFIG_VP9_HIGHBITDEPTH
} else {
ref_txfm(coeff, CONVERT_TO_BYTEPTR(ref16), pitch_);
ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16),
pitch_));
#endif
}
for (int j = 0; j < kNumCoeffs; ++j) {
#if CONFIG_VP9_HIGHBITDEPTH
const uint32_t diff =
bit_depth_ == VPX_BITS_8 ? dst[j] - ref[j] : dst16[j] - ref16[j];
#else
const uint32_t diff = dst[j] - ref[j];
#endif
const uint32_t error = diff * diff;
EXPECT_EQ(0u, error)
<< "Error: 8x8 IDCT has error " << error
<< " at index " << j;
}
}
}
int pitch_;
int tx_type_;
FhtFunc fwd_txfm_ref;
@@ -526,6 +611,38 @@ TEST_P(FwdTrans8x8HT, ExtremalCheck) {
RunExtremalCheck();
}
class InvTrans8x8DCT
: public FwdTrans8x8TestBase,
public ::testing::TestWithParam<Idct8x8Param> {
public:
virtual ~InvTrans8x8DCT() {}
virtual void SetUp() {
ref_txfm_ = GET_PARAM(0);
inv_txfm_ = GET_PARAM(1);
thresh_ = GET_PARAM(2);
pitch_ = 8;
bit_depth_ = GET_PARAM(3);
mask_ = (1 << bit_depth_) - 1;
}
virtual void TearDown() { libvpx_test::ClearSystemState(); }
protected:
void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) {
inv_txfm_(out, dst, stride);
}
void RunFwdTxfm(int16_t *out, tran_low_t *dst, int stride) {}
IdctFunc ref_txfm_;
IdctFunc inv_txfm_;
int thresh_;
};
TEST_P(InvTrans8x8DCT, CompareReference) {
CompareInvReference(ref_txfm_, thresh_);
}
using std::tr1::make_tuple;
#if CONFIG_VP9_HIGHBITDEPTH
@@ -598,6 +715,45 @@ INSTANTIATE_TEST_CASE_P(
make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 3, VPX_BITS_8)));
#endif
#if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
INSTANTIATE_TEST_CASE_P(
SSE2, FwdTrans8x8DCT,
::testing::Values(
make_tuple(&vp9_highbd_fdct8x8_c,
&idct8x8_64_add_10_sse2, 12, VPX_BITS_10),
make_tuple(&vp9_highbd_fdct8x8_sse2,
&idct8x8_64_add_10_sse2, 12, VPX_BITS_10),
make_tuple(&vp9_highbd_fdct8x8_c,
&idct8x8_64_add_12_sse2, 12, VPX_BITS_12),
make_tuple(&vp9_highbd_fdct8x8_sse2,
&idct8x8_64_add_12_sse2, 12, VPX_BITS_12),
make_tuple(&vp9_fdct8x8_sse2, &vp9_idct8x8_64_add_c, 0, VPX_BITS_8)));
INSTANTIATE_TEST_CASE_P(
SSE2, FwdTrans8x8HT,
::testing::Values(
make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_c, 0, VPX_BITS_8),
make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_c, 1, VPX_BITS_8),
make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_c, 2, VPX_BITS_8),
make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_c, 3, VPX_BITS_8)));
// Optimizations take effect at a threshold of 6201, so we use a value close to
// that to test both branches.
INSTANTIATE_TEST_CASE_P(
SSE2, InvTrans8x8DCT,
::testing::Values(
make_tuple(&idct8x8_10_add_10_c,
&idct8x8_10_add_10_sse2, 6225, VPX_BITS_10),
make_tuple(&idct8x8_10,
&idct8x8_64_add_10_sse2, 6225, VPX_BITS_10),
make_tuple(&idct8x8_10_add_12_c,
&idct8x8_10_add_12_sse2, 6225, VPX_BITS_12),
make_tuple(&idct8x8_12,
&idct8x8_64_add_12_sse2, 6225, VPX_BITS_12)));
#endif
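The idct8x8_*_add_* names in the InvTrans8x8DCT tuples are thin wrappers that bind a fixed bit depth onto the four-argument highbitdepth functions so they match the IdctFunc signature. A minimal sketch of one such wrapper (the real definitions live earlier in this test file; the exact names here are assumptions for illustration):

// Hypothetical wrapper: fixes bd = 10 so the highbitdepth SSE2 inverse
// transform can be called through the three-argument IdctFunc pointer.
void idct8x8_10_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) {
  vp9_highbd_idct8x8_10_add_sse2(in, out, stride, 10);
}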
#if HAVE_SSSE3 && ARCH_X86_64 && !CONFIG_VP9_HIGHBITDEPTH && \
!CONFIG_EMULATE_HARDWARE
INSTANTIATE_TEST_CASE_P(


@@ -23,6 +23,8 @@
#include "vp9/common/vp9_entropy.h"
#include "vpx/vpx_integer.h"
#define MAX_LOOP_FILTER 63
using libvpx_test::ACMRandom;
namespace {
@@ -51,8 +53,9 @@ typedef void (*dual_loop_op_t)(uint8_t *s, int p, const uint8_t *blimit0,
const uint8_t *thresh1);
#endif // CONFIG_VP9_HIGHBITDEPTH
typedef std::tr1::tuple<loop_op_t, loop_op_t, int> loop8_param_t;
typedef std::tr1::tuple<dual_loop_op_t, dual_loop_op_t, int> dualloop8_param_t;
typedef std::tr1::tuple<loop_op_t, loop_op_t, vpx_bit_depth_t> loop8_param_t;
typedef std::tr1::tuple<dual_loop_op_t, dual_loop_op_t,
vpx_bit_depth_t> dualloop8_param_t;
#if HAVE_SSE2
#if CONFIG_VP9_HIGHBITDEPTH
@@ -119,7 +122,7 @@ class Loop8Test6Param : public ::testing::TestWithParam<loop8_param_t> {
virtual void TearDown() { libvpx_test::ClearSystemState(); }
protected:
int bit_depth_;
vpx_bit_depth_t bit_depth_;
int mask_;
loop_op_t loopfilter_op_;
loop_op_t ref_loopfilter_op_;
@@ -138,7 +141,7 @@ class Loop8Test9Param : public ::testing::TestWithParam<dualloop8_param_t> {
virtual void TearDown() { libvpx_test::ClearSystemState(); }
protected:
int bit_depth_;
vpx_bit_depth_t bit_depth_;
int mask_;
dual_loop_op_t loopfilter_op_;
dual_loop_op_t ref_loopfilter_op_;
@@ -148,7 +151,7 @@ TEST_P(Loop8Test6Param, OperationCheck) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
const int count_test_block = number_of_iterations;
#if CONFIG_VP9_HIGHBITDEPTH
int32_t bd = bit_depth_;
vpx_bit_depth_t bd = bit_depth_;
DECLARE_ALIGNED_ARRAY(16, uint16_t, s, kNumCoeffs);
DECLARE_ALIGNED_ARRAY(16, uint16_t, ref_s, kNumCoeffs);
#else
@@ -160,11 +163,18 @@ TEST_P(Loop8Test6Param, OperationCheck) {
for (int i = 0; i < count_test_block; ++i) {
int err_count = 0;
uint8_t tmp = rnd.Rand8();
// mblim <= 3 * MAX_LOOP_FILTER + 4
while (tmp > 3 * MAX_LOOP_FILTER + 4) {
tmp = rnd.Rand8();
}
DECLARE_ALIGNED(16, const uint8_t, blimit[16]) = {
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
};
tmp = rnd.Rand8();
while (tmp > MAX_LOOP_FILTER) { // lim <= MAX_LOOP_FILTER
tmp = rnd.Rand8();
}
DECLARE_ALIGNED(16, const uint8_t, limit[16]) = {
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
@@ -211,7 +221,7 @@ TEST_P(Loop8Test6Param, OperationCheck) {
ASM_REGISTER_STATE_CHECK(
loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh, count, bd));
#else
ref_loopfilter_op_(ref_s+8+p*8, p, blimit, limit, thresh, count);
ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit, limit, thresh, count);
ASM_REGISTER_STATE_CHECK(
loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh, count));
#endif // CONFIG_VP9_HIGHBITDEPTH
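The draw-and-reject clamping above repeats for every blimit/limit/thresh operand in these tests. A helper capturing the pattern might look like the sketch below (hypothetical; the tests keep the pattern inline because the operand arrays are const locals):

// Hypothetical helper: rejection-samples a byte no larger than max_val and
// replicates it across a 16-byte SIMD operand.
static void fill_clamped(ACMRandom *rnd, uint8_t max_val, uint8_t vec[16]) {
  uint8_t tmp = rnd->Rand8();
  while (tmp > max_val) tmp = rnd->Rand8();
  memset(vec, tmp, 16);  // assumes <string.h>/<cstring> is available
}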
@@ -234,7 +244,7 @@ TEST_P(Loop8Test6Param, ValueCheck) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
const int count_test_block = number_of_iterations;
#if CONFIG_VP9_HIGHBITDEPTH
const int32_t bd = bit_depth_;
vpx_bit_depth_t bd = bit_depth_;
DECLARE_ALIGNED_ARRAY(16, uint16_t, s, kNumCoeffs);
DECLARE_ALIGNED_ARRAY(16, uint16_t, ref_s, kNumCoeffs);
#else
@@ -246,11 +256,17 @@ TEST_P(Loop8Test6Param, ValueCheck) {
for (int i = 0; i < count_test_block; ++i) {
int err_count = 0;
uint8_t tmp = rnd.Rand8();
while (tmp > 3 * MAX_LOOP_FILTER + 4) { // mblim <= 3 * MAX_LOOP_FILTER + 4
tmp = rnd.Rand8();
}
DECLARE_ALIGNED(16, const uint8_t, blimit[16]) = {
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
};
tmp = rnd.Rand8();
while (tmp > MAX_LOOP_FILTER) { // lim <= MAX_LOOP_FILTER
tmp = rnd.Rand8();
}
DECLARE_ALIGNED(16, const uint8_t, limit[16]) = {
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
@@ -271,7 +287,7 @@ TEST_P(Loop8Test6Param, ValueCheck) {
ASM_REGISTER_STATE_CHECK(
loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh, count, bd));
#else
ref_loopfilter_op_(ref_s+8+p*8, p, blimit, limit, thresh, count);
ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit, limit, thresh, count);
ASM_REGISTER_STATE_CHECK(
loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh, count));
#endif // CONFIG_VP9_HIGHBITDEPTH
@@ -293,7 +309,7 @@ TEST_P(Loop8Test9Param, OperationCheck) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
const int count_test_block = number_of_iterations;
#if CONFIG_VP9_HIGHBITDEPTH
const int32_t bd = bit_depth_;
vpx_bit_depth_t bd = bit_depth_;
DECLARE_ALIGNED_ARRAY(16, uint16_t, s, kNumCoeffs);
DECLARE_ALIGNED_ARRAY(16, uint16_t, ref_s, kNumCoeffs);
#else
@@ -305,11 +321,19 @@ TEST_P(Loop8Test9Param, OperationCheck) {
for (int i = 0; i < count_test_block; ++i) {
int err_count = 0;
uint8_t tmp = rnd.Rand8();
// mblim <= 3 * MAX_LOOP_FILTER + 4
while (tmp > 3 * MAX_LOOP_FILTER + 4) {
tmp = rnd.Rand8();
}
DECLARE_ALIGNED(16, const uint8_t, blimit0[16]) = {
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
};
tmp = rnd.Rand8();
// lim <= MAX_LOOP_FILTER
while (tmp > MAX_LOOP_FILTER) {
tmp = rnd.Rand8();
}
DECLARE_ALIGNED(16, const uint8_t, limit0[16]) = {
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
@@ -320,11 +344,18 @@ TEST_P(Loop8Test9Param, OperationCheck) {
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
};
tmp = rnd.Rand8();
// mblim <= 3 * MAX_LOOP_FILTER + 4
while (tmp > 3 * MAX_LOOP_FILTER + 4) {
tmp = rnd.Rand8();
}
DECLARE_ALIGNED(16, const uint8_t, blimit1[16]) = {
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
};
tmp = rnd.Rand8();
while (tmp > MAX_LOOP_FILTER) { // lim <= MAX_LOOP_FILTER
tmp = rnd.Rand8();
}
DECLARE_ALIGNED(16, const uint8_t, limit1[16]) = {
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
@@ -407,11 +438,18 @@ TEST_P(Loop8Test9Param, ValueCheck) {
for (int i = 0; i < count_test_block; ++i) {
int err_count = 0;
uint8_t tmp = rnd.Rand8();
// mblim <= 3 * MAX_LOOP_FILTER + 4
while (tmp > 3 * MAX_LOOP_FILTER + 4) {
tmp = rnd.Rand8();
}
DECLARE_ALIGNED(16, const uint8_t, blimit0[16]) = {
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
};
tmp = rnd.Rand8();
while (tmp > MAX_LOOP_FILTER) { // lim <= MAX_LOOP_FILTER
tmp = rnd.Rand8();
}
DECLARE_ALIGNED(16, const uint8_t, limit0[16]) = {
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
@@ -422,11 +460,17 @@ TEST_P(Loop8Test9Param, ValueCheck) {
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
};
tmp = rnd.Rand8();
while (tmp > 3 * MAX_LOOP_FILTER + 4) { // mblim <= 3 * MAX_LOOP_FILTER + 4
tmp = rnd.Rand8();
}
DECLARE_ALIGNED(16, const uint8_t, blimit1[16]) = {
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
};
tmp = rnd.Rand8();
while (tmp > MAX_LOOP_FILTER) { // lim <= MAX_LOOP_FILTER
tmp = rnd.Rand8();
}
DECLARE_ALIGNED(16, const uint8_t, limit1[16]) = {
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
@@ -442,7 +486,7 @@ TEST_P(Loop8Test9Param, ValueCheck) {
ref_s[j] = s[j];
}
#if CONFIG_VP9_HIGHBITDEPTH
const int32_t bd = bit_depth_;
vpx_bit_depth_t bd = bit_depth_;
ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit0, limit0, thresh0,
blimit1, limit1, thresh1, bd);
ASM_REGISTER_STATE_CHECK(
@@ -477,48 +521,51 @@ INSTANTIATE_TEST_CASE_P(
SSE2_C_COMPARE_SINGLE, Loop8Test6Param,
::testing::Values(
make_tuple(&vp9_highbd_lpf_horizontal_4_sse2,
&vp9_highbd_lpf_horizontal_4_c, 8),
&vp9_highbd_lpf_horizontal_4_c, VPX_BITS_8),
make_tuple(&vp9_highbd_lpf_vertical_4_sse2,
&vp9_highbd_lpf_vertical_4_c, 8),
&vp9_highbd_lpf_vertical_4_c, VPX_BITS_8),
make_tuple(&vp9_highbd_lpf_horizontal_8_sse2,
&vp9_highbd_lpf_horizontal_8_c, 8),
&vp9_highbd_lpf_horizontal_8_c, VPX_BITS_8),
make_tuple(&vp9_highbd_lpf_horizontal_16_sse2,
&vp9_highbd_lpf_horizontal_16_c, 8),
&vp9_highbd_lpf_horizontal_16_c, VPX_BITS_8),
make_tuple(&vp9_highbd_lpf_vertical_8_sse2,
&vp9_highbd_lpf_vertical_8_c, 8),
&vp9_highbd_lpf_vertical_8_c, VPX_BITS_8),
make_tuple(&wrapper_vertical_16_sse2,
&wrapper_vertical_16_c, 8),
&wrapper_vertical_16_c, VPX_BITS_8),
make_tuple(&vp9_highbd_lpf_horizontal_4_sse2,
&vp9_highbd_lpf_horizontal_4_c, 10),
&vp9_highbd_lpf_horizontal_4_c, VPX_BITS_10),
make_tuple(&vp9_highbd_lpf_vertical_4_sse2,
&vp9_highbd_lpf_vertical_4_c, 10),
&vp9_highbd_lpf_vertical_4_c, VPX_BITS_10),
make_tuple(&vp9_highbd_lpf_horizontal_8_sse2,
&vp9_highbd_lpf_horizontal_8_c, 10),
&vp9_highbd_lpf_horizontal_8_c, VPX_BITS_10),
make_tuple(&vp9_highbd_lpf_horizontal_16_sse2,
&vp9_highbd_lpf_horizontal_16_c, 10),
&vp9_highbd_lpf_horizontal_16_c, VPX_BITS_10),
make_tuple(&vp9_highbd_lpf_vertical_8_sse2,
&vp9_highbd_lpf_vertical_8_c, 10),
&vp9_highbd_lpf_vertical_8_c, VPX_BITS_10),
make_tuple(&wrapper_vertical_16_sse2,
&wrapper_vertical_16_c, 10),
&wrapper_vertical_16_c, VPX_BITS_10),
make_tuple(&vp9_highbd_lpf_horizontal_4_sse2,
&vp9_highbd_lpf_horizontal_4_c, 12),
&vp9_highbd_lpf_horizontal_4_c, VPX_BITS_12),
make_tuple(&vp9_highbd_lpf_vertical_4_sse2,
&vp9_highbd_lpf_vertical_4_c, 12),
&vp9_highbd_lpf_vertical_4_c, VPX_BITS_12),
make_tuple(&vp9_highbd_lpf_horizontal_8_sse2,
&vp9_highbd_lpf_horizontal_8_c, 12),
&vp9_highbd_lpf_horizontal_8_c, VPX_BITS_12),
make_tuple(&vp9_highbd_lpf_horizontal_16_sse2,
&vp9_highbd_lpf_horizontal_16_c, 12),
&vp9_highbd_lpf_horizontal_16_c, VPX_BITS_12),
make_tuple(&vp9_highbd_lpf_vertical_8_sse2,
&vp9_highbd_lpf_vertical_8_c, 12),
&vp9_highbd_lpf_vertical_8_c, VPX_BITS_12),
make_tuple(&wrapper_vertical_16_sse2,
&wrapper_vertical_16_c, 12)));
&wrapper_vertical_16_c, VPX_BITS_12)));
#else
INSTANTIATE_TEST_CASE_P(
SSE2_C_COMPARE_SINGLE, Loop8Test6Param,
::testing::Values(
make_tuple(&vp9_lpf_horizontal_8_sse2, &vp9_lpf_horizontal_8_c, 8),
make_tuple(&vp9_lpf_horizontal_16_sse2, &vp9_lpf_horizontal_16_c, 8),
make_tuple(&vp9_lpf_vertical_8_sse2, &vp9_lpf_vertical_8_c, 8)));
make_tuple(&vp9_lpf_horizontal_8_sse2, &vp9_lpf_horizontal_8_c,
VPX_BITS_8),
make_tuple(&vp9_lpf_horizontal_16_sse2, &vp9_lpf_horizontal_16_c,
VPX_BITS_8),
make_tuple(&vp9_lpf_vertical_8_sse2, &vp9_lpf_vertical_8_c,
VPX_BITS_8)));
#endif // CONFIG_VP9_HIGHBITDEPTH
#endif
@@ -528,60 +575,61 @@ INSTANTIATE_TEST_CASE_P(
SSE2_C_COMPARE_DUAL, Loop8Test6Param,
::testing::Values(
make_tuple(&wrapper_vertical_16_dual_sse2,
&wrapper_vertical_16_dual_c, 8),
&wrapper_vertical_16_dual_c, VPX_BITS_8),
make_tuple(&wrapper_vertical_16_dual_sse2,
&wrapper_vertical_16_dual_c, 10),
&wrapper_vertical_16_dual_c, VPX_BITS_10),
make_tuple(&wrapper_vertical_16_dual_sse2,
&wrapper_vertical_16_dual_c, 12)));
&wrapper_vertical_16_dual_c, VPX_BITS_12)));
#else
INSTANTIATE_TEST_CASE_P(
SSE2_C_COMPARE_DUAL, Loop8Test6Param,
::testing::Values(
make_tuple(&wrapper_vertical_16_sse2, &wrapper_vertical_16_c, 8)));
make_tuple(&wrapper_vertical_16_sse2, &wrapper_vertical_16_c,
VPX_BITS_8)));
#endif // CONFIG_VP9_HIGHBITDEPTH
#endif // HAVE_SSE2
#if HAVE_SSE2
#if CONFIG_VP9_HIGHBITDEPTH
INSTANTIATE_TEST_CASE_P(
SSE_C_COMPARE_DUAL, Loop8Test9Param,
SSE2_C_COMPARE_DUAL, Loop8Test9Param,
::testing::Values(
make_tuple(&vp9_highbd_lpf_horizontal_4_dual_sse2,
&vp9_highbd_lpf_horizontal_4_dual_c, 8),
&vp9_highbd_lpf_horizontal_4_dual_c, VPX_BITS_8),
make_tuple(&vp9_highbd_lpf_horizontal_8_dual_sse2,
&vp9_highbd_lpf_horizontal_8_dual_c, 8),
&vp9_highbd_lpf_horizontal_8_dual_c, VPX_BITS_8),
make_tuple(&vp9_highbd_lpf_vertical_4_dual_sse2,
&vp9_highbd_lpf_vertical_4_dual_c, 8),
&vp9_highbd_lpf_vertical_4_dual_c, VPX_BITS_8),
make_tuple(&vp9_highbd_lpf_vertical_8_dual_sse2,
&vp9_highbd_lpf_vertical_8_dual_c, 8),
&vp9_highbd_lpf_vertical_8_dual_c, VPX_BITS_8),
make_tuple(&vp9_highbd_lpf_horizontal_4_dual_sse2,
&vp9_highbd_lpf_horizontal_4_dual_c, 10),
&vp9_highbd_lpf_horizontal_4_dual_c, VPX_BITS_10),
make_tuple(&vp9_highbd_lpf_horizontal_8_dual_sse2,
&vp9_highbd_lpf_horizontal_8_dual_c, 10),
&vp9_highbd_lpf_horizontal_8_dual_c, VPX_BITS_10),
make_tuple(&vp9_highbd_lpf_vertical_4_dual_sse2,
&vp9_highbd_lpf_vertical_4_dual_c, 10),
&vp9_highbd_lpf_vertical_4_dual_c, VPX_BITS_10),
make_tuple(&vp9_highbd_lpf_vertical_8_dual_sse2,
&vp9_highbd_lpf_vertical_8_dual_c, 10),
&vp9_highbd_lpf_vertical_8_dual_c, VPX_BITS_10),
make_tuple(&vp9_highbd_lpf_horizontal_4_dual_sse2,
&vp9_highbd_lpf_horizontal_4_dual_c, 12),
&vp9_highbd_lpf_horizontal_4_dual_c, VPX_BITS_12),
make_tuple(&vp9_highbd_lpf_horizontal_8_dual_sse2,
&vp9_highbd_lpf_horizontal_8_dual_c, 12),
&vp9_highbd_lpf_horizontal_8_dual_c, VPX_BITS_12),
make_tuple(&vp9_highbd_lpf_vertical_4_dual_sse2,
&vp9_highbd_lpf_vertical_4_dual_c, 12),
&vp9_highbd_lpf_vertical_4_dual_c, VPX_BITS_12),
make_tuple(&vp9_highbd_lpf_vertical_8_dual_sse2,
&vp9_highbd_lpf_vertical_8_dual_c, 12)));
&vp9_highbd_lpf_vertical_8_dual_c, VPX_BITS_12)));
#else
INSTANTIATE_TEST_CASE_P(
SSE_C_COMPARE_DUAL, Loop8Test9Param,
SSE2_C_COMPARE_DUAL, Loop8Test9Param,
::testing::Values(
make_tuple(&vp9_lpf_horizontal_4_dual_sse2,
&vp9_lpf_horizontal_4_dual_c, 8),
&vp9_lpf_horizontal_4_dual_c, VPX_BITS_8),
make_tuple(&vp9_lpf_horizontal_8_dual_sse2,
&vp9_lpf_horizontal_8_dual_c, 8),
&vp9_lpf_horizontal_8_dual_c, VPX_BITS_8),
make_tuple(&vp9_lpf_vertical_4_dual_sse2,
&vp9_lpf_vertical_4_dual_c, 8),
&vp9_lpf_vertical_4_dual_c, VPX_BITS_8),
make_tuple(&vp9_lpf_vertical_8_dual_sse2,
&vp9_lpf_vertical_8_dual_c, 8)));
&vp9_lpf_vertical_8_dual_c, VPX_BITS_8)));
#endif // CONFIG_VP9_HIGHBITDEPTH
#endif
} // namespace

test/quantize_test.cc (new file)

@@ -0,0 +1,353 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <math.h>
#include <stdlib.h>
#include <string.h>
#include "third_party/googletest/src/include/gtest/gtest.h"
#include "test/acm_random.h"
#include "test/clear_system_state.h"
#include "test/register_state_check.h"
#include "test/util.h"
#include "./vpx_config.h"
#include "./vp9_rtcd.h"
#include "vp9/common/vp9_entropy.h"
#include "vpx/vpx_integer.h"
using libvpx_test::ACMRandom;
namespace {
#if CONFIG_VP9_HIGHBITDEPTH
const int number_of_iterations = 100;
typedef void (*QuantizeFunc)(const tran_low_t *coeff, intptr_t count,
int skip_block, const int16_t *zbin,
const int16_t *round, const int16_t *quant,
const int16_t *quant_shift,
tran_low_t *qcoeff, tran_low_t *dqcoeff,
const int16_t *dequant, int zbin_oq_value,
uint16_t *eob, const int16_t *scan,
const int16_t *iscan);
typedef std::tr1::tuple<QuantizeFunc, QuantizeFunc, vpx_bit_depth_t>
QuantizeParam;
class QuantizeTest : public ::testing::TestWithParam<QuantizeParam> {
public:
virtual ~QuantizeTest() {}
virtual void SetUp() {
quantize_op_ = GET_PARAM(0);
ref_quantize_op_ = GET_PARAM(1);
bit_depth_ = GET_PARAM(2);
mask_ = (1 << bit_depth_) - 1;
}
virtual void TearDown() { libvpx_test::ClearSystemState(); }
protected:
vpx_bit_depth_t bit_depth_;
int mask_;
QuantizeFunc quantize_op_;
QuantizeFunc ref_quantize_op_;
};
class Quantize32Test : public ::testing::TestWithParam<QuantizeParam> {
public:
virtual ~Quantize32Test() {}
virtual void SetUp() {
quantize_op_ = GET_PARAM(0);
ref_quantize_op_ = GET_PARAM(1);
bit_depth_ = GET_PARAM(2);
mask_ = (1 << bit_depth_) - 1;
}
virtual void TearDown() { libvpx_test::ClearSystemState(); }
protected:
vpx_bit_depth_t bit_depth_;
int mask_;
QuantizeFunc quantize_op_;
QuantizeFunc ref_quantize_op_;
};
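In both fixtures, mask_ = (1 << bit_depth_) - 1 confines the random coefficients to the nominal range of the configured depth, relying on VPX_BITS_8/10/12 having the numeric values 8/10/12. For illustration only (not part of the test):

// Masks produced by SetUp() for each supported depth.
// VPX_BITS_8  -> (1 << 8)  - 1 = 0x0FF
// VPX_BITS_10 -> (1 << 10) - 1 = 0x3FF
// VPX_BITS_12 -> (1 << 12) - 1 = 0xFFF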
TEST_P(QuantizeTest, OperationCheck) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
int zbin_oq_value = 0;
DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff_ptr, 256);
DECLARE_ALIGNED_ARRAY(16, int16_t, zbin_ptr, 2);
DECLARE_ALIGNED_ARRAY(16, int16_t, round_ptr, 2);
DECLARE_ALIGNED_ARRAY(16, int16_t, quant_ptr, 2);
DECLARE_ALIGNED_ARRAY(16, int16_t, quant_shift_ptr, 2);
DECLARE_ALIGNED_ARRAY(16, tran_low_t, qcoeff_ptr, 256);
DECLARE_ALIGNED_ARRAY(16, tran_low_t, dqcoeff_ptr, 256);
DECLARE_ALIGNED_ARRAY(16, tran_low_t, ref_qcoeff_ptr, 256);
DECLARE_ALIGNED_ARRAY(16, tran_low_t, ref_dqcoeff_ptr, 256);
DECLARE_ALIGNED_ARRAY(16, int16_t, dequant_ptr, 2);
DECLARE_ALIGNED_ARRAY(16, uint16_t, eob_ptr, 1);
DECLARE_ALIGNED_ARRAY(16, uint16_t, ref_eob_ptr, 1);
int err_count_total = 0;
int first_failure = -1;
for (int i = 0; i < number_of_iterations; ++i) {
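// The first iteration (i == 0) exercises the skip_block path of both
// implementations.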
int skip_block = i == 0;
TX_SIZE sz = (TX_SIZE)(i % 3);  // TX_4X4, TX_8X8, TX_16X16
TX_TYPE tx_type = (TX_TYPE)((i >> 2) % 3);
const scan_order *scan_order = &vp9_scan_orders[sz][tx_type];
int count = (4 << sz) * (4 << sz); // 16, 64, 256
int err_count = 0;
*eob_ptr = rnd.Rand16();
*ref_eob_ptr = *eob_ptr;
for (int j = 0; j < count; j++) {
coeff_ptr[j] = rnd.Rand16() & mask_;
}
for (int j = 0; j < 2; j++) {
zbin_ptr[j] = rnd.Rand16() & mask_;
round_ptr[j] = rnd.Rand16();
quant_ptr[j] = rnd.Rand16();
quant_shift_ptr[j] = rnd.Rand16();
dequant_ptr[j] = rnd.Rand16();
}
ref_quantize_op_(coeff_ptr, count, skip_block, zbin_ptr, round_ptr,
quant_ptr, quant_shift_ptr, ref_qcoeff_ptr,
ref_dqcoeff_ptr, dequant_ptr, zbin_oq_value,
ref_eob_ptr, scan_order->scan, scan_order->iscan);
ASM_REGISTER_STATE_CHECK(quantize_op_(coeff_ptr, count, skip_block,
zbin_ptr, round_ptr, quant_ptr,
quant_shift_ptr, qcoeff_ptr,
dqcoeff_ptr, dequant_ptr,
zbin_oq_value, eob_ptr,
scan_order->scan, scan_order->iscan));
for (int j = 0; j < count; ++j) {
err_count += (ref_qcoeff_ptr[j] != qcoeff_ptr[j]) |
(ref_dqcoeff_ptr[j] != dqcoeff_ptr[j]);
}
err_count += (*ref_eob_ptr != *eob_ptr);
if (err_count && !err_count_total) {
first_failure = i;
}
err_count_total += err_count;
}
EXPECT_EQ(0, err_count_total)
<< "Error: Quantization Test, C output doesn't match SSE2 output. "
<< "First failed at test case " << first_failure;
}
TEST_P(Quantize32Test, OperationCheck) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
int zbin_oq_value = 0;
DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff_ptr, 1024);
DECLARE_ALIGNED_ARRAY(16, int16_t, zbin_ptr, 2);
DECLARE_ALIGNED_ARRAY(16, int16_t, round_ptr, 2);
DECLARE_ALIGNED_ARRAY(16, int16_t, quant_ptr, 2);
DECLARE_ALIGNED_ARRAY(16, int16_t, quant_shift_ptr, 2);
DECLARE_ALIGNED_ARRAY(16, tran_low_t, qcoeff_ptr, 1024);
DECLARE_ALIGNED_ARRAY(16, tran_low_t, dqcoeff_ptr, 1024);
DECLARE_ALIGNED_ARRAY(16, tran_low_t, ref_qcoeff_ptr, 1024);
DECLARE_ALIGNED_ARRAY(16, tran_low_t, ref_dqcoeff_ptr, 1024);
DECLARE_ALIGNED_ARRAY(16, int16_t, dequant_ptr, 2);
DECLARE_ALIGNED_ARRAY(16, uint16_t, eob_ptr, 1);
DECLARE_ALIGNED_ARRAY(16, uint16_t, ref_eob_ptr, 1);
int err_count_total = 0;
int first_failure = -1;
for (int i = 0; i < number_of_iterations; ++i) {
int skip_block = i == 0;
TX_SIZE sz = TX_32X32;
TX_TYPE tx_type = (TX_TYPE)(i % 4);
const scan_order *scan_order = &vp9_scan_orders[sz][tx_type];
int count = (4 << sz) * (4 << sz); // 1024
int err_count = 0;
*eob_ptr = rnd.Rand16();
*ref_eob_ptr = *eob_ptr;
for (int j = 0; j < count; j++) {
coeff_ptr[j] = rnd.Rand16() & mask_;
}
for (int j = 0; j < 2; j++) {
zbin_ptr[j] = rnd.Rand16() & mask_;
round_ptr[j] = rnd.Rand16();
quant_ptr[j] = rnd.Rand16();
quant_shift_ptr[j] = rnd.Rand16();
dequant_ptr[j] = rnd.Rand16();
}
ref_quantize_op_(coeff_ptr, count, skip_block, zbin_ptr, round_ptr,
quant_ptr, quant_shift_ptr, ref_qcoeff_ptr,
ref_dqcoeff_ptr, dequant_ptr, zbin_oq_value,
ref_eob_ptr, scan_order->scan, scan_order->iscan);
ASM_REGISTER_STATE_CHECK(quantize_op_(coeff_ptr, count, skip_block,
zbin_ptr, round_ptr, quant_ptr,
quant_shift_ptr, qcoeff_ptr,
dqcoeff_ptr, dequant_ptr,
zbin_oq_value, eob_ptr,
scan_order->scan, scan_order->iscan));
for (int j = 0; j < count; ++j) {
err_count += (ref_qcoeff_ptr[j] != qcoeff_ptr[j]) |
(ref_dqcoeff_ptr[j] != dqcoeff_ptr[j]);
}
err_count += (*ref_eob_ptr != *eob_ptr);
if (err_count && !err_count_total) {
first_failure = i;
}
err_count_total += err_count;
}
EXPECT_EQ(0, err_count_total)
<< "Error: Quantization Test, C output doesn't match SSE2 output. "
<< "First failed at test case " << first_failure;
}
TEST_P(QuantizeTest, EOBCheck) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
int zbin_oq_value = 0;
DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff_ptr, 256);
DECLARE_ALIGNED_ARRAY(16, int16_t, zbin_ptr, 2);
DECLARE_ALIGNED_ARRAY(16, int16_t, round_ptr, 2);
DECLARE_ALIGNED_ARRAY(16, int16_t, quant_ptr, 2);
DECLARE_ALIGNED_ARRAY(16, int16_t, quant_shift_ptr, 2);
DECLARE_ALIGNED_ARRAY(16, tran_low_t, qcoeff_ptr, 256);
DECLARE_ALIGNED_ARRAY(16, tran_low_t, dqcoeff_ptr, 256);
DECLARE_ALIGNED_ARRAY(16, tran_low_t, ref_qcoeff_ptr, 256);
DECLARE_ALIGNED_ARRAY(16, tran_low_t, ref_dqcoeff_ptr, 256);
DECLARE_ALIGNED_ARRAY(16, int16_t, dequant_ptr, 2);
DECLARE_ALIGNED_ARRAY(16, uint16_t, eob_ptr, 1);
DECLARE_ALIGNED_ARRAY(16, uint16_t, ref_eob_ptr, 1);
int err_count_total = 0;
int first_failure = -1;
for (int i = 0; i < number_of_iterations; ++i) {
int skip_block = i == 0;
TX_SIZE sz = (TX_SIZE)(i % 3);  // TX_4X4, TX_8X8, TX_16X16
TX_TYPE tx_type = (TX_TYPE)((i >> 2) % 3);
const scan_order *scan_order = &vp9_scan_orders[sz][tx_type];
int count = (4 << sz) * (4 << sz); // 16, 64, 256
int err_count = 0;
*eob_ptr = rnd.Rand16();
*ref_eob_ptr = *eob_ptr;
for (int j = 0; j < count; j++) {
coeff_ptr[j] = 0;
}
// Two random entries
coeff_ptr[rnd(count)] = rnd.Rand16() & mask_;
coeff_ptr[rnd(count)] = rnd.Rand16() & mask_;
for (int j = 0; j < 2; j++) {
zbin_ptr[j] = rnd.Rand16() & mask_;
round_ptr[j] = rnd.Rand16();
quant_ptr[j] = rnd.Rand16();
quant_shift_ptr[j] = rnd.Rand16();
dequant_ptr[j] = rnd.Rand16();
}
ref_quantize_op_(coeff_ptr, count, skip_block, zbin_ptr, round_ptr,
quant_ptr, quant_shift_ptr, ref_qcoeff_ptr,
ref_dqcoeff_ptr, dequant_ptr, zbin_oq_value,
ref_eob_ptr, scan_order->scan, scan_order->iscan);
ASM_REGISTER_STATE_CHECK(quantize_op_(coeff_ptr, count, skip_block,
zbin_ptr, round_ptr, quant_ptr,
quant_shift_ptr, qcoeff_ptr,
dqcoeff_ptr, dequant_ptr,
zbin_oq_value, eob_ptr,
scan_order->scan, scan_order->iscan));
for (int j = 0; j < count; ++j) {
err_count += (ref_qcoeff_ptr[j] != qcoeff_ptr[j]) |
(ref_dqcoeff_ptr[j] != dqcoeff_ptr[j]);
}
err_count += (*ref_eob_ptr != *eob_ptr);
if (err_count && !err_count_total) {
first_failure = i;
}
err_count_total += err_count;
}
EXPECT_EQ(0, err_count_total)
<< "Error: Quantization Test, C output doesn't match SSE2 output. "
<< "First failed at test case " << first_failure;
}
TEST_P(Quantize32Test, EOBCheck) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
int zbin_oq_value = 0;
DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff_ptr, 1024);
DECLARE_ALIGNED_ARRAY(16, int16_t, zbin_ptr, 2);
DECLARE_ALIGNED_ARRAY(16, int16_t, round_ptr, 2);
DECLARE_ALIGNED_ARRAY(16, int16_t, quant_ptr, 2);
DECLARE_ALIGNED_ARRAY(16, int16_t, quant_shift_ptr, 2);
DECLARE_ALIGNED_ARRAY(16, tran_low_t, qcoeff_ptr, 1024);
DECLARE_ALIGNED_ARRAY(16, tran_low_t, dqcoeff_ptr, 1024);
DECLARE_ALIGNED_ARRAY(16, tran_low_t, ref_qcoeff_ptr, 1024);
DECLARE_ALIGNED_ARRAY(16, tran_low_t, ref_dqcoeff_ptr, 1024);
DECLARE_ALIGNED_ARRAY(16, int16_t, dequant_ptr, 2);
DECLARE_ALIGNED_ARRAY(16, uint16_t, eob_ptr, 1);
DECLARE_ALIGNED_ARRAY(16, uint16_t, ref_eob_ptr, 1);
int err_count_total = 0;
int first_failure = -1;
for (int i = 0; i < number_of_iterations; ++i) {
int skip_block = i == 0;
TX_SIZE sz = TX_32X32;
TX_TYPE tx_type = (TX_TYPE)(i % 4);
const scan_order *scan_order = &vp9_scan_orders[sz][tx_type];
int count = (4 << sz) * (4 << sz); // 1024
int err_count = 0;
*eob_ptr = rnd.Rand16();
*ref_eob_ptr = *eob_ptr;
for (int j = 0; j < count; j++) {
coeff_ptr[j] = 0;
}
// Two random entries
coeff_ptr[rnd(count)] = rnd.Rand16() & mask_;
coeff_ptr[rnd(count)] = rnd.Rand16() & mask_;
for (int j = 0; j < 2; j++) {
zbin_ptr[j] = rnd.Rand16() & mask_;
round_ptr[j] = rnd.Rand16();
quant_ptr[j] = rnd.Rand16();
quant_shift_ptr[j] = rnd.Rand16();
dequant_ptr[j] = rnd.Rand16();
}
ref_quantize_op_(coeff_ptr, count, skip_block, zbin_ptr, round_ptr,
quant_ptr, quant_shift_ptr, ref_qcoeff_ptr,
ref_dqcoeff_ptr, dequant_ptr, zbin_oq_value,
ref_eob_ptr, scan_order->scan, scan_order->iscan);
ASM_REGISTER_STATE_CHECK(quantize_op_(coeff_ptr, count, skip_block,
zbin_ptr, round_ptr, quant_ptr,
quant_shift_ptr, qcoeff_ptr,
dqcoeff_ptr, dequant_ptr,
zbin_oq_value, eob_ptr,
scan_order->scan, scan_order->iscan));
for (int j = 0; j < count; ++j) {
err_count += (ref_qcoeff_ptr[j] != qcoeff_ptr[j]) |
(ref_dqcoeff_ptr[j] != dqcoeff_ptr[j]);
}
err_count += (*ref_eob_ptr != *eob_ptr);
if (err_count && !err_count_total) {
first_failure = i;
}
err_count_total += err_count;
}
EXPECT_EQ(0, err_count_total)
<< "Error: Quantization Test, C output doesn't match SSE2 output. "
<< "First failed at test case " << first_failure;
}
using std::tr1::make_tuple;
#if HAVE_SSE2
INSTANTIATE_TEST_CASE_P(
SSE2_C_COMPARE, QuantizeTest,
::testing::Values(
make_tuple(&vp9_highbd_quantize_b_sse2,
&vp9_highbd_quantize_b_c, VPX_BITS_8),
make_tuple(&vp9_highbd_quantize_b_sse2,
&vp9_highbd_quantize_b_c, VPX_BITS_10),
make_tuple(&vp9_highbd_quantize_b_sse2,
&vp9_highbd_quantize_b_c, VPX_BITS_12)));
INSTANTIATE_TEST_CASE_P(
SSE2_C_COMPARE, Quantize32Test,
::testing::Values(
make_tuple(&vp9_highbd_quantize_b_32x32_sse2,
&vp9_highbd_quantize_b_32x32_c, VPX_BITS_8),
make_tuple(&vp9_highbd_quantize_b_32x32_sse2,
&vp9_highbd_quantize_b_32x32_c, VPX_BITS_10),
make_tuple(&vp9_highbd_quantize_b_32x32_sse2,
&vp9_highbd_quantize_b_32x32_c, VPX_BITS_12)));
#endif // HAVE_SSE2
#endif // CONFIG_VP9_HIGHBITDEPTH
} // namespace
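Once test.mk picks up the new file (see below), these cases run from the usual gtest binary with a parameterized-test filter, e.g. something like --gtest_filter='SSE2_C_COMPARE/Quantize*'; the exact binary name (commonly test_libvpx) depends on the build setup.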

File diff suppressed because it is too large

@@ -134,6 +134,8 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += variance_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_subtract_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += lpf_8_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_avg_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += quantize_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += error_block_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9) += vp9_intrapred_test.cc
ifeq ($(CONFIG_VP9_ENCODER),yes)

File diff suppressed because it is too large

@@ -276,7 +276,7 @@ static void highbd_img_upshift(vpx_image_t *dst, vpx_image_t *src,
// Note the offset is 1 less than half.
const int offset = input_shift > 0 ? (1 << (input_shift - 1)) - 1 : 0;
int plane;
if (dst->w != src->w || dst->h != src->h ||
if (dst->d_w != src->d_w || dst->d_h != src->d_h ||
dst->x_chroma_shift != src->x_chroma_shift ||
dst->y_chroma_shift != src->y_chroma_shift ||
dst->fmt != src->fmt || input_shift < 0) {
@@ -293,12 +293,12 @@ static void highbd_img_upshift(vpx_image_t *dst, vpx_image_t *src,
break;
}
for (plane = 0; plane < 3; plane++) {
int w = src->w;
int h = src->h;
int w = src->d_w;
int h = src->d_h;
int x, y;
if (plane) {
w >>= src->x_chroma_shift;
h >>= src->y_chroma_shift;
w = (w + src->x_chroma_shift) >> src->x_chroma_shift;
h = (h + src->y_chroma_shift) >> src->y_chroma_shift;
}
for (y = 0; y < h; y++) {
uint16_t *p_src =
@@ -316,7 +316,7 @@ static void lowbd_img_upshift(vpx_image_t *dst, vpx_image_t *src,
// Note the offset is 1 less than half.
const int offset = input_shift > 0 ? (1 << (input_shift - 1)) - 1 : 0;
int plane;
if (dst->w != src->w || dst->h != src->h ||
if (dst->d_w != src->d_w || dst->d_h != src->d_h ||
dst->x_chroma_shift != src->x_chroma_shift ||
dst->y_chroma_shift != src->y_chroma_shift ||
dst->fmt != src->fmt + VPX_IMG_FMT_HIGHBITDEPTH ||
@@ -334,8 +334,8 @@ static void lowbd_img_upshift(vpx_image_t *dst, vpx_image_t *src,
break;
}
for (plane = 0; plane < 3; plane++) {
int w = src->w;
int h = src->h;
int w = src->d_w;
int h = src->d_h;
int x, y;
if (plane) {
w = (w + src->x_chroma_shift) >> src->x_chroma_shift;
@@ -384,8 +384,8 @@ void vpx_img_truncate_16_to_8(vpx_image_t *dst, vpx_image_t *src) {
int h = src->d_h;
int x, y;
if (plane) {
w >>= src->x_chroma_shift;
h >>= src->y_chroma_shift;
w = (w + src->x_chroma_shift) >> src->x_chroma_shift;
h = (h + src->y_chroma_shift) >> src->y_chroma_shift;
}
for (y = 0; y < h; y++) {
uint16_t *p_src =

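The rounding change above matters for odd-sized planes. With 4:2:0 subsampling the chroma shift is 1, and a plane of width 5 holds 3 chroma samples, which the old right shift silently truncated to 2:

// Illustration only: chroma width for w = 5, x_chroma_shift = 1.
// old: 5 >> 1       = 2  (loses the last chroma column)
// new: (5 + 1) >> 1 = 3  (rounds up; unchanged when the shift is 0)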

@@ -15,36 +15,6 @@
#include "vp9/common/vp9_blockd.h"
#include "vp9/common/vp9_idct.h"
#if CONFIG_EMULATE_HARDWARE
// When CONFIG_EMULATE_HARDWARE is 1 the transform performs a
// non-normative method to handle overflows. A stream that causes
// overflows in the inverse transform is considered invalid in VP9,
// and a hardware implementer is free to choose any reasonable
// method to handle overflows. However to aid in hardware
// verification they can use a specific implementation of the
// WRAPLOW() macro below that is identical to their intended
// hardware implementation (and also use configure options to trigger
// the C-implementation of the transform).
//
// The particular WRAPLOW implementation below performs strict
// overflow wrapping to match common hardware implementations.
// bd of 8 uses trans_low with 16bits, need to remove 16bits
// bd of 10 uses trans_low with 18bits, need to remove 14bits
// bd of 12 uses trans_low with 20bits, need to remove 12bits
// bd of x uses trans_low with 8+x bits, need to remove 24-x bits
#define WRAPLOW(x, bd) ((((int32_t)(x)) << (24 - bd)) >> (24 - bd))
#else
#define WRAPLOW(x, bd) (x)
#endif // CONFIG_EMULATE_HARDWARE
#if CONFIG_VP9_HIGHBITDEPTH
static INLINE uint16_t highbd_clip_pixel_add(uint16_t dest, tran_high_t trans,
int bd) {
trans = WRAPLOW(trans, bd);
return clip_pixel_highbd(WRAPLOW(dest + trans, bd), bd);
}
#endif // CONFIG_VP9_HIGHBITDEPTH
static INLINE uint8_t clip_pixel_add(uint8_t dest, tran_high_t trans) {
trans = WRAPLOW(trans, 8);
return clip_pixel(WRAPLOW(dest + trans, 8));
@@ -276,10 +246,10 @@ void vp9_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
static void iadst4(const tran_low_t *input, tran_low_t *output) {
tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
tran_high_t x0 = input[0];
tran_high_t x1 = input[1];
tran_high_t x2 = input[2];
tran_high_t x3 = input[3];
tran_low_t x0 = input[0];
tran_low_t x1 = input[1];
tran_low_t x2 = input[2];
tran_low_t x3 = input[3];
if (!(x0 | x1 | x2 | x3)) {
output[0] = output[1] = output[2] = output[3] = 0;
@@ -295,24 +265,19 @@ static void iadst4(const tran_low_t *input, tran_low_t *output) {
s6 = sinpi_4_9 * x3;
s7 = x0 - x2 + x3;
x0 = s0 + s3 + s5;
x1 = s1 - s4 - s6;
x2 = sinpi_3_9 * s7;
x3 = s2;
s0 = x0 + x3;
s1 = x1 + x3;
s2 = x2;
s3 = x0 + x1 - x3;
s0 = s0 + s3 + s5;
s1 = s1 - s4 - s6;
s3 = s2;
s2 = sinpi_3_9 * s7;
// 1-D transform scaling factor is sqrt(2).
// The overall dynamic range is 14b (input) + 14b (multiplication scaling)
// + 1b (addition) = 29b.
// Hence the output bit depth is 15b.
output[0] = WRAPLOW(dct_const_round_shift(s0), 8);
output[1] = WRAPLOW(dct_const_round_shift(s1), 8);
output[0] = WRAPLOW(dct_const_round_shift(s0 + s3), 8);
output[1] = WRAPLOW(dct_const_round_shift(s1 + s3), 8);
output[2] = WRAPLOW(dct_const_round_shift(s2), 8);
output[3] = WRAPLOW(dct_const_round_shift(s3), 8);
output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3), 8);
}
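Moving the additions inside dct_const_round_shift() is what lets the x variables narrow to tran_low_t: the wide intermediate sums now exist only as tran_high_t arguments to the rounding shift. For reference, the rounding helper amounts to a divide by 2^14 with round-to-nearest; a sketch consistent with vp9_idct.h, where DCT_CONST_BITS is 14:

// Sketch of the rounding step applied to every iadst output above.
static INLINE tran_high_t dct_const_round_shift(tran_high_t input) {
  return (input + (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS;
}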
void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride,
@@ -1545,7 +1510,7 @@ void vp9_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8,
}
}
static void highbd_idct4(const tran_low_t *input, tran_low_t *output, int bd) {
void vp9_highbd_idct4(const tran_low_t *input, tran_low_t *output, int bd) {
tran_low_t step[4];
tran_high_t temp1, temp2;
(void) bd;
@@ -1576,7 +1541,7 @@ void vp9_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
// Rows
for (i = 0; i < 4; ++i) {
highbd_idct4(input, outptr, bd);
vp9_highbd_idct4(input, outptr, bd);
input += 4;
outptr += 4;
}
@@ -1585,7 +1550,7 @@ void vp9_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
for (i = 0; i < 4; ++i) {
for (j = 0; j < 4; ++j)
temp_in[j] = out[j * 4 + i];
highbd_idct4(temp_in, temp_out, bd);
vp9_highbd_idct4(temp_in, temp_out, bd);
for (j = 0; j < 4; ++j) {
dest[j * stride + i] = highbd_clip_pixel_add(
dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
@@ -1612,7 +1577,7 @@ void vp9_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8,
}
}
static void highbd_idct8(const tran_low_t *input, tran_low_t *output, int bd) {
void vp9_highbd_idct8(const tran_low_t *input, tran_low_t *output, int bd) {
tran_low_t step1[8], step2[8];
tran_high_t temp1, temp2;
// stage 1
@@ -1630,7 +1595,7 @@ static void highbd_idct8(const tran_low_t *input, tran_low_t *output, int bd) {
step1[6] = WRAPLOW(dct_const_round_shift(temp2), bd);
// stage 2 & stage 3 - even half
highbd_idct4(step1, step1, bd);
vp9_highbd_idct4(step1, step1, bd);
// stage 2 - odd half
step2[4] = WRAPLOW(step1[4] + step1[5], bd);
@@ -1667,7 +1632,7 @@ void vp9_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
// First transform rows.
for (i = 0; i < 8; ++i) {
highbd_idct8(input, outptr, bd);
vp9_highbd_idct8(input, outptr, bd);
input += 8;
outptr += 8;
}
@@ -1676,7 +1641,7 @@ void vp9_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
for (i = 0; i < 8; ++i) {
for (j = 0; j < 8; ++j)
temp_in[j] = out[j * 8 + i];
highbd_idct8(temp_in, temp_out, bd);
vp9_highbd_idct8(temp_in, temp_out, bd);
for (j = 0; j < 8; ++j) {
dest[j * stride + i] = highbd_clip_pixel_add(
dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
@@ -1702,10 +1667,10 @@ void vp9_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8,
static void highbd_iadst4(const tran_low_t *input, tran_low_t *output, int bd) {
tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
tran_high_t x0 = input[0];
tran_high_t x1 = input[1];
tran_high_t x2 = input[2];
tran_high_t x3 = input[3];
tran_low_t x0 = input[0];
tran_low_t x1 = input[1];
tran_low_t x2 = input[2];
tran_low_t x3 = input[3];
(void) bd;
if (!(x0 | x1 | x2 | x3)) {
@@ -1720,34 +1685,29 @@ static void highbd_iadst4(const tran_low_t *input, tran_low_t *output, int bd) {
s4 = sinpi_1_9 * x2;
s5 = sinpi_2_9 * x3;
s6 = sinpi_4_9 * x3;
s7 = x0 - x2 + x3;
s7 = (tran_high_t)(x0 - x2 + x3);
x0 = s0 + s3 + s5;
x1 = s1 - s4 - s6;
x2 = sinpi_3_9 * s7;
x3 = s2;
s0 = x0 + x3;
s1 = x1 + x3;
s2 = x2;
s3 = x0 + x1 - x3;
s0 = s0 + s3 + s5;
s1 = s1 - s4 - s6;
s3 = s2;
s2 = sinpi_3_9 * s7;
// 1-D transform scaling factor is sqrt(2).
// The overall dynamic range is 14b (input) + 14b (multiplication scaling)
// + 1b (addition) = 29b.
// Hence the output bit depth is 15b.
output[0] = WRAPLOW(dct_const_round_shift(s0), bd);
output[1] = WRAPLOW(dct_const_round_shift(s1), bd);
output[0] = WRAPLOW(dct_const_round_shift(s0 + s3), bd);
output[1] = WRAPLOW(dct_const_round_shift(s1 + s3), bd);
output[2] = WRAPLOW(dct_const_round_shift(s2), bd);
output[3] = WRAPLOW(dct_const_round_shift(s3), bd);
output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3), bd);
}
void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
int stride, int tx_type, int bd) {
const highbd_transform_2d IHT_4[] = {
{ highbd_idct4, highbd_idct4 }, // DCT_DCT = 0
{ highbd_iadst4, highbd_idct4 }, // ADST_DCT = 1
{ highbd_idct4, highbd_iadst4 }, // DCT_ADST = 2
{ vp9_highbd_idct4, vp9_highbd_idct4 }, // DCT_DCT = 0
{ highbd_iadst4, vp9_highbd_idct4 }, // ADST_DCT = 1
{ vp9_highbd_idct4, highbd_iadst4 }, // DCT_ADST = 2
{ highbd_iadst4, highbd_iadst4 } // ADST_ADST = 3
};
uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
@@ -1779,14 +1739,14 @@ void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
static void highbd_iadst8(const tran_low_t *input, tran_low_t *output, int bd) {
tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
tran_high_t x0 = input[7];
tran_high_t x1 = input[0];
tran_high_t x2 = input[5];
tran_high_t x3 = input[2];
tran_high_t x4 = input[3];
tran_high_t x5 = input[4];
tran_high_t x6 = input[1];
tran_high_t x7 = input[6];
tran_low_t x0 = input[7];
tran_low_t x1 = input[0];
tran_low_t x2 = input[5];
tran_low_t x3 = input[2];
tran_low_t x4 = input[3];
tran_low_t x5 = input[4];
tran_low_t x6 = input[1];
tran_low_t x7 = input[6];
(void) bd;
if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
@@ -1854,9 +1814,9 @@ static void highbd_iadst8(const tran_low_t *input, tran_low_t *output, int bd) {
}
static const highbd_transform_2d HIGH_IHT_8[] = {
{ highbd_idct8, highbd_idct8 }, // DCT_DCT = 0
{ highbd_iadst8, highbd_idct8 }, // ADST_DCT = 1
{ highbd_idct8, highbd_iadst8 }, // DCT_ADST = 2
{ vp9_highbd_idct8, vp9_highbd_idct8 }, // DCT_DCT = 0
{ highbd_iadst8, vp9_highbd_idct8 }, // ADST_DCT = 1
{ vp9_highbd_idct8, highbd_iadst8 }, // DCT_ADST = 2
{ highbd_iadst8, highbd_iadst8 } // ADST_ADST = 3
};
@@ -1899,7 +1859,7 @@ void vp9_highbd_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest8,
// First transform rows.
// Only the first 4 rows have non-zero coefs.
for (i = 0; i < 4; ++i) {
highbd_idct8(input, outptr, bd);
vp9_highbd_idct8(input, outptr, bd);
input += 8;
outptr += 8;
}
@@ -1907,7 +1867,7 @@ void vp9_highbd_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest8,
for (i = 0; i < 8; ++i) {
for (j = 0; j < 8; ++j)
temp_in[j] = out[j * 8 + i];
highbd_idct8(temp_in, temp_out, bd);
vp9_highbd_idct8(temp_in, temp_out, bd);
for (j = 0; j < 8; ++j) {
dest[j * stride + i] = highbd_clip_pixel_add(
dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
@@ -1915,7 +1875,7 @@ void vp9_highbd_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest8,
}
}
static void highbd_idct16(const tran_low_t *input, tran_low_t *output, int bd) {
void vp9_highbd_idct16(const tran_low_t *input, tran_low_t *output, int bd) {
tran_low_t step1[16], step2[16];
tran_high_t temp1, temp2;
(void) bd;
@@ -2091,7 +2051,7 @@ void vp9_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
// First transform rows.
for (i = 0; i < 16; ++i) {
highbd_idct16(input, outptr, bd);
vp9_highbd_idct16(input, outptr, bd);
input += 16;
outptr += 16;
}
@@ -2100,7 +2060,7 @@ void vp9_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
for (i = 0; i < 16; ++i) {
for (j = 0; j < 16; ++j)
temp_in[j] = out[j * 16 + i];
highbd_idct16(temp_in, temp_out, bd);
vp9_highbd_idct16(temp_in, temp_out, bd);
for (j = 0; j < 16; ++j) {
dest[j * stride + i] = highbd_clip_pixel_add(
dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
@@ -2113,22 +2073,22 @@ static void highbd_iadst16(const tran_low_t *input, tran_low_t *output,
tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
tran_high_t s9, s10, s11, s12, s13, s14, s15;
tran_high_t x0 = input[15];
tran_high_t x1 = input[0];
tran_high_t x2 = input[13];
tran_high_t x3 = input[2];
tran_high_t x4 = input[11];
tran_high_t x5 = input[4];
tran_high_t x6 = input[9];
tran_high_t x7 = input[6];
tran_high_t x8 = input[7];
tran_high_t x9 = input[8];
tran_high_t x10 = input[5];
tran_high_t x11 = input[10];
tran_high_t x12 = input[3];
tran_high_t x13 = input[12];
tran_high_t x14 = input[1];
tran_high_t x15 = input[14];
tran_low_t x0 = input[15];
tran_low_t x1 = input[0];
tran_low_t x2 = input[13];
tran_low_t x3 = input[2];
tran_low_t x4 = input[11];
tran_low_t x5 = input[4];
tran_low_t x6 = input[9];
tran_low_t x7 = input[6];
tran_low_t x8 = input[7];
tran_low_t x9 = input[8];
tran_low_t x10 = input[5];
tran_low_t x11 = input[10];
tran_low_t x12 = input[3];
tran_low_t x13 = input[12];
tran_low_t x14 = input[1];
tran_low_t x15 = input[14];
(void) bd;
if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8
@@ -2280,9 +2240,9 @@ static void highbd_iadst16(const tran_low_t *input, tran_low_t *output,
}
static const highbd_transform_2d HIGH_IHT_16[] = {
{ highbd_idct16, highbd_idct16 }, // DCT_DCT = 0
{ highbd_iadst16, highbd_idct16 }, // ADST_DCT = 1
{ highbd_idct16, highbd_iadst16 }, // DCT_ADST = 2
{ vp9_highbd_idct16, vp9_highbd_idct16 }, // DCT_DCT = 0
{ highbd_iadst16, vp9_highbd_idct16 }, // ADST_DCT = 1
{ vp9_highbd_idct16, highbd_iadst16 }, // DCT_ADST = 2
{ highbd_iadst16, highbd_iadst16 } // ADST_ADST = 3
};
@@ -2325,7 +2285,7 @@ void vp9_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8,
// First transform rows. Since all non-zero dct coefficients are in the
// upper-left 4x4 area, we only need to calculate the first 4 rows here.
for (i = 0; i < 4; ++i) {
highbd_idct16(input, outptr, bd);
vp9_highbd_idct16(input, outptr, bd);
input += 16;
outptr += 16;
}
@@ -2334,7 +2294,7 @@ void vp9_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8,
for (i = 0; i < 16; ++i) {
for (j = 0; j < 16; ++j)
temp_in[j] = out[j*16 + i];
highbd_idct16(temp_in, temp_out, bd);
vp9_highbd_idct16(temp_in, temp_out, bd);
for (j = 0; j < 16; ++j) {
dest[j * stride + i] = highbd_clip_pixel_add(
dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);


@@ -116,6 +116,28 @@ typedef struct {
} highbd_transform_2d;
#endif // CONFIG_VP9_HIGHBITDEPTH
#if CONFIG_EMULATE_HARDWARE
// When CONFIG_EMULATE_HARDWARE is 1 the transform uses a
// non-normative method to handle overflows. A stream that causes
// overflows in the inverse transform is considered invalid in VP9,
// and a hardware implementer is free to choose any reasonable
// method to handle them. However, to aid hardware verification,
// implementers can substitute a WRAPLOW() definition below that is
// identical to their intended hardware behavior (and use configure
// options to force the C implementation of the transform).
//
// The particular WRAPLOW implementation below performs strict
// overflow wrapping to match common hardware implementations.
// bd of 8 uses tran_low_t with 16 bits: remove the top 16 bits
// bd of 10 uses tran_low_t with 18 bits: remove the top 14 bits
// bd of 12 uses tran_low_t with 20 bits: remove the top 12 bits
// bd of x uses tran_low_t with 8+x bits: remove the top 24-x bits
#define WRAPLOW(x, bd) ((((int32_t)(x)) << (24 - bd)) >> (24 - bd))
#else
#define WRAPLOW(x, bd) (x)
#endif // CONFIG_EMULATE_HARDWARE
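Concretely, for bd = 8 the macro shifts left and then arithmetically right by 24 - 8 = 16 bits, wrapping values into the 16-bit two's-complement range the way a 16-bit hardware accumulator would:

// Illustration only (CONFIG_EMULATE_HARDWARE = 1, bd = 8):
// WRAPLOW(32768, 8) == -32768  (0x8000 sign-extended from bit 15)
// WRAPLOW(70000, 8) == 4464    (70000 mod 65536)
// With CONFIG_EMULATE_HARDWARE = 0, WRAPLOW(x, bd) is simply x.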
void vp9_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
int eob);
void vp9_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
@@ -135,6 +157,9 @@ void vp9_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
int stride, int eob);
#if CONFIG_VP9_HIGHBITDEPTH
void vp9_highbd_idct4(const tran_low_t *input, tran_low_t *output, int bd);
void vp9_highbd_idct8(const tran_low_t *input, tran_low_t *output, int bd);
void vp9_highbd_idct16(const tran_low_t *input, tran_low_t *output, int bd);
void vp9_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
int eob, int bd);
void vp9_highbd_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
@@ -151,6 +176,11 @@ void vp9_highbd_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input,
uint8_t *dest, int stride, int eob, int bd);
void vp9_highbd_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input,
uint8_t *dest, int stride, int eob, int bd);
static INLINE uint16_t highbd_clip_pixel_add(uint16_t dest, tran_high_t trans,
int bd) {
trans = WRAPLOW(trans, bd);
return clip_pixel_highbd(WRAPLOW(dest + trans, bd), bd);
}
#endif // CONFIG_VP9_HIGHBITDEPTH
#ifdef __cplusplus
} // extern "C"


@@ -750,27 +750,12 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vp9_highbd_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp9_highbd_idct4x4_1_add/;
add_proto qw/void vp9_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp9_highbd_idct4x4_16_add/;
add_proto qw/void vp9_highbd_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp9_highbd_idct8x8_1_add/;
add_proto qw/void vp9_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp9_highbd_idct8x8_64_add/;
add_proto qw/void vp9_highbd_idct8x8_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp9_highbd_idct8x8_10_add/;
add_proto qw/void vp9_highbd_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp9_highbd_idct16x16_1_add/;
add_proto qw/void vp9_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp9_highbd_idct16x16_256_add/;
add_proto qw/void vp9_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp9_highbd_idct16x16_10_add/;
add_proto qw/void vp9_highbd_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp9_highbd_idct32x32_1024_add/;
@@ -796,6 +781,42 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vp9_highbd_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp9_highbd_iwht4x4_16_add/;
# Force C versions if CONFIG_EMULATE_HARDWARE is 1
if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
add_proto qw/void vp9_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp9_highbd_idct4x4_16_add/;
add_proto qw/void vp9_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp9_highbd_idct8x8_64_add/;
add_proto qw/void vp9_highbd_idct8x8_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp9_highbd_idct8x8_10_add/;
add_proto qw/void vp9_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp9_highbd_idct16x16_256_add/;
add_proto qw/void vp9_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp9_highbd_idct16x16_10_add/;
} else {
add_proto qw/void vp9_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp9_highbd_idct4x4_16_add sse2/;
add_proto qw/void vp9_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp9_highbd_idct8x8_64_add sse2/;
add_proto qw/void vp9_highbd_idct8x8_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp9_highbd_idct8x8_10_add sse2/;
add_proto qw/void vp9_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp9_highbd_idct16x16_256_add sse2/;
add_proto qw/void vp9_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp9_highbd_idct16x16_10_add sse2/;
}
}
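Each specialize line above determines what the generated vp9_rtcd.h binds at run time: under CONFIG_EMULATE_HARDWARE only the C symbol is registered, otherwise the SSE2 symbol can be selected when the CPU supports it. Roughly, the generated dispatch has the shape of the sketch below (the generated file's exact layout is an assumption here):

// Rough shape of the generated runtime dispatch (illustrative only).
void (*vp9_highbd_idct8x8_64_add)(const tran_low_t *input, uint8_t *dest,
                                  int dest_stride, int bd);
static void setup_rtcd_internal(void) {
  vp9_highbd_idct8x8_64_add = vp9_highbd_idct8x8_64_add_c;
#if HAVE_SSE2
  if (x86_simd_caps() & HAS_SSE2)
    vp9_highbd_idct8x8_64_add = vp9_highbd_idct8x8_64_add_sse2;
#endif
}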
#
@@ -1114,6 +1135,11 @@ specialize qw/vp9_get_mb_ss/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_avg_8x8/, "const uint8_t *, int p";
specialize qw/vp9_avg_8x8 sse2/;
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/unsigned int vp9_highbd_avg_8x8/, "const uint8_t *, int p";
specialize qw/vp9_highbd_avg_8x8/;
}
# ENCODEMB INVOKE
add_proto qw/void vp9_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride";
@@ -1176,43 +1202,43 @@ if (vpx_config("CONFIG_INTERNAL_STATS") eq "yes") {
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vp9_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
specialize qw/vp9_fht4x4/;
specialize qw/vp9_fht4x4 sse2/;
add_proto qw/void vp9_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
specialize qw/vp9_fht8x8/;
specialize qw/vp9_fht8x8 sse2/;
add_proto qw/void vp9_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
specialize qw/vp9_fht16x16/;
specialize qw/vp9_fht16x16 sse2/;
add_proto qw/void vp9_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_fwht4x4/;
specialize qw/vp9_fwht4x4/, "$mmx_x86inc";
add_proto qw/void vp9_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_fdct4x4_1/;
specialize qw/vp9_fdct4x4_1 sse2/;
add_proto qw/void vp9_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_fdct4x4/;
specialize qw/vp9_fdct4x4 sse2/;
add_proto qw/void vp9_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_fdct8x8_1/;
specialize qw/vp9_fdct8x8_1 sse2/;
add_proto qw/void vp9_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_fdct8x8/;
specialize qw/vp9_fdct8x8 sse2/;
add_proto qw/void vp9_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_fdct16x16_1/;
specialize qw/vp9_fdct16x16_1 sse2/;
add_proto qw/void vp9_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_fdct16x16/;
specialize qw/vp9_fdct16x16 sse2/;
add_proto qw/void vp9_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_fdct32x32_1/;
specialize qw/vp9_fdct32x32_1 sse2/;
add_proto qw/void vp9_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_fdct32x32/;
specialize qw/vp9_fdct32x32 sse2/;
add_proto qw/void vp9_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_fdct32x32_rd/;
specialize qw/vp9_fdct32x32_rd sse2/;
} else {
add_proto qw/void vp9_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
specialize qw/vp9_fht4x4 sse2/;
@@ -1278,34 +1304,34 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
# variance
add_proto qw/unsigned int vp9_highbd_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_variance32x16/;
specialize qw/vp9_highbd_variance32x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_variance16x32/;
specialize qw/vp9_highbd_variance16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_variance64x32/;
specialize qw/vp9_highbd_variance64x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_variance32x64/;
specialize qw/vp9_highbd_variance32x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_variance32x32/;
specialize qw/vp9_highbd_variance32x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_variance64x64/;
specialize qw/vp9_highbd_variance64x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_variance16x16/;
specialize qw/vp9_highbd_variance16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_variance16x8/;
specialize qw/vp9_highbd_variance16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_variance8x16/;
specialize qw/vp9_highbd_variance8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_variance8x8/;
specialize qw/vp9_highbd_variance8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_variance8x4/;
@@ -1317,40 +1343,40 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_highbd_variance4x4/;
add_proto qw/void vp9_highbd_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
specialize qw/vp9_highbd_get8x8var/;
specialize qw/vp9_highbd_get8x8var/, "$sse2_x86inc";
add_proto qw/void vp9_highbd_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
specialize qw/vp9_highbd_get16x16var/;
specialize qw/vp9_highbd_get16x16var/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_variance32x16/;
specialize qw/vp9_highbd_10_variance32x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_variance16x32/;
specialize qw/vp9_highbd_10_variance16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_variance64x32/;
specialize qw/vp9_highbd_10_variance64x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_variance32x64/;
specialize qw/vp9_highbd_10_variance32x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_variance32x32/;
specialize qw/vp9_highbd_10_variance32x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_variance64x64/;
specialize qw/vp9_highbd_10_variance64x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_variance16x16/;
specialize qw/vp9_highbd_10_variance16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_variance16x8/;
specialize qw/vp9_highbd_10_variance16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_variance8x16/;
specialize qw/vp9_highbd_10_variance8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_variance8x8/;
specialize qw/vp9_highbd_10_variance8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_variance8x4/;
@@ -1362,40 +1388,40 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_highbd_10_variance4x4/;
add_proto qw/void vp9_highbd_10_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
specialize qw/vp9_highbd_10_get8x8var/;
specialize qw/vp9_highbd_10_get8x8var/, "$sse2_x86inc";
add_proto qw/void vp9_highbd_10_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
specialize qw/vp9_highbd_10_get16x16var/;
specialize qw/vp9_highbd_10_get16x16var/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_variance32x16/;
specialize qw/vp9_highbd_12_variance32x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_variance16x32/;
specialize qw/vp9_highbd_12_variance16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_variance64x32/;
specialize qw/vp9_highbd_12_variance64x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_variance32x64/;
specialize qw/vp9_highbd_12_variance32x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_variance32x32/;
specialize qw/vp9_highbd_12_variance32x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_variance64x64/;
specialize qw/vp9_highbd_12_variance64x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_variance16x16/;
specialize qw/vp9_highbd_12_variance16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_variance16x8/;
specialize qw/vp9_highbd_12_variance16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_variance8x16/;
specialize qw/vp9_highbd_12_variance8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_variance8x8/;
specialize qw/vp9_highbd_12_variance8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_variance8x4/;
@@ -1407,76 +1433,76 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_highbd_12_variance4x4/;
add_proto qw/void vp9_highbd_12_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
specialize qw/vp9_highbd_12_get8x8var/;
specialize qw/vp9_highbd_12_get8x8var/, "$sse2_x86inc";
add_proto qw/void vp9_highbd_12_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
specialize qw/vp9_highbd_12_get16x16var/;
specialize qw/vp9_highbd_12_get16x16var/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_sub_pixel_variance64x64/;
specialize qw/vp9_highbd_sub_pixel_variance64x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_sub_pixel_avg_variance64x64/;
specialize qw/vp9_highbd_sub_pixel_avg_variance64x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_sub_pixel_variance32x64/;
specialize qw/vp9_highbd_sub_pixel_variance32x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_sub_pixel_avg_variance32x64/;
specialize qw/vp9_highbd_sub_pixel_avg_variance32x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_sub_pixel_variance64x32/;
specialize qw/vp9_highbd_sub_pixel_variance64x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_sub_pixel_avg_variance64x32/;
specialize qw/vp9_highbd_sub_pixel_avg_variance64x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_sub_pixel_variance32x16/;
specialize qw/vp9_highbd_sub_pixel_variance32x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_sub_pixel_avg_variance32x16/;
specialize qw/vp9_highbd_sub_pixel_avg_variance32x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_sub_pixel_variance16x32/;
specialize qw/vp9_highbd_sub_pixel_variance16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_sub_pixel_avg_variance16x32/;
specialize qw/vp9_highbd_sub_pixel_avg_variance16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_sub_pixel_variance32x32/;
specialize qw/vp9_highbd_sub_pixel_variance32x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_sub_pixel_avg_variance32x32/;
specialize qw/vp9_highbd_sub_pixel_avg_variance32x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_sub_pixel_variance16x16/;
specialize qw/vp9_highbd_sub_pixel_variance16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_sub_pixel_avg_variance16x16/;
specialize qw/vp9_highbd_sub_pixel_avg_variance16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_sub_pixel_variance8x16/;
specialize qw/vp9_highbd_sub_pixel_variance8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_sub_pixel_avg_variance8x16/;
specialize qw/vp9_highbd_sub_pixel_avg_variance8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_sub_pixel_variance16x8/;
specialize qw/vp9_highbd_sub_pixel_variance16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_sub_pixel_avg_variance16x8/;
specialize qw/vp9_highbd_sub_pixel_avg_variance16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_sub_pixel_variance8x8/;
specialize qw/vp9_highbd_sub_pixel_variance8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_sub_pixel_avg_variance8x8/;
specialize qw/vp9_highbd_sub_pixel_avg_variance8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_sub_pixel_variance8x4/;
specialize qw/vp9_highbd_sub_pixel_variance8x4/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_sub_pixel_avg_variance8x4/;
specialize qw/vp9_highbd_sub_pixel_avg_variance8x4/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_sub_pixel_variance4x8/;
@@ -1491,70 +1517,70 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_highbd_sub_pixel_avg_variance4x4/;
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance64x64/;
specialize qw/vp9_highbd_10_sub_pixel_variance64x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_10_sub_pixel_avg_variance64x64/;
specialize qw/vp9_highbd_10_sub_pixel_avg_variance64x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance32x64/;
specialize qw/vp9_highbd_10_sub_pixel_variance32x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x64/;
specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance64x32/;
specialize qw/vp9_highbd_10_sub_pixel_variance64x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_10_sub_pixel_avg_variance64x32/;
specialize qw/vp9_highbd_10_sub_pixel_avg_variance64x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance32x16/;
specialize qw/vp9_highbd_10_sub_pixel_variance32x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x16/;
specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance16x32/;
specialize qw/vp9_highbd_10_sub_pixel_variance16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x32/;
specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance32x32/;
specialize qw/vp9_highbd_10_sub_pixel_variance32x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x32/;
specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance16x16/;
specialize qw/vp9_highbd_10_sub_pixel_variance16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x16/;
specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance8x16/;
specialize qw/vp9_highbd_10_sub_pixel_variance8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x16/;
specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance16x8/;
specialize qw/vp9_highbd_10_sub_pixel_variance16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x8/;
specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance8x8/;
specialize qw/vp9_highbd_10_sub_pixel_variance8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x8/;
specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance8x4/;
specialize qw/vp9_highbd_10_sub_pixel_variance8x4/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x4/;
specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x4/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance4x8/;
@@ -1569,70 +1595,70 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_highbd_10_sub_pixel_avg_variance4x4/;
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance64x64/;
specialize qw/vp9_highbd_12_sub_pixel_variance64x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance64x64/;
specialize qw/vp9_highbd_12_sub_pixel_avg_variance64x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance32x64/;
specialize qw/vp9_highbd_12_sub_pixel_variance32x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x64/;
specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance64x32/;
specialize qw/vp9_highbd_12_sub_pixel_variance64x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance64x32/;
specialize qw/vp9_highbd_12_sub_pixel_avg_variance64x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance32x16/;
specialize qw/vp9_highbd_12_sub_pixel_variance32x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x16/;
specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance16x32/;
specialize qw/vp9_highbd_12_sub_pixel_variance16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x32/;
specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance32x32/;
specialize qw/vp9_highbd_12_sub_pixel_variance32x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x32/;
specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance16x16/;
specialize qw/vp9_highbd_12_sub_pixel_variance16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x16/;
specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance8x16/;
specialize qw/vp9_highbd_12_sub_pixel_variance8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x16/;
specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance16x8/;
specialize qw/vp9_highbd_12_sub_pixel_variance16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x8/;
specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance8x8/;
specialize qw/vp9_highbd_12_sub_pixel_variance8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x8/;
specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance8x4/;
specialize qw/vp9_highbd_12_sub_pixel_variance8x4/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x4/;
specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x4/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance4x8/;
@@ -1647,37 +1673,37 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_highbd_12_sub_pixel_avg_variance4x4/;
add_proto qw/unsigned int vp9_highbd_sad64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vp9_highbd_sad64x64/;
specialize qw/vp9_highbd_sad64x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vp9_highbd_sad32x64/;
specialize qw/vp9_highbd_sad32x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vp9_highbd_sad64x32/;
specialize qw/vp9_highbd_sad64x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vp9_highbd_sad32x16/;
specialize qw/vp9_highbd_sad32x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vp9_highbd_sad16x32/;
specialize qw/vp9_highbd_sad16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vp9_highbd_sad32x32/;
specialize qw/vp9_highbd_sad32x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vp9_highbd_sad16x16/;
specialize qw/vp9_highbd_sad16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vp9_highbd_sad16x8/;
specialize qw/vp9_highbd_sad16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vp9_highbd_sad8x16/;
specialize qw/vp9_highbd_sad8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vp9_highbd_sad8x8/;
specialize qw/vp9_highbd_sad8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vp9_highbd_sad8x4/;
specialize qw/vp9_highbd_sad8x4/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vp9_highbd_sad4x8/;
@@ -1686,37 +1712,37 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_highbd_sad4x4/;
add_proto qw/unsigned int vp9_highbd_sad64x64_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vp9_highbd_sad64x64_avg/;
specialize qw/vp9_highbd_sad64x64_avg/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad32x64_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vp9_highbd_sad32x64_avg/;
specialize qw/vp9_highbd_sad32x64_avg/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad64x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vp9_highbd_sad64x32_avg/;
specialize qw/vp9_highbd_sad64x32_avg/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad32x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vp9_highbd_sad32x16_avg/;
specialize qw/vp9_highbd_sad32x16_avg/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad16x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vp9_highbd_sad16x32_avg/;
specialize qw/vp9_highbd_sad16x32_avg/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad32x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vp9_highbd_sad32x32_avg/;
specialize qw/vp9_highbd_sad32x32_avg/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad16x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vp9_highbd_sad16x16_avg/;
specialize qw/vp9_highbd_sad16x16_avg/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad16x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vp9_highbd_sad16x8_avg/;
specialize qw/vp9_highbd_sad16x8_avg/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad8x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vp9_highbd_sad8x16_avg/;
specialize qw/vp9_highbd_sad8x16_avg/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad8x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vp9_highbd_sad8x8_avg/;
specialize qw/vp9_highbd_sad8x8_avg/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad8x4_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vp9_highbd_sad8x4_avg/;
specialize qw/vp9_highbd_sad8x4_avg/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad4x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vp9_highbd_sad4x8_avg/;
@@ -1773,47 +1799,46 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_highbd_sad4x4x8/;
add_proto qw/void vp9_highbd_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_highbd_sad64x64x4d/;
specialize qw/vp9_highbd_sad64x64x4d sse2/;
add_proto qw/void vp9_highbd_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_highbd_sad32x64x4d/;
specialize qw/vp9_highbd_sad32x64x4d sse2/;
add_proto qw/void vp9_highbd_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_highbd_sad64x32x4d/;
specialize qw/vp9_highbd_sad64x32x4d sse2/;
add_proto qw/void vp9_highbd_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_highbd_sad32x16x4d/;
specialize qw/vp9_highbd_sad32x16x4d sse2/;
add_proto qw/void vp9_highbd_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_highbd_sad16x32x4d/;
specialize qw/vp9_highbd_sad16x32x4d sse2/;
add_proto qw/void vp9_highbd_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_highbd_sad32x32x4d/;
specialize qw/vp9_highbd_sad32x32x4d sse2/;
add_proto qw/void vp9_highbd_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_highbd_sad16x16x4d/;
specialize qw/vp9_highbd_sad16x16x4d sse2/;
add_proto qw/void vp9_highbd_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_highbd_sad16x8x4d/;
specialize qw/vp9_highbd_sad16x8x4d sse2/;
add_proto qw/void vp9_highbd_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_highbd_sad8x16x4d/;
specialize qw/vp9_highbd_sad8x16x4d sse2/;
add_proto qw/void vp9_highbd_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_highbd_sad8x8x4d/;
specialize qw/vp9_highbd_sad8x8x4d sse2/;
# TODO(jingning): need to convert these 4x8/8x4 functions into sse2 form
add_proto qw/void vp9_highbd_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_highbd_sad8x4x4d/;
specialize qw/vp9_highbd_sad8x4x4d sse2/;
add_proto qw/void vp9_highbd_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_highbd_sad4x8x4d/;
specialize qw/vp9_highbd_sad4x8x4d sse2/;
add_proto qw/void vp9_highbd_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_highbd_sad4x4x4d/;
specialize qw/vp9_highbd_sad4x4x4d sse2/;
add_proto qw/unsigned int vp9_highbd_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_highbd_mse16x16/;
specialize qw/vp9_highbd_mse16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_highbd_mse8x16/;
@@ -1822,10 +1847,10 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_highbd_mse16x8/;
add_proto qw/unsigned int vp9_highbd_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_highbd_mse8x8/;
specialize qw/vp9_highbd_mse8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_mse16x16/;
specialize qw/vp9_highbd_10_mse16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_mse8x16/;
@@ -1834,10 +1859,10 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_highbd_10_mse16x8/;
add_proto qw/unsigned int vp9_highbd_10_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_mse8x8/;
specialize qw/vp9_highbd_10_mse8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_mse16x16/;
specialize qw/vp9_highbd_12_mse16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_mse8x16/;
@@ -1846,12 +1871,12 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_highbd_12_mse16x8/;
add_proto qw/unsigned int vp9_highbd_12_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_mse8x8/;
specialize qw/vp9_highbd_12_mse8x8/, "$sse2_x86inc";
# ENCODEMB INVOKE
add_proto qw/int64_t vp9_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd";
specialize qw/vp9_highbd_block_error/;
specialize qw/vp9_highbd_block_error sse2/;
add_proto qw/void vp9_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd";
specialize qw/vp9_highbd_subtract_block/;
@@ -1863,10 +1888,10 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_highbd_quantize_fp_32x32/;
add_proto qw/void vp9_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vp9_highbd_quantize_b/;
specialize qw/vp9_highbd_quantize_b sse2/;
add_proto qw/void vp9_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vp9_highbd_quantize_b_32x32/;
specialize qw/vp9_highbd_quantize_b_32x32 sse2/;
#
# Structured Similarity (SSIM)
@@ -1878,40 +1903,40 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
# fdct functions
add_proto qw/void vp9_highbd_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
specialize qw/vp9_highbd_fht4x4/;
specialize qw/vp9_highbd_fht4x4 sse2/;
add_proto qw/void vp9_highbd_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
specialize qw/vp9_highbd_fht8x8/;
specialize qw/vp9_highbd_fht8x8 sse2/;
add_proto qw/void vp9_highbd_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
specialize qw/vp9_highbd_fht16x16/;
specialize qw/vp9_highbd_fht16x16 sse2/;
add_proto qw/void vp9_highbd_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_highbd_fwht4x4/;
add_proto qw/void vp9_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_highbd_fdct4x4/;
specialize qw/vp9_highbd_fdct4x4 sse2/;
add_proto qw/void vp9_highbd_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_highbd_fdct8x8_1/;
add_proto qw/void vp9_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_highbd_fdct8x8/;
specialize qw/vp9_highbd_fdct8x8 sse2/;
add_proto qw/void vp9_highbd_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_highbd_fdct16x16_1/;
add_proto qw/void vp9_highbd_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_highbd_fdct16x16/;
specialize qw/vp9_highbd_fdct16x16 sse2/;
add_proto qw/void vp9_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_highbd_fdct32x32_1/;
add_proto qw/void vp9_highbd_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_highbd_fdct32x32/;
specialize qw/vp9_highbd_fdct32x32 sse2/;
add_proto qw/void vp9_highbd_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_highbd_fdct32x32_rd/;
specialize qw/vp9_highbd_fdct32x32_rd sse2/;
add_proto qw/void vp9_highbd_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count";
specialize qw/vp9_highbd_temporal_filter_apply/;
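
Taken together, each add_proto line above declares one dispatchable entry point with the given C signature, and the specialize line that follows lists the optimized implementations the run-time dispatcher may select; the "$sse2_x86inc" form guards the x86inc.asm-backed SSE2 versions so they are only enabled on configurations where that assembly can be built. As a minimal sketch of what this expands to, assuming the usual rtcd.pl output shape (the _c/_sse2 suffixes and the HAS_SSE2 flag follow libvpx convention, but the flag value and setup_rtcd_sketch below are stand-ins for this sketch, not names taken from this diff):

#include <stdint.h>

/* A minimal sketch, not the generated vp9_rtcd.h verbatim; the real flag
   constants live in vpx_ports/x86.h and the real setup is emitted by
   rtcd.pl. */
#define HAS_SSE2 0x01 /* hypothetical bit value for this sketch */

unsigned int vp9_highbd_10_variance8x8_c(const uint8_t *src_ptr,
                                         int source_stride,
                                         const uint8_t *ref_ptr,
                                         int ref_stride, unsigned int *sse);
unsigned int vp9_highbd_10_variance8x8_sse2(const uint8_t *src_ptr,
                                            int source_stride,
                                            const uint8_t *ref_ptr,
                                            int ref_stride, unsigned int *sse);

/* One function pointer per add_proto entry. */
unsigned int (*vp9_highbd_10_variance8x8)(const uint8_t *src_ptr,
                                          int source_stride,
                                          const uint8_t *ref_ptr,
                                          int ref_stride, unsigned int *sse);

/* Start from the portable C fallback, then upgrade if SSE2 is present. */
static void setup_rtcd_sketch(int flags) {
  vp9_highbd_10_variance8x8 = vp9_highbd_10_variance8x8_c;
  if (flags & HAS_SSE2)
    vp9_highbd_10_variance8x8 = vp9_highbd_10_variance8x8_sse2;
}

In the hunks above only the specialize lines change; each entry gains an sse2 variant (directly or via "$sse2_x86inc"), while the prototypes themselves are untouched.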

View File

@@ -14,6 +14,10 @@
#include "vp9/common/vp9_loopfilter.h"
#include "vpx_ports/emmintrin_compat.h"
static INLINE __m128i highbd_abs_diff(__m128i a, __m128i b) {
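// _mm_subs_epu16 saturates at zero, so at most one of (a - b) and (b - a)
// is non-zero per lane; OR-ing the two yields |a - b| for each 16-bit pixel.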
return _mm_or_si128(_mm_subs_epu16(a, b), _mm_subs_epu16(b, a));
}
static INLINE __m128i signed_char_clamp_bd_sse2(__m128i value, int bd) {
__m128i ubounded;
__m128i lbounded;
@@ -35,8 +39,126 @@ static INLINE __m128i signed_char_clamp_bd_sse2(__m128i value, int bd) {
return retval;
}
// TODO(debargha, peter): Break up large functions into smaller ones
// in this file.
static INLINE void get_hev_and_mask(const __m128i thresh, const __m128i limit,
const __m128i blimit, const __m128i zero,
const __m128i one, const __m128i ffff,
__m128i abs_p1p0, __m128i abs_q1q0,
__m128i abs_p0q0, __m128i abs_p1q1,
__m128i abs_p2p1, __m128i abs_q2q1,
__m128i abs_p3p2, __m128i abs_q3q2,
__m128i* hev, __m128i* mask) {
__m128i work0, work1, work2;
// highbd_hev_mask
work0 = _mm_max_epi16(abs_p1p0, abs_q1q0);
*hev = _mm_subs_epu16(work0, thresh);
*hev = _mm_xor_si128(_mm_cmpeq_epi16(*hev, zero), ffff);
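// hev lanes become all-ones where max(|p1 - p0|, |q1 - q0|) > thresh.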
work1 = _mm_adds_epu16(abs_p0q0, abs_p0q0); // abs(p0 - q0) * 2
work2 = _mm_srli_epi16(abs_p1q1, 1); // abs(p1 - q1) / 2
*mask = _mm_subs_epu16(_mm_adds_epu16(work1, work2), blimit);
*mask = _mm_xor_si128(_mm_cmpeq_epi16(*mask, zero), ffff);
*mask = _mm_and_si128(*mask, _mm_adds_epu16(limit, one));
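// Lanes where abs(p0 - q0) * 2 + abs(p1 - q1) / 2 exceeds blimit are forced
// to limit + 1, so the max/subtract sequence below still switches them off.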
*mask = _mm_max_epi16(work0, *mask);
work0 = _mm_max_epi16(abs_p2p1, abs_q2q1);
*mask = _mm_max_epi16(work0, *mask);
work0 = _mm_max_epi16(abs_p3p2, abs_q3q2);
*mask = _mm_max_epi16(work0, *mask);
*mask = _mm_subs_epu16(*mask, limit);
*mask = _mm_cmpeq_epi16(*mask, zero); // return ~mask
}
static INLINE void highbd_filter4_sse2(const __m128i mask, const __m128i hev,
const __m128i p1, const __m128i p0,
const __m128i q1, const __m128i q0,
__m128i *ps1, __m128i *ps0,
__m128i *qs0, __m128i *qs1,
int bd) {
const __m128i t4 = _mm_set1_epi16(4);
const __m128i t3 = _mm_set1_epi16(3);
const __m128i t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), bd - 8);
const __m128i t1 = _mm_set1_epi16(0x1);
__m128i filt, work, filter1, filter2;
*ps1 = _mm_subs_epi16(p1, t80);
*qs1 = _mm_subs_epi16(q1, t80);
*ps0 = _mm_subs_epi16(p0, t80);
*qs0 = _mm_subs_epi16(q0, t80);
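// Bias p1, p0, q0, q1 by t80 (0x80 scaled to bd) into the signed filter domain.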
filt = _mm_and_si128(
signed_char_clamp_bd_sse2(_mm_subs_epi16(*ps1, *qs1), bd), hev);
work = _mm_subs_epi16(*qs0, *ps0);
filt = _mm_adds_epi16(filt, work);
filt = _mm_adds_epi16(filt, work);
filt = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, work), bd);
filt = _mm_and_si128(filt, mask);
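// filt now holds clamp((clamp(ps1 - qs1) & hev) + 3 * (qs0 - ps0)) & mask,
// matching the C reference (vp9_filter + 3 * (qs0 - ps0)) & mask.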
filter1 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t4), bd);
filter2 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t3), bd);
// Filter1 >> 3, Filter2 >> 3
filter1 = _mm_srai_epi16(filter1, 0x3);
filter2 = _mm_srai_epi16(filter2, 0x3);
*qs0 = _mm_adds_epi16(
signed_char_clamp_bd_sse2(_mm_subs_epi16(*qs0, filter1), bd),
t80);
*ps0 = _mm_adds_epi16(
signed_char_clamp_bd_sse2(_mm_adds_epi16(*ps0, filter2), bd),
t80);
filt = _mm_adds_epi16(filter1, t1);
filt = _mm_srai_epi16(filt, 1);
filt = _mm_andnot_si128(hev, filt);
*qs1 = _mm_adds_epi16(
signed_char_clamp_bd_sse2(_mm_subs_epi16(*qs1, filt), bd),
t80);
*ps1 = _mm_adds_epi16(
signed_char_clamp_bd_sse2(_mm_adds_epi16(*ps1, filt), bd),
t80);
}
static INLINE void apply_7tap_filter(const __m128i p3, const __m128i p2,
const __m128i p1, const __m128i p0,
const __m128i q0, const __m128i q1,
const __m128i q2, const __m128i q3,
uint16_t* flat_op2, uint16_t* flat_op1,
uint16_t* flat_op0, uint16_t* flat_oq0,
uint16_t* flat_oq1, uint16_t* flat_oq2) {
__m128i workp_a, workp_b, workp_shft;
const __m128i four = _mm_set1_epi16(4);
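// Adding four ahead of each >> 3 gives ROUND_POWER_OF_TWO rounding.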
workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
_mm_store_si128((__m128i *)flat_op2, workp_shft);
workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
_mm_store_si128((__m128i *)flat_op1, workp_shft);
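// From here each output updates the running sums in place: subtract the tap
// leaving the averaging window and add the one entering it.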
workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
_mm_store_si128((__m128i *)flat_op0, workp_shft);
workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
_mm_store_si128((__m128i *)flat_oq0, workp_shft);
workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
_mm_store_si128((__m128i *)flat_oq1, workp_shft);
workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
_mm_store_si128((__m128i *)flat_oq2, workp_shft);
}
static void highbd_mb_lpf_horizontal_edge_w_sse2_8(uint16_t *s,
int p,
const uint8_t *_blimit,
@@ -45,6 +167,7 @@ static void highbd_mb_lpf_horizontal_edge_w_sse2_8(uint16_t *s,
int bd) {
const __m128i zero = _mm_set1_epi16(0);
const __m128i one = _mm_set1_epi16(1);
const __m128i ffff = _mm_cmpeq_epi16(zero, zero);
const __m128i blimit = _mm_slli_epi16(
_mm_unpacklo_epi8(
_mm_load_si128((const __m128i *)_blimit), zero), bd - 8);
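// The 8-bit threshold is widened to 16 bits and scaled up to the bit depth.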
@@ -56,8 +179,8 @@ static void highbd_mb_lpf_horizontal_edge_w_sse2_8(uint16_t *s,
__m128i q7, p7, q6, p6, q5, p5, q4, p4, q3, p3, q2, p2, q1, p1, q0, p0;
__m128i mask, hev, flat, flat2, abs_p1p0, abs_q1q0;
__m128i ps1, qs1, ps0, qs0;
__m128i abs_p0q0, abs_p1q1, ffff, work;
__m128i filt, work_a, filter1, filter2;
__m128i abs_p0q0, abs_p1q1, work;
__m128i abs_p2p1, abs_q2q1, abs_p3p2, abs_q3q2;
__m128i flat2_q6, flat2_p6, flat2_q5, flat2_p5, flat2_q4, flat2_p4;
__m128i flat2_q3, flat2_p3, flat2_q2, flat2_p2, flat2_q1, flat2_p1;
__m128i flat2_q0, flat2_p0;
@@ -65,7 +188,6 @@ static void highbd_mb_lpf_horizontal_edge_w_sse2_8(uint16_t *s,
__m128i pixelFilter_p, pixelFilter_q;
__m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
__m128i sum_p7, sum_q7, sum_p3, sum_q3;
__m128i t4, t3, t80, t1;
__m128i eight, four;
q4 = _mm_load_si128((__m128i *)(s + 4 * p));
@@ -80,98 +202,25 @@ static void highbd_mb_lpf_horizontal_edge_w_sse2_8(uint16_t *s,
p0 = _mm_load_si128((__m128i *)(s - 1 * p));
// highbd_filter_mask
abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1));
abs_q1q0 = _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1));
abs_p1p0 = highbd_abs_diff(p1, p0);
abs_q1q0 = highbd_abs_diff(q1, q0);
abs_p0q0 = highbd_abs_diff(p0, q0);
abs_p1q1 = highbd_abs_diff(p1, q1);
abs_p2p1 = highbd_abs_diff(p2, p1);
abs_q2q1 = highbd_abs_diff(q2, q1);
abs_p3p2 = highbd_abs_diff(p3, p2);
abs_q3q2 = highbd_abs_diff(q3, q2);
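// With the lane-wise differences precomputed once, the shared
// get_hev_and_mask helper below derives both masks from them.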
ffff = _mm_cmpeq_epi16(abs_p1p0, abs_p1p0);
abs_p0q0 = _mm_or_si128(_mm_subs_epu16(p0, q0), _mm_subs_epu16(q0, p0));
abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1), _mm_subs_epu16(q1, p1));
// highbd_hev_mask (in C code this is actually called from highbd_filter4)
flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
hev = _mm_subs_epu16(flat, thresh);
hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff);
abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0); // abs(p0 - q0) * 2
abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); // abs(p1 - q1) / 2
mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit);
mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one));
work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p1, p0),
_mm_subs_epu16(p0, p1)),
_mm_or_si128(_mm_subs_epu16(q1, q0),
_mm_subs_epu16(q0, q1)));
mask = _mm_max_epi16(work, mask);
work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p2, p1),
_mm_subs_epu16(p1, p2)),
_mm_or_si128(_mm_subs_epu16(q2, q1),
_mm_subs_epu16(q1, q2)));
mask = _mm_max_epi16(work, mask);
work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p3, p2),
_mm_subs_epu16(p2, p3)),
_mm_or_si128(_mm_subs_epu16(q3, q2),
_mm_subs_epu16(q2, q3)));
mask = _mm_max_epi16(work, mask);
mask = _mm_subs_epu16(mask, limit);
mask = _mm_cmpeq_epi16(mask, zero); // return ~mask
get_hev_and_mask(thresh, limit, blimit, zero, one, ffff, abs_p1p0, abs_q1q0,
abs_p0q0, abs_p1q1, abs_p2p1, abs_q2q1, abs_p3p2, abs_q3q2,
&hev, &mask);
// lp filter
// highbd_filter4
t4 = _mm_set1_epi16(4);
t3 = _mm_set1_epi16(3);
t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), bd - 8);
t1 = _mm_set1_epi16(0x1);
ps1 = _mm_subs_epi16(p1, t80);
qs1 = _mm_subs_epi16(q1, t80);
ps0 = _mm_subs_epi16(p0, t80);
qs0 = _mm_subs_epi16(q0, t80);
filt = _mm_and_si128(
signed_char_clamp_bd_sse2(_mm_subs_epi16(ps1, qs1), bd), hev);
work_a = _mm_subs_epi16(qs0, ps0);
filt = _mm_adds_epi16(filt, work_a);
filt = _mm_adds_epi16(filt, work_a);
filt = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, work_a), bd);
filt = _mm_and_si128(filt, mask);
filter1 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t4), bd);
filter2 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t3), bd);
// Filter1 >> 3, Filter2 >> 3
filter1 = _mm_srai_epi16(filter1, 0x3);
filter2 = _mm_srai_epi16(filter2, 0x3);
qs0 = _mm_adds_epi16(
signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd),
t80);
ps0 = _mm_adds_epi16(
signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd),
t80);
filt = _mm_adds_epi16(filter1, t1);
filt = _mm_srai_epi16(filt, 1);
filt = _mm_andnot_si128(hev, filt);
qs1 = _mm_adds_epi16(
signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd),
t80);
ps1 = _mm_adds_epi16(
signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd),
t80);
// end highbd_filter4
// loopfilter done
highbd_filter4_sse2(mask, hev, p1, p0, q1, q0, &ps1, &ps0, &qs0, &qs1, bd);
// highbd_flat_mask4
flat = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p2, p0),
_mm_subs_epu16(p0, p2)),
_mm_or_si128(_mm_subs_epu16(p3, p0),
_mm_subs_epu16(p0, p3)));
work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(q2, q0),
_mm_subs_epu16(q0, q2)),
_mm_or_si128(_mm_subs_epu16(q3, q0),
_mm_subs_epu16(q0, q3)));
flat = _mm_max_epi16(highbd_abs_diff(p2, p0), highbd_abs_diff(p3, p0));
work = _mm_max_epi16(highbd_abs_diff(q2, q0), highbd_abs_diff(q3, q0));
flat = _mm_max_epi16(work, flat);
work = _mm_max_epi16(abs_p1p0, abs_q1q0);
flat = _mm_max_epi16(work, flat);
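// flat accumulates the worst inner-tap deviation from p0/q0 for the
// flat_mask4 test.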
@@ -192,27 +241,15 @@ static void highbd_mb_lpf_horizontal_edge_w_sse2_8(uint16_t *s,
// highbd_flat_mask5 (the arguments passed in are p0, q0, p4-p7 and q4-q7,
// but they are referred to as p0-p4 and q0-q4 inside that function)
flat2 = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p4, p0),
_mm_subs_epu16(p0, p4)),
_mm_or_si128(_mm_subs_epu16(q4, q0),
_mm_subs_epu16(q0, q4)));
flat2 = _mm_max_epi16(highbd_abs_diff(p4, p0), highbd_abs_diff(q4, q0));
work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p5, p0),
_mm_subs_epu16(p0, p5)),
_mm_or_si128(_mm_subs_epu16(q5, q0),
_mm_subs_epu16(q0, q5)));
work = _mm_max_epi16(highbd_abs_diff(p5, p0), highbd_abs_diff(q5, q0));
flat2 = _mm_max_epi16(work, flat2);
work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p6, p0),
_mm_subs_epu16(p0, p6)),
_mm_or_si128(_mm_subs_epu16(q6, q0),
_mm_subs_epu16(q0, q6)));
work = _mm_max_epi16(highbd_abs_diff(p6, p0), highbd_abs_diff(q6, q0));
flat2 = _mm_max_epi16(work, flat2);
work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p7, p0),
_mm_subs_epu16(p0, p7)),
_mm_or_si128(_mm_subs_epu16(q7, q0),
_mm_subs_epu16(q0, q7)));
work = _mm_max_epi16(highbd_abs_diff(p7, p0), highbd_abs_diff(q7, q0));
flat2 = _mm_max_epi16(work, flat2);
flat2 = _mm_subs_epu16(flat2, _mm_slli_epi16(one, bd - 8));
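// The wide-flat threshold is one scaled to the bit depth: 1 << (bd - 8).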
@@ -225,10 +262,8 @@ static void highbd_mb_lpf_horizontal_edge_w_sse2_8(uint16_t *s,
eight = _mm_set1_epi16(8);
four = _mm_set1_epi16(4);
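// eight and four are the rounding terms for the >> 4 (wide filter) and
// >> 3 (filter8) averages computed from these running sums.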
pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6, p5),
_mm_add_epi16(p4, p3));
pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6, q5),
_mm_add_epi16(q4, q3));
pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6, p5), _mm_add_epi16(p4, p3));
pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6, q5), _mm_add_epi16(q4, q3));
pixetFilter_p2p1p0 = _mm_add_epi16(p0, _mm_add_epi16(p2, p1));
pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);
@@ -237,9 +272,8 @@ static void highbd_mb_lpf_horizontal_edge_w_sse2_8(uint16_t *s,
pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
pixelFilter_p = _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p,
pixelFilter_q));
pixetFilter_p2p1p0 = _mm_add_epi16(four,
_mm_add_epi16(pixetFilter_p2p1p0,
pixetFilter_q2q1q0));
pixetFilter_p2p1p0 = _mm_add_epi16(four, _mm_add_epi16(pixetFilter_p2p1p0,
pixetFilter_q2q1q0));
flat2_p0 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
_mm_add_epi16(p7, p0)), 4);
flat2_q0 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
@@ -486,6 +520,8 @@ void vp9_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
DECLARE_ALIGNED_ARRAY(16, uint16_t, flat_oq1, 16);
DECLARE_ALIGNED_ARRAY(16, uint16_t, flat_oq0, 16);
const __m128i zero = _mm_set1_epi16(0);
const __m128i one = _mm_set1_epi16(1);
const __m128i ffff = _mm_cmpeq_epi16(one, one);
const __m128i blimit = _mm_slli_epi16(
_mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero),
bd - 8);
@@ -504,74 +540,30 @@ void vp9_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
__m128i q1 = _mm_load_si128((__m128i *)(s + 1 * p));
__m128i p0 = _mm_load_si128((__m128i *)(s - 1 * p));
__m128i q0 = _mm_load_si128((__m128i *)(s + 0 * p));
const __m128i one = _mm_set1_epi16(1);
const __m128i ffff = _mm_cmpeq_epi16(one, one);
__m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
const __m128i four = _mm_set1_epi16(4);
__m128i workp_a, workp_b, workp_shft;
const __m128i t4 = _mm_set1_epi16(4);
const __m128i t3 = _mm_set1_epi16(3);
const __m128i t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), bd - 8);
const __m128i t1 = _mm_set1_epi16(0x1);
const __m128i ps1 = _mm_subs_epi16(p1, t80);
const __m128i ps0 = _mm_subs_epi16(p0, t80);
const __m128i qs0 = _mm_subs_epi16(q0, t80);
const __m128i qs1 = _mm_subs_epi16(q1, t80);
__m128i filt;
__m128i abs_p2p1, abs_q2q1, abs_p3p2, abs_q3q2;
__m128i ps0, ps1, qs0, qs1;
__m128i work_a;
__m128i filter1, filter2;
(void)count;
// filter_mask and hev_mask
abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0),
_mm_subs_epu16(p0, p1));
abs_q1q0 = _mm_or_si128(_mm_subs_epu16(q1, q0),
_mm_subs_epu16(q0, q1));
abs_p1p0 = highbd_abs_diff(p1, p0);
abs_q1q0 = highbd_abs_diff(q1, q0);
abs_p0q0 = highbd_abs_diff(p0, q0);
abs_p1q1 = highbd_abs_diff(p1, q1);
abs_p2p1 = highbd_abs_diff(p2, p1);
abs_q2q1 = highbd_abs_diff(q2, q1);
abs_p3p2 = highbd_abs_diff(p3, p2);
abs_q3q2 = highbd_abs_diff(q3, q2);
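// Same up-front absolute-difference setup as the wide edge filter above;
// get_hev_and_mask reuses it below.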
abs_p0q0 = _mm_or_si128(_mm_subs_epu16(p0, q0),
_mm_subs_epu16(q0, p0));
abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1),
_mm_subs_epu16(q1, p1));
flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
hev = _mm_subs_epu16(flat, thresh);
hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff);
abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit);
mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
// mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
// So taking maximums continues to work:
mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one));
mask = _mm_max_epi16(abs_p1p0, mask);
// mask |= (abs(p1 - p0) > limit) * -1;
mask = _mm_max_epi16(abs_q1q0, mask);
// mask |= (abs(q1 - q0) > limit) * -1;
work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p2, p1),
_mm_subs_epu16(p1, p2)),
_mm_or_si128(_mm_subs_epu16(q2, q1),
_mm_subs_epu16(q1, q2)));
mask = _mm_max_epi16(work, mask);
work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p3, p2),
_mm_subs_epu16(p2, p3)),
_mm_or_si128(_mm_subs_epu16(q3, q2),
_mm_subs_epu16(q2, q3)));
mask = _mm_max_epi16(work, mask);
mask = _mm_subs_epu16(mask, limit);
mask = _mm_cmpeq_epi16(mask, zero);
get_hev_and_mask(thresh, limit, blimit, zero, one, ffff, abs_p1p0, abs_q1q0,
abs_p0q0, abs_p1q1, abs_p2p1, abs_q2q1, abs_p3p2, abs_q3q2,
&hev, &mask);
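// get_hev_and_mask() is assumed to fold the inline filter_mask/hev
// computation shown above into a shared helper, returning both result
// vectors through the out-parameters.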
// flat_mask4
flat = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p2, p0),
_mm_subs_epu16(p0, p2)),
_mm_or_si128(_mm_subs_epu16(q2, q0),
_mm_subs_epu16(q0, q2)));
work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p3, p0),
_mm_subs_epu16(p0, p3)),
_mm_or_si128(_mm_subs_epu16(q3, q0),
_mm_subs_epu16(q0, q3)));
flat = _mm_max_epi16(highbd_abs_diff(p2, p0), highbd_abs_diff(q2, q0));
work = _mm_max_epi16(highbd_abs_diff(p3, p0), highbd_abs_diff(q3, q0));
flat = _mm_max_epi16(work, flat);
flat = _mm_max_epi16(abs_p1p0, flat);
flat = _mm_max_epi16(abs_q1q0, flat);
@@ -579,77 +571,20 @@ void vp9_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
flat = _mm_cmpeq_epi16(flat, zero);
flat = _mm_and_si128(flat, mask); // flat & mask
// Added before shift for rounding part of ROUND_POWER_OF_TWO
workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
_mm_store_si128((__m128i *)&flat_op2[0], workp_shft);
workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
_mm_store_si128((__m128i *)&flat_op1[0], workp_shft);
workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
_mm_store_si128((__m128i *)&flat_op0[0], workp_shft);
workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
_mm_store_si128((__m128i *)&flat_oq0[0], workp_shft);
workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
_mm_store_si128((__m128i *)&flat_oq1[0], workp_shft);
workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
_mm_store_si128((__m128i *)&flat_oq2[0], workp_shft);
// Apply 7-tap filter (result used if flat && mask) cf. highbd_filter8
apply_7tap_filter(p3, p2, p1, p0, q0, q1, q2, q3, &flat_op2[0], &flat_op1[0],
&flat_op0[0], &flat_oq0[0], &flat_oq1[0], &flat_oq2[0]);
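// For reference, the first value stored by the replaced sequence above is
//   flat_op2 = ROUND_POWER_OF_TWO(3 * p3 + 2 * p2 + p1 + p0 + q0, 3),
// built from workp_a/workp_b with _mm_add_epi16/_mm_srli_epi16; the helper
// is assumed to reproduce that 7-tap filter for all six outputs.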
// lp filter
filt = signed_char_clamp_bd_sse2(_mm_subs_epi16(ps1, qs1), bd);
filt = _mm_and_si128(filt, hev);
work_a = _mm_subs_epi16(qs0, ps0);
filt = _mm_adds_epi16(filt, work_a);
filt = _mm_adds_epi16(filt, work_a);
filt = _mm_adds_epi16(filt, work_a);
// (vp9_filter + 3 * (qs0 - ps0)) & mask
filt = signed_char_clamp_bd_sse2(filt, bd);
filt = _mm_and_si128(filt, mask);
highbd_filter4_sse2(mask, hev, p1, p0, q1, q0, &ps1, &ps0, &qs0, &qs1, bd);
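// highbd_filter4_sse2() is assumed to absorb the surrounding filter4 math
// (clamp(ps1 - qs1) & hev, plus 3 * (qs0 - ps0), then Filter1/Filter2 >> 3)
// and to return the filtered, t80-re-offset rows through ps1/ps0/qs0/qs1.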
filter1 = _mm_adds_epi16(filt, t4);
filter2 = _mm_adds_epi16(filt, t3);
// Filter1 >> 3
filter1 = signed_char_clamp_bd_sse2(filter1, bd);
filter1 = _mm_srai_epi16(filter1, 3);
// Filter2 >> 3
filter2 = signed_char_clamp_bd_sse2(filter2, bd);
filter2 = _mm_srai_epi16(filter2, 3);
// filt >> 1
filt = _mm_adds_epi16(filter1, t1);
filt = _mm_srai_epi16(filt, 1);
// filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev;
filt = _mm_andnot_si128(hev, filt);
work_a = signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd);
work_a = _mm_adds_epi16(work_a, t80);
q0 = _mm_load_si128((__m128i *)flat_oq0);
work_a = _mm_andnot_si128(flat, work_a);
work_a = _mm_andnot_si128(flat, qs0);
q0 = _mm_and_si128(flat, q0);
q0 = _mm_or_si128(work_a, q0);
work_a = signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd);
work_a = _mm_adds_epi16(work_a, t80);
q1 = _mm_load_si128((__m128i *)flat_oq1);
work_a = _mm_andnot_si128(flat, work_a);
work_a = _mm_andnot_si128(flat, qs1);
q1 = _mm_and_si128(flat, q1);
q1 = _mm_or_si128(work_a, q1);
@@ -659,17 +594,13 @@ void vp9_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
q2 = _mm_and_si128(flat, q2);
q2 = _mm_or_si128(work_a, q2);
work_a = signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd);
work_a = _mm_adds_epi16(work_a, t80);
p0 = _mm_load_si128((__m128i *)flat_op0);
work_a = _mm_andnot_si128(flat, work_a);
work_a = _mm_andnot_si128(flat, ps0);
p0 = _mm_and_si128(flat, p0);
p0 = _mm_or_si128(work_a, p0);
work_a = signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd);
work_a = _mm_adds_epi16(work_a, t80);
p1 = _mm_load_si128((__m128i *)flat_op1);
work_a = _mm_andnot_si128(flat, work_a);
work_a = _mm_andnot_si128(flat, ps1);
p1 = _mm_and_si128(flat, p1);
p1 = _mm_or_si128(work_a, p1);
@@ -715,7 +646,7 @@ void vp9_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p,
const __m128i thresh = _mm_slli_epi16(
_mm_unpacklo_epi8(
_mm_load_si128((const __m128i *)_thresh), zero), bd - 8);
__m128i mask, hev, flat;
__m128i mask, hev;
__m128i p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
__m128i p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
__m128i p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
@@ -724,121 +655,36 @@ void vp9_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p,
__m128i q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
__m128i q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
__m128i q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0),
_mm_subs_epu16(p0, p1));
const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu16(q1, q0),
_mm_subs_epu16(q0, q1));
const __m128i ffff = _mm_cmpeq_epi16(abs_p1p0, abs_p1p0);
const __m128i ffff = _mm_cmpeq_epi16(zero, zero);
const __m128i one = _mm_set1_epi16(1);
__m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu16(p0, q0),
_mm_subs_epu16(q0, p0));
__m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1),
_mm_subs_epu16(q1, p1));
__m128i work;
const __m128i t4 = _mm_set1_epi16(4);
const __m128i t3 = _mm_set1_epi16(3);
const __m128i t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), bd - 8);
const __m128i tff80 = _mm_slli_epi16(_mm_set1_epi16(0xff80), bd - 8);
const __m128i tffe0 = _mm_slli_epi16(_mm_set1_epi16(0xffe0), bd - 8);
const __m128i t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 16 - bd);
// equivalent to shifting 0x1f left by bitdepth - 8
// and setting new bits to 1
const __m128i t1 = _mm_set1_epi16(0x1);
const __m128i t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 16 - bd);
// equivalent to shifting 0x7f left by bitdepth - 8
// and setting new bits to 1
const __m128i ps1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 2 * p)),
t80);
const __m128i ps0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 1 * p)),
t80);
const __m128i qs0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 0 * p)),
t80);
const __m128i qs1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 1 * p)),
t80);
__m128i filt;
__m128i work_a;
__m128i filter1, filter2;
__m128i ps1, ps0, qs0, qs1;
__m128i abs_p1p0, abs_q1q0, abs_p0q0, abs_p1q1;
__m128i abs_p2p1, abs_q2q1, abs_p3p2, abs_q3q2;
(void)count;
// filter_mask and hev_mask
flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
hev = _mm_subs_epu16(flat, thresh);
hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff);
abs_p1p0 = highbd_abs_diff(p1, p0);
abs_q1q0 = highbd_abs_diff(q1, q0);
abs_p0q0 = highbd_abs_diff(p0, q0);
abs_p1q1 = highbd_abs_diff(p1, q1);
abs_p2p1 = highbd_abs_diff(p2, p1);
abs_q2q1 = highbd_abs_diff(q2, q1);
abs_p3p2 = highbd_abs_diff(p3, p2);
abs_q3q2 = highbd_abs_diff(q3, q2);
abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit);
mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
// mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
// So taking maximums continues to work:
mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one));
mask = _mm_max_epi16(flat, mask);
// mask |= (abs(p1 - p0) > limit) * -1;
// mask |= (abs(q1 - q0) > limit) * -1;
work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p2, p1),
_mm_subs_epu16(p1, p2)),
_mm_or_si128(_mm_subs_epu16(p3, p2),
_mm_subs_epu16(p2, p3)));
mask = _mm_max_epi16(work, mask);
work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(q2, q1),
_mm_subs_epu16(q1, q2)),
_mm_or_si128(_mm_subs_epu16(q3, q2),
_mm_subs_epu16(q2, q3)));
mask = _mm_max_epi16(work, mask);
mask = _mm_subs_epu16(mask, limit);
mask = _mm_cmpeq_epi16(mask, zero);
get_hev_and_mask(thresh, limit, blimit, zero, one, ffff, abs_p1p0, abs_q1q0,
abs_p0q0, abs_p1q1, abs_p2p1, abs_q2q1, abs_p3p2, abs_q3q2,
&hev, &mask);
// filter4
filt = signed_char_clamp_bd_sse2(_mm_subs_epi16(ps1, qs1), bd);
filt = _mm_and_si128(filt, hev);
work_a = _mm_subs_epi16(qs0, ps0);
filt = _mm_adds_epi16(filt, work_a);
filt = _mm_adds_epi16(filt, work_a);
filt = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, work_a), bd);
// (vp9_filter + 3 * (qs0 - ps0)) & mask
filt = _mm_and_si128(filt, mask);
highbd_filter4_sse2(mask, hev, p1, p0, q1, q0, &ps1, &ps0, &qs0, &qs1, bd);
filter1 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t4), bd);
filter2 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t3), bd);
// Filter1 >> 3
work_a = _mm_cmpgt_epi16(zero, filter1); // get the values that are <0
filter1 = _mm_srli_epi16(filter1, 3);
work_a = _mm_and_si128(work_a, tffe0); // sign bits for the values < 0
filter1 = _mm_and_si128(filter1, t1f); // clamp the range
filter1 = _mm_or_si128(filter1, work_a); // reinsert the sign bits
// Filter2 >> 3
work_a = _mm_cmpgt_epi16(zero, filter2);
filter2 = _mm_srli_epi16(filter2, 3);
work_a = _mm_and_si128(work_a, tffe0);
filter2 = _mm_and_si128(filter2, t1f);
filter2 = _mm_or_si128(filter2, work_a);
// filt >> 1
filt = _mm_adds_epi16(filter1, t1);
work_a = _mm_cmpgt_epi16(zero, filt);
filt = _mm_srli_epi16(filt, 1);
work_a = _mm_and_si128(work_a, tff80);
filt = _mm_and_si128(filt, t7f);
filt = _mm_or_si128(filt, work_a);
filt = _mm_andnot_si128(hev, filt);
q0 = _mm_adds_epi16(
signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd), t80);
q1 = _mm_adds_epi16(
signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd), t80);
p0 = _mm_adds_epi16(
signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd), t80);
p1 = _mm_adds_epi16(
signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd), t80);
_mm_storeu_si128((__m128i *)(s - 2 * p), p1);
_mm_storeu_si128((__m128i *)(s - 1 * p), p0);
_mm_storeu_si128((__m128i *)(s + 0 * p), q0);
_mm_storeu_si128((__m128i *)(s + 1 * p), q1);
_mm_storeu_si128((__m128i *)(s - 2 * p), ps1);
_mm_storeu_si128((__m128i *)(s - 1 * p), ps0);
_mm_storeu_si128((__m128i *)(s + 0 * p), qs0);
_mm_storeu_si128((__m128i *)(s + 1 * p), qs1);
}
void vp9_highbd_lpf_horizontal_4_dual_sse2(uint16_t *s, int p,


@@ -19,42 +19,34 @@
mov rcx, 0x00000040
movdqa xmm7, [rdx] ;load filters
pshuflw xmm0, xmm7, 0b ;k0
pshuflw xmm1, xmm7, 01010101b ;k1
pshuflw xmm2, xmm7, 10101010b ;k2
pshuflw xmm3, xmm7, 11111111b ;k3
pshuflw xmm10, xmm7, 0b ;k0
pshuflw xmm11, xmm7, 01010101b ;k1
pshuflw xmm12, xmm7, 10101010b ;k2
pshuflw xmm13, xmm7, 11111111b ;k3
psrldq xmm7, 8
pshuflw xmm4, xmm7, 0b ;k4
pshuflw xmm5, xmm7, 01010101b ;k5
pshuflw xmm6, xmm7, 10101010b ;k6
pshuflw xmm7, xmm7, 11111111b ;k7
punpcklwd xmm0, xmm6
punpcklwd xmm2, xmm5
punpcklwd xmm3, xmm4
punpcklwd xmm1, xmm7
punpcklwd xmm10, xmm6
punpcklwd xmm12, xmm5
punpcklwd xmm13, xmm4
punpcklwd xmm11, xmm7
movdqa k0k6, xmm0
movdqa k2k5, xmm2
movdqa k3k4, xmm3
movdqa k1k7, xmm1
movq xmm6, rcx
pshufd xmm6, xmm6, 0
movdqa krd, xmm6
movq xmm9, rcx
pshufd xmm9, xmm9, 0
;Compute max and min values of a pixel
mov rdx, 0x00010001
movsxd rcx, DWORD PTR arg(6) ;bps
movq xmm0, rdx
movq xmm14, rdx
movq xmm1, rcx
pshufd xmm0, xmm0, 0b
movdqa xmm2, xmm0
psllw xmm0, xmm1
psubw xmm0, xmm2
pxor xmm1, xmm1
movdqa max, xmm0 ;max value (for clamping)
movdqa min, xmm1 ;min value (for clamping)
pshufd xmm14, xmm14, 0b
movdqa xmm2, xmm14
psllw xmm14, xmm1
psubw xmm14, xmm2 ;max value (for clamping)
pxor xmm8, xmm8 ;min value (for clamping)
%endm
@@ -64,22 +56,22 @@
punpcklwd xmm2, xmm5
punpcklwd xmm3, xmm4
pmaddwd xmm0, k0k6 ;multiply the filter factors
pmaddwd xmm1, k1k7
pmaddwd xmm2, k2k5
pmaddwd xmm3, k3k4
pmaddwd xmm0, xmm10 ;multiply the filter factors
pmaddwd xmm1, xmm11
pmaddwd xmm2, xmm12
pmaddwd xmm3, xmm13
paddd xmm0, xmm1 ;sum
paddd xmm0, xmm2
paddd xmm0, xmm3
paddd xmm0, krd ;rounding
paddd xmm0, xmm9 ;rounding
psrad xmm0, 7 ;shift
packssdw xmm0, xmm0 ;pack to word
;clamp the values
pminsw xmm0, max
pmaxsw xmm0, min
pminsw xmm0, xmm14
pmaxsw xmm0, xmm8
%if %1
movq xmm1, [rdi]
@@ -95,42 +87,34 @@
mov rcx, 0x00000040
movdqa xmm7, [rdx] ;load filters
pshuflw xmm0, xmm7, 0b ;k0
pshuflw xmm10, xmm7, 0b ;k0
pshuflw xmm1, xmm7, 01010101b ;k1
pshuflw xmm2, xmm7, 10101010b ;k2
pshuflw xmm3, xmm7, 11111111b ;k3
pshuflw xmm12, xmm7, 10101010b ;k2
pshuflw xmm13, xmm7, 11111111b ;k3
pshufhw xmm4, xmm7, 0b ;k4
pshufhw xmm5, xmm7, 01010101b ;k5
pshufhw xmm6, xmm7, 10101010b ;k6
pshufhw xmm11, xmm7, 10101010b ;k6
pshufhw xmm7, xmm7, 11111111b ;k7
punpcklqdq xmm2, xmm2
punpcklqdq xmm3, xmm3
punpcklwd xmm0, xmm1
punpckhwd xmm6, xmm7
punpckhwd xmm2, xmm5
punpckhwd xmm3, xmm4
punpcklqdq xmm12, xmm12
punpcklqdq xmm13, xmm13
punpcklwd xmm10, xmm1
punpckhwd xmm11, xmm7
punpckhwd xmm12, xmm5
punpckhwd xmm13, xmm4
movdqa k0k1, xmm0 ;store filter factors on stack
movdqa k6k7, xmm6
movdqa k2k5, xmm2
movdqa k3k4, xmm3
movq xmm6, rcx
pshufd xmm6, xmm6, 0
movdqa krd, xmm6 ;rounding
movq xmm9, rcx
pshufd xmm9, xmm9, 0 ;rounding
;Compute max and min values of a pixel
mov rdx, 0x00010001
movsxd rcx, DWORD PTR arg(6) ;bps
movq xmm0, rdx
movq xmm14, rdx
movq xmm1, rcx
pshufd xmm0, xmm0, 0b
movdqa xmm2, xmm0
psllw xmm0, xmm1
psubw xmm0, xmm2
pxor xmm1, xmm1
movdqa max, xmm0 ;max value (for clamping)
movdqa min, xmm1 ;min value (for clamping)
pshufd xmm14, xmm14, 0b
movdqa xmm2, xmm14
psllw xmm14, xmm1
psubw xmm14, xmm2 ;max value (for clamping)
pxor xmm15, xmm15 ;min value (for clamping)
%endm
%macro LOAD_VERT_8 1
@@ -146,7 +130,7 @@
%endm
%macro HIGH_APPLY_FILTER_8 2
movdqu temp, xmm4
movdqa xmm8, xmm4
movdqa xmm4, xmm0
punpcklwd xmm0, xmm1
punpckhwd xmm4, xmm1
@@ -157,21 +141,21 @@
punpcklwd xmm2, xmm5
punpckhwd xmm7, xmm5
movdqu xmm5, temp
movdqu temp, xmm4
movdqa xmm5, xmm8
movdqa xmm8, xmm4
movdqa xmm4, xmm3
punpcklwd xmm3, xmm5
punpckhwd xmm4, xmm5
movdqu xmm5, temp
movdqa xmm5, xmm8
pmaddwd xmm0, k0k1
pmaddwd xmm5, k0k1
pmaddwd xmm6, k6k7
pmaddwd xmm1, k6k7
pmaddwd xmm2, k2k5
pmaddwd xmm7, k2k5
pmaddwd xmm3, k3k4
pmaddwd xmm4, k3k4
pmaddwd xmm0, xmm10
pmaddwd xmm5, xmm10
pmaddwd xmm6, xmm11
pmaddwd xmm1, xmm11
pmaddwd xmm2, xmm12
pmaddwd xmm7, xmm12
pmaddwd xmm3, xmm13
pmaddwd xmm4, xmm13
paddd xmm0, xmm6
paddd xmm0, xmm2
@@ -180,15 +164,15 @@
paddd xmm5, xmm7
paddd xmm5, xmm4
paddd xmm0, krd ;rounding
paddd xmm5, krd
paddd xmm0, xmm9 ;rounding
paddd xmm5, xmm9
psrad xmm0, 7 ;shift
psrad xmm5, 7
packssdw xmm0, xmm5 ;pack back to word
;clamp the values
pminsw xmm0, max
pmaxsw xmm0, min
pminsw xmm0, xmm14
pmaxsw xmm0, xmm15
%if %1
movdqu xmm1, [rdi + %2]
@@ -211,22 +195,12 @@ sym(vp9_highbd_filter_block1d4_v8_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SAVE_XMM 7
SAVE_XMM 14
push rsi
push rdi
push rbx
; end prolog
ALIGN_STACK 16, rax
sub rsp, 16 * 7
%define k0k6 [rsp + 16 * 0]
%define k2k5 [rsp + 16 * 1]
%define k3k4 [rsp + 16 * 2]
%define k1k7 [rsp + 16 * 3]
%define krd [rsp + 16 * 4]
%define max [rsp + 16 * 5]
%define min [rsp + 16 * 6]
HIGH_GET_FILTERS_4
mov rsi, arg(0) ;src_ptr
@@ -256,8 +230,6 @@ sym(vp9_highbd_filter_block1d4_v8_sse2):
dec rcx
jnz .loop
add rsp, 16 * 7
pop rsp
pop rbx
; begin epilog
pop rdi
@@ -281,23 +253,12 @@ sym(vp9_highbd_filter_block1d8_v8_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SAVE_XMM 7
SAVE_XMM 15
push rsi
push rdi
push rbx
; end prolog
ALIGN_STACK 16, rax
sub rsp, 16 * 8
%define k0k1 [rsp + 16 * 0]
%define k6k7 [rsp + 16 * 1]
%define k2k5 [rsp + 16 * 2]
%define k3k4 [rsp + 16 * 3]
%define krd [rsp + 16 * 4]
%define temp [rsp + 16 * 5]
%define max [rsp + 16 * 6]
%define min [rsp + 16 * 7]
HIGH_GET_FILTERS
movsxd rax, DWORD PTR arg(1) ;pixels_per_line
@@ -315,8 +276,6 @@ sym(vp9_highbd_filter_block1d8_v8_sse2):
dec rcx
jnz .loop
add rsp, 16 * 8
pop rsp
pop rbx
; begin epilog
pop rdi
@@ -340,23 +299,12 @@ sym(vp9_highbd_filter_block1d16_v8_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SAVE_XMM 7
SAVE_XMM 15
push rsi
push rdi
push rbx
; end prolog
ALIGN_STACK 16, rax
sub rsp, 16 * 8
%define k0k1 [rsp + 16 * 0]
%define k6k7 [rsp + 16 * 1]
%define k2k5 [rsp + 16 * 2]
%define k3k4 [rsp + 16 * 3]
%define krd [rsp + 16 * 4]
%define temp [rsp + 16 * 5]
%define max [rsp + 16 * 6]
%define min [rsp + 16 * 7]
HIGH_GET_FILTERS
movsxd rax, DWORD PTR arg(1) ;pixels_per_line
@@ -378,8 +326,6 @@ sym(vp9_highbd_filter_block1d16_v8_sse2):
dec rcx
jnz .loop
add rsp, 16 * 8
pop rsp
pop rbx
; begin epilog
pop rdi
@@ -394,22 +340,12 @@ sym(vp9_highbd_filter_block1d4_v8_avg_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SAVE_XMM 7
SAVE_XMM 14
push rsi
push rdi
push rbx
; end prolog
ALIGN_STACK 16, rax
sub rsp, 16 * 7
%define k0k6 [rsp + 16 * 0]
%define k2k5 [rsp + 16 * 1]
%define k3k4 [rsp + 16 * 2]
%define k1k7 [rsp + 16 * 3]
%define krd [rsp + 16 * 4]
%define max [rsp + 16 * 5]
%define min [rsp + 16 * 6]
HIGH_GET_FILTERS_4
mov rsi, arg(0) ;src_ptr
@@ -439,8 +375,6 @@ sym(vp9_highbd_filter_block1d4_v8_avg_sse2):
dec rcx
jnz .loop
add rsp, 16 * 7
pop rsp
pop rbx
; begin epilog
pop rdi
@@ -455,23 +389,12 @@ sym(vp9_highbd_filter_block1d8_v8_avg_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SAVE_XMM 7
SAVE_XMM 15
push rsi
push rdi
push rbx
; end prolog
ALIGN_STACK 16, rax
sub rsp, 16 * 8
%define k0k1 [rsp + 16 * 0]
%define k6k7 [rsp + 16 * 1]
%define k2k5 [rsp + 16 * 2]
%define k3k4 [rsp + 16 * 3]
%define krd [rsp + 16 * 4]
%define temp [rsp + 16 * 5]
%define max [rsp + 16 * 6]
%define min [rsp + 16 * 7]
HIGH_GET_FILTERS
movsxd rax, DWORD PTR arg(1) ;pixels_per_line
@@ -488,8 +411,6 @@ sym(vp9_highbd_filter_block1d8_v8_avg_sse2):
dec rcx
jnz .loop
add rsp, 16 * 8
pop rsp
pop rbx
; begin epilog
pop rdi
@@ -504,23 +425,12 @@ sym(vp9_highbd_filter_block1d16_v8_avg_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SAVE_XMM 7
SAVE_XMM 15
push rsi
push rdi
push rbx
; end prolog
ALIGN_STACK 16, rax
sub rsp, 16 * 8
%define k0k1 [rsp + 16 * 0]
%define k6k7 [rsp + 16 * 1]
%define k2k5 [rsp + 16 * 2]
%define k3k4 [rsp + 16 * 3]
%define krd [rsp + 16 * 4]
%define temp [rsp + 16 * 5]
%define max [rsp + 16 * 6]
%define min [rsp + 16 * 7]
HIGH_GET_FILTERS
movsxd rax, DWORD PTR arg(1) ;pixels_per_line
@@ -541,8 +451,6 @@ sym(vp9_highbd_filter_block1d16_v8_avg_sse2):
dec rcx
jnz .loop
add rsp, 16 * 8
pop rsp
pop rbx
; begin epilog
pop rdi
@@ -566,21 +474,11 @@ sym(vp9_highbd_filter_block1d4_h8_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SAVE_XMM 7
SAVE_XMM 14
push rsi
push rdi
; end prolog
ALIGN_STACK 16, rax
sub rsp, 16 * 7
%define k0k6 [rsp + 16 * 0]
%define k2k5 [rsp + 16 * 1]
%define k3k4 [rsp + 16 * 2]
%define k1k7 [rsp + 16 * 3]
%define krd [rsp + 16 * 4]
%define max [rsp + 16 * 5]
%define min [rsp + 16 * 6]
HIGH_GET_FILTERS_4
mov rsi, arg(0) ;src_ptr
@@ -592,6 +490,16 @@ sym(vp9_highbd_filter_block1d4_h8_sse2):
lea rdx, [rdx + rdx]
movsxd rcx, DWORD PTR arg(4) ;output_height
;prefetch the source rows once before the filter loop
.load:
prefetcht0 [rsi - 6]
prefetcht0 [rsi + 17]
lea rsi, [rsi + rax]
dec rcx
jnz .load
mov rsi, arg(0) ;src_ptr
movsxd rcx, DWORD PTR arg(4) ;output_height
.loop:
movdqu xmm0, [rsi - 6] ;load src
movdqu xmm4, [rsi + 2]
@@ -616,9 +524,6 @@ sym(vp9_highbd_filter_block1d4_h8_sse2):
dec rcx
jnz .loop
add rsp, 16 * 7
pop rsp
; begin epilog
pop rdi
pop rsi
@@ -641,22 +546,11 @@ sym(vp9_highbd_filter_block1d8_h8_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SAVE_XMM 7
SAVE_XMM 15
push rsi
push rdi
; end prolog
ALIGN_STACK 16, rax
sub rsp, 16 * 8
%define k0k1 [rsp + 16 * 0]
%define k6k7 [rsp + 16 * 1]
%define k2k5 [rsp + 16 * 2]
%define k3k4 [rsp + 16 * 3]
%define krd [rsp + 16 * 4]
%define temp [rsp + 16 * 5]
%define max [rsp + 16 * 6]
%define min [rsp + 16 * 7]
HIGH_GET_FILTERS
movsxd rax, DWORD PTR arg(1) ;pixels_per_line
@@ -665,6 +559,16 @@ sym(vp9_highbd_filter_block1d8_h8_sse2):
lea rdx, [rdx + rdx]
movsxd rcx, DWORD PTR arg(4) ;output_height
;prefetch the source rows once before the filter loop
.load:
prefetcht0 [rsi - 6]
prefetcht0 [rsi + 23]
lea rsi, [rsi + rax]
dec rcx
jnz .load
mov rsi, arg(0) ;src_ptr
movsxd rcx, DWORD PTR arg(4) ;output_height
.loop:
movdqu xmm0, [rsi - 6] ;load src
movdqu xmm1, [rsi - 4]
@@ -682,9 +586,6 @@ sym(vp9_highbd_filter_block1d8_h8_sse2):
dec rcx
jnz .loop
add rsp, 16 * 8
pop rsp
; begin epilog
pop rdi
pop rsi
@@ -707,22 +608,11 @@ sym(vp9_highbd_filter_block1d16_h8_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SAVE_XMM 7
SAVE_XMM 15
push rsi
push rdi
; end prolog
ALIGN_STACK 16, rax
sub rsp, 16 * 8
%define k0k1 [rsp + 16 * 0]
%define k6k7 [rsp + 16 * 1]
%define k2k5 [rsp + 16 * 2]
%define k3k4 [rsp + 16 * 3]
%define krd [rsp + 16 * 4]
%define temp [rsp + 16 * 5]
%define max [rsp + 16 * 6]
%define min [rsp + 16 * 7]
HIGH_GET_FILTERS
movsxd rax, DWORD PTR arg(1) ;pixels_per_line
@@ -731,6 +621,16 @@ sym(vp9_highbd_filter_block1d16_h8_sse2):
lea rdx, [rdx + rdx]
movsxd rcx, DWORD PTR arg(4) ;output_height
;prefetch the source rows once before the filter loop
.load:
prefetcht0 [rsi - 6]
prefetcht0 [rsi + 31]
lea rsi, [rsi + rax]
dec rcx
jnz .load
mov rsi, arg(0) ;src_ptr
movsxd rcx, DWORD PTR arg(4) ;output_height
.loop:
movdqu xmm0, [rsi - 6] ;load src
movdqu xmm1, [rsi - 4]
@@ -759,9 +659,6 @@ sym(vp9_highbd_filter_block1d16_h8_sse2):
dec rcx
jnz .loop
add rsp, 16 * 8
pop rsp
; begin epilog
pop rdi
pop rsi
@@ -775,21 +672,11 @@ sym(vp9_highbd_filter_block1d4_h8_avg_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SAVE_XMM 7
SAVE_XMM 14
push rsi
push rdi
; end prolog
ALIGN_STACK 16, rax
sub rsp, 16 * 7
%define k0k6 [rsp + 16 * 0]
%define k2k5 [rsp + 16 * 1]
%define k3k4 [rsp + 16 * 2]
%define k1k7 [rsp + 16 * 3]
%define krd [rsp + 16 * 4]
%define max [rsp + 16 * 5]
%define min [rsp + 16 * 6]
HIGH_GET_FILTERS_4
mov rsi, arg(0) ;src_ptr
@@ -801,6 +688,16 @@ sym(vp9_highbd_filter_block1d4_h8_avg_sse2):
lea rdx, [rdx + rdx]
movsxd rcx, DWORD PTR arg(4) ;output_height
;prefetch the source rows once before the filter loop
.load:
prefetcht0 [rsi - 6]
prefetcht0 [rsi + 17]
lea rsi, [rsi + rax]
dec rcx
jnz .load
mov rsi, arg(0) ;src_ptr
movsxd rcx, DWORD PTR arg(4) ;output_height
.loop:
movdqu xmm0, [rsi - 6] ;load src
movdqu xmm4, [rsi + 2]
@@ -825,9 +722,6 @@ sym(vp9_highbd_filter_block1d4_h8_avg_sse2):
dec rcx
jnz .loop
add rsp, 16 * 7
pop rsp
; begin epilog
pop rdi
pop rsi
@@ -841,22 +735,11 @@ sym(vp9_highbd_filter_block1d8_h8_avg_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SAVE_XMM 7
SAVE_XMM 15
push rsi
push rdi
; end prolog
ALIGN_STACK 16, rax
sub rsp, 16 * 8
%define k0k1 [rsp + 16 * 0]
%define k6k7 [rsp + 16 * 1]
%define k2k5 [rsp + 16 * 2]
%define k3k4 [rsp + 16 * 3]
%define krd [rsp + 16 * 4]
%define temp [rsp + 16 * 5]
%define max [rsp + 16 * 6]
%define min [rsp + 16 * 7]
HIGH_GET_FILTERS
movsxd rax, DWORD PTR arg(1) ;pixels_per_line
@@ -865,6 +748,16 @@ sym(vp9_highbd_filter_block1d8_h8_avg_sse2):
lea rdx, [rdx + rdx]
movsxd rcx, DWORD PTR arg(4) ;output_height
;prefetch the source rows once before the filter loop
.load:
prefetcht0 [rsi - 6]
prefetcht0 [rsi + 23]
lea rsi, [rsi + rax]
dec rcx
jnz .load
mov rsi, arg(0) ;src_ptr
movsxd rcx, DWORD PTR arg(4) ;output_height
.loop:
movdqu xmm0, [rsi - 6] ;load src
movdqu xmm1, [rsi - 4]
@@ -882,8 +775,6 @@ sym(vp9_highbd_filter_block1d8_h8_avg_sse2):
dec rcx
jnz .loop
add rsp, 16 * 8
pop rsp
; begin epilog
pop rdi
@@ -898,22 +789,11 @@ sym(vp9_highbd_filter_block1d16_h8_avg_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SAVE_XMM 7
SAVE_XMM 15
push rsi
push rdi
; end prolog
ALIGN_STACK 16, rax
sub rsp, 16 * 8
%define k0k1 [rsp + 16 * 0]
%define k6k7 [rsp + 16 * 1]
%define k2k5 [rsp + 16 * 2]
%define k3k4 [rsp + 16 * 3]
%define krd [rsp + 16 * 4]
%define temp [rsp + 16 * 5]
%define max [rsp + 16 * 6]
%define min [rsp + 16 * 7]
HIGH_GET_FILTERS
movsxd rax, DWORD PTR arg(1) ;pixels_per_line
@@ -922,6 +802,16 @@ sym(vp9_highbd_filter_block1d16_h8_avg_sse2):
lea rdx, [rdx + rdx]
movsxd rcx, DWORD PTR arg(4) ;output_height
;prefetch the source rows once before the filter loop
.load:
prefetcht0 [rsi - 6]
prefetcht0 [rsi + 31]
lea rsi, [rsi + rax]
dec rcx
jnz .load
mov rsi, arg(0) ;src_ptr
movsxd rcx, DWORD PTR arg(4) ;output_height
.loop:
movdqu xmm0, [rsi - 6] ;load src
movdqu xmm1, [rsi - 4]
@@ -950,9 +840,6 @@ sym(vp9_highbd_filter_block1d16_h8_avg_sse2):
dec rcx
jnz .loop
add rsp, 16 * 8
pop rsp
; begin epilog
pop rdi
pop rsi


@@ -9,6 +9,7 @@
*/
#include "vp9/common/x86/vp9_idct_intrin_sse2.h"
#include "vp9/common/vp9_idct.h"
#define RECON_AND_STORE4X4(dest, in_x) \
{ \
@@ -3985,3 +3986,573 @@ void vp9_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
dest += 8 - (stride * 32);
}
}
#if CONFIG_VP9_HIGHBITDEPTH
static INLINE __m128i clamp_high_sse2(__m128i value, int bd) {
__m128i ubounded, retval;
const __m128i zero = _mm_set1_epi16(0);
const __m128i one = _mm_set1_epi16(1);
const __m128i max = _mm_subs_epi16(_mm_slli_epi16(one, bd), one);
ubounded = _mm_cmpgt_epi16(value, max);
retval = _mm_andnot_si128(ubounded, value);
ubounded = _mm_and_si128(ubounded, max);
retval = _mm_or_si128(retval, ubounded);
retval = _mm_and_si128(retval, _mm_cmpgt_epi16(retval, zero));
return retval;
}
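// The branch-free select above clamps each lane to [0, (1 << bd) - 1]; a
// scalar sketch of the same operation (illustrative only):
//   static INLINE uint16_t clamp_high_c(int32_t value, int bd) {
//     const int32_t max = (1 << bd) - 1;
//     return (uint16_t)(value < 0 ? 0 : (value > max ? max : value));
//   }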
void vp9_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest8,
int stride, int bd) {
tran_low_t out[4 * 4];
tran_low_t *outptr = out;
int i, j;
__m128i inptr[4];
__m128i sign_bits[2];
__m128i temp_mm, min_input, max_input;
int test;
uint16_t * dest = CONVERT_TO_SHORTPTR(dest8);
int optimised_cols = 0;
const __m128i zero = _mm_set1_epi16(0);
const __m128i eight = _mm_set1_epi16(8);
const __m128i max = _mm_set1_epi16(12043);
const __m128i min = _mm_set1_epi16(-12043);
// Load input into __m128i
inptr[0] = _mm_loadu_si128((const __m128i *)input);
inptr[1] = _mm_loadu_si128((const __m128i *)(input + 4));
inptr[2] = _mm_loadu_si128((const __m128i *)(input + 8));
inptr[3] = _mm_loadu_si128((const __m128i *)(input + 12));
// Pack to 16 bits
inptr[0] = _mm_packs_epi32(inptr[0], inptr[1]);
inptr[1] = _mm_packs_epi32(inptr[2], inptr[3]);
max_input = _mm_max_epi16(inptr[0], inptr[1]);
min_input = _mm_min_epi16(inptr[0], inptr[1]);
max_input = _mm_cmpgt_epi16(max_input, max);
min_input = _mm_cmplt_epi16(min_input, min);
temp_mm = _mm_or_si128(max_input, min_input);
test = _mm_movemask_epi8(temp_mm);
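// A non-zero movemask means some coefficient lies outside [-12043, 12043],
// the range (assumed chosen for the 4x4 transform) inside which the 16-bit
// SSE2 path cannot overflow; in that case we fall back to the 32-bit C
// transform below.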
if (!test) {
// Do the row transform
idct4_sse2(inptr);
// Check the min & max values
max_input = _mm_max_epi16(inptr[0], inptr[1]);
min_input = _mm_min_epi16(inptr[0], inptr[1]);
max_input = _mm_cmpgt_epi16(max_input, max);
min_input = _mm_cmplt_epi16(min_input, min);
temp_mm = _mm_or_si128(max_input, min_input);
test = _mm_movemask_epi8(temp_mm);
if (test) {
transpose_4x4(inptr);
sign_bits[0] = _mm_cmplt_epi16(inptr[0], zero);
sign_bits[1] = _mm_cmplt_epi16(inptr[1], zero);
inptr[3] = _mm_unpackhi_epi16(inptr[1], sign_bits[1]);
inptr[2] = _mm_unpacklo_epi16(inptr[1], sign_bits[1]);
inptr[1] = _mm_unpackhi_epi16(inptr[0], sign_bits[0]);
inptr[0] = _mm_unpacklo_epi16(inptr[0], sign_bits[0]);
_mm_storeu_si128((__m128i*)outptr, inptr[0]);
_mm_storeu_si128((__m128i*)(outptr + 4), inptr[1]);
_mm_storeu_si128((__m128i*)(outptr + 8), inptr[2]);
_mm_storeu_si128((__m128i*)(outptr + 12), inptr[3]);
} else {
// Set to use the optimised transform for the column
optimised_cols = 1;
}
} else {
// Run the un-optimised row transform
for (i = 0; i < 4; ++i) {
vp9_highbd_idct4(input, outptr, bd);
input += 4;
outptr += 4;
}
}
if (optimised_cols) {
idct4_sse2(inptr);
// Final round and shift
inptr[0] = _mm_add_epi16(inptr[0], eight);
inptr[1] = _mm_add_epi16(inptr[1], eight);
inptr[0] = _mm_srai_epi16(inptr[0], 4);
inptr[1] = _mm_srai_epi16(inptr[1], 4);
// Reconstruction and Store
{
__m128i d0 = _mm_loadl_epi64((const __m128i *)dest);
__m128i d2 = _mm_loadl_epi64((const __m128i *)(dest + stride * 2));
d0 = _mm_unpacklo_epi64(d0,
_mm_loadl_epi64((const __m128i *)(dest + stride)));
d2 = _mm_unpacklo_epi64(d2,
_mm_loadl_epi64((const __m128i *)(dest + stride * 3)));
d0 = clamp_high_sse2(_mm_adds_epi16(d0, inptr[0]), bd);
d2 = clamp_high_sse2(_mm_adds_epi16(d2, inptr[1]), bd);
// store input0
_mm_storel_epi64((__m128i *)dest, d0);
// store input1
d0 = _mm_srli_si128(d0, 8);
_mm_storel_epi64((__m128i *)(dest + stride), d0);
// store input2
_mm_storel_epi64((__m128i *)(dest + stride * 2), d2);
// store input3
d2 = _mm_srli_si128(d2, 8);
_mm_storel_epi64((__m128i *)(dest + stride * 3), d2);
}
} else {
// Run the un-optimised column transform
tran_low_t temp_in[4], temp_out[4];
// Columns
for (i = 0; i < 4; ++i) {
for (j = 0; j < 4; ++j)
temp_in[j] = out[j * 4 + i];
vp9_highbd_idct4(temp_in, temp_out, bd);
for (j = 0; j < 4; ++j)
dest[j * stride + i] = highbd_clip_pixel_add(dest[j * stride + i],
ROUND_POWER_OF_TWO(temp_out[j], 4),
bd);
}
}
}
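// The 8x8 and 16x16 variants below follow the same three-way pattern: pack
// the coefficients to 16 bits, range-check them, and run the fast SSE2 row
// transform; re-check before the column pass, then either finish in SSE2
// (optimised_cols) or store the rows back sign-extended and let the C
// column transform handle the rest.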
void vp9_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest8,
int stride, int bd) {
tran_low_t out[8 * 8];
tran_low_t *outptr = out;
int i, j, test;
__m128i inptr[8];
__m128i min_input, max_input, temp1, temp2, sign_bits;
uint16_t * dest = CONVERT_TO_SHORTPTR(dest8);
const __m128i zero = _mm_set1_epi16(0);
const __m128i sixteen = _mm_set1_epi16(16);
const __m128i max = _mm_set1_epi16(6201);
const __m128i min = _mm_set1_epi16(-6201);
int optimised_cols = 0;
// Load input into __m128i & pack to 16 bits
for (i = 0; i < 8; i++) {
temp1 = _mm_loadu_si128((const __m128i *)(input + 8*i));
temp2 = _mm_loadu_si128((const __m128i *)(input + 8*i + 4));
inptr[i] = _mm_packs_epi32(temp1, temp2);
}
// Find the min & max for the row transform
max_input = _mm_max_epi16(inptr[0], inptr[1]);
min_input = _mm_min_epi16(inptr[0], inptr[1]);
for (i = 2; i < 8; i++) {
max_input = _mm_max_epi16(max_input, inptr[i]);
min_input = _mm_min_epi16(min_input, inptr[i]);
}
max_input = _mm_cmpgt_epi16(max_input, max);
min_input = _mm_cmplt_epi16(min_input, min);
temp1 = _mm_or_si128(max_input, min_input);
test = _mm_movemask_epi8(temp1);
if (!test) {
// Do the row transform
idct8_sse2(inptr);
// Find the min & max for the column transform
max_input = _mm_max_epi16(inptr[0], inptr[1]);
min_input = _mm_min_epi16(inptr[0], inptr[1]);
for (i = 2; i < 8; i++) {
max_input = _mm_max_epi16(max_input, inptr[i]);
min_input = _mm_min_epi16(min_input, inptr[i]);
}
max_input = _mm_cmpgt_epi16(max_input, max);
min_input = _mm_cmplt_epi16(min_input, min);
temp1 = _mm_or_si128(max_input, min_input);
test = _mm_movemask_epi8(temp1);
if (test) {
array_transpose_8x8(inptr, inptr);
for (i = 0; i < 8; i++) {
sign_bits = _mm_cmplt_epi16(inptr[i], zero);
temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);
temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);
_mm_storeu_si128((__m128i*)(outptr + 4*(2*i+1)), temp1);
_mm_storeu_si128((__m128i*)(outptr + 4*(2*i)), temp2);
}
} else {
// Set to use the optimised transform for the column
optimised_cols = 1;
}
} else {
// Run the un-optimised row transform
for (i = 0; i < 8; ++i) {
vp9_highbd_idct8(input, outptr, bd);
input += 8;
outptr += 8;
}
}
if (optimised_cols) {
idct8_sse2(inptr);
// Final round & shift and Reconstruction and Store
{
__m128i d[8];
for (i = 0; i < 8; i++) {
inptr[i] = _mm_add_epi16(inptr[i], sixteen);
d[i] = _mm_loadu_si128((const __m128i *)(dest + stride*i));
inptr[i] = _mm_srai_epi16(inptr[i], 5);
d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
// Store
_mm_storeu_si128((__m128i *)(dest + stride*i), d[i]);
}
}
} else {
// Run the un-optimised column transform
tran_low_t temp_in[8], temp_out[8];
for (i = 0; i < 8; ++i) {
for (j = 0; j < 8; ++j)
temp_in[j] = out[j * 8 + i];
vp9_highbd_idct8(temp_in, temp_out, bd);
for (j = 0; j < 8; ++j)
dest[j * stride + i] = highbd_clip_pixel_add(dest[j * stride + i],
ROUND_POWER_OF_TWO(temp_out[j], 5),
bd);
}
}
}
void vp9_highbd_idct8x8_10_add_sse2(const tran_low_t *input, uint8_t *dest8,
int stride, int bd) {
tran_low_t out[8 * 8] = { 0 };
tran_low_t *outptr = out;
int i, j, test;
__m128i inptr[8];
__m128i min_input, max_input, temp1, temp2, sign_bits;
uint16_t * dest = CONVERT_TO_SHORTPTR(dest8);
const __m128i zero = _mm_set1_epi16(0);
const __m128i sixteen = _mm_set1_epi16(16);
const __m128i max = _mm_set1_epi16(6201);
const __m128i min = _mm_set1_epi16(-6201);
int optimised_cols = 0;
// Load input into __m128i & pack to 16 bits
for (i = 0; i < 8; i++) {
temp1 = _mm_loadu_si128((const __m128i *)(input + 8*i));
temp2 = _mm_loadu_si128((const __m128i *)(input + 8*i + 4));
inptr[i] = _mm_packs_epi32(temp1, temp2);
}
// Find the min & max for the row transform
// only the first 4 rows have non-zero coefs
max_input = _mm_max_epi16(inptr[0], inptr[1]);
min_input = _mm_min_epi16(inptr[0], inptr[1]);
for (i = 2; i < 4; i++) {
max_input = _mm_max_epi16(max_input, inptr[i]);
min_input = _mm_min_epi16(min_input, inptr[i]);
}
max_input = _mm_cmpgt_epi16(max_input, max);
min_input = _mm_cmplt_epi16(min_input, min);
temp1 = _mm_or_si128(max_input, min_input);
test = _mm_movemask_epi8(temp1);
if (!test) {
// Do the row transform
idct8_sse2(inptr);
// Find the min & max for the column transform
// N.B. Only the first 4 cols contain non-zero coeffs
max_input = _mm_max_epi16(inptr[0], inptr[1]);
min_input = _mm_min_epi16(inptr[0], inptr[1]);
for (i = 2; i < 8; i++) {
max_input = _mm_max_epi16(max_input, inptr[i]);
min_input = _mm_min_epi16(min_input, inptr[i]);
}
max_input = _mm_cmpgt_epi16(max_input, max);
min_input = _mm_cmplt_epi16(min_input, min);
temp1 = _mm_or_si128(max_input, min_input);
test = _mm_movemask_epi8(temp1);
if (test) {
// Use the fact that only the first 4 rows contain non-zero coeffs
array_transpose_4X8(inptr, inptr);
for (i = 0; i < 4; i++) {
sign_bits = _mm_cmplt_epi16(inptr[i], zero);
temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);
temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);
_mm_storeu_si128((__m128i*)(outptr + 4*(2*i+1)), temp1);
_mm_storeu_si128((__m128i*)(outptr + 4*(2*i)), temp2);
}
} else {
// Set to use the optimised transform for the column
optimised_cols = 1;
}
} else {
// Run the un-optimised row transform
for (i = 0; i < 4; ++i) {
vp9_highbd_idct8(input, outptr, bd);
input += 8;
outptr += 8;
}
}
if (optimised_cols) {
idct8_sse2(inptr);
// Final round & shift and Reconstruction and Store
{
__m128i d[8];
for (i = 0; i < 8; i++) {
inptr[i] = _mm_add_epi16(inptr[i], sixteen);
d[i] = _mm_loadu_si128((const __m128i *)(dest + stride*i));
inptr[i] = _mm_srai_epi16(inptr[i], 5);
d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
// Store
_mm_storeu_si128((__m128i *)(dest + stride*i), d[i]);
}
}
} else {
// Run the un-optimised column transform
tran_low_t temp_in[8], temp_out[8];
for (i = 0; i < 8; ++i) {
for (j = 0; j < 8; ++j)
temp_in[j] = out[j * 8 + i];
vp9_highbd_idct8(temp_in, temp_out, bd);
for (j = 0; j < 8; ++j)
dest[j * stride + i] = highbd_clip_pixel_add(dest[j * stride + i],
ROUND_POWER_OF_TWO(temp_out[j], 5),
bd);
}
}
}
void vp9_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest8,
int stride, int bd) {
tran_low_t out[16 * 16];
tran_low_t *outptr = out;
int i, j, test;
__m128i inptr[32];
__m128i min_input, max_input, temp1, temp2, sign_bits;
uint16_t * dest = CONVERT_TO_SHORTPTR(dest8);
const __m128i zero = _mm_set1_epi16(0);
const __m128i rounding = _mm_set1_epi16(32);
const __m128i max = _mm_set1_epi16(3155);
const __m128i min = _mm_set1_epi16(-3155);
int optimised_cols = 0;
// Load input into __m128i & pack to 16 bits
for (i = 0; i < 16; i++) {
temp1 = _mm_loadu_si128((const __m128i *)(input + 16*i));
temp2 = _mm_loadu_si128((const __m128i *)(input + 16*i + 4));
inptr[i] = _mm_packs_epi32(temp1, temp2);
temp1 = _mm_loadu_si128((const __m128i *)(input + 16*i + 8));
temp2 = _mm_loadu_si128((const __m128i *)(input + 16*i + 12));
inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
}
// Find the min & max for the row transform
max_input = _mm_max_epi16(inptr[0], inptr[1]);
min_input = _mm_min_epi16(inptr[0], inptr[1]);
for (i = 2; i < 32; i++) {
max_input = _mm_max_epi16(max_input, inptr[i]);
min_input = _mm_min_epi16(min_input, inptr[i]);
}
max_input = _mm_cmpgt_epi16(max_input, max);
min_input = _mm_cmplt_epi16(min_input, min);
temp1 = _mm_or_si128(max_input, min_input);
test = _mm_movemask_epi8(temp1);
if (!test) {
// Do the row transform
idct16_sse2(inptr, inptr + 16);
// Find the min & max for the column transform
max_input = _mm_max_epi16(inptr[0], inptr[1]);
min_input = _mm_min_epi16(inptr[0], inptr[1]);
for (i = 2; i < 32; i++) {
max_input = _mm_max_epi16(max_input, inptr[i]);
min_input = _mm_min_epi16(min_input, inptr[i]);
}
max_input = _mm_cmpgt_epi16(max_input, max);
min_input = _mm_cmplt_epi16(min_input, min);
temp1 = _mm_or_si128(max_input, min_input);
test = _mm_movemask_epi8(temp1);
if (test) {
array_transpose_16x16(inptr, inptr + 16);
for (i = 0; i < 16; i++) {
sign_bits = _mm_cmplt_epi16(inptr[i], zero);
temp1 = _mm_unpacklo_epi16(inptr[i ], sign_bits);
temp2 = _mm_unpackhi_epi16(inptr[i ], sign_bits);
_mm_storeu_si128((__m128i*)(outptr + 4*(i*4)), temp1);
_mm_storeu_si128((__m128i*)(outptr + 4*(i*4+1)), temp2);
sign_bits = _mm_cmplt_epi16(inptr[i+16], zero);
temp1 = _mm_unpacklo_epi16(inptr[i+16], sign_bits);
temp2 = _mm_unpackhi_epi16(inptr[i+16], sign_bits);
_mm_storeu_si128((__m128i*)(outptr + 4*(i*4+2)), temp1);
_mm_storeu_si128((__m128i*)(outptr + 4*(i*4+3)), temp2);
}
} else {
// Set to use the optimised transform for the column
optimised_cols = 1;
}
} else {
// Run the un-optimised row transform
for (i = 0; i < 16; ++i) {
vp9_highbd_idct16(input, outptr, bd);
input += 16;
outptr += 16;
}
}
if (optimised_cols) {
idct16_sse2(inptr, inptr + 16);
// Final round & shift and Reconstruction and Store
{
__m128i d[2];
for (i = 0; i < 16; i++) {
inptr[i ] = _mm_add_epi16(inptr[i ], rounding);
inptr[i+16] = _mm_add_epi16(inptr[i+16], rounding);
d[0] = _mm_loadu_si128((const __m128i *)(dest + stride*i));
d[1] = _mm_loadu_si128((const __m128i *)(dest + stride*i + 8));
inptr[i ] = _mm_srai_epi16(inptr[i ], 6);
inptr[i+16] = _mm_srai_epi16(inptr[i+16], 6);
d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i ]), bd);
d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i+16]), bd);
// Store
_mm_storeu_si128((__m128i *)(dest + stride*i), d[0]);
_mm_storeu_si128((__m128i *)(dest + stride*i + 8), d[1]);
}
}
} else {
// Run the un-optimised column transform
tran_low_t temp_in[16], temp_out[16];
for (i = 0; i < 16; ++i) {
for (j = 0; j < 16; ++j)
temp_in[j] = out[j * 16 + i];
vp9_highbd_idct16(temp_in, temp_out, bd);
for (j = 0; j < 16; ++j)
dest[j * stride + i] = highbd_clip_pixel_add(dest[j * stride + i],
ROUND_POWER_OF_TWO(temp_out[j], 6),
bd);
}
}
}
void vp9_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest8,
int stride, int bd) {
tran_low_t out[16 * 16] = { 0 };
tran_low_t *outptr = out;
int i, j, test;
__m128i inptr[32];
__m128i min_input, max_input, temp1, temp2, sign_bits;
uint16_t * dest = CONVERT_TO_SHORTPTR(dest8);
const __m128i zero = _mm_set1_epi16(0);
const __m128i rounding = _mm_set1_epi16(32);
const __m128i max = _mm_set1_epi16(3155);
const __m128i min = _mm_set1_epi16(-3155);
int optimised_cols = 0;
// Load input into __m128i & pack to 16 bits
for (i = 0; i < 16; i++) {
temp1 = _mm_loadu_si128((const __m128i *)(input + 16*i));
temp2 = _mm_loadu_si128((const __m128i *)(input + 16*i + 4));
inptr[i] = _mm_packs_epi32(temp1, temp2);
temp1 = _mm_loadu_si128((const __m128i *)(input + 16*i + 8));
temp2 = _mm_loadu_si128((const __m128i *)(input + 16*i + 12));
inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
}
// Find the min & max for the row transform
// Since all non-zero dct coefficients are in the upper-left 4x4 area,
// we only need to consider the first 4 rows here.
max_input = _mm_max_epi16(inptr[0], inptr[1]);
min_input = _mm_min_epi16(inptr[0], inptr[1]);
for (i = 2; i < 4; i++) {
max_input = _mm_max_epi16(max_input, inptr[i]);
min_input = _mm_min_epi16(min_input, inptr[i]);
}
max_input = _mm_cmpgt_epi16(max_input, max);
min_input = _mm_cmplt_epi16(min_input, min);
temp1 = _mm_or_si128(max_input, min_input);
test = _mm_movemask_epi8(temp1);
if (!test) {
// Do the row transform (N.B. This transposes inptr)
idct16_sse2(inptr, inptr + 16);
// Find the min & max for the column transform
// N.B. Only the first 4 cols contain non-zero coeffs
max_input = _mm_max_epi16(inptr[0], inptr[1]);
min_input = _mm_min_epi16(inptr[0], inptr[1]);
for (i = 2; i < 16; i++) {
max_input = _mm_max_epi16(max_input, inptr[i]);
min_input = _mm_min_epi16(min_input, inptr[i]);
}
max_input = _mm_cmpgt_epi16(max_input, max);
min_input = _mm_cmplt_epi16(min_input, min);
temp1 = _mm_or_si128(max_input, min_input);
test = _mm_movemask_epi8(temp1);
if (test) {
// Use the fact that only the first 4 rows contain non-zero coeffs
array_transpose_8x8(inptr, inptr);
array_transpose_8x8(inptr + 8, inptr + 16);
for (i = 0; i < 4; i++) {
sign_bits = _mm_cmplt_epi16(inptr[i], zero);
temp1 = _mm_unpacklo_epi16(inptr[i ], sign_bits);
temp2 = _mm_unpackhi_epi16(inptr[i ], sign_bits);
_mm_storeu_si128((__m128i*)(outptr + 4*(i*4)), temp1);
_mm_storeu_si128((__m128i*)(outptr + 4*(i*4+1)), temp2);
sign_bits = _mm_cmplt_epi16(inptr[i+16], zero);
temp1 = _mm_unpacklo_epi16(inptr[i+16], sign_bits);
temp2 = _mm_unpackhi_epi16(inptr[i+16], sign_bits);
_mm_storeu_si128((__m128i*)(outptr + 4*(i*4+2)), temp1);
_mm_storeu_si128((__m128i*)(outptr + 4*(i*4+3)), temp2);
}
} else {
// Set to use the optimised transform for the column
optimised_cols = 1;
}
} else {
// Run the un-optimised row transform
for (i = 0; i < 4; ++i) {
vp9_highbd_idct16(input, outptr, bd);
input += 16;
outptr += 16;
}
}
if (optimised_cols) {
idct16_sse2(inptr, inptr + 16);
// Final round & shift and Reconstruction and Store
{
__m128i d[2];
for (i = 0; i < 16; i++) {
inptr[i ] = _mm_add_epi16(inptr[i ], rounding);
inptr[i+16] = _mm_add_epi16(inptr[i+16], rounding);
d[0] = _mm_loadu_si128((const __m128i *)(dest + stride*i));
d[1] = _mm_loadu_si128((const __m128i *)(dest + stride*i + 8));
inptr[i ] = _mm_srai_epi16(inptr[i ], 6);
inptr[i+16] = _mm_srai_epi16(inptr[i+16], 6);
d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i ]), bd);
d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i+16]), bd);
// Store
_mm_storeu_si128((__m128i *)(dest + stride*i), d[0]);
_mm_storeu_si128((__m128i *)(dest + stride*i + 8), d[1]);
}
}
} else {
// Run the un-optimised column transform
tran_low_t temp_in[16], temp_out[16];
for (i = 0; i < 16; ++i) {
for (j = 0; j < 16; ++j)
temp_in[j] = out[j * 16 + i];
vp9_highbd_idct16(temp_in, temp_out, bd);
for (j = 0; j < 16; ++j)
dest[j * stride + i] = highbd_clip_pixel_add(dest[j * stride + i],
ROUND_POWER_OF_TWO(temp_out[j], 6),
bd);
}
}
}
#endif // CONFIG_VP9_HIGHBITDEPTH


@@ -7,6 +7,7 @@
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "vp9/common/vp9_common.h"
#include "vpx_ports/mem.h"
unsigned int vp9_avg_8x8_c(const uint8_t *s, int p) {
@@ -17,3 +18,16 @@ unsigned int vp9_avg_8x8_c(const uint8_t *s, int p) {
return (sum + 32) >> 6;
}
#if CONFIG_VP9_HIGHBITDEPTH
unsigned int vp9_highbd_avg_8x8_c(const uint8_t *s8, int p) {
int i, j;
int sum = 0;
const uint16_t* s = CONVERT_TO_SHORTPTR(s8);
for (i = 0; i < 8; ++i, s+=p)
for (j = 0; j < 8; sum += s[j], ++j) {}
return (sum + 32) >> 6;
}
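// Note: (sum + 32) >> 6 is ROUND_POWER_OF_TWO(sum, 6), i.e. the rounded mean
// of the 64 samples, mirroring the 8-bit vp9_avg_8x8_c above.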
#endif // CONFIG_VP9_HIGHBITDEPTH


@@ -17,6 +17,7 @@
#include "vp9/common/vp9_blockd.h"
#include "vp9/common/vp9_idct.h"
#include "vp9/common/vp9_systemdependent.h"
#include "vp9/encoder/vp9_dct.h"
static INLINE tran_high_t fdct_round_shift(tran_high_t input) {
tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
@@ -26,7 +27,7 @@ static INLINE tran_high_t fdct_round_shift(tran_high_t input) {
return rv;
}
static void fdct4(const tran_low_t *input, tran_low_t *output) {
void vp9_fdct4(const tran_low_t *input, tran_low_t *output) {
tran_high_t step[4];
tran_high_t temp1, temp2;
@@ -123,7 +124,7 @@ void vp9_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) {
}
}
static void fadst4(const tran_low_t *input, tran_low_t *output) {
void vp9_fadst4(const tran_low_t *input, tran_low_t *output) {
tran_high_t x0, x1, x2, x3;
tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
@@ -163,13 +164,6 @@ static void fadst4(const tran_low_t *input, tran_low_t *output) {
output[3] = fdct_round_shift(s3);
}
static const transform_2d FHT_4[] = {
{ fdct4, fdct4 }, // DCT_DCT = 0
{ fadst4, fdct4 }, // ADST_DCT = 1
{ fdct4, fadst4 }, // DCT_ADST = 2
{ fadst4, fadst4 } // ADST_ADST = 3
};
void vp9_fht4x4_c(const int16_t *input, tran_low_t *output,
int stride, int tx_type) {
if (tx_type == DCT_DCT) {
@@ -203,7 +197,7 @@ void vp9_fht4x4_c(const int16_t *input, tran_low_t *output,
}
}
static void fdct8(const tran_low_t *input, tran_low_t *output) {
void vp9_fdct8(const tran_low_t *input, tran_low_t *output) {
tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16
tran_high_t t0, t1, t2, t3; // needs32
tran_high_t x0, x1, x2, x3; // canbe16
@@ -331,7 +325,7 @@ void vp9_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) {
// Rows
for (i = 0; i < 8; ++i) {
fdct8(&intermediate[i * 8], &final_output[i * 8]);
vp9_fdct8(&intermediate[i * 8], &final_output[i * 8]);
for (j = 0; j < 8; ++j)
final_output[j + i * 8] /= 2;
}
@@ -528,7 +522,7 @@ void vp9_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) {
}
}
static void fadst8(const tran_low_t *input, tran_low_t *output) {
void vp9_fadst8(const tran_low_t *input, tran_low_t *output) {
tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
tran_high_t x0 = input[7];
@@ -599,13 +593,6 @@ static void fadst8(const tran_low_t *input, tran_low_t *output) {
output[7] = - x1;
}
static const transform_2d FHT_8[] = {
{ fdct8, fdct8 }, // DCT_DCT = 0
{ fadst8, fdct8 }, // ADST_DCT = 1
{ fdct8, fadst8 }, // DCT_ADST = 2
{ fadst8, fadst8 } // ADST_ADST = 3
};
void vp9_fht8x8_c(const int16_t *input, tran_low_t *output,
int stride, int tx_type) {
if (tx_type == DCT_DCT) {
@@ -694,7 +681,7 @@ void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride) {
}
// Rewrote to use same algorithm as others.
static void fdct16(const tran_low_t in[16], tran_low_t out[16]) {
void vp9_fdct16(const tran_low_t in[16], tran_low_t out[16]) {
tran_high_t step1[8]; // canbe16
tran_high_t step2[8]; // canbe16
tran_high_t step3[8]; // canbe16
@@ -835,7 +822,7 @@ static void fdct16(const tran_low_t in[16], tran_low_t out[16]) {
out[15] = fdct_round_shift(temp2);
}
static void fadst16(const tran_low_t *input, tran_low_t *output) {
void vp9_fadst16(const tran_low_t *input, tran_low_t *output) {
tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
tran_high_t s9, s10, s11, s12, s13, s14, s15;
@@ -998,13 +985,6 @@ static void fadst16(const tran_low_t *input, tran_low_t *output) {
output[15] = - x1;
}
static const transform_2d FHT_16[] = {
{ fdct16, fdct16 }, // DCT_DCT = 0
{ fadst16, fdct16 }, // ADST_DCT = 1
{ fdct16, fadst16 }, // DCT_ADST = 2
{ fadst16, fadst16 } // ADST_ADST = 3
};
void vp9_fht16x16_c(const int16_t *input, tran_low_t *output,
int stride, int tx_type) {
if (tx_type == DCT_DCT) {
@@ -1049,7 +1029,7 @@ static INLINE tran_high_t half_round_shift(tran_high_t input) {
return rv;
}
static void fdct32(const tran_high_t *input, tran_high_t *output, int round) {
void vp9_fdct32(const tran_high_t *input, tran_high_t *output, int round) {
tran_high_t step[32];
// Stage 1
step[0] = input[0] + input[(32 - 1)];
@@ -1392,7 +1372,7 @@ void vp9_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {
tran_high_t temp_in[32], temp_out[32];
for (j = 0; j < 32; ++j)
temp_in[j] = input[j * stride + i] * 4;
fdct32(temp_in, temp_out, 0);
vp9_fdct32(temp_in, temp_out, 0);
for (j = 0; j < 32; ++j)
output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
}
@@ -1402,7 +1382,7 @@ void vp9_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {
tran_high_t temp_in[32], temp_out[32];
for (j = 0; j < 32; ++j)
temp_in[j] = output[j + i * 32];
fdct32(temp_in, temp_out, 0);
vp9_fdct32(temp_in, temp_out, 0);
for (j = 0; j < 32; ++j)
out[j + i * 32] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
}
@@ -1420,7 +1400,7 @@ void vp9_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, int stride) {
tran_high_t temp_in[32], temp_out[32];
for (j = 0; j < 32; ++j)
temp_in[j] = input[j * stride + i] * 4;
fdct32(temp_in, temp_out, 0);
vp9_fdct32(temp_in, temp_out, 0);
for (j = 0; j < 32; ++j)
// TODO(cd): see quality impact of only doing
// output[j * 32 + i] = (temp_out[j] + 1) >> 2;
@@ -1433,7 +1413,7 @@ void vp9_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, int stride) {
tran_high_t temp_in[32], temp_out[32];
for (j = 0; j < 32; ++j)
temp_in[j] = output[j + i * 32];
fdct32(temp_in, temp_out, 1);
vp9_fdct32(temp_in, temp_out, 1);
for (j = 0; j < 32; ++j)
out[j + i * 32] = temp_out[j];
}

vp9/encoder/vp9_dct.h (new file, 61 lines)

@@ -0,0 +1,61 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP9_ENCODER_VP9_DCT_H_
#define VP9_ENCODER_VP9_DCT_H_
#include "vp9/common/vp9_idct.h"
#ifdef __cplusplus
extern "C" {
#endif
void vp9_highbd_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride);
void vp9_highbd_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride);
void vp9_highbd_fdct16x16_c(const int16_t *input, tran_low_t *output,
int stride);
void vp9_highbd_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride);
void vp9_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *out,
int stride);
void vp9_fdct4(const tran_low_t *input, tran_low_t *output);
void vp9_fadst4(const tran_low_t *input, tran_low_t *output);
void vp9_fdct8(const tran_low_t *input, tran_low_t *output);
void vp9_fadst8(const tran_low_t *input, tran_low_t *output);
void vp9_fdct16(const tran_low_t in[16], tran_low_t out[16]);
void vp9_fadst16(const tran_low_t *input, tran_low_t *output);
void vp9_fdct32(const tran_high_t *input, tran_high_t *output, int round);
static const transform_2d FHT_4[] = {
{ vp9_fdct4, vp9_fdct4 }, // DCT_DCT = 0
{ vp9_fadst4, vp9_fdct4 }, // ADST_DCT = 1
{ vp9_fdct4, vp9_fadst4 }, // DCT_ADST = 2
{ vp9_fadst4, vp9_fadst4 } // ADST_ADST = 3
};
static const transform_2d FHT_8[] = {
{ vp9_fdct8, vp9_fdct8 }, // DCT_DCT = 0
{ vp9_fadst8, vp9_fdct8 }, // ADST_DCT = 1
{ vp9_fdct8, vp9_fadst8 }, // DCT_ADST = 2
{ vp9_fadst8, vp9_fadst8 } // ADST_ADST = 3
};
static const transform_2d FHT_16[] = {
{ vp9_fdct16, vp9_fdct16 }, // DCT_DCT = 0
{ vp9_fadst16, vp9_fdct16 }, // ADST_DCT = 1
{ vp9_fdct16, vp9_fadst16 }, // DCT_ADST = 2
{ vp9_fadst16, vp9_fadst16 } // ADST_ADST = 3
};
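// Example use (sketch): vp9_fht4x4_c selects its 1-D transforms from FHT_4
// by tx_type and applies columns then rows, roughly:
//   const transform_2d ht = FHT_4[tx_type];
//   ht.cols(temp_in, temp_out);   // vertical pass
//   ht.rows(temp_out, final_out); // horizontal pass
// (the temp_* identifiers are illustrative).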
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VP9_ENCODER_VP9_DCT_H_


@@ -515,8 +515,19 @@ static void choose_partitioning(VP9_COMP *cpi,
int sum = 0;
if (x_idx < pixels_wide && y_idx < pixels_high) {
int s_avg = vp9_avg_8x8(s + y_idx * sp + x_idx, sp);
int d_avg = vp9_avg_8x8(d + y_idx * dp + x_idx, dp);
int s_avg, d_avg;
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
s_avg = vp9_highbd_avg_8x8(s + y_idx * sp + x_idx, sp);
d_avg = vp9_highbd_avg_8x8(d + y_idx * dp + x_idx, dp);
} else {
s_avg = vp9_avg_8x8(s + y_idx * sp + x_idx, sp);
d_avg = vp9_avg_8x8(d + y_idx * dp + x_idx, dp);
}
#else
s_avg = vp9_avg_8x8(s + y_idx * sp + x_idx, sp);
d_avg = vp9_avg_8x8(d + y_idx * dp + x_idx, dp);
#endif
sum = s_avg - d_avg;
sse = sum * sum;
}
@@ -3414,9 +3425,9 @@ static void encode_frame_internal(VP9_COMP *cpi) {
#if CONFIG_VP9_HIGHBITDEPTH
if (cm->use_highbitdepth)
x->fwd_txm4x4 = xd->lossless ? vp9_fwht4x4 : vp9_fdct4x4;
else
x->fwd_txm4x4 = xd->lossless ? vp9_highbd_fwht4x4 : vp9_highbd_fdct4x4;
else
x->fwd_txm4x4 = xd->lossless ? vp9_fwht4x4 : vp9_fdct4x4;
x->highbd_itxm_add = xd->lossless ? vp9_highbd_iwht4x4_add :
vp9_highbd_idct4x4_add;
#else

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -62,9 +62,40 @@ cglobal fwht4x4, 3, 4, 8, input, output, stride
psllw m2, 2
psllw m3, 2
%if CONFIG_VP9_HIGHBITDEPTH
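;under CONFIG_VP9_HIGHBITDEPTH the output (tran_low_t) is 32-bit, as the
;wider stores below assume, so each 16-bit result is sign-extended first:
;pcmpgtw against zero builds the sign mask, punpcklwd/punpckhwd interleave it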
pxor m4, m4
pxor m5, m5
pcmpgtw m4, m0
pcmpgtw m5, m1
movq m6, m0
movq m7, m1
punpcklwd m0, m4
punpcklwd m1, m5
punpckhwd m6, m4
punpckhwd m7, m5
movq [outputq], m0
movq [outputq + 8], m6
movq [outputq + 16], m1
movq [outputq + 24], m7
pxor m4, m4
pxor m5, m5
pcmpgtw m4, m2
pcmpgtw m5, m3
movq m6, m2
movq m7, m3
punpcklwd m2, m4
punpcklwd m3, m5
punpckhwd m6, m4
punpckhwd m7, m5
movq [outputq + 32], m2
movq [outputq + 40], m6
movq [outputq + 48], m3
movq [outputq + 56], m7
%else
movq [outputq], m0
movq [outputq + 8], m1
movq [outputq + 16], m2
movq [outputq + 24], m3
%endif
RET

File diff suppressed because it is too large


@@ -0,0 +1,368 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP9_ENCODER_X86_VP9_DCT_SSE2_H_
#define VP9_ENCODER_X86_VP9_DCT_SSE2_H_
#ifdef __cplusplus
extern "C" {
#endif
#define pair_set_epi32(a, b) \
_mm_set_epi32(b, a, b, a)
void vp9_fdct4x4_sse2(const int16_t *input, tran_low_t *output, int stride);
void vp9_fdct8x8_sse2(const int16_t *input, tran_low_t *output, int stride);
void vp9_fdct16x16_sse2(const int16_t *input, tran_low_t *output, int stride);
void vp9_highbd_fdct4x4_sse2(const int16_t *input, tran_low_t *output,
int stride);
void vp9_highbd_fdct8x8_sse2(const int16_t *input, tran_low_t *output,
int stride);
void vp9_highbd_fdct16x16_sse2(const int16_t *input, tran_low_t *output,
int stride);
static INLINE __m128i k_madd_epi32(__m128i a, __m128i b) {
__m128i buf0, buf1;
buf0 = _mm_mul_epu32(a, b);
a = _mm_srli_epi64(a, 32);
b = _mm_srli_epi64(b, 32);
buf1 = _mm_mul_epu32(a, b);
return _mm_add_epi64(buf0, buf1);
}
static INLINE __m128i k_packs_epi64(__m128i a, __m128i b) {
__m128i buf0 = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 2, 0));
__m128i buf1 = _mm_shuffle_epi32(b, _MM_SHUFFLE(0, 0, 2, 0));
return _mm_unpacklo_epi64(buf0, buf1);
}
static INLINE int check_epi16_overflow_x2(__m128i reg0, __m128i reg1) {
const __m128i max_overflow = _mm_set1_epi16(0x7fff);
const __m128i min_overflow = _mm_set1_epi16(0x8000);
__m128i cmp0 = _mm_or_si128(_mm_cmpeq_epi16(reg0, max_overflow),
_mm_cmpeq_epi16(reg0, min_overflow));
__m128i cmp1 = _mm_or_si128(_mm_cmpeq_epi16(reg1, max_overflow),
_mm_cmpeq_epi16(reg1, min_overflow));
cmp0 = _mm_or_si128(cmp0, cmp1);
return _mm_movemask_epi8(cmp0);
}
static INLINE int check_epi16_overflow_x4(__m128i reg0, __m128i reg1,
__m128i reg2, __m128i reg3) {
const __m128i max_overflow = _mm_set1_epi16(0x7fff);
const __m128i min_overflow = _mm_set1_epi16(0x8000);
__m128i cmp0 = _mm_or_si128(_mm_cmpeq_epi16(reg0, max_overflow),
_mm_cmpeq_epi16(reg0, min_overflow));
__m128i cmp1 = _mm_or_si128(_mm_cmpeq_epi16(reg1, max_overflow),
_mm_cmpeq_epi16(reg1, min_overflow));
__m128i cmp2 = _mm_or_si128(_mm_cmpeq_epi16(reg2, max_overflow),
_mm_cmpeq_epi16(reg2, min_overflow));
__m128i cmp3 = _mm_or_si128(_mm_cmpeq_epi16(reg3, max_overflow),
_mm_cmpeq_epi16(reg3, min_overflow));
cmp0 = _mm_or_si128(_mm_or_si128(cmp0, cmp1), _mm_or_si128(cmp2, cmp3));
return _mm_movemask_epi8(cmp0);
}
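// Saturating 16-bit arithmetic pins an overflowed lane at 0x7fff or 0x8000,
// so lanes equal to either sentinel flag potential overflow; the byte mask
// of the OR'd comparisons is non-zero iff any lane matched.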
static INLINE int check_epi16_overflow_x8(__m128i reg0, __m128i reg1,
__m128i reg2, __m128i reg3, __m128i reg4,
__m128i reg5, __m128i reg6, __m128i reg7) {
int res0, res1;
res0 = check_epi16_overflow_x4(reg0, reg1, reg2, reg3);
res1 = check_epi16_overflow_x4(reg4, reg5, reg6, reg7);
return res0 + res1;
}
static INLINE int check_epi16_overflow_x12(__m128i reg0, __m128i reg1,
__m128i reg2, __m128i reg3, __m128i reg4,
__m128i reg5, __m128i reg6, __m128i reg7,
__m128i reg8, __m128i reg9, __m128i reg10,
__m128i reg11) {
int res0, res1;
res0 = check_epi16_overflow_x4(reg0, reg1, reg2, reg3);
res1 = check_epi16_overflow_x4(reg4, reg5, reg6, reg7);
if (!res0)
res0 = check_epi16_overflow_x4(reg8, reg9, reg10, reg11);
return res0 + res1;
}
static INLINE int check_epi16_overflow_x16(__m128i reg0, __m128i reg1,
__m128i reg2, __m128i reg3, __m128i reg4,
__m128i reg5, __m128i reg6, __m128i reg7,
__m128i reg8, __m128i reg9, __m128i reg10,
__m128i reg11, __m128i reg12, __m128i reg13,
__m128i reg14, __m128i reg15) {
int res0, res1;
res0 = check_epi16_overflow_x4(reg0, reg1, reg2, reg3);
res1 = check_epi16_overflow_x4(reg4, reg5, reg6, reg7);
if (!res0) {
res0 = check_epi16_overflow_x4(reg8, reg9, reg10, reg11);
if (!res1)
res1 = check_epi16_overflow_x4(reg12, reg13, reg14, reg15);
}
return res0 + res1;
}
static INLINE int check_epi16_overflow_x32(__m128i reg0, __m128i reg1,
__m128i reg2, __m128i reg3, __m128i reg4,
__m128i reg5, __m128i reg6, __m128i reg7,
__m128i reg8, __m128i reg9, __m128i reg10,
__m128i reg11, __m128i reg12, __m128i reg13,
__m128i reg14, __m128i reg15, __m128i reg16,
__m128i reg17, __m128i reg18, __m128i reg19,
__m128i reg20, __m128i reg21, __m128i reg22,
__m128i reg23, __m128i reg24, __m128i reg25,
__m128i reg26, __m128i reg27, __m128i reg28,
__m128i reg29, __m128i reg30, __m128i reg31) {
int res0, res1;
res0 = check_epi16_overflow_x4(reg0, reg1, reg2, reg3);
res1 = check_epi16_overflow_x4(reg4, reg5, reg6, reg7);
if (!res0) {
res0 = check_epi16_overflow_x4(reg8, reg9, reg10, reg11);
if (!res1) {
res1 = check_epi16_overflow_x4(reg12, reg13, reg14, reg15);
if (!res0) {
res0 = check_epi16_overflow_x4(reg16, reg17, reg18, reg19);
if (!res1) {
res1 = check_epi16_overflow_x4(reg20, reg21, reg22, reg23);
if (!res0) {
res0 = check_epi16_overflow_x4(reg24, reg25, reg26, reg27);
if (!res1)
res1 = check_epi16_overflow_x4(reg28, reg29, reg30, reg31);
}
}
}
}
}
return res0 + res1;
}
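// Usage sketch (hypothetical caller, not in the original header): a nonzero
// return from any check_epi16_overflow_* means some lane is pinned at
// INT16_MAX/INT16_MIN, i.e. a saturating pack may have clipped, so the fast
// 16-bit transform path falls back to the 32-bit C implementation:
//
//   if (check_epi16_overflow_x4(x0, x1, x2, x3)) {
//     vp9_highbd_fdct8x8_c(input, output, stride);  // assumed C fallback
//     return;
//   }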
static INLINE int k_check_epi32_overflow_4(__m128i reg0, __m128i reg1,
__m128i reg2, __m128i reg3, const __m128i* zero) {
__m128i minus_one = _mm_set1_epi32(-1);
// Check for overflows
__m128i reg0_shifted = _mm_slli_epi64(reg0, 1);
__m128i reg1_shifted = _mm_slli_epi64(reg1, 1);
__m128i reg2_shifted = _mm_slli_epi64(reg2, 1);
__m128i reg3_shifted = _mm_slli_epi64(reg3, 1);
__m128i reg0_top_dwords = _mm_shuffle_epi32(
reg0_shifted, _MM_SHUFFLE(0, 0, 3, 1));
__m128i reg1_top_dwords = _mm_shuffle_epi32(
reg1_shifted, _MM_SHUFFLE(0, 0, 3, 1));
__m128i reg2_top_dwords = _mm_shuffle_epi32(
reg2_shifted, _MM_SHUFFLE(0, 0, 3, 1));
__m128i reg3_top_dwords = _mm_shuffle_epi32(
reg3_shifted, _MM_SHUFFLE(0, 0, 3, 1));
__m128i top_dwords_01 = _mm_unpacklo_epi64(reg0_top_dwords, reg1_top_dwords);
__m128i top_dwords_23 = _mm_unpacklo_epi64(reg2_top_dwords, reg3_top_dwords);
__m128i valid_positive_01 = _mm_cmpeq_epi32(top_dwords_01, *zero);
__m128i valid_positive_23 = _mm_cmpeq_epi32(top_dwords_23, *zero);
__m128i valid_negative_01 = _mm_cmpeq_epi32(top_dwords_01, minus_one);
__m128i valid_negative_23 = _mm_cmpeq_epi32(top_dwords_23, minus_one);
int overflow_01 = _mm_movemask_epi8(
_mm_cmpeq_epi32(valid_positive_01, valid_negative_01));
int overflow_23 = _mm_movemask_epi8(
_mm_cmpeq_epi32(valid_positive_23, valid_negative_23));
return (overflow_01 + overflow_23);
}
static INLINE int k_check_epi32_overflow_8(__m128i reg0, __m128i reg1,
__m128i reg2, __m128i reg3, __m128i reg4, __m128i reg5,
__m128i reg6, __m128i reg7, const __m128i* zero) {
int overflow = k_check_epi32_overflow_4(reg0, reg1, reg2, reg3, zero);
if (!overflow) {
overflow = k_check_epi32_overflow_4(reg4, reg5, reg6, reg7, zero);
}
return overflow;
}
static INLINE int k_check_epi32_overflow_16(__m128i reg0, __m128i reg1,
__m128i reg2, __m128i reg3, __m128i reg4, __m128i reg5,
__m128i reg6, __m128i reg7, __m128i reg8, __m128i reg9,
__m128i reg10, __m128i reg11, __m128i reg12, __m128i reg13,
__m128i reg14, __m128i reg15, const __m128i* zero) {
int overflow = k_check_epi32_overflow_4(reg0, reg1, reg2, reg3, zero);
if (!overflow) {
overflow = k_check_epi32_overflow_4(reg4, reg5, reg6, reg7, zero);
if (!overflow) {
overflow = k_check_epi32_overflow_4(reg8, reg9, reg10, reg11, zero);
if (!overflow) {
overflow = k_check_epi32_overflow_4(reg12, reg13, reg14, reg15, zero);
}
}
}
return overflow;
}
static INLINE int k_check_epi32_overflow_32(__m128i reg0, __m128i reg1,
__m128i reg2, __m128i reg3, __m128i reg4, __m128i reg5,
__m128i reg6, __m128i reg7, __m128i reg8, __m128i reg9,
__m128i reg10, __m128i reg11, __m128i reg12, __m128i reg13,
__m128i reg14, __m128i reg15, __m128i reg16, __m128i reg17,
__m128i reg18, __m128i reg19, __m128i reg20, __m128i reg21,
__m128i reg22, __m128i reg23, __m128i reg24, __m128i reg25,
__m128i reg26, __m128i reg27, __m128i reg28, __m128i reg29,
__m128i reg30, __m128i reg31, const __m128i* zero) {
int overflow = k_check_epi32_overflow_4(reg0, reg1, reg2, reg3, zero);
if (!overflow) {
overflow = k_check_epi32_overflow_4(reg4, reg5, reg6, reg7, zero);
if (!overflow) {
overflow = k_check_epi32_overflow_4(reg8, reg9, reg10, reg11, zero);
if (!overflow) {
overflow = k_check_epi32_overflow_4(reg12, reg13, reg14, reg15, zero);
if (!overflow) {
overflow = k_check_epi32_overflow_4(reg16, reg17, reg18, reg19, zero);
if (!overflow) {
overflow = k_check_epi32_overflow_4(reg20, reg21,
reg22, reg23, zero);
if (!overflow) {
overflow = k_check_epi32_overflow_4(reg24, reg25,
reg26, reg27, zero);
if (!overflow) {
overflow = k_check_epi32_overflow_4(reg28, reg29,
reg30, reg31, zero);
}
}
}
}
}
}
}
return overflow;
}
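// Scalar sketch (an interpretation, not in the original header): the
// k_check_epi32_overflow_* helpers appear to treat each register as two
// 64-bit lanes (e.g. accumulators from k_madd_epi32) and flag lanes that no
// longer fit in a signed 32-bit value, so that a following k_packs_epi64 is
// lossless.
static INLINE int fits_in_int32_sketch(int64_t x) {
// Shift left once and inspect bits 62..31: all zeros means a small
// positive value, all ones a small negative one (bit 63 is dropped, as in
// the vector code above).
const int32_t top = (int32_t)(((uint64_t)x << 1) >> 32);
return top == 0 || top == -1;
}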
static INLINE void store_output(const __m128i output, tran_low_t* dst_ptr) {
#if CONFIG_VP9_HIGHBITDEPTH
const __m128i zero = _mm_setzero_si128();
const __m128i sign_bits = _mm_cmplt_epi16(output, zero);
__m128i out0 = _mm_unpacklo_epi16(output, sign_bits);
__m128i out1 = _mm_unpackhi_epi16(output, sign_bits);
_mm_store_si128((__m128i *)(dst_ptr), out0);
_mm_store_si128((__m128i *)(dst_ptr + 4), out1);
#else
_mm_store_si128((__m128i *)(dst_ptr), output);
#endif
}
static INLINE void storeu_output(const __m128i output, tran_low_t* dst_ptr) {
#if CONFIG_VP9_HIGHBITDEPTH
const __m128i zero = _mm_setzero_si128();
const __m128i sign_bits = _mm_cmplt_epi16(output, zero);
__m128i out0 = _mm_unpacklo_epi16(output, sign_bits);
__m128i out1 = _mm_unpackhi_epi16(output, sign_bits);
_mm_storeu_si128((__m128i *)(dst_ptr), out0);
_mm_storeu_si128((__m128i *)(dst_ptr + 4), out1);
#else
_mm_storeu_si128((__m128i *)(dst_ptr), output);
#endif
}
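// When CONFIG_VP9_HIGHBITDEPTH is enabled, tran_low_t is 32 bits wide, so
// the two store helpers above sign-extend eight int16 results into eight
// int32 slots by unpacking against the computed sign mask; otherwise a
// single 16-bit store suffices.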
static INLINE __m128i mult_round_shift(const __m128i in0, const __m128i in1,
const __m128i multiplier,
const __m128i rounding,
const int shift) {
const __m128i u0 = _mm_madd_epi16(in0, multiplier);
const __m128i u1 = _mm_madd_epi16(in1, multiplier);
const __m128i v0 = _mm_add_epi32(u0, rounding);
const __m128i v1 = _mm_add_epi32(u1, rounding);
const __m128i w0 = _mm_srai_epi32(v0, shift);
const __m128i w1 = _mm_srai_epi32(v1, shift);
return _mm_packs_epi32(w0, w1);
}
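// Scalar reference for one output lane of mult_round_shift (a sketch, not
// in the original header): _mm_madd_epi16 sums two adjacent int16 products,
// the rounding constant and arithmetic shift follow, and _mm_packs_epi32
// saturates the result back to int16.
static INLINE int16_t mult_round_shift_lane_sketch(int16_t a0, int16_t a1,
int16_t m0, int16_t m1, int32_t rounding, int shift) {
const int32_t v = (a0 * m0 + a1 * m1 + rounding) >> shift;
return (int16_t)(v > 32767 ? 32767 : (v < -32768 ? -32768 : v));
}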
static INLINE void transpose_and_output8x8(
const __m128i in00, const __m128i in01,
const __m128i in02, const __m128i in03,
const __m128i in04, const __m128i in05,
const __m128i in06, const __m128i in07,
const int pass, int16_t* out0_ptr,
tran_low_t* out1_ptr) {
// 00 01 02 03 04 05 06 07
// 10 11 12 13 14 15 16 17
// 20 21 22 23 24 25 26 27
// 30 31 32 33 34 35 36 37
// 40 41 42 43 44 45 46 47
// 50 51 52 53 54 55 56 57
// 60 61 62 63 64 65 66 67
// 70 71 72 73 74 75 76 77
const __m128i tr0_0 = _mm_unpacklo_epi16(in00, in01);
const __m128i tr0_1 = _mm_unpacklo_epi16(in02, in03);
const __m128i tr0_2 = _mm_unpackhi_epi16(in00, in01);
const __m128i tr0_3 = _mm_unpackhi_epi16(in02, in03);
const __m128i tr0_4 = _mm_unpacklo_epi16(in04, in05);
const __m128i tr0_5 = _mm_unpacklo_epi16(in06, in07);
const __m128i tr0_6 = _mm_unpackhi_epi16(in04, in05);
const __m128i tr0_7 = _mm_unpackhi_epi16(in06, in07);
// 00 10 01 11 02 12 03 13
// 20 30 21 31 22 32 23 33
// 04 14 05 15 06 16 07 17
// 24 34 25 35 26 36 27 37
// 40 50 41 51 42 52 43 53
// 60 70 61 71 62 72 63 73
// 44 54 45 55 46 56 47 57
// 64 74 65 75 66 76 67 77
const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
// 00 10 20 30 01 11 21 31
// 40 50 60 70 41 51 61 71
// 02 12 22 32 03 13 23 33
// 42 52 62 72 43 53 63 73
// 04 14 24 34 05 15 25 35
// 44 54 64 74 45 55 65 75
// 06 16 26 36 07 17 27 37
// 46 56 66 76 47 57 67 77
const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
// 00 10 20 30 40 50 60 70
// 01 11 21 31 41 51 61 71
// 02 12 22 32 42 52 62 72
// 03 13 23 33 43 53 63 73
// 04 14 24 34 44 54 64 74
// 05 15 25 35 45 55 65 75
// 06 16 26 36 46 56 66 76
// 07 17 27 37 47 57 67 77
if (pass == 0) {
_mm_storeu_si128((__m128i*)(out0_ptr + 0 * 16), tr2_0);
_mm_storeu_si128((__m128i*)(out0_ptr + 1 * 16), tr2_1);
_mm_storeu_si128((__m128i*)(out0_ptr + 2 * 16), tr2_2);
_mm_storeu_si128((__m128i*)(out0_ptr + 3 * 16), tr2_3);
_mm_storeu_si128((__m128i*)(out0_ptr + 4 * 16), tr2_4);
_mm_storeu_si128((__m128i*)(out0_ptr + 5 * 16), tr2_5);
_mm_storeu_si128((__m128i*)(out0_ptr + 6 * 16), tr2_6);
_mm_storeu_si128((__m128i*)(out0_ptr + 7 * 16), tr2_7);
} else {
storeu_output(tr2_0, (out1_ptr + 0 * 16));
storeu_output(tr2_1, (out1_ptr + 1 * 16));
storeu_output(tr2_2, (out1_ptr + 2 * 16));
storeu_output(tr2_3, (out1_ptr + 3 * 16));
storeu_output(tr2_4, (out1_ptr + 4 * 16));
storeu_output(tr2_5, (out1_ptr + 5 * 16));
storeu_output(tr2_6, (out1_ptr + 6 * 16));
storeu_output(tr2_7, (out1_ptr + 7 * 16));
}
}
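// Pass 0 stores the transposed rows to the int16_t intermediate buffer
// (row stride 16) for the column pass; pass 1 writes the final
// coefficients, widening to tran_low_t via storeu_output when
// CONFIG_VP9_HIGHBITDEPTH is enabled.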
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VP9_ENCODER_X86_VP9_DCT_SSE2_H_


@@ -0,0 +1,71 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <assert.h>
#include <emmintrin.h>
#include <stdio.h>
#include "vp9/common/vp9_common.h"
int64_t vp9_highbd_block_error_sse2(tran_low_t *coeff,
tran_low_t *dqcoeff, intptr_t block_size,
int64_t *ssz, int bps) {
int i, j, test;
uint32_t temp[4];
__m128i max, min, cmp0, cmp1, cmp2, cmp3;
int64_t error = 0, sqcoeff = 0;
int shift = 2 * (bps - 8);
int rounding = shift > 0 ? 1 << (shift - 1) : 0;
for (i = 0; i < block_size; i+=8) {
// Load the data into xmm registers
__m128i mm_coeff = _mm_load_si128((__m128i*) (coeff + i));
__m128i mm_coeff2 = _mm_load_si128((__m128i*) (coeff + i + 4));
__m128i mm_dqcoeff = _mm_load_si128((__m128i*) (dqcoeff + i));
__m128i mm_dqcoeff2 = _mm_load_si128((__m128i*) (dqcoeff + i + 4));
// Check if any values require more than 15 bit
max = _mm_set1_epi32(0x3fff);
min = _mm_set1_epi32(0xffffc000);
cmp0 = _mm_xor_si128(_mm_cmpgt_epi32(mm_coeff, max),
_mm_cmplt_epi32(mm_coeff, min));
cmp1 = _mm_xor_si128(_mm_cmpgt_epi32(mm_coeff2, max),
_mm_cmplt_epi32(mm_coeff2, min));
cmp2 = _mm_xor_si128(_mm_cmpgt_epi32(mm_dqcoeff, max),
_mm_cmplt_epi32(mm_dqcoeff, min));
cmp3 = _mm_xor_si128(_mm_cmpgt_epi32(mm_dqcoeff2, max),
_mm_cmplt_epi32(mm_dqcoeff2, min));
test = _mm_movemask_epi8(_mm_or_si128(_mm_or_si128(cmp0, cmp1),
_mm_or_si128(cmp2, cmp3)));
if (!test) {
__m128i mm_diff, error_sse2, sqcoeff_sse2;
mm_coeff = _mm_packs_epi32(mm_coeff, mm_coeff2);
mm_dqcoeff = _mm_packs_epi32(mm_dqcoeff, mm_dqcoeff2);
mm_diff = _mm_sub_epi16(mm_coeff, mm_dqcoeff);
error_sse2 = _mm_madd_epi16(mm_diff, mm_diff);
sqcoeff_sse2 = _mm_madd_epi16(mm_coeff, mm_coeff);
_mm_storeu_si128((__m128i*)temp, error_sse2);
error = error + temp[0] + temp[1] + temp[2] + temp[3];
_mm_storeu_si128((__m128i*)temp, sqcoeff_sse2);
sqcoeff += temp[0] + temp[1] + temp[2] + temp[3];
} else {
for (j = 0; j < 8; j++) {
const int64_t diff = coeff[i+j] - dqcoeff[i+j];
error += diff * diff;
sqcoeff += (int64_t)coeff[i+j] * (int64_t)coeff[i+j];
}
}
}
assert(error >= 0 && sqcoeff >= 0);
error = (error + rounding) >> shift;
sqcoeff = (sqcoeff + rounding) >> shift;
*ssz = sqcoeff;
return error;
}
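/* Scalar reference sketch (illustrative only; it mirrors the
 * per-coefficient C fallback above): both sums are accumulated at 64-bit
 * precision, then rescaled to 8-bit-equivalent units by
 * shift = 2 * (bps - 8). */
static int64_t highbd_block_error_ref_sketch(const tran_low_t *coeff,
const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bps) {
int i;
int64_t error = 0, sqcoeff = 0;
const int shift = 2 * (bps - 8);
const int rounding = shift > 0 ? 1 << (shift - 1) : 0;
for (i = 0; i < block_size; i++) {
const int64_t diff = coeff[i] - dqcoeff[i];
error += diff * diff;
sqcoeff += (int64_t)coeff[i] * coeff[i];
}
*ssz = (sqcoeff + rounding) >> shift;
return (error + rounding) >> shift;
}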


@@ -0,0 +1,173 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <emmintrin.h> // SSE2
#include "vp9/common/vp9_common.h"
#if CONFIG_VP9_HIGHBITDEPTH
// from vp9_idct.h: typedef int32_t tran_low_t;
void vp9_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count,
int skip_block,
const int16_t *zbin_ptr, const int16_t *round_ptr,
const int16_t *quant_ptr, const int16_t *quant_shift_ptr,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr,
int zbin_oq_value, uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan) {
int i, j, non_zero_regs = (int)count / 4, eob_i = -1;
__m128i zbins[2];
__m128i nzbins[2];
zbins[0] = _mm_set_epi32((int)(zbin_ptr[1] + zbin_oq_value),
(int)(zbin_ptr[1] + zbin_oq_value),
(int)(zbin_ptr[1] + zbin_oq_value),
(int)(zbin_ptr[0] + zbin_oq_value));
zbins[1] = _mm_set1_epi32((int)(zbin_ptr[1] + zbin_oq_value));
nzbins[0] = _mm_setzero_si128();
nzbins[1] = _mm_setzero_si128();
nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]);
nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]);
(void)scan;
vpx_memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr));
vpx_memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr));
if (!skip_block) {
// Pre-scan pass
for (i = ((int)count / 4) - 1; i >= 0; i--) {
__m128i coeffs, cmp1, cmp2;
int test;
coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
cmp1 = _mm_and_si128(cmp1, cmp2);
test = _mm_movemask_epi8(cmp1);
if (test == 0xffff)
non_zero_regs--;
else
break;
}
// Quantization pass:
for (i = 0; i < non_zero_regs; i++) {
__m128i coeffs, coeffs_sign, tmp1, tmp2;
int test;
int abs_coeff[4];
int coeff_sign[4];
coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
coeffs_sign = _mm_srai_epi32(coeffs, 31);
coeffs = _mm_sub_epi32(
_mm_xor_si128(coeffs, coeffs_sign), coeffs_sign);
tmp1 = _mm_cmpgt_epi32(coeffs, zbins[i != 0]);
tmp2 = _mm_cmpeq_epi32(coeffs, zbins[i != 0]);
tmp1 = _mm_or_si128(tmp1, tmp2);
test = _mm_movemask_epi8(tmp1);
_mm_storeu_si128((__m128i*)abs_coeff, coeffs);
_mm_storeu_si128((__m128i*)coeff_sign, coeffs_sign);
for (j = 0; j < 4; j++) {
if (test & (1 << (4*j))) {
int k = 4 * i + j;
int64_t tmp = clamp(abs_coeff[j] + round_ptr[k != 0],
INT32_MIN, INT32_MAX);
tmp = ((((tmp * quant_ptr[k != 0]) >> 16) + tmp) *
quant_shift_ptr[k != 0]) >> 16; // quantization
qcoeff_ptr[k] = (tmp ^ coeff_sign[j]) - coeff_sign[j];
dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0];
if (tmp)
eob_i = iscan[k] > eob_i ? iscan[k] : eob_i;
}
}
}
}
*eob_ptr = eob_i + 1;
}
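// Per-coefficient rule applied above, in scalar form (for illustration):
//   a       = clamp(abs(coeff) + round, INT32_MIN, INT32_MAX)
//   q       = ((((a * quant) >> 16) + a) * quant_shift) >> 16
//   qcoeff  = copysign(q, coeff)
//   dqcoeff = qcoeff * dequant
// where round/quant/quant_shift/dequant use their DC entry for k == 0 and
// the AC entry otherwise, and eob tracks the last nonzero q in scan order.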
void vp9_highbd_quantize_b_32x32_sse2(const tran_low_t *coeff_ptr,
intptr_t n_coeffs, int skip_block,
const int16_t *zbin_ptr,
const int16_t *round_ptr,
const int16_t *quant_ptr,
const int16_t *quant_shift_ptr,
tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr,
int zbin_oq_value, uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan) {
__m128i zbins[2];
__m128i nzbins[2];
int idx = 0;
int idx_arr[1024];
int i, eob = -1;
const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0] + zbin_oq_value, 1);
const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1] + zbin_oq_value, 1);
(void)scan;
// zbin_oq_value is already folded into zbin{0,1}_tmp above.
zbins[0] = _mm_set_epi32(zbin1_tmp, zbin1_tmp, zbin1_tmp, zbin0_tmp);
zbins[1] = _mm_set1_epi32(zbin1_tmp);
nzbins[0] = _mm_setzero_si128();
nzbins[1] = _mm_setzero_si128();
nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]);
nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]);
vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
vpx_memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
if (!skip_block) {
// Pre-scan pass
for (i = 0; i < n_coeffs / 4; i++) {
__m128i coeffs, cmp1, cmp2;
int test;
coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
cmp1 = _mm_and_si128(cmp1, cmp2);
test = _mm_movemask_epi8(cmp1);
if (!(test & 0xf))
idx_arr[idx++] = i*4;
if (!(test & 0xf0))
idx_arr[idx++] = i*4 + 1;
if (!(test & 0xf00))
idx_arr[idx++] = i*4 + 2;
if (!(test & 0xf000))
idx_arr[idx++] = i*4 + 3;
}
// Quantization pass: only process the coefficients selected in
// pre-scan pass. Note: idx can be zero.
for (i = 0; i < idx; i++) {
const int rc = idx_arr[i];
const int coeff = coeff_ptr[rc];
const int coeff_sign = (coeff >> 31);
int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
int64_t tmp = clamp(abs_coeff +
ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1),
INT32_MIN, INT32_MAX);
tmp = ((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
quant_shift_ptr[rc != 0]) >> 15;
qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
if (tmp)
eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob;
}
}
*eob_ptr = eob + 1;
}
#endif


@@ -0,0 +1,284 @@
;
; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%include "third_party/x86inc/x86inc.asm"
SECTION .text
; HIGH_PROCESS_4x2x4 first, off_{first,second}_{src,ref}, advance_at_end
%macro HIGH_PROCESS_4x2x4 5-6 0
movh m0, [srcq +%2*2]
%if %1 == 1
movu m4, [ref1q+%3*2]
movu m5, [ref2q+%3*2]
movu m6, [ref3q+%3*2]
movu m7, [ref4q+%3*2]
movhps m0, [srcq +%4*2]
movhps m4, [ref1q+%5*2]
movhps m5, [ref2q+%5*2]
movhps m6, [ref3q+%5*2]
movhps m7, [ref4q+%5*2]
mova m3, m0
mova m2, m0
psubusw m3, m4
psubusw m2, m5
psubusw m4, m0
psubusw m5, m0
por m4, m3
por m5, m2
pmaddwd m4, m1
pmaddwd m5, m1
mova m3, m0
mova m2, m0
psubusw m3, m6
psubusw m2, m7
psubusw m6, m0
psubusw m7, m0
por m6, m3
por m7, m2
pmaddwd m6, m1
pmaddwd m7, m1
%else
movu m2, [ref1q+%3*2]
movhps m0, [srcq +%4*2]
movhps m2, [ref1q+%5*2]
mova m3, m0
psubusw m3, m2
psubusw m2, m0
por m2, m3
pmaddwd m2, m1
paddd m4, m2
movu m2, [ref2q+%3*2]
mova m3, m0
movhps m2, [ref2q+%5*2]
psubusw m3, m2
psubusw m2, m0
por m2, m3
pmaddwd m2, m1
paddd m5, m2
movu m2, [ref3q+%3*2]
mova m3, m0
movhps m2, [ref3q+%5*2]
psubusw m3, m2
psubusw m2, m0
por m2, m3
pmaddwd m2, m1
paddd m6, m2
movu m2, [ref4q+%3*2]
mova m3, m0
movhps m2, [ref4q+%5*2]
psubusw m3, m2
psubusw m2, m0
por m2, m3
pmaddwd m2, m1
paddd m7, m2
%endif
%if %6 == 1
lea srcq, [srcq +src_strideq*4]
lea ref1q, [ref1q+ref_strideq*4]
lea ref2q, [ref2q+ref_strideq*4]
lea ref3q, [ref3q+ref_strideq*4]
lea ref4q, [ref4q+ref_strideq*4]
%endif
%endmacro
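; Note on the idiom above: psubusw saturates negative results to zero, so
; (a -us b) | (b -us a) yields |a - b| for unsigned words without a
; dedicated absolute-difference instruction (psadbw only handles bytes).
; pmaddwd against the all-ones words in m1 then widens and pairwise-adds
; the 16-bit differences into the 32-bit SAD accumulators m4-m7.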
; HIGH_PROCESS_8x2x4 first, off_{first,second}_{src,ref}, advance_at_end
%macro HIGH_PROCESS_8x2x4 5-6 0
; 1st 8 px
mova m0, [srcq +%2*2]
%if %1 == 1
movu m4, [ref1q+%3*2]
movu m5, [ref2q+%3*2]
movu m6, [ref3q+%3*2]
movu m7, [ref4q+%3*2]
mova m3, m0
mova m2, m0
psubusw m3, m4
psubusw m2, m5
psubusw m4, m0
psubusw m5, m0
por m4, m3
por m5, m2
pmaddwd m4, m1
pmaddwd m5, m1
mova m3, m0
mova m2, m0
psubusw m3, m6
psubusw m2, m7
psubusw m6, m0
psubusw m7, m0
por m6, m3
por m7, m2
pmaddwd m6, m1
pmaddwd m7, m1
%else
mova m3, m0
movu m2, [ref1q+%3*2]
psubusw m3, m2
psubusw m2, m0
por m2, m3
mova m3, m0
pmaddwd m2, m1
paddd m4, m2
movu m2, [ref2q+%3*2]
psubusw m3, m2
psubusw m2, m0
por m2, m3
mova m3, m0
pmaddwd m2, m1
paddd m5, m2
movu m2, [ref3q+%3*2]
psubusw m3, m2
psubusw m2, m0
por m2, m3
mova m3, m0
pmaddwd m2, m1
paddd m6, m2
movu m2, [ref4q+%3*2]
psubusw m3, m2
psubusw m2, m0
por m2, m3
pmaddwd m2, m1
paddd m7, m2
%endif
; 2nd 8 px
mova m0, [srcq +(%4)*2]
mova m3, m0
movu m2, [ref1q+(%5)*2]
psubusw m3, m2
psubusw m2, m0
por m2, m3
mova m3, m0
pmaddwd m2, m1
paddd m4, m2
movu m2, [ref2q+(%5)*2]
psubusw m3, m2
psubusw m2, m0
por m2, m3
mova m3, m0
pmaddwd m2, m1
paddd m5, m2
movu m2, [ref3q+(%5)*2]
psubusw m3, m2
psubusw m2, m0
por m2, m3
mova m3, m0
pmaddwd m2, m1
paddd m6, m2
movu m2, [ref4q+(%5)*2]
psubusw m3, m2
psubusw m2, m0
%if %6 == 1
lea srcq, [srcq +src_strideq*4]
lea ref1q, [ref1q+ref_strideq*4]
lea ref2q, [ref2q+ref_strideq*4]
lea ref3q, [ref3q+ref_strideq*4]
lea ref4q, [ref4q+ref_strideq*4]
%endif
por m2, m3
pmaddwd m2, m1
paddd m7, m2
%endmacro
; HIGH_PROCESS_16x2x4 first, off_{first,second}_{src,ref}, advance_at_end
%macro HIGH_PROCESS_16x2x4 5-6 0
HIGH_PROCESS_8x2x4 %1, %2, %3, (%2 + 8), (%3 + 8)
HIGH_PROCESS_8x2x4 0, %4, %5, (%4 + 8), (%5 + 8), %6
%endmacro
; HIGH_PROCESS_32x2x4 first, off_{first,second}_{src,ref}, advance_at_end
%macro HIGH_PROCESS_32x2x4 5-6 0
HIGH_PROCESS_16x2x4 %1, %2, %3, (%2 + 16), (%3 + 16)
HIGH_PROCESS_16x2x4 0, %4, %5, (%4 + 16), (%5 + 16), %6
%endmacro
; HIGH_PROCESS_64x2x4 first, off_{first,second}_{src,ref}, advance_at_end
%macro HIGH_PROCESS_64x2x4 5-6 0
HIGH_PROCESS_32x2x4 %1, %2, %3, (%2 + 32), (%3 + 32)
HIGH_PROCESS_32x2x4 0, %4, %5, (%4 + 32), (%5 + 32), %6
%endmacro
; void vp9_highbd_sadNxNx4d_sse2(uint8_t *src, int src_stride,
; uint8_t *ref[4], int ref_stride,
; unsigned int res[4]);
; where NxN = 64x64, 64x32, 32x64, 32x32, 32x16, 16x32, 16x16, 16x8,
; 8x16, 8x8, 8x4, 4x8 or 4x4
%macro HIGH_SADNXN4D 2
%if UNIX64
cglobal highbd_sad%1x%2x4d, 5, 9, 8, src, src_stride, ref1, ref_stride, \
res, ref2, ref3, ref4, one
%else
cglobal highbd_sad%1x%2x4d, 4, 8, 8, src, src_stride, ref1, ref_stride, \
ref2, ref3, ref4, one
%endif
movsxdifnidn src_strideq, src_strided
movsxdifnidn ref_strideq, ref_strided
mov ref2q, [ref1q+gprsize*1]
mov ref3q, [ref1q+gprsize*2]
mov ref4q, [ref1q+gprsize*3]
mov ref1q, [ref1q+gprsize*0]
; convert byte pointers to short pointers
shl srcq, 1
shl ref2q, 1
shl ref3q, 1
shl ref4q, 1
shl ref1q, 1
mov oned, 0x00010001
movd m1, oned
pshufd m1, m1, 0x0
HIGH_PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1
%rep (%2-4)/2
HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1
%endrep
HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0
; N.B. HIGH_PROCESS outputs dwords (32 bits)
; so in high bit depth even the smallest width (4) needs 128bits i.e. XMM
movhlps m0, m4
movhlps m1, m5
movhlps m2, m6
movhlps m3, m7
paddd m4, m0
paddd m5, m1
paddd m6, m2
paddd m7, m3
punpckldq m4, m5
punpckldq m6, m7
movhlps m0, m4
movhlps m1, m6
paddd m4, m0
paddd m6, m1
punpcklqdq m4, m6
movifnidn r4, r4mp
movu [r4], m4
RET
%endmacro
INIT_XMM sse2
HIGH_SADNXN4D 64, 64
HIGH_SADNXN4D 64, 32
HIGH_SADNXN4D 32, 64
HIGH_SADNXN4D 32, 32
HIGH_SADNXN4D 32, 16
HIGH_SADNXN4D 16, 32
HIGH_SADNXN4D 16, 16
HIGH_SADNXN4D 16, 8
HIGH_SADNXN4D 8, 16
HIGH_SADNXN4D 8, 8
HIGH_SADNXN4D 8, 4
HIGH_SADNXN4D 4, 8
HIGH_SADNXN4D 4, 4


@@ -0,0 +1,363 @@
;
; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%include "third_party/x86inc/x86inc.asm"
SECTION .text
%macro HIGH_SAD_FN 4
%if %4 == 0
%if %3 == 5
cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, n_rows
%else ; %3 == 7
cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, \
src_stride3, ref_stride3, n_rows
%endif ; %3 == 5/7
%else ; avg
%if %3 == 5
cglobal highbd_sad%1x%2_avg, 5, 1 + %3, 7, src, src_stride, ref, ref_stride, \
second_pred, n_rows
%else ; %3 == 7
cglobal highbd_sad%1x%2_avg, 5, ARCH_X86_64 + %3, 7, src, src_stride, \
ref, ref_stride, \
second_pred, \
src_stride3, ref_stride3
%if ARCH_X86_64
%define n_rowsd r7d
%else ; x86-32
%define n_rowsd dword r0m
%endif ; x86-32/64
%endif ; %3 == 5/7
%endif ; avg/sad
movsxdifnidn src_strideq, src_strided
movsxdifnidn ref_strideq, ref_strided
%if %3 == 7
lea src_stride3q, [src_strideq*3]
lea ref_stride3q, [ref_strideq*3]
%endif ; %3 == 7
; convert src, ref & second_pred to short ptrs (from byte ptrs)
shl srcq, 1
shl refq, 1
%if %4 == 1
shl second_predq, 1
%endif
%endmacro
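; Note: the kernels below operate on 16-bit samples, so HIGH_SAD_FN doubles
; the incoming pointers (shl by 1) and the loops advance by strideq*2; the
; literal offsets ([refq+16] etc.) are byte offsets covering 8 samples.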
; unsigned int vp9_highbd_sad64x{32,64}_sse2(uint8_t *src, int src_stride,
; uint8_t *ref, int ref_stride);
%macro HIGH_SAD64XN 1-2 0
HIGH_SAD_FN 64, %1, 5, %2
mov n_rowsd, %1
pxor m0, m0
pxor m6, m6
.loop:
; first half of each row
movu m1, [refq]
movu m2, [refq+16]
movu m3, [refq+32]
movu m4, [refq+48]
%if %2 == 1
pavgw m1, [second_predq+mmsize*0]
pavgw m2, [second_predq+mmsize*1]
pavgw m3, [second_predq+mmsize*2]
pavgw m4, [second_predq+mmsize*3]
lea second_predq, [second_predq+mmsize*4]
%endif
mova m5, [srcq]
psubusw m5, m1
psubusw m1, [srcq]
por m1, m5
mova m5, [srcq+16]
psubusw m5, m2
psubusw m2, [srcq+16]
por m2, m5
mova m5, [srcq+32]
psubusw m5, m3
psubusw m3, [srcq+32]
por m3, m5
mova m5, [srcq+48]
psubusw m5, m4
psubusw m4, [srcq+48]
por m4, m5
paddw m1, m2
paddw m3, m4
movhlps m2, m1
movhlps m4, m3
paddw m1, m2
paddw m3, m4
punpcklwd m1, m6
punpcklwd m3, m6
paddd m0, m1
paddd m0, m3
; second half of each row
movu m1, [refq+64]
movu m2, [refq+80]
movu m3, [refq+96]
movu m4, [refq+112]
%if %2 == 1
pavgw m1, [second_predq+mmsize*0]
pavgw m2, [second_predq+mmsize*1]
pavgw m3, [second_predq+mmsize*2]
pavgw m4, [second_predq+mmsize*3]
lea second_predq, [second_predq+mmsize*4]
%endif
mova m5, [srcq+64]
psubusw m5, m1
psubusw m1, [srcq+64]
por m1, m5
mova m5, [srcq+80]
psubusw m5, m2
psubusw m2, [srcq+80]
por m2, m5
mova m5, [srcq+96]
psubusw m5, m3
psubusw m3, [srcq+96]
por m3, m5
mova m5, [srcq+112]
psubusw m5, m4
psubusw m4, [srcq+112]
por m4, m5
paddw m1, m2
paddw m3, m4
movhlps m2, m1
movhlps m4, m3
paddw m1, m2
paddw m3, m4
punpcklwd m1, m6
punpcklwd m3, m6
lea refq, [refq+ref_strideq*2]
paddd m0, m1
lea srcq, [srcq+src_strideq*2]
paddd m0, m3
dec n_rowsd
jg .loop
movhlps m1, m0
paddd m0, m1
punpckldq m0, m6
movhlps m1, m0
paddd m0, m1
movd eax, m0
RET
%endmacro
INIT_XMM sse2
HIGH_SAD64XN 64 ; highbd_sad64x64_sse2
HIGH_SAD64XN 32 ; highbd_sad64x32_sse2
HIGH_SAD64XN 64, 1 ; highbd_sad64x64_avg_sse2
HIGH_SAD64XN 32, 1 ; highbd_sad64x32_avg_sse2
; unsigned int vp9_highbd_sad32x{16,32,64}_sse2(uint8_t *src, int src_stride,
; uint8_t *ref, int ref_stride);
%macro HIGH_SAD32XN 1-2 0
HIGH_SAD_FN 32, %1, 5, %2
mov n_rowsd, %1
pxor m0, m0
pxor m6, m6
.loop:
movu m1, [refq]
movu m2, [refq+16]
movu m3, [refq+32]
movu m4, [refq+48]
%if %2 == 1
pavgw m1, [second_predq+mmsize*0]
pavgw m2, [second_predq+mmsize*1]
pavgw m3, [second_predq+mmsize*2]
pavgw m4, [second_predq+mmsize*3]
lea second_predq, [second_predq+mmsize*4]
%endif
mova m5, [srcq]
psubusw m5, m1
psubusw m1, [srcq]
por m1, m5
mova m5, [srcq+16]
psubusw m5, m2
psubusw m2, [srcq+16]
por m2, m5
mova m5, [srcq+32]
psubusw m5, m3
psubusw m3, [srcq+32]
por m3, m5
mova m5, [srcq+48]
psubusw m5, m4
psubusw m4, [srcq+48]
por m4, m5
paddw m1, m2
paddw m3, m4
movhlps m2, m1
movhlps m4, m3
paddw m1, m2
paddw m3, m4
punpcklwd m1, m6
punpcklwd m3, m6
lea refq, [refq+ref_strideq*2]
paddd m0, m1
lea srcq, [srcq+src_strideq*2]
paddd m0, m3
dec n_rowsd
jg .loop
movhlps m1, m0
paddd m0, m1
punpckldq m0, m6
movhlps m1, m0
paddd m0, m1
movd eax, m0
RET
%endmacro
INIT_XMM sse2
HIGH_SAD32XN 64 ; highbd_sad32x64_sse2
HIGH_SAD32XN 32 ; highbd_sad32x32_sse2
HIGH_SAD32XN 16 ; highbd_sad32x16_sse2
HIGH_SAD32XN 64, 1 ; highbd_sad32x64_avg_sse2
HIGH_SAD32XN 32, 1 ; highbd_sad32x32_avg_sse2
HIGH_SAD32XN 16, 1 ; highbd_sad32x16_avg_sse2
; unsigned int vp9_highbd_sad16x{8,16,32}_sse2(uint8_t *src, int src_stride,
; uint8_t *ref, int ref_stride);
%macro HIGH_SAD16XN 1-2 0
HIGH_SAD_FN 16, %1, 5, %2
mov n_rowsd, %1/2
pxor m0, m0
pxor m6, m6
.loop:
movu m1, [refq]
movu m2, [refq+16]
movu m3, [refq+ref_strideq*2]
movu m4, [refq+ref_strideq*2+16]
%if %2 == 1
pavgw m1, [second_predq+mmsize*0]
pavgw m2, [second_predq+16]
pavgw m3, [second_predq+mmsize*2]
pavgw m4, [second_predq+mmsize*2+16]
lea second_predq, [second_predq+mmsize*4]
%endif
mova m5, [srcq]
psubusw m5, m1
psubusw m1, [srcq]
por m1, m5
mova m5, [srcq+16]
psubusw m5, m2
psubusw m2, [srcq+16]
por m2, m5
mova m5, [srcq+src_strideq*2]
psubusw m5, m3
psubusw m3, [srcq+src_strideq*2]
por m3, m5
mova m5, [srcq+src_strideq*2+16]
psubusw m5, m4
psubusw m4, [srcq+src_strideq*2+16]
por m4, m5
paddw m1, m2
paddw m3, m4
movhlps m2, m1
movhlps m4, m3
paddw m1, m2
paddw m3, m4
punpcklwd m1, m6
punpcklwd m3, m6
lea refq, [refq+ref_strideq*4]
paddd m0, m1
lea srcq, [srcq+src_strideq*4]
paddd m0, m3
dec n_rowsd
jg .loop
movhlps m1, m0
paddd m0, m1
punpckldq m0, m6
movhlps m1, m0
paddd m0, m1
movd eax, m0
RET
%endmacro
INIT_XMM sse2
HIGH_SAD16XN 32 ; highbd_sad16x32_sse2
HIGH_SAD16XN 16 ; highbd_sad16x16_sse2
HIGH_SAD16XN 8 ; highbd_sad16x8_sse2
HIGH_SAD16XN 32, 1 ; highbd_sad16x32_avg_sse2
HIGH_SAD16XN 16, 1 ; highbd_sad16x16_avg_sse2
HIGH_SAD16XN 8, 1 ; highbd_sad16x8_avg_sse2
; unsigned int vp9_highbd_sad8x{4,8,16}_sse2(uint8_t *src, int src_stride,
; uint8_t *ref, int ref_stride);
%macro HIGH_SAD8XN 1-2 0
HIGH_SAD_FN 8, %1, 7, %2
mov n_rowsd, %1/4
pxor m0, m0
pxor m6, m6
.loop:
movu m1, [refq]
movu m2, [refq+ref_strideq*2]
movu m3, [refq+ref_strideq*4]
movu m4, [refq+ref_stride3q*2]
%if %2 == 1
pavgw m1, [second_predq+mmsize*0]
pavgw m2, [second_predq+mmsize*1]
pavgw m3, [second_predq+mmsize*2]
pavgw m4, [second_predq+mmsize*3]
lea second_predq, [second_predq+mmsize*4]
%endif
mova m5, [srcq]
psubusw m5, m1
psubusw m1, [srcq]
por m1, m5
mova m5, [srcq+src_strideq*2]
psubusw m5, m2
psubusw m2, [srcq+src_strideq*2]
por m2, m5
mova m5, [srcq+src_strideq*4]
psubusw m5, m3
psubusw m3, [srcq+src_strideq*4]
por m3, m5
mova m5, [srcq+src_stride3q*2]
psubusw m5, m4
psubusw m4, [srcq+src_stride3q*2]
por m4, m5
paddw m1, m2
paddw m3, m4
movhlps m2, m1
movhlps m4, m3
paddw m1, m2
paddw m3, m4
punpcklwd m1, m6
punpcklwd m3, m6
lea refq, [refq+ref_strideq*8]
paddd m0, m1
lea srcq, [srcq+src_strideq*8]
paddd m0, m3
dec n_rowsd
jg .loop
movhlps m1, m0
paddd m0, m1
punpckldq m0, m6
movhlps m1, m0
paddd m0, m1
movd eax, m0
RET
%endmacro
INIT_XMM sse2
HIGH_SAD8XN 16 ; highbd_sad8x16_sse2
HIGH_SAD8XN 8 ; highbd_sad8x8_sse2
HIGH_SAD8XN 4 ; highbd_sad8x4_sse2
HIGH_SAD8XN 16, 1 ; highbd_sad8x16_avg_sse2
HIGH_SAD8XN 8, 1 ; highbd_sad8x8_avg_sse2
HIGH_SAD8XN 4, 1 ; highbd_sad8x4_avg_sse2

File diff suppressed because it is too large


@@ -0,0 +1,313 @@
;
; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"
;unsigned int vp9_highbd_calc16x16var_sse2
;(
; uint16_t *src_ptr,
; int source_stride,
; uint16_t *ref_ptr,
; int recon_stride,
; unsigned int *SSE,
; int *Sum
;)
global sym(vp9_highbd_calc16x16var_sse2) PRIVATE
sym(vp9_highbd_calc16x16var_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
SAVE_XMM 7
push rbx
push rsi
push rdi
; end prolog
mov rsi, arg(0) ;[src_ptr]
mov rdi, arg(2) ;[ref_ptr]
movsxd rax, DWORD PTR arg(1) ;[source_stride]
movsxd rdx, DWORD PTR arg(3) ;[recon_stride]
add rax, rax ; source stride in bytes
add rdx, rdx ; recon stride in bytes
; Prefetch data
prefetcht0 [rsi]
prefetcht0 [rsi+16]
prefetcht0 [rsi+rax]
prefetcht0 [rsi+rax+16]
lea rbx, [rsi+rax*2]
prefetcht0 [rbx]
prefetcht0 [rbx+16]
prefetcht0 [rbx+rax]
prefetcht0 [rbx+rax+16]
prefetcht0 [rdi]
prefetcht0 [rdi+16]
prefetcht0 [rdi+rdx]
prefetcht0 [rdi+rdx+16]
lea rbx, [rdi+rdx*2]
prefetcht0 [rbx]
prefetcht0 [rbx+16]
prefetcht0 [rbx+rdx]
prefetcht0 [rbx+rdx+16]
pxor xmm0, xmm0 ; clear xmm0 for unpack
pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs
pxor xmm6, xmm6 ; clear xmm6 for accumulating sse
mov rcx, 16
.var16loop:
movdqu xmm1, XMMWORD PTR [rsi]
movdqu xmm2, XMMWORD PTR [rdi]
lea rbx, [rsi+rax*2]
prefetcht0 [rbx]
prefetcht0 [rbx+16]
prefetcht0 [rbx+rax]
prefetcht0 [rbx+rax+16]
lea rbx, [rdi+rdx*2]
prefetcht0 [rbx]
prefetcht0 [rbx+16]
prefetcht0 [rbx+rdx]
prefetcht0 [rbx+rdx+16]
pxor xmm5, xmm5
psubw xmm1, xmm2
movdqu xmm3, XMMWORD PTR [rsi+16]
paddw xmm5, xmm1
pmaddwd xmm1, xmm1
movdqu xmm2, XMMWORD PTR [rdi+16]
paddd xmm6, xmm1
psubw xmm3, xmm2
movdqu xmm1, XMMWORD PTR [rsi+rax]
paddw xmm5, xmm3
pmaddwd xmm3, xmm3
movdqu xmm2, XMMWORD PTR [rdi+rdx]
paddd xmm6, xmm3
psubw xmm1, xmm2
movdqu xmm3, XMMWORD PTR [rsi+rax+16]
paddw xmm5, xmm1
pmaddwd xmm1, xmm1
movdqu xmm2, XMMWORD PTR [rdi+rdx+16]
paddd xmm6, xmm1
psubw xmm3, xmm2
paddw xmm5, xmm3
pmaddwd xmm3, xmm3
paddd xmm6, xmm3
movdqa xmm1, xmm5
movdqa xmm2, xmm5
pcmpgtw xmm1, xmm0
pcmpeqw xmm2, xmm0
por xmm1, xmm2
pcmpeqw xmm1, xmm0
movdqa xmm2, xmm5
punpcklwd xmm5, xmm1
punpckhwd xmm2, xmm1
paddd xmm7, xmm5
paddd xmm7, xmm2
lea rsi, [rsi + 2*rax]
lea rdi, [rdi + 2*rdx]
sub rcx, 2
jnz .var16loop
movdqa xmm4, xmm6
punpckldq xmm6, xmm0
punpckhdq xmm4, xmm0
movdqa xmm5, xmm7
paddd xmm6, xmm4
punpckldq xmm7, xmm0
punpckhdq xmm5, xmm0
paddd xmm7, xmm5
movdqa xmm4, xmm6
movdqa xmm5, xmm7
psrldq xmm4, 8
psrldq xmm5, 8
paddd xmm6, xmm4
paddd xmm7, xmm5
mov rdi, arg(4) ; [SSE]
mov rax, arg(5) ; [Sum]
movd DWORD PTR [rdi], xmm6
movd DWORD PTR [rax], xmm7
; begin epilog
pop rdi
pop rsi
pop rbx
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
;unsigned int vp9_highbd_calc8x8var_sse2
;(
; uint16_t *src_ptr,
; int source_stride,
; uint16_t *ref_ptr,
; int recon_stride,
; unsigned int *SSE,
; int *Sum
;)
global sym(vp9_highbd_calc8x8var_sse2) PRIVATE
sym(vp9_highbd_calc8x8var_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
SAVE_XMM 7
push rbx
push rsi
push rdi
; end prolog
mov rsi, arg(0) ;[src_ptr]
mov rdi, arg(2) ;[ref_ptr]
movsxd rax, DWORD PTR arg(1) ;[source_stride]
movsxd rdx, DWORD PTR arg(3) ;[recon_stride]
add rax, rax ; source stride in bytes
add rdx, rdx ; recon stride in bytes
; Prefetch data
prefetcht0 [rsi]
prefetcht0 [rsi+rax]
lea rbx, [rsi+rax*2]
prefetcht0 [rbx]
prefetcht0 [rbx+rax]
prefetcht0 [rdi]
prefetcht0 [rdi+rdx]
lea rbx, [rdi+rdx*2]
prefetcht0 [rbx]
prefetcht0 [rbx+rdx]
pxor xmm0, xmm0 ; clear xmm0 for unpack
pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs
pxor xmm6, xmm6 ; clear xmm6 for accumulating sse
mov rcx, 8
.var8loop:
movdqu xmm1, XMMWORD PTR [rsi]
movdqu xmm2, XMMWORD PTR [rdi]
lea rbx, [rsi+rax*4]
prefetcht0 [rbx]
prefetcht0 [rbx+rax]
lea rbx, [rbx+rax*2]
prefetcht0 [rbx]
prefetcht0 [rbx+rax]
lea rbx, [rdi+rdx*4]
prefetcht0 [rbx]
prefetcht0 [rbx+rdx]
lea rbx, [rbx+rdx*2]
prefetcht0 [rbx]
prefetcht0 [rbx+rdx]
pxor xmm5, xmm5
psubw xmm1, xmm2
movdqu xmm3, XMMWORD PTR [rsi+rax]
paddw xmm5, xmm1
pmaddwd xmm1, xmm1
movdqu xmm2, XMMWORD PTR [rdi+rdx]
paddd xmm6, xmm1
lea rsi, [rsi + 2*rax]
lea rdi, [rdi + 2*rdx]
psubw xmm3, xmm2
movdqu xmm1, XMMWORD PTR [rsi]
paddw xmm5, xmm3
pmaddwd xmm3, xmm3
movdqu xmm2, XMMWORD PTR [rdi]
paddd xmm6, xmm3
psubw xmm1, xmm2
movdqu xmm3, XMMWORD PTR [rsi+rax]
paddw xmm5, xmm1
pmaddwd xmm1, xmm1
movdqu xmm2, XMMWORD PTR [rdi+rdx]
paddd xmm6, xmm1
psubw xmm3, xmm2
paddw xmm5, xmm3
pmaddwd xmm3, xmm3
paddd xmm6, xmm3
movdqa xmm1, xmm5
movdqa xmm2, xmm5
pcmpgtw xmm1, xmm0
pcmpeqw xmm2, xmm0
por xmm1, xmm2
pcmpeqw xmm1, xmm0
movdqa xmm2, xmm5
punpcklwd xmm5, xmm1
punpckhwd xmm2, xmm1
paddd xmm7, xmm5
paddd xmm7, xmm2
lea rsi, [rsi + 2*rax]
lea rdi, [rdi + 2*rdx]
sub rcx, 4
jnz .var8loop
movdqa xmm4, xmm6
punpckldq xmm6, xmm0
punpckhdq xmm4, xmm0
movdqa xmm5, xmm7
paddd xmm6, xmm4
punpckldq xmm7, xmm0
punpckhdq xmm5, xmm0
paddd xmm7, xmm5
movdqa xmm4, xmm6
movdqa xmm5, xmm7
psrldq xmm4, 8
psrldq xmm5, 8
paddd xmm6, xmm4
paddd xmm7, xmm5
mov rdi, arg(4) ; [SSE]
mov rax, arg(5) ; [Sum]
movd DWORD PTR [rdi], xmm6
movd DWORD PTR [rax], xmm7
; begin epilog
pop rdi
pop rsi
pop rbx
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret


@@ -0,0 +1,613 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "./vpx_config.h"
#include "vp9/common/vp9_common.h"
#include "vp9/encoder/vp9_variance.h"
#include "vpx_ports/mem.h"
typedef unsigned int (*high_variance_fn_t) (const uint16_t *src, int src_stride,
const uint16_t *ref, int ref_stride,
unsigned int *sse, int *sum);
unsigned int vp9_highbd_calc8x8var_sse2(const uint16_t *src, int src_stride,
const uint16_t *ref, int ref_stride,
unsigned int *sse, int *sum);
unsigned int vp9_highbd_calc16x16var_sse2(const uint16_t *src, int src_stride,
const uint16_t *ref, int ref_stride,
unsigned int *sse, int *sum);
static void highbd_variance_sse2(const uint16_t *src, int src_stride,
const uint16_t *ref, int ref_stride,
int w, int h, unsigned int *sse, int *sum,
high_variance_fn_t var_fn, int block_size) {
int i, j;
*sse = 0;
*sum = 0;
for (i = 0; i < h; i += block_size) {
for (j = 0; j < w; j += block_size) {
unsigned int sse0;
int sum0;
var_fn(src + src_stride * i + j, src_stride,
ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
*sse += sse0;
*sum += sum0;
}
}
}
static void highbd_10_variance_sse2(const uint16_t *src, int src_stride,
const uint16_t *ref, int ref_stride,
int w, int h, unsigned int *sse, int *sum,
high_variance_fn_t var_fn, int block_size) {
int i, j;
uint64_t sse_long = 0;
int64_t sum_long = 0;
for (i = 0; i < h; i += block_size) {
for (j = 0; j < w; j += block_size) {
unsigned int sse0;
int sum0;
var_fn(src + src_stride * i + j, src_stride,
ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
sse_long += sse0;
sum_long += sum0;
}
}
*sum = ROUND_POWER_OF_TWO(sum_long, 2);
*sse = ROUND_POWER_OF_TWO(sse_long, 4);
}
static void highbd_12_variance_sse2(const uint16_t *src, int src_stride,
const uint16_t *ref, int ref_stride,
int w, int h, unsigned int *sse, int *sum,
high_variance_fn_t var_fn, int block_size) {
int i, j;
uint64_t sse_long = 0;
int64_t sum_long = 0;
for (i = 0; i < h; i += block_size) {
for (j = 0; j < w; j += block_size) {
unsigned int sse0;
int sum0;
var_fn(src + src_stride * i + j, src_stride,
ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
sse_long += sse0;
sum_long += sum0;
}
}
*sum = ROUND_POWER_OF_TWO(sum_long, 4);
*sse = ROUND_POWER_OF_TWO(sse_long, 8);
}
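/* The 10- and 12-bit variants above accumulate sse/sum at 64-bit precision
 * and rescale to 8-bit-equivalent units: sum scales by 2^(bd-8) and sse by
 * 2^(2*(bd-8)), hence rounding shifts of (2, 4) for 10-bit and (4, 8) for
 * 12-bit input. */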
#define HIGH_GET_VAR(S) \
void vp9_highbd_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
const uint8_t *ref8, int ref_stride, \
unsigned int *sse, int *sum) { \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
vp9_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \
sse, sum); \
} \
\
void vp9_highbd_10_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
const uint8_t *ref8, int ref_stride, \
unsigned int *sse, int *sum) { \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
vp9_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \
sse, sum); \
*sum = ROUND_POWER_OF_TWO(*sum, 2); \
*sse = ROUND_POWER_OF_TWO(*sse, 4); \
} \
\
void vp9_highbd_12_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
const uint8_t *ref8, int ref_stride, \
unsigned int *sse, int *sum) { \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
vp9_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \
sse, sum); \
*sum = ROUND_POWER_OF_TWO(*sum, 4); \
*sse = ROUND_POWER_OF_TWO(*sse, 8); \
}
HIGH_GET_VAR(16);
HIGH_GET_VAR(8);
#undef HIGH_GET_VAR
#define VAR_FN(w, h, block_size, shift) \
unsigned int vp9_highbd_variance##w##x##h##_sse2( \
const uint8_t *src8, int src_stride, \
const uint8_t *ref8, int ref_stride, \
unsigned int *sse) { \
int sum; \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
highbd_variance_sse2(src, src_stride, ref, ref_stride, w, h, sse, &sum, \
vp9_highbd_calc##block_size##x##block_size##var_sse2, \
block_size); \
return *sse - (((int64_t)sum * sum) >> shift); \
} \
unsigned int vp9_highbd_10_variance##w##x##h##_sse2( \
const uint8_t *src8, int src_stride, \
const uint8_t *ref8, int ref_stride, \
unsigned int *sse) { \
int sum; \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
highbd_10_variance_sse2(src, src_stride, ref, ref_stride, w, h, sse, &sum, \
vp9_highbd_calc##block_size##x##block_size##var_sse2, \
block_size); \
return *sse - (((int64_t)sum * sum) >> shift); \
} \
unsigned int vp9_highbd_12_variance##w##x##h##_sse2( \
const uint8_t *src8, int src_stride, \
const uint8_t *ref8, int ref_stride, \
unsigned int *sse) { \
int sum; \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
highbd_12_variance_sse2(src, src_stride, ref, ref_stride, w, h, sse, &sum, \
vp9_highbd_calc##block_size##x##block_size##var_sse2, \
block_size); \
return *sse - (((int64_t)sum * sum) >> shift); \
}
VAR_FN(64, 64, 16, 12);
VAR_FN(64, 32, 16, 11);
VAR_FN(32, 64, 16, 11);
VAR_FN(32, 32, 16, 10);
VAR_FN(32, 16, 16, 9);
VAR_FN(16, 32, 16, 9);
VAR_FN(16, 16, 16, 8);
VAR_FN(16, 8, 8, 7);
VAR_FN(8, 16, 8, 7);
VAR_FN(8, 8, 8, 6);
#undef VAR_FN
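/* Illustrative expansion: VAR_FN(64, 64, 16, 12) defines
 * vp9_highbd_variance64x64_sse2(), which tiles the 64x64 block with the
 * 16x16 kernel and returns *sse - ((int64_t)sum * sum) >> 12, the usual
 * variance = SSE - mean^2 identity with 64 * 64 = 2^12 samples. */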
unsigned int vp9_highbd_mse16x16_sse2(const uint8_t *src8, int src_stride,
const uint8_t *ref8, int ref_stride,
unsigned int *sse) {
int sum;
uint16_t *src = CONVERT_TO_SHORTPTR(src8);
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
highbd_variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
sse, &sum, vp9_highbd_calc16x16var_sse2, 16);
return *sse;
}
unsigned int vp9_highbd_10_mse16x16_sse2(const uint8_t *src8, int src_stride,
const uint8_t *ref8, int ref_stride,
unsigned int *sse) {
int sum;
uint16_t *src = CONVERT_TO_SHORTPTR(src8);
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
sse, &sum, vp9_highbd_calc16x16var_sse2, 16);
return *sse;
}
unsigned int vp9_highbd_12_mse16x16_sse2(const uint8_t *src8, int src_stride,
const uint8_t *ref8, int ref_stride,
unsigned int *sse) {
int sum;
uint16_t *src = CONVERT_TO_SHORTPTR(src8);
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
sse, &sum, vp9_highbd_calc16x16var_sse2, 16);
return *sse;
}
unsigned int vp9_highbd_mse8x8_sse2(const uint8_t *src8, int src_stride,
const uint8_t *ref8, int ref_stride,
unsigned int *sse) {
int sum;
uint16_t *src = CONVERT_TO_SHORTPTR(src8);
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
highbd_variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
sse, &sum, vp9_highbd_calc8x8var_sse2, 8);
return *sse;
}
unsigned int vp9_highbd_10_mse8x8_sse2(const uint8_t *src8, int src_stride,
const uint8_t *ref8, int ref_stride,
unsigned int *sse) {
int sum;
uint16_t *src = CONVERT_TO_SHORTPTR(src8);
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
sse, &sum, vp9_highbd_calc8x8var_sse2, 8);
return *sse;
}
unsigned int vp9_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride,
const uint8_t *ref8, int ref_stride,
unsigned int *sse) {
int sum;
uint16_t *src = CONVERT_TO_SHORTPTR(src8);
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
sse, &sum, vp9_highbd_calc8x8var_sse2, 8);
return *sse;
}
#define DECL(w, opt) \
int vp9_highbd_sub_pixel_variance##w##xh_##opt(const uint16_t *src, \
ptrdiff_t src_stride, \
int x_offset, int y_offset, \
const uint16_t *dst, \
ptrdiff_t dst_stride, \
int height, unsigned int *sse);
#define DECLS(opt1, opt2) \
DECL(8, opt1); \
DECL(16, opt1)
DECLS(sse2, sse);
// DECLS(ssse3, ssse3);
#undef DECLS
#undef DECL
#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
unsigned int \
vp9_highbd_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src8, \
int src_stride, \
int x_offset, \
int y_offset, \
const uint8_t *dst8, \
int dst_stride, \
unsigned int *sse_ptr) { \
unsigned int sse; \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
int se = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src, src_stride, \
x_offset, y_offset, \
dst, dst_stride, h, \
&sse); \
if (w > wf) { \
unsigned int sse2; \
int se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 16, \
src_stride, \
x_offset, y_offset, \
dst + 16, \
dst_stride, \
h, &sse2); \
se += se2; \
sse += sse2; \
if (w > wf * 2) { \
se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
x_offset, y_offset, \
dst + 32, dst_stride, \
h, &sse2); \
se += se2; \
sse += sse2; \
se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 48, src_stride, \
x_offset, y_offset, \
dst + 48, dst_stride, \
h, &sse2); \
se += se2; \
sse += sse2; \
} \
} \
*sse_ptr = sse; \
return sse - ((cast se * se) >> (wlog2 + hlog2)); \
} \
unsigned int vp9_highbd_10_sub_pixel_variance##w##x##h##_##opt( \
const uint8_t *src8, \
int src_stride, \
int x_offset, \
int y_offset, \
const uint8_t *dst8, \
int dst_stride, \
unsigned int *sse_ptr) { \
unsigned int sse; \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
int se = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src, src_stride, \
x_offset, y_offset, \
dst, dst_stride, \
h, &sse); \
if (w > wf) { \
unsigned int sse2; \
int se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 16, \
src_stride, \
x_offset, y_offset, \
dst + 16, \
dst_stride, \
h, &sse2); \
se += se2; \
sse += sse2; \
if (w > wf * 2) { \
se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
x_offset, y_offset, \
dst + 32, dst_stride, \
h, &sse2); \
se += se2; \
sse += sse2; \
se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 48, src_stride, \
x_offset, y_offset, \
dst + 48, dst_stride, \
h, &sse2); \
se += se2; \
sse += sse2; \
} \
} \
se = ROUND_POWER_OF_TWO(se, 2); \
sse = ROUND_POWER_OF_TWO(sse, 4); \
*sse_ptr = sse; \
return sse - ((cast se * se) >> (wlog2 + hlog2)); \
} \
unsigned int vp9_highbd_12_sub_pixel_variance##w##x##h##_##opt( \
const uint8_t *src8, \
int src_stride, \
int x_offset, \
int y_offset, \
const uint8_t *dst8, \
int dst_stride, \
unsigned int *sse_ptr) { \
int start_row; \
unsigned int sse; \
int se = 0; \
uint64_t long_sse = 0; \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
for (start_row = 0; start_row < h; start_row +=16) { \
unsigned int sse2; \
int height = h - start_row < 16 ? h - start_row : 16; \
int se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt( \
src + (start_row * src_stride), src_stride, \
x_offset, y_offset, dst + (start_row * dst_stride), \
dst_stride, height, &sse2); \
se += se2; \
long_sse += sse2; \
if (w > wf) { \
se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt( \
src + 16 + (start_row * src_stride), src_stride, \
x_offset, y_offset, dst + 16 + (start_row * dst_stride), \
dst_stride, height, &sse2); \
se += se2; \
long_sse += sse2; \
if (w > wf * 2) { \
se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt( \
src + 32 + (start_row * src_stride), src_stride, \
x_offset, y_offset, dst + 32 + (start_row * dst_stride), \
dst_stride, height, &sse2); \
se += se2; \
long_sse += sse2; \
se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt( \
src + 48 + (start_row * src_stride), src_stride, \
x_offset, y_offset, dst + 48 + (start_row * dst_stride), \
dst_stride, height, &sse2); \
se += se2; \
long_sse += sse2; \
}\
} \
} \
se = ROUND_POWER_OF_TWO(se, 4); \
sse = ROUND_POWER_OF_TWO(long_sse, 8); \
*sse_ptr = sse; \
return sse - ((cast se * se) >> (wlog2 + hlog2)); \
}
#define FNS(opt1, opt2) \
FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
FN(16, 16, 16, 4, 4, opt1, (int64_t)); \
FN(16, 8, 16, 4, 3, opt1, (int64_t)); \
FN(8, 16, 8, 3, 4, opt1, (int64_t)); \
FN(8, 8, 8, 3, 3, opt1, (int64_t)); \
FN(8, 4, 8, 3, 2, opt1, (int64_t));
FNS(sse2, sse);
#undef FNS
#undef FN
#define DECL(w, opt) \
int vp9_highbd_sub_pixel_avg_variance##w##xh_##opt(const uint16_t *src, \
ptrdiff_t src_stride, \
int x_offset, int y_offset, \
const uint16_t *dst, \
ptrdiff_t dst_stride, \
const uint16_t *sec, \
ptrdiff_t sec_stride, \
int height, \
unsigned int *sse);
#define DECLS(opt1) \
DECL(16, opt1) \
DECL(8, opt1)
DECLS(sse2);
#undef DECL
#undef DECLS
#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
unsigned int vp9_highbd_sub_pixel_avg_variance##w##x##h##_##opt( \
const uint8_t *src8, \
int src_stride, \
int x_offset, \
int y_offset, \
const uint8_t *dst8, \
int dst_stride, \
unsigned int *sse_ptr, \
const uint8_t *sec8) { \
unsigned int sse; \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
int se = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src, src_stride, x_offset, \
y_offset, dst, dst_stride, \
sec, w, h, &sse); \
if (w > wf) { \
unsigned int sse2; \
int se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src + 16, src_stride, \
x_offset, y_offset, \
dst + 16, dst_stride, \
sec + 16, w, h, &sse2); \
se += se2; \
sse += sse2; \
if (w > wf * 2) { \
se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src + 32, src_stride, \
x_offset, y_offset, \
dst + 32, dst_stride, \
sec + 32, w, h, &sse2); \
se += se2; \
sse += sse2; \
se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src + 48, src_stride, \
x_offset, y_offset, \
dst + 48, dst_stride, \
sec + 48, w, h, &sse2); \
se += se2; \
sse += sse2; \
} \
} \
*sse_ptr = sse; \
return sse - ((cast se * se) >> (wlog2 + hlog2)); \
} \
unsigned int vp9_highbd_10_sub_pixel_avg_variance##w##x##h##_##opt( \
const uint8_t *src8, \
int src_stride, \
int x_offset, \
int y_offset, \
const uint8_t *dst8, \
int dst_stride, \
unsigned int *sse_ptr, \
const uint8_t *sec8) { \
unsigned int sse; \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
int se = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src, src_stride, x_offset, \
y_offset, dst, dst_stride, \
sec, w, h, &sse); \
if (w > wf) { \
unsigned int sse2; \
int se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src + 16, src_stride, \
x_offset, y_offset, \
dst + 16, dst_stride, \
sec + 16, w, h, &sse2); \
se += se2; \
sse += sse2; \
if (w > wf * 2) { \
se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src + 32, src_stride, \
x_offset, y_offset, \
dst + 32, dst_stride, \
sec + 32, w, h, &sse2); \
se += se2; \
sse += sse2; \
se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src + 48, src_stride, \
x_offset, y_offset, \
dst + 48, dst_stride, \
sec + 48, w, h, &sse2); \
se += se2; \
sse += sse2; \
} \
} \
se = ROUND_POWER_OF_TWO(se, 2); \
sse = ROUND_POWER_OF_TWO(sse, 4); \
*sse_ptr = sse; \
return sse - ((cast se * se) >> (wlog2 + hlog2)); \
} \
unsigned int vp9_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt( \
const uint8_t *src8, \
int src_stride, \
int x_offset, \
int y_offset, \
const uint8_t *dst8, \
int dst_stride, \
unsigned int *sse_ptr, \
const uint8_t *sec8) { \
int start_row; \
unsigned int sse; \
int se = 0; \
uint64_t long_sse = 0; \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
for (start_row = 0; start_row < h; start_row +=16) { \
unsigned int sse2; \
int height = h - start_row < 16 ? h - start_row : 16; \
int se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src + (start_row * src_stride), src_stride, x_offset, \
y_offset, dst + (start_row * dst_stride), dst_stride, \
sec + (start_row * w), w, height, &sse2); \
se += se2; \
long_sse += sse2; \
if (w > wf) { \
se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src + 16 + (start_row * src_stride), src_stride, \
x_offset, y_offset, \
dst + 16 + (start_row * dst_stride), dst_stride, \
sec + 16 + (start_row * w), w, height, &sse2); \
se += se2; \
long_sse += sse2; \
if (w > wf * 2) { \
se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src + 32 + (start_row * src_stride), src_stride, \
x_offset, y_offset, \
dst + 32 + (start_row * dst_stride), dst_stride, \
sec + 32 + (start_row * w), w, height, &sse2); \
se += se2; \
long_sse += sse2; \
se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
src + 48 + (start_row * src_stride), src_stride, \
x_offset, y_offset, \
dst + 48 + (start_row * dst_stride), dst_stride, \
sec + 48 + (start_row * w), w, height, &sse2); \
se += se2; \
long_sse += sse2; \
} \
} \
} \
se = ROUND_POWER_OF_TWO(se, 4); \
sse = ROUND_POWER_OF_TWO(long_sse, 8); \
*sse_ptr = sse; \
return sse - ((cast se * se) >> (wlog2 + hlog2)); \
}
#define FNS(opt1) \
FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
FN(16, 16, 16, 4, 4, opt1, (int64_t)); \
FN(16, 8, 16, 4, 3, opt1, (int64_t)); \
FN(8, 16, 8, 4, 3, opt1, (int64_t)); \
FN(8, 8, 8, 3, 3, opt1, (int64_t)); \
FN(8, 4, 8, 3, 2, opt1, (int64_t));
FNS(sse2);
#undef FNS
#undef FN


@@ -24,6 +24,7 @@ VP9_CX_SRCS-yes += encoder/vp9_context_tree.h
VP9_CX_SRCS-yes += encoder/vp9_cost.h
VP9_CX_SRCS-yes += encoder/vp9_cost.c
VP9_CX_SRCS-yes += encoder/vp9_dct.c
VP9_CX_SRCS-yes += encoder/vp9_dct.h
VP9_CX_SRCS-$(CONFIG_VP9_TEMPORAL_DENOISING) += encoder/vp9_denoiser.c
VP9_CX_SRCS-$(CONFIG_VP9_TEMPORAL_DENOISING) += encoder/vp9_denoiser.h
VP9_CX_SRCS-yes += encoder/vp9_encodeframe.c
@@ -101,6 +102,12 @@ VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_sad4d_intrin_avx2.c
VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE3) += encoder/x86/vp9_sad_sse3.asm
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_sad4d_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_variance_impl_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_quantize_intrin_sse2.c
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_block_error_intrin_sse2.c
endif
ifeq ($(CONFIG_USE_X86INC),yes)
VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_dct_mmx.asm
@@ -109,6 +116,11 @@ VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subtract_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_sse2.c
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subpel_variance.asm
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_sad_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_variance_sse2.c
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_subpel_variance.asm
endif
endif
ifeq ($(ARCH_X86_64),yes)
@@ -120,7 +132,9 @@ VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/vp9_sad_sse4.asm
VP9_CX_SRCS-$(ARCH_X86_64) += encoder/x86/vp9_ssim_opt_x86_64.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.c
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.h
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct32x32_sse2.c
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_impl_sse2.c
ifeq ($(CONFIG_VP9_TEMPORAL_DENOISING),yes)
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_denoiser_sse2.c


@@ -18,58 +18,56 @@
#include <stdlib.h>
static void once(void (*func)(void))
{
static CRITICAL_SECTION *lock;
static LONG waiters;
static int done;
void *lock_ptr = &lock;
/* If the initialization is complete, return early. This isn't just an
* optimization, it prevents races on the destruction of the global
* lock.
*/
if (done)
return;
InterlockedIncrement(&waiters);
/* Get a lock. We create one and try to make it the one-true-lock,
* throwing it away if we lost the race.
*/
{
/* Scope to protect access to new_lock */
CRITICAL_SECTION *new_lock = malloc(sizeof(CRITICAL_SECTION));
InitializeCriticalSection(new_lock);
if (InterlockedCompareExchangePointer(lock_ptr, new_lock, NULL) != NULL)
{
DeleteCriticalSection(new_lock);
free(new_lock);
}
}
/* At this point, we have a lock that can be synchronized on. We don't
* care which thread actually performed the allocation.
*/
EnterCriticalSection(lock);
if (!done) {
func();
done = 1;
}
LeaveCriticalSection(lock);
/* Last one out should free resources. The destructed objects are
* protected by checking if(done) above.
*/
if (!InterlockedDecrement(&waiters)) {
DeleteCriticalSection(lock);
free(lock);
lock = NULL;
}
}
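/* Note: this is the classic double-checked locking pattern; the early
 * 'done' test keeps the fast path lock-free, and the waiter count lets the
 * last thread out reclaim the lazily allocated critical section. */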
@@ -78,25 +76,24 @@ static void once(void (*func)(void))
#include <os2.h>
static void once(void (*func)(void))
{
static int done;
/* If the initialization is complete, return early. */
if (done)
return;
/* Causes all other threads in the process to block themselves
* and give up their time slice.
*/
DosEnterCritSec();
if (!done) {
func();
done = 1;
}
/* Restores normal thread dispatching for the current process. */
DosExitCritSec();
}
@@ -104,8 +101,8 @@ static void once(void (*func)(void))
#include <pthread.h>
static void once(void (*func)(void))
{
static pthread_once_t lock = PTHREAD_ONCE_INIT;
pthread_once(&lock, func);
}
@@ -117,13 +114,12 @@ static void once(void (*func)(void))
static void once(void (*func)(void))
{
static int done;
if (!done) {
func();
done = 1;
}
}
#endif