Merge changes from libvpx/master by cherry-pick
This commit brings all up-to-date changes from master that are applicable to nextgenv2. Because the VP10 code has been removed from master, we had to cherry-pick the following commits to get those changes:

Add default flags for arm64/armv8 builds
Allows building simple targets with sane default flags. For example, using the Android arm64 toolchain from the NDK:
https://developer.android.com/ndk/guides/standalone_toolchain.html
./build/tools/make-standalone-toolchain.sh --arch=arm64 \
  --platform=android-24 --install-dir=/tmp/arm64
CROSS=/tmp/arm64/bin/aarch64-linux-android- \
  ~/libvpx/configure --target=arm64-linux-gcc --disable-multithread
BUG=webm:1143

vpx_lpf_horizontal_4_sse2: Remove dead load.
Change-Id: I51026c52baa1f0881fcd5b68e1fdf08a2dc0916e

Fail early when android target does not include --sdk-path
Change-Id: I07e7e63476a2e32e3aae123abdee8b7bbbdc6a8c

configure: clean up var style and set_all usage
Use quotes whenever possible and {} always for variables. Replace multiple set_all calls with *able_feature(). (A sketch of how the rewritten helpers are driven appears after the configure.sh hunks below.)
Conflicts:
    build/make/configure.sh

vp9-svc: Remove some unneeded code/comment.

datarate_test,DatarateTestLarge: normalize bits type
Quiets an MSVC warning: conversion from 'const int64_t' to 'size_t', possible loss of data.

mips: added p6600 cpu support
Removed -funroll-loops.

psnr.c: use int64_t for sum of differences
Since the values can be negative.

*.asm: normalize label format
Add a trailing ':'. Though it's optional with the tools we support, it's more common to use it to mark a label. This also quiets the orphan-labels warning with nasm/yasm.
BUG=b/29583530

Prevent negative variance
Due to rounding, high bit depth variance may become negative. This commit adds a check and clamps negative values to 0. (A hedged sketch of this kind of clamp follows this message.)

configure: remove old visual studio support (<2010)
BUG=b/29583530
Conflicts:
    configure

configure: restore vs_version variable
Inadvertently lost in the final patchset of 078dff7 (configure: remove old visual studio support (<2010)). This prevents an empty CONFIG_VS_VERSION and avoids a make failure.

Require x86inc.asm
Force enable x86inc.asm when building for x86. Previously there were compatibility issues, so a flag was added to simplify disabling this code. The known issues have been resolved and x86inc.asm is the preferred abstraction layer (over x86_abi_support.asm).
BUG=b:29583530

convolve_test: fix byte offsets in hbd build
CONVERT_TO_BYTEPTR(x) was corrected in 003a9d2 (Port metric computation changes from nextgenv2) to use the more common (x) within the expansion. Offsets should occur after converting the pointer to the desired type.
+ factorized some common expressions
Conflicts:
    test/convolve_test.cc

vpx_dsp: remove x86inc.asm distinction
BUG=b:29583530
Conflicts:
    vpx_dsp/vpx_dsp.mk
    vpx_dsp/vpx_dsp_rtcd_defs.pl
    vpx_dsp/x86/highbd_variance_sse2.c
    vpx_dsp/x86/variance_sse2.c

test: remove x86inc.asm distinction
BUG=b:29583530
Conflicts:
    test/vp9_subtract_test.cc

configure: remove x86inc.asm distinction
BUG=b:29583530
Change-Id: I59a1192142e89a6a36b906f65a491a734e603617

Update vpx subpixel 1d filter ssse3 asm
Speed tests show the new vertical filters regress on Celeron Chromebooks. Added X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON to control which vertical filter code is activated; for now, simply activate only the code that shows no regression on Celeron. Later there should be two sets of vertical ssse3 filter functions, with a jump table choosing between them based on CPU type.

improve vpx_filter_block1d*: replace paddsw+psrlw with pmulhrsw
(A short worked note on why this substitution preserves rounding follows this message.)

Make set_reference control API work in VP9
Moved the API patch from NextGenv2. An example was included. To try it, run, for example:
$ examples/vpx_cx_set_ref vp9 352 288 in.yuv out.ivf 4 30
Conflicts:
    examples.mk
    examples/vpx_cx_set_ref.c
    test/cx_set_ref.sh
    vp9/decoder/vp9_decoder.c

deblock filter: moved from the vp8 code branch
The deblocking filters used in vp8 have been moved to vpx_dsp for use by both vp8 and vp9.

vpx_thread.[hc]: update webp source reference
+ drop the blob hash; the updated reference is noted in the commit message
BUG=b/29583578

vpx_thread: use native windows cond var if available
BUG=b/29583578
Original webp change:
commit 110ad5835ecd66995d0e7f66dca1b90dea595f5a
Author: James Zern <jzern@google.com>
Date:   Mon Nov 23 19:49:58 2015 -0800
    thread: use native windows cond var if available
    Vista / Server 2008 and up. no speed difference observed.
    100644 blob 4fc372b7bc6980a9ed3618c8cce5b67ed7b0f412 src/utils/thread.c
    100644 blob 840831185502d42a3246e4b7ff870121c8064791 src/utils/thread.h

vpx_thread: use InitializeCriticalSectionEx if available
BUG=b/29583578
Original webp change:
commit 63fadc9ffacc77d4617526a50c696d21d558a70b
Author: James Zern <jzern@google.com>
Date:   Mon Nov 23 20:38:46 2015 -0800
    thread: use InitializeCriticalSectionEx if available
    Windows Vista / Server 2008 and up
    100644 blob f84207d89b3a6bb98bfe8f3fa55cad72dfd061ff src/utils/thread.c
    100644 blob 840831185502d42a3246e4b7ff870121c8064791 src/utils/thread.h

vpx_thread: use WaitForSingleObjectEx if available
BUG=b/29583578
Original webp change:
commit 0fd0e12bfe83f16ce4f1c038b251ccbc13c62ac2
Author: James Zern <jzern@google.com>
Date:   Mon Nov 23 20:40:26 2015 -0800
    thread: use WaitForSingleObjectEx if available
    Windows XP and up
    100644 blob d58f74e5523dbc985fc531cf5f0833f1e9157cf0 src/utils/thread.c
    100644 blob 840831185502d42a3246e4b7ff870121c8064791 src/utils/thread.h

vpx_thread: use CreateThread for windows phone
BUG=b/29583578
Original webp change:
commit d2afe974f9d751de144ef09d31255aea13b442c0
Author: James Zern <jzern@google.com>
Date:   Mon Nov 23 20:41:26 2015 -0800
    thread: use CreateThread for windows phone
    _beginthreadex is unavailable for winrt/uwp
    Change-Id: Ie7412a568278ac67f0047f1764e2521193d74d4d
    100644 blob 93f7622797f05f6acc1126e8296c481d276e4047 src/utils/thread.c
    100644 blob 840831185502d42a3246e4b7ff870121c8064791 src/utils/thread.h

vp9_postproc.c: missing extern.
BUG=webm:1256

deblock: missing const on extern const.

postproc: move filling of noise buffer to vpx_dsp.

Fix encoder crashes for odd size input

clean-up vp9_intrapred_test
Remove tuple and overkill VP9IntraPredBase class.

postproc: noise style fixes.

gtest-all.cc: quiet an unused variable warning under windows / mingw builds

vp9_intrapred_test: follow-up cleanup
Address a few comments from ce050afaf3.
Change-Id: I3eece7efa9335f4210303993ef6c1857ad5c29c8
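A minimal sketch of the negative-variance clamp described above. The helper name and exact layout here are hypothetical, not the actual vpx_dsp code; the point is that variance is computed as sse - sum^2/N, and rounding in the high bit depth path can push the result just below zero:

  #include <stdint.h>

  /* Hypothetical sketch: clamp a variance that rounding made negative.
   * N = 1 << log2_count is the number of pixels in the block. */
  static int64_t clamped_variance(int64_t sse, int64_t sum, int log2_count) {
    const int64_t var = sse - ((sum * sum) >> log2_count);
    return var < 0 ? 0 : var;  /* clamp negative values to 0 */
  }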
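Why the paddsw+psrlw to pmulhrsw substitution above preserves rounding, assuming the 7-bit filter taps are pre-scaled by 2^8 (our reading of the change, not spelled out in the message): pmulhrsw computes the rounded high half of a signed 16-bit product,

  pmulhrsw(a, b) = (a * b + 2^14) >> 15

so with b = tap << 8,

  (a * (tap << 8) + 2^14) >> 15 == (a * tap + 64) >> 7

which is exactly the add-64-then-shift-right-by-7 that the paddsw+psrlw pair performed, folded into a single instruction.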
@@ -186,24 +186,6 @@ add_extralibs() {
 # Boolean Manipulation Functions
 #
 
-enable_codec(){
-  enabled $1 || echo "  enabling $1"
-  set_all yes $1
-
-  is_in $1 vp8 vp9 vp10 && \
-    set_all yes $1_encoder && \
-    set_all yes $1_decoder
-}
-
-disable_codec(){
-  disabled $1 || echo "  disabling $1"
-  set_all no $1
-
-  is_in $1 vp8 vp9 vp10 && \
-    set_all no $1_encoder && \
-    set_all no $1_decoder
-}
-
 enable_feature(){
   set_all yes $*
 }
@@ -220,6 +202,20 @@ disabled(){
   eval test "x\$$1" = "xno"
 }
 
+enable_codec(){
+  enabled "${1}" || echo "  enabling ${1}"
+  enable_feature "${1}"
+
+  is_in "${1}" vp8 vp9 vp10 && enable_feature "${1}_encoder" "${1}_decoder"
+}
+
+disable_codec(){
+  disabled "${1}" || echo "  disabling ${1}"
+  disable_feature "${1}"
+
+  is_in "${1}" vp8 vp9 vp10 && disable_feature "${1}_encoder" "${1}_decoder"
+}
+
 # Iterates through positional parameters, checks to confirm the parameter has
 # not been explicitly (force) disabled, and enables the setting controlled by
 # the parameter when the setting is not disabled.
@@ -945,6 +941,9 @@ EOF
       check_add_cflags -mfpu=neon  #-ftree-vectorize
      check_add_asflags -mfpu=neon
     fi
+  elif [ ${tgt_isa} = "arm64" ] || [ ${tgt_isa} = "armv8" ]; then
+    check_add_cflags -march=armv8-a
+    check_add_asflags -march=armv8-a
   else
     check_add_cflags -march=${tgt_isa}
     check_add_asflags -march=${tgt_isa}
@@ -1012,6 +1011,10 @@ EOF
       ;;
 
     android*)
+      if [ -z "${sdk_path}" ]; then
+        die "Must specify --sdk-path for Android builds."
+      fi
+
      SDK_PATH=${sdk_path}
      COMPILER_LOCATION=`find "${SDK_PATH}" \
                         -name "arm-linux-androideabi-gcc*" -print -quit`
@@ -1150,13 +1153,13 @@ EOF
    if [ -n "${tune_cpu}" ]; then
      case ${tune_cpu} in
        p5600)
-          check_add_cflags -mips32r5 -funroll-loops -mload-store-pairs
+          check_add_cflags -mips32r5 -mload-store-pairs
          check_add_cflags -msched-weight -mhard-float -mfp64
          check_add_asflags -mips32r5 -mhard-float -mfp64
          check_add_ldflags -mfp64
          ;;
-        i6400)
-          check_add_cflags -mips64r6 -mabi=64 -funroll-loops -msched-weight
+        i6400|p6600)
+          check_add_cflags -mips64r6 -mabi=64 -msched-weight
          check_add_cflags -mload-store-pairs -mhard-float -mfp64
          check_add_asflags -mips64r6 -mabi=64 -mhard-float -mfp64
          check_add_ldflags -mips64r6 -mabi=64 -mfp64
@@ -1393,10 +1396,6 @@ EOF
    fi
  fi
 
-  if [ "${tgt_isa}" = "x86_64" ] || [ "${tgt_isa}" = "x86" ]; then
-    soft_enable use_x86inc
-  fi
-
  # Position Independent Code (PIC) support, for building relocatable
  # shared objects
  enabled gcc && enabled pic && check_add_cflags -fPIC
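For context on the enable_codec()/disable_codec() rewrite above, a hedged sketch of how such helpers are typically driven from option parsing (simplified; this is not the actual configure loop):

  # Sketch: --disable-vp9 ends up as disable_codec vp9, which now also
  # clears vp9_encoder and vp9_decoder through one disable_feature call.
  for opt in "$@"; do
    case "${opt}" in
      --enable-vp8|--enable-vp9|--enable-vp10)
        enable_codec "${opt#--enable-}"
        ;;
      --disable-vp8|--disable-vp9|--disable-vp10)
        disable_codec "${opt#--disable-}"
        ;;
    esac
  done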
configure
@@ -98,11 +98,11 @@ EOF
 
 # all_platforms is a list of all supported target platforms. Maintain
 # alphabetically by architecture, generic-gnu last.
+all_platforms="${all_platforms} arm64-darwin-gcc"
+all_platforms="${all_platforms} arm64-linux-gcc"
 all_platforms="${all_platforms} armv6-linux-rvct"
 all_platforms="${all_platforms} armv6-linux-gcc"
 all_platforms="${all_platforms} armv6-none-rvct"
-all_platforms="${all_platforms} arm64-darwin-gcc"
-all_platforms="${all_platforms} arm64-linux-gcc"
 all_platforms="${all_platforms} armv7-android-gcc"   #neon Cortex-A8
 all_platforms="${all_platforms} armv7-darwin-gcc"    #neon Cortex-A8
 all_platforms="${all_platforms} armv7-linux-rvct"    #neon Cortex-A8
@@ -112,6 +112,7 @@ all_platforms="${all_platforms} armv7-win32-vs11"
 all_platforms="${all_platforms} armv7-win32-vs12"
 all_platforms="${all_platforms} armv7-win32-vs14"
 all_platforms="${all_platforms} armv7s-darwin-gcc"
+all_platforms="${all_platforms} armv8-linux-gcc"
 all_platforms="${all_platforms} mips32-linux-gcc"
 all_platforms="${all_platforms} mips64-linux-gcc"
 all_platforms="${all_platforms} sparc-solaris-gcc"
@@ -293,7 +294,6 @@ CONFIG_LIST="
    install_bins
    install_libs
    install_srcs
-    use_x86inc
    debug
    gprof
    gcov
@@ -355,7 +355,6 @@ CMDLINE_SELECT="
    gprof
    gcov
    pic
-    use_x86inc
    optimizations
    ccache
    runtime_cpu_detect
@@ -13,6 +13,7 @@
 #include "third_party/googletest/src/include/gtest/gtest.h"
 #include "./vpx_dsp_rtcd.h"
 #include "vpx/vpx_integer.h"
+#include "vpx_dsp/postproc.h"
 #include "vpx_mem/vpx_mem.h"
 
 namespace {
@@ -40,50 +41,6 @@ double stddev6(char a, char b, char c, char d, char e, char f) {
   return sqrt(v);
 }
 
-// TODO(jimbankoski): The following 2 functions are duplicated in each codec.
-// For now the vp9 one has been copied into the test as is. We should normalize
-// these in vpx_dsp and not have 3 copies of these unless there is different
-// noise we add for each codec.
-
-double gaussian(double sigma, double mu, double x) {
-  return 1 / (sigma * sqrt(2.0 * 3.14159265)) *
-         (exp(-(x - mu) * (x - mu) / (2 * sigma * sigma)));
-}
-
-int setup_noise(int size_noise, char *noise) {
-  char char_dist[300];
-  const int ai = 4;
-  const int qi = 24;
-  const double sigma = ai + .5 + .6 * (63 - qi) / 63.0;
-
-  /* set up a lookup table of 256 entries that matches
-   * a gaussian distribution with sigma determined by q.
-   */
-  int next = 0;
-
-  for (int i = -32; i < 32; i++) {
-    int a_i = (int) (0.5 + 256 * gaussian(sigma, 0, i));
-
-    if (a_i) {
-      for (int j = 0; j < a_i; j++) {
-        char_dist[next + j] = (char)(i);
-      }
-
-      next = next + a_i;
-    }
-  }
-
-  for (; next < 256; next++)
-    char_dist[next] = 0;
-
-  for (int i = 0; i < size_noise; i++) {
-    noise[i] = char_dist[rand() & 0xff];  // NOLINT
-  }
-
-  // Returns the most negative value in distribution.
-  return char_dist[0];
-}
-
 TEST_P(AddNoiseTest, CheckNoiseAdded) {
   DECLARE_ALIGNED(16, char, blackclamp[16]);
   DECLARE_ALIGNED(16, char, whiteclamp[16]);
@@ -92,12 +49,12 @@ TEST_P(AddNoiseTest, CheckNoiseAdded) {
   const int height = 64;
   const int image_size = width * height;
   char noise[3072];
-  const int clamp = setup_noise(3072, noise);
+  const int clamp = vpx_setup_noise(4.4, sizeof(noise), noise);
 
   for (int i = 0; i < 16; i++) {
-    blackclamp[i] = -clamp;
-    whiteclamp[i] = -clamp;
-    bothclamp[i] = -2 * clamp;
+    blackclamp[i] = clamp;
+    whiteclamp[i] = clamp;
+    bothclamp[i] = 2 * clamp;
   }
 
   uint8_t *const s = reinterpret_cast<uint8_t *>(vpx_calloc(image_size, 1));
@@ -127,7 +84,7 @@ TEST_P(AddNoiseTest, CheckNoiseAdded) {
 
   // Check to make sure don't roll over.
   for (int i = 0; i < image_size; ++i) {
-    EXPECT_GT((int)s[i], 10) << "i = " << i;
+    EXPECT_GT(static_cast<int>(s[i]), clamp) << "i = " << i;
   }
 
   // Initialize pixels in the image to 0 and check for roll under.
@@ -138,7 +95,7 @@ TEST_P(AddNoiseTest, CheckNoiseAdded) {
 
   // Check to make sure don't roll under.
   for (int i = 0; i < image_size; ++i) {
-    EXPECT_LT((int)s[i], 245) << "i = " << i;
+    EXPECT_LT(static_cast<int>(s[i]), 255 - clamp) << "i = " << i;
   }
 
   vpx_free(s);
@@ -153,11 +110,12 @@ TEST_P(AddNoiseTest, CheckCvsAssembly) {
   const int image_size = width * height;
   char noise[3072];
 
-  const int clamp = setup_noise(3072, noise);
+  const int clamp = vpx_setup_noise(4.4, sizeof(noise), noise);
+
   for (int i = 0; i < 16; i++) {
-    blackclamp[i] = -clamp;
-    whiteclamp[i] = -clamp;
-    bothclamp[i] = -2 * clamp;
+    blackclamp[i] = clamp;
+    whiteclamp[i] = clamp;
+    bothclamp[i] = 2 * clamp;
   }
 
   uint8_t *const s = reinterpret_cast<uint8_t *>(vpx_calloc(image_size, 1));
@@ -175,7 +133,7 @@ TEST_P(AddNoiseTest, CheckCvsAssembly) {
                                  width, height, width));
 
   for (int i = 0; i < image_size; ++i) {
-    EXPECT_EQ((int)s[i], (int)d[i]) << "i = " << i;
+    EXPECT_EQ(static_cast<int>(s[i]), static_cast<int>(d[i])) << "i = " << i;
  }
 
   vpx_free(d);
@@ -453,7 +453,7 @@ class ConvolveTest : public ::testing::TestWithParam<ConvolveParam> {
     memcpy(output_ref_, output_, kOutputBufferSize);
 #if CONFIG_VP9_HIGHBITDEPTH
     memcpy(output16_ref_, output16_,
-           kOutputBufferSize * sizeof(*output16_ref_));
+           kOutputBufferSize * sizeof(output16_ref_[0]));
 #endif
   }
@@ -465,41 +465,41 @@ class ConvolveTest : public ::testing::TestWithParam<ConvolveParam> {
   }
 
   uint8_t *input() const {
-    const int index = BorderTop() * kOuterBlockSize + BorderLeft();
+    const int offset = BorderTop() * kOuterBlockSize + BorderLeft();
 #if CONFIG_VP9_HIGHBITDEPTH
     if (UUT_->use_highbd_ == 0) {
-      return input_ + index;
+      return input_ + offset;
     } else {
-      return CONVERT_TO_BYTEPTR(input16_) + index;
+      return CONVERT_TO_BYTEPTR(input16_) + offset;
     }
 #else
-    return input_ + index;
+    return input_ + offset;
 #endif
   }
 
   uint8_t *output() const {
-    const int index = BorderTop() * kOuterBlockSize + BorderLeft();
+    const int offset = BorderTop() * kOuterBlockSize + BorderLeft();
 #if CONFIG_VP9_HIGHBITDEPTH
     if (UUT_->use_highbd_ == 0) {
-      return output_ + index;
+      return output_ + offset;
     } else {
-      return CONVERT_TO_BYTEPTR(output16_ + index);
+      return CONVERT_TO_BYTEPTR(output16_) + offset;
     }
 #else
-    return output_ + index;
+    return output_ + offset;
 #endif
   }
 
   uint8_t *output_ref() const {
-    const int index = BorderTop() * kOuterBlockSize + BorderLeft();
+    const int offset = BorderTop() * kOuterBlockSize + BorderLeft();
 #if CONFIG_VP9_HIGHBITDEPTH
     if (UUT_->use_highbd_ == 0) {
-      return output_ref_ + index;
+      return output_ref_ + offset;
     } else {
-      return CONVERT_TO_BYTEPTR(output16_ref_ + index);
+      return CONVERT_TO_BYTEPTR(output16_ref_) + offset;
     }
 #else
-    return output_ref_ + index;
+    return output_ref_ + offset;
 #endif
   }
@@ -1011,14 +1011,12 @@ void wrap_ ## func ## _ ## bd(const uint8_t *src, ptrdiff_t src_stride, \
                w, h, bd); \
 }
 #if HAVE_SSE2 && ARCH_X86_64
-#if CONFIG_USE_X86INC
 WRAP(convolve_copy_sse2, 8)
 WRAP(convolve_avg_sse2, 8)
 WRAP(convolve_copy_sse2, 10)
 WRAP(convolve_avg_sse2, 10)
 WRAP(convolve_copy_sse2, 12)
 WRAP(convolve_avg_sse2, 12)
-#endif  // CONFIG_USE_X86INC
 WRAP(convolve8_horiz_sse2, 8)
 WRAP(convolve8_avg_horiz_sse2, 8)
 WRAP(convolve8_vert_sse2, 8)
@@ -1112,11 +1110,7 @@ INSTANTIATE_TEST_CASE_P(C, ConvolveTest,
 #if HAVE_SSE2 && ARCH_X86_64
 #if CONFIG_VP9_HIGHBITDEPTH
 const ConvolveFunctions convolve8_sse2(
-#if CONFIG_USE_X86INC
     wrap_convolve_copy_sse2_8, wrap_convolve_avg_sse2_8,
-#else
-    wrap_convolve_copy_c_8, wrap_convolve_avg_c_8,
-#endif  // CONFIG_USE_X86INC
     wrap_convolve8_horiz_sse2_8, wrap_convolve8_avg_horiz_sse2_8,
     wrap_convolve8_vert_sse2_8, wrap_convolve8_avg_vert_sse2_8,
     wrap_convolve8_sse2_8, wrap_convolve8_avg_sse2_8,
@@ -1124,11 +1118,7 @@ const ConvolveFunctions convolve8_sse2(
     wrap_convolve8_vert_sse2_8, wrap_convolve8_avg_vert_sse2_8,
     wrap_convolve8_sse2_8, wrap_convolve8_avg_sse2_8, 8);
 const ConvolveFunctions convolve10_sse2(
-#if CONFIG_USE_X86INC
     wrap_convolve_copy_sse2_10, wrap_convolve_avg_sse2_10,
-#else
-    wrap_convolve_copy_c_10, wrap_convolve_avg_c_10,
-#endif  // CONFIG_USE_X86INC
     wrap_convolve8_horiz_sse2_10, wrap_convolve8_avg_horiz_sse2_10,
     wrap_convolve8_vert_sse2_10, wrap_convolve8_avg_vert_sse2_10,
     wrap_convolve8_sse2_10, wrap_convolve8_avg_sse2_10,
@@ -1136,11 +1126,7 @@ const ConvolveFunctions convolve10_sse2(
     wrap_convolve8_vert_sse2_10, wrap_convolve8_avg_vert_sse2_10,
     wrap_convolve8_sse2_10, wrap_convolve8_avg_sse2_10, 10);
 const ConvolveFunctions convolve12_sse2(
-#if CONFIG_USE_X86INC
     wrap_convolve_copy_sse2_12, wrap_convolve_avg_sse2_12,
-#else
-    wrap_convolve_copy_c_12, wrap_convolve_avg_c_12,
-#endif  // CONFIG_USE_X86INC
     wrap_convolve8_horiz_sse2_12, wrap_convolve8_avg_horiz_sse2_12,
     wrap_convolve8_vert_sse2_12, wrap_convolve8_avg_vert_sse2_12,
     wrap_convolve8_sse2_12, wrap_convolve8_avg_sse2_12,
@@ -1154,11 +1140,7 @@ const ConvolveParam kArrayConvolve_sse2[] = {
 };
 #else
 const ConvolveFunctions convolve8_sse2(
-#if CONFIG_USE_X86INC
     vpx_convolve_copy_sse2, vpx_convolve_avg_sse2,
-#else
-    vpx_convolve_copy_c, vpx_convolve_avg_c,
-#endif  // CONFIG_USE_X86INC
     vpx_convolve8_horiz_sse2, vpx_convolve8_avg_horiz_sse2,
     vpx_convolve8_vert_sse2, vpx_convolve8_avg_vert_sse2,
     vpx_convolve8_sse2, vpx_convolve8_avg_sse2,
@@ -1,6 +1,6 @@
 #!/bin/sh
 ##
-##  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+##  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
 ##
 ##  Use of this source code is governed by a BSD-style license
 ##  that can be found in the LICENSE file in the root of the source
@@ -135,7 +135,7 @@ class DatarateTestLarge : public ::libvpx_test::EncoderTest,
   double duration_;
   double file_datarate_;
   double effective_datarate_;
-  size_t bits_in_last_frame_;
+  int64_t bits_in_last_frame_;
   int denoiser_on_;
   int denoiser_offon_test_;
   int denoiser_offon_period_;
@@ -302,7 +302,7 @@ INSTANTIATE_TEST_CASE_P(
         make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 3, VPX_BITS_8, 16)));
 #endif  // HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 
-#if CONFIG_USE_X86INC && HAVE_SSE2 && !CONFIG_EMULATE_HARDWARE
+#if HAVE_SSE2 && !CONFIG_EMULATE_HARDWARE
 INSTANTIATE_TEST_CASE_P(
     SSE2, Trans4x4WHT,
     ::testing::Values(
@@ -766,7 +766,7 @@ INSTANTIATE_TEST_CASE_P(
                       &idct8x8_64_add_12_sse2, 6225, VPX_BITS_12)));
 #endif  // HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 
-#if HAVE_SSSE3 && CONFIG_USE_X86INC && ARCH_X86_64 && \
+#if HAVE_SSSE3 && ARCH_X86_64 && \
     !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 INSTANTIATE_TEST_CASE_P(
     SSSE3, FwdTrans8x8DCT,
@@ -152,10 +152,10 @@ INSTANTIATE_TEST_CASE_P(SSE2, Hadamard8x8Test,
                         ::testing::Values(&vpx_hadamard_8x8_sse2));
 #endif  // HAVE_SSE2
 
-#if HAVE_SSSE3 && CONFIG_USE_X86INC && ARCH_X86_64
+#if HAVE_SSSE3 && ARCH_X86_64
 INSTANTIATE_TEST_CASE_P(SSSE3, Hadamard8x8Test,
                         ::testing::Values(&vpx_hadamard_8x8_ssse3));
-#endif  // HAVE_SSSE3 && CONFIG_USE_X86INC && ARCH_X86_64
+#endif  // HAVE_SSSE3 && ARCH_X86_64
 
 #if HAVE_NEON
 INSTANTIATE_TEST_CASE_P(NEON, Hadamard8x8Test,
@@ -295,7 +295,7 @@ INSTANTIATE_TEST_CASE_P(
                          TX_4X4, 1)));
 #endif
 
-#if HAVE_SSSE3 && CONFIG_USE_X86INC && ARCH_X86_64 && \
+#if HAVE_SSSE3 && ARCH_X86_64 && \
     !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 INSTANTIATE_TEST_CASE_P(
     SSSE3_64, PartialIDctTest,
@@ -11,7 +11,7 @@
 #include "test/register_state_check.h"
 #include "third_party/googletest/src/include/gtest/gtest.h"
 #include "./vpx_config.h"
-#include "./vp8_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
 #include "vpx/vpx_integer.h"
 #include "vpx_mem/vpx_mem.h"
 
@@ -25,7 +25,7 @@ typedef void (*PostProcFunc)(unsigned char *src_ptr,
 
 namespace {
 
-class VP8PostProcessingFilterTest
+class VPxPostProcessingFilterTest
     : public ::testing::TestWithParam<PostProcFunc> {
  public:
   virtual void TearDown() {
@@ -33,10 +33,10 @@ class VP8PostProcessingFilterTest
   }
 };
 
-// Test routine for the VP8 post-processing function
-// vp8_post_proc_down_and_across_mb_row_c.
+// Test routine for the VPx post-processing function
+// vpx_post_proc_down_and_across_mb_row_c.
 
-TEST_P(VP8PostProcessingFilterTest, FilterOutputCheck) {
+TEST_P(VPxPostProcessingFilterTest, FilterOutputCheck) {
   // Size of the underlying data block that will be filtered.
   const int block_width  = 16;
   const int block_height = 16;
@@ -92,7 +92,7 @@ TEST_P(VP8PostProcessingFilterTest, FilterOutputCheck) {
   for (int i = 0; i < block_height; ++i) {
     for (int j = 0; j < block_width; ++j) {
       EXPECT_EQ(expected_data[i], pixel_ptr[j])
-          << "VP8PostProcessingFilterTest failed with invalid filter output";
+          << "VPxPostProcessingFilterTest failed with invalid filter output";
     }
     pixel_ptr += output_stride;
   }
@@ -102,17 +102,17 @@ TEST_P(VP8PostProcessingFilterTest, FilterOutputCheck) {
   vpx_free(flimits);
 };
 
-INSTANTIATE_TEST_CASE_P(C, VP8PostProcessingFilterTest,
-    ::testing::Values(vp8_post_proc_down_and_across_mb_row_c));
+INSTANTIATE_TEST_CASE_P(C, VPxPostProcessingFilterTest,
+    ::testing::Values(vpx_post_proc_down_and_across_mb_row_c));
 
 #if HAVE_SSE2
-INSTANTIATE_TEST_CASE_P(SSE2, VP8PostProcessingFilterTest,
-    ::testing::Values(vp8_post_proc_down_and_across_mb_row_sse2));
+INSTANTIATE_TEST_CASE_P(SSE2, VPxPostProcessingFilterTest,
+    ::testing::Values(vpx_post_proc_down_and_across_mb_row_sse2));
 #endif
 
 #if HAVE_MSA
-INSTANTIATE_TEST_CASE_P(MSA, VP8PostProcessingFilterTest,
-    ::testing::Values(vp8_post_proc_down_and_across_mb_row_msa));
+INSTANTIATE_TEST_CASE_P(MSA, VPxPostProcessingFilterTest,
+    ::testing::Values(vpx_post_proc_down_and_across_mb_row_msa));
 #endif
 
 }  // namespace
@@ -750,7 +750,6 @@ INSTANTIATE_TEST_CASE_P(NEON, SADx4Test, ::testing::ValuesIn(x4d_neon_tests));
 //------------------------------------------------------------------------------
 // x86 functions
 #if HAVE_SSE2
-#if CONFIG_USE_X86INC
 const SadMxNParam sse2_tests[] = {
 #if CONFIG_VP10 && CONFIG_EXT_PARTITION
   make_tuple(128, 128, &vpx_sad128x128_sse2, -1),
@@ -927,7 +926,6 @@ const SadMxNx4Param x4d_sse2_tests[] = {
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 };
 INSTANTIATE_TEST_CASE_P(SSE2, SADx4Test, ::testing::ValuesIn(x4d_sse2_tests));
-#endif  // CONFIG_USE_X86INC
 #endif  // HAVE_SSE2
 
 #if HAVE_SSE3
@@ -187,21 +187,21 @@ INTRA_PRED_TEST(C, TestIntraPred4, vpx_dc_predictor_4x4_c,
                 vpx_d153_predictor_4x4_c, vpx_d207_predictor_4x4_c,
                 vpx_d63_predictor_4x4_c, vpx_tm_predictor_4x4_c)
 
-#if HAVE_SSE2 && CONFIG_USE_X86INC
+#if HAVE_SSE2
 INTRA_PRED_TEST(SSE2, TestIntraPred4, vpx_dc_predictor_4x4_sse2,
                 vpx_dc_left_predictor_4x4_sse2, vpx_dc_top_predictor_4x4_sse2,
                 vpx_dc_128_predictor_4x4_sse2, vpx_v_predictor_4x4_sse2,
                 vpx_h_predictor_4x4_sse2, vpx_d45_predictor_4x4_sse2, NULL,
                 NULL, NULL, vpx_d207_predictor_4x4_sse2, NULL,
                 vpx_tm_predictor_4x4_sse2)
-#endif  // HAVE_SSE2 && CONFIG_USE_X86INC
+#endif  // HAVE_SSE2
 
-#if HAVE_SSSE3 && CONFIG_USE_X86INC
+#if HAVE_SSSE3
 INTRA_PRED_TEST(SSSE3, TestIntraPred4, NULL, NULL, NULL, NULL, NULL,
                 NULL, NULL, NULL, NULL,
                 vpx_d153_predictor_4x4_ssse3, NULL,
                 vpx_d63_predictor_4x4_ssse3, NULL)
-#endif  // HAVE_SSSE3 && CONFIG_USE_X86INC
+#endif  // HAVE_SSSE3
 
 #if HAVE_DSPR2
 INTRA_PRED_TEST(DSPR2, TestIntraPred4, vpx_dc_predictor_4x4_dspr2, NULL, NULL,
@@ -237,20 +237,20 @@ INTRA_PRED_TEST(C, TestIntraPred8, vpx_dc_predictor_8x8_c,
                 vpx_d153_predictor_8x8_c, vpx_d207_predictor_8x8_c,
                 vpx_d63_predictor_8x8_c, vpx_tm_predictor_8x8_c)
 
-#if HAVE_SSE2 && CONFIG_USE_X86INC
+#if HAVE_SSE2
 INTRA_PRED_TEST(SSE2, TestIntraPred8, vpx_dc_predictor_8x8_sse2,
                 vpx_dc_left_predictor_8x8_sse2, vpx_dc_top_predictor_8x8_sse2,
                 vpx_dc_128_predictor_8x8_sse2, vpx_v_predictor_8x8_sse2,
                 vpx_h_predictor_8x8_sse2, vpx_d45_predictor_8x8_sse2, NULL,
                 NULL, NULL, NULL, NULL, vpx_tm_predictor_8x8_sse2)
-#endif  // HAVE_SSE2 && CONFIG_USE_X86INC
+#endif  // HAVE_SSE2
 
-#if HAVE_SSSE3 && CONFIG_USE_X86INC
+#if HAVE_SSSE3
 INTRA_PRED_TEST(SSSE3, TestIntraPred8, NULL, NULL, NULL, NULL, NULL,
                 NULL, NULL, NULL, NULL,
                 vpx_d153_predictor_8x8_ssse3, vpx_d207_predictor_8x8_ssse3,
                 vpx_d63_predictor_8x8_ssse3, NULL)
-#endif  // HAVE_SSSE3 && CONFIG_USE_X86INC
+#endif  // HAVE_SSSE3
 
 #if HAVE_DSPR2
 INTRA_PRED_TEST(DSPR2, TestIntraPred8, vpx_dc_predictor_8x8_dspr2, NULL, NULL,
@@ -286,22 +286,22 @@ INTRA_PRED_TEST(C, TestIntraPred16, vpx_dc_predictor_16x16_c,
                 vpx_d153_predictor_16x16_c, vpx_d207_predictor_16x16_c,
                 vpx_d63_predictor_16x16_c, vpx_tm_predictor_16x16_c)
 
-#if HAVE_SSE2 && CONFIG_USE_X86INC
+#if HAVE_SSE2
 INTRA_PRED_TEST(SSE2, TestIntraPred16, vpx_dc_predictor_16x16_sse2,
                 vpx_dc_left_predictor_16x16_sse2,
                 vpx_dc_top_predictor_16x16_sse2,
                 vpx_dc_128_predictor_16x16_sse2, vpx_v_predictor_16x16_sse2,
                 vpx_h_predictor_16x16_sse2, NULL, NULL, NULL, NULL, NULL, NULL,
                 vpx_tm_predictor_16x16_sse2)
-#endif  // HAVE_SSE2 && CONFIG_USE_X86INC
+#endif  // HAVE_SSE2
 
-#if HAVE_SSSE3 && CONFIG_USE_X86INC
+#if HAVE_SSSE3
 INTRA_PRED_TEST(SSSE3, TestIntraPred16, NULL, NULL, NULL, NULL, NULL,
                 NULL, vpx_d45_predictor_16x16_ssse3,
                 NULL, NULL, vpx_d153_predictor_16x16_ssse3,
                 vpx_d207_predictor_16x16_ssse3, vpx_d63_predictor_16x16_ssse3,
                 NULL)
-#endif  // HAVE_SSSE3 && CONFIG_USE_X86INC
+#endif  // HAVE_SSSE3
 
 #if HAVE_DSPR2
 INTRA_PRED_TEST(DSPR2, TestIntraPred16, vpx_dc_predictor_16x16_dspr2, NULL,
@@ -337,21 +337,21 @@ INTRA_PRED_TEST(C, TestIntraPred32, vpx_dc_predictor_32x32_c,
                 vpx_d153_predictor_32x32_c, vpx_d207_predictor_32x32_c,
                 vpx_d63_predictor_32x32_c, vpx_tm_predictor_32x32_c)
 
-#if HAVE_SSE2 && CONFIG_USE_X86INC
+#if HAVE_SSE2
 INTRA_PRED_TEST(SSE2, TestIntraPred32, vpx_dc_predictor_32x32_sse2,
                 vpx_dc_left_predictor_32x32_sse2,
                 vpx_dc_top_predictor_32x32_sse2,
                 vpx_dc_128_predictor_32x32_sse2, vpx_v_predictor_32x32_sse2,
                 vpx_h_predictor_32x32_sse2, NULL, NULL, NULL, NULL, NULL,
                 NULL, vpx_tm_predictor_32x32_sse2)
-#endif  // HAVE_SSE2 && CONFIG_USE_X86INC
+#endif  // HAVE_SSE2
 
-#if HAVE_SSSE3 && CONFIG_USE_X86INC
+#if HAVE_SSSE3
 INTRA_PRED_TEST(SSSE3, TestIntraPred32, NULL, NULL, NULL, NULL, NULL,
                 NULL, vpx_d45_predictor_32x32_ssse3, NULL, NULL,
                 vpx_d153_predictor_32x32_ssse3, vpx_d207_predictor_32x32_ssse3,
                 vpx_d63_predictor_32x32_ssse3, NULL)
-#endif  // HAVE_SSSE3 && CONFIG_USE_X86INC
+#endif  // HAVE_SSSE3
 
 #if HAVE_NEON
 INTRA_PRED_TEST(NEON, TestIntraPred32, vpx_dc_predictor_32x32_neon,
@@ -217,7 +217,6 @@ class VarianceTest
     : public ::testing::TestWithParam<tuple<int, int,
                                             VarianceFunctionType, int> > {
  public:
-  typedef tuple<int, int, VarianceFunctionType, int> ParamType;
   virtual void SetUp() {
     const tuple<int, int, VarianceFunctionType, int>& params = this->GetParam();
     log2width_  = get<0>(params);
@@ -766,77 +765,53 @@ INSTANTIATE_TEST_CASE_P(C, VpxMseTest,
                       make_tuple(3, 4, &vpx_mse8x16_c),
                       make_tuple(3, 3, &vpx_mse8x8_c)));
 
-const VpxVarianceTest::ParamType kArrayVariance_c[] = {
-#if CONFIG_VP10 && CONFIG_EXT_PARTITION
-  make_tuple(7, 7, &vpx_variance128x128_c, 0),
-  make_tuple(7, 6, &vpx_variance128x64_c, 0),
-  make_tuple(6, 7, &vpx_variance64x128_c, 0),
-#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
-  make_tuple(6, 6, &vpx_variance64x64_c, 0),
-  make_tuple(6, 5, &vpx_variance64x32_c, 0),
-  make_tuple(5, 6, &vpx_variance32x64_c, 0),
-  make_tuple(5, 5, &vpx_variance32x32_c, 0),
-  make_tuple(5, 4, &vpx_variance32x16_c, 0),
-  make_tuple(4, 5, &vpx_variance16x32_c, 0),
-  make_tuple(4, 4, &vpx_variance16x16_c, 0),
-  make_tuple(4, 3, &vpx_variance16x8_c, 0),
-  make_tuple(3, 4, &vpx_variance8x16_c, 0),
-  make_tuple(3, 3, &vpx_variance8x8_c, 0),
-  make_tuple(3, 2, &vpx_variance8x4_c, 0),
-  make_tuple(2, 3, &vpx_variance4x8_c, 0),
-  make_tuple(2, 2, &vpx_variance4x4_c, 0)
-};
 INSTANTIATE_TEST_CASE_P(
     C, VpxVarianceTest,
-    ::testing::ValuesIn(kArrayVariance_c));
+    ::testing::Values(make_tuple(6, 6, &vpx_variance64x64_c, 0),
+                      make_tuple(6, 5, &vpx_variance64x32_c, 0),
+                      make_tuple(5, 6, &vpx_variance32x64_c, 0),
+                      make_tuple(5, 5, &vpx_variance32x32_c, 0),
+                      make_tuple(5, 4, &vpx_variance32x16_c, 0),
+                      make_tuple(4, 5, &vpx_variance16x32_c, 0),
+                      make_tuple(4, 4, &vpx_variance16x16_c, 0),
+                      make_tuple(4, 3, &vpx_variance16x8_c, 0),
+                      make_tuple(3, 4, &vpx_variance8x16_c, 0),
+                      make_tuple(3, 3, &vpx_variance8x8_c, 0),
+                      make_tuple(3, 2, &vpx_variance8x4_c, 0),
+                      make_tuple(2, 3, &vpx_variance4x8_c, 0),
+                      make_tuple(2, 2, &vpx_variance4x4_c, 0)));
 
-const VpxSubpelVarianceTest::ParamType kArraySubpelVariance_c[] = {
-#if CONFIG_VP10 && CONFIG_EXT_PARTITION
-  make_tuple(7, 7, &vpx_sub_pixel_variance128x128_c, 0),
-  make_tuple(7, 6, &vpx_sub_pixel_variance128x64_c, 0),
-  make_tuple(6, 7, &vpx_sub_pixel_variance64x128_c, 0),
-#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
-  make_tuple(6, 6, &vpx_sub_pixel_variance64x64_c, 0),
-  make_tuple(6, 5, &vpx_sub_pixel_variance64x32_c, 0),
-  make_tuple(5, 6, &vpx_sub_pixel_variance32x64_c, 0),
-  make_tuple(5, 5, &vpx_sub_pixel_variance32x32_c, 0),
-  make_tuple(5, 4, &vpx_sub_pixel_variance32x16_c, 0),
-  make_tuple(4, 5, &vpx_sub_pixel_variance16x32_c, 0),
-  make_tuple(4, 4, &vpx_sub_pixel_variance16x16_c, 0),
-  make_tuple(4, 3, &vpx_sub_pixel_variance16x8_c, 0),
-  make_tuple(3, 4, &vpx_sub_pixel_variance8x16_c, 0),
-  make_tuple(3, 3, &vpx_sub_pixel_variance8x8_c, 0),
-  make_tuple(3, 2, &vpx_sub_pixel_variance8x4_c, 0),
-  make_tuple(2, 3, &vpx_sub_pixel_variance4x8_c, 0),
-  make_tuple(2, 2, &vpx_sub_pixel_variance4x4_c, 0)
-};
 INSTANTIATE_TEST_CASE_P(
     C, VpxSubpelVarianceTest,
-    ::testing::ValuesIn(kArraySubpelVariance_c));
+    ::testing::Values(make_tuple(6, 6, &vpx_sub_pixel_variance64x64_c, 0),
+                      make_tuple(6, 5, &vpx_sub_pixel_variance64x32_c, 0),
+                      make_tuple(5, 6, &vpx_sub_pixel_variance32x64_c, 0),
+                      make_tuple(5, 5, &vpx_sub_pixel_variance32x32_c, 0),
+                      make_tuple(5, 4, &vpx_sub_pixel_variance32x16_c, 0),
+                      make_tuple(4, 5, &vpx_sub_pixel_variance16x32_c, 0),
+                      make_tuple(4, 4, &vpx_sub_pixel_variance16x16_c, 0),
+                      make_tuple(4, 3, &vpx_sub_pixel_variance16x8_c, 0),
+                      make_tuple(3, 4, &vpx_sub_pixel_variance8x16_c, 0),
+                      make_tuple(3, 3, &vpx_sub_pixel_variance8x8_c, 0),
+                      make_tuple(3, 2, &vpx_sub_pixel_variance8x4_c, 0),
+                      make_tuple(2, 3, &vpx_sub_pixel_variance4x8_c, 0),
+                      make_tuple(2, 2, &vpx_sub_pixel_variance4x4_c, 0)));
 
-const VpxSubpelAvgVarianceTest::ParamType kArraySubpelAvgVariance_c[] = {
-#if CONFIG_VP10 && CONFIG_EXT_PARTITION
-  make_tuple(7, 7, &vpx_sub_pixel_avg_variance128x128_c, 0),
-  make_tuple(7, 6, &vpx_sub_pixel_avg_variance128x64_c, 0),
-  make_tuple(6, 7, &vpx_sub_pixel_avg_variance64x128_c, 0),
-#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
-  make_tuple(6, 6, &vpx_sub_pixel_avg_variance64x64_c, 0),
-  make_tuple(6, 5, &vpx_sub_pixel_avg_variance64x32_c, 0),
-  make_tuple(5, 6, &vpx_sub_pixel_avg_variance32x64_c, 0),
-  make_tuple(5, 5, &vpx_sub_pixel_avg_variance32x32_c, 0),
-  make_tuple(5, 4, &vpx_sub_pixel_avg_variance32x16_c, 0),
-  make_tuple(4, 5, &vpx_sub_pixel_avg_variance16x32_c, 0),
-  make_tuple(4, 4, &vpx_sub_pixel_avg_variance16x16_c, 0),
-  make_tuple(4, 3, &vpx_sub_pixel_avg_variance16x8_c, 0),
-  make_tuple(3, 4, &vpx_sub_pixel_avg_variance8x16_c, 0),
-  make_tuple(3, 3, &vpx_sub_pixel_avg_variance8x8_c, 0),
-  make_tuple(3, 2, &vpx_sub_pixel_avg_variance8x4_c, 0),
-  make_tuple(2, 3, &vpx_sub_pixel_avg_variance4x8_c, 0),
-  make_tuple(2, 2, &vpx_sub_pixel_avg_variance4x4_c, 0)
-};
 INSTANTIATE_TEST_CASE_P(
     C, VpxSubpelAvgVarianceTest,
-    ::testing::ValuesIn(kArraySubpelAvgVariance_c));
+    ::testing::Values(make_tuple(6, 6, &vpx_sub_pixel_avg_variance64x64_c, 0),
+                      make_tuple(6, 5, &vpx_sub_pixel_avg_variance64x32_c, 0),
+                      make_tuple(5, 6, &vpx_sub_pixel_avg_variance32x64_c, 0),
+                      make_tuple(5, 5, &vpx_sub_pixel_avg_variance32x32_c, 0),
+                      make_tuple(5, 4, &vpx_sub_pixel_avg_variance32x16_c, 0),
+                      make_tuple(4, 5, &vpx_sub_pixel_avg_variance16x32_c, 0),
+                      make_tuple(4, 4, &vpx_sub_pixel_avg_variance16x16_c, 0),
+                      make_tuple(4, 3, &vpx_sub_pixel_avg_variance16x8_c, 0),
+                      make_tuple(3, 4, &vpx_sub_pixel_avg_variance8x16_c, 0),
+                      make_tuple(3, 3, &vpx_sub_pixel_avg_variance8x8_c, 0),
+                      make_tuple(3, 2, &vpx_sub_pixel_avg_variance8x4_c, 0),
+                      make_tuple(2, 3, &vpx_sub_pixel_avg_variance4x8_c, 0),
+                      make_tuple(2, 2, &vpx_sub_pixel_avg_variance4x4_c, 0)));
 
 #if CONFIG_VP9_HIGHBITDEPTH
 typedef MseTest<VarianceMxNFunc> VpxHBDMseTest;
@@ -872,73 +847,70 @@ INSTANTIATE_TEST_CASE_P(
       make_tuple(4, 4, &vpx_highbd_8_mse8x8_c)));
 */
 
-const VpxHBDVarianceTest::ParamType kArrayHBDVariance_c[] = {
-#if CONFIG_VP10 && CONFIG_EXT_PARTITION
-  make_tuple(7, 7, &vpx_highbd_12_variance128x128_c, 12),
-  make_tuple(7, 6, &vpx_highbd_12_variance128x64_c, 12),
-  make_tuple(6, 7, &vpx_highbd_12_variance64x128_c, 12),
-#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
-  make_tuple(6, 6, &vpx_highbd_12_variance64x64_c, 12),
-  make_tuple(6, 5, &vpx_highbd_12_variance64x32_c, 12),
-  make_tuple(5, 6, &vpx_highbd_12_variance32x64_c, 12),
-  make_tuple(5, 5, &vpx_highbd_12_variance32x32_c, 12),
-  make_tuple(5, 4, &vpx_highbd_12_variance32x16_c, 12),
-  make_tuple(4, 5, &vpx_highbd_12_variance16x32_c, 12),
-  make_tuple(4, 4, &vpx_highbd_12_variance16x16_c, 12),
-  make_tuple(4, 3, &vpx_highbd_12_variance16x8_c, 12),
-  make_tuple(3, 4, &vpx_highbd_12_variance8x16_c, 12),
-  make_tuple(3, 3, &vpx_highbd_12_variance8x8_c, 12),
-  make_tuple(3, 2, &vpx_highbd_12_variance8x4_c, 12),
-  make_tuple(2, 3, &vpx_highbd_12_variance4x8_c, 12),
-  make_tuple(2, 2, &vpx_highbd_12_variance4x4_c, 12),
-#if CONFIG_VP10 && CONFIG_EXT_PARTITION
-  make_tuple(7, 7, &vpx_highbd_10_variance128x128_c, 10),
-  make_tuple(7, 6, &vpx_highbd_10_variance128x64_c, 10),
-  make_tuple(6, 7, &vpx_highbd_10_variance64x128_c, 10),
-#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
-  make_tuple(6, 6, &vpx_highbd_10_variance64x64_c, 10),
-  make_tuple(6, 5, &vpx_highbd_10_variance64x32_c, 10),
-  make_tuple(5, 6, &vpx_highbd_10_variance32x64_c, 10),
-  make_tuple(5, 5, &vpx_highbd_10_variance32x32_c, 10),
-  make_tuple(5, 4, &vpx_highbd_10_variance32x16_c, 10),
-  make_tuple(4, 5, &vpx_highbd_10_variance16x32_c, 10),
-  make_tuple(4, 4, &vpx_highbd_10_variance16x16_c, 10),
-  make_tuple(4, 3, &vpx_highbd_10_variance16x8_c, 10),
-  make_tuple(3, 4, &vpx_highbd_10_variance8x16_c, 10),
-  make_tuple(3, 3, &vpx_highbd_10_variance8x8_c, 10),
-  make_tuple(3, 2, &vpx_highbd_10_variance8x4_c, 10),
-  make_tuple(2, 3, &vpx_highbd_10_variance4x8_c, 10),
-  make_tuple(2, 2, &vpx_highbd_10_variance4x4_c, 10),
-#if CONFIG_VP10 && CONFIG_EXT_PARTITION
-  make_tuple(7, 7, &vpx_highbd_8_variance128x128_c, 8),
-  make_tuple(7, 6, &vpx_highbd_8_variance128x64_c, 8),
-  make_tuple(6, 7, &vpx_highbd_8_variance64x128_c, 8),
-#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
-  make_tuple(6, 6, &vpx_highbd_8_variance64x64_c, 8),
-  make_tuple(6, 5, &vpx_highbd_8_variance64x32_c, 8),
-  make_tuple(5, 6, &vpx_highbd_8_variance32x64_c, 8),
-  make_tuple(5, 5, &vpx_highbd_8_variance32x32_c, 8),
-  make_tuple(5, 4, &vpx_highbd_8_variance32x16_c, 8),
-  make_tuple(4, 5, &vpx_highbd_8_variance16x32_c, 8),
-  make_tuple(4, 4, &vpx_highbd_8_variance16x16_c, 8),
-  make_tuple(4, 3, &vpx_highbd_8_variance16x8_c, 8),
-  make_tuple(3, 4, &vpx_highbd_8_variance8x16_c, 8),
-  make_tuple(3, 3, &vpx_highbd_8_variance8x8_c, 8),
-  make_tuple(3, 2, &vpx_highbd_8_variance8x4_c, 8),
-  make_tuple(2, 3, &vpx_highbd_8_variance4x8_c, 8),
-  make_tuple(2, 2, &vpx_highbd_8_variance4x4_c, 8)
-};
 INSTANTIATE_TEST_CASE_P(
     C, VpxHBDVarianceTest,
-    ::testing::ValuesIn(kArrayHBDVariance_c));
+    ::testing::Values(
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+        make_tuple(7, 7, &vpx_highbd_12_variance128x128_c, 12),
+        make_tuple(7, 6, &vpx_highbd_12_variance128x64_c, 12),
+        make_tuple(6, 7, &vpx_highbd_12_variance64x128_c, 12),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
+        make_tuple(6, 6, &vpx_highbd_12_variance64x64_c, 12),
+        make_tuple(6, 5, &vpx_highbd_12_variance64x32_c, 12),
+        make_tuple(5, 6, &vpx_highbd_12_variance32x64_c, 12),
+        make_tuple(5, 5, &vpx_highbd_12_variance32x32_c, 12),
+        make_tuple(5, 4, &vpx_highbd_12_variance32x16_c, 12),
+        make_tuple(4, 5, &vpx_highbd_12_variance16x32_c, 12),
+        make_tuple(4, 4, &vpx_highbd_12_variance16x16_c, 12),
+        make_tuple(4, 3, &vpx_highbd_12_variance16x8_c, 12),
+        make_tuple(3, 4, &vpx_highbd_12_variance8x16_c, 12),
+        make_tuple(3, 3, &vpx_highbd_12_variance8x8_c, 12),
+        make_tuple(3, 2, &vpx_highbd_12_variance8x4_c, 12),
+        make_tuple(2, 3, &vpx_highbd_12_variance4x8_c, 12),
+        make_tuple(2, 2, &vpx_highbd_12_variance4x4_c, 12),
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+        make_tuple(7, 7, &vpx_highbd_10_variance128x128_c, 10),
+        make_tuple(7, 6, &vpx_highbd_10_variance128x64_c, 10),
+        make_tuple(6, 7, &vpx_highbd_10_variance64x128_c, 10),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
+        make_tuple(6, 6, &vpx_highbd_10_variance64x64_c, 10),
+        make_tuple(6, 5, &vpx_highbd_10_variance64x32_c, 10),
+        make_tuple(5, 6, &vpx_highbd_10_variance32x64_c, 10),
+        make_tuple(5, 5, &vpx_highbd_10_variance32x32_c, 10),
+        make_tuple(5, 4, &vpx_highbd_10_variance32x16_c, 10),
+        make_tuple(4, 5, &vpx_highbd_10_variance16x32_c, 10),
+        make_tuple(4, 4, &vpx_highbd_10_variance16x16_c, 10),
+        make_tuple(4, 3, &vpx_highbd_10_variance16x8_c, 10),
+        make_tuple(3, 4, &vpx_highbd_10_variance8x16_c, 10),
+        make_tuple(3, 3, &vpx_highbd_10_variance8x8_c, 10),
+        make_tuple(3, 2, &vpx_highbd_10_variance8x4_c, 10),
+        make_tuple(2, 3, &vpx_highbd_10_variance4x8_c, 10),
+        make_tuple(2, 2, &vpx_highbd_10_variance4x4_c, 10),
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+        make_tuple(7, 7, &vpx_highbd_8_variance128x128_c, 8),
+        make_tuple(7, 6, &vpx_highbd_8_variance128x64_c, 8),
+        make_tuple(6, 7, &vpx_highbd_8_variance64x128_c, 8),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
+        make_tuple(6, 6, &vpx_highbd_8_variance64x64_c, 8),
+        make_tuple(6, 5, &vpx_highbd_8_variance64x32_c, 8),
+        make_tuple(5, 6, &vpx_highbd_8_variance32x64_c, 8),
+        make_tuple(5, 5, &vpx_highbd_8_variance32x32_c, 8),
+        make_tuple(5, 4, &vpx_highbd_8_variance32x16_c, 8),
+        make_tuple(4, 5, &vpx_highbd_8_variance16x32_c, 8),
+        make_tuple(4, 4, &vpx_highbd_8_variance16x16_c, 8),
+        make_tuple(4, 3, &vpx_highbd_8_variance16x8_c, 8),
+        make_tuple(3, 4, &vpx_highbd_8_variance8x16_c, 8),
+        make_tuple(3, 3, &vpx_highbd_8_variance8x8_c, 8),
+        make_tuple(3, 2, &vpx_highbd_8_variance8x4_c, 8),
+        make_tuple(2, 3, &vpx_highbd_8_variance4x8_c, 8),
+        make_tuple(2, 2, &vpx_highbd_8_variance4x4_c, 8)));
 
 #if HAVE_SSE4_1 && CONFIG_VP9_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(
     SSE4_1, VpxHBDVarianceTest,
-    ::testing::Values(
-        make_tuple(2, 2, &vpx_highbd_8_variance4x4_sse4_1, 8),
-        make_tuple(2, 2, &vpx_highbd_10_variance4x4_sse4_1, 10),
-        make_tuple(2, 2, &vpx_highbd_12_variance4x4_sse4_1, 12)));
+    ::testing::Values(make_tuple(2, 2, &vpx_highbd_8_variance4x4_sse4_1, 8),
+                      make_tuple(2, 2, &vpx_highbd_10_variance4x4_sse4_1, 10),
+                      make_tuple(2, 2, &vpx_highbd_12_variance4x4_sse4_1, 12)));
 #endif  // HAVE_SSE4_1 && CONFIG_VP9_HIGHBITDEPTH
 
 const VpxHBDSubpelVarianceTest::ParamType kArrayHBDSubpelVariance_c[] = {
@@ -995,7 +967,7 @@ const VpxHBDSubpelVarianceTest::ParamType kArrayHBDSubpelVariance_c[] = {
   make_tuple(3, 3, &vpx_highbd_12_sub_pixel_variance8x8_c, 12),
   make_tuple(3, 2, &vpx_highbd_12_sub_pixel_variance8x4_c, 12),
   make_tuple(2, 3, &vpx_highbd_12_sub_pixel_variance4x8_c, 12),
-  make_tuple(2, 2, &vpx_highbd_12_sub_pixel_variance4x4_c, 12)
+  make_tuple(2, 2, &vpx_highbd_12_sub_pixel_variance4x4_c, 12),
 };
 INSTANTIATE_TEST_CASE_P(
     C, VpxHBDSubpelVarianceTest,
@@ -1088,7 +1060,6 @@ INSTANTIATE_TEST_CASE_P(
                       make_tuple(2, 3, &vpx_variance4x8_sse2, 0),
                       make_tuple(2, 2, &vpx_variance4x4_sse2, 0)));
 
-#if CONFIG_USE_X86INC
 INSTANTIATE_TEST_CASE_P(
     SSE2, VpxSubpelVarianceTest,
     ::testing::Values(make_tuple(6, 6, &vpx_sub_pixel_variance64x64_sse2, 0),
@@ -1121,7 +1092,6 @@ INSTANTIATE_TEST_CASE_P(
                       make_tuple(3, 2, &vpx_sub_pixel_avg_variance8x4_sse2, 0),
                       make_tuple(2, 3, &vpx_sub_pixel_avg_variance4x8_sse2, 0),
                       make_tuple(2, 2, &vpx_sub_pixel_avg_variance4x4_sse2, 0)));
-#endif  // CONFIG_USE_X86INC
 
 #if HAVE_SSE4_1 && CONFIG_VP9_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(
@@ -1190,7 +1160,6 @@ INSTANTIATE_TEST_CASE_P(
                       make_tuple(3, 4, &vpx_highbd_8_variance8x16_sse2, 8),
                       make_tuple(3, 3, &vpx_highbd_8_variance8x8_sse2, 8)));
 
-#if CONFIG_USE_X86INC
 INSTANTIATE_TEST_CASE_P(
     SSE2, VpxHBDSubpelVarianceTest,
     ::testing::Values(
@@ -1264,12 +1233,10 @@ INSTANTIATE_TEST_CASE_P(
         make_tuple(3, 4, &vpx_highbd_8_sub_pixel_avg_variance8x16_sse2, 8),
         make_tuple(3, 3, &vpx_highbd_8_sub_pixel_avg_variance8x8_sse2, 8),
         make_tuple(3, 2, &vpx_highbd_8_sub_pixel_avg_variance8x4_sse2, 8)));
-#endif  // CONFIG_USE_X86INC
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif  // HAVE_SSE2
 
 #if HAVE_SSSE3
-#if CONFIG_USE_X86INC
 INSTANTIATE_TEST_CASE_P(
     SSSE3, VpxSubpelVarianceTest,
     ::testing::Values(make_tuple(6, 6, &vpx_sub_pixel_variance64x64_ssse3, 0),
@@ -1302,7 +1269,6 @@ INSTANTIATE_TEST_CASE_P(
                       make_tuple(3, 2, &vpx_sub_pixel_avg_variance8x4_ssse3, 0),
                       make_tuple(2, 3, &vpx_sub_pixel_avg_variance4x8_ssse3, 0),
                       make_tuple(2, 2, &vpx_sub_pixel_avg_variance4x4_ssse3, 0)));
-#endif  // CONFIG_USE_X86INC
 #endif  // HAVE_SSSE3
 
 #if HAVE_AVX2
@@ -157,9 +157,9 @@ TEST_P(ErrorBlockTest, ExtremeValues) {
       << "First failed at test case " << first_failure;
 }
 
-#if HAVE_SSE2 || HAVE_AVX
 using std::tr1::make_tuple;
 
-#if CONFIG_USE_X86INC
+#if HAVE_SSE2 || HAVE_AVX
 int64_t wrap_vp9_highbd_block_error_8bit_c(const tran_low_t *coeff,
                                            const tran_low_t *dqcoeff,
                                            intptr_t block_size,
@@ -167,6 +167,7 @@ int64_t wrap_vp9_highbd_block_error_8bit_c(const tran_low_t *coeff,
   EXPECT_EQ(8, bps);
   return vp9_highbd_block_error_8bit_c(coeff, dqcoeff, block_size, ssz);
 }
+#endif  // HAVE_SSE2 || HAVE_AVX
 
 #if HAVE_SSE2
 int64_t wrap_vp9_highbd_block_error_8bit_sse2(const tran_low_t *coeff,
@@ -206,6 +207,5 @@ INSTANTIATE_TEST_CASE_P(
         &wrap_vp9_highbd_block_error_8bit_c, VPX_BITS_8)));
 #endif  // HAVE_AVX
 
-#endif  // CONFIG_USE_X86INC
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 }  // namespace
@@ -28,45 +28,43 @@ using libvpx_test::ACMRandom;
|
||||
|
||||
const int count_test_block = 100000;
|
||||
|
||||
// Base class for VP9 intra prediction tests.
|
||||
class VP9IntraPredBase {
|
||||
typedef void (*IntraPred)(uint16_t* dst, ptrdiff_t stride,
|
||||
const uint16_t* above, const uint16_t* left,
|
||||
int bps);
|
||||
|
||||
struct IntraPredFunc {
|
||||
IntraPredFunc(IntraPred pred = NULL, IntraPred ref = NULL,
|
||||
int block_size_value = 0, int bit_depth_value = 0)
|
||||
: pred_fn(pred), ref_fn(ref),
|
||||
block_size(block_size_value), bit_depth(bit_depth_value) {}
|
||||
|
||||
IntraPred pred_fn;
|
||||
IntraPred ref_fn;
|
||||
int block_size;
|
||||
int bit_depth;
|
||||
};
|
||||
|
||||
class VP9IntraPredTest : public ::testing::TestWithParam<IntraPredFunc> {
|
||||
public:
|
||||
virtual ~VP9IntraPredBase() { libvpx_test::ClearSystemState(); }
|
||||
|
||||
protected:
|
||||
virtual void Predict() = 0;
|
||||
|
||||
void CheckPrediction(int test_case_number, int *error_count) const {
|
||||
// For each pixel ensure that the calculated value is the same as reference.
|
||||
for (int y = 0; y < block_size_; y++) {
|
||||
for (int x = 0; x < block_size_; x++) {
|
||||
*error_count += ref_dst_[x + y * stride_] != dst_[x + y * stride_];
|
||||
if (*error_count == 1) {
|
||||
ASSERT_EQ(ref_dst_[x + y * stride_], dst_[x + y * stride_])
|
||||
<< " Failed on Test Case Number "<< test_case_number;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void RunTest(uint16_t* left_col, uint16_t* above_data,
|
||||
uint16_t* dst, uint16_t* ref_dst) {
|
||||
ACMRandom rnd(ACMRandom::DeterministicSeed());
|
||||
const int block_size = params_.block_size;
|
||||
above_row_ = above_data + 16;
|
||||
left_col_ = left_col;
|
||||
dst_ = dst;
|
||||
ref_dst_ = ref_dst;
|
||||
above_row_ = above_data + 16;
|
||||
int error_count = 0;
|
||||
for (int i = 0; i < count_test_block; ++i) {
|
||||
// Fill edges with random data, try first with saturated values.
|
||||
for (int x = -1; x <= block_size_*2; x++) {
|
||||
for (int x = -1; x <= block_size * 2; x++) {
|
||||
if (i == 0) {
|
||||
above_row_[x] = mask_;
|
||||
} else {
|
||||
above_row_[x] = rnd.Rand16() & mask_;
|
||||
}
|
||||
}
|
||||
for (int y = 0; y < block_size_; y++) {
|
||||
for (int y = 0; y < block_size; y++) {
|
||||
if (i == 0) {
|
||||
left_col_[y] = mask_;
|
||||
} else {
|
||||
@@ -79,43 +77,42 @@ class VP9IntraPredBase {
|
||||
ASSERT_EQ(0, error_count);
|
||||
}
|
||||
|
||||
int block_size_;
|
||||
protected:
|
||||
virtual void SetUp() {
|
||||
params_ = GetParam();
|
||||
stride_ = params_.block_size * 3;
|
||||
mask_ = (1 << params_.bit_depth) - 1;
|
||||
}
|
||||
|
||||
void Predict() {
|
||||
const int bit_depth = params_.bit_depth;
|
||||
params_.ref_fn(ref_dst_, stride_, above_row_, left_col_, bit_depth);
|
||||
ASM_REGISTER_STATE_CHECK(params_.pred_fn(dst_, stride_,
|
||||
above_row_, left_col_, bit_depth));
|
||||
}
|
||||
|
||||
void CheckPrediction(int test_case_number, int *error_count) const {
|
||||
// For each pixel ensure that the calculated value is the same as reference.
|
||||
const int block_size = params_.block_size;
|
||||
for (int y = 0; y < block_size; y++) {
|
||||
for (int x = 0; x < block_size; x++) {
|
||||
*error_count += ref_dst_[x + y * stride_] != dst_[x + y * stride_];
|
||||
if (*error_count == 1) {
|
||||
ASSERT_EQ(ref_dst_[x + y * stride_], dst_[x + y * stride_])
|
||||
<< " Failed on Test Case Number "<< test_case_number;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
uint16_t *above_row_;
|
||||
uint16_t *left_col_;
|
||||
uint16_t *dst_;
|
||||
uint16_t *ref_dst_;
|
||||
ptrdiff_t stride_;
|
||||
int mask_;
|
||||
};
|
||||
|
||||
typedef void (*intra_pred_fn_t)(
|
||||
uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
|
||||
const uint16_t *left, int bps);
|
||||
typedef std::tr1::tuple<intra_pred_fn_t,
|
||||
intra_pred_fn_t, int, int> intra_pred_params_t;
|
||||
class VP9IntraPredTest
|
||||
: public VP9IntraPredBase,
|
||||
public ::testing::TestWithParam<intra_pred_params_t> {
|
||||
|
||||
virtual void SetUp() {
|
||||
pred_fn_ = GET_PARAM(0);
|
||||
ref_fn_ = GET_PARAM(1);
|
||||
block_size_ = GET_PARAM(2);
|
||||
bit_depth_ = GET_PARAM(3);
|
||||
stride_ = block_size_ * 3;
|
||||
mask_ = (1 << bit_depth_) - 1;
|
||||
}
|
||||
|
||||
virtual void Predict() {
|
||||
const uint16_t *const_above_row = above_row_;
|
||||
const uint16_t *const_left_col = left_col_;
|
||||
ref_fn_(ref_dst_, stride_, const_above_row, const_left_col, bit_depth_);
|
||||
ASM_REGISTER_STATE_CHECK(pred_fn_(dst_, stride_, const_above_row,
|
||||
const_left_col, bit_depth_));
|
||||
}
|
||||
intra_pred_fn_t pred_fn_;
|
||||
intra_pred_fn_t ref_fn_;
|
||||
int bit_depth_;
|
||||
IntraPredFunc params_;
|
||||
};
|
||||
|
||||
TEST_P(VP9IntraPredTest, IntraPredTests) {
|
||||
@@ -127,105 +124,89 @@ TEST_P(VP9IntraPredTest, IntraPredTests) {
|
||||
RunTest(left_col, above_data, dst, ref_dst);
|
||||
}
|
||||
|
||||
using std::tr1::make_tuple;
|
||||
|
||||
#if HAVE_SSE2
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
#if CONFIG_USE_X86INC
|
||||
INSTANTIATE_TEST_CASE_P(SSE2_TO_C_8, VP9IntraPredTest,
|
||||
::testing::Values(
|
||||
make_tuple(&vpx_highbd_dc_predictor_32x32_sse2,
|
||||
&vpx_highbd_dc_predictor_32x32_c, 32, 8),
|
||||
make_tuple(&vpx_highbd_tm_predictor_16x16_sse2,
|
||||
&vpx_highbd_tm_predictor_16x16_c, 16, 8),
|
||||
make_tuple(&vpx_highbd_tm_predictor_32x32_sse2,
|
||||
&vpx_highbd_tm_predictor_32x32_c, 32, 8),
|
||||
make_tuple(&vpx_highbd_dc_predictor_4x4_sse2,
|
||||
&vpx_highbd_dc_predictor_4x4_c, 4, 8),
|
||||
make_tuple(&vpx_highbd_dc_predictor_8x8_sse2,
|
||||
&vpx_highbd_dc_predictor_8x8_c, 8, 8),
|
||||
make_tuple(&vpx_highbd_dc_predictor_16x16_sse2,
|
||||
&vpx_highbd_dc_predictor_16x16_c, 16, 8),
|
||||
make_tuple(&vpx_highbd_v_predictor_4x4_sse2,
|
||||
&vpx_highbd_v_predictor_4x4_c, 4, 8),
|
||||
make_tuple(&vpx_highbd_v_predictor_8x8_sse2,
|
||||
&vpx_highbd_v_predictor_8x8_c, 8, 8),
|
||||
make_tuple(&vpx_highbd_v_predictor_16x16_sse2,
|
||||
&vpx_highbd_v_predictor_16x16_c, 16, 8),
|
||||
make_tuple(&vpx_highbd_v_predictor_32x32_sse2,
|
||||
&vpx_highbd_v_predictor_32x32_c, 32, 8),
|
||||
make_tuple(&vpx_highbd_tm_predictor_4x4_sse2,
|
||||
&vpx_highbd_tm_predictor_4x4_c, 4, 8),
|
||||
make_tuple(&vpx_highbd_tm_predictor_8x8_sse2,
|
||||
&vpx_highbd_tm_predictor_8x8_c, 8, 8)));
|
||||
::testing::Values(
|
||||
IntraPredFunc(&vpx_highbd_dc_predictor_32x32_sse2,
|
||||
&vpx_highbd_dc_predictor_32x32_c, 32, 8),
|
||||
IntraPredFunc(&vpx_highbd_tm_predictor_16x16_sse2,
|
||||
&vpx_highbd_tm_predictor_16x16_c, 16, 8),
|
||||
IntraPredFunc(&vpx_highbd_tm_predictor_32x32_sse2,
|
||||
&vpx_highbd_tm_predictor_32x32_c, 32, 8),
|
||||
IntraPredFunc(&vpx_highbd_dc_predictor_4x4_sse2,
|
||||
&vpx_highbd_dc_predictor_4x4_c, 4, 8),
|
||||
IntraPredFunc(&vpx_highbd_dc_predictor_8x8_sse2,
|
||||
&vpx_highbd_dc_predictor_8x8_c, 8, 8),
|
||||
IntraPredFunc(&vpx_highbd_dc_predictor_16x16_sse2,
|
||||
&vpx_highbd_dc_predictor_16x16_c, 16, 8),
|
||||
IntraPredFunc(&vpx_highbd_v_predictor_4x4_sse2,
|
||||
&vpx_highbd_v_predictor_4x4_c, 4, 8),
|
||||
IntraPredFunc(&vpx_highbd_v_predictor_8x8_sse2,
|
||||
&vpx_highbd_v_predictor_8x8_c, 8, 8),
|
||||
IntraPredFunc(&vpx_highbd_v_predictor_16x16_sse2,
|
||||
&vpx_highbd_v_predictor_16x16_c, 16, 8),
|
||||
IntraPredFunc(&vpx_highbd_v_predictor_32x32_sse2,
|
||||
&vpx_highbd_v_predictor_32x32_c, 32, 8),
|
||||
IntraPredFunc(&vpx_highbd_tm_predictor_4x4_sse2,
|
||||
&vpx_highbd_tm_predictor_4x4_c, 4, 8),
|
||||
IntraPredFunc(&vpx_highbd_tm_predictor_8x8_sse2,
|
||||
&vpx_highbd_tm_predictor_8x8_c, 8, 8)));
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(SSE2_TO_C_10, VP9IntraPredTest,
-    ::testing::Values(
-        make_tuple(&vpx_highbd_dc_predictor_32x32_sse2,
-                   &vpx_highbd_dc_predictor_32x32_c, 32, 10),
-        make_tuple(&vpx_highbd_tm_predictor_16x16_sse2,
-                   &vpx_highbd_tm_predictor_16x16_c, 16, 10),
-        make_tuple(&vpx_highbd_tm_predictor_32x32_sse2,
-                   &vpx_highbd_tm_predictor_32x32_c, 32, 10),
-        make_tuple(&vpx_highbd_dc_predictor_4x4_sse2,
-                   &vpx_highbd_dc_predictor_4x4_c, 4, 10),
-        make_tuple(&vpx_highbd_dc_predictor_8x8_sse2,
-                   &vpx_highbd_dc_predictor_8x8_c, 8, 10),
-        make_tuple(&vpx_highbd_dc_predictor_16x16_sse2,
-                   &vpx_highbd_dc_predictor_16x16_c, 16, 10),
-        make_tuple(&vpx_highbd_v_predictor_4x4_sse2,
-                   &vpx_highbd_v_predictor_4x4_c, 4, 10),
-        make_tuple(&vpx_highbd_v_predictor_8x8_sse2,
-                   &vpx_highbd_v_predictor_8x8_c, 8, 10),
-        make_tuple(&vpx_highbd_v_predictor_16x16_sse2,
-                   &vpx_highbd_v_predictor_16x16_c, 16, 10),
-        make_tuple(&vpx_highbd_v_predictor_32x32_sse2,
-                   &vpx_highbd_v_predictor_32x32_c, 32, 10),
-        make_tuple(&vpx_highbd_tm_predictor_4x4_sse2,
-                   &vpx_highbd_tm_predictor_4x4_c, 4, 10),
-        make_tuple(&vpx_highbd_tm_predictor_8x8_sse2,
-                   &vpx_highbd_tm_predictor_8x8_c, 8, 10)));
+    ::testing::Values(
+        IntraPredFunc(&vpx_highbd_dc_predictor_32x32_sse2,
+                      &vpx_highbd_dc_predictor_32x32_c, 32, 10),
+        IntraPredFunc(&vpx_highbd_tm_predictor_16x16_sse2,
+                      &vpx_highbd_tm_predictor_16x16_c, 16, 10),
+        IntraPredFunc(&vpx_highbd_tm_predictor_32x32_sse2,
+                      &vpx_highbd_tm_predictor_32x32_c, 32, 10),
+        IntraPredFunc(&vpx_highbd_dc_predictor_4x4_sse2,
+                      &vpx_highbd_dc_predictor_4x4_c, 4, 10),
+        IntraPredFunc(&vpx_highbd_dc_predictor_8x8_sse2,
+                      &vpx_highbd_dc_predictor_8x8_c, 8, 10),
+        IntraPredFunc(&vpx_highbd_dc_predictor_16x16_sse2,
+                      &vpx_highbd_dc_predictor_16x16_c, 16, 10),
+        IntraPredFunc(&vpx_highbd_v_predictor_4x4_sse2,
+                      &vpx_highbd_v_predictor_4x4_c, 4, 10),
+        IntraPredFunc(&vpx_highbd_v_predictor_8x8_sse2,
+                      &vpx_highbd_v_predictor_8x8_c, 8, 10),
+        IntraPredFunc(&vpx_highbd_v_predictor_16x16_sse2,
+                      &vpx_highbd_v_predictor_16x16_c, 16, 10),
+        IntraPredFunc(&vpx_highbd_v_predictor_32x32_sse2,
+                      &vpx_highbd_v_predictor_32x32_c, 32, 10),
+        IntraPredFunc(&vpx_highbd_tm_predictor_4x4_sse2,
+                      &vpx_highbd_tm_predictor_4x4_c, 4, 10),
+        IntraPredFunc(&vpx_highbd_tm_predictor_8x8_sse2,
+                      &vpx_highbd_tm_predictor_8x8_c, 8, 10)));

INSTANTIATE_TEST_CASE_P(SSE2_TO_C_12, VP9IntraPredTest,
-    ::testing::Values(
-        make_tuple(&vpx_highbd_dc_predictor_32x32_sse2,
-                   &vpx_highbd_dc_predictor_32x32_c, 32, 12),
-        make_tuple(&vpx_highbd_tm_predictor_16x16_sse2,
-                   &vpx_highbd_tm_predictor_16x16_c, 16, 12),
-        make_tuple(&vpx_highbd_tm_predictor_32x32_sse2,
-                   &vpx_highbd_tm_predictor_32x32_c, 32, 12),
-        make_tuple(&vpx_highbd_dc_predictor_4x4_sse2,
-                   &vpx_highbd_dc_predictor_4x4_c, 4, 12),
-        make_tuple(&vpx_highbd_dc_predictor_8x8_sse2,
-                   &vpx_highbd_dc_predictor_8x8_c, 8, 12),
-        make_tuple(&vpx_highbd_dc_predictor_16x16_sse2,
-                   &vpx_highbd_dc_predictor_16x16_c, 16, 12),
-        make_tuple(&vpx_highbd_v_predictor_4x4_sse2,
-                   &vpx_highbd_v_predictor_4x4_c, 4, 12),
-        make_tuple(&vpx_highbd_v_predictor_8x8_sse2,
-                   &vpx_highbd_v_predictor_8x8_c, 8, 12),
-        make_tuple(&vpx_highbd_v_predictor_16x16_sse2,
-                   &vpx_highbd_v_predictor_16x16_c, 16, 12),
-        make_tuple(&vpx_highbd_v_predictor_32x32_sse2,
-                   &vpx_highbd_v_predictor_32x32_c, 32, 12),
-        make_tuple(&vpx_highbd_tm_predictor_4x4_sse2,
-                   &vpx_highbd_tm_predictor_4x4_c, 4, 12),
-        make_tuple(&vpx_highbd_tm_predictor_8x8_sse2,
-                   &vpx_highbd_tm_predictor_8x8_c, 8, 12)));
+    ::testing::Values(
+        IntraPredFunc(&vpx_highbd_dc_predictor_32x32_sse2,
+                      &vpx_highbd_dc_predictor_32x32_c, 32, 12),
+        IntraPredFunc(&vpx_highbd_tm_predictor_16x16_sse2,
+                      &vpx_highbd_tm_predictor_16x16_c, 16, 12),
+        IntraPredFunc(&vpx_highbd_tm_predictor_32x32_sse2,
+                      &vpx_highbd_tm_predictor_32x32_c, 32, 12),
+        IntraPredFunc(&vpx_highbd_dc_predictor_4x4_sse2,
+                      &vpx_highbd_dc_predictor_4x4_c, 4, 12),
+        IntraPredFunc(&vpx_highbd_dc_predictor_8x8_sse2,
+                      &vpx_highbd_dc_predictor_8x8_c, 8, 12),
+        IntraPredFunc(&vpx_highbd_dc_predictor_16x16_sse2,
+                      &vpx_highbd_dc_predictor_16x16_c, 16, 12),
+        IntraPredFunc(&vpx_highbd_v_predictor_4x4_sse2,
+                      &vpx_highbd_v_predictor_4x4_c, 4, 12),
+        IntraPredFunc(&vpx_highbd_v_predictor_8x8_sse2,
+                      &vpx_highbd_v_predictor_8x8_c, 8, 12),
+        IntraPredFunc(&vpx_highbd_v_predictor_16x16_sse2,
+                      &vpx_highbd_v_predictor_16x16_c, 16, 12),
+        IntraPredFunc(&vpx_highbd_v_predictor_32x32_sse2,
+                      &vpx_highbd_v_predictor_32x32_c, 32, 12),
+        IntraPredFunc(&vpx_highbd_tm_predictor_4x4_sse2,
+                      &vpx_highbd_tm_predictor_4x4_c, 4, 12),
+        IntraPredFunc(&vpx_highbd_tm_predictor_8x8_sse2,
+                      &vpx_highbd_tm_predictor_8x8_c, 8, 12)));

#endif  // CONFIG_USE_X86INC
#endif  // CONFIG_VP9_HIGHBITDEPTH
#endif  // HAVE_SSE2
}  // namespace
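The instantiations above replace std::tr1 make_tuple parameter packs with a named IntraPredFunc value, so each entry is self-describing. A minimal sketch of the shape such a bundle would take; the typedef and field names here are illustrative assumptions, not the test's actual declarations:

#include <stddef.h>
#include <stdint.h>

/* Hypothetical predictor signature and parameter bundle for one test case. */
typedef void (*IntraPred)(uint16_t *dst, ptrdiff_t stride,
                          const uint16_t *above, const uint16_t *left, int bd);

typedef struct {
  IntraPred pred_fn;  /* optimized (SSE2) predictor under test */
  IntraPred ref_fn;   /* C reference the output is checked against */
  int block_size;     /* 4, 8, 16 or 32 */
  int bit_depth;      /* 8, 10 or 12 */
} IntraPredFunc;

Each SSE2_TO_C_* row then reads as: run the SSE2 predictor and the C predictor on identical inputs at the given block size and bit depth, and require bit-exact output.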
third_party/googletest/README.libvpx:
@@ -17,3 +17,5 @@ Local Modifications:
- Added GTEST_ATTRIBUTE_UNUSED_ to test registering dummies in TEST_P
  and INSTANTIATE_TEST_CASE_P to remove warnings about unused variables
  under GCC 5.
+- Only define g_in_fast_death_test_child for non-Windows builds; quiets an
+  unused variable warning.
third_party/googletest/src/src/gtest-all.cc:
@@ -6612,9 +6612,11 @@ GTEST_DEFINE_string_(

namespace internal {

+# if !GTEST_OS_WINDOWS
// Valid only for fast death tests. Indicates the code is running in the
// child process of a fast style death test.
static bool g_in_fast_death_test_child = false;
+# endif  // !GTEST_OS_WINDOWS

// Returns a Boolean value indicating whether the caller is currently
// executing in the context of the death test child process.  Tools such as
@@ -77,7 +77,8 @@ static void highbd_iidtx4_c(const tran_low_t *input, tran_low_t *output,
                            int bd) {
  int i;
  for (i = 0; i < 4; ++i)
-    output[i] = (tran_low_t)highbd_dct_const_round_shift(input[i] * Sqrt2, bd);
+    output[i] = HIGHBD_WRAPLOW(
+        highbd_dct_const_round_shift(input[i] * Sqrt2), bd);
}

static void highbd_iidtx8_c(const tran_low_t *input, tran_low_t *output,

@@ -92,8 +93,8 @@ static void highbd_iidtx16_c(const tran_low_t *input, tran_low_t *output,
                             int bd) {
  int i;
  for (i = 0; i < 16; ++i)
-    output[i] = (tran_low_t)highbd_dct_const_round_shift(
-        input[i] * 2 * Sqrt2, bd);
+    output[i] = HIGHBD_WRAPLOW(
+        highbd_dct_const_round_shift(input[i] * 2 * Sqrt2), bd);
}

static void highbd_iidtx32_c(const tran_low_t *input, tran_low_t *output,

@@ -113,8 +114,8 @@ static void highbd_ihalfright32_c(const tran_low_t *input, tran_low_t *output,
  }
  // Multiply input by sqrt(2)
  for (i = 0; i < 16; ++i) {
-    inputhalf[i] = (tran_low_t)highbd_dct_const_round_shift(
-        input[i] * Sqrt2, bd);
+    inputhalf[i] = HIGHBD_WRAPLOW(
+        highbd_dct_const_round_shift(input[i] * Sqrt2), bd);
  }
  vpx_highbd_idct16_c(inputhalf, output + 16, bd);
  // Note overall scaling factor is 4 times orthogonal

@@ -190,18 +191,18 @@ void highbd_idst4_c(const tran_low_t *input, tran_low_t *output, int bd) {
  // stage 1
  temp1 = (input[3] + input[1]) * cospi_16_64;
  temp2 = (input[3] - input[1]) * cospi_16_64;
-  step[0] = WRAPLOW(dct_const_round_shift(temp1), bd);
-  step[1] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  step[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+  step[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  temp1 = input[2] * cospi_24_64 - input[0] * cospi_8_64;
  temp2 = input[2] * cospi_8_64 + input[0] * cospi_24_64;
-  step[2] = WRAPLOW(dct_const_round_shift(temp1), bd);
-  step[3] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  step[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+  step[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);

  // stage 2
-  output[0] = WRAPLOW(step[0] + step[3], bd);
-  output[1] = WRAPLOW(-step[1] - step[2], bd);
-  output[2] = WRAPLOW(step[1] - step[2], bd);
-  output[3] = WRAPLOW(step[3] - step[0], bd);
+  output[0] = HIGHBD_WRAPLOW(step[0] + step[3], bd);
+  output[1] = HIGHBD_WRAPLOW(-step[1] - step[2], bd);
+  output[2] = HIGHBD_WRAPLOW(step[1] - step[2], bd);
+  output[3] = HIGHBD_WRAPLOW(step[3] - step[0], bd);
}

void highbd_idst8_c(const tran_low_t *input, tran_low_t *output, int bd) {

@@ -215,48 +216,48 @@ void highbd_idst8_c(const tran_low_t *input, tran_low_t *output, int bd) {
  step1[3] = input[1];
  temp1 = input[6] * cospi_28_64 - input[0] * cospi_4_64;
  temp2 = input[6] * cospi_4_64 + input[0] * cospi_28_64;
-  step1[4] = WRAPLOW(dct_const_round_shift(temp1), bd);
-  step1[7] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+  step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  temp1 = input[2] * cospi_12_64 - input[4] * cospi_20_64;
  temp2 = input[2] * cospi_20_64 + input[4] * cospi_12_64;
-  step1[5] = WRAPLOW(dct_const_round_shift(temp1), bd);
-  step1[6] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+  step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);

  // stage 2
  temp1 = (step1[0] + step1[2]) * cospi_16_64;
  temp2 = (step1[0] - step1[2]) * cospi_16_64;
-  step2[0] = WRAPLOW(dct_const_round_shift(temp1), bd);
-  step2[1] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  step2[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+  step2[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  temp1 = step1[1] * cospi_24_64 - step1[3] * cospi_8_64;
  temp2 = step1[1] * cospi_8_64 + step1[3] * cospi_24_64;
-  step2[2] = WRAPLOW(dct_const_round_shift(temp1), bd);
-  step2[3] = WRAPLOW(dct_const_round_shift(temp2), bd);
-  step2[4] = WRAPLOW(step1[4] + step1[5], bd);
-  step2[5] = WRAPLOW(step1[4] - step1[5], bd);
-  step2[6] = WRAPLOW(-step1[6] + step1[7], bd);
-  step2[7] = WRAPLOW(step1[6] + step1[7], bd);
+  step2[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+  step2[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
+  step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
+  step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
+  step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
+  step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);

  // stage 3
-  step1[0] = WRAPLOW(step2[0] + step2[3], bd);
-  step1[1] = WRAPLOW(step2[1] + step2[2], bd);
-  step1[2] = WRAPLOW(step2[1] - step2[2], bd);
-  step1[3] = WRAPLOW(step2[0] - step2[3], bd);
+  step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[3], bd);
+  step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[2], bd);
+  step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd);
+  step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd);
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
-  step1[5] = WRAPLOW(dct_const_round_shift(temp1), bd);
-  step1[6] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+  step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  step1[7] = step2[7];

  // stage 4
-  output[0] = WRAPLOW(step1[0] + step1[7], bd);
-  output[1] = WRAPLOW(-step1[1] - step1[6], bd);
-  output[2] = WRAPLOW(step1[2] + step1[5], bd);
-  output[3] = WRAPLOW(-step1[3] - step1[4], bd);
-  output[4] = WRAPLOW(step1[3] - step1[4], bd);
-  output[5] = WRAPLOW(-step1[2] + step1[5], bd);
-  output[6] = WRAPLOW(step1[1] - step1[6], bd);
-  output[7] = WRAPLOW(-step1[0] + step1[7], bd);
+  output[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
+  output[1] = HIGHBD_WRAPLOW(-step1[1] - step1[6], bd);
+  output[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
+  output[3] = HIGHBD_WRAPLOW(-step1[3] - step1[4], bd);
+  output[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
+  output[5] = HIGHBD_WRAPLOW(-step1[2] + step1[5], bd);
+  output[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
+  output[7] = HIGHBD_WRAPLOW(-step1[0] + step1[7], bd);
}

void highbd_idst16_c(const tran_low_t *input, tran_low_t *output, int bd) {

@@ -295,23 +296,23 @@ void highbd_idst16_c(const tran_low_t *input, tran_low_t *output, int bd) {

  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
-  step2[8] = WRAPLOW(dct_const_round_shift(temp1), bd);
-  step2[15] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  step2[8] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+  step2[15] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);

  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
-  step2[9] = WRAPLOW(dct_const_round_shift(temp1), bd);
-  step2[14] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+  step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);

  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
-  step2[10] = WRAPLOW(dct_const_round_shift(temp1), bd);
-  step2[13] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+  step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);

  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
-  step2[11] = WRAPLOW(dct_const_round_shift(temp1), bd);
-  step2[12] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+  step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);

  // stage 3
  step1[0] = step2[0];

@@ -321,109 +322,109 @@ void highbd_idst16_c(const tran_low_t *input, tran_low_t *output, int bd) {

  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
-  step1[4] = WRAPLOW(dct_const_round_shift(temp1), bd);
-  step1[7] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+  step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
-  step1[5] = WRAPLOW(dct_const_round_shift(temp1), bd);
-  step1[6] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+  step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);

-  step1[8] = WRAPLOW(step2[8] + step2[9], bd);
-  step1[9] = WRAPLOW(step2[8] - step2[9], bd);
-  step1[10] = WRAPLOW(-step2[10] + step2[11], bd);
-  step1[11] = WRAPLOW(step2[10] + step2[11], bd);
-  step1[12] = WRAPLOW(step2[12] + step2[13], bd);
-  step1[13] = WRAPLOW(step2[12] - step2[13], bd);
-  step1[14] = WRAPLOW(-step2[14] + step2[15], bd);
-  step1[15] = WRAPLOW(step2[14] + step2[15], bd);
+  step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd);
+  step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd);
+  step1[10] = HIGHBD_WRAPLOW(-step2[10] + step2[11], bd);
+  step1[11] = HIGHBD_WRAPLOW(step2[10] + step2[11], bd);
+  step1[12] = HIGHBD_WRAPLOW(step2[12] + step2[13], bd);
+  step1[13] = HIGHBD_WRAPLOW(step2[12] - step2[13], bd);
+  step1[14] = HIGHBD_WRAPLOW(-step2[14] + step2[15], bd);
+  step1[15] = HIGHBD_WRAPLOW(step2[14] + step2[15], bd);

  // stage 4
  temp1 = (step1[0] + step1[1]) * cospi_16_64;
  temp2 = (step1[0] - step1[1]) * cospi_16_64;
-  step2[0] = WRAPLOW(dct_const_round_shift(temp1), bd);
-  step2[1] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  step2[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+  step2[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
-  step2[2] = WRAPLOW(dct_const_round_shift(temp1), bd);
-  step2[3] = WRAPLOW(dct_const_round_shift(temp2), bd);
-  step2[4] = WRAPLOW(step1[4] + step1[5], bd);
-  step2[5] = WRAPLOW(step1[4] - step1[5], bd);
-  step2[6] = WRAPLOW(-step1[6] + step1[7], bd);
-  step2[7] = WRAPLOW(step1[6] + step1[7], bd);
+  step2[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+  step2[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
+  step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
+  step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
+  step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
+  step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);

  step2[8] = step1[8];
  step2[15] = step1[15];
  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
-  step2[9] = WRAPLOW(dct_const_round_shift(temp1), bd);
-  step2[14] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+  step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
-  step2[10] = WRAPLOW(dct_const_round_shift(temp1), bd);
-  step2[13] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+  step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  step2[11] = step1[11];
  step2[12] = step1[12];

  // stage 5
-  step1[0] = WRAPLOW(step2[0] + step2[3], bd);
-  step1[1] = WRAPLOW(step2[1] + step2[2], bd);
-  step1[2] = WRAPLOW(step2[1] - step2[2], bd);
-  step1[3] = WRAPLOW(step2[0] - step2[3], bd);
+  step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[3], bd);
+  step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[2], bd);
+  step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd);
+  step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd);
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
-  step1[5] = WRAPLOW(dct_const_round_shift(temp1), bd);
-  step1[6] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+  step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  step1[7] = step2[7];

-  step1[8] = WRAPLOW(step2[8] + step2[11], bd);
-  step1[9] = WRAPLOW(step2[9] + step2[10], bd);
-  step1[10] = WRAPLOW(step2[9] - step2[10], bd);
-  step1[11] = WRAPLOW(step2[8] - step2[11], bd);
-  step1[12] = WRAPLOW(-step2[12] + step2[15], bd);
-  step1[13] = WRAPLOW(-step2[13] + step2[14], bd);
-  step1[14] = WRAPLOW(step2[13] + step2[14], bd);
-  step1[15] = WRAPLOW(step2[12] + step2[15], bd);
+  step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd);
+  step1[9] = HIGHBD_WRAPLOW(step2[9] + step2[10], bd);
+  step1[10] = HIGHBD_WRAPLOW(step2[9] - step2[10], bd);
+  step1[11] = HIGHBD_WRAPLOW(step2[8] - step2[11], bd);
+  step1[12] = HIGHBD_WRAPLOW(-step2[12] + step2[15], bd);
+  step1[13] = HIGHBD_WRAPLOW(-step2[13] + step2[14], bd);
+  step1[14] = HIGHBD_WRAPLOW(step2[13] + step2[14], bd);
+  step1[15] = HIGHBD_WRAPLOW(step2[12] + step2[15], bd);

  // stage 6
-  step2[0] = WRAPLOW(step1[0] + step1[7], bd);
-  step2[1] = WRAPLOW(step1[1] + step1[6], bd);
-  step2[2] = WRAPLOW(step1[2] + step1[5], bd);
-  step2[3] = WRAPLOW(step1[3] + step1[4], bd);
-  step2[4] = WRAPLOW(step1[3] - step1[4], bd);
-  step2[5] = WRAPLOW(step1[2] - step1[5], bd);
-  step2[6] = WRAPLOW(step1[1] - step1[6], bd);
-  step2[7] = WRAPLOW(step1[0] - step1[7], bd);
+  step2[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
+  step2[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
+  step2[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
+  step2[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
+  step2[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
+  step2[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
+  step2[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
+  step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
  step2[8] = step1[8];
  step2[9] = step1[9];
  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
  temp2 = (step1[10] + step1[13]) * cospi_16_64;
-  step2[10] = WRAPLOW(dct_const_round_shift(temp1), bd);
-  step2[13] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+  step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
  temp2 = (step1[11] + step1[12]) * cospi_16_64;
-  step2[11] = WRAPLOW(dct_const_round_shift(temp1), bd);
-  step2[12] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+  step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  step2[14] = step1[14];
  step2[15] = step1[15];

  // stage 7
-  output[0] = WRAPLOW(step2[0] + step2[15], bd);
-  output[1] = WRAPLOW(-step2[1] - step2[14], bd);
-  output[2] = WRAPLOW(step2[2] + step2[13], bd);
-  output[3] = WRAPLOW(-step2[3] - step2[12], bd);
-  output[4] = WRAPLOW(step2[4] + step2[11], bd);
-  output[5] = WRAPLOW(-step2[5] - step2[10], bd);
-  output[6] = WRAPLOW(step2[6] + step2[9], bd);
-  output[7] = WRAPLOW(-step2[7] - step2[8], bd);
-  output[8] = WRAPLOW(step2[7] - step2[8], bd);
-  output[9] = WRAPLOW(-step2[6] + step2[9], bd);
-  output[10] = WRAPLOW(step2[5] - step2[10], bd);
-  output[11] = WRAPLOW(-step2[4] + step2[11], bd);
-  output[12] = WRAPLOW(step2[3] - step2[12], bd);
-  output[13] = WRAPLOW(-step2[2] + step2[13], bd);
-  output[14] = WRAPLOW(step2[1] - step2[14], bd);
-  output[15] = WRAPLOW(-step2[0] + step2[15], bd);
+  output[0] = HIGHBD_WRAPLOW(step2[0] + step2[15], bd);
+  output[1] = HIGHBD_WRAPLOW(-step2[1] - step2[14], bd);
+  output[2] = HIGHBD_WRAPLOW(step2[2] + step2[13], bd);
+  output[3] = HIGHBD_WRAPLOW(-step2[3] - step2[12], bd);
+  output[4] = HIGHBD_WRAPLOW(step2[4] + step2[11], bd);
+  output[5] = HIGHBD_WRAPLOW(-step2[5] - step2[10], bd);
+  output[6] = HIGHBD_WRAPLOW(step2[6] + step2[9], bd);
+  output[7] = HIGHBD_WRAPLOW(-step2[7] - step2[8], bd);
+  output[8] = HIGHBD_WRAPLOW(step2[7] - step2[8], bd);
+  output[9] = HIGHBD_WRAPLOW(-step2[6] + step2[9], bd);
+  output[10] = HIGHBD_WRAPLOW(step2[5] - step2[10], bd);
+  output[11] = HIGHBD_WRAPLOW(-step2[4] + step2[11], bd);
+  output[12] = HIGHBD_WRAPLOW(step2[3] - step2[12], bd);
+  output[13] = HIGHBD_WRAPLOW(-step2[2] + step2[13], bd);
+  output[14] = HIGHBD_WRAPLOW(step2[1] - step2[14], bd);
+  output[15] = HIGHBD_WRAPLOW(-step2[0] + step2[15], bd);
}

static void highbd_inv_idtx_add_c(const tran_low_t *input, uint8_t *dest8,
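Every substitution in the hunks above follows one pattern: the rounded transform intermediate is wrapped by HIGHBD_WRAPLOW, which takes the bit depth into account, instead of the fixed 16-bit WRAPLOW. A scalar sketch of the intent, assuming an intermediate width of bd + 8 bits; the real macros live in vpx_dsp/inv_txfm.h and may be defined differently:

#include <stdint.h>

/* 8-bit path: wrap the intermediate to 16 bits (two's complement). */
static int32_t wraplow_sketch(int64_t x) {
  return (int32_t)(int16_t)x;
}

/* High-bit-depth path (assumed width bd + 8): 10- and 12-bit intermediates
 * keep the extra headroom they need before wrapping. */
static int32_t highbd_wraplow_sketch(int64_t x, int bd) {
  const int bits = bd + 8;
  const int64_t m = (int64_t)1 << bits;
  const int64_t r = ((x % m) + m) % m;       /* reduce modulo 2^bits */
  return (int32_t)(r >= m / 2 ? r - m : r);  /* re-center into signed range */
}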
@@ -24,29 +24,6 @@ EOF
}
forward_decls qw/vp10_common_forward_decls/;

-# x86inc.asm had specific constraints. break it out so it's easy to disable.
-# zero all the variables to avoid tricky else conditions.
-$mmx_x86inc = $sse_x86inc = $sse2_x86inc = $ssse3_x86inc = $avx_x86inc =
-    $avx2_x86inc = '';
-$mmx_x86_64_x86inc = $sse_x86_64_x86inc = $sse2_x86_64_x86inc =
-    $ssse3_x86_64_x86inc = $avx_x86_64_x86inc = $avx2_x86_64_x86inc = '';
-if (vpx_config("CONFIG_USE_X86INC") eq "yes") {
-  $mmx_x86inc = 'mmx';
-  $sse_x86inc = 'sse';
-  $sse2_x86inc = 'sse2';
-  $ssse3_x86inc = 'ssse3';
-  $avx_x86inc = 'avx';
-  $avx2_x86inc = 'avx2';
-  if ($opts{arch} eq "x86_64") {
-    $mmx_x86_64_x86inc = 'mmx';
-    $sse_x86_64_x86inc = 'sse';
-    $sse2_x86_64_x86inc = 'sse2';
-    $ssse3_x86_64_x86inc = 'ssse3';
-    $avx_x86_64_x86inc = 'avx';
-    $avx2_x86_64_x86inc = 'avx2';
-  }
-}
-
# functions that are 64 bit only.
$mmx_x86_64 = $sse2_x86_64 = $ssse3_x86_64 = $avx_x86_64 = $avx2_x86_64 = '';
if ($opts{arch} eq "x86_64") {

@@ -409,16 +386,16 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
  specialize qw/vp10_fdct8x8_quant/;
} else {
  add_proto qw/int64_t vp10_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
-  specialize qw/vp10_block_error avx2 msa/, "$sse2_x86inc";
+  specialize qw/vp10_block_error avx2 msa/;

  add_proto qw/int64_t vp10_block_error_fp/, "const int16_t *coeff, const int16_t *dqcoeff, int block_size";
-  specialize qw/vp10_block_error_fp neon/, "$sse2_x86inc";
+  specialize qw/vp10_block_error_fp neon sse2/;

  add_proto qw/void vp10_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/vp10_quantize_fp neon sse2/, "$ssse3_x86_64_x86inc";
+  specialize qw/vp10_quantize_fp neon sse2/;

  add_proto qw/void vp10_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/vp10_quantize_fp_32x32/, "$ssse3_x86_64_x86inc";
+  specialize qw/vp10_quantize_fp_32x32/;

  add_proto qw/void vp10_fdct8x8_quant/, "const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
  specialize qw/vp10_fdct8x8_quant sse2 ssse3 neon/;

@@ -440,7 +417,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
  specialize qw/vp10_fht32x32/;

  add_proto qw/void vp10_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vp10_fwht4x4/, "$sse2_x86inc";
+  specialize qw/vp10_fwht4x4/;
} else {
  add_proto qw/void vp10_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
  specialize qw/vp10_fht4x4 sse2/;

@@ -461,7 +438,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
  specialize qw/vp10_fht32x32/;

  add_proto qw/void vp10_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vp10_fwht4x4 msa/, "$sse2_x86inc";
+  specialize qw/vp10_fwht4x4/;
}

add_proto qw/void vp10_fwd_idtx/, "const int16_t *src_diff, tran_low_t *coeff, int stride, int bs, int tx_type";
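The specialize lines above feed libvpx's run-time CPU dispatch: each listed ISA names a candidate implementation, and the generated RTCD glue points a function pointer at the best one the host supports, so dropping a "$..._x86inc" entry simply removes that candidate. A sketch of the kind of code the generator emits for one prototype; the names, flag value, and setup hook here are illustrative, not the generated file verbatim:

#include <stdint.h>

typedef int32_t tran_low_t;  /* low-bit-depth coefficient type (assumed) */
#define HAS_AVX2 0x080       /* illustrative CPU-feature bit */

static int64_t vp10_block_error_c(const tran_low_t *coeff,
                                  const tran_low_t *dqcoeff,
                                  intptr_t block_size, int64_t *ssz) {
  int64_t err = 0, sse = 0;
  intptr_t i;
  for (i = 0; i < block_size; ++i) {
    const int64_t d = coeff[i] - dqcoeff[i];
    err += d * d;                        /* squared quantization error */
    sse += (int64_t)coeff[i] * coeff[i]; /* energy of the source coeffs */
  }
  *ssz = sse;
  return err;
}

/* Stand-in for the real AVX2 kernel; only the dispatch shape matters here. */
static int64_t vp10_block_error_avx2(const tran_low_t *coeff,
                                     const tran_low_t *dqcoeff,
                                     intptr_t block_size, int64_t *ssz) {
  return vp10_block_error_c(coeff, dqcoeff, block_size, ssz);
}

/* The pointer defaults to C; setup upgrades it once per process. */
static int64_t (*vp10_block_error)(const tran_low_t *, const tran_low_t *,
                                   intptr_t, int64_t *) = vp10_block_error_c;

static void setup_rtcd_sketch(int cpu_flags) {
  if (cpu_flags & HAS_AVX2) vp10_block_error = vp10_block_error_avx2;
}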
@@ -103,10 +103,8 @@ ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
VP10_CX_SRCS-$(HAVE_SSE2) += encoder/x86/highbd_block_error_intrin_sse2.c
endif

-ifeq ($(CONFIG_USE_X86INC),yes)
VP10_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.asm
VP10_CX_SRCS-$(HAVE_SSE2) += encoder/x86/error_sse2.asm
-endif

ifeq ($(ARCH_X86_64),yes)
ifeq ($(CONFIG_USE_X86INC),yes)
@@ -1,801 +0,0 @@
/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <stdlib.h>
#include "./vp8_rtcd.h"
#include "./vpx_dsp_rtcd.h"
#include "vp8/common/mips/msa/vp8_macros_msa.h"

static const int16_t vp8_rv_msa[] =
{
    8, 5, 2, 2, 8, 12, 4, 9, 8, 3,
    0, 3, 9, 0, 0, 0, 8, 3, 14, 4,
    10, 1, 11, 14, 1, 14, 9, 6, 12, 11,
    8, 6, 10, 0, 0, 8, 9, 0, 3, 14,
    8, 11, 13, 4, 2, 9, 0, 3, 9, 6,
    1, 2, 3, 14, 13, 1, 8, 2, 9, 7,
    3, 3, 1, 13, 13, 6, 6, 5, 2, 7,
    11, 9, 11, 8, 7, 3, 2, 0, 13, 13,
    14, 4, 12, 5, 12, 10, 8, 10, 13, 10,
    4, 14, 4, 10, 0, 8, 11, 1, 13, 7,
    7, 14, 6, 14, 13, 2, 13, 5, 4, 4,
    0, 10, 0, 5, 13, 2, 12, 7, 11, 13,
    8, 0, 4, 10, 7, 2, 7, 2, 2, 5,
    3, 4, 7, 3, 3, 14, 14, 5, 9, 13,
    3, 14, 3, 6, 3, 0, 11, 8, 13, 1,
    13, 1, 12, 0, 10, 9, 7, 6, 2, 8,
    5, 2, 13, 7, 1, 13, 14, 7, 6, 7,
    9, 6, 10, 11, 7, 8, 7, 5, 14, 8,
    4, 4, 0, 8, 7, 10, 0, 8, 14, 11,
    3, 12, 5, 7, 14, 3, 14, 5, 2, 6,
    11, 12, 12, 8, 0, 11, 13, 1, 2, 0,
    5, 10, 14, 7, 8, 0, 4, 11, 0, 8,
    0, 3, 10, 5, 8, 0, 11, 6, 7, 8,
    10, 7, 13, 9, 2, 5, 1, 5, 10, 2,
    4, 3, 5, 6, 10, 8, 9, 4, 11, 14,
    0, 10, 0, 5, 13, 2, 12, 7, 11, 13,
    8, 0, 4, 10, 7, 2, 7, 2, 2, 5,
    3, 4, 7, 3, 3, 14, 14, 5, 9, 13,
    3, 14, 3, 6, 3, 0, 11, 8, 13, 1,
    13, 1, 12, 0, 10, 9, 7, 6, 2, 8,
    5, 2, 13, 7, 1, 13, 14, 7, 6, 7,
    9, 6, 10, 11, 7, 8, 7, 5, 14, 8,
    4, 4, 0, 8, 7, 10, 0, 8, 14, 11,
    3, 12, 5, 7, 14, 3, 14, 5, 2, 6,
    11, 12, 12, 8, 0, 11, 13, 1, 2, 0,
    5, 10, 14, 7, 8, 0, 4, 11, 0, 8,
    0, 3, 10, 5, 8, 0, 11, 6, 7, 8,
    10, 7, 13, 9, 2, 5, 1, 5, 10, 2,
    4, 3, 5, 6, 10, 8, 9, 4, 11, 14,
    3, 8, 3, 7, 8, 5, 11, 4, 12, 3,
    11, 9, 14, 8, 14, 13, 4, 3, 1, 2,
    14, 6, 5, 4, 4, 11, 4, 6, 2, 1,
    5, 8, 8, 12, 13, 5, 14, 10, 12, 13,
    0, 9, 5, 5, 11, 10, 13, 9, 10, 13,
};

#define VP8_TRANSPOSE8x16_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7,  \
                                out0, out1, out2, out3,  \
                                out4, out5, out6, out7,  \
                                out8, out9, out10, out11,  \
                                out12, out13, out14, out15)  \
{  \
    v8i16 temp0, temp1, temp2, temp3, temp4;  \
    v8i16 temp5, temp6, temp7, temp8, temp9;  \
    \
    ILVR_B4_SH(in1, in0, in3, in2, in5, in4, in7, in6,  \
               temp0, temp1, temp2, temp3);  \
    ILVR_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5);  \
    ILVRL_W2_SH(temp5, temp4, temp6, temp7);  \
    ILVL_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5);  \
    ILVRL_W2_SH(temp5, temp4, temp8, temp9);  \
    ILVL_B4_SH(in1, in0, in3, in2, in5, in4, in7, in6,  \
               temp0, temp1, temp2, temp3);  \
    ILVR_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5);  \
    ILVRL_W2_UB(temp5, temp4, out8, out10);  \
    ILVL_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5);  \
    ILVRL_W2_UB(temp5, temp4, out12, out14);  \
    out0 = (v16u8)temp6;  \
    out2 = (v16u8)temp7;  \
    out4 = (v16u8)temp8;  \
    out6 = (v16u8)temp9;  \
    out9 = (v16u8)__msa_ilvl_d((v2i64)out8, (v2i64)out8);  \
    out11 = (v16u8)__msa_ilvl_d((v2i64)out10, (v2i64)out10);  \
    out13 = (v16u8)__msa_ilvl_d((v2i64)out12, (v2i64)out12);  \
    out15 = (v16u8)__msa_ilvl_d((v2i64)out14, (v2i64)out14);  \
    out1 = (v16u8)__msa_ilvl_d((v2i64)out0, (v2i64)out0);  \
    out3 = (v16u8)__msa_ilvl_d((v2i64)out2, (v2i64)out2);  \
    out5 = (v16u8)__msa_ilvl_d((v2i64)out4, (v2i64)out4);  \
    out7 = (v16u8)__msa_ilvl_d((v2i64)out6, (v2i64)out6);  \
}

#define VP8_AVER_IF_RETAIN(above2_in, above1_in, src_in,  \
                           below1_in, below2_in, ref, out)  \
{  \
    v16u8 temp0, temp1;  \
    \
    temp1 = __msa_aver_u_b(above2_in, above1_in);  \
    temp0 = __msa_aver_u_b(below2_in, below1_in);  \
    temp1 = __msa_aver_u_b(temp1, temp0);  \
    out = __msa_aver_u_b(src_in, temp1);  \
    temp0 = __msa_asub_u_b(src_in, above2_in);  \
    temp1 = __msa_asub_u_b(src_in, above1_in);  \
    temp0 = (temp0 < ref);  \
    temp1 = (temp1 < ref);  \
    temp0 = temp0 & temp1;  \
    temp1 = __msa_asub_u_b(src_in, below1_in);  \
    temp1 = (temp1 < ref);  \
    temp0 = temp0 & temp1;  \
    temp1 = __msa_asub_u_b(src_in, below2_in);  \
    temp1 = (temp1 < ref);  \
    temp0 = temp0 & temp1;  \
    out = __msa_bmz_v(out, src_in, temp0);  \
}
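For one byte lane, the macro above reduces to the following scalar rule: average the pixel with its four vertical neighbours, rounding at each step, and keep the filtered value only when every neighbour lies within the per-column threshold ref; otherwise the source pixel passes through unchanged. A scalar sketch of the same logic:

#include <stdlib.h>

static unsigned char aver_if_retain(unsigned char above2, unsigned char above1,
                                    unsigned char src, unsigned char below1,
                                    unsigned char below2, unsigned char ref) {
  const int avg_above = (above2 + above1 + 1) >> 1;  /* __msa_aver_u_b */
  const int avg_below = (below2 + below1 + 1) >> 1;
  const int avg_nbrs = (avg_above + avg_below + 1) >> 1;
  const int filtered = (src + avg_nbrs + 1) >> 1;
  const int retain = abs(src - above2) < ref && abs(src - above1) < ref &&
                     abs(src - below1) < ref && abs(src - below2) < ref;
  return (unsigned char)(retain ? filtered : src);
}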
#define TRANSPOSE12x16_B(in0, in1, in2, in3, in4, in5, in6, in7,  \
                         in8, in9, in10, in11, in12, in13, in14, in15)  \
{  \
    v8i16 temp0, temp1, temp2, temp3, temp4;  \
    v8i16 temp5, temp6, temp7, temp8, temp9;  \
    \
    ILVR_B2_SH(in1, in0, in3, in2, temp0, temp1);  \
    ILVRL_H2_SH(temp1, temp0, temp2, temp3);  \
    ILVR_B2_SH(in5, in4, in7, in6, temp0, temp1);  \
    ILVRL_H2_SH(temp1, temp0, temp4, temp5);  \
    ILVRL_W2_SH(temp4, temp2, temp0, temp1);  \
    ILVRL_W2_SH(temp5, temp3, temp2, temp3);  \
    ILVR_B2_SH(in9, in8, in11, in10, temp4, temp5);  \
    ILVR_B2_SH(in9, in8, in11, in10, temp4, temp5);  \
    ILVRL_H2_SH(temp5, temp4, temp6, temp7);  \
    ILVR_B2_SH(in13, in12, in15, in14, temp4, temp5);  \
    ILVRL_H2_SH(temp5, temp4, temp8, temp9);  \
    ILVRL_W2_SH(temp8, temp6, temp4, temp5);  \
    ILVRL_W2_SH(temp9, temp7, temp6, temp7);  \
    ILVL_B2_SH(in1, in0, in3, in2, temp8, temp9);  \
    ILVR_D2_UB(temp4, temp0, temp5, temp1, in0, in2);  \
    in1 = (v16u8)__msa_ilvl_d((v2i64)temp4, (v2i64)temp0);  \
    in3 = (v16u8)__msa_ilvl_d((v2i64)temp5, (v2i64)temp1);  \
    ILVL_B2_SH(in5, in4, in7, in6, temp0, temp1);  \
    ILVR_D2_UB(temp6, temp2, temp7, temp3, in4, in6);  \
    in5 = (v16u8)__msa_ilvl_d((v2i64)temp6, (v2i64)temp2);  \
    in7 = (v16u8)__msa_ilvl_d((v2i64)temp7, (v2i64)temp3);  \
    ILVL_B4_SH(in9, in8, in11, in10, in13, in12, in15, in14,  \
               temp2, temp3, temp4, temp5);  \
    ILVR_H4_SH(temp9, temp8, temp1, temp0, temp3, temp2, temp5, temp4,  \
               temp6, temp7, temp8, temp9);  \
    ILVR_W2_SH(temp7, temp6, temp9, temp8, temp0, temp1);  \
    in8 = (v16u8)__msa_ilvr_d((v2i64)temp1, (v2i64)temp0);  \
    in9 = (v16u8)__msa_ilvl_d((v2i64)temp1, (v2i64)temp0);  \
    ILVL_W2_SH(temp7, temp6, temp9, temp8, temp2, temp3);  \
    in10 = (v16u8)__msa_ilvr_d((v2i64)temp3, (v2i64)temp2);  \
    in11 = (v16u8)__msa_ilvl_d((v2i64)temp3, (v2i64)temp2);  \
}

#define VP8_TRANSPOSE12x8_UB_UB(in0, in1, in2, in3, in4, in5,  \
                                in6, in7, in8, in9, in10, in11)  \
{  \
    v8i16 temp0, temp1, temp2, temp3;  \
    v8i16 temp4, temp5, temp6, temp7;  \
    \
    ILVR_B2_SH(in1, in0, in3, in2, temp0, temp1);  \
    ILVRL_H2_SH(temp1, temp0, temp2, temp3);  \
    ILVR_B2_SH(in5, in4, in7, in6, temp0, temp1);  \
    ILVRL_H2_SH(temp1, temp0, temp4, temp5);  \
    ILVRL_W2_SH(temp4, temp2, temp0, temp1);  \
    ILVRL_W2_SH(temp5, temp3, temp2, temp3);  \
    ILVL_B2_SH(in1, in0, in3, in2, temp4, temp5);  \
    temp4 = __msa_ilvr_h(temp5, temp4);  \
    ILVL_B2_SH(in5, in4, in7, in6, temp6, temp7);  \
    temp5 = __msa_ilvr_h(temp7, temp6);  \
    ILVRL_W2_SH(temp5, temp4, temp6, temp7);  \
    in0 = (v16u8)temp0;  \
    in2 = (v16u8)temp1;  \
    in4 = (v16u8)temp2;  \
    in6 = (v16u8)temp3;  \
    in8 = (v16u8)temp6;  \
    in10 = (v16u8)temp7;  \
    in1 = (v16u8)__msa_ilvl_d((v2i64)temp0, (v2i64)temp0);  \
    in3 = (v16u8)__msa_ilvl_d((v2i64)temp1, (v2i64)temp1);  \
    in5 = (v16u8)__msa_ilvl_d((v2i64)temp2, (v2i64)temp2);  \
    in7 = (v16u8)__msa_ilvl_d((v2i64)temp3, (v2i64)temp3);  \
    in9 = (v16u8)__msa_ilvl_d((v2i64)temp6, (v2i64)temp6);  \
    in11 = (v16u8)__msa_ilvl_d((v2i64)temp7, (v2i64)temp7);  \
}
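The transpose macros above only move data: they turn rows held in vector registers into columns so the across-edge pass can reuse the vertical filter. Stripped of the MSA interleave instructions, the equivalent scalar data movement is just a byte-matrix transpose, sketched here:

static void transpose_bytes_sketch(const unsigned char *in, int in_stride,
                                   unsigned char *out, int out_stride,
                                   int rows, int cols) {
  int r, c;
  for (r = 0; r < rows; ++r)
    for (c = 0; c < cols; ++c)
      out[c * out_stride + r] = in[r * in_stride + c];  /* row r -> column r */
}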
static void postproc_down_across_chroma_msa(uint8_t *src_ptr, uint8_t *dst_ptr,
                                            int32_t src_stride,
                                            int32_t dst_stride,
                                            int32_t cols, uint8_t *f)
{
    uint8_t *p_src = src_ptr;
    uint8_t *p_dst = dst_ptr;
    uint8_t *f_orig = f;
    uint8_t *p_dst_st = dst_ptr;
    uint16_t col;
    uint64_t out0, out1, out2, out3;
    v16u8 above2, above1, below2, below1, src, ref, ref_temp;
    v16u8 inter0, inter1, inter2, inter3, inter4, inter5;
    v16u8 inter6, inter7, inter8, inter9, inter10, inter11;

    for (col = (cols / 16); col--;)
    {
        ref = LD_UB(f);
        LD_UB2(p_src - 2 * src_stride, src_stride, above2, above1);
        src = LD_UB(p_src);
        LD_UB2(p_src + 1 * src_stride, src_stride, below1, below2);
        VP8_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter0);
        above2 = LD_UB(p_src + 3 * src_stride);
        VP8_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter1);
        above1 = LD_UB(p_src + 4 * src_stride);
        VP8_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter2);
        src = LD_UB(p_src + 5 * src_stride);
        VP8_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter3);
        below1 = LD_UB(p_src + 6 * src_stride);
        VP8_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter4);
        below2 = LD_UB(p_src + 7 * src_stride);
        VP8_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter5);
        above2 = LD_UB(p_src + 8 * src_stride);
        VP8_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter6);
        above1 = LD_UB(p_src + 9 * src_stride);
        VP8_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter7);
        ST_UB8(inter0, inter1, inter2, inter3, inter4, inter5, inter6, inter7,
               p_dst, dst_stride);

        p_dst += 16;
        p_src += 16;
        f += 16;
    }

    if (0 != (cols / 16))
    {
        ref = LD_UB(f);
        LD_UB2(p_src - 2 * src_stride, src_stride, above2, above1);
        src = LD_UB(p_src);
        LD_UB2(p_src + 1 * src_stride, src_stride, below1, below2);
        VP8_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter0);
        above2 = LD_UB(p_src + 3 * src_stride);
        VP8_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter1);
        above1 = LD_UB(p_src + 4 * src_stride);
        VP8_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter2);
        src = LD_UB(p_src + 5 * src_stride);
        VP8_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter3);
        below1 = LD_UB(p_src + 6 * src_stride);
        VP8_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter4);
        below2 = LD_UB(p_src + 7 * src_stride);
        VP8_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter5);
        above2 = LD_UB(p_src + 8 * src_stride);
        VP8_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter6);
        above1 = LD_UB(p_src + 9 * src_stride);
        VP8_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter7);
        out0 = __msa_copy_u_d((v2i64)inter0, 0);
        out1 = __msa_copy_u_d((v2i64)inter1, 0);
        out2 = __msa_copy_u_d((v2i64)inter2, 0);
        out3 = __msa_copy_u_d((v2i64)inter3, 0);
        SD4(out0, out1, out2, out3, p_dst, dst_stride);

        out0 = __msa_copy_u_d((v2i64)inter4, 0);
        out1 = __msa_copy_u_d((v2i64)inter5, 0);
        out2 = __msa_copy_u_d((v2i64)inter6, 0);
        out3 = __msa_copy_u_d((v2i64)inter7, 0);
        SD4(out0, out1, out2, out3, p_dst + 4 * dst_stride, dst_stride);
    }

    f = f_orig;
    p_dst = dst_ptr - 2;
    LD_UB8(p_dst, dst_stride,
           inter0, inter1, inter2, inter3, inter4, inter5, inter6, inter7);

    for (col = 0; col < (cols / 8); ++col)
    {
        ref = LD_UB(f);
        f += 8;
        VP8_TRANSPOSE12x8_UB_UB(inter0, inter1, inter2, inter3,
                                inter4, inter5, inter6, inter7,
                                inter8, inter9, inter10, inter11);
        if (0 == col)
        {
            above2 = inter2;
            above1 = inter2;
        }
        else
        {
            above2 = inter0;
            above1 = inter1;
        }
        src = inter2;
        below1 = inter3;
        below2 = inter4;
        ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 0);
        VP8_AVER_IF_RETAIN(above2, above1, src, below1, below2,
                           ref_temp, inter2);
        above2 = inter5;
        ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 1);
        VP8_AVER_IF_RETAIN(above1, src, below1, below2, above2,
                           ref_temp, inter3);
        above1 = inter6;
        ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 2);
        VP8_AVER_IF_RETAIN(src, below1, below2, above2, above1,
                           ref_temp, inter4);
        src = inter7;
        ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 3);
        VP8_AVER_IF_RETAIN(below1, below2, above2, above1, src,
                           ref_temp, inter5);
        below1 = inter8;
        ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 4);
        VP8_AVER_IF_RETAIN(below2, above2, above1, src, below1,
                           ref_temp, inter6);
        below2 = inter9;
        ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 5);
        VP8_AVER_IF_RETAIN(above2, above1, src, below1, below2,
                           ref_temp, inter7);
        if (col == (cols / 8 - 1))
        {
            above2 = inter9;
        }
        else
        {
            above2 = inter10;
        }
        ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 6);
        VP8_AVER_IF_RETAIN(above1, src, below1, below2, above2,
                           ref_temp, inter8);
        if (col == (cols / 8 - 1))
        {
            above1 = inter9;
        }
        else
        {
            above1 = inter11;
        }
        ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 7);
        VP8_AVER_IF_RETAIN(src, below1, below2, above2, above1,
                           ref_temp, inter9);
        TRANSPOSE8x8_UB_UB(inter2, inter3, inter4, inter5, inter6, inter7,
                           inter8, inter9, inter2, inter3, inter4, inter5,
                           inter6, inter7, inter8, inter9);
        p_dst += 8;
        LD_UB2(p_dst, dst_stride, inter0, inter1);
        ST8x1_UB(inter2, p_dst_st);
        ST8x1_UB(inter3, (p_dst_st + 1 * dst_stride));
        LD_UB2(p_dst + 2 * dst_stride, dst_stride, inter2, inter3);
        ST8x1_UB(inter4, (p_dst_st + 2 * dst_stride));
        ST8x1_UB(inter5, (p_dst_st + 3 * dst_stride));
        LD_UB2(p_dst + 4 * dst_stride, dst_stride, inter4, inter5);
        ST8x1_UB(inter6, (p_dst_st + 4 * dst_stride));
        ST8x1_UB(inter7, (p_dst_st + 5 * dst_stride));
        LD_UB2(p_dst + 6 * dst_stride, dst_stride, inter6, inter7);
        ST8x1_UB(inter8, (p_dst_st + 6 * dst_stride));
        ST8x1_UB(inter9, (p_dst_st + 7 * dst_stride));
        p_dst_st += 8;
    }
}
static void postproc_down_across_luma_msa(uint8_t *src_ptr, uint8_t *dst_ptr,
                                          int32_t src_stride,
                                          int32_t dst_stride,
                                          int32_t cols, uint8_t *f)
{
    uint8_t *p_src = src_ptr;
    uint8_t *p_dst = dst_ptr;
    uint8_t *p_dst_st = dst_ptr;
    uint8_t *f_orig = f;
    uint16_t col;
    v16u8 above2, above1, below2, below1;
    v16u8 src, ref, ref_temp;
    v16u8 inter0, inter1, inter2, inter3, inter4, inter5, inter6;
    v16u8 inter7, inter8, inter9, inter10, inter11;
    v16u8 inter12, inter13, inter14, inter15;

    for (col = (cols / 16); col--;)
    {
        ref = LD_UB(f);
        LD_UB2(p_src - 2 * src_stride, src_stride, above2, above1);
        src = LD_UB(p_src);
        LD_UB2(p_src + 1 * src_stride, src_stride, below1, below2);
        VP8_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter0);
        above2 = LD_UB(p_src + 3 * src_stride);
        VP8_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter1);
        above1 = LD_UB(p_src + 4 * src_stride);
        VP8_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter2);
        src = LD_UB(p_src + 5 * src_stride);
        VP8_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter3);
        below1 = LD_UB(p_src + 6 * src_stride);
        VP8_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter4);
        below2 = LD_UB(p_src + 7 * src_stride);
        VP8_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter5);
        above2 = LD_UB(p_src + 8 * src_stride);
        VP8_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter6);
        above1 = LD_UB(p_src + 9 * src_stride);
        VP8_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter7);
        src = LD_UB(p_src + 10 * src_stride);
        VP8_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter8);
        below1 = LD_UB(p_src + 11 * src_stride);
        VP8_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter9);
        below2 = LD_UB(p_src + 12 * src_stride);
        VP8_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter10);
        above2 = LD_UB(p_src + 13 * src_stride);
        VP8_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter11);
        above1 = LD_UB(p_src + 14 * src_stride);
        VP8_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter12);
        src = LD_UB(p_src + 15 * src_stride);
        VP8_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter13);
        below1 = LD_UB(p_src + 16 * src_stride);
        VP8_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter14);
        below2 = LD_UB(p_src + 17 * src_stride);
        VP8_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter15);
        ST_UB8(inter0, inter1, inter2, inter3, inter4, inter5, inter6, inter7,
               p_dst, dst_stride);
        ST_UB8(inter8, inter9, inter10, inter11, inter12, inter13,
               inter14, inter15, p_dst + 8 * dst_stride, dst_stride);
        p_src += 16;
        p_dst += 16;
        f += 16;
    }

    f = f_orig;
    p_dst = dst_ptr - 2;
    LD_UB8(p_dst, dst_stride,
           inter0, inter1, inter2, inter3, inter4, inter5, inter6, inter7);
    LD_UB8(p_dst + 8 * dst_stride, dst_stride,
           inter8, inter9, inter10, inter11, inter12, inter13,
           inter14, inter15);

    for (col = 0; col < cols / 8; ++col)
    {
        ref = LD_UB(f);
        f += 8;
        TRANSPOSE12x16_B(inter0, inter1, inter2, inter3, inter4, inter5,
                         inter6, inter7, inter8, inter9, inter10, inter11,
                         inter12, inter13, inter14, inter15);
        if (0 == col)
        {
            above2 = inter2;
            above1 = inter2;
        }
        else
        {
            above2 = inter0;
            above1 = inter1;
        }

        src = inter2;
        below1 = inter3;
        below2 = inter4;
        ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 0);
        VP8_AVER_IF_RETAIN(above2, above1, src, below1, below2,
                           ref_temp, inter2);
        above2 = inter5;
        ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 1);
        VP8_AVER_IF_RETAIN(above1, src, below1, below2, above2,
                           ref_temp, inter3);
        above1 = inter6;
        ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 2);
        VP8_AVER_IF_RETAIN(src, below1, below2, above2, above1,
                           ref_temp, inter4);
        src = inter7;
        ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 3);
        VP8_AVER_IF_RETAIN(below1, below2, above2, above1, src,
                           ref_temp, inter5);
        below1 = inter8;
        ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 4);
        VP8_AVER_IF_RETAIN(below2, above2, above1, src, below1,
                           ref_temp, inter6);
        below2 = inter9;
        ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 5);
        VP8_AVER_IF_RETAIN(above2, above1, src, below1, below2,
                           ref_temp, inter7);
        if (col == (cols / 8 - 1))
        {
            above2 = inter9;
        }
        else
        {
            above2 = inter10;
        }
        ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 6);
        VP8_AVER_IF_RETAIN(above1, src, below1, below2, above2,
                           ref_temp, inter8);
        if (col == (cols / 8 - 1))
        {
            above1 = inter9;
        }
        else
        {
            above1 = inter11;
        }
        ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 7);
        VP8_AVER_IF_RETAIN(src, below1, below2, above2, above1,
                           ref_temp, inter9);
        VP8_TRANSPOSE8x16_UB_UB(inter2, inter3, inter4, inter5,
                                inter6, inter7, inter8, inter9,
                                inter2, inter3, inter4, inter5,
                                inter6, inter7, inter8, inter9,
                                inter10, inter11, inter12, inter13,
                                inter14, inter15, above2, above1);

        p_dst += 8;
        LD_UB2(p_dst, dst_stride, inter0, inter1);
        ST8x1_UB(inter2, p_dst_st);
        ST8x1_UB(inter3, (p_dst_st + 1 * dst_stride));
        LD_UB2(p_dst + 2 * dst_stride, dst_stride, inter2, inter3);
        ST8x1_UB(inter4, (p_dst_st + 2 * dst_stride));
        ST8x1_UB(inter5, (p_dst_st + 3 * dst_stride));
        LD_UB2(p_dst + 4 * dst_stride, dst_stride, inter4, inter5);
        ST8x1_UB(inter6, (p_dst_st + 4 * dst_stride));
        ST8x1_UB(inter7, (p_dst_st + 5 * dst_stride));
        LD_UB2(p_dst + 6 * dst_stride, dst_stride, inter6, inter7);
        ST8x1_UB(inter8, (p_dst_st + 6 * dst_stride));
        ST8x1_UB(inter9, (p_dst_st + 7 * dst_stride));
        LD_UB2(p_dst + 8 * dst_stride, dst_stride, inter8, inter9);
        ST8x1_UB(inter10, (p_dst_st + 8 * dst_stride));
        ST8x1_UB(inter11, (p_dst_st + 9 * dst_stride));
        LD_UB2(p_dst + 10 * dst_stride, dst_stride, inter10, inter11);
        ST8x1_UB(inter12, (p_dst_st + 10 * dst_stride));
        ST8x1_UB(inter13, (p_dst_st + 11 * dst_stride));
        LD_UB2(p_dst + 12 * dst_stride, dst_stride, inter12, inter13);
        ST8x1_UB(inter14, (p_dst_st + 12 * dst_stride));
        ST8x1_UB(inter15, (p_dst_st + 13 * dst_stride));
        LD_UB2(p_dst + 14 * dst_stride, dst_stride, inter14, inter15);
        ST8x1_UB(above2, (p_dst_st + 14 * dst_stride));
        ST8x1_UB(above1, (p_dst_st + 15 * dst_stride));
        p_dst_st += 8;
    }
}

void vp8_post_proc_down_and_across_mb_row_msa(uint8_t *src, uint8_t *dst,
                                              int32_t src_stride,
                                              int32_t dst_stride,
                                              int32_t cols, uint8_t *f,
                                              int32_t size)
{
    if (8 == size)
    {
        postproc_down_across_chroma_msa(src, dst, src_stride, dst_stride,
                                        cols, f);
    }
    else if (16 == size)
    {
        postproc_down_across_luma_msa(src, dst, src_stride, dst_stride,
                                      cols, f);
    }
}
void vp8_mbpost_proc_across_ip_msa(uint8_t *src_ptr, int32_t pitch,
                                   int32_t rows, int32_t cols, int32_t flimit)
{
    int32_t row, col, cnt;
    uint8_t *src_dup = src_ptr;
    v16u8 src0, src, tmp_orig;
    v16u8 tmp = { 0 };
    v16i8 zero = { 0 };
    v8u16 sum_h, src_r_h, src_l_h;
    v4u32 src_r_w, src_l_w;
    v4i32 flimit_vec;

    flimit_vec = __msa_fill_w(flimit);
    for (row = rows; row--;)
    {
        int32_t sum_sq = 0;
        int32_t sum = 0;
        src0 = (v16u8)__msa_fill_b(src_dup[0]);
        ST8x1_UB(src0, (src_dup - 8));

        src0 = (v16u8)__msa_fill_b(src_dup[cols - 1]);
        ST_UB(src0, src_dup + cols);
        src_dup[cols + 16] = src_dup[cols - 1];
        tmp_orig = (v16u8)__msa_ldi_b(0);
        tmp_orig[15] = tmp[15];
        src = LD_UB(src_dup - 8);
        src[15] = 0;
        ILVRL_B2_UH(zero, src, src_r_h, src_l_h);
        src_r_w = __msa_dotp_u_w(src_r_h, src_r_h);
        src_l_w = __msa_dotp_u_w(src_l_h, src_l_h);
        sum_sq = HADD_SW_S32(src_r_w);
        sum_sq += HADD_SW_S32(src_l_w);
        sum_h = __msa_hadd_u_h(src, src);
        sum = HADD_UH_U32(sum_h);
        {
            v16u8 src7, src8, src_r, src_l;
            v16i8 mask;
            v8u16 add_r, add_l;
            v8i16 sub_r, sub_l, sum_r, sum_l, mask0, mask1;
            v4i32 sum_sq0, sum_sq1, sum_sq2, sum_sq3;
            v4i32 sub0, sub1, sub2, sub3;
            v4i32 sum0_w, sum1_w, sum2_w, sum3_w;
            v4i32 mul0, mul1, mul2, mul3;
            v4i32 total0, total1, total2, total3;
            v8i16 const8 = __msa_fill_h(8);

            src7 = LD_UB(src_dup + 7);
            src8 = LD_UB(src_dup - 8);
            for (col = 0; col < (cols >> 4); ++col)
            {
                ILVRL_B2_UB(src7, src8, src_r, src_l);
                HSUB_UB2_SH(src_r, src_l, sub_r, sub_l);

                sum_r[0] = sum + sub_r[0];
                for (cnt = 0; cnt < 7; ++cnt)
                {
                    sum_r[cnt + 1] = sum_r[cnt] + sub_r[cnt + 1];
                }
                sum_l[0] = sum_r[7] + sub_l[0];
                for (cnt = 0; cnt < 7; ++cnt)
                {
                    sum_l[cnt + 1] = sum_l[cnt] + sub_l[cnt + 1];
                }
                sum = sum_l[7];
                src = LD_UB(src_dup + 16 * col);
                ILVRL_B2_UH(zero, src, src_r_h, src_l_h);
                src7 = (v16u8)((const8 + sum_r + (v8i16)src_r_h) >> 4);
                src8 = (v16u8)((const8 + sum_l + (v8i16)src_l_h) >> 4);
                tmp = (v16u8)__msa_pckev_b((v16i8)src8, (v16i8)src7);

                HADD_UB2_UH(src_r, src_l, add_r, add_l);
                UNPCK_SH_SW(sub_r, sub0, sub1);
                UNPCK_SH_SW(sub_l, sub2, sub3);
                ILVR_H2_SW(zero, add_r, zero, add_l, sum0_w, sum2_w);
                ILVL_H2_SW(zero, add_r, zero, add_l, sum1_w, sum3_w);
                MUL4(sum0_w, sub0, sum1_w, sub1, sum2_w, sub2, sum3_w, sub3,
                     mul0, mul1, mul2, mul3);
                sum_sq0[0] = sum_sq + mul0[0];
                for (cnt = 0; cnt < 3; ++cnt)
                {
                    sum_sq0[cnt + 1] = sum_sq0[cnt] + mul0[cnt + 1];
                }
                sum_sq1[0] = sum_sq0[3] + mul1[0];
                for (cnt = 0; cnt < 3; ++cnt)
                {
                    sum_sq1[cnt + 1] = sum_sq1[cnt] + mul1[cnt + 1];
                }
                sum_sq2[0] = sum_sq1[3] + mul2[0];
                for (cnt = 0; cnt < 3; ++cnt)
                {
                    sum_sq2[cnt + 1] = sum_sq2[cnt] + mul2[cnt + 1];
                }
                sum_sq3[0] = sum_sq2[3] + mul3[0];
                for (cnt = 0; cnt < 3; ++cnt)
                {
                    sum_sq3[cnt + 1] = sum_sq3[cnt] + mul3[cnt + 1];
                }
                sum_sq = sum_sq3[3];

                UNPCK_SH_SW(sum_r, sum0_w, sum1_w);
                UNPCK_SH_SW(sum_l, sum2_w, sum3_w);
                total0 = sum_sq0 * __msa_ldi_w(15);
                total0 -= sum0_w * sum0_w;
                total1 = sum_sq1 * __msa_ldi_w(15);
                total1 -= sum1_w * sum1_w;
                total2 = sum_sq2 * __msa_ldi_w(15);
                total2 -= sum2_w * sum2_w;
                total3 = sum_sq3 * __msa_ldi_w(15);
                total3 -= sum3_w * sum3_w;
                total0 = (total0 < flimit_vec);
                total1 = (total1 < flimit_vec);
                total2 = (total2 < flimit_vec);
                total3 = (total3 < flimit_vec);
                PCKEV_H2_SH(total1, total0, total3, total2, mask0, mask1);
                mask = __msa_pckev_b((v16i8)mask1, (v16i8)mask0);
                tmp = __msa_bmz_v(tmp, src, (v16u8)mask);

                if (col == 0)
                {
                    uint64_t src_d;

                    src_d = __msa_copy_u_d((v2i64)tmp_orig, 1);
                    SD(src_d, (src_dup - 8));
                }

                src7 = LD_UB(src_dup + 16 * (col + 1) + 7);
                src8 = LD_UB(src_dup + 16 * (col + 1) - 8);
                ST_UB(tmp, (src_dup + (16 * col)));
            }

            src_dup += pitch;
        }
    }
}
void vp8_mbpost_proc_down_msa(uint8_t *dst_ptr, int32_t pitch, int32_t rows,
                              int32_t cols, int32_t flimit)
{
    int32_t row, col, cnt, i;
    const int16_t *rv3 = &vp8_rv_msa[63 & rand()];
    v4i32 flimit_vec;
    v16u8 dst7, dst8, dst_r_b, dst_l_b;
    v16i8 mask;
    v8u16 add_r, add_l;
    v8i16 dst_r_h, dst_l_h, sub_r, sub_l, mask0, mask1;
    v4i32 sub0, sub1, sub2, sub3, total0, total1, total2, total3;

    flimit_vec = __msa_fill_w(flimit);

    for (col = 0; col < (cols >> 4); ++col)
    {
        uint8_t *dst_tmp = &dst_ptr[col << 4];
        v16u8 dst;
        v16i8 zero = { 0 };
        v16u8 tmp[16];
        v8i16 mult0, mult1, rv2_0, rv2_1;
        v8i16 sum0_h = { 0 };
        v8i16 sum1_h = { 0 };
        v4i32 mul0 = { 0 };
        v4i32 mul1 = { 0 };
        v4i32 mul2 = { 0 };
        v4i32 mul3 = { 0 };
        v4i32 sum0_w, sum1_w, sum2_w, sum3_w;
        v4i32 add0, add1, add2, add3;
        const int16_t *rv2[16];

        dst = LD_UB(dst_tmp);
        for (cnt = (col << 4), i = 0; i < 16; ++cnt)
        {
            rv2[i] = rv3 + ((cnt * 17) & 127);
            ++i;
        }
        for (cnt = -8; cnt < 0; ++cnt)
        {
            ST_UB(dst, dst_tmp + cnt * pitch);
        }

        dst = LD_UB((dst_tmp + (rows - 1) * pitch));
        for (cnt = rows; cnt < rows + 17; ++cnt)
        {
            ST_UB(dst, dst_tmp + cnt * pitch);
        }
        for (cnt = -8; cnt <= 6; ++cnt)
        {
            dst = LD_UB(dst_tmp + (cnt * pitch));
            UNPCK_UB_SH(dst, dst_r_h, dst_l_h);
            MUL2(dst_r_h, dst_r_h, dst_l_h, dst_l_h, mult0, mult1);
            mul0 += (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)mult0);
            mul1 += (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)mult0);
            mul2 += (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)mult1);
            mul3 += (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)mult1);
            ADD2(sum0_h, dst_r_h, sum1_h, dst_l_h, sum0_h, sum1_h);
        }

        for (row = 0; row < (rows + 8); ++row)
        {
            for (i = 0; i < 8; ++i)
            {
                rv2_0[i] = *(rv2[i] + (row & 127));
                rv2_1[i] = *(rv2[i + 8] + (row & 127));
            }
            dst7 = LD_UB(dst_tmp + (7 * pitch));
            dst8 = LD_UB(dst_tmp - (8 * pitch));
            ILVRL_B2_UB(dst7, dst8, dst_r_b, dst_l_b);

            HSUB_UB2_SH(dst_r_b, dst_l_b, sub_r, sub_l);
            UNPCK_SH_SW(sub_r, sub0, sub1);
            UNPCK_SH_SW(sub_l, sub2, sub3);
            sum0_h += sub_r;
            sum1_h += sub_l;

            HADD_UB2_UH(dst_r_b, dst_l_b, add_r, add_l);

            ILVRL_H2_SW(zero, add_r, add0, add1);
            ILVRL_H2_SW(zero, add_l, add2, add3);
            mul0 += add0 * sub0;
            mul1 += add1 * sub1;
            mul2 += add2 * sub2;
            mul3 += add3 * sub3;
            dst = LD_UB(dst_tmp);
            ILVRL_B2_SH(zero, dst, dst_r_h, dst_l_h);
            dst7 = (v16u8)((rv2_0 + sum0_h + dst_r_h) >> 4);
            dst8 = (v16u8)((rv2_1 + sum1_h + dst_l_h) >> 4);
            tmp[row & 15] = (v16u8)__msa_pckev_b((v16i8)dst8, (v16i8)dst7);

            UNPCK_SH_SW(sum0_h, sum0_w, sum1_w);
            UNPCK_SH_SW(sum1_h, sum2_w, sum3_w);
            total0 = mul0 * __msa_ldi_w(15);
            total0 -= sum0_w * sum0_w;
            total1 = mul1 * __msa_ldi_w(15);
            total1 -= sum1_w * sum1_w;
            total2 = mul2 * __msa_ldi_w(15);
            total2 -= sum2_w * sum2_w;
            total3 = mul3 * __msa_ldi_w(15);
            total3 -= sum3_w * sum3_w;
            total0 = (total0 < flimit_vec);
            total1 = (total1 < flimit_vec);
            total2 = (total2 < flimit_vec);
            total3 = (total3 < flimit_vec);
            PCKEV_H2_SH(total1, total0, total3, total2, mask0, mask1);
            mask = __msa_pckev_b((v16i8)mask1, (v16i8)mask0);
            tmp[row & 15] = __msa_bmz_v(tmp[row & 15], dst, (v16u8)mask);

            if (row >= 8)
            {
                ST_UB(tmp[(row - 8) & 15], (dst_tmp - 8 * pitch));
            }

            dst_tmp += pitch;
        }
    }
}

@@ -12,6 +12,7 @@
#include "vpx_config.h"
#include "vpx_dsp_rtcd.h"
#include "vp8_rtcd.h"
#include "vpx_dsp/postproc.h"
#include "vpx_scale_rtcd.h"
#include "vpx_scale/yv12config.h"
#include "postproc.h"
@@ -72,142 +73,11 @@ static const unsigned char MV_REFERENCE_FRAME_colors[MAX_REF_FRAMES][3] =
};
#endif

const short vp8_rv[] =
{
    8, 5, 2, 2, 8, 12, 4, 9, 8, 3,
    0, 3, 9, 0, 0, 0, 8, 3, 14, 4,
    10, 1, 11, 14, 1, 14, 9, 6, 12, 11,
    8, 6, 10, 0, 0, 8, 9, 0, 3, 14,
    8, 11, 13, 4, 2, 9, 0, 3, 9, 6,
    1, 2, 3, 14, 13, 1, 8, 2, 9, 7,
    3, 3, 1, 13, 13, 6, 6, 5, 2, 7,
    11, 9, 11, 8, 7, 3, 2, 0, 13, 13,
    14, 4, 12, 5, 12, 10, 8, 10, 13, 10,
    4, 14, 4, 10, 0, 8, 11, 1, 13, 7,
    7, 14, 6, 14, 13, 2, 13, 5, 4, 4,
    0, 10, 0, 5, 13, 2, 12, 7, 11, 13,
    8, 0, 4, 10, 7, 2, 7, 2, 2, 5,
    3, 4, 7, 3, 3, 14, 14, 5, 9, 13,
    3, 14, 3, 6, 3, 0, 11, 8, 13, 1,
    13, 1, 12, 0, 10, 9, 7, 6, 2, 8,
    5, 2, 13, 7, 1, 13, 14, 7, 6, 7,
    9, 6, 10, 11, 7, 8, 7, 5, 14, 8,
    4, 4, 0, 8, 7, 10, 0, 8, 14, 11,
    3, 12, 5, 7, 14, 3, 14, 5, 2, 6,
    11, 12, 12, 8, 0, 11, 13, 1, 2, 0,
    5, 10, 14, 7, 8, 0, 4, 11, 0, 8,
    0, 3, 10, 5, 8, 0, 11, 6, 7, 8,
    10, 7, 13, 9, 2, 5, 1, 5, 10, 2,
    4, 3, 5, 6, 10, 8, 9, 4, 11, 14,
    0, 10, 0, 5, 13, 2, 12, 7, 11, 13,
    8, 0, 4, 10, 7, 2, 7, 2, 2, 5,
    3, 4, 7, 3, 3, 14, 14, 5, 9, 13,
    3, 14, 3, 6, 3, 0, 11, 8, 13, 1,
    13, 1, 12, 0, 10, 9, 7, 6, 2, 8,
    5, 2, 13, 7, 1, 13, 14, 7, 6, 7,
    9, 6, 10, 11, 7, 8, 7, 5, 14, 8,
    4, 4, 0, 8, 7, 10, 0, 8, 14, 11,
    3, 12, 5, 7, 14, 3, 14, 5, 2, 6,
    11, 12, 12, 8, 0, 11, 13, 1, 2, 0,
    5, 10, 14, 7, 8, 0, 4, 11, 0, 8,
    0, 3, 10, 5, 8, 0, 11, 6, 7, 8,
    10, 7, 13, 9, 2, 5, 1, 5, 10, 2,
    4, 3, 5, 6, 10, 8, 9, 4, 11, 14,
    3, 8, 3, 7, 8, 5, 11, 4, 12, 3,
    11, 9, 14, 8, 14, 13, 4, 3, 1, 2,
    14, 6, 5, 4, 4, 11, 4, 6, 2, 1,
    5, 8, 8, 12, 13, 5, 14, 10, 12, 13,
    0, 9, 5, 5, 11, 10, 13, 9, 10, 13,
};

extern void vp8_blit_text(const char *msg, unsigned char *address, const int pitch);
extern void vp8_blit_line(int x0, int x1, int y0, int y1, unsigned char *image, const int pitch);
/***********************************************************************************************************
 */
void vp8_post_proc_down_and_across_mb_row_c
(
    unsigned char *src_ptr,
    unsigned char *dst_ptr,
    int src_pixels_per_line,
    int dst_pixels_per_line,
    int cols,
    unsigned char *f,
    int size
)
{
    unsigned char *p_src, *p_dst;
    int row;
    int col;
    unsigned char v;
    unsigned char d[4];

    for (row = 0; row < size; row++)
    {
        /* post_proc_down for one row */
        p_src = src_ptr;
        p_dst = dst_ptr;

        for (col = 0; col < cols; col++)
        {
            unsigned char p_above2 = p_src[col - 2 * src_pixels_per_line];
            unsigned char p_above1 = p_src[col - src_pixels_per_line];
            unsigned char p_below1 = p_src[col + src_pixels_per_line];
            unsigned char p_below2 = p_src[col + 2 * src_pixels_per_line];

            v = p_src[col];

            if ((abs(v - p_above2) < f[col]) && (abs(v - p_above1) < f[col])
                && (abs(v - p_below1) < f[col]) && (abs(v - p_below2) < f[col]))
            {
                unsigned char k1, k2, k3;
                k1 = (p_above2 + p_above1 + 1) >> 1;
                k2 = (p_below2 + p_below1 + 1) >> 1;
                k3 = (k1 + k2 + 1) >> 1;
                v = (k3 + v + 1) >> 1;
            }

            p_dst[col] = v;
        }

        /* now post_proc_across */
        p_src = dst_ptr;
        p_dst = dst_ptr;

        p_src[-2] = p_src[-1] = p_src[0];
        p_src[cols] = p_src[cols + 1] = p_src[cols - 1];

        for (col = 0; col < cols; col++)
        {
            v = p_src[col];

            if ((abs(v - p_src[col - 2]) < f[col])
                && (abs(v - p_src[col - 1]) < f[col])
                && (abs(v - p_src[col + 1]) < f[col])
                && (abs(v - p_src[col + 2]) < f[col]))
            {
                unsigned char k1, k2, k3;
                k1 = (p_src[col - 2] + p_src[col - 1] + 1) >> 1;
                k2 = (p_src[col + 2] + p_src[col + 1] + 1) >> 1;
                k3 = (k1 + k2 + 1) >> 1;
                v = (k3 + v + 1) >> 1;
            }

            d[col & 3] = v;

            if (col >= 2)
                p_dst[col - 2] = d[(col - 2) & 3];
        }

        /* handle the last two pixels */
        p_dst[col - 2] = d[(col - 2) & 3];
        p_dst[col - 1] = d[(col - 1) & 3];

        /* next row */
        src_ptr += src_pixels_per_line;
        dst_ptr += dst_pixels_per_line;
    }
}

static int q2mbl(int x)
{
    if (x < 20) x = 20;
@@ -216,108 +86,13 @@ static int q2mbl(int x)
    return x * x / 3;
}

void vp8_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows, int cols, int flimit)
{
    int r, c, i;

    unsigned char *s = src;
    unsigned char d[16];

    for (r = 0; r < rows; r++)
    {
        int sumsq = 0;
        int sum = 0;

        for (i = -8; i < 0; i++)
            s[i]=s[0];

        /* 17 avoids valgrind warning - we buffer values in c in d
         * and only write them when we've read 8 ahead...
         */
        for (i = 0; i < 17; i++)
            s[i+cols]=s[cols-1];

        for (i = -8; i <= 6; i++)
        {
            sumsq += s[i] * s[i];
            sum += s[i];
            d[i+8] = 0;
        }

        for (c = 0; c < cols + 8; c++)
        {
            int x = s[c+7] - s[c-8];
            int y = s[c+7] + s[c-8];

            sum += x;
            sumsq += x * y;

            d[c&15] = s[c];

            if (sumsq * 15 - sum * sum < flimit)
            {
                d[c&15] = (8 + sum + s[c]) >> 4;
            }

            s[c-8] = d[(c-8)&15];
        }

        s += pitch;
    }
}
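
The sumsq * 15 - sum * sum < flimit test above is an integer variance check over the 15-tap sliding window: for n samples, n*sum(x_i^2) - (sum x_i)^2 equals n^2 times the population variance, so with n = 15 the branch fires when 225 * variance is below the limit (q2mbl() returns x*x/3, keeping the limit on the same squared scale). A minimal standalone check of that identity, with arbitrary demo values, not part of the diff:

#include <stdio.h>

int main(void) {
    /* any 15-sample window; values are arbitrary for the demo */
    const int x[15] = { 3, 5, 4, 4, 6, 5, 3, 4, 5, 6, 4, 3, 5, 4, 5 };
    int i, sum = 0, sumsq = 0;
    double mean, var;

    for (i = 0; i < 15; i++) {
        sum += x[i];
        sumsq += x[i] * x[i];
    }

    mean = sum / 15.0;
    var = sumsq / 15.0 - mean * mean;
    /* left side is what the filter computes; right side is 15^2 * variance */
    printf("%d vs %.6f\n", 15 * sumsq - sum * sum, 225.0 * var);
    return 0;
}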

void vp8_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols, int flimit)
{
    int r, c, i;
    const short *rv3 = &vp8_rv[63&rand()];

    for (c = 0; c < cols; c++ )
    {
        unsigned char *s = &dst[c];
        int sumsq = 0;
        int sum = 0;
        unsigned char d[16];
        const short *rv2 = rv3 + ((c * 17) & 127);

        for (i = -8; i < 0; i++)
            s[i*pitch]=s[0];

        /* 17 avoids valgrind warning - we buffer values in c in d
         * and only write them when we've read 8 ahead...
         */
        for (i = 0; i < 17; i++)
            s[(i+rows)*pitch]=s[(rows-1)*pitch];

        for (i = -8; i <= 6; i++)
        {
            sumsq += s[i*pitch] * s[i*pitch];
            sum += s[i*pitch];
        }

        for (r = 0; r < rows + 8; r++)
        {
            sumsq += s[7*pitch] * s[ 7*pitch] - s[-8*pitch] * s[-8*pitch];
            sum += s[7*pitch] - s[-8*pitch];
            d[r&15] = s[0];

            if (sumsq * 15 - sum * sum < flimit)
            {
                d[r&15] = (rv2[r&127] + sum + s[0]) >> 4;
            }
            if (r >= 8)
                s[-8*pitch] = d[(r-8)&15];
            s += pitch;
        }
    }
}
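
A note on the rounding term: (rv2[r&127] + sum + s[0]) >> 4 dithers the final division with a pseudo-random offset instead of a fixed +8. rv3 starts at a random phase of vp8_rv (64 possible bases), and each column advances by (c * 17) & 127, 17 being odd so neighbouring columns land on well-spread phases. An illustrative sketch of the index arithmetic only; the zeroed demo table merely stands in for the 440-entry vp8_rv shown in the hunk above:

#include <stdio.h>
#include <stdlib.h>

static short rv_demo[440]; /* stand-in for vp8_rv; contents irrelevant here */

int main(void) {
    int c, r;
    const short *rv3 = &rv_demo[63 & rand()];       /* random base phase, 0..63 */

    for (c = 0; c < 3; c++) {
        const short *rv2 = rv3 + ((c * 17) & 127);  /* per-column phase */
        for (r = 0; r < 2; r++) {
            /* worst-case index: 63 + 127 + 127 = 317, inside the 440 entries */
            printf("col %d, row %d -> offset rv2[%d] = %d\n",
                   c, r, r & 127, rv2[r & 127]);
        }
    }
    return 0;
}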

#if CONFIG_POSTPROC
static void vp8_de_mblock(YV12_BUFFER_CONFIG *post,
                          int q)
{
    vp8_mbpost_proc_across_ip(post->y_buffer, post->y_stride, post->y_height,
    vpx_mbpost_proc_across_ip(post->y_buffer, post->y_stride, post->y_height,
                              post->y_width, q2mbl(q));
    vp8_mbpost_proc_down(post->y_buffer, post->y_stride, post->y_height,
    vpx_mbpost_proc_down(post->y_buffer, post->y_stride, post->y_height,
                         post->y_width, q2mbl(q));
}

@@ -365,16 +140,16 @@ void vp8_deblock(VP8_COMMON *cm,
        }
        mode_info_context++;

        vp8_post_proc_down_and_across_mb_row(
        vpx_post_proc_down_and_across_mb_row(
            source->y_buffer + 16 * mbr * source->y_stride,
            post->y_buffer + 16 * mbr * post->y_stride, source->y_stride,
            post->y_stride, source->y_width, ylimits, 16);

        vp8_post_proc_down_and_across_mb_row(
        vpx_post_proc_down_and_across_mb_row(
            source->u_buffer + 8 * mbr * source->uv_stride,
            post->u_buffer + 8 * mbr * post->uv_stride, source->uv_stride,
            post->uv_stride, source->uv_width, uvlimits, 8);
        vp8_post_proc_down_and_across_mb_row(
        vpx_post_proc_down_and_across_mb_row(
            source->v_buffer + 8 * mbr * source->uv_stride,
            post->v_buffer + 8 * mbr * post->uv_stride, source->uv_stride,
            post->uv_stride, source->uv_width, uvlimits, 8);
@@ -409,17 +184,17 @@ void vp8_de_noise(VP8_COMMON *cm,
    /* TODO: The original code doesn't filter the 2 outer rows and columns. */
    for (mbr = 0; mbr < mb_rows; mbr++)
    {
        vp8_post_proc_down_and_across_mb_row(
        vpx_post_proc_down_and_across_mb_row(
            source->y_buffer + 16 * mbr * source->y_stride,
            source->y_buffer + 16 * mbr * source->y_stride,
            source->y_stride, source->y_stride, source->y_width, limits, 16);
        if (uvfilter == 1) {
            vp8_post_proc_down_and_across_mb_row(
            vpx_post_proc_down_and_across_mb_row(
                source->u_buffer + 8 * mbr * source->uv_stride,
                source->u_buffer + 8 * mbr * source->uv_stride,
                source->uv_stride, source->uv_stride, source->uv_width, limits,
                8);
            vp8_post_proc_down_and_across_mb_row(
            vpx_post_proc_down_and_across_mb_row(
                source->v_buffer + 8 * mbr * source->uv_stride,
                source->v_buffer + 8 * mbr * source->uv_stride,
                source->uv_stride, source->uv_stride, source->uv_width, limits,
@@ -428,69 +203,6 @@
    }
}

static double gaussian(double sigma, double mu, double x)
{
    return 1 / (sigma * sqrt(2.0 * 3.14159265)) *
           (exp(-(x - mu) * (x - mu) / (2 * sigma * sigma)));
}

static void fillrd(struct postproc_state *state, int q, int a)
{
    char char_dist[300];

    double sigma;
    int i;

    vp8_clear_system_state();

    sigma = a + .5 + .6 * (63 - q) / 63.0;

    /* set up a lookup table of 256 entries that matches
     * a gaussian distribution with sigma determined by q.
     */
    {
        int next, j;

        next = 0;

        for (i = -32; i < 32; i++)
        {
            const int v = (int)(.5 + 256 * gaussian(sigma, 0, i));

            if (v)
            {
                for (j = 0; j < v; j++)
                {
                    char_dist[next+j] = (char) i;
                }

                next = next + j;
            }
        }

        for (; next < 256; next++)
            char_dist[next] = 0;
    }

    for (i = 0; i < 3072; i++)
    {
        state->noise[i] = char_dist[rand() & 0xff];
    }

    for (i = 0; i < 16; i++)
    {
        state->blackclamp[i] = -char_dist[0];
        state->whiteclamp[i] = -char_dist[0];
        state->bothclamp[i] = -2 * char_dist[0];
    }

    state->last_q = q;
    state->last_noise = a;
}

/* Blend the macro block with a solid colored square. Leave the
 * edges unblended to give distinction to macro blocks in areas
 * filled with the same color block.
@@ -778,7 +490,22 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t
    if (oci->postproc_state.last_q != q
        || oci->postproc_state.last_noise != noise_level)
    {
        fillrd(&oci->postproc_state, 63 - q, noise_level);
        double sigma;
        int clamp, i;
        struct postproc_state *ppstate = &oci->postproc_state;
        vp8_clear_system_state();
        sigma = noise_level + .5 + .6 * q / 63.0;
        clamp = vpx_setup_noise(sigma, sizeof(ppstate->noise),
                                ppstate->noise);
        for (i = 0; i < 16; i++)
        {
            ppstate->blackclamp[i] = clamp;
            ppstate->whiteclamp[i] = clamp;
            ppstate->bothclamp[i] = 2 * clamp;
        }

        ppstate->last_q = q;
        ppstate->last_noise = noise_level;
    }

    vpx_plane_add_noise
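
For comparison, the replacement noise-setup path condensed into one helper. This is only a sketch: the vpx_setup_noise declaration is inferred from the call in the hunk above (the real one is expected in vpx_dsp/postproc.h), and the demo struct stands in for struct postproc_state:

/* declaration inferred from the call above; hypothetical stand-in */
extern int vpx_setup_noise(double sigma, int size, char *noise);

struct pp_state_demo {
    char noise[3072];   /* same size as the 3072-entry loop in fillrd() */
    char blackclamp[16], whiteclamp[16], bothclamp[16];
};

static void setup_noise_demo(struct pp_state_demo *st, int q, int noise_level)
{
    double sigma = noise_level + .5 + .6 * q / 63.0;
    int clamp = vpx_setup_noise(sigma, (int)sizeof(st->noise), st->noise);
    int i;

    for (i = 0; i < 16; i++)
    {
        st->blackclamp[i] = (char)clamp;      /* guard band below black */
        st->whiteclamp[i] = (char)clamp;      /* guard band above white */
        st->bothclamp[i] = (char)(2 * clamp); /* combined clamp */
    }
}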

@@ -156,16 +156,6 @@ $vp8_copy_mem8x4_dspr2=vp8_copy_mem8x4_dspr2;
# Postproc
#
if (vpx_config("CONFIG_POSTPROC") eq "yes") {
add_proto qw/void vp8_mbpost_proc_down/, "unsigned char *dst, int pitch, int rows, int cols,int flimit";
specialize qw/vp8_mbpost_proc_down mmx sse2 msa/;
$vp8_mbpost_proc_down_sse2=vp8_mbpost_proc_down_xmm;

add_proto qw/void vp8_mbpost_proc_across_ip/, "unsigned char *dst, int pitch, int rows, int cols,int flimit";
specialize qw/vp8_mbpost_proc_across_ip sse2 msa/;
$vp8_mbpost_proc_across_ip_sse2=vp8_mbpost_proc_across_ip_xmm;

add_proto qw/void vp8_post_proc_down_and_across_mb_row/, "unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size";
specialize qw/vp8_post_proc_down_and_across_mb_row sse2 msa/;

add_proto qw/void vp8_blend_mb_inner/, "unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride";
# no asm yet

@@ -45,7 +45,7 @@ sym(vp8_filter_by_weight16x16_sse2):
    mov rcx, 16 ; loop count
    pxor xmm6, xmm6

.combine
.combine:
    movdqa xmm2, [rax]
    movdqa xmm4, [rdx]
    add rax, rsi
@@ -122,7 +122,7 @@ sym(vp8_filter_by_weight8x8_sse2):
    mov rcx, 8 ; loop count
    pxor xmm4, xmm4

.combine
.combine:
    movq xmm2, [rax]
    movq xmm3, [rdx]
    add rax, rsi
@@ -189,7 +189,7 @@ sym(vp8_variance_and_sad_16x16_sse2):

    ; Because we're working with the actual output frames
    ; we can't depend on any kind of data alignment.
.accumulate
.accumulate:
    movdqa xmm0, [rax] ; src1
    movdqa xmm1, [rdx] ; src2
    add rax, rcx ; src1 + stride1

@@ -1,253 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

%define VP8_FILTER_WEIGHT 128
%define VP8_FILTER_SHIFT 7

;void vp8_mbpost_proc_down_mmx(unsigned char *dst,
;                              int pitch, int rows, int cols,int flimit)
extern sym(vp8_rv)
global sym(vp8_mbpost_proc_down_mmx) PRIVATE
sym(vp8_mbpost_proc_down_mmx):
    push rbp
    mov rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT rbx
    push rsi
    push rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub rsp, 136

    ; unsigned char d[16][8] at [rsp]
    ; create flimit2 at [rsp+128]
    mov eax, dword ptr arg(4) ;flimit
    mov [rsp+128], eax
    mov [rsp+128+4], eax
%define flimit2 [rsp+128]

%if ABI_IS_32BIT=0
    lea r8, [GLOBAL(sym(vp8_rv))]
%endif

    ;rows +=8;
    add dword ptr arg(2), 8

    ;for(c=0; c<cols; c+=4)
.loop_col:
    mov rsi, arg(0) ;s
    pxor mm0, mm0 ;

    movsxd rax, dword ptr arg(1) ;pitch ;

    ; this copies the last row down into the border 8 rows
    mov rdi, rsi
    mov rdx, arg(2)
    sub rdx, 9
    imul rdx, rax
    lea rdi, [rdi+rdx]
    movq mm1, QWORD ptr[rdi] ; first row
    mov rcx, 8
.init_borderd ; initialize borders
    lea rdi, [rdi + rax]
    movq [rdi], mm1

    dec rcx
    jne .init_borderd

    neg rax ; rax = -pitch

    ; this copies the first row up into the border 8 rows
    mov rdi, rsi
    movq mm1, QWORD ptr[rdi] ; first row
    mov rcx, 8
.init_border ; initialize borders
    lea rdi, [rdi + rax]
    movq [rdi], mm1

    dec rcx
    jne .init_border


    lea rsi, [rsi + rax*8]; ; rdi = s[-pitch*8]
    neg rax


    pxor mm5, mm5
    pxor mm6, mm6 ;

    pxor mm7, mm7 ;
    mov rdi, rsi

    mov rcx, 15 ;

.loop_initvar:
    movd mm1, DWORD PTR [rdi];
    punpcklbw mm1, mm0 ;

    paddw mm5, mm1 ;
    pmullw mm1, mm1 ;

    movq mm2, mm1 ;
    punpcklwd mm1, mm0 ;

    punpckhwd mm2, mm0 ;
    paddd mm6, mm1 ;

    paddd mm7, mm2 ;
    lea rdi, [rdi+rax] ;

    dec rcx
    jne .loop_initvar
    ;save the var and sum
    xor rdx, rdx
.loop_row:
    movd mm1, DWORD PTR [rsi] ; [s-pitch*8]
    movd mm2, DWORD PTR [rdi] ; [s+pitch*7]

    punpcklbw mm1, mm0
    punpcklbw mm2, mm0

    paddw mm5, mm2
    psubw mm5, mm1

    pmullw mm2, mm2
    movq mm4, mm2

    punpcklwd mm2, mm0
    punpckhwd mm4, mm0

    paddd mm6, mm2
    paddd mm7, mm4

    pmullw mm1, mm1
    movq mm2, mm1

    punpcklwd mm1, mm0
    psubd mm6, mm1

    punpckhwd mm2, mm0
    psubd mm7, mm2


    movq mm3, mm6
    pslld mm3, 4

    psubd mm3, mm6
    movq mm1, mm5

    movq mm4, mm5
    pmullw mm1, mm1

    pmulhw mm4, mm4
    movq mm2, mm1

    punpcklwd mm1, mm4
    punpckhwd mm2, mm4

    movq mm4, mm7
    pslld mm4, 4

    psubd mm4, mm7

    psubd mm3, mm1
    psubd mm4, mm2

    psubd mm3, flimit2
    psubd mm4, flimit2

    psrad mm3, 31
    psrad mm4, 31

    packssdw mm3, mm4
    packsswb mm3, mm0

    movd mm1, DWORD PTR [rsi+rax*8]

    movq mm2, mm1
    punpcklbw mm1, mm0

    paddw mm1, mm5
    mov rcx, rdx

    and rcx, 127
%if ABI_IS_32BIT=1 && CONFIG_PIC=1
    push rax
    lea rax, [GLOBAL(sym(vp8_rv))]
    movq mm4, [rax + rcx*2] ;vp8_rv[rcx*2]
    pop rax
%elif ABI_IS_32BIT=0
    movq mm4, [r8 + rcx*2] ;vp8_rv[rcx*2]
%else
    movq mm4, [sym(vp8_rv) + rcx*2]
%endif
    paddw mm1, mm4
    psraw mm1, 4

    packuswb mm1, mm0
    pand mm1, mm3

    pandn mm3, mm2
    por mm1, mm3

    and rcx, 15
    movd DWORD PTR [rsp+rcx*4], mm1 ;d[rcx*4]

    cmp edx, 8
    jl .skip_assignment

    mov rcx, rdx
    sub rcx, 8
    and rcx, 15
    movd mm1, DWORD PTR [rsp+rcx*4] ;d[rcx*4]
    movd [rsi], mm1

.skip_assignment
    lea rsi, [rsi+rax]

    lea rdi, [rdi+rax]
    add rdx, 1

    cmp edx, dword arg(2) ;rows
    jl .loop_row


    add dword arg(0), 4 ; s += 4
    sub dword arg(3), 4 ; cols -= 4
    cmp dword arg(3), 0
    jg .loop_col

    add rsp, 136
    pop rsp

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop rbp
    ret
%undef flimit2


SECTION_RODATA
align 16
Blur:
    times 16 dw 16
    times 8 dw 64
    times 16 dw 16
    times 8 dw 0

rd:
    times 4 dw 0x40

@@ -96,9 +96,7 @@ VP8_COMMON_SRCS-$(HAVE_SSE3) += common/x86/copy_sse3.asm
VP8_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/subpixel_ssse3.asm

ifeq ($(CONFIG_POSTPROC),yes)
VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/postproc_mmx.asm
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/mfqe_sse2.asm
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/postproc_sse2.asm
endif

ifeq ($(ARCH_X86_64),yes)
@@ -123,7 +121,6 @@ VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp8_macros_msa.h

ifeq ($(CONFIG_POSTPROC),yes)
VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/mfqe_msa.c
VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/postproc_msa.c
endif

# common (c)

@@ -103,6 +103,8 @@ void vp9_free_postproc_buffers(VP9_COMMON *cm) {
#if CONFIG_VP9_POSTPROC
  vpx_free_frame_buffer(&cm->post_proc_buffer);
  vpx_free_frame_buffer(&cm->post_proc_buffer_int);
  vpx_free(cm->postproc_state.limits);
  cm->postproc_state.limits = 0;
#else
  (void)cm;
#endif

@@ -18,6 +18,7 @@
#include "./vp9_rtcd.h"

#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_dsp/postproc.h"
#include "vpx_ports/mem.h"
#include "vpx_ports/system_state.h"
#include "vpx_scale/vpx_scale.h"
@@ -32,128 +33,9 @@ static const int16_t kernel5[] = {
  1, 1, 4, 1, 1
};

const int16_t vp9_rv[] = {
  8, 5, 2, 2, 8, 12, 4, 9, 8, 3,
  0, 3, 9, 0, 0, 0, 8, 3, 14, 4,
  10, 1, 11, 14, 1, 14, 9, 6, 12, 11,
  8, 6, 10, 0, 0, 8, 9, 0, 3, 14,
  8, 11, 13, 4, 2, 9, 0, 3, 9, 6,
  1, 2, 3, 14, 13, 1, 8, 2, 9, 7,
  3, 3, 1, 13, 13, 6, 6, 5, 2, 7,
  11, 9, 11, 8, 7, 3, 2, 0, 13, 13,
  14, 4, 12, 5, 12, 10, 8, 10, 13, 10,
  4, 14, 4, 10, 0, 8, 11, 1, 13, 7,
  7, 14, 6, 14, 13, 2, 13, 5, 4, 4,
  0, 10, 0, 5, 13, 2, 12, 7, 11, 13,
  8, 0, 4, 10, 7, 2, 7, 2, 2, 5,
  3, 4, 7, 3, 3, 14, 14, 5, 9, 13,
  3, 14, 3, 6, 3, 0, 11, 8, 13, 1,
  13, 1, 12, 0, 10, 9, 7, 6, 2, 8,
  5, 2, 13, 7, 1, 13, 14, 7, 6, 7,
  9, 6, 10, 11, 7, 8, 7, 5, 14, 8,
  4, 4, 0, 8, 7, 10, 0, 8, 14, 11,
  3, 12, 5, 7, 14, 3, 14, 5, 2, 6,
  11, 12, 12, 8, 0, 11, 13, 1, 2, 0,
  5, 10, 14, 7, 8, 0, 4, 11, 0, 8,
  0, 3, 10, 5, 8, 0, 11, 6, 7, 8,
  10, 7, 13, 9, 2, 5, 1, 5, 10, 2,
  4, 3, 5, 6, 10, 8, 9, 4, 11, 14,
  0, 10, 0, 5, 13, 2, 12, 7, 11, 13,
  8, 0, 4, 10, 7, 2, 7, 2, 2, 5,
  3, 4, 7, 3, 3, 14, 14, 5, 9, 13,
  3, 14, 3, 6, 3, 0, 11, 8, 13, 1,
  13, 1, 12, 0, 10, 9, 7, 6, 2, 8,
  5, 2, 13, 7, 1, 13, 14, 7, 6, 7,
  9, 6, 10, 11, 7, 8, 7, 5, 14, 8,
  4, 4, 0, 8, 7, 10, 0, 8, 14, 11,
  3, 12, 5, 7, 14, 3, 14, 5, 2, 6,
  11, 12, 12, 8, 0, 11, 13, 1, 2, 0,
  5, 10, 14, 7, 8, 0, 4, 11, 0, 8,
  0, 3, 10, 5, 8, 0, 11, 6, 7, 8,
  10, 7, 13, 9, 2, 5, 1, 5, 10, 2,
  4, 3, 5, 6, 10, 8, 9, 4, 11, 14,
  3, 8, 3, 7, 8, 5, 11, 4, 12, 3,
  11, 9, 14, 8, 14, 13, 4, 3, 1, 2,
  14, 6, 5, 4, 4, 11, 4, 6, 2, 1,
  5, 8, 8, 12, 13, 5, 14, 10, 12, 13,
  0, 9, 5, 5, 11, 10, 13, 9, 10, 13,
};

static const uint8_t q_diff_thresh = 20;
static const uint8_t last_q_thresh = 170;

void vp9_post_proc_down_and_across_c(const uint8_t *src_ptr,
                                     uint8_t *dst_ptr,
                                     int src_pixels_per_line,
                                     int dst_pixels_per_line,
                                     int rows,
                                     int cols,
                                     int flimit) {
  uint8_t const *p_src;
  uint8_t *p_dst;
  int row, col, i, v, kernel;
  int pitch = src_pixels_per_line;
  uint8_t d[8];
  (void)dst_pixels_per_line;

  for (row = 0; row < rows; row++) {
    /* post_proc_down for one row */
    p_src = src_ptr;
    p_dst = dst_ptr;

    for (col = 0; col < cols; col++) {
      kernel = 4;
      v = p_src[col];

      for (i = -2; i <= 2; i++) {
        if (abs(v - p_src[col + i * pitch]) > flimit)
          goto down_skip_convolve;

        kernel += kernel5[2 + i] * p_src[col + i * pitch];
      }

      v = (kernel >> 3);
    down_skip_convolve:
      p_dst[col] = v;
    }

    /* now post_proc_across */
    p_src = dst_ptr;
    p_dst = dst_ptr;

    for (i = 0; i < 8; i++)
      d[i] = p_src[i];

    for (col = 0; col < cols; col++) {
      kernel = 4;
      v = p_src[col];

      d[col & 7] = v;

      for (i = -2; i <= 2; i++) {
        if (abs(v - p_src[col + i]) > flimit)
          goto across_skip_convolve;

        kernel += kernel5[2 + i] * p_src[col + i];
      }

      d[col & 7] = (kernel >> 3);
    across_skip_convolve:

      if (col >= 2)
        p_dst[col - 2] = d[(col - 2) & 7];
    }

    /* handle the last two pixels */
    p_dst[col - 2] = d[(col - 2) & 7];
    p_dst[col - 1] = d[(col - 1) & 7];


    /* next row */
    src_ptr += pitch;
    dst_ptr += pitch;
  }
}
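
The function above is a thresholded 5-tap blur: with kernel5 = {1, 1, 4, 1, 1} (taps summing to 8), the output is (4 + sum of k_i * p_i) >> 3, a rounded division by 8, applied only when every neighbour is within flimit of the centre pixel. The scalar core pulled out for clarity; an illustrative sketch, not part of the diff:

#include <stdlib.h>

static const int kernel5_demo[5] = { 1, 1, 4, 1, 1 }; /* taps sum to 8 */

/* Filter one pixel given its neighbours along one axis.
 * p[0..4] = {p-2, p-1, p0, p+1, p+2}; returns p0 unchanged if any
 * neighbour differs from p0 by more than flimit (edge protection). */
static int blur5(const int p[5], int flimit) {
    int i, acc = 4; /* rounding term for the >> 3 */
    for (i = 0; i < 5; i++) {
        if (abs(p[2] - p[i]) > flimit) return p[2]; /* edge: leave alone */
        acc += kernel5_demo[i] * p[i];
    }
    return acc >> 3; /* divide by the kernel sum of 8, with rounding */
}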
extern const int16_t vpx_rv[];

#if CONFIG_VP9_HIGHBITDEPTH
void vp9_highbd_post_proc_down_and_across_c(const uint16_t *src_ptr,
@@ -237,41 +119,6 @@ static int q2mbl(int x) {
  return x * x / 3;
}

void vp9_mbpost_proc_across_ip_c(uint8_t *src, int pitch,
                                 int rows, int cols, int flimit) {
  int r, c, i;
  uint8_t *s = src;
  uint8_t d[16];

  for (r = 0; r < rows; r++) {
    int sumsq = 0;
    int sum = 0;

    for (i = -8; i <= 6; i++) {
      sumsq += s[i] * s[i];
      sum += s[i];
      d[i + 8] = 0;
    }

    for (c = 0; c < cols + 8; c++) {
      int x = s[c + 7] - s[c - 8];
      int y = s[c + 7] + s[c - 8];

      sum += x;
      sumsq += x * y;

      d[c & 15] = s[c];

      if (sumsq * 15 - sum * sum < flimit) {
        d[c & 15] = (8 + sum + s[c]) >> 4;
      }

      s[c - 8] = d[(c - 8) & 15];
    }
    s += pitch;
  }
}

#if CONFIG_VP9_HIGHBITDEPTH
void vp9_highbd_mbpost_proc_across_ip_c(uint16_t *src, int pitch,
                                        int rows, int cols, int flimit) {
@@ -312,43 +159,12 @@ void vp9_highbd_mbpost_proc_across_ip_c(uint16_t *src, int pitch,
}
#endif  // CONFIG_VP9_HIGHBITDEPTH

void vp9_mbpost_proc_down_c(uint8_t *dst, int pitch,
                            int rows, int cols, int flimit) {
  int r, c, i;
  const short *rv3 = &vp9_rv[63 & rand()];  // NOLINT

  for (c = 0; c < cols; c++) {
    uint8_t *s = &dst[c];
    int sumsq = 0;
    int sum = 0;
    uint8_t d[16];
    const int16_t *rv2 = rv3 + ((c * 17) & 127);

    for (i = -8; i <= 6; i++) {
      sumsq += s[i * pitch] * s[i * pitch];
      sum += s[i * pitch];
    }

    for (r = 0; r < rows + 8; r++) {
      sumsq += s[7 * pitch] * s[ 7 * pitch] - s[-8 * pitch] * s[-8 * pitch];
      sum += s[7 * pitch] - s[-8 * pitch];
      d[r & 15] = s[0];

      if (sumsq * 15 - sum * sum < flimit) {
        d[r & 15] = (rv2[r & 127] + sum + s[0]) >> 4;
      }

      s[-8 * pitch] = d[(r - 8) & 15];
      s += pitch;
    }
  }
}

#if CONFIG_VP9_HIGHBITDEPTH
void vp9_highbd_mbpost_proc_down_c(uint16_t *dst, int pitch,
                                   int rows, int cols, int flimit) {
  int r, c, i;
  const int16_t *rv3 = &vp9_rv[63 & rand()];  // NOLINT
  const int16_t *rv3 = &vpx_rv[63 & rand()];  // NOLINT

  for (c = 0; c < cols; c++) {
    uint16_t *s = &dst[c];
@@ -382,14 +198,14 @@ static void deblock_and_de_macro_block(YV12_BUFFER_CONFIG *source,
                                       YV12_BUFFER_CONFIG *post,
                                       int q,
                                       int low_var_thresh,
                                       int flag) {
  double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065;
  int ppl = (int)(level + .5);
                                       int flag,
                                       uint8_t *limits) {
  (void) low_var_thresh;
  (void) flag;

#if CONFIG_VP9_HIGHBITDEPTH
  if (source->flags & YV12_FLAG_HIGHBITDEPTH) {
    double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065;
    int ppl = (int)(level + .5);
    vp9_highbd_post_proc_down_and_across(CONVERT_TO_SHORTPTR(source->y_buffer),
                                         CONVERT_TO_SHORTPTR(post->y_buffer),
                                         source->y_stride, post->y_stride,
@@ -415,177 +231,68 @@ static void deblock_and_de_macro_block(YV12_BUFFER_CONFIG *source,
                                         source->uv_height, source->uv_width,
                                         ppl);
  } else {
    vp9_post_proc_down_and_across(source->y_buffer, post->y_buffer,
                                  source->y_stride, post->y_stride,
                                  source->y_height, source->y_width, ppl);

    vp9_mbpost_proc_across_ip(post->y_buffer, post->y_stride, post->y_height,
#endif  // CONFIG_VP9_HIGHBITDEPTH
  vp9_deblock(source, post, q, limits);
  vpx_mbpost_proc_across_ip(post->y_buffer, post->y_stride, post->y_height,
                            post->y_width, q2mbl(q));

    vp9_mbpost_proc_down(post->y_buffer, post->y_stride, post->y_height,
  vpx_mbpost_proc_down(post->y_buffer, post->y_stride, post->y_height,
                       post->y_width, q2mbl(q));

    vp9_post_proc_down_and_across(source->u_buffer, post->u_buffer,
                                  source->uv_stride, post->uv_stride,
                                  source->uv_height, source->uv_width, ppl);
    vp9_post_proc_down_and_across(source->v_buffer, post->v_buffer,
                                  source->uv_stride, post->uv_stride,
                                  source->uv_height, source->uv_width, ppl);
#if CONFIG_VP9_HIGHBITDEPTH
  }
#else
  vp9_post_proc_down_and_across(source->y_buffer, post->y_buffer,
                                source->y_stride, post->y_stride,
                                source->y_height, source->y_width, ppl);

  vp9_mbpost_proc_across_ip(post->y_buffer, post->y_stride, post->y_height,
                            post->y_width, q2mbl(q));

  vp9_mbpost_proc_down(post->y_buffer, post->y_stride, post->y_height,
                       post->y_width, q2mbl(q));

  vp9_post_proc_down_and_across(source->u_buffer, post->u_buffer,
                                source->uv_stride, post->uv_stride,
                                source->uv_height, source->uv_width, ppl);
  vp9_post_proc_down_and_across(source->v_buffer, post->v_buffer,
                                source->uv_stride, post->uv_stride,
                                source->uv_height, source->uv_width, ppl);
#endif  // CONFIG_VP9_HIGHBITDEPTH
}

void vp9_deblock(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst,
                 int q) {
                 int q, uint8_t *limits) {
  const int ppl = (int)(6.0e-05 * q * q * q - 0.0067 * q * q + 0.306 * q
                        + 0.0065 + 0.5);
  int i;

  const uint8_t *const srcs[3] = {src->y_buffer, src->u_buffer, src->v_buffer};
  const int src_strides[3] = {src->y_stride, src->uv_stride, src->uv_stride};
  const int src_widths[3] = {src->y_width, src->uv_width, src->uv_width};
  const int src_heights[3] = {src->y_height, src->uv_height, src->uv_height};

  uint8_t *const dsts[3] = {dst->y_buffer, dst->u_buffer, dst->v_buffer};
  const int dst_strides[3] = {dst->y_stride, dst->uv_stride, dst->uv_stride};

  for (i = 0; i < MAX_MB_PLANE; ++i) {
#if CONFIG_VP9_HIGHBITDEPTH
  assert((src->flags & YV12_FLAG_HIGHBITDEPTH) ==
         (dst->flags & YV12_FLAG_HIGHBITDEPTH));
    if (src->flags & YV12_FLAG_HIGHBITDEPTH) {
  if (src->flags & YV12_FLAG_HIGHBITDEPTH) {
    int i;
    const uint8_t * const srcs[3] =
        {src->y_buffer, src->u_buffer, src->v_buffer};
    const int src_strides[3] = {src->y_stride, src->uv_stride, src->uv_stride};
    const int src_widths[3] = {src->y_width, src->uv_width, src->uv_width};
    const int src_heights[3] = {src->y_height, src->uv_height, src->uv_height};

    uint8_t * const dsts[3] = {dst->y_buffer, dst->u_buffer, dst->v_buffer};
    const int dst_strides[3] = {dst->y_stride, dst->uv_stride, dst->uv_stride};
    for (i = 0; i < MAX_MB_PLANE; ++i) {
      vp9_highbd_post_proc_down_and_across(CONVERT_TO_SHORTPTR(srcs[i]),
                                           CONVERT_TO_SHORTPTR(dsts[i]),
                                           src_strides[i], dst_strides[i],
                                           src_heights[i], src_widths[i], ppl);
    } else {
      vp9_post_proc_down_and_across(srcs[i], dsts[i],
                                    src_strides[i], dst_strides[i],
                                    src_heights[i], src_widths[i], ppl);
    }
#else
    vp9_post_proc_down_and_across(srcs[i], dsts[i],
                                  src_strides[i], dst_strides[i],
                                  src_heights[i], src_widths[i], ppl);
  } else {
#endif  // CONFIG_VP9_HIGHBITDEPTH
    int mbr;
    const int mb_rows = src->y_height / 16;
    const int mb_cols = src->y_width / 16;

    memset(limits, (unsigned char) ppl, 16 * mb_cols);

    for (mbr = 0; mbr < mb_rows; mbr++) {
      vpx_post_proc_down_and_across_mb_row(
          src->y_buffer + 16 * mbr * src->y_stride,
          dst->y_buffer + 16 * mbr * dst->y_stride, src->y_stride,
          dst->y_stride, src->y_width, limits, 16);
      vpx_post_proc_down_and_across_mb_row(
          src->u_buffer + 8 * mbr * src->uv_stride,
          dst->u_buffer + 8 * mbr * dst->uv_stride, src->uv_stride,
          dst->uv_stride, src->uv_width, limits, 8);
      vpx_post_proc_down_and_across_mb_row(
          src->v_buffer + 8 * mbr * src->uv_stride,
          dst->v_buffer + 8 * mbr * dst->uv_stride, src->uv_stride,
          dst->uv_stride, src->uv_width, limits, 8);
    }
#if CONFIG_VP9_HIGHBITDEPTH
  }
#endif  // CONFIG_VP9_HIGHBITDEPTH
}

void vp9_denoise(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst,
                 int q) {
  const int ppl = (int)(6.0e-05 * q * q * q - 0.0067 * q * q + 0.306 * q
                        + 0.0065 + 0.5);
  int i;

  const uint8_t *const srcs[3] = {src->y_buffer, src->u_buffer, src->v_buffer};
  const int src_strides[3] = {src->y_stride, src->uv_stride, src->uv_stride};
  const int src_widths[3] = {src->y_width, src->uv_width, src->uv_width};
  const int src_heights[3] = {src->y_height, src->uv_height, src->uv_height};

  uint8_t *const dsts[3] = {dst->y_buffer, dst->u_buffer, dst->v_buffer};
  const int dst_strides[3] = {dst->y_stride, dst->uv_stride, dst->uv_stride};

  for (i = 0; i < MAX_MB_PLANE; ++i) {
    const int src_stride = src_strides[i];
    const int src_width = src_widths[i] - 4;
    const int src_height = src_heights[i] - 4;
    const int dst_stride = dst_strides[i];

#if CONFIG_VP9_HIGHBITDEPTH
    assert((src->flags & YV12_FLAG_HIGHBITDEPTH) ==
           (dst->flags & YV12_FLAG_HIGHBITDEPTH));
    if (src->flags & YV12_FLAG_HIGHBITDEPTH) {
      const uint16_t *const src_plane = CONVERT_TO_SHORTPTR(
          srcs[i] + 2 * src_stride + 2);
      uint16_t *const dst_plane = CONVERT_TO_SHORTPTR(
          dsts[i] + 2 * dst_stride + 2);
      vp9_highbd_post_proc_down_and_across(src_plane, dst_plane, src_stride,
                                           dst_stride, src_height, src_width,
                                           ppl);
    } else {
      const uint8_t *const src_plane = srcs[i] + 2 * src_stride + 2;
      uint8_t *const dst_plane = dsts[i] + 2 * dst_stride + 2;

      vp9_post_proc_down_and_across(src_plane, dst_plane, src_stride,
                                    dst_stride, src_height, src_width, ppl);
    }
#else
    const uint8_t *const src_plane = srcs[i] + 2 * src_stride + 2;
    uint8_t *const dst_plane = dsts[i] + 2 * dst_stride + 2;
    vp9_post_proc_down_and_across(src_plane, dst_plane, src_stride, dst_stride,
                                  src_height, src_width, ppl);
#endif
  }
}

static double gaussian(double sigma, double mu, double x) {
  return 1 / (sigma * sqrt(2.0 * 3.14159265)) *
         (exp(-(x - mu) * (x - mu) / (2 * sigma * sigma)));
}

static void fillrd(struct postproc_state *state, int q, int a) {
  char char_dist[300];

  double sigma;
  int ai = a, qi = q, i;

  vpx_clear_system_state();

  sigma = ai + .5 + .6 * (63 - qi) / 63.0;

  /* set up a lookup table of 256 entries that matches
   * a gaussian distribution with sigma determined by q.
   */
  {
    int next, j;

    next = 0;

    for (i = -32; i < 32; i++) {
      int a_i = (int)(0.5 + 256 * gaussian(sigma, 0, i));

      if (a_i) {
        for (j = 0; j < a_i; j++) {
          char_dist[next + j] = (char) i;
        }

        next = next + j;
      }
    }

    for (; next < 256; next++)
      char_dist[next] = 0;
  }

  for (i = 0; i < 3072; i++) {
    state->noise[i] = char_dist[rand() & 0xff];  // NOLINT
  }

  for (i = 0; i < 16; i++) {
    state->blackclamp[i] = -char_dist[0];
    state->whiteclamp[i] = -char_dist[0];
    state->bothclamp[i] = -2 * char_dist[0];
  }

  state->last_q = q;
  state->last_noise = a;
                 int q, uint8_t *limits) {
  vp9_deblock(src, dst, q, limits);
}

static void swap_mi_and_prev_mi(VP9_COMMON *cm) {
@@ -663,6 +370,14 @@ int vp9_post_proc_frame(struct VP9Common *cm,
    vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                       "Failed to allocate post-processing buffer");

  if (flags & (VP9D_DEMACROBLOCK | VP9D_DEBLOCK)) {
    if (!cm->postproc_state.limits) {
      cm->postproc_state.limits = vpx_calloc(
          cm->width, sizeof(*cm->postproc_state.limits));
    }
  }


  if ((flags & VP9D_MFQE) && cm->current_video_frame >= 2 &&
      cm->postproc_state.last_frame_valid && cm->bit_depth == 8 &&
      cm->postproc_state.last_base_qindex <= last_q_thresh &&
@@ -677,17 +392,19 @@ int vp9_post_proc_frame(struct VP9Common *cm,
    if ((flags & VP9D_DEMACROBLOCK) && cm->post_proc_buffer_int.buffer_alloc) {
      deblock_and_de_macro_block(&cm->post_proc_buffer_int, ppbuf,
                                 q + (ppflags->deblocking_level - 5) * 10,
                                 1, 0);
                                 1, 0, cm->postproc_state.limits);
    } else if (flags & VP9D_DEBLOCK) {
      vp9_deblock(&cm->post_proc_buffer_int, ppbuf, q);
      vp9_deblock(&cm->post_proc_buffer_int, ppbuf, q,
                  cm->postproc_state.limits);
    } else {
      vp8_yv12_copy_frame(&cm->post_proc_buffer_int, ppbuf);
    }
  } else if (flags & VP9D_DEMACROBLOCK) {
    deblock_and_de_macro_block(cm->frame_to_show, ppbuf,
                               q + (ppflags->deblocking_level - 5) * 10, 1, 0);
                               q + (ppflags->deblocking_level - 5) * 10, 1, 0,
                               cm->postproc_state.limits);
  } else if (flags & VP9D_DEBLOCK) {
    vp9_deblock(cm->frame_to_show, ppbuf, q);
    vp9_deblock(cm->frame_to_show, ppbuf, q, cm->postproc_state.limits);
  } else {
    vp8_yv12_copy_frame(cm->frame_to_show, ppbuf);
  }
@@ -699,7 +416,20 @@ int vp9_post_proc_frame(struct VP9Common *cm,
    const int noise_level = ppflags->noise_level;
    if (ppstate->last_q != q ||
        ppstate->last_noise != noise_level) {
      fillrd(ppstate, 63 - q, noise_level);
      double sigma;
      int clamp, i;
      vpx_clear_system_state();
      sigma = noise_level + .5 + .6 * q / 63.0;
      clamp = vpx_setup_noise(sigma, sizeof(ppstate->noise),
                              ppstate->noise);

      for (i = 0; i < 16; i++) {
        ppstate->blackclamp[i] = clamp;
        ppstate->whiteclamp[i] = clamp;
        ppstate->bothclamp[i] = 2 * clamp;
      }
      ppstate->last_q = q;
      ppstate->last_noise = noise_level;
    }
    vpx_plane_add_noise(ppbuf->y_buffer, ppstate->noise, ppstate->blackclamp,
                        ppstate->whiteclamp, ppstate->bothclamp,

@@ -33,6 +33,7 @@ struct postproc_state {
  DECLARE_ALIGNED(16, char, blackclamp[16]);
  DECLARE_ALIGNED(16, char, whiteclamp[16]);
  DECLARE_ALIGNED(16, char, bothclamp[16]);
  uint8_t *limits;
};

struct VP9Common;
@@ -42,9 +43,11 @@ struct VP9Common;
int vp9_post_proc_frame(struct VP9Common *cm,
                        YV12_BUFFER_CONFIG *dest, vp9_ppflags_t *flags);

void vp9_denoise(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, int q);
void vp9_denoise(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, int q,
                 uint8_t *limits);

void vp9_deblock(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, int q);
void vp9_deblock(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, int q,
                 uint8_t *limits);

#ifdef __cplusplus
}  // extern "C"

@@ -21,29 +21,6 @@ EOF
}
forward_decls qw/vp9_common_forward_decls/;

# x86inc.asm had specific constraints. break it out so it's easy to disable.
# zero all the variables to avoid tricky else conditions.
$mmx_x86inc = $sse_x86inc = $sse2_x86inc = $ssse3_x86inc = $avx_x86inc =
  $avx2_x86inc = '';
$mmx_x86_64_x86inc = $sse_x86_64_x86inc = $sse2_x86_64_x86inc =
  $ssse3_x86_64_x86inc = $avx_x86_64_x86inc = $avx2_x86_64_x86inc = '';
if (vpx_config("CONFIG_USE_X86INC") eq "yes") {
  $mmx_x86inc = 'mmx';
  $sse_x86inc = 'sse';
  $sse2_x86inc = 'sse2';
  $ssse3_x86inc = 'ssse3';
  $avx_x86inc = 'avx';
  $avx2_x86inc = 'avx2';
  if ($opts{arch} eq "x86_64") {
    $mmx_x86_64_x86inc = 'mmx';
    $sse_x86_64_x86inc = 'sse';
    $sse2_x86_64_x86inc = 'sse2';
    $ssse3_x86_64_x86inc = 'ssse3';
    $avx_x86_64_x86inc = 'avx';
    $avx2_x86_64_x86inc = 'avx2';
  }
}

# functions that are 64 bit only.
$mmx_x86_64 = $sse2_x86_64 = $ssse3_x86_64 = $avx_x86_64 = $avx2_x86_64 = '';
if ($opts{arch} eq "x86_64") {
@@ -58,18 +35,6 @@ if ($opts{arch} eq "x86_64") {
# post proc
#
if (vpx_config("CONFIG_VP9_POSTPROC") eq "yes") {
add_proto qw/void vp9_mbpost_proc_down/, "uint8_t *dst, int pitch, int rows, int cols, int flimit";
specialize qw/vp9_mbpost_proc_down sse2/;
$vp9_mbpost_proc_down_sse2=vp9_mbpost_proc_down_xmm;

add_proto qw/void vp9_mbpost_proc_across_ip/, "uint8_t *src, int pitch, int rows, int cols, int flimit";
specialize qw/vp9_mbpost_proc_across_ip sse2/;
$vp9_mbpost_proc_across_ip_sse2=vp9_mbpost_proc_across_ip_xmm;

add_proto qw/void vp9_post_proc_down_and_across/, "const uint8_t *src_ptr, uint8_t *dst_ptr, int src_pixels_per_line, int dst_pixels_per_line, int rows, int cols, int flimit";
specialize qw/vp9_post_proc_down_and_across sse2/;
$vp9_post_proc_down_and_across_sse2=vp9_post_proc_down_and_across_xmm;

add_proto qw/void vp9_filter_by_weight16x16/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight";
specialize qw/vp9_filter_by_weight16x16 sse2 msa/;

@@ -202,10 +167,10 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_block_error/;

add_proto qw/int64_t vp9_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd";
specialize qw/vp9_highbd_block_error/, "$sse2_x86inc";
specialize qw/vp9_highbd_block_error sse2/;

add_proto qw/int64_t vp9_highbd_block_error_8bit/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
specialize qw/vp9_highbd_block_error_8bit/, "$sse2_x86inc", "$avx_x86inc";
specialize qw/vp9_highbd_block_error_8bit sse2 avx/;

add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vp9_quantize_fp/;
@@ -217,16 +182,16 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_fdct8x8_quant/;
} else {
add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
specialize qw/vp9_block_error avx2 msa/, "$sse2_x86inc";
specialize qw/vp9_block_error avx2 msa sse2/;

add_proto qw/int64_t vp9_block_error_fp/, "const int16_t *coeff, const int16_t *dqcoeff, int block_size";
specialize qw/vp9_block_error_fp neon/, "$sse2_x86inc";
specialize qw/vp9_block_error_fp neon sse2/;

add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vp9_quantize_fp neon sse2/, "$ssse3_x86_64_x86inc";
specialize qw/vp9_quantize_fp neon sse2/, "$ssse3_x86_64";

add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vp9_quantize_fp_32x32/, "$ssse3_x86_64_x86inc";
specialize qw/vp9_quantize_fp_32x32/, "$ssse3_x86_64";

add_proto qw/void vp9_fdct8x8_quant/, "const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vp9_fdct8x8_quant sse2 ssse3 neon/;
@@ -245,7 +210,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_fht16x16 sse2/;

add_proto qw/void vp9_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_fwht4x4/, "$sse2_x86inc";
specialize qw/vp9_fwht4x4 sse2/;
} else {
add_proto qw/void vp9_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
specialize qw/vp9_fht4x4 sse2 msa/;
@@ -257,7 +222,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_fht16x16 sse2 msa/;

add_proto qw/void vp9_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_fwht4x4 msa/, "$sse2_x86inc";
specialize qw/vp9_fwht4x4 msa sse2/;
}

#
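
With x86inc.asm now required, the rtcd changes above drop the $..._x86inc indirection and list SIMD flavours directly, e.g. specialize qw/vp9_fwht4x4 msa sse2/. At configure time the perl emits one function pointer per prototype plus a setup routine that picks the best available flavour. Roughly the following shape; this is an illustrative sketch, not the machine-generated header verbatim, and tran_low_t here is a hypothetical stand-in typedef:

#include <stdint.h>

typedef int32_t tran_low_t; /* stand-in; the real typedef lives in vpx_dsp */

void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
void vp9_fwht4x4_sse2(const int16_t *input, tran_low_t *output, int stride);

/* rtcd exposes the prototype as a function pointer... */
void (*vp9_fwht4x4)(const int16_t *input, tran_low_t *output, int stride);

/* ...and resolves it once at startup, best flavour last */
static void setup_rtcd_demo(int have_sse2) {
    vp9_fwht4x4 = vp9_fwht4x4_c;                    /* portable fallback */
    if (have_sse2) vp9_fwht4x4 = vp9_fwht4x4_sse2;  /* override if present */
}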

@@ -46,7 +46,7 @@ sym(vp9_filter_by_weight16x16_sse2):
    mov rcx, 16 ; loop count
    pxor xmm6, xmm6

.combine
.combine:
    movdqa xmm2, [rax]
    movdqa xmm4, [rdx]
    add rax, rsi
@@ -123,7 +123,7 @@ sym(vp9_filter_by_weight8x8_sse2):
    mov rcx, 8 ; loop count
    pxor xmm4, xmm4

.combine
.combine:
    movq xmm2, [rax]
    movq xmm3, [rdx]
    add rax, rsi
@@ -190,7 +190,7 @@ sym(vp9_variance_and_sad_16x16_sse2):

    ; Because we're working with the actual output frames
    ; we can't depend on any kind of data alignment.
.accumulate
.accumulate:
    movdqa xmm0, [rax] ; src1
    movdqa xmm1, [rdx] ; src2
    add rax, rcx ; src1 + stride1
@@ -1,632 +0,0 @@
|
||||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
%include "vpx_ports/x86_abi_support.asm"
|
||||
|
||||
;void vp9_post_proc_down_and_across_xmm
|
||||
;(
|
||||
; unsigned char *src_ptr,
|
||||
; unsigned char *dst_ptr,
|
||||
; int src_pixels_per_line,
|
||||
; int dst_pixels_per_line,
|
||||
; int rows,
|
||||
; int cols,
|
||||
; int flimit
|
||||
;)
|
||||
global sym(vp9_post_proc_down_and_across_xmm) PRIVATE
|
||||
sym(vp9_post_proc_down_and_across_xmm):
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
SHADOW_ARGS_TO_STACK 7
|
||||
SAVE_XMM 7
|
||||
GET_GOT rbx
|
||||
push rsi
|
||||
push rdi
|
||||
; end prolog
|
||||
|
||||
%if ABI_IS_32BIT=1 && CONFIG_PIC=1
|
||||
ALIGN_STACK 16, rax
|
||||
; move the global rd onto the stack, since we don't have enough registers
|
||||
; to do PIC addressing
|
||||
movdqa xmm0, [GLOBAL(rd42)]
|
||||
sub rsp, 16
|
||||
movdqa [rsp], xmm0
|
||||
%define RD42 [rsp]
|
||||
%else
|
||||
%define RD42 [GLOBAL(rd42)]
|
||||
%endif
|
||||
|
||||
|
||||
movd xmm2, dword ptr arg(6) ;flimit
|
||||
punpcklwd xmm2, xmm2
|
||||
punpckldq xmm2, xmm2
|
||||
punpcklqdq xmm2, xmm2
|
||||
|
||||
mov rsi, arg(0) ;src_ptr
|
||||
mov rdi, arg(1) ;dst_ptr
|
||||
|
||||
movsxd rcx, DWORD PTR arg(4) ;rows
|
||||
movsxd rax, DWORD PTR arg(2) ;src_pixels_per_line ; destination pitch?
|
||||
pxor xmm0, xmm0 ; mm0 = 00000000
|
||||
|
||||
.nextrow:
|
||||
|
||||
xor rdx, rdx ; clear out rdx for use as loop counter
|
||||
.nextcol:
|
||||
movq xmm3, QWORD PTR [rsi] ; mm4 = r0 p0..p7
|
||||
punpcklbw xmm3, xmm0 ; mm3 = p0..p3
|
||||
movdqa xmm1, xmm3 ; mm1 = p0..p3
|
||||
psllw xmm3, 2 ;
|
||||
|
||||
movq xmm5, QWORD PTR [rsi + rax] ; mm4 = r1 p0..p7
|
||||
punpcklbw xmm5, xmm0 ; mm5 = r1 p0..p3
|
||||
paddusw xmm3, xmm5 ; mm3 += mm6
|
||||
|
||||
; thresholding
|
||||
movdqa xmm7, xmm1 ; mm7 = r0 p0..p3
|
||||
psubusw xmm7, xmm5 ; mm7 = r0 p0..p3 - r1 p0..p3
|
||||
psubusw xmm5, xmm1 ; mm5 = r1 p0..p3 - r0 p0..p3
|
||||
paddusw xmm7, xmm5 ; mm7 = abs(r0 p0..p3 - r1 p0..p3)
|
||||
pcmpgtw xmm7, xmm2
|
||||
|
||||
movq xmm5, QWORD PTR [rsi + 2*rax] ; mm4 = r2 p0..p7
|
||||
punpcklbw xmm5, xmm0 ; mm5 = r2 p0..p3
|
||||
paddusw xmm3, xmm5 ; mm3 += mm5
|
||||
|
||||
; thresholding
|
||||
movdqa xmm6, xmm1 ; mm6 = r0 p0..p3
|
||||
psubusw xmm6, xmm5 ; mm6 = r0 p0..p3 - r2 p0..p3
|
||||
psubusw xmm5, xmm1 ; mm5 = r2 p0..p3 - r2 p0..p3
|
||||
paddusw xmm6, xmm5 ; mm6 = abs(r0 p0..p3 - r2 p0..p3)
|
||||
pcmpgtw xmm6, xmm2
|
||||
por xmm7, xmm6 ; accumulate thresholds
|
||||
|
||||
|
||||
neg rax
|
||||
movq xmm5, QWORD PTR [rsi+2*rax] ; mm4 = r-2 p0..p7
|
||||
punpcklbw xmm5, xmm0 ; mm5 = r-2 p0..p3
|
||||
paddusw xmm3, xmm5 ; mm3 += mm5
|
||||
|
||||
; thresholding
|
||||
movdqa xmm6, xmm1 ; mm6 = r0 p0..p3
|
||||
psubusw xmm6, xmm5 ; mm6 = p0..p3 - r-2 p0..p3
|
||||
psubusw xmm5, xmm1 ; mm5 = r-2 p0..p3 - p0..p3
|
||||
paddusw xmm6, xmm5 ; mm6 = abs(r0 p0..p3 - r-2 p0..p3)
|
        pcmpgtw     xmm6, xmm2
        por         xmm7, xmm6        ; accumulate thresholds

        movq        xmm4, QWORD PTR [rsi+rax] ; mm4 = r-1 p0..p7
        punpcklbw   xmm4, xmm0        ; mm4 = r-1 p0..p3
        paddusw     xmm3, xmm4        ; mm3 += mm4

        ; thresholding
        movdqa      xmm6, xmm1        ; mm6 = r0 p0..p3
        psubusw     xmm6, xmm4        ; mm6 = r0 p0..p3 - r-1 p0..p3
        psubusw     xmm4, xmm1        ; mm4 = r-1 p0..p3 - r0 p0..p3
        paddusw     xmm6, xmm4        ; mm6 = abs(r0 p0..p3 - r-1 p0..p3)
        pcmpgtw     xmm6, xmm2
        por         xmm7, xmm6        ; accumulate thresholds

        paddusw     xmm3, RD42        ; mm3 += round value
        psraw       xmm3, 3           ; mm3 /= 8

        pand        xmm1, xmm7        ; mm1 select vals > thresh from source
        pandn       xmm7, xmm3        ; mm7 select vals < thresh from blurred result
        paddusw     xmm1, xmm7        ; combination

        packuswb    xmm1, xmm0        ; pack to bytes
        movq        QWORD PTR [rdi], xmm1

        neg         rax               ; pitch is positive
        add         rsi, 8
        add         rdi, 8

        add         rdx, 8
        cmp         edx, dword arg(5) ;cols

        jl          .nextcol

        ; done with all the cols, start the across filtering in place
        sub         rsi, rdx
        sub         rdi, rdx

        xor         rdx, rdx
        movq        mm0, QWORD PTR [rdi-8]

.acrossnextcol:
        movq        xmm7, QWORD PTR [rdi+rdx-2]
        movd        xmm4, DWORD PTR [rdi+rdx+6]

        pslldq      xmm4, 8
        por         xmm4, xmm7

        movdqa      xmm3, xmm4
        psrldq      xmm3, 2
        punpcklbw   xmm3, xmm0        ; mm3 = p0..p3
        movdqa      xmm1, xmm3        ; mm1 = p0..p3
        psllw       xmm3, 2

        movdqa      xmm5, xmm4
        psrldq      xmm5, 3
        punpcklbw   xmm5, xmm0        ; mm5 = p1..p4
        paddusw     xmm3, xmm5        ; mm3 += mm5

        ; thresholding
        movdqa      xmm7, xmm1        ; mm7 = p0..p3
        psubusw     xmm7, xmm5        ; mm7 = p0..p3 - p1..p4
        psubusw     xmm5, xmm1        ; mm5 = p1..p4 - p0..p3
        paddusw     xmm7, xmm5        ; mm7 = abs(p0..p3 - p1..p4)
        pcmpgtw     xmm7, xmm2

        movdqa      xmm5, xmm4
        psrldq      xmm5, 4
        punpcklbw   xmm5, xmm0        ; mm5 = p2..p5
        paddusw     xmm3, xmm5        ; mm3 += mm5

        ; thresholding
        movdqa      xmm6, xmm1        ; mm6 = p0..p3
        psubusw     xmm6, xmm5        ; mm6 = p0..p3 - p2..p5
        psubusw     xmm5, xmm1        ; mm5 = p2..p5 - p0..p3
        paddusw     xmm6, xmm5        ; mm6 = abs(p0..p3 - p2..p5)
        pcmpgtw     xmm6, xmm2
        por         xmm7, xmm6        ; accumulate thresholds

        movdqa      xmm5, xmm4        ; mm5 = p-2..p5
        punpcklbw   xmm5, xmm0        ; mm5 = p-2..p1
        paddusw     xmm3, xmm5        ; mm3 += mm5

        ; thresholding
        movdqa      xmm6, xmm1        ; mm6 = p0..p3
        psubusw     xmm6, xmm5        ; mm6 = p0..p3 - p-2..p1
        psubusw     xmm5, xmm1        ; mm5 = p-2..p1 - p0..p3
        paddusw     xmm6, xmm5        ; mm6 = abs(p0..p3 - p-2..p1)
        pcmpgtw     xmm6, xmm2
        por         xmm7, xmm6        ; accumulate thresholds

        psrldq      xmm4, 1           ; mm4 = p-1..p5
        punpcklbw   xmm4, xmm0        ; mm4 = p-1..p2
        paddusw     xmm3, xmm4        ; mm3 += mm4

        ; thresholding
        movdqa      xmm6, xmm1        ; mm6 = p0..p3
        psubusw     xmm6, xmm4        ; mm6 = p0..p3 - p-1..p2
        psubusw     xmm4, xmm1        ; mm4 = p-1..p2 - p0..p3
        paddusw     xmm6, xmm4        ; mm6 = abs(p0..p3 - p-1..p2)
        pcmpgtw     xmm6, xmm2
        por         xmm7, xmm6        ; accumulate thresholds

        paddusw     xmm3, RD42        ; mm3 += round value
        psraw       xmm3, 3           ; mm3 /= 8

        pand        xmm1, xmm7        ; mm1 select vals > thresh from source
        pandn       xmm7, xmm3        ; mm7 select vals < thresh from blurred result
        paddusw     xmm1, xmm7        ; combination

        packuswb    xmm1, xmm0        ; pack to bytes
        movq        QWORD PTR [rdi+rdx-8], mm0 ; store previous eight bytes
        movdq2q     mm0, xmm1

        add         rdx, 8
        cmp         edx, dword arg(5) ;cols
        jl          .acrossnextcol

        ; last 8 pixels
        movq        QWORD PTR [rdi+rdx-8], mm0

        ; done with this row
        add         rsi, rax          ; next line
        mov         eax, dword arg(3) ;dst_pixels_per_line ; destination pitch?
        add         rdi, rax          ; next destination
        mov         eax, dword arg(2) ;src_pixels_per_line ; source pitch?

        dec         rcx               ; decrement count
        jnz         .nextrow          ; next row

%if ABI_IS_32BIT=1 && CONFIG_PIC=1
    add         rsp, 16
    pop         rsp
%endif
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
%undef RD42
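
A rough C sketch of the per-pixel rule the across pass above vectorizes (names are illustrative, and abs() is assumed from <stdlib.h>; "thresh" stands in for the flimit value broadcast into xmm2):

  /* Keep the source pixel on strong edges, otherwise apply the rounded
   * 5-tap blur (4*v + four neighbors + 4) / 8. */
  static unsigned char across_tap(const unsigned char *p, int thresh) {
    const int v = p[0];
    if (abs(v - p[-2]) > thresh || abs(v - p[-1]) > thresh ||
        abs(v - p[1]) > thresh || abs(v - p[2]) > thresh)
      return (unsigned char)v;
    return (unsigned char)((4 * v + p[-2] + p[-1] + p[1] + p[2] + 4) >> 3);
  }

The RD42 constant in the data section below supplies the +4 rounding term, and the pcmpgtw/por pairs build exactly this any-neighbor-over-threshold mask.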

;void vp9_mbpost_proc_down_xmm(unsigned char *dst,
;                              int pitch, int rows, int cols, int flimit)
extern sym(vp9_rv)
global sym(vp9_mbpost_proc_down_xmm) PRIVATE
sym(vp9_mbpost_proc_down_xmm):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 128+16

    ; unsigned char d[16][8] at [rsp]
    ; create flimit4 at [rsp+128]
    mov         eax, dword ptr arg(4) ;flimit
    mov         [rsp+128], eax
    mov         [rsp+128+4], eax
    mov         [rsp+128+8], eax
    mov         [rsp+128+12], eax
%define flimit4 [rsp+128]

%if ABI_IS_32BIT=0
    lea         r8, [GLOBAL(sym(vp9_rv))]
%endif

    ;rows +=8;
    add         dword arg(2), 8

    ;for(c=0; c<cols; c+=8)
.loop_col:
    mov         rsi, arg(0)        ; s
    pxor        xmm0, xmm0

    movsxd      rax, dword ptr arg(1) ;pitch
    neg         rax                ; rax = -pitch

    lea         rsi, [rsi + rax*8] ; rsi = s[-pitch*8]
    neg         rax

    pxor        xmm5, xmm5
    pxor        xmm6, xmm6

    pxor        xmm7, xmm7
    mov         rdi, rsi

    mov         rcx, 15

.loop_initvar:
    movq        xmm1, QWORD PTR [rdi]
    punpcklbw   xmm1, xmm0

    paddw       xmm5, xmm1
    pmullw      xmm1, xmm1

    movdqa      xmm2, xmm1
    punpcklwd   xmm1, xmm0

    punpckhwd   xmm2, xmm0
    paddd       xmm6, xmm1

    paddd       xmm7, xmm2
    lea         rdi, [rdi+rax]

    dec         rcx
    jne         .loop_initvar
    ;save the var and sum
    xor         rdx, rdx
.loop_row:
    movq        xmm1, QWORD PTR [rsi] ; [s-pitch*8]
    movq        xmm2, QWORD PTR [rdi] ; [s+pitch*7]

    punpcklbw   xmm1, xmm0
    punpcklbw   xmm2, xmm0

    paddw       xmm5, xmm2
    psubw       xmm5, xmm1

    pmullw      xmm2, xmm2
    movdqa      xmm4, xmm2

    punpcklwd   xmm2, xmm0
    punpckhwd   xmm4, xmm0

    paddd       xmm6, xmm2
    paddd       xmm7, xmm4

    pmullw      xmm1, xmm1
    movdqa      xmm2, xmm1

    punpcklwd   xmm1, xmm0
    psubd       xmm6, xmm1

    punpckhwd   xmm2, xmm0
    psubd       xmm7, xmm2

    movdqa      xmm3, xmm6
    pslld       xmm3, 4

    psubd       xmm3, xmm6
    movdqa      xmm1, xmm5

    movdqa      xmm4, xmm5
    pmullw      xmm1, xmm1

    pmulhw      xmm4, xmm4
    movdqa      xmm2, xmm1

    punpcklwd   xmm1, xmm4
    punpckhwd   xmm2, xmm4

    movdqa      xmm4, xmm7
    pslld       xmm4, 4

    psubd       xmm4, xmm7

    psubd       xmm3, xmm1
    psubd       xmm4, xmm2

    psubd       xmm3, flimit4
    psubd       xmm4, flimit4

    psrad       xmm3, 31
    psrad       xmm4, 31

    packssdw    xmm3, xmm4
    packsswb    xmm3, xmm0

    movq        xmm1, QWORD PTR [rsi+rax*8]

    movq        xmm2, xmm1
    punpcklbw   xmm1, xmm0

    paddw       xmm1, xmm5
    mov         rcx, rdx

    and         rcx, 127
%if ABI_IS_32BIT=1 && CONFIG_PIC=1
    push        rax
    lea         rax, [GLOBAL(sym(vp9_rv))]
    movdqu      xmm4, [rax + rcx*2] ;vp9_rv[rcx*2]
    pop         rax
%elif ABI_IS_32BIT=0
    movdqu      xmm4, [r8 + rcx*2]  ;vp9_rv[rcx*2]
%else
    movdqu      xmm4, [sym(vp9_rv) + rcx*2]
%endif

    paddw       xmm1, xmm4
    ;paddw       xmm1, eight8s
    psraw       xmm1, 4

    packuswb    xmm1, xmm0
    pand        xmm1, xmm3

    pandn       xmm3, xmm2
    por         xmm1, xmm3

    and         rcx, 15
    movq        QWORD PTR [rsp + rcx*8], xmm1 ;d[rcx*8]

    mov         rcx, rdx
    sub         rcx, 8

    and         rcx, 15
    movq        mm0, [rsp + rcx*8]  ;d[rcx*8]

    movq        [rsi], mm0
    lea         rsi, [rsi+rax]

    lea         rdi, [rdi+rax]
    add         rdx, 1

    cmp         edx, dword arg(2)   ;rows
    jl          .loop_row

    add         dword arg(0), 8     ; s += 8
    sub         dword arg(3), 8     ; cols -= 8
    cmp         dword arg(3), 0
    jg          .loop_col

    add         rsp, 128+16
    pop         rsp

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
%undef flimit4
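
In scalar terms, the column filter above gates on a 15-tap windowed variance; a hedged sketch of the per-pixel decision (it mirrors the C reference that lands in vpx_dsp/deblock.c later in this change, so the names here are illustrative):

  /* sum and sumsq track s[-8*pitch] .. s[6*pitch]; rv is a vp9_rv
   * dither entry. */
  int blurred = (rv + sum + s0) >> 4;
  s_out = (sumsq * 15 - sum * sum < flimit) ? (unsigned char)blurred
                                            : s0;

The psubd/psrad-31 pairs compute the sign of (15*sumsq - sum*sum - flimit) per lane, turning that comparison into the byte mask used by pand/pandn.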

;void vp9_mbpost_proc_across_ip_xmm(unsigned char *src,
;                                   int pitch, int rows, int cols,int flimit)
global sym(vp9_mbpost_proc_across_ip_xmm) PRIVATE
sym(vp9_mbpost_proc_across_ip_xmm):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16

    ; create flimit4 at [rsp]
    mov         eax, dword ptr arg(4) ;flimit
    mov         [rsp], eax
    mov         [rsp+4], eax
    mov         [rsp+8], eax
    mov         [rsp+12], eax
%define flimit4 [rsp]

    ;for(r=0;r<rows;r++)
.ip_row_loop:

    xor         rdx, rdx    ;sumsq=0;
    xor         rcx, rcx    ;sum=0;
    mov         rsi, arg(0) ; s
    mov         rdi, -8
.ip_var_loop:
    ;for(i=-8;i<=6;i++)
    ;{
    ;    sumsq += s[i]*s[i];
    ;    sum   += s[i];
    ;}
    movzx       eax, byte [rsi+rdi]
    add         ecx, eax
    mul         al
    add         edx, eax
    add         rdi, 1
    cmp         rdi, 6
    jle         .ip_var_loop

    ;mov        rax, sumsq
    ;movd       xmm7, rax
    movd        xmm7, edx

    ;mov        rax, sum
    ;movd       xmm6, rax
    movd        xmm6, ecx

    mov         rsi, arg(0) ;s
    xor         rcx, rcx

    movsxd      rdx, dword arg(3) ;cols
    add         rdx, 8
    pxor        mm0, mm0
    pxor        mm1, mm1

    pxor        xmm0, xmm0
.nextcol4:

    movd        xmm1, DWORD PTR [rsi+rcx-8]  ; -8 -7 -6 -5
    movd        xmm2, DWORD PTR [rsi+rcx+7]  ; +7 +8 +9 +10

    punpcklbw   xmm1, xmm0  ; expanding
    punpcklbw   xmm2, xmm0  ; expanding

    punpcklwd   xmm1, xmm0  ; expanding to dwords
    punpcklwd   xmm2, xmm0  ; expanding to dwords

    psubd       xmm2, xmm1  ; 7--8 8--7 9--6 10--5
    paddd       xmm1, xmm1  ; -8*2 -7*2 -6*2 -5*2

    paddd       xmm1, xmm2  ; 7+-8 8+-7 9+-6 10+-5
    pmaddwd     xmm1, xmm2  ; squared of 7+-8 8+-7 9+-6 10+-5

    paddd       xmm6, xmm2
    paddd       xmm7, xmm1

    pshufd      xmm6, xmm6, 0 ; duplicate the last ones
    pshufd      xmm7, xmm7, 0 ; duplicate the last ones

    psrldq      xmm1, 4       ; 8--7 9--6 10--5 0000
    psrldq      xmm2, 4       ; 8--7 9--6 10--5 0000

    pshufd      xmm3, xmm1, 3 ; 0000 8--7 8--7 8--7 squared
    pshufd      xmm4, xmm2, 3 ; 0000 8--7 8--7 8--7 squared

    paddd       xmm6, xmm4
    paddd       xmm7, xmm3

    pshufd      xmm3, xmm1, 01011111b ; 0000 0000 9--6 9--6 squared
    pshufd      xmm4, xmm2, 01011111b ; 0000 0000 9--6 9--6 squared

    paddd       xmm7, xmm3
    paddd       xmm6, xmm4

    pshufd      xmm3, xmm1, 10111111b ; 0000 0000 8--7 8--7 squared
    pshufd      xmm4, xmm2, 10111111b ; 0000 0000 8--7 8--7 squared

    paddd       xmm7, xmm3
    paddd       xmm6, xmm4

    movdqa      xmm3, xmm6
    pmaddwd     xmm3, xmm3

    movdqa      xmm5, xmm7
    pslld       xmm5, 4

    psubd       xmm5, xmm7
    psubd       xmm5, xmm3

    psubd       xmm5, flimit4
    psrad       xmm5, 31

    packssdw    xmm5, xmm0
    packsswb    xmm5, xmm0

    movd        xmm1, DWORD PTR [rsi+rcx]
    movq        xmm2, xmm1

    punpcklbw   xmm1, xmm0
    punpcklwd   xmm1, xmm0

    paddd       xmm1, xmm6
    paddd       xmm1, [GLOBAL(four8s)]

    psrad       xmm1, 4
    packssdw    xmm1, xmm0

    packuswb    xmm1, xmm0
    pand        xmm1, xmm5

    pandn       xmm5, xmm2
    por         xmm5, xmm1

    movd        [rsi+rcx-8], mm0
    movq        mm0, mm1

    movdq2q     mm1, xmm5
    psrldq      xmm7, 12

    psrldq      xmm6, 12
    add         rcx, 4

    cmp         rcx, rdx
    jl          .nextcol4

    ;s+=pitch;
    movsxd      rax, dword arg(1)
    add         arg(0), rax

    sub         dword arg(2), 1 ;rows-=1
    cmp         dword arg(2), 0
    jg          .ip_row_loop

    add         rsp, 16
    pop         rsp

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
%undef flimit4

SECTION_RODATA
align 16
rd42:
    times 8 dw 0x04
four8s:
    times 4 dd 8

@@ -3060,7 +3060,11 @@ static void set_size_dependent_vars(VP9_COMP *cpi, int *q,
        l = 150;
        break;
    }
    vp9_denoise(cpi->Source, cpi->Source, l);
    if (!cpi->common.postproc_state.limits) {
      cpi->common.postproc_state.limits = vpx_calloc(
          cpi->common.width, sizeof(*cpi->common.postproc_state.limits));
    }
    vp9_denoise(cpi->Source, cpi->Source, l, cpi->common.postproc_state.limits);
  }
#endif  // CONFIG_VP9_POSTPROC
}
@@ -4649,7 +4653,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
    }

    vp9_deblock(cm->frame_to_show, pp,
                cm->lf.filter_level * 10 / 6);
                cm->lf.filter_level * 10 / 6, cm->postproc_state.limits);
#endif
    vpx_clear_system_state();

@@ -67,7 +67,6 @@ VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_mfqe.h
VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_mfqe.c
ifeq ($(CONFIG_VP9_POSTPROC),yes)
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_mfqe_sse2.asm
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_postproc_sse2.asm
endif

ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
@@ -992,6 +992,7 @@ static vpx_codec_frame_flags_t get_frame_pkt_flags(const VP9_COMP *cpi,
  return flags;
}

const size_t kMinCompressedSize = 8192;
static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
                                      const vpx_image_t *img,
                                      vpx_codec_pts_t pts,
@@ -1013,8 +1014,8 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
    // instance for its status to determine the compressed data size.
    data_sz = ctx->cfg.g_w * ctx->cfg.g_h * get_image_bps(img) / 8 *
              (cpi->multi_arf_allowed ? 8 : 2);
    if (data_sz < 4096)
      data_sz = 4096;
    if (data_sz < kMinCompressedSize)
      data_sz = kMinCompressedSize;
    if (ctx->cx_data == NULL || ctx->cx_data_sz < data_sz) {
      ctx->cx_data_sz = data_sz;
      free(ctx->cx_data);
@@ -101,7 +101,6 @@ ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_block_error_intrin_sse2.c
endif

ifeq ($(CONFIG_USE_X86INC),yes)
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.asm
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_error_sse2.asm
@@ -109,13 +108,10 @@ VP9_CX_SRCS-$(HAVE_AVX) += encoder/x86/vp9_highbd_error_avx.asm
else
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_error_sse2.asm
endif
endif

ifeq ($(ARCH_X86_64),yes)
ifeq ($(CONFIG_USE_X86INC),yes)
VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_quantize_ssse3_x86_64.asm
endif
endif

VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_intrin_sse2.c
VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_dct_ssse3.c
@@ -397,13 +397,6 @@ vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx,
  si->width = enc_cfg->g_w;
  si->height = enc_cfg->g_h;

  // wonkap: why is this necessary?
  /*if (enc_cfg->kf_max_dist < 2) {
    svc_log(svc_ctx, SVC_LOG_ERROR, "key frame distance too small: %d\n",
            enc_cfg->kf_max_dist);
    return VPX_CODEC_INVALID_PARAM;
  }*/

  si->kf_dist = enc_cfg->kf_max_dist;

  if (svc_ctx->spatial_layers == 0)
@@ -8,6 +8,7 @@
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <math.h>
#include <stdlib.h>

#include "./vpx_config.h"
@@ -23,11 +24,11 @@ void vpx_plane_add_noise_c(uint8_t *start, char *noise,
                           unsigned int width, unsigned int height, int pitch) {
  unsigned int i, j;

  for (i = 0; i < height; i++) {
  for (i = 0; i < height; ++i) {
    uint8_t *pos = start + i * pitch;
    char *ref = (char *)(noise + (rand() & 0xff));  // NOLINT

    for (j = 0; j < width; j++) {
    for (j = 0; j < width; ++j) {
      int v = pos[j];

      v = clamp(v - blackclamp[0], 0, 255);
@@ -38,3 +39,36 @@ void vpx_plane_add_noise_c(uint8_t *start, char *noise,
    }
  }
}

static double gaussian(double sigma, double mu, double x) {
  return 1 / (sigma * sqrt(2.0 * 3.14159265)) *
         (exp(-(x - mu) * (x - mu) / (2 * sigma * sigma)));
}

int vpx_setup_noise(double sigma, int size, char *noise) {
  char char_dist[256];
  int next = 0, i, j;

  // set up a 256 entry lookup that matches gaussian distribution
  for (i = -32; i < 32; ++i) {
    const int a_i = (int)(0.5 + 256 * gaussian(sigma, 0, i));
    if (a_i) {
      for (j = 0; j < a_i; ++j) {
        char_dist[next + j] = (char)i;
      }
      next = next + j;
    }
  }

  // Rounding error - might mean we have less than 256.
  for (; next < 256; ++next) {
    char_dist[next] = 0;
  }

  for (i = 0; i < size; ++i) {
    noise[i] = char_dist[rand() & 0xff];  // NOLINT
  }

  // Returns the highest non 0 value used in distribution.
  return -char_dist[0];
}
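
A hedged usage sketch for the helper above (buffer size and sigma are illustrative values, not anything mandated by libvpx):

  char noise[3072];
  const double sigma = 4.0;  /* noise strength */
  const int max_val = vpx_setup_noise(sigma, (int)sizeof(noise), noise);
  /* max_val is the largest magnitude placed in the table; callers can
   * derive the black/white clamping ranges from it before sprinkling
   * the buffer over a plane with vpx_plane_add_noise_c(). */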

vpx_dsp/deblock.c (new file)
@@ -0,0 +1,203 @@
/*
 *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
#include <stdlib.h>
#include "vpx/vpx_integer.h"

const int16_t vpx_rv[] = {8, 5, 2, 2, 8, 12, 4, 9, 8, 3, 0, 3, 9, 0, 0, 0, 8, 3,
    14, 4, 10, 1, 11, 14, 1, 14, 9, 6, 12, 11, 8, 6, 10, 0, 0, 8, 9, 0, 3, 14,
    8, 11, 13, 4, 2, 9, 0, 3, 9, 6, 1, 2, 3, 14, 13, 1, 8, 2, 9, 7, 3, 3, 1, 13,
    13, 6, 6, 5, 2, 7, 11, 9, 11, 8, 7, 3, 2, 0, 13, 13, 14, 4, 12, 5, 12, 10,
    8, 10, 13, 10, 4, 14, 4, 10, 0, 8, 11, 1, 13, 7, 7, 14, 6, 14, 13, 2, 13, 5,
    4, 4, 0, 10, 0, 5, 13, 2, 12, 7, 11, 13, 8, 0, 4, 10, 7, 2, 7, 2, 2, 5, 3,
    4, 7, 3, 3, 14, 14, 5, 9, 13, 3, 14, 3, 6, 3, 0, 11, 8, 13, 1, 13, 1, 12, 0,
    10, 9, 7, 6, 2, 8, 5, 2, 13, 7, 1, 13, 14, 7, 6, 7, 9, 6, 10, 11, 7, 8, 7,
    5, 14, 8, 4, 4, 0, 8, 7, 10, 0, 8, 14, 11, 3, 12, 5, 7, 14, 3, 14, 5, 2, 6,
    11, 12, 12, 8, 0, 11, 13, 1, 2, 0, 5, 10, 14, 7, 8, 0, 4, 11, 0, 8, 0, 3,
    10, 5, 8, 0, 11, 6, 7, 8, 10, 7, 13, 9, 2, 5, 1, 5, 10, 2, 4, 3, 5, 6, 10,
    8, 9, 4, 11, 14, 0, 10, 0, 5, 13, 2, 12, 7, 11, 13, 8, 0, 4, 10, 7, 2, 7, 2,
    2, 5, 3, 4, 7, 3, 3, 14, 14, 5, 9, 13, 3, 14, 3, 6, 3, 0, 11, 8, 13, 1, 13,
    1, 12, 0, 10, 9, 7, 6, 2, 8, 5, 2, 13, 7, 1, 13, 14, 7, 6, 7, 9, 6, 10, 11,
    7, 8, 7, 5, 14, 8, 4, 4, 0, 8, 7, 10, 0, 8, 14, 11, 3, 12, 5, 7, 14, 3, 14,
    5, 2, 6, 11, 12, 12, 8, 0, 11, 13, 1, 2, 0, 5, 10, 14, 7, 8, 0, 4, 11, 0, 8,
    0, 3, 10, 5, 8, 0, 11, 6, 7, 8, 10, 7, 13, 9, 2, 5, 1, 5, 10, 2, 4, 3, 5, 6,
    10, 8, 9, 4, 11, 14, 3, 8, 3, 7, 8, 5, 11, 4, 12, 3, 11, 9, 14, 8, 14, 13,
    4, 3, 1, 2, 14, 6, 5, 4, 4, 11, 4, 6, 2, 1, 5, 8, 8, 12, 13, 5, 14, 10, 12,
    13, 0, 9, 5, 5, 11, 10, 13, 9, 10, 13, };

void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src_ptr,
                                            unsigned char *dst_ptr,
                                            int src_pixels_per_line,
                                            int dst_pixels_per_line, int cols,
                                            unsigned char *f, int size) {
  unsigned char *p_src, *p_dst;
  int row;
  int col;
  unsigned char v;
  unsigned char d[4];

  for (row = 0; row < size; row++) {
    /* post_proc_down for one row */
    p_src = src_ptr;
    p_dst = dst_ptr;

    for (col = 0; col < cols; col++) {
      unsigned char p_above2 = p_src[col - 2 * src_pixels_per_line];
      unsigned char p_above1 = p_src[col - src_pixels_per_line];
      unsigned char p_below1 = p_src[col + src_pixels_per_line];
      unsigned char p_below2 = p_src[col + 2 * src_pixels_per_line];

      v = p_src[col];

      if ((abs(v - p_above2) < f[col]) && (abs(v - p_above1) < f[col])
          && (abs(v - p_below1) < f[col]) && (abs(v - p_below2) < f[col])) {
        unsigned char k1, k2, k3;
        k1 = (p_above2 + p_above1 + 1) >> 1;
        k2 = (p_below2 + p_below1 + 1) >> 1;
        k3 = (k1 + k2 + 1) >> 1;
        v = (k3 + v + 1) >> 1;
      }

      p_dst[col] = v;
    }

    /* now post_proc_across */
    p_src = dst_ptr;
    p_dst = dst_ptr;

    p_src[-2] = p_src[-1] = p_src[0];
    p_src[cols] = p_src[cols + 1] = p_src[cols - 1];

    for (col = 0; col < cols; col++) {
      v = p_src[col];

      if ((abs(v - p_src[col - 2]) < f[col])
          && (abs(v - p_src[col - 1]) < f[col])
          && (abs(v - p_src[col + 1]) < f[col])
          && (abs(v - p_src[col + 2]) < f[col])) {
        unsigned char k1, k2, k3;
        k1 = (p_src[col - 2] + p_src[col - 1] + 1) >> 1;
        k2 = (p_src[col + 2] + p_src[col + 1] + 1) >> 1;
        k3 = (k1 + k2 + 1) >> 1;
        v = (k3 + v + 1) >> 1;
      }

      d[col & 3] = v;

      if (col >= 2)
        p_dst[col - 2] = d[(col - 2) & 3];
    }

    /* handle the last two pixels */
    p_dst[col - 2] = d[(col - 2) & 3];
    p_dst[col - 1] = d[(col - 1) & 3];

    /* next row */
    src_ptr += src_pixels_per_line;
    dst_ptr += dst_pixels_per_line;
  }
}
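
A worked instance of the kernel above, assuming f[col] is at least 11 so every tap passes the threshold test: with the five-pixel column 10, 12, 20, 14, 16 centered on v = 20,

  k1 = (10 + 12 + 1) >> 1 = 11
  k2 = (16 + 14 + 1) >> 1 = 15
  k3 = (11 + 15 + 1) >> 1 = 13
  v  = (13 + 20 + 1) >> 1 = 17

so the outlier is pulled toward its neighborhood average using only adds and shifts.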

void vpx_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows,
                                 int cols, int flimit) {
  int r, c, i;

  unsigned char *s = src;
  unsigned char d[16];

  for (r = 0; r < rows; r++) {
    int sumsq = 0;
    int sum = 0;

    for (i = -8; i < 0; i++)
      s[i] = s[0];

    /* 17 avoids valgrind warning - we buffer values in c in d
     * and only write them when we've read 8 ahead...
     */
    for (i = 0; i < 17; i++)
      s[i + cols] = s[cols - 1];

    for (i = -8; i <= 6; i++) {
      sumsq += s[i] * s[i];
      sum += s[i];
      d[i + 8] = 0;
    }

    for (c = 0; c < cols + 8; c++) {
      int x = s[c + 7] - s[c - 8];
      int y = s[c + 7] + s[c - 8];

      sum += x;
      sumsq += x * y;

      d[c & 15] = s[c];

      if (sumsq * 15 - sum * sum < flimit) {
        d[c & 15] = (8 + sum + s[c]) >> 4;
      }

      s[c - 8] = d[(c - 8) & 15];
    }

    s += pitch;
  }
}
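
The O(1) window update above leans on the identity (a - b) * (a + b) = a*a - b*b: with a = s[c + 7] and b = s[c - 8], the single multiply x * y adds the square of the pixel entering the 15-tap window and retires the square of the one leaving it in one step, while sum is maintained with x alone.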

void vpx_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,
                            int flimit) {
  int r, c, i;
  const int16_t *rv3 = &vpx_rv[63 & rand()];

  for (c = 0; c < cols; c++) {
    unsigned char *s = &dst[c];
    int sumsq = 0;
    int sum = 0;
    unsigned char d[16];
    const int16_t *rv2 = rv3 + ((c * 17) & 127);

    for (i = -8; i < 0; i++)
      s[i * pitch] = s[0];

    /* 17 avoids valgrind warning - we buffer values in c in d
     * and only write them when we've read 8 ahead...
     */
    for (i = 0; i < 17; i++)
      s[(i + rows) * pitch] = s[(rows - 1) * pitch];

    for (i = -8; i <= 6; i++) {
      sumsq += s[i * pitch] * s[i * pitch];
      sum += s[i * pitch];
    }

    for (r = 0; r < rows + 8; r++) {
      sumsq += s[7 * pitch] * s[7 * pitch] - s[-8 * pitch] * s[-8 * pitch];
      sum += s[7 * pitch] - s[-8 * pitch];
      d[r & 15] = s[0];

      if (sumsq * 15 - sum * sum < flimit) {
        d[r & 15] = (rv2[r & 127] + sum + s[0]) >> 4;
      }
      if (r >= 8)
        s[-8 * pitch] = d[(r - 8) & 15];
      s += pitch;
    }
  }
}

#if CONFIG_POSTPROC
static void vpx_de_mblock(YV12_BUFFER_CONFIG *post,
                          int q) {
  vpx_mbpost_proc_across_ip(post->y_buffer, post->y_stride, post->y_height,
                            post->y_width, q2mbl(q));
  vpx_mbpost_proc_down(post->y_buffer, post->y_stride, post->y_height,
                       post->y_width, q2mbl(q));
}

#endif

vpx_dsp/mips/deblock_msa.c (new file)
@@ -0,0 +1,682 @@
/*
 *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <stdlib.h>
#include "./macros_msa.h"

extern const int16_t vpx_rv[];

#define VPX_TRANSPOSE8x16_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7,  \
                                out0, out1, out2, out3,                  \
                                out4, out5, out6, out7,                  \
                                out8, out9, out10, out11,                \
                                out12, out13, out14, out15)              \
{                                                                        \
  v8i16 temp0, temp1, temp2, temp3, temp4;                               \
  v8i16 temp5, temp6, temp7, temp8, temp9;                               \
                                                                         \
  ILVR_B4_SH(in1, in0, in3, in2, in5, in4, in7, in6,                     \
             temp0, temp1, temp2, temp3);                                \
  ILVR_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5);                  \
  ILVRL_W2_SH(temp5, temp4, temp6, temp7);                               \
  ILVL_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5);                  \
  ILVRL_W2_SH(temp5, temp4, temp8, temp9);                               \
  ILVL_B4_SH(in1, in0, in3, in2, in5, in4, in7, in6,                     \
             temp0, temp1, temp2, temp3);                                \
  ILVR_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5);                  \
  ILVRL_W2_UB(temp5, temp4, out8, out10);                                \
  ILVL_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5);                  \
  ILVRL_W2_UB(temp5, temp4, out12, out14);                               \
  out0 = (v16u8)temp6;                                                   \
  out2 = (v16u8)temp7;                                                   \
  out4 = (v16u8)temp8;                                                   \
  out6 = (v16u8)temp9;                                                   \
  out9 = (v16u8)__msa_ilvl_d((v2i64)out8, (v2i64)out8);                  \
  out11 = (v16u8)__msa_ilvl_d((v2i64)out10, (v2i64)out10);               \
  out13 = (v16u8)__msa_ilvl_d((v2i64)out12, (v2i64)out12);               \
  out15 = (v16u8)__msa_ilvl_d((v2i64)out14, (v2i64)out14);               \
  out1 = (v16u8)__msa_ilvl_d((v2i64)out0, (v2i64)out0);                  \
  out3 = (v16u8)__msa_ilvl_d((v2i64)out2, (v2i64)out2);                  \
  out5 = (v16u8)__msa_ilvl_d((v2i64)out4, (v2i64)out4);                  \
  out7 = (v16u8)__msa_ilvl_d((v2i64)out6, (v2i64)out6);                  \
}

#define VPX_AVER_IF_RETAIN(above2_in, above1_in, src_in,  \
                           below1_in, below2_in, ref, out)  \
{                                                           \
  v16u8 temp0, temp1;                                       \
                                                            \
  temp1 = __msa_aver_u_b(above2_in, above1_in);             \
  temp0 = __msa_aver_u_b(below2_in, below1_in);             \
  temp1 = __msa_aver_u_b(temp1, temp0);                     \
  out = __msa_aver_u_b(src_in, temp1);                      \
  temp0 = __msa_asub_u_b(src_in, above2_in);                \
  temp1 = __msa_asub_u_b(src_in, above1_in);                \
  temp0 = (temp0 < ref);                                    \
  temp1 = (temp1 < ref);                                    \
  temp0 = temp0 & temp1;                                    \
  temp1 = __msa_asub_u_b(src_in, below1_in);                \
  temp1 = (temp1 < ref);                                    \
  temp0 = temp0 & temp1;                                    \
  temp1 = __msa_asub_u_b(src_in, below2_in);                \
  temp1 = (temp1 < ref);                                    \
  temp0 = temp0 & temp1;                                    \
  out = __msa_bmz_v(out, src_in, temp0);                    \
}
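
A scalar reading of what VPX_AVER_IF_RETAIN computes per byte lane (a sketch of my interpretation, not code from the tree; __msa_aver_u_b rounds up, i.e. (x + y + 1) >> 1):

  static unsigned char aver_if_retain(unsigned char a2, unsigned char a1,
                                      unsigned char s, unsigned char b1,
                                      unsigned char b2, unsigned char ref) {
    const int k1 = (a2 + a1 + 1) >> 1;
    const int k2 = (b2 + b1 + 1) >> 1;
    const int avg = (s + ((k1 + k2 + 1) >> 1) + 1) >> 1;
    const int keep = abs(s - a2) < ref && abs(s - a1) < ref &&
                     abs(s - b1) < ref && abs(s - b2) < ref;
    return (unsigned char)(keep ? avg : s);
  }

This matches the C reference kernel in vpx_dsp/deblock.c above, with __msa_bmz_v performing the final keep-or-copy select.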

#define TRANSPOSE12x16_B(in0, in1, in2, in3, in4, in5, in6, in7,        \
                         in8, in9, in10, in11, in12, in13, in14, in15)  \
{                                                                       \
  v8i16 temp0, temp1, temp2, temp3, temp4;                              \
  v8i16 temp5, temp6, temp7, temp8, temp9;                              \
                                                                        \
  ILVR_B2_SH(in1, in0, in3, in2, temp0, temp1);                         \
  ILVRL_H2_SH(temp1, temp0, temp2, temp3);                              \
  ILVR_B2_SH(in5, in4, in7, in6, temp0, temp1);                         \
  ILVRL_H2_SH(temp1, temp0, temp4, temp5);                              \
  ILVRL_W2_SH(temp4, temp2, temp0, temp1);                              \
  ILVRL_W2_SH(temp5, temp3, temp2, temp3);                              \
  ILVR_B2_SH(in9, in8, in11, in10, temp4, temp5);                       \
  ILVRL_H2_SH(temp5, temp4, temp6, temp7);                              \
  ILVR_B2_SH(in13, in12, in15, in14, temp4, temp5);                     \
  ILVRL_H2_SH(temp5, temp4, temp8, temp9);                              \
  ILVRL_W2_SH(temp8, temp6, temp4, temp5);                              \
  ILVRL_W2_SH(temp9, temp7, temp6, temp7);                              \
  ILVL_B2_SH(in1, in0, in3, in2, temp8, temp9);                         \
  ILVR_D2_UB(temp4, temp0, temp5, temp1, in0, in2);                     \
  in1 = (v16u8)__msa_ilvl_d((v2i64)temp4, (v2i64)temp0);                \
  in3 = (v16u8)__msa_ilvl_d((v2i64)temp5, (v2i64)temp1);                \
  ILVL_B2_SH(in5, in4, in7, in6, temp0, temp1);                         \
  ILVR_D2_UB(temp6, temp2, temp7, temp3, in4, in6);                     \
  in5 = (v16u8)__msa_ilvl_d((v2i64)temp6, (v2i64)temp2);                \
  in7 = (v16u8)__msa_ilvl_d((v2i64)temp7, (v2i64)temp3);                \
  ILVL_B4_SH(in9, in8, in11, in10, in13, in12, in15, in14,              \
             temp2, temp3, temp4, temp5);                               \
  ILVR_H4_SH(temp9, temp8, temp1, temp0, temp3, temp2, temp5, temp4,    \
             temp6, temp7, temp8, temp9);                               \
  ILVR_W2_SH(temp7, temp6, temp9, temp8, temp0, temp1);                 \
  in8 = (v16u8)__msa_ilvr_d((v2i64)temp1, (v2i64)temp0);                \
  in9 = (v16u8)__msa_ilvl_d((v2i64)temp1, (v2i64)temp0);                \
  ILVL_W2_SH(temp7, temp6, temp9, temp8, temp2, temp3);                 \
  in10 = (v16u8)__msa_ilvr_d((v2i64)temp3, (v2i64)temp2);               \
  in11 = (v16u8)__msa_ilvl_d((v2i64)temp3, (v2i64)temp2);               \
}

#define VPX_TRANSPOSE12x8_UB_UB(in0, in1, in2, in3, in4, in5,  \
                                in6, in7, in8, in9, in10, in11)  \
{                                                                \
  v8i16 temp0, temp1, temp2, temp3;                              \
  v8i16 temp4, temp5, temp6, temp7;                              \
                                                                 \
  ILVR_B2_SH(in1, in0, in3, in2, temp0, temp1);                  \
  ILVRL_H2_SH(temp1, temp0, temp2, temp3);                       \
  ILVR_B2_SH(in5, in4, in7, in6, temp0, temp1);                  \
  ILVRL_H2_SH(temp1, temp0, temp4, temp5);                       \
  ILVRL_W2_SH(temp4, temp2, temp0, temp1);                       \
  ILVRL_W2_SH(temp5, temp3, temp2, temp3);                       \
  ILVL_B2_SH(in1, in0, in3, in2, temp4, temp5);                  \
  temp4 = __msa_ilvr_h(temp5, temp4);                            \
  ILVL_B2_SH(in5, in4, in7, in6, temp6, temp7);                  \
  temp5 = __msa_ilvr_h(temp7, temp6);                            \
  ILVRL_W2_SH(temp5, temp4, temp6, temp7);                       \
  in0 = (v16u8)temp0;                                            \
  in2 = (v16u8)temp1;                                            \
  in4 = (v16u8)temp2;                                            \
  in6 = (v16u8)temp3;                                            \
  in8 = (v16u8)temp6;                                            \
  in10 = (v16u8)temp7;                                           \
  in1 = (v16u8)__msa_ilvl_d((v2i64)temp0, (v2i64)temp0);         \
  in3 = (v16u8)__msa_ilvl_d((v2i64)temp1, (v2i64)temp1);         \
  in5 = (v16u8)__msa_ilvl_d((v2i64)temp2, (v2i64)temp2);         \
  in7 = (v16u8)__msa_ilvl_d((v2i64)temp3, (v2i64)temp3);         \
  in9 = (v16u8)__msa_ilvl_d((v2i64)temp6, (v2i64)temp6);         \
  in11 = (v16u8)__msa_ilvl_d((v2i64)temp7, (v2i64)temp7);        \
}

static void postproc_down_across_chroma_msa(uint8_t *src_ptr, uint8_t *dst_ptr,
                                            int32_t src_stride,
                                            int32_t dst_stride, int32_t cols,
                                            uint8_t *f) {
  uint8_t *p_src = src_ptr;
  uint8_t *p_dst = dst_ptr;
  uint8_t *f_orig = f;
  uint8_t *p_dst_st = dst_ptr;
  uint16_t col;
  uint64_t out0, out1, out2, out3;
  v16u8 above2, above1, below2, below1, src, ref, ref_temp;
  v16u8 inter0, inter1, inter2, inter3, inter4, inter5;
  v16u8 inter6, inter7, inter8, inter9, inter10, inter11;

  for (col = (cols / 16); col--;) {
    ref = LD_UB(f);
    LD_UB2(p_src - 2 * src_stride, src_stride, above2, above1);
    src = LD_UB(p_src);
    LD_UB2(p_src + 1 * src_stride, src_stride, below1, below2);
    VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter0);
    above2 = LD_UB(p_src + 3 * src_stride);
    VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter1);
    above1 = LD_UB(p_src + 4 * src_stride);
    VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter2);
    src = LD_UB(p_src + 5 * src_stride);
    VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter3);
    below1 = LD_UB(p_src + 6 * src_stride);
    VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter4);
    below2 = LD_UB(p_src + 7 * src_stride);
    VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter5);
    above2 = LD_UB(p_src + 8 * src_stride);
    VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter6);
    above1 = LD_UB(p_src + 9 * src_stride);
    VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter7);
    ST_UB8(inter0, inter1, inter2, inter3, inter4, inter5, inter6, inter7,
           p_dst, dst_stride);

    p_dst += 16;
    p_src += 16;
    f += 16;
  }

  if (0 != (cols / 16)) {
    ref = LD_UB(f);
    LD_UB2(p_src - 2 * src_stride, src_stride, above2, above1);
    src = LD_UB(p_src);
    LD_UB2(p_src + 1 * src_stride, src_stride, below1, below2);
    VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter0);
    above2 = LD_UB(p_src + 3 * src_stride);
    VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter1);
    above1 = LD_UB(p_src + 4 * src_stride);
    VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter2);
    src = LD_UB(p_src + 5 * src_stride);
    VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter3);
    below1 = LD_UB(p_src + 6 * src_stride);
    VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter4);
    below2 = LD_UB(p_src + 7 * src_stride);
    VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter5);
    above2 = LD_UB(p_src + 8 * src_stride);
    VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter6);
    above1 = LD_UB(p_src + 9 * src_stride);
    VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter7);
    out0 = __msa_copy_u_d((v2i64) inter0, 0);
    out1 = __msa_copy_u_d((v2i64) inter1, 0);
    out2 = __msa_copy_u_d((v2i64) inter2, 0);
    out3 = __msa_copy_u_d((v2i64) inter3, 0);
    SD4(out0, out1, out2, out3, p_dst, dst_stride);

    out0 = __msa_copy_u_d((v2i64) inter4, 0);
    out1 = __msa_copy_u_d((v2i64) inter5, 0);
    out2 = __msa_copy_u_d((v2i64) inter6, 0);
    out3 = __msa_copy_u_d((v2i64) inter7, 0);
    SD4(out0, out1, out2, out3, p_dst + 4 * dst_stride, dst_stride);
  }

  f = f_orig;
  p_dst = dst_ptr - 2;
  LD_UB8(p_dst, dst_stride, inter0, inter1, inter2, inter3, inter4, inter5,
         inter6, inter7);

  for (col = 0; col < (cols / 8); ++col) {
    ref = LD_UB(f);
    f += 8;
    VPX_TRANSPOSE12x8_UB_UB(inter0, inter1, inter2, inter3, inter4, inter5,
                            inter6, inter7, inter8, inter9, inter10, inter11);
    if (0 == col) {
      above2 = inter2;
      above1 = inter2;
    } else {
      above2 = inter0;
      above1 = inter1;
    }
    src = inter2;
    below1 = inter3;
    below2 = inter4;
    ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 0);
    VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref_temp, inter2);
    above2 = inter5;
    ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 1);
    VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref_temp, inter3);
    above1 = inter6;
    ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 2);
    VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref_temp, inter4);
    src = inter7;
    ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 3);
    VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref_temp, inter5);
    below1 = inter8;
    ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 4);
    VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref_temp, inter6);
    below2 = inter9;
    ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 5);
    VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref_temp, inter7);
    if (col == (cols / 8 - 1)) {
      above2 = inter9;
    } else {
      above2 = inter10;
    }
    ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 6);
    VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref_temp, inter8);
    if (col == (cols / 8 - 1)) {
      above1 = inter9;
    } else {
      above1 = inter11;
    }
    ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 7);
    VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref_temp, inter9);
    TRANSPOSE8x8_UB_UB(inter2, inter3, inter4, inter5, inter6, inter7, inter8,
                       inter9, inter2, inter3, inter4, inter5, inter6, inter7,
                       inter8, inter9);
    p_dst += 8;
    LD_UB2(p_dst, dst_stride, inter0, inter1);
    ST8x1_UB(inter2, p_dst_st);
    ST8x1_UB(inter3, (p_dst_st + 1 * dst_stride));
    LD_UB2(p_dst + 2 * dst_stride, dst_stride, inter2, inter3);
    ST8x1_UB(inter4, (p_dst_st + 2 * dst_stride));
    ST8x1_UB(inter5, (p_dst_st + 3 * dst_stride));
    LD_UB2(p_dst + 4 * dst_stride, dst_stride, inter4, inter5);
    ST8x1_UB(inter6, (p_dst_st + 4 * dst_stride));
    ST8x1_UB(inter7, (p_dst_st + 5 * dst_stride));
    LD_UB2(p_dst + 6 * dst_stride, dst_stride, inter6, inter7);
    ST8x1_UB(inter8, (p_dst_st + 6 * dst_stride));
    ST8x1_UB(inter9, (p_dst_st + 7 * dst_stride));
    p_dst_st += 8;
  }
}

static void postproc_down_across_luma_msa(uint8_t *src_ptr, uint8_t *dst_ptr,
                                          int32_t src_stride,
                                          int32_t dst_stride, int32_t cols,
                                          uint8_t *f) {
  uint8_t *p_src = src_ptr;
  uint8_t *p_dst = dst_ptr;
  uint8_t *p_dst_st = dst_ptr;
  uint8_t *f_orig = f;
  uint16_t col;
  v16u8 above2, above1, below2, below1;
  v16u8 src, ref, ref_temp;
  v16u8 inter0, inter1, inter2, inter3, inter4, inter5, inter6;
  v16u8 inter7, inter8, inter9, inter10, inter11;
  v16u8 inter12, inter13, inter14, inter15;

  for (col = (cols / 16); col--;) {
    ref = LD_UB(f);
    LD_UB2(p_src - 2 * src_stride, src_stride, above2, above1);
    src = LD_UB(p_src);
    LD_UB2(p_src + 1 * src_stride, src_stride, below1, below2);
    VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter0);
    above2 = LD_UB(p_src + 3 * src_stride);
    VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter1);
    above1 = LD_UB(p_src + 4 * src_stride);
    VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter2);
    src = LD_UB(p_src + 5 * src_stride);
    VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter3);
    below1 = LD_UB(p_src + 6 * src_stride);
    VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter4);
    below2 = LD_UB(p_src + 7 * src_stride);
    VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter5);
    above2 = LD_UB(p_src + 8 * src_stride);
    VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter6);
    above1 = LD_UB(p_src + 9 * src_stride);
    VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter7);
    src = LD_UB(p_src + 10 * src_stride);
    VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter8);
    below1 = LD_UB(p_src + 11 * src_stride);
    VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter9);
    below2 = LD_UB(p_src + 12 * src_stride);
    VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter10);
    above2 = LD_UB(p_src + 13 * src_stride);
    VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter11);
    above1 = LD_UB(p_src + 14 * src_stride);
    VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter12);
    src = LD_UB(p_src + 15 * src_stride);
    VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter13);
    below1 = LD_UB(p_src + 16 * src_stride);
    VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter14);
    below2 = LD_UB(p_src + 17 * src_stride);
    VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter15);
    ST_UB8(inter0, inter1, inter2, inter3, inter4, inter5, inter6, inter7,
           p_dst, dst_stride);
    ST_UB8(inter8, inter9, inter10, inter11, inter12, inter13, inter14, inter15,
           p_dst + 8 * dst_stride, dst_stride);
    p_src += 16;
    p_dst += 16;
    f += 16;
  }

  f = f_orig;
  p_dst = dst_ptr - 2;
  LD_UB8(p_dst, dst_stride, inter0, inter1, inter2, inter3, inter4, inter5,
         inter6, inter7);
  LD_UB8(p_dst + 8 * dst_stride, dst_stride, inter8, inter9, inter10, inter11,
         inter12, inter13, inter14, inter15);

  for (col = 0; col < cols / 8; ++col) {
    ref = LD_UB(f);
    f += 8;
    TRANSPOSE12x16_B(inter0, inter1, inter2, inter3, inter4, inter5, inter6,
                     inter7, inter8, inter9, inter10, inter11, inter12, inter13,
                     inter14, inter15);
    if (0 == col) {
      above2 = inter2;
      above1 = inter2;
    } else {
      above2 = inter0;
      above1 = inter1;
    }

    src = inter2;
    below1 = inter3;
    below2 = inter4;
    ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 0);
    VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref_temp, inter2);
    above2 = inter5;
    ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 1);
    VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref_temp, inter3);
    above1 = inter6;
    ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 2);
    VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref_temp, inter4);
    src = inter7;
    ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 3);
    VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref_temp, inter5);
    below1 = inter8;
    ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 4);
    VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref_temp, inter6);
    below2 = inter9;
    ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 5);
    VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref_temp, inter7);
    if (col == (cols / 8 - 1)) {
      above2 = inter9;
    } else {
      above2 = inter10;
    }
    ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 6);
    VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref_temp, inter8);
    if (col == (cols / 8 - 1)) {
      above1 = inter9;
    } else {
      above1 = inter11;
    }
    ref_temp = (v16u8) __msa_splati_b((v16i8) ref, 7);
    VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref_temp, inter9);
    VPX_TRANSPOSE8x16_UB_UB(inter2, inter3, inter4, inter5, inter6, inter7,
                            inter8, inter9, inter2, inter3, inter4, inter5,
                            inter6, inter7, inter8, inter9, inter10, inter11,
                            inter12, inter13, inter14, inter15, above2, above1);

    p_dst += 8;
    LD_UB2(p_dst, dst_stride, inter0, inter1);
    ST8x1_UB(inter2, p_dst_st);
    ST8x1_UB(inter3, (p_dst_st + 1 * dst_stride));
    LD_UB2(p_dst + 2 * dst_stride, dst_stride, inter2, inter3);
    ST8x1_UB(inter4, (p_dst_st + 2 * dst_stride));
    ST8x1_UB(inter5, (p_dst_st + 3 * dst_stride));
    LD_UB2(p_dst + 4 * dst_stride, dst_stride, inter4, inter5);
    ST8x1_UB(inter6, (p_dst_st + 4 * dst_stride));
    ST8x1_UB(inter7, (p_dst_st + 5 * dst_stride));
    LD_UB2(p_dst + 6 * dst_stride, dst_stride, inter6, inter7);
    ST8x1_UB(inter8, (p_dst_st + 6 * dst_stride));
    ST8x1_UB(inter9, (p_dst_st + 7 * dst_stride));
    LD_UB2(p_dst + 8 * dst_stride, dst_stride, inter8, inter9);
    ST8x1_UB(inter10, (p_dst_st + 8 * dst_stride));
    ST8x1_UB(inter11, (p_dst_st + 9 * dst_stride));
    LD_UB2(p_dst + 10 * dst_stride, dst_stride, inter10, inter11);
    ST8x1_UB(inter12, (p_dst_st + 10 * dst_stride));
    ST8x1_UB(inter13, (p_dst_st + 11 * dst_stride));
    LD_UB2(p_dst + 12 * dst_stride, dst_stride, inter12, inter13);
    ST8x1_UB(inter14, (p_dst_st + 12 * dst_stride));
    ST8x1_UB(inter15, (p_dst_st + 13 * dst_stride));
    LD_UB2(p_dst + 14 * dst_stride, dst_stride, inter14, inter15);
    ST8x1_UB(above2, (p_dst_st + 14 * dst_stride));
    ST8x1_UB(above1, (p_dst_st + 15 * dst_stride));
    p_dst_st += 8;
  }
}

void vpx_post_proc_down_and_across_mb_row_msa(uint8_t *src, uint8_t *dst,
                                              int32_t src_stride,
                                              int32_t dst_stride, int32_t cols,
                                              uint8_t *f, int32_t size) {
  if (8 == size) {
    postproc_down_across_chroma_msa(src, dst, src_stride, dst_stride, cols, f);
  } else if (16 == size) {
    postproc_down_across_luma_msa(src, dst, src_stride, dst_stride, cols, f);
  }
}

void vpx_mbpost_proc_across_ip_msa(uint8_t *src_ptr, int32_t pitch,
                                   int32_t rows, int32_t cols, int32_t flimit) {
  int32_t row, col, cnt;
  uint8_t *src_dup = src_ptr;
  v16u8 src0, src, tmp_orig;
  v16u8 tmp = {0};
  v16i8 zero = {0};
  v8u16 sum_h, src_r_h, src_l_h;
  v4u32 src_r_w, src_l_w;
  v4i32 flimit_vec;

  flimit_vec = __msa_fill_w(flimit);
  for (row = rows; row--;) {
    int32_t sum_sq = 0;
    int32_t sum = 0;
    src0 = (v16u8) __msa_fill_b(src_dup[0]);
    ST8x1_UB(src0, (src_dup - 8));

    src0 = (v16u8) __msa_fill_b(src_dup[cols - 1]);
    ST_UB(src0, src_dup + cols);
    src_dup[cols + 16] = src_dup[cols - 1];
    tmp_orig = (v16u8) __msa_ldi_b(0);
    tmp_orig[15] = tmp[15];
    src = LD_UB(src_dup - 8);
    src[15] = 0;
    ILVRL_B2_UH(zero, src, src_r_h, src_l_h);
    src_r_w = __msa_dotp_u_w(src_r_h, src_r_h);
    src_l_w = __msa_dotp_u_w(src_l_h, src_l_h);
    sum_sq = HADD_SW_S32(src_r_w);
    sum_sq += HADD_SW_S32(src_l_w);
    sum_h = __msa_hadd_u_h(src, src);
    sum = HADD_UH_U32(sum_h);
    {
      v16u8 src7, src8, src_r, src_l;
      v16i8 mask;
      v8u16 add_r, add_l;
      v8i16 sub_r, sub_l, sum_r, sum_l, mask0, mask1;
      v4i32 sum_sq0, sum_sq1, sum_sq2, sum_sq3;
      v4i32 sub0, sub1, sub2, sub3;
      v4i32 sum0_w, sum1_w, sum2_w, sum3_w;
      v4i32 mul0, mul1, mul2, mul3;
      v4i32 total0, total1, total2, total3;
      v8i16 const8 = __msa_fill_h(8);

      src7 = LD_UB(src_dup + 7);
      src8 = LD_UB(src_dup - 8);
      for (col = 0; col < (cols >> 4); ++col) {
        ILVRL_B2_UB(src7, src8, src_r, src_l);
        HSUB_UB2_SH(src_r, src_l, sub_r, sub_l);

        sum_r[0] = sum + sub_r[0];
        for (cnt = 0; cnt < 7; ++cnt) {
          sum_r[cnt + 1] = sum_r[cnt] + sub_r[cnt + 1];
        }
        sum_l[0] = sum_r[7] + sub_l[0];
        for (cnt = 0; cnt < 7; ++cnt) {
          sum_l[cnt + 1] = sum_l[cnt] + sub_l[cnt + 1];
        }
        sum = sum_l[7];
        src = LD_UB(src_dup + 16 * col);
        ILVRL_B2_UH(zero, src, src_r_h, src_l_h);
        src7 = (v16u8)((const8 + sum_r + (v8i16) src_r_h) >> 4);
        src8 = (v16u8)((const8 + sum_l + (v8i16) src_l_h) >> 4);
        tmp = (v16u8) __msa_pckev_b((v16i8) src8, (v16i8) src7);

        HADD_UB2_UH(src_r, src_l, add_r, add_l);
        UNPCK_SH_SW(sub_r, sub0, sub1);
        UNPCK_SH_SW(sub_l, sub2, sub3);
        ILVR_H2_SW(zero, add_r, zero, add_l, sum0_w, sum2_w);
        ILVL_H2_SW(zero, add_r, zero, add_l, sum1_w, sum3_w);
        MUL4(sum0_w, sub0, sum1_w, sub1, sum2_w, sub2, sum3_w, sub3, mul0, mul1,
             mul2, mul3);
        sum_sq0[0] = sum_sq + mul0[0];
        for (cnt = 0; cnt < 3; ++cnt) {
          sum_sq0[cnt + 1] = sum_sq0[cnt] + mul0[cnt + 1];
        }
        sum_sq1[0] = sum_sq0[3] + mul1[0];
        for (cnt = 0; cnt < 3; ++cnt) {
          sum_sq1[cnt + 1] = sum_sq1[cnt] + mul1[cnt + 1];
        }
        sum_sq2[0] = sum_sq1[3] + mul2[0];
        for (cnt = 0; cnt < 3; ++cnt) {
          sum_sq2[cnt + 1] = sum_sq2[cnt] + mul2[cnt + 1];
        }
        sum_sq3[0] = sum_sq2[3] + mul3[0];
        for (cnt = 0; cnt < 3; ++cnt) {
          sum_sq3[cnt + 1] = sum_sq3[cnt] + mul3[cnt + 1];
        }
        sum_sq = sum_sq3[3];

        UNPCK_SH_SW(sum_r, sum0_w, sum1_w);
        UNPCK_SH_SW(sum_l, sum2_w, sum3_w);
        total0 = sum_sq0 * __msa_ldi_w(15);
        total0 -= sum0_w * sum0_w;
        total1 = sum_sq1 * __msa_ldi_w(15);
        total1 -= sum1_w * sum1_w;
        total2 = sum_sq2 * __msa_ldi_w(15);
        total2 -= sum2_w * sum2_w;
        total3 = sum_sq3 * __msa_ldi_w(15);
        total3 -= sum3_w * sum3_w;
        total0 = (total0 < flimit_vec);
        total1 = (total1 < flimit_vec);
        total2 = (total2 < flimit_vec);
        total3 = (total3 < flimit_vec);
        PCKEV_H2_SH(total1, total0, total3, total2, mask0, mask1);
        mask = __msa_pckev_b((v16i8) mask1, (v16i8) mask0);
        tmp = __msa_bmz_v(tmp, src, (v16u8) mask);

        if (col == 0) {
          uint64_t src_d;

          src_d = __msa_copy_u_d((v2i64) tmp_orig, 1);
          SD(src_d, (src_dup - 8));
        }

        src7 = LD_UB(src_dup + 16 * (col + 1) + 7);
        src8 = LD_UB(src_dup + 16 * (col + 1) - 8);
        ST_UB(tmp, (src_dup + (16 * col)));
      }

      src_dup += pitch;
    }
  }
}

void vpx_mbpost_proc_down_msa(uint8_t *dst_ptr, int32_t pitch, int32_t rows,
                              int32_t cols, int32_t flimit) {
  int32_t row, col, cnt, i;
  const int16_t *rv3 = &vpx_rv[63 & rand()];
  v4i32 flimit_vec;
  v16u8 dst7, dst8, dst_r_b, dst_l_b;
  v16i8 mask;
  v8u16 add_r, add_l;
  v8i16 dst_r_h, dst_l_h, sub_r, sub_l, mask0, mask1;
  v4i32 sub0, sub1, sub2, sub3, total0, total1, total2, total3;

  flimit_vec = __msa_fill_w(flimit);

  for (col = 0; col < (cols >> 4); ++col) {
    uint8_t *dst_tmp = &dst_ptr[col << 4];
    v16u8 dst;
    v16i8 zero = {0};
    v16u8 tmp[16];
    v8i16 mult0, mult1, rv2_0, rv2_1;
    v8i16 sum0_h = {0};
    v8i16 sum1_h = {0};
    v4i32 mul0 = {0};
    v4i32 mul1 = {0};
    v4i32 mul2 = {0};
    v4i32 mul3 = {0};
    v4i32 sum0_w, sum1_w, sum2_w, sum3_w;
    v4i32 add0, add1, add2, add3;
    const int16_t *rv2[16];

    dst = LD_UB(dst_tmp);
    for (cnt = (col << 4), i = 0; i < 16; ++cnt) {
      rv2[i] = rv3 + ((cnt * 17) & 127);
      ++i;
    }
    for (cnt = -8; cnt < 0; ++cnt) {
      ST_UB(dst, dst_tmp + cnt * pitch);
    }

    dst = LD_UB((dst_tmp + (rows - 1) * pitch));
    for (cnt = rows; cnt < rows + 17; ++cnt) {
      ST_UB(dst, dst_tmp + cnt * pitch);
    }
    for (cnt = -8; cnt <= 6; ++cnt) {
      dst = LD_UB(dst_tmp + (cnt * pitch));
      UNPCK_UB_SH(dst, dst_r_h, dst_l_h);
      MUL2(dst_r_h, dst_r_h, dst_l_h, dst_l_h, mult0, mult1);
      mul0 += (v4i32) __msa_ilvr_h((v8i16) zero, (v8i16) mult0);
      mul1 += (v4i32) __msa_ilvl_h((v8i16) zero, (v8i16) mult0);
      mul2 += (v4i32) __msa_ilvr_h((v8i16) zero, (v8i16) mult1);
      mul3 += (v4i32) __msa_ilvl_h((v8i16) zero, (v8i16) mult1);
      ADD2(sum0_h, dst_r_h, sum1_h, dst_l_h, sum0_h, sum1_h);
    }

    for (row = 0; row < (rows + 8); ++row) {
      for (i = 0; i < 8; ++i) {
        rv2_0[i] = *(rv2[i] + (row & 127));
        rv2_1[i] = *(rv2[i + 8] + (row & 127));
      }
      dst7 = LD_UB(dst_tmp + (7 * pitch));
      dst8 = LD_UB(dst_tmp - (8 * pitch));
      ILVRL_B2_UB(dst7, dst8, dst_r_b, dst_l_b);

      HSUB_UB2_SH(dst_r_b, dst_l_b, sub_r, sub_l);
      UNPCK_SH_SW(sub_r, sub0, sub1);
      UNPCK_SH_SW(sub_l, sub2, sub3);
      sum0_h += sub_r;
      sum1_h += sub_l;

      HADD_UB2_UH(dst_r_b, dst_l_b, add_r, add_l);

      ILVRL_H2_SW(zero, add_r, add0, add1);
      ILVRL_H2_SW(zero, add_l, add2, add3);
      mul0 += add0 * sub0;
      mul1 += add1 * sub1;
      mul2 += add2 * sub2;
      mul3 += add3 * sub3;
      dst = LD_UB(dst_tmp);
      ILVRL_B2_SH(zero, dst, dst_r_h, dst_l_h);
      dst7 = (v16u8)((rv2_0 + sum0_h + dst_r_h) >> 4);
      dst8 = (v16u8)((rv2_1 + sum1_h + dst_l_h) >> 4);
      tmp[row & 15] = (v16u8) __msa_pckev_b((v16i8) dst8, (v16i8) dst7);

      UNPCK_SH_SW(sum0_h, sum0_w, sum1_w);
      UNPCK_SH_SW(sum1_h, sum2_w, sum3_w);
      total0 = mul0 * __msa_ldi_w(15);
      total0 -= sum0_w * sum0_w;
      total1 = mul1 * __msa_ldi_w(15);
      total1 -= sum1_w * sum1_w;
      total2 = mul2 * __msa_ldi_w(15);
      total2 -= sum2_w * sum2_w;
      total3 = mul3 * __msa_ldi_w(15);
      total3 -= sum3_w * sum3_w;
      total0 = (total0 < flimit_vec);
      total1 = (total1 < flimit_vec);
      total2 = (total2 < flimit_vec);
      total3 = (total3 < flimit_vec);
      PCKEV_H2_SH(total1, total0, total3, total2, mask0, mask1);
      mask = __msa_pckev_b((v16i8) mask1, (v16i8) mask0);
      tmp[row & 15] = __msa_bmz_v(tmp[row & 15], dst, (v16u8) mask);

      if (row >= 8) {
        ST_UB(tmp[(row - 8) & 15], (dst_tmp - 8 * pitch));
      }

      dst_tmp += pitch;
    }
  }
}
@@ -1060,6 +1060,7 @@
  ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3);  \
}
#define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__)
#define ILVL_B4_SH(...) ILVL_B4(v8i16, __VA_ARGS__)
#define ILVL_B4_UH(...) ILVL_B4(v8u16, __VA_ARGS__)

/* Description : Interleave left half of halfword elements from vectors
@@ -1074,6 +1075,7 @@
  out1 = (RTYPE)__msa_ilvl_h((v8i16)in2, (v8i16)in3);  \
}
#define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__)
#define ILVL_H2_SW(...) ILVL_H2(v4i32, __VA_ARGS__)

/* Description : Interleave left half of word elements from vectors
                 Arguments : Inputs - in0, in1, in2, in3
@@ -1137,6 +1139,7 @@
  out1 = (RTYPE)__msa_ilvr_h((v8i16)in2, (v8i16)in3);  \
}
#define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__)
#define ILVR_H2_SW(...) ILVR_H2(v4i32, __VA_ARGS__)

#define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3) {                       \
@@ -1215,6 +1218,7 @@
  out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1);  \
  out1 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1);  \
}
#define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__)
#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)

vpx_dsp/postproc.h (new file)
@@ -0,0 +1,25 @@
/*
 *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#ifndef VPX_DSP_POSTPROC_H_
#define VPX_DSP_POSTPROC_H_

#ifdef __cplusplus
extern "C" {
#endif

// Fills a noise buffer with gaussian noise strength determined by sigma.
int vpx_setup_noise(double sigma, int size, char *noise);

#ifdef __cplusplus
}
#endif

#endif  // VPX_DSP_POSTPROC_H_

@@ -51,7 +51,7 @@ static void encoder_variance(const uint8_t *a, int a_stride,
static void encoder_highbd_variance64(const uint8_t *a8, int a_stride,
                                      const uint8_t *b8, int b_stride,
                                      int w, int h, uint64_t *sse,
                                      uint64_t *sum) {
                                      int64_t *sum) {
  int i, j;

  uint16_t *a = CONVERT_TO_SHORTPTR(a8);
@@ -75,7 +75,7 @@ static void encoder_highbd_8_variance(const uint8_t *a8, int a_stride,
                                      int w, int h,
                                      unsigned int *sse, int *sum) {
  uint64_t sse_long = 0;
  uint64_t sum_long = 0;
  int64_t sum_long = 0;
  encoder_highbd_variance64(a8, a_stride, b8, b_stride, w, h,
                            &sse_long, &sum_long);
  *sse = (unsigned int)sse_long;

@@ -245,6 +245,8 @@ static double calc_psnrhvs(const unsigned char *src, int _systride,
      }
    }
  }
  if (pixels <= 0)
    return 0;
  ret /= pixels;
  return ret;
}

@@ -42,24 +42,24 @@ endif
# intra predictions
DSP_SRCS-yes += intrapred.c

ifeq ($(CONFIG_USE_X86INC),yes)
DSP_SRCS-$(HAVE_SSE) += x86/intrapred_sse2.asm
DSP_SRCS-$(HAVE_SSE2) += x86/intrapred_sse2.asm
DSP_SRCS-$(HAVE_SSSE3) += x86/intrapred_ssse3.asm
DSP_SRCS-$(HAVE_SSSE3) += x86/vpx_subpixel_8t_ssse3.asm
endif # CONFIG_USE_X86INC

ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
ifeq ($(CONFIG_USE_X86INC),yes)
DSP_SRCS-$(HAVE_SSE) += x86/highbd_intrapred_sse2.asm
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_intrapred_sse2.asm
endif # CONFIG_USE_X86INC
endif # CONFIG_VP9_HIGHBITDEPTH

ifneq ($(filter yes,$(CONFIG_POSTPROC) $(CONFIG_VP9_POSTPROC)),)
DSP_SRCS-yes += add_noise.c
DSP_SRCS-yes += deblock.c
DSP_SRCS-yes += postproc.h
DSP_SRCS-$(HAVE_MSA) += mips/add_noise_msa.c
DSP_SRCS-$(HAVE_MSA) += mips/deblock_msa.c
DSP_SRCS-$(HAVE_SSE2) += x86/add_noise_sse2.asm
DSP_SRCS-$(HAVE_SSE2) += x86/deblock_sse2.asm
endif # CONFIG_POSTPROC

DSP_SRCS-$(HAVE_NEON_ASM) += arm/intrapred_neon_asm$(ASM)
@@ -102,9 +102,8 @@ ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
DSP_SRCS-$(HAVE_SSE2) += x86/vpx_high_subpixel_8t_sse2.asm
DSP_SRCS-$(HAVE_SSE2) += x86/vpx_high_subpixel_bilinear_sse2.asm
endif
ifeq ($(CONFIG_USE_X86INC),yes)

DSP_SRCS-$(HAVE_SSE2) += x86/vpx_convolve_copy_sse2.asm
endif

ifeq ($(HAVE_NEON_ASM),yes)
DSP_SRCS-yes += arm/vpx_convolve_copy_neon_asm$(ASM)
@@ -194,10 +193,8 @@ DSP_SRCS-$(HAVE_SSE2) += x86/fwd_txfm_sse2.c
DSP_SRCS-$(HAVE_SSE2) += x86/fwd_txfm_impl_sse2.h
DSP_SRCS-$(HAVE_SSE2) += x86/fwd_dct32x32_impl_sse2.h
ifeq ($(ARCH_X86_64),yes)
ifeq ($(CONFIG_USE_X86INC),yes)
DSP_SRCS-$(HAVE_SSSE3) += x86/fwd_txfm_ssse3_x86_64.asm
endif
endif
DSP_SRCS-$(HAVE_AVX2) += x86/fwd_txfm_avx2.c
DSP_SRCS-$(HAVE_AVX2) += x86/fwd_dct32x32_impl_avx2.h
DSP_SRCS-$(HAVE_NEON) += arm/fwd_txfm_neon.c
@@ -212,12 +209,10 @@ DSP_SRCS-yes += inv_txfm.h
DSP_SRCS-yes += inv_txfm.c
DSP_SRCS-$(HAVE_SSE2) += x86/inv_txfm_sse2.h
DSP_SRCS-$(HAVE_SSE2) += x86/inv_txfm_sse2.c
ifeq ($(CONFIG_USE_X86INC),yes)
DSP_SRCS-$(HAVE_SSE2) += x86/inv_wht_sse2.asm
ifeq ($(ARCH_X86_64),yes)
DSP_SRCS-$(HAVE_SSSE3) += x86/inv_txfm_ssse3_x86_64.asm
endif # ARCH_X86_64
endif # CONFIG_USE_X86INC

ifeq ($(HAVE_NEON_ASM),yes)
DSP_SRCS-yes += arm/save_reg_neon$(ASM)
@@ -269,11 +264,9 @@ ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_quantize_intrin_sse2.c
endif
ifeq ($(ARCH_X86_64),yes)
ifeq ($(CONFIG_USE_X86INC),yes)
DSP_SRCS-$(HAVE_SSSE3) += x86/quantize_ssse3_x86_64.asm
DSP_SRCS-$(HAVE_AVX) += x86/quantize_avx_x86_64.asm
endif
endif

# avg
DSP_SRCS-yes += avg.c
@@ -282,10 +275,8 @@ DSP_SRCS-$(HAVE_NEON) += arm/avg_neon.c
DSP_SRCS-$(HAVE_MSA) += mips/avg_msa.c
DSP_SRCS-$(HAVE_NEON) += arm/hadamard_neon.c
ifeq ($(ARCH_X86_64),yes)
ifeq ($(CONFIG_USE_X86INC),yes)
DSP_SRCS-$(HAVE_SSSE3) += x86/avg_ssse3_x86_64.asm
endif
endif

# high bit depth subtract
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
@@ -329,7 +320,6 @@ DSP_SRCS-$(HAVE_SSE4_1) += x86/obmc_variance_sse4.c
endif #CONFIG_OBMC
endif #CONFIG_VP10_ENCODER

ifeq ($(CONFIG_USE_X86INC),yes)
DSP_SRCS-$(HAVE_SSE) += x86/sad4d_sse2.asm
DSP_SRCS-$(HAVE_SSE) += x86/sad_sse2.asm
DSP_SRCS-$(HAVE_SSE2) += x86/sad4d_sse2.asm
@@ -340,7 +330,7 @@ ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad4d_sse2.asm
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad_sse2.asm
endif # CONFIG_VP9_HIGHBITDEPTH
endif # CONFIG_USE_X86INC

endif # CONFIG_ENCODERS

ifneq ($(filter yes,$(CONFIG_ENCODERS) $(CONFIG_POSTPROC) $(CONFIG_VP9_POSTPROC)),)
@@ -370,18 +360,14 @@ ifeq ($(ARCH_X86_64),yes)
DSP_SRCS-$(HAVE_SSE2) += x86/ssim_opt_x86_64.asm
endif # ARCH_X86_64

ifeq ($(CONFIG_USE_X86INC),yes)
DSP_SRCS-$(HAVE_SSE) += x86/subpel_variance_sse2.asm
DSP_SRCS-$(HAVE_SSE2) += x86/subpel_variance_sse2.asm  # Contains SSE2 and SSSE3
endif # CONFIG_USE_X86INC

ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_sse2.c
DSP_SRCS-$(HAVE_SSE4_1) += x86/highbd_variance_sse4.c
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_impl_sse2.asm
ifeq ($(CONFIG_USE_X86INC),yes)
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_subpel_variance_impl_sse2.asm
endif # CONFIG_USE_X86INC
endif # CONFIG_VP9_HIGHBITDEPTH
endif # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC
|
||||
|
||||
|
(file diff suppressed because it is too large)
@@ -83,7 +83,7 @@
         add rbx, 16
 %endmacro

-;void vp8_post_proc_down_and_across_mb_row_sse2
+;void vpx_post_proc_down_and_across_mb_row_sse2
 ;(
 ;    unsigned char *src_ptr,
 ;    unsigned char *dst_ptr,
@@ -93,8 +93,8 @@
 ;    int *flimits,
 ;    int size
 ;)
-global sym(vp8_post_proc_down_and_across_mb_row_sse2) PRIVATE
-sym(vp8_post_proc_down_and_across_mb_row_sse2):
+global sym(vpx_post_proc_down_and_across_mb_row_sse2) PRIVATE
+sym(vpx_post_proc_down_and_across_mb_row_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 7
@@ -198,7 +198,7 @@ sym(vp8_post_proc_down_and_across_mb_row_sse2):
     UPDATE_FLIMIT
     jmp         .acrossnextcol

-.acrossdone
+.acrossdone:
     ; last 16 pixels
     movq        QWORD PTR [rdi+rdx-16], mm0

@@ -230,11 +230,11 @@ sym(vp8_post_proc_down_and_across_mb_row_sse2):
     ret
 %undef flimit

-;void vp8_mbpost_proc_down_xmm(unsigned char *dst,
+;void vpx_mbpost_proc_down_xmm(unsigned char *dst,
 ;    int pitch, int rows, int cols,int flimit)
-extern sym(vp8_rv)
-global sym(vp8_mbpost_proc_down_xmm) PRIVATE
-sym(vp8_mbpost_proc_down_xmm):
+extern sym(vpx_rv)
+global sym(vpx_mbpost_proc_down_xmm) PRIVATE
+sym(vpx_mbpost_proc_down_xmm):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 5
@@ -257,7 +257,7 @@ sym(vp8_mbpost_proc_down_xmm):
 %define flimit4 [rsp+128]

 %if ABI_IS_32BIT=0
-    lea         r8, [GLOBAL(sym(vp8_rv))]
+    lea         r8, [GLOBAL(sym(vpx_rv))]
 %endif

 ;rows +=8;
@@ -278,7 +278,7 @@ sym(vp8_mbpost_proc_down_xmm):
     lea         rdi, [rdi+rdx]
     movq        xmm1, QWORD ptr[rdi] ; first row
     mov         rcx, 8
-.init_borderd ; initialize borders
+.init_borderd: ; initialize borders
     lea         rdi, [rdi + rax]
     movq        [rdi], xmm1

@@ -291,7 +291,7 @@ sym(vp8_mbpost_proc_down_xmm):
     mov         rdi, rsi
     movq        xmm1, QWORD ptr[rdi] ; first row
     mov         rcx, 8
-.init_border ; initialize borders
+.init_border: ; initialize borders
     lea         rdi, [rdi + rax]
     movq        [rdi], xmm1

@@ -403,13 +403,13 @@ sym(vp8_mbpost_proc_down_xmm):
     and         rcx, 127
 %if ABI_IS_32BIT=1 && CONFIG_PIC=1
     push        rax
-    lea         rax, [GLOBAL(sym(vp8_rv))]
-    movdqu      xmm4, [rax + rcx*2] ;vp8_rv[rcx*2]
+    lea         rax, [GLOBAL(sym(vpx_rv))]
+    movdqu      xmm4, [rax + rcx*2] ;vpx_rv[rcx*2]
     pop         rax
 %elif ABI_IS_32BIT=0
-    movdqu      xmm4, [r8 + rcx*2] ;vp8_rv[rcx*2]
+    movdqu      xmm4, [r8 + rcx*2] ;vpx_rv[rcx*2]
 %else
-    movdqu      xmm4, [sym(vp8_rv) + rcx*2]
+    movdqu      xmm4, [sym(vpx_rv) + rcx*2]
 %endif

     paddw       xmm1, xmm4
@@ -434,7 +434,7 @@ sym(vp8_mbpost_proc_down_xmm):
     movq        mm0, [rsp + rcx*8] ;d[rcx*8]
     movq        [rsi], mm0

-.skip_assignment
+.skip_assignment:
     lea         rsi, [rsi+rax]

     lea         rdi, [rdi+rax]

@@ -462,10 +462,10 @@ sym(vp8_mbpost_proc_down_xmm):
 %undef flimit4


-;void vp8_mbpost_proc_across_ip_xmm(unsigned char *src,
+;void vpx_mbpost_proc_across_ip_xmm(unsigned char *src,
 ;    int pitch, int rows, int cols,int flimit)
-global sym(vp8_mbpost_proc_across_ip_xmm) PRIVATE
-sym(vp8_mbpost_proc_across_ip_xmm):
+global sym(vpx_mbpost_proc_across_ip_xmm) PRIVATE
+sym(vpx_mbpost_proc_across_ip_xmm):
     push        rbp
     mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
@@ -147,24 +147,28 @@ uint32_t vpx_highbd_10_variance##w##x##h##_sse2( \
     const uint8_t *src8, int src_stride, \
     const uint8_t *ref8, int ref_stride, uint32_t *sse) { \
   int sum; \
+  int64_t var; \
   uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
   uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
   highbd_10_variance_sse2( \
       src, src_stride, ref, ref_stride, w, h, sse, &sum, \
       vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
-  return *sse - (((int64_t)sum * sum) >> shift); \
+  var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift); \
+  return (var >= 0) ? (uint32_t)var : 0; \
 } \
 \
 uint32_t vpx_highbd_12_variance##w##x##h##_sse2( \
     const uint8_t *src8, int src_stride, \
     const uint8_t *ref8, int ref_stride, uint32_t *sse) { \
   int sum; \
+  int64_t var; \
   uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
   uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
   highbd_12_variance_sse2( \
       src, src_stride, ref, ref_stride, w, h, sse, &sum, \
       vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
-  return *sse - (((int64_t)sum * sum) >> shift); \
+  var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift); \
+  return (var >= 0) ? (uint32_t)var : 0; \
 }

 VAR_FN(64, 64, 16, 12);
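
This is the "Prevent negative variance" change: the identity var = sse -
sum^2 / N is evaluated in fixed point, and with high-bit-depth input the
rounding carried by both terms can make the difference slightly negative even
though a true variance cannot be. A standalone restatement of the macro's new
tail (helper name hypothetical):

    #include <stdint.h>

    static uint32_t clamped_variance(uint32_t sse, int sum, int shift) {
      /* sum^2 >> shift approximates sum^2 / N; rounding may overshoot sse */
      const int64_t var = (int64_t)sse - (((int64_t)sum * sum) >> shift);
      return (var >= 0) ? (uint32_t)var : 0;
    }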
@@ -246,7 +250,6 @@ unsigned int vpx_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride,
   return *sse;
 }

-#if CONFIG_USE_X86INC
 // The 2 unused parameters are place holders for PIC enabled build.
 // These definitions are for functions defined in
 // highbd_subpel_variance_impl_sse2.asm
@@ -593,7 +596,6 @@ FNS(sse2);

 #undef FNS
 #undef FN
-#endif  // CONFIG_USE_X86INC

 void vpx_highbd_upsampled_pred_sse2(uint16_t *comp_pred,
                                     int width, int height,
@@ -756,7 +756,7 @@ cglobal tm_predictor_8x8, 4, 4, 5, dst, stride, above, left
   psubw                m0, m2 ; t1-tl t2-tl ... t8-tl [word]
   movq                 m2, [leftq]
   punpcklbw            m2, m1 ; l1 l2 l3 l4 l5 l6 l7 l8 [word]
-.loop
+.loop:
   pshuflw              m4, m2, 0x0  ; [63:0] l1 l1 l1 l1 [word]
   pshuflw              m3, m2, 0x55 ; [63:0] l2 l2 l2 l2 [word]
   punpcklqdq           m4, m4 ; l1 l1 l1 l1 l1 l1 l1 l1 [word]
@@ -111,7 +111,6 @@ void vpx_lpf_horizontal_4_sse2(uint8_t *s, int p /* pitch */,
   __m128i q1p1, q0p0, p3p2, p2p1, p1p0, q3q2, q2q1, q1q0, ps1ps0, qs1qs0;
   __m128i mask, hev;

-  p3p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
   p3p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)),
                             _mm_loadl_epi64((__m128i *)(s - 4 * p)));
   q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
@@ -308,7 +308,6 @@ unsigned int vpx_mse16x16_sse2(const uint8_t *src, int src_stride,
   return *sse;
 }

-#if CONFIG_USE_X86INC
 // The 2 unused parameters are place holders for PIC enabled build.
 // These definitions are for functions defined in subpel_variance.asm
 #define DECL(w, opt) \
@@ -474,7 +473,6 @@ FNS(ssse3, ssse3);

 #undef FNS
 #undef FN
-#endif  // CONFIG_USE_X86INC

 void vpx_upsampled_pred_sse2(uint8_t *comp_pred,
                              int width, int height,
@@ -201,7 +201,7 @@ cglobal convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \
 %endif
 %endif ; CONFIG_VP10 && CONFIG_EXT_PARTITION

-.w64
+.w64:
   mov                  r4d, dword hm
 .loop64:
   movu                 m0, [srcq]
(file diff suppressed because it is too large)
@@ -14,14 +14,14 @@
     mov         rdx, arg(5)           ;filter ptr
     mov         rsi, arg(0)           ;src_ptr
     mov         rdi, arg(2)           ;output_ptr
-    mov         rcx, 0x0400040
+    mov         ecx, 0x01000100

     movdqa      xmm3, [rdx]           ;load filters
     psrldq      xmm3, 6
     packsswb    xmm3, xmm3
     pshuflw     xmm3, xmm3, 0b        ;k3_k4

-    movq        xmm2, rcx             ;rounding
+    movd        xmm2, ecx             ;rounding_shift
     pshufd      xmm2, xmm2, 0

     movsxd      rax, DWORD PTR arg(1) ;pixels_per_line
@@ -33,8 +33,7 @@
     punpcklbw   xmm0, xmm1
     pmaddubsw   xmm0, xmm3

-    paddsw      xmm0, xmm2            ;rounding
-    psraw       xmm0, 7               ;shift
+    pmulhrsw    xmm0, xmm2            ;rounding(+64)+shift(>>7)
     packuswb    xmm0, xmm0            ;pack to byte

 %if %1
@@ -51,7 +50,7 @@
     mov         rdx, arg(5)           ;filter ptr
     mov         rsi, arg(0)           ;src_ptr
     mov         rdi, arg(2)           ;output_ptr
-    mov         rcx, 0x0400040
+    mov         ecx, 0x01000100

     movdqa      xmm7, [rdx]           ;load filters
     psrldq      xmm7, 6
@@ -59,7 +58,7 @@
     pshuflw     xmm7, xmm7, 0b        ;k3_k4
     punpcklwd   xmm7, xmm7

-    movq        xmm6, rcx             ;rounding
+    movd        xmm6, ecx             ;rounding_shift
     pshufd      xmm6, xmm6, 0

     movsxd      rax, DWORD PTR arg(1) ;pixels_per_line
@@ -71,8 +70,7 @@
     punpcklbw   xmm0, xmm1
     pmaddubsw   xmm0, xmm7

-    paddsw      xmm0, xmm6            ;rounding
-    psraw       xmm0, 7               ;shift
+    pmulhrsw    xmm0, xmm6            ;rounding(+64)+shift(>>7)
     packuswb    xmm0, xmm0            ;pack back to byte

 %if %1
@@ -92,10 +90,8 @@
     pmaddubsw   xmm0, xmm7
     pmaddubsw   xmm2, xmm7

-    paddsw      xmm0, xmm6            ;rounding
-    paddsw      xmm2, xmm6
-    psraw       xmm0, 7               ;shift
-    psraw       xmm2, 7
+    pmulhrsw    xmm0, xmm6            ;rounding(+64)+shift(>>7)
+    pmulhrsw    xmm2, xmm6
     packuswb    xmm0, xmm2            ;pack back to byte

 %if %1
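
The paddsw+psraw to pmulhrsw rewrite works because pmulhrsw computes
(x * m + 0x4000) >> 15 per signed 16-bit lane; with m = 0x0100 (the new
0x01000100 constant holds two such words) that equals (x + 64) >> 7, i.e. the
old add-64-then-shift-right-7 rounding in a single instruction. A scalar C
check of the equivalence over the relevant input range (helper name
hypothetical):

    #include <assert.h>
    #include <stdint.h>

    /* scalar model of one 16-bit lane of pmulhrsw */
    static int16_t pmulhrsw_lane(int16_t x, int16_t m) {
      return (int16_t)((x * m + 0x4000) >> 15);
    }

    int main(void) {
      int x;
      for (x = -16384; x <= 16383; ++x) {  /* filter sums stay in this range */
        const int16_t old_way = (int16_t)((x + 64) >> 7); /* paddsw; psraw 7 */
        assert(pmulhrsw_lane((int16_t)x, 0x0100) == old_way);
      }
      return 0;
    }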
@@ -10,8 +10,7 @@
 // Multi-threaded worker
 //
 // Original source:
-//  http://git.chromium.org/webm/libwebp.git
-//  100644 blob 264210ba2807e4da47eb5d18c04cf869d89b9784 src/utils/thread.c
+//  https://chromium.googlesource.com/webm/libwebp

 #include <assert.h>
 #include <string.h>  // for memset()
@@ -10,8 +10,7 @@
 // Multi-threaded worker
 //
 // Original source:
-//  http://git.chromium.org/webm/libwebp.git
-//  100644 blob 7bd451b124ae3b81596abfbcc823e3cb129d3a38 src/utils/thread.h
+//  https://chromium.googlesource.com/webm/libwebp

 #ifndef VPX_THREAD_H_
 #define VPX_THREAD_H_
@@ -34,11 +33,26 @@ extern "C" {
 #include <windows.h>  // NOLINT
 typedef HANDLE pthread_t;
 typedef CRITICAL_SECTION pthread_mutex_t;

+#if _WIN32_WINNT >= 0x0600  // Windows Vista / Server 2008 or greater
+#define USE_WINDOWS_CONDITION_VARIABLE
+typedef CONDITION_VARIABLE pthread_cond_t;
+#else
 typedef struct {
   HANDLE waiting_sem_;
   HANDLE received_sem_;
   HANDLE signal_event_;
 } pthread_cond_t;
+#endif  // _WIN32_WINNT >= 0x600

+#ifndef WINAPI_FAMILY_PARTITION
+#define WINAPI_PARTITION_DESKTOP 1
+#define WINAPI_FAMILY_PARTITION(x) x
+#endif

+#if !WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
+#define USE_CREATE_THREAD
+#endif

 //------------------------------------------------------------------------------
 // simplistic pthread emulation layer
@@ -47,16 +61,30 @@ typedef struct {
 #define THREADFN unsigned int __stdcall
 #define THREAD_RETURN(val) (unsigned int)((DWORD_PTR)val)

+#if _WIN32_WINNT >= 0x0501  // Windows XP or greater
+#define WaitForSingleObject(obj, timeout) \
+    WaitForSingleObjectEx(obj, timeout, FALSE /*bAlertable*/)
+#endif

 static INLINE int pthread_create(pthread_t* const thread, const void* attr,
                                  unsigned int (__stdcall *start)(void*),
                                  void* arg) {
   (void)attr;
+#ifdef USE_CREATE_THREAD
+  *thread = CreateThread(NULL,   /* lpThreadAttributes */
+                         0,      /* dwStackSize */
+                         start,
+                         arg,
+                         0,      /* dwCreationFlags */
+                         NULL);  /* lpThreadId */
+#else
   *thread = (pthread_t)_beginthreadex(NULL,   /* void *security */
                                       0,      /* unsigned stack_size */
                                       start,
                                       arg,
                                       0,      /* unsigned initflag */
                                       NULL);  /* unsigned *thrdaddr */
+#endif
   if (*thread == NULL) return 1;
   SetThreadPriority(*thread, THREAD_PRIORITY_ABOVE_NORMAL);
   return 0;
@@ -72,7 +100,11 @@ static INLINE int pthread_join(pthread_t thread, void** value_ptr) {
 static INLINE int pthread_mutex_init(pthread_mutex_t *const mutex,
                                      void* mutexattr) {
   (void)mutexattr;
+#if _WIN32_WINNT >= 0x0600  // Windows Vista / Server 2008 or greater
+  InitializeCriticalSectionEx(mutex, 0 /*dwSpinCount*/, 0 /*Flags*/);
+#else
   InitializeCriticalSection(mutex);
+#endif
   return 0;
 }

@@ -98,15 +130,22 @@ static INLINE int pthread_mutex_destroy(pthread_mutex_t *const mutex) {
 // Condition
 static INLINE int pthread_cond_destroy(pthread_cond_t *const condition) {
   int ok = 1;
+#ifdef USE_WINDOWS_CONDITION_VARIABLE
+  (void)condition;
+#else
   ok &= (CloseHandle(condition->waiting_sem_) != 0);
   ok &= (CloseHandle(condition->received_sem_) != 0);
   ok &= (CloseHandle(condition->signal_event_) != 0);
+#endif
   return !ok;
 }

 static INLINE int pthread_cond_init(pthread_cond_t *const condition,
                                     void* cond_attr) {
   (void)cond_attr;
+#ifdef USE_WINDOWS_CONDITION_VARIABLE
+  InitializeConditionVariable(condition);
+#else
   condition->waiting_sem_ = CreateSemaphore(NULL, 0, MAX_DECODE_THREADS, NULL);
   condition->received_sem_ = CreateSemaphore(NULL, 0, MAX_DECODE_THREADS, NULL);
   condition->signal_event_ = CreateEvent(NULL, FALSE, FALSE, NULL);
@@ -116,11 +155,15 @@ static INLINE int pthread_cond_init(pthread_cond_t *const condition,
     pthread_cond_destroy(condition);
     return 1;
   }
+#endif
   return 0;
 }

 static INLINE int pthread_cond_signal(pthread_cond_t *const condition) {
   int ok = 1;
+#ifdef USE_WINDOWS_CONDITION_VARIABLE
+  WakeConditionVariable(condition);
+#else
   if (WaitForSingleObject(condition->waiting_sem_, 0) == WAIT_OBJECT_0) {
     // a thread is waiting in pthread_cond_wait: allow it to be notified
     ok = SetEvent(condition->signal_event_);
@@ -129,12 +172,16 @@ static INLINE int pthread_cond_signal(pthread_cond_t *const condition) {
     ok &= (WaitForSingleObject(condition->received_sem_, INFINITE) !=
            WAIT_OBJECT_0);
   }
+#endif
   return !ok;
 }

 static INLINE int pthread_cond_wait(pthread_cond_t *const condition,
                                     pthread_mutex_t *const mutex) {
   int ok;
+#ifdef USE_WINDOWS_CONDITION_VARIABLE
+  ok = SleepConditionVariableCS(condition, mutex, INFINITE);
+#else
   // note that there is a consumer available so the signal isn't dropped in
   // pthread_cond_signal
   if (!ReleaseSemaphore(condition->waiting_sem_, 1, NULL))
@@ -145,6 +192,7 @@ static INLINE int pthread_cond_wait(pthread_cond_t *const condition,
            WAIT_OBJECT_0);
   ok &= ReleaseSemaphore(condition->received_sem_, 1, NULL);
   pthread_mutex_lock(mutex);
+#endif
   return !ok;
 }
 #elif defined(__OS2__)
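
Taken together these changes keep the pthread-style API of the emulation layer
while selecting the best Windows primitive at compile time: condition
variables and InitializeCriticalSectionEx on Vista+, WaitForSingleObjectEx on
XP+, CreateThread on WinRT/UWP where _beginthreadex is unavailable. A hedged
usage sketch follows; the include path and worker body are invented for
illustration, but the calls are the ones declared above:

    #include "vpx_thread.h"  /* include path assumed for this header */

    static pthread_mutex_t lock;
    static pthread_cond_t ready;
    static int done = 0;

    static THREADFN worker(void *arg) {
      (void)arg;
      pthread_mutex_lock(&lock);
      done = 1;
      pthread_cond_signal(&ready);  /* WakeConditionVariable on Vista+ */
      pthread_mutex_unlock(&lock);
      return THREAD_RETURN(NULL);
    }

    int main(void) {
      pthread_t tid;
      pthread_mutex_init(&lock, NULL);  /* InitializeCriticalSectionEx if available */
      pthread_cond_init(&ready, NULL);
      pthread_create(&tid, NULL, worker, NULL);  /* CreateThread on WinRT/UWP */
      pthread_mutex_lock(&lock);
      while (!done)
        pthread_cond_wait(&ready, &lock);  /* SleepConditionVariableCS on Vista+ */
      pthread_mutex_unlock(&lock);
      pthread_join(tid, NULL);
      return 0;
    }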