Added highbitdepth sse2 acceleration for quantize and block error
This is a partial cherry-pick of db7192e.
Change-Id: Idef18f90b111a0d0c9546543d3347e551908fd78
This commit is contained in:
parent
16add99f0d
commit
d6153aa447
146
test/error_block_test.cc
Normal file
146
test/error_block_test.cc
Normal file
@ -0,0 +1,146 @@
|
||||
/*
|
||||
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include <cmath>
|
||||
#include <cstdlib>
|
||||
#include <string>
|
||||
|
||||
#include "third_party/googletest/src/include/gtest/gtest.h"
|
||||
#include "test/acm_random.h"
|
||||
#include "test/clear_system_state.h"
|
||||
#include "test/register_state_check.h"
|
||||
#include "test/util.h"
|
||||
|
||||
#include "./vpx_config.h"
|
||||
#include "./vp9_rtcd.h"
|
||||
#include "vp9/common/vp9_entropy.h"
|
||||
#include "vpx/vpx_integer.h"
|
||||
|
||||
using libvpx_test::ACMRandom;
|
||||
|
||||
namespace {
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
const int number_of_iterations = 1000;
|
||||
|
||||
typedef int64_t (*ErrorBlockFunc)(const tran_low_t *coeff,
|
||||
const tran_low_t *dqcoeff, intptr_t block_size,
|
||||
int64_t *ssz, int bps);
|
||||
typedef std::tr1::tuple<ErrorBlockFunc, ErrorBlockFunc, vpx_bit_depth_t>
|
||||
ErrorBlockParam;
|
||||
class ErrorBlockTest
|
||||
: public ::testing::TestWithParam<ErrorBlockParam> {
|
||||
public:
|
||||
virtual ~ErrorBlockTest() {}
|
||||
virtual void SetUp() {
|
||||
error_block_op_ = GET_PARAM(0);
|
||||
ref_error_block_op_ = GET_PARAM(1);
|
||||
bit_depth_ = GET_PARAM(2);
|
||||
}
|
||||
|
||||
virtual void TearDown() { libvpx_test::ClearSystemState(); }
|
||||
|
||||
protected:
|
||||
vpx_bit_depth_t bit_depth_;
|
||||
ErrorBlockFunc error_block_op_;
|
||||
ErrorBlockFunc ref_error_block_op_;
|
||||
};
|
||||
|
||||
// Feeds identical pseudo-random coefficients to the reference and the tested
// implementation and expects both the return value and *ssz to agree.
TEST_P(ErrorBlockTest, OperationCheck) {
  ACMRandom rnd(ACMRandom::DeterministicSeed());
  DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff, 4096);
  DECLARE_ALIGNED_ARRAY(16, tran_low_t, dqcoeff, 4096);
  int mismatch_total = 0;
  int first_bad_iteration = -1;

  for (int iter = 0; iter < number_of_iterations; ++iter) {
    // All block sizes from 4x4, 8x4 ..64x64
    const intptr_t block_size = 16 << (iter % 9);

    // Draw coeff before dqcoeff for every index so the random sequence is
    // consumed in a fixed order.
    for (int idx = 0; idx < block_size; ++idx) {
      coeff[idx] = rnd(2 << 20) - (1 << 20);
      dqcoeff[idx] = rnd(2 << 20) - (1 << 20);
    }

    int64_t ref_ssz;
    const int64_t ref_ret =
        ref_error_block_op_(coeff, dqcoeff, block_size, &ref_ssz, bit_depth_);

    int64_t ssz;
    int64_t ret;
    ASM_REGISTER_STATE_CHECK(
        ret = error_block_op_(coeff, dqcoeff, block_size, &ssz, bit_depth_));

    if (ref_ret != ret || ref_ssz != ssz) {
      if (mismatch_total == 0) {
        first_bad_iteration = iter;
      }
      ++mismatch_total;
    }
  }

  EXPECT_EQ(0, mismatch_total)
      << "Error: Error Block Test, C output doesn't match SSE2 output. "
      << "First failed at test case " << first_bad_iteration;
}
|
||||
|
||||
// Compares the reference and the tested implementation on blocks filled with
// +/- a maximum magnitude (all four sign combinations) and, every fifth group
// of block sizes, on smaller-range random data.
TEST_P(ErrorBlockTest, ExtremeValues) {
  ACMRandom rnd(ACMRandom::DeterministicSeed());
  DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff, 4096);
  DECLARE_ALIGNED_ARRAY(16, tran_low_t, dqcoeff, 4096);
  int mismatch_total = 0;
  int first_bad_iteration = -1;
  int peak = (1 << 20) - 1;

  for (int iter = 0; iter < number_of_iterations; ++iter) {
    const int size_index = iter % 9;
    const int pattern = (iter / 9) % 5;

    // Change the maximum coeff value, to test different bit boundaries
    if (pattern == 4 && size_index == 0) {
      peak >>= 1;
    }

    // All block sizes from 4x4, 8x4 ..64x64
    const intptr_t block_size = 16 << size_index;

    if (pattern < 4) {
      // Test at maximum values: bit 0 of the pattern picks the sign of coeff,
      // bit 1 picks the sign of dqcoeff.
      const int coeff_fill = (pattern & 1) ? peak : -peak;
      const int dqcoeff_fill = (pattern & 2) ? peak : -peak;
      for (int idx = 0; idx < block_size; ++idx) {
        coeff[idx] = coeff_fill;
        dqcoeff[idx] = dqcoeff_fill;
      }
    } else {
      // Random data; coeff is drawn before dqcoeff for every index.
      for (int idx = 0; idx < block_size; ++idx) {
        coeff[idx] = rnd(2 << 14) - (1 << 14);
        dqcoeff[idx] = rnd(2 << 14) - (1 << 14);
      }
    }

    int64_t ref_ssz;
    const int64_t ref_ret =
        ref_error_block_op_(coeff, dqcoeff, block_size, &ref_ssz, bit_depth_);

    int64_t ssz;
    int64_t ret;
    ASM_REGISTER_STATE_CHECK(
        ret = error_block_op_(coeff, dqcoeff, block_size, &ssz, bit_depth_));

    if (ref_ret != ret || ref_ssz != ssz) {
      if (mismatch_total == 0) {
        first_bad_iteration = iter;
      }
      ++mismatch_total;
    }
  }

  EXPECT_EQ(0, mismatch_total)
      << "Error: Error Block Test, C output doesn't match SSE2 output. "
      << "First failed at test case " << first_bad_iteration;
}
|
||||
|
||||
using std::tr1::make_tuple;

#if HAVE_SSE2
// Runs every ErrorBlockTest case with the SSE2 implementation as the function
// under test and the C implementation as the reference, once per bit depth.
INSTANTIATE_TEST_CASE_P(
    SSE2_C_COMPARE, ErrorBlockTest,
    ::testing::Values(
        make_tuple(&vp9_highbd_block_error_sse2, &vp9_highbd_block_error_c,
                   VPX_BITS_10),
        make_tuple(&vp9_highbd_block_error_sse2, &vp9_highbd_block_error_c,
                   VPX_BITS_12),
        make_tuple(&vp9_highbd_block_error_sse2, &vp9_highbd_block_error_c,
                   VPX_BITS_8)));
#endif  // HAVE_SSE2
|
||||
#endif // CONFIG_VP9_HIGHBITDEPTH
|
||||
} // namespace
|
@ -136,6 +136,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += variance_test.cc
|
||||
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_subtract_test.cc
|
||||
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += lpf_8_test.cc
|
||||
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_avg_test.cc
|
||||
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += error_block_test.cc
|
||||
LIBVPX_TEST_SRCS-$(CONFIG_VP9) += vp9_intrapred_test.cc
|
||||
|
||||
ifeq ($(CONFIG_VP9_ENCODER),yes)
|
||||
|
@ -2426,7 +2426,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
|
||||
# ENCODEMB INVOKE
|
||||
|
||||
add_proto qw/int64_t vp9_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd";
|
||||
specialize qw/vp9_highbd_block_error/;
|
||||
specialize qw/vp9_highbd_block_error sse2/;
|
||||
|
||||
add_proto qw/void vp9_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd";
|
||||
specialize qw/vp9_highbd_subtract_block/;
|
||||
|
71
vp9/encoder/x86/vp9_highbd_block_error_intrin_sse2.c
Normal file
71
vp9/encoder/x86/vp9_highbd_block_error_intrin_sse2.c
Normal file
@ -0,0 +1,71 @@
|
||||
/*
|
||||
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include <emmintrin.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#include "vp9/common/vp9_common.h"
|
||||
|
||||
/*
 * Computes the block error between original and dequantized transform
 * coefficients.
 *
 * coeff       Original coefficients. Read with aligned 128-bit loads, so the
 *             buffer must be 16-byte aligned.
 * dqcoeff     Dequantized coefficients; same alignment requirement.
 * block_size  Number of coefficients. They are consumed eight at a time, so
 *             this is expected to be a multiple of 8.
 * ssz         Receives the sum of squared coeff values after scaling.
 * bps         Bit depth selector. Both sums are rounded and shifted right by
 *             2 * (bps - 8) bits; the code assumes bps >= 8 (a smaller value
 *             would produce a negative shift count).
 *
 * Returns the sum of squared (coeff - dqcoeff) differences after scaling.
 */
int64_t vp9_highbd_block_error_sse2(const tran_low_t *coeff,
                                    const tran_low_t *dqcoeff,
                                    intptr_t block_size,
                                    int64_t *ssz, int bps) {
  int i, j;
  uint32_t temp[4];
  int64_t error = 0, sqcoeff = 0;
  const int shift = 2 * (bps - 8);
  const int rounding = shift > 0 ? 1 << (shift - 1) : 0;
  /* Inclusive bounds of a signed 15-bit value; loop invariant. */
  const __m128i max = _mm_set1_epi32(0x3fff);
  const __m128i min = _mm_set1_epi32(-0x4000);

  for (i = 0; i < block_size; i += 8) {
    /* Load eight coefficients from each input into xmm registers. */
    __m128i mm_coeff = _mm_load_si128((const __m128i *)(coeff + i));
    __m128i mm_coeff2 = _mm_load_si128((const __m128i *)(coeff + i + 4));
    __m128i mm_dqcoeff = _mm_load_si128((const __m128i *)(dqcoeff + i));
    __m128i mm_dqcoeff2 = _mm_load_si128((const __m128i *)(dqcoeff + i + 4));
    /* Flag every lane whose value needs more than 15 bits. A lane cannot be
     * above max and below min at once, so OR-ing the two masks is exact. */
    const __m128i cmp0 = _mm_or_si128(_mm_cmpgt_epi32(mm_coeff, max),
                                      _mm_cmplt_epi32(mm_coeff, min));
    const __m128i cmp1 = _mm_or_si128(_mm_cmpgt_epi32(mm_coeff2, max),
                                      _mm_cmplt_epi32(mm_coeff2, min));
    const __m128i cmp2 = _mm_or_si128(_mm_cmpgt_epi32(mm_dqcoeff, max),
                                      _mm_cmplt_epi32(mm_dqcoeff, min));
    const __m128i cmp3 = _mm_or_si128(_mm_cmpgt_epi32(mm_dqcoeff2, max),
                                      _mm_cmplt_epi32(mm_dqcoeff2, min));
    const int test = _mm_movemask_epi8(
        _mm_or_si128(_mm_or_si128(cmp0, cmp1), _mm_or_si128(cmp2, cmp3)));

    if (!test) {
      /* Fast path: all sixteen inputs fit in 15 bits, so they pack to 16-bit
       * lanes without saturation, their difference fits in 16 bits, and each
       * pairwise multiply-add result fits in 32 bits. */
      __m128i mm_diff, error_sse2, sqcoeff_sse2;
      mm_coeff = _mm_packs_epi32(mm_coeff, mm_coeff2);
      mm_dqcoeff = _mm_packs_epi32(mm_dqcoeff, mm_dqcoeff2);
      mm_diff = _mm_sub_epi16(mm_coeff, mm_dqcoeff);
      error_sse2 = _mm_madd_epi16(mm_diff, mm_diff);
      sqcoeff_sse2 = _mm_madd_epi16(mm_coeff, mm_coeff);
      /* Spill the four 32-bit partial sums and accumulate them in 64 bits. */
      _mm_storeu_si128((__m128i *)temp, error_sse2);
      error += (int64_t)temp[0] + temp[1] + temp[2] + temp[3];
      _mm_storeu_si128((__m128i *)temp, sqcoeff_sse2);
      sqcoeff += (int64_t)temp[0] + temp[1] + temp[2] + temp[3];
    } else {
      /* Slow path: at least one value is too wide for the 16-bit math, so
       * handle these eight coefficients with 64-bit scalar arithmetic. */
      for (j = 0; j < 8; j++) {
        /* Widen before subtracting so the difference cannot overflow int. */
        const int64_t diff = (int64_t)coeff[i + j] - (int64_t)dqcoeff[i + j];
        error += diff * diff;
        sqcoeff += (int64_t)coeff[i + j] * (int64_t)coeff[i + j];
      }
    }
  }
  assert(error >= 0 && sqcoeff >= 0);
  error = (error + rounding) >> shift;
  sqcoeff = (sqcoeff + rounding) >> shift;

  *ssz = sqcoeff;
  return error;
}
|
@ -117,6 +117,7 @@ ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
|
||||
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_quantize_intrin_sse2.c
|
||||
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_sad4d_sse2.asm
|
||||
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_variance_impl_sse2.asm
|
||||
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_block_error_intrin_sse2.c
|
||||
endif
|
||||
|
||||
ifeq ($(CONFIG_USE_X86INC),yes)
|
||||
|
Loading…
x
Reference in New Issue
Block a user