Support measuring distortion in the pixel domain
Use the pixel-domain distortion metric at speed 0. This improves compression performance by 0.3% for both the low- and high-resolution test sets.

Change-Id: I5b5b7115960de73f0b5e5d0c69db305e490e6f1d
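For context, a minimal sketch of the two distortion metrics this change switches between. The function names and scalar loops below are illustrative only and are not part of the patch; the real code in vp9_rdopt.c operates on tran_low_t coefficients, calls vp9_block_error()/the vpx variance and sum-squares kernels, and applies extra scaling.

/* Illustrative sketch only -- not part of this patch. */
#include <stdint.h>

/* Transform domain: error between the original and dequantized
   transform coefficients. */
static int64_t txfm_domain_error(const int16_t *coeff, const int16_t *dqcoeff,
                                 int n) {
  int64_t err = 0;
  int i;
  for (i = 0; i < n; i++) {
    const int64_t d = coeff[i] - dqcoeff[i];
    err += d * d;
  }
  return err;
}

/* Pixel domain: error between the source block and the reconstructed
   (inverse-transformed) block. */
static int64_t pixel_domain_error(const uint8_t *src, int src_stride,
                                  const uint8_t *recon, int recon_stride,
                                  int bs) {
  int64_t err = 0;
  int r, c;
  for (r = 0; r < bs; r++) {
    for (c = 0; c < bs; c++) {
      const int64_t d = src[r * src_stride + c] - recon[r * recon_stride + c];
      err += d * d;
    }
  }
  return err;
}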
parent 14011f037d
commit e357b9efe0

test/sum_squares_test.cc (new file, 115 lines)
@@ -0,0 +1,115 @@
/*
 *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <cmath>
#include <cstdlib>
#include <string>

#include "third_party/googletest/src/include/gtest/gtest.h"

#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "test/acm_random.h"
#include "test/clear_system_state.h"
#include "test/register_state_check.h"
#include "test/util.h"
#include "vpx_ports/mem.h"

using libvpx_test::ACMRandom;

namespace {
const int kNumIterations = 10000;

typedef uint64_t (*SSI16Func)(const int16_t *src, int stride, int size);
typedef std::tr1::tuple<SSI16Func, SSI16Func> SumSquaresParam;

class SumSquaresTest : public ::testing::TestWithParam<SumSquaresParam> {
 public:
  virtual ~SumSquaresTest() {}
  virtual void SetUp() {
    ref_func_ = GET_PARAM(0);
    tst_func_ = GET_PARAM(1);
  }

  virtual void TearDown() { libvpx_test::ClearSystemState(); }

 protected:
  SSI16Func ref_func_;
  SSI16Func tst_func_;
};

TEST_P(SumSquaresTest, OperationCheck) {
  ACMRandom rnd(ACMRandom::DeterministicSeed());
  DECLARE_ALIGNED(16, int16_t, src[256 * 256]);
  const int msb = 11;  // Up to 12 bit input
  const int limit = 1 << (msb + 1);

  for (int k = 0; k < kNumIterations; k++) {
    const int size = 4 << rnd(6);  // Up to 128x128
    int stride = 4 << rnd(7);      // Up to 256 stride
    while (stride < size) {        // Make sure it's valid
      stride = 4 << rnd(7);
    }

    for (int i = 0; i < size; ++i) {
      for (int j = 0; j < size; ++j) {
        src[i * stride + j] = rnd(2) ? rnd(limit) : -rnd(limit);
      }
    }

    const uint64_t res_ref = ref_func_(src, stride, size);
    uint64_t res_tst;
    ASM_REGISTER_STATE_CHECK(res_tst = tst_func_(src, stride, size));

    ASSERT_EQ(res_ref, res_tst)
        << "Error: Sum Squares Test"
        << " C output does not match optimized output.";
  }
}

TEST_P(SumSquaresTest, ExtremeValues) {
  ACMRandom rnd(ACMRandom::DeterministicSeed());
  DECLARE_ALIGNED(16, int16_t, src[256 * 256]);
  const int msb = 11;  // Up to 12 bit input
  const int limit = 1 << (msb + 1);

  for (int k = 0; k < kNumIterations; k++) {
    const int size = 4 << rnd(6);  // Up to 128x128
    int stride = 4 << rnd(7);      // Up to 256 stride
    while (stride < size) {        // Make sure it's valid
      stride = 4 << rnd(7);
    }

    const int val = rnd(2) ? limit - 1 : -(limit - 1);
    for (int i = 0; i < size; ++i) {
      for (int j = 0; j < size; ++j) {
        src[i * stride + j] = val;
      }
    }

    const uint64_t res_ref = ref_func_(src, stride, size);
    uint64_t res_tst;
    ASM_REGISTER_STATE_CHECK(res_tst = tst_func_(src, stride, size));

    ASSERT_EQ(res_ref, res_tst)
        << "Error: Sum Squares Test"
        << " C output does not match optimized output.";
  }
}

using std::tr1::make_tuple;

#if HAVE_SSE2
INSTANTIATE_TEST_CASE_P(
    SSE2, SumSquaresTest,
    ::testing::Values(make_tuple(&vpx_sum_squares_2d_i16_c,
                                 &vpx_sum_squares_2d_i16_sse2)));
#endif  // HAVE_SSE2
}  // namespace
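Assuming a standard test_libvpx build with SSE2 enabled, the new cases can be run in isolation with the usual googletest filter, e.g. ./test_libvpx --gtest_filter='*SumSquaresTest*'.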

test/test.mk
@@ -170,6 +170,7 @@ endif # VP9
## Multi-codec / unconditional whitebox tests.

LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += sad_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += sum_squares_test.cc

TEST_INTRA_PRED_SPEED_SRCS-yes := test_intra_pred_speed.cc
TEST_INTRA_PRED_SPEED_SRCS-yes += ../md5_utils.h ../md5_utils.c

vp9/encoder/vp9_rdopt.c
@@ -64,6 +64,7 @@ typedef struct {
} REF_DEFINITION;

struct rdcost_block_args {
const VP9_COMP *cpi;
MACROBLOCK *x;
ENTROPY_CONTEXT t_above[16];
ENTROPY_CONTEXT t_left[16];

@@ -463,38 +464,123 @@ static int cost_coeffs(MACROBLOCK *x,
return cost;
}

static void dist_block(MACROBLOCK *x, int plane, int block, TX_SIZE tx_size,
static void dist_block(const VP9_COMP *cpi, MACROBLOCK *x, int plane, int block,
int blk_row, int blk_col, TX_SIZE tx_size,
int64_t *out_dist, int64_t *out_sse) {
const int ss_txfrm_size = tx_size << 1;
MACROBLOCKD* const xd = &x->e_mbd;
const struct macroblock_plane *const p = &x->plane[plane];
const struct macroblockd_plane *const pd = &xd->plane[plane];
int64_t this_sse;
int shift = tx_size == TX_32X32 ? 0 : 2;
tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
#if CONFIG_VP9_HIGHBITDEPTH
const int bd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd : 8;
*out_dist = vp9_highbd_block_error_dispatch(coeff, dqcoeff,
16 << ss_txfrm_size,
&this_sse, bd) >> shift;
#else
*out_dist = vp9_block_error(coeff, dqcoeff, 16 << ss_txfrm_size,
&this_sse) >> shift;
#endif  // CONFIG_VP9_HIGHBITDEPTH
*out_sse = this_sse >> shift;

if (x->skip_encode && !is_inter_block(xd->mi[0])) {
// TODO(jingning): tune the model to better capture the distortion.
int64_t p = (pd->dequant[1] * pd->dequant[1] *
(1 << ss_txfrm_size)) >>
if (cpi->sf.txfm_domain_distortion) {
const int ss_txfrm_size = tx_size << 1;
int64_t this_sse;
const int shift = tx_size == TX_32X32 ? 0 : 2;
const tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
const tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
#if CONFIG_VP9_HIGHBITDEPTH
(shift + 2 + (bd - 8) * 2);
const int bd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd : 8;
*out_dist = vp9_highbd_block_error_dispatch(
coeff, dqcoeff, 16 << ss_txfrm_size, &this_sse, bd) >>
shift;
#else
(shift + 2);
*out_dist =
vp9_block_error(coeff, dqcoeff, 16 << ss_txfrm_size, &this_sse) >>
shift;
#endif  // CONFIG_VP9_HIGHBITDEPTH
*out_dist += (p >> 4);
*out_sse += p;
*out_sse = this_sse >> shift;

if (x->skip_encode && !is_inter_block(xd->mi[0])) {
// TODO(jingning): tune the model to better capture the distortion.
const int64_t p =
(pd->dequant[1] * pd->dequant[1] * (1 << ss_txfrm_size)) >>
#if CONFIG_VP9_HIGHBITDEPTH
(shift + 2 + (bd - 8) * 2);
#else
(shift + 2);
#endif  // CONFIG_VP9_HIGHBITDEPTH
*out_dist += (p >> 4);
*out_sse += p;
}
} else {
const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size];
const int bs = 4 * num_4x4_blocks_wide_lookup[tx_bsize];
const int src_stride = p->src.stride;
const int dst_stride = pd->dst.stride;
const int src_idx = 4 * (blk_row * src_stride + blk_col);
const int dst_idx = 4 * (blk_row * dst_stride + blk_col);
const uint8_t *src = &p->src.buf[src_idx];
const uint8_t *dst = &pd->dst.buf[dst_idx];
const tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
const uint16_t *eob = &p->eobs[block];
unsigned int tmp;

cpi->fn_ptr[tx_bsize].vf(src, src_stride, dst, dst_stride, &tmp);
*out_sse = (int64_t)tmp * 16;

if (*eob) {
#if CONFIG_VP9_HIGHBITDEPTH
DECLARE_ALIGNED(16, uint16_t, recon16[1024]);
uint8_t *recon = (uint8_t *)recon16;
#else
DECLARE_ALIGNED(16, uint8_t, recon[1024]);
#endif  // CONFIG_VP9_HIGHBITDEPTH

#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
recon = CONVERT_TO_BYTEPTR(recon);
vpx_highbd_convolve_copy(dst, dst_stride, recon, 32, NULL, 0, NULL, 0,
bs, bs, xd->bd);
if (xd->lossless) {
vp9_highbd_iwht4x4_add(dqcoeff, recon, 32, *eob, xd->bd);
} else {
switch (tx_size) {
case TX_4X4:
vp9_highbd_idct4x4_add(dqcoeff, recon, 32, *eob, xd->bd);
break;
case TX_8X8:
vp9_highbd_idct8x8_add(dqcoeff, recon, 32, *eob, xd->bd);
break;
case TX_16X16:
vp9_highbd_idct16x16_add(dqcoeff, recon, 32, *eob, xd->bd);
break;
case TX_32X32:
vp9_highbd_idct32x32_add(dqcoeff, recon, 32, *eob, xd->bd);
break;
default:
assert(0 && "Invalid transform size");
}
}
} else {
#endif  // CONFIG_VP9_HIGHBITDEPTH
vpx_convolve_copy(dst, dst_stride, recon, 32, NULL, 0, NULL, 0, bs, bs);
switch (tx_size) {
case TX_32X32:
vp9_idct32x32_add(dqcoeff, recon, 32, *eob);
break;
case TX_16X16:
vp9_idct16x16_add(dqcoeff, recon, 32, *eob);
break;
case TX_8X8:
vp9_idct8x8_add(dqcoeff, recon, 32, *eob);
break;
case TX_4X4:
// this is like vp9_short_idct4x4 but has a special case around
// eob<=1, which is significant (not just an optimization) for
// the lossless case.
x->itxm_add(dqcoeff, recon, 32, *eob);
break;
default:
assert(0 && "Invalid transform size");
break;
}
#if CONFIG_VP9_HIGHBITDEPTH
}
#endif  // CONFIG_VP9_HIGHBITDEPTH

cpi->fn_ptr[tx_bsize].vf(src, src_stride, recon, 32, &tmp);
}

*out_dist = (int64_t)tmp * 16;
}
}

@@ -506,9 +592,8 @@ static int rate_block(int plane, int block, int row, int col,
args->use_fast_coef_costing);
}

static void block_rd_txfm(int plane, int block, int row, int col,
BLOCK_SIZE plane_bsize,
TX_SIZE tx_size, void *arg) {
static void block_rd_txfm(int plane, int block, int blk_row, int blk_col,
BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg) {
struct rdcost_block_args *args = arg;
MACROBLOCK *const x = args->x;
MACROBLOCKD *const xd = &x->e_mbd;

@@ -523,20 +608,47 @@ static void block_rd_txfm(int plane, int block, int row, int col,

if (!is_inter_block(mi)) {
struct encode_b_args arg = {x, NULL, &mi->skip};
vp9_encode_block_intra(plane, block, row, col, plane_bsize, tx_size, &arg);
dist_block(x, plane, block, tx_size, &dist, &sse);
vp9_encode_block_intra(plane, block, blk_row, blk_col, plane_bsize, tx_size,
&arg);
if (args->cpi->sf.txfm_domain_distortion) {
dist_block(args->cpi, x, plane, block, blk_row, blk_col, tx_size, &dist,
&sse);
} else {
const int bs = 4 << tx_size;
const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size];
const vpx_variance_fn_t variance = args->cpi->fn_ptr[tx_bsize].vf;
const struct macroblock_plane *const p = &x->plane[plane];
const struct macroblockd_plane *const pd = &xd->plane[plane];
const int src_stride = p->src.stride;
const int dst_stride = pd->dst.stride;
const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
const uint8_t *src = &p->src.buf[4 * (blk_row * src_stride + blk_col)];
const uint8_t *dst = &pd->dst.buf[4 * (blk_row * dst_stride + blk_col)];
const int16_t *diff = &p->src_diff[4 * (blk_row * diff_stride + blk_col)];
unsigned int tmp;
sse = vpx_sum_squares_2d_i16(diff, diff_stride, bs);
#if CONFIG_VP9_HIGHBITDEPTH
if ((xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) && (xd->bd > 8))
sse = ROUND_POWER_OF_TWO(sse, (xd->bd - 8) * 2);
#endif  // CONFIG_VP9_HIGHBITDEPTH
sse = sse * 16;
variance(src, src_stride, dst, dst_stride, &tmp);
dist = (int64_t)tmp * 16;
}
} else if (max_txsize_lookup[plane_bsize] == tx_size) {
if (x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))] ==
SKIP_TXFM_NONE) {
// full forward transform and quantization
vp9_xform_quant(x, plane, block, row, col, plane_bsize, tx_size);
dist_block(x, plane, block, tx_size, &dist, &sse);
vp9_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, tx_size);
dist_block(args->cpi, x, plane, block, blk_row, blk_col, tx_size, &dist,
&sse);
} else if (x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))] ==
SKIP_TXFM_AC_ONLY) {
// compute DC coefficient
tran_low_t *const coeff = BLOCK_OFFSET(x->plane[plane].coeff, block);
tran_low_t *const dqcoeff = BLOCK_OFFSET(xd->plane[plane].dqcoeff, block);
vp9_xform_quant_dc(x, plane, block, row, col, plane_bsize, tx_size);
vp9_xform_quant_dc(x, plane, block, blk_row, blk_col, plane_bsize,
tx_size);
sse = x->bsse[(plane << 2) + (block >> (tx_size << 1))] << 4;
dist = sse;
if (x->plane[plane].eobs[block]) {

@@ -560,8 +672,9 @@ static void block_rd_txfm(int plane, int block, int row, int col,
}
} else {
// full forward transform and quantization
vp9_xform_quant(x, plane, block, row, col, plane_bsize, tx_size);
dist_block(x, plane, block, tx_size, &dist, &sse);
vp9_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, tx_size);
dist_block(args->cpi, x, plane, block, blk_row, blk_col, tx_size, &dist,
&sse);
}

rd = RDCOST(x->rdmult, x->rddiv, 0, dist);

@@ -570,7 +683,7 @@ static void block_rd_txfm(int plane, int block, int row, int col,
return;
}

rate = rate_block(plane, block, row, col, tx_size, args);
rate = rate_block(plane, block, blk_row, blk_col, tx_size, args);
rd1 = RDCOST(x->rdmult, x->rddiv, rate, dist);
rd2 = RDCOST(x->rdmult, x->rddiv, 0, sse);

@@ -593,16 +706,15 @@ static void block_rd_txfm(int plane, int block, int row, int col,
args->skippable &= !x->plane[plane].eobs[block];
}

static void txfm_rd_in_plane(MACROBLOCK *x,
int *rate, int64_t *distortion,
int *skippable, int64_t *sse,
int64_t ref_best_rd, int plane,
BLOCK_SIZE bsize, TX_SIZE tx_size,
int use_fast_coef_casting) {
static void txfm_rd_in_plane(const VP9_COMP *cpi, MACROBLOCK *x, int *rate,
int64_t *distortion, int *skippable, int64_t *sse,
int64_t ref_best_rd, int plane, BLOCK_SIZE bsize,
TX_SIZE tx_size, int use_fast_coef_casting) {
MACROBLOCKD *const xd = &x->e_mbd;
const struct macroblockd_plane *const pd = &xd->plane[plane];
struct rdcost_block_args args;
vp9_zero(args);
args.cpi = cpi;
args.x = x;
args.best_rd = ref_best_rd;
args.use_fast_coef_costing = use_fast_coef_casting;

@@ -643,8 +755,7 @@ static void choose_largest_tx_size(VP9_COMP *cpi, MACROBLOCK *x,

mi->tx_size = VPXMIN(max_tx_size, largest_tx_size);

txfm_rd_in_plane(x, rate, distortion, skip,
sse, ref_best_rd, 0, bs,
txfm_rd_in_plane(cpi, x, rate, distortion, skip, sse, ref_best_rd, 0, bs,
mi->tx_size, cpi->sf.use_fast_coef_costing);
}

@@ -695,9 +806,8 @@ static void choose_tx_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
else
r_tx_size += vp9_cost_one(tx_probs[m]);
}
txfm_rd_in_plane(x, &r[n][0], &d[n], &s[n],
&sse[n], ref_best_rd, 0, bs, n,
cpi->sf.use_fast_coef_costing);
txfm_rd_in_plane(cpi, x, &r[n][0], &d[n], &s[n], &sse[n], ref_best_rd, 0,
bs, n, cpi->sf.use_fast_coef_costing);
r[n][1] = r[n][0];
if (r[n][0] < INT_MAX) {
r[n][1] += r_tx_size;

@@ -1172,9 +1282,8 @@ static int super_block_uvrd(const VP9_COMP *cpi, MACROBLOCK *x,
*skippable = 1;

for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
txfm_rd_in_plane(x, &pnrate, &pndist, &pnskip, &pnsse,
ref_best_rd, plane, bsize, uv_tx_size,
cpi->sf.use_fast_coef_costing);
txfm_rd_in_plane(cpi, x, &pnrate, &pndist, &pnskip, &pnsse, ref_best_rd,
plane, bsize, uv_tx_size, cpi->sf.use_fast_coef_costing);
if (pnrate == INT_MAX) {
is_cost_valid = 0;
break;

vp9/encoder/vp9_speed_features.c
@@ -162,6 +162,7 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm,
sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V;
sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V;
sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V;
sf->txfm_domain_distortion = 1;
}

if (speed >= 2) {

@@ -279,6 +280,7 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf,
sf->exhaustive_searches_thresh = INT_MAX;

if (speed >= 1) {
sf->txfm_domain_distortion = 1;
sf->use_square_partition_only = !frame_is_intra_only(cm);
sf->less_rectangular_check = 1;
sf->tx_size_search_method = frame_is_intra_only(cm) ? USE_FULL_RD

@@ -541,6 +543,7 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) {
sf->disable_filter_search_var_thresh = 0;
sf->adaptive_interp_filter_search = 0;
sf->allow_partition_search_skip = 0;
sf->txfm_domain_distortion = 0;

for (i = 0; i < TX_SIZES; i++) {
sf->intra_y_mode_mask[i] = INTRA_ALL;

vp9/encoder/vp9_speed_features.h
@@ -246,6 +246,11 @@ typedef struct SPEED_FEATURES {
// Coefficient probability model approximation step size
int coeff_prob_appx_step;

// Use transform domain distortion. Use pixel domain distortion when
// this flag is set to be zero. The pixel domain distortion computation
// improves the distortion metric precision.
int txfm_domain_distortion;

// The threshold is to determine how slow the motino is, it is used when
// use_lastframe_partitioning is set to LAST_FRAME_PARTITION_LOW_MOTION
MOTION_THRESHOLD lf_motion_threshold;

vpx_dsp/sum_squares.c (new file, 27 lines)
@@ -0,0 +1,27 @@
/*
 *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./vpx_dsp_rtcd.h"

uint64_t vpx_sum_squares_2d_i16_c(const int16_t *src, int src_stride,
                                  int size) {
  int r, c;
  uint64_t ss = 0;

  for (r = 0; r < size; r++) {
    for (c = 0; c < size; c++) {
      const int16_t v = src[c];
      ss += v * v;
    }
    src += src_stride;
  }

  return ss;
}
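A hypothetical usage sketch of the new helper on a small residual block (the wrapper function and values below are made up for illustration and are not part of the patch; the stride here is simply the row width, whereas vp9_rdopt.c passes the plane's diff_stride):

#include <stdint.h>

uint64_t vpx_sum_squares_2d_i16_c(const int16_t *src, int src_stride, int size);

static uint64_t residual_sse_4x4_example(void) {
  const int16_t diff[4 * 4] = { 3,  -1, 0,  2,    /* row 0 */
                                -4, 5,  1,  0,    /* row 1 */
                                0,  0,  -2, 1,    /* row 2 */
                                7,  -3, 2,  0 };  /* row 3 */
  /* Sum of squared residuals over the 4x4 block: 9 + 1 + ... = 123. */
  return vpx_sum_squares_2d_i16_c(diff, 4, 4);
}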

vpx_dsp/vpx_dsp.mk
@@ -277,6 +277,8 @@ endif # CONFIG_VP9_ENCODER
ifeq ($(CONFIG_ENCODERS),yes)
DSP_SRCS-yes += sad.c
DSP_SRCS-yes += subtract.c
DSP_SRCS-yes += sum_squares.c
DSP_SRCS-$(HAVE_SSE2) += x86/sum_squares_sse2.c

DSP_SRCS-$(HAVE_MEDIA) += arm/sad_media$(ASM)
DSP_SRCS-$(HAVE_NEON) += arm/sad4d_neon.c

vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -1169,6 +1169,9 @@ specialize qw/vpx_sad4x8x4d msa/, "$sse2_x86inc";
add_proto qw/void vpx_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
specialize qw/vpx_sad4x4x4d msa/, "$sse2_x86inc";

add_proto qw/uint64_t vpx_sum_squares_2d_i16/, "const int16_t *src, int stride, int size";
specialize qw/vpx_sum_squares_2d_i16 sse2/;

#
# Structured Similarity (SSIM)
#
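Roughly, this entry makes the following prototypes available to callers. The declarations below are an approximation of what the RTCD generator emits into vpx_dsp_rtcd.h, not a verbatim copy of the generated header:

#include <stdint.h>

uint64_t vpx_sum_squares_2d_i16_c(const int16_t *src, int stride, int size);
uint64_t vpx_sum_squares_2d_i16_sse2(const int16_t *src, int stride, int size);

/* With runtime CPU detection enabled, the plain name resolves to a function
   pointer that is set at init time to the best available version; without it,
   vpx_sum_squares_2d_i16 is defined directly to the chosen implementation. */
extern uint64_t (*vpx_sum_squares_2d_i16)(const int16_t *src, int stride,
                                          int size);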

vpx_dsp/x86/sum_squares_sse2.c (new file, 128 lines)
@@ -0,0 +1,128 @@
/*
 *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include <emmintrin.h>
#include <stdio.h>

#include "./vpx_dsp_rtcd.h"

static uint64_t vpx_sum_squares_2d_i16_4x4_sse2(const int16_t *src,
                                                int stride) {
  const __m128i v_val_0_w =
      _mm_loadl_epi64((const __m128i *)(src + 0 * stride));
  const __m128i v_val_1_w =
      _mm_loadl_epi64((const __m128i *)(src + 1 * stride));
  const __m128i v_val_2_w =
      _mm_loadl_epi64((const __m128i *)(src + 2 * stride));
  const __m128i v_val_3_w =
      _mm_loadl_epi64((const __m128i *)(src + 3 * stride));

  const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w);
  const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w);
  const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w);
  const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w);

  const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d);
  const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d);
  const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d);

  const __m128i v_sum_d =
      _mm_add_epi32(v_sum_0123_d, _mm_srli_epi64(v_sum_0123_d, 32));

  return (uint64_t)_mm_cvtsi128_si32(v_sum_d);
}

// TODO(jingning): Evaluate the performance impact here.
#ifdef __GNUC__
// This prevents GCC/Clang from inlining this function into
// vpx_sum_squares_2d_i16_sse2, which in turn saves some stack
// maintenance instructions in the common case of 4x4.
__attribute__((noinline))
#endif
static uint64_t
vpx_sum_squares_2d_i16_nxn_sse2(const int16_t *src, int stride, int size) {
  int r, c;
  const __m128i v_zext_mask_q = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff);
  __m128i v_acc_q = _mm_setzero_si128();

  for (r = 0; r < size; r += 8) {
    __m128i v_acc_d = _mm_setzero_si128();

    for (c = 0; c < size; c += 8) {
      const int16_t *b = src + c;
      const __m128i v_val_0_w =
          _mm_load_si128((const __m128i *)(b + 0 * stride));
      const __m128i v_val_1_w =
          _mm_load_si128((const __m128i *)(b + 1 * stride));
      const __m128i v_val_2_w =
          _mm_load_si128((const __m128i *)(b + 2 * stride));
      const __m128i v_val_3_w =
          _mm_load_si128((const __m128i *)(b + 3 * stride));
      const __m128i v_val_4_w =
          _mm_load_si128((const __m128i *)(b + 4 * stride));
      const __m128i v_val_5_w =
          _mm_load_si128((const __m128i *)(b + 5 * stride));
      const __m128i v_val_6_w =
          _mm_load_si128((const __m128i *)(b + 6 * stride));
      const __m128i v_val_7_w =
          _mm_load_si128((const __m128i *)(b + 7 * stride));

      const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w);
      const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w);
      const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w);
      const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w);
      const __m128i v_sq_4_d = _mm_madd_epi16(v_val_4_w, v_val_4_w);
      const __m128i v_sq_5_d = _mm_madd_epi16(v_val_5_w, v_val_5_w);
      const __m128i v_sq_6_d = _mm_madd_epi16(v_val_6_w, v_val_6_w);
      const __m128i v_sq_7_d = _mm_madd_epi16(v_val_7_w, v_val_7_w);

      const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d);
      const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d);
      const __m128i v_sum_45_d = _mm_add_epi32(v_sq_4_d, v_sq_5_d);
      const __m128i v_sum_67_d = _mm_add_epi32(v_sq_6_d, v_sq_7_d);

      const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d);
      const __m128i v_sum_4567_d = _mm_add_epi32(v_sum_45_d, v_sum_67_d);

      v_acc_d = _mm_add_epi32(v_acc_d, v_sum_0123_d);
      v_acc_d = _mm_add_epi32(v_acc_d, v_sum_4567_d);
    }

    v_acc_q = _mm_add_epi64(v_acc_q, _mm_and_si128(v_acc_d, v_zext_mask_q));
    v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_epi64(v_acc_d, 32));

    src += 8 * stride;
  }

  v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_si128(v_acc_q, 8));

#if ARCH_X86_64
  return (uint64_t)_mm_cvtsi128_si64(v_acc_q);
#else
  {
    uint64_t tmp;
    _mm_storel_epi64((__m128i *)&tmp, v_acc_q);
    return tmp;
  }
#endif
}

uint64_t vpx_sum_squares_2d_i16_sse2(const int16_t *src, int stride, int size) {
  // 4 elements per row only requires half an XMM register, so this
  // must be a special case, but also note that over 75% of all calls
  // are with size == 4, so it is also the common case.
  if (size == 4) {
    return vpx_sum_squares_2d_i16_4x4_sse2(src, stride);
  } else {
    // Generic case
    return vpx_sum_squares_2d_i16_nxn_sse2(src, stride, size);
  }
}