Convolution horizontal filter SSSE3 optimization
- Apply signal-direction, 4-pixel vertical and 8-pixel vertical parallelism.
- Add a unit test to verify bit-exact results.
- Overall encoding time improves by ~24% on a Xeon E5-2680 CPU.

Change-Id: I104dcbfd43451476fee1f94cd16ca5f965878e59
This commit is contained in:
@@ -207,6 +207,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP10) += vp10_inv_txfm1d_test.cc
|
||||
LIBVPX_TEST_SRCS-$(CONFIG_VP10) += vp10_fwd_txfm2d_test.cc
|
||||
LIBVPX_TEST_SRCS-$(CONFIG_VP10) += vp10_inv_txfm2d_test.cc
|
||||
LIBVPX_TEST_SRCS-$(CONFIG_VP10) += vp10_convolve_test.cc
|
||||
LIBVPX_TEST_SRCS-$(CONFIG_VP10) += vp10_convolve_optimz_test.cc
|
||||
|
||||
TEST_INTRA_PRED_SPEED_SRCS-yes := test_intra_pred_speed.cc
|
||||
TEST_INTRA_PRED_SPEED_SRCS-yes += ../md5_utils.h ../md5_utils.c
|
||||
|
215
test/vp10_convolve_optimz_test.cc
Normal file
215
test/vp10_convolve_optimz_test.cc
Normal file
@@ -0,0 +1,215 @@
|
||||
/*
|
||||
* Copyright (c) 2016 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "third_party/googletest/src/include/gtest/gtest.h"
|
||||
|
||||
#include "./vp10_rtcd.h"
|
||||
#include "test/acm_random.h"
|
||||
#include "test/clear_system_state.h"
|
||||
#include "test/register_state_check.h"
|
||||
#include "test/util.h"
|
||||
|
||||
namespace {
|
||||
|
||||
using std::tr1::tuple;
|
||||
using libvpx_test::ACMRandom;
|
||||
|
||||
typedef void (*conv_horiz_t)(const uint8_t*, int, uint8_t*, int,
|
||||
int, int, const InterpFilterParams,
|
||||
const int, int, int);
|
||||
// Test parameter list:
|
||||
// <convolve_horiz_func, <width, height>, filter_params, subpel_x_q4, avg>
|
||||
typedef tuple<int, int> BlockDimension;
|
||||
typedef tuple<conv_horiz_t, BlockDimension, INTERP_FILTER, int, int> ConvParams;
|
||||
|
||||
// Note:
|
||||
// src_ and src_ref_ have special boundary requirement
|
||||
// dst_ and dst_ref_ don't
|
||||
const size_t maxWidth = 256;
|
||||
const size_t maxHeight = 256;
|
||||
const size_t maxBlockSize = maxWidth * maxHeight;
|
||||
const int horizOffset = 32;
|
||||
const int vertiOffset = 32;
|
||||
const int stride = 128;
|
||||
const int x_step_q4 = 16;
|
||||
|
||||
// Value-parameterized fixture that runs an optimized horizontal convolve
// (conv_) and vp10_convolve_horiz_c on identical inputs and expects the two
// outputs to match bit for bit.
class VP10ConvolveOptimzTest : public ::testing::TestWithParam<ConvParams> {
 public:
  virtual ~VP10ConvolveOptimzTest() {}

  // Unpacks the test parameters and carves one heap allocation into the
  // four working buffers.
  virtual void SetUp() {
    conv_ = GET_PARAM(0);
    BlockDimension block = GET_PARAM(1);
    width_ = std::tr1::get<0>(block);
    height_ = std::tr1::get<1>(block);
    filter_ = GET_PARAM(2);
    subpel_ = GET_PARAM(3);
    avg_ = GET_PARAM(4);

    // The trailing "()" value-initializes (zeroes) the whole allocation.
    // PrepFilterBuffer() only clears memory from src_ onward, so without this
    // the bytes in front of src_ would hold indeterminate values while the
    // bytes in front of src_ref_ are zero. Presumably the filter taps reach
    // into that margin, which would make the comparison non-deterministic.
    alloc_ = new uint8_t[maxBlockSize * 4]();

    // Region 0: src_, offset so that there is margin above and to the left.
    src_ = alloc_ + (vertiOffset * maxWidth);
    src_ += horizOffset;
    // Region 1: src_ref_, at the same relative offset.
    src_ref_ = src_ + maxBlockSize;

    // Regions 2 and 3: outputs of the optimized and reference functions.
    dst_ = alloc_ + 2 * maxBlockSize;
    dst_ref_ = alloc_ + 3 * maxBlockSize;
  }

  virtual void TearDown() {
    delete[] alloc_;
    libvpx_test::ClearSystemState();
  }

 protected:
  // Runs the reference and optimized functions and compares their output.
  void RunHorizFilterBitExactCheck();

 private:
  // Fills the source/destination buffers with identical pseudo-random data.
  void PrepFilterBuffer(uint8_t *src, uint8_t *src_ref,
                        uint8_t *dst, uint8_t *dst_ref,
                        int w, int h);
  // Reports every pixel at which buf and buf_ref differ.
  void DiffFilterBuffer(const uint8_t *buf, const uint8_t *buf_ref,
                        int w, int h, int fgroup, int findex);

  conv_horiz_t conv_;  // function under test
  uint8_t *alloc_;     // owning pointer for all four buffers
  uint8_t *src_;       // input for conv_
  uint8_t *dst_;       // output of conv_
  uint8_t *src_ref_;   // input for the C reference
  uint8_t *dst_ref_;   // output of the C reference
  int width_;
  int height_;
  int filter_;         // interpolation filter index
  int subpel_;         // sub-pixel position passed as subpel_x_q4
  int avg_;            // value passed as the "avg" argument
};
|
||||
|
||||
// Resets the working memory and fills a w x h block of both source buffers
// and both destination buffers with the same pseudo-random bytes.
//
//   src, src_ref - inputs for the optimized / reference function
//   dst, dst_ref - outputs; pre-filled because they may be read as well as
//                  written (NOTE(review): presumably when avg is set - confirm)
//   w, h         - size of the block to randomize, in pixels
//
// All four pointers are expected to point into alloc_; rows are `stride`
// bytes apart.
void VP10ConvolveOptimzTest::PrepFilterBuffer(uint8_t *src, uint8_t *src_ref,
                                              uint8_t *dst, uint8_t *dst_ref,
                                              int w, int h) {
  // A fixed seed keeps every run reproducible.
  ACMRandom rnd(ACMRandom::DeterministicSeed());

  // Clear the whole allocation rather than four windows starting at the
  // buffer pointers: the windows never covered the bytes in front of src, so
  // that margin kept whatever the allocator returned while the margin in
  // front of src_ref was zero.
  memset(alloc_, 0, 4 * maxBlockSize * sizeof(alloc_[0]));

  uint8_t *src_ptr = src;
  uint8_t *dst_ptr = dst;
  uint8_t *src_ref_ptr = src_ref;
  uint8_t *dst_ref_ptr = dst_ref;

  // Honour the requested block size (the parameters were previously ignored
  // in favour of width_ / height_).
  for (int r = 0; r < h; ++r) {
    for (int c = 0; c < w; ++c) {
      src_ptr[c] = rnd.Rand8();
      src_ref_ptr[c] = src_ptr[c];
      dst_ptr[c] = rnd.Rand8();
      dst_ref_ptr[c] = dst_ptr[c];
    }
    src_ptr += stride;
    src_ref_ptr += stride;
    dst_ptr += stride;
    dst_ref_ptr += stride;
  }
}
|
||||
|
||||
void VP10ConvolveOptimzTest::DiffFilterBuffer(const uint8_t *buf,
|
||||
const uint8_t *buf_ref,
|
||||
int w, int h,
|
||||
int filter_group,
|
||||
int filter_index) {
|
||||
int r, c;
|
||||
const uint8_t *dst_ptr = buf;
|
||||
const uint8_t *dst_ref_ptr = buf_ref;
|
||||
for (r = 0; r < h; ++r) {
|
||||
for (c = 0; c < w; ++c) {
|
||||
EXPECT_EQ((uint8_t)dst_ref_ptr[c], (uint8_t)dst_ptr[c])
|
||||
<< "Error at row: " << r << " col: " << c << " "
|
||||
<< "w = " << w << " " << "h = " << h << " "
|
||||
<< "filter group index = " << filter_group << " "
|
||||
<< "filter index = " << filter_index;
|
||||
}
|
||||
dst_ptr += stride;
|
||||
dst_ref_ptr += stride;
|
||||
}
|
||||
}
|
||||
|
||||
void VP10ConvolveOptimzTest::RunHorizFilterBitExactCheck() {
|
||||
PrepFilterBuffer(src_, src_ref_, dst_, dst_ref_, width_, height_);
|
||||
|
||||
InterpFilterParams filter_params = vp10_get_interp_filter_params(filter_);
|
||||
|
||||
vp10_convolve_horiz_c(src_ref_, stride, dst_ref_, stride, width_, height_,
|
||||
filter_params, subpel_, x_step_q4, avg_);
|
||||
|
||||
conv_(src_, stride, dst_, stride, width_, height_,
|
||||
filter_params, subpel_, x_step_q4, avg_);
|
||||
|
||||
DiffFilterBuffer(dst_, dst_ref_, width_, height_, filter_, subpel_);
|
||||
|
||||
// Note:
|
||||
// Here we need calculate a height which is different from the specified one
|
||||
// and test again.
|
||||
int intermediate_height =
|
||||
(((height_ - 1) * 16 + subpel_) >> SUBPEL_BITS) + filter_params.taps;
|
||||
PrepFilterBuffer(src_, src_ref_, dst_, dst_ref_, width_, height_);
|
||||
|
||||
vp10_convolve_horiz_c(src_ref_, stride, dst_ref_, stride, width_,
|
||||
intermediate_height, filter_params, subpel_, x_step_q4,
|
||||
avg_);
|
||||
|
||||
conv_(src_, stride, dst_, stride, width_,
|
||||
intermediate_height, filter_params, subpel_, x_step_q4,
|
||||
avg_);
|
||||
|
||||
DiffFilterBuffer(dst_, dst_ref_, width_, intermediate_height, filter_,
|
||||
subpel_);
|
||||
}
|
||||
|
||||
TEST_P(VP10ConvolveOptimzTest, HorizBitExactCheck) {
|
||||
RunHorizFilterBitExactCheck();
|
||||
}
|
||||
|
||||
using std::tr1::make_tuple;
|
||||
|
||||
const BlockDimension kBlockDim[] = {
|
||||
make_tuple(4, 4),
|
||||
make_tuple(4, 8),
|
||||
make_tuple(8, 4),
|
||||
make_tuple(8, 8),
|
||||
make_tuple(8, 16),
|
||||
make_tuple(16, 8),
|
||||
make_tuple(16, 16),
|
||||
make_tuple(16, 32),
|
||||
make_tuple(32, 16),
|
||||
make_tuple(32, 32),
|
||||
make_tuple(32, 64),
|
||||
make_tuple(64, 32),
|
||||
make_tuple(64, 64),
|
||||
make_tuple(64, 128),
|
||||
make_tuple(128, 64),
|
||||
make_tuple(128, 128),
|
||||
};
|
||||
// 10/12-tap filters
|
||||
const INTERP_FILTER kFilter[] = {6, 4, 2};
|
||||
|
||||
const int kSubpelXQ4[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
|
||||
|
||||
const int kAvg[] = {0, 1};
|
||||
|
||||
#if HAVE_SSSE3 && CONFIG_EXT_INTERP
|
||||
INSTANTIATE_TEST_CASE_P(
|
||||
SSSE3, VP10ConvolveOptimzTest,
|
||||
::testing::Combine(
|
||||
::testing::Values(vp10_convolve_horiz_ssse3),
|
||||
::testing::ValuesIn(kBlockDim),
|
||||
::testing::ValuesIn(kFilter),
|
||||
::testing::ValuesIn(kSubpelXQ4),
|
||||
::testing::ValuesIn(kAvg)));
|
||||
#endif // HAVE_SSSE3 && CONFIG_EXT_INTERP
|
||||
} // namespace
|
@@ -1,5 +1,6 @@
|
||||
#include "third_party/googletest/src/include/gtest/gtest.h"
|
||||
|
||||
#include "./vp10_rtcd.h"
|
||||
#include "./vpx_dsp_rtcd.h"
|
||||
#include "test/acm_random.h"
|
||||
#include "vp10/common/filter.h"
|
||||
@@ -40,6 +41,8 @@ TEST(VP10ConvolveTest, vp10_convolve8) {
|
||||
int w = 1;
|
||||
int h = 1;
|
||||
|
||||
vp10_rtcd();
|
||||
|
||||
for (int i = 0; i < filter_size * filter_size; i++) {
|
||||
src[i] = rnd.Rand16() % (1 << 8);
|
||||
}
|
||||
@@ -86,6 +89,8 @@ TEST(VP10ConvolveTest, vp10_convolve) {
|
||||
int subpel_x_q4;
|
||||
int subpel_y_q4;
|
||||
|
||||
vp10_rtcd();
|
||||
|
||||
for (int i = 0; i < filter_size * filter_size; i++) {
|
||||
src[i] = rnd.Rand16() % (1 << 8);
|
||||
}
|
||||
@@ -150,6 +155,8 @@ TEST(VP10ConvolveTest, vp10_convolve_avg) {
|
||||
int subpel_x_q4;
|
||||
int subpel_y_q4;
|
||||
|
||||
vp10_rtcd();
|
||||
|
||||
for (int i = 0; i < filter_size * filter_size; i++) {
|
||||
src0[i] = rnd.Rand16() % (1 << 8);
|
||||
src1[i] = rnd.Rand16() % (1 << 8);
|
||||
|
@@ -302,3 +302,43 @@ const int16_t *vp10_get_interp_filter_kernel(
|
||||
return (const int16_t*)
|
||||
vp10_interp_filter_params_list[interp_filter].filter_ptr;
|
||||
}
|
||||
|
||||
SubpelFilterCoeffs vp10_get_subpel_filter_signal_dir(
|
||||
const InterpFilterParams p, int index) {
|
||||
#if CONFIG_EXT_INTERP && HAVE_SSSE3
|
||||
if (p.filter_ptr == (const int16_t *)sub_pel_filters_12sharp) {
|
||||
return &sub_pel_filters_12sharp_signal_dir[index][0];
|
||||
}
|
||||
if (p.filter_ptr == (const int16_t *)sub_pel_filters_10sharp) {
|
||||
return &sub_pel_filters_10sharp_signal_dir[index][0];
|
||||
}
|
||||
#endif
|
||||
#if USE_TEMPORALFILTER_12TAP && HAVE_SSSE3
|
||||
if (p.filter_ptr == (const int16_t *)sub_pel_filters_temporalfilter_12) {
|
||||
return &sub_pel_filters_temporalfilter_12_signal_dir[index][0];
|
||||
}
|
||||
#endif
|
||||
(void)p;
|
||||
(void)index;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
SubpelFilterCoeffs vp10_get_subpel_filter_ver_signal_dir(
|
||||
const InterpFilterParams p, int index) {
|
||||
#if CONFIG_EXT_INTERP && HAVE_SSSE3
|
||||
if (p.filter_ptr == (const int16_t *)sub_pel_filters_12sharp) {
|
||||
return &sub_pel_filters_12sharp_ver_signal_dir[index][0];
|
||||
}
|
||||
if (p.filter_ptr == (const int16_t *)sub_pel_filters_10sharp) {
|
||||
return &sub_pel_filters_10sharp_ver_signal_dir[index][0];
|
||||
}
|
||||
#endif
|
||||
#if USE_TEMPORALFILTER_12TAP && HAVE_SSSE3
|
||||
if (p.filter_ptr == (const int16_t *)sub_pel_filters_temporalfilter_12) {
|
||||
return &sub_pel_filters_temporalfilter_12_ver_signal_dir[index][0];
|
||||
}
|
||||
#endif
|
||||
(void)p;
|
||||
(void)index;
|
||||
return NULL;
|
||||
}
|
||||
|
@@ -91,6 +91,27 @@ static INLINE int vp10_is_interpolating_filter(
|
||||
const InterpFilterParams ip = vp10_get_interp_filter_params(interp_filter);
|
||||
return (ip.filter_ptr[ip.taps / 2 - 1] == 128);
|
||||
}
|
||||
|
||||
#if USE_TEMPORALFILTER_12TAP
|
||||
extern const int8_t sub_pel_filters_temporalfilter_12_signal_dir[15][2][16];
|
||||
extern const int8_t sub_pel_filters_temporalfilter_12_ver_signal_dir[15][6][16];
|
||||
#endif
|
||||
|
||||
#if CONFIG_EXT_INTERP
|
||||
extern const int8_t sub_pel_filters_12sharp_signal_dir[15][2][16];
|
||||
extern const int8_t sub_pel_filters_10sharp_signal_dir[15][2][16];
|
||||
extern const int8_t sub_pel_filters_12sharp_ver_signal_dir[15][6][16];
|
||||
extern const int8_t sub_pel_filters_10sharp_ver_signal_dir[15][6][16];
|
||||
#endif
|
||||
|
||||
typedef const int8_t (*SubpelFilterCoeffs)[16];
|
||||
|
||||
SubpelFilterCoeffs vp10_get_subpel_filter_signal_dir(
|
||||
const InterpFilterParams p, int index);
|
||||
|
||||
SubpelFilterCoeffs vp10_get_subpel_filter_ver_signal_dir(
|
||||
const InterpFilterParams p, int index);
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
#endif
|
||||
|
@@ -1,6 +1,7 @@
|
||||
#include <assert.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "./vp10_rtcd.h"
|
||||
#include "vp10/common/filter.h"
|
||||
#include "vpx_dsp/vpx_dsp_common.h"
|
||||
#include "vpx_ports/mem.h"
|
||||
@@ -10,7 +11,7 @@
|
||||
#define MAX_STEP (32)
|
||||
#define MAX_FILTER_TAP (12)
|
||||
|
||||
static void convolve_horiz(const uint8_t *src, int src_stride, uint8_t *dst,
|
||||
void vp10_convolve_horiz_c(const uint8_t *src, int src_stride, uint8_t *dst,
|
||||
int dst_stride, int w, int h,
|
||||
const InterpFilterParams filter_params,
|
||||
const int subpel_x_q4, int x_step_q4, int avg) {
|
||||
@@ -121,8 +122,8 @@ void vp10_convolve(const uint8_t *src, int src_stride, uint8_t *dst,
|
||||
vp10_get_interp_filter_params(interp_filter);
|
||||
#endif
|
||||
assert(filter_params.taps <= MAX_FILTER_TAP);
|
||||
convolve_horiz(src, src_stride, dst, dst_stride, w, h, filter_params,
|
||||
subpel_x_q4, x_step_q4, ref_idx);
|
||||
vp10_convolve_horiz(src, src_stride, dst, dst_stride, w, h, filter_params,
|
||||
subpel_x_q4, x_step_q4, ref_idx);
|
||||
} else if (ignore_horiz) {
|
||||
#if CONFIG_DUAL_FILTER
|
||||
InterpFilterParams filter_params =
|
||||
@@ -162,9 +163,9 @@ void vp10_convolve(const uint8_t *src, int src_stride, uint8_t *dst,
|
||||
|
||||
assert(filter_params.taps <= MAX_FILTER_TAP);
|
||||
|
||||
convolve_horiz(src - src_stride * (filter_size / 2 - 1), src_stride, temp,
|
||||
temp_stride, w, intermediate_height, filter_params,
|
||||
subpel_x_q4, x_step_q4, 0);
|
||||
vp10_convolve_horiz(src - src_stride * (filter_size / 2 - 1), src_stride,
|
||||
temp, temp_stride, w, intermediate_height,
|
||||
filter_params, subpel_x_q4, x_step_q4, 0);
|
||||
|
||||
#if CONFIG_DUAL_FILTER
|
||||
filter_params = filter_params_y;
|
||||
|
@@ -8,6 +8,7 @@ print <<EOF
|
||||
#include "vp10/common/common.h"
|
||||
#include "vp10/common/enums.h"
|
||||
#include "vp10/common/quant_common.h"
|
||||
#include "vp10/common/filter.h"
|
||||
#include "vp10/common/vp10_txfm.h"
|
||||
|
||||
struct macroblockd;
|
||||
@@ -83,6 +84,12 @@ add_proto qw/void vp10_filter_by_weight8x8/, "const uint8_t *src, int src_stride
|
||||
specialize qw/vp10_filter_by_weight8x8 sse2 msa/;
|
||||
}
|
||||
|
||||
#
|
||||
# 10/12-tap convolution filters
|
||||
#
|
||||
add_proto qw/void vp10_convolve_horiz/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, int avg";
|
||||
specialize qw/vp10_convolve_horiz ssse3/;
|
||||
|
||||
#
|
||||
# dct
|
||||
#
|
||||
|
632
vp10/common/x86/vp10_convolve_filters_ssse3.c
Normal file
632
vp10/common/x86/vp10_convolve_filters_ssse3.c
Normal file
@@ -0,0 +1,632 @@
|
||||
/*
|
||||
* Copyright (c) 2016 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "./vpx_config.h"
|
||||
#include "vp10/common/filter.h"
|
||||
|
||||
// Note:
|
||||
// Filter coefficients are from "filter.c". We use,
|
||||
// sub_pel_filters_temporalfilter_12[],
|
||||
// sub_pel_filters_12sharp[],
|
||||
// sub_pel_filters_10sharp[].
|
||||
|
||||
// (2-1) Parallel filtering along the intended signal direction
|
||||
|
||||
// 12-tap filter padding:
|
||||
// {filter_coefficients, 0, 0, 0, 0},
|
||||
// {0, 0, filter_coefficients, 0, 0},
|
||||
#if USE_TEMPORALFILTER_12TAP
|
||||
DECLARE_ALIGNED(16, const int8_t,
|
||||
sub_pel_filters_temporalfilter_12_signal_dir[15][2][16]) = {
|
||||
{
|
||||
{0, 1, -1, 3, -7, 127, 8, -4, 2, -1, 0, 0, 0, 0, 0, 0},
|
||||
{0, 0, 0, 1, -1, 3, -7, 127, 8, -4, 2, -1, 0, 0, 0, 0},
|
||||
},
|
||||
{
|
||||
{0, 1, -3, 5, -12, 124, 18, -8, 4, -2, 1, 0, 0, 0, 0, 0},
|
||||
{0, 0, 0, 1, -3, 5, -12, 124, 18, -8, 4, -2, 1, 0, 0, 0},
|
||||
},
|
||||
{
|
||||
{-1, 2, -4, 8, -17, 120, 28, -11, 6, -3, 1, -1, 0, 0, 0, 0},
|
||||
{0, 0, -1, 2, -4, 8, -17, 120, 28, -11, 6, -3, 1, -1, 0, 0},
|
||||
},
|
||||
{
|
||||
{-1, 2, -4, 10, -21, 114, 38, -15, 8, -4, 2, -1, 0, 0, 0, 0},
|
||||
{0, 0, -1, 2, -4, 10, -21, 114, 38, -15, 8, -4, 2, -1, 0, 0},
|
||||
},
|
||||
{
|
||||
{-1, 3, -5, 11, -23, 107, 49, -18, 9, -5, 2, -1, 0, 0, 0, 0},
|
||||
{0, 0, -1, 3, -5, 11, -23, 107, 49, -18, 9, -5, 2, -1, 0, 0},
|
||||
},
|
||||
{
|
||||
{-1, 3, -6, 12, -25, 99, 60, -21, 11, -6, 3, -1, 0, 0, 0, 0},
|
||||
{0, 0, -1, 3, -6, 12, -25, 99, 60, -21, 11, -6, 3, -1, 0, 0},
|
||||
},
|
||||
{
|
||||
{-1, 3, -6, 12, -25, 90, 70, -23, 12, -6, 3, -1, 0, 0, 0, 0},
|
||||
{0, 0, -1, 3, -6, 12, -25, 90, 70, -23, 12, -6, 3, -1, 0, 0},
|
||||
},
|
||||
{
|
||||
{-1, 3, -6, 12, -24, 80, 80, -24, 12, -6, 3, -1, 0, 0, 0, 0},
|
||||
{0, 0, -1, 3, -6, 12, -24, 80, 80, -24, 12, -6, 3, -1, 0, 0},
|
||||
},
|
||||
{
|
||||
{-1, 3, -6, 12, -23, 70, 90, -25, 12, -6, 3, -1, 0, 0, 0, 0},
|
||||
{0, 0, -1, 3, -6, 12, -23, 70, 90, -25, 12, -6, 3, -1, 0, 0},
|
||||
},
|
||||
{
|
||||
{-1, 3, -6, 11, -21, 60, 99, -25, 12, -6, 3, -1, 0, 0, 0, 0},
|
||||
{0, 0, -1, 3, -6, 11, -21, 60, 99, -25, 12, -6, 3, -1, 0, 0},
|
||||
},
|
||||
{
|
||||
{-1, 2, -5, 9, -18, 49, 107, -23, 11, -5, 3, -1, 0, 0, 0, 0},
|
||||
{0, 0, -1, 2, -5, 9, -18, 49, 107, -23, 11, -5, 3, -1, 0, 0},
|
||||
},
|
||||
{
|
||||
{-1, 2, -4, 8, -15, 38, 114, -21, 10, -4, 2, -1, 0, 0, 0, 0},
|
||||
{0, 0, -1, 2, -4, 8, -15, 38, 114, -21, 10, -4, 2, -1, 0, 0},
|
||||
},
|
||||
{
|
||||
{-1, 1, -3, 6, -11, 28, 120, -17, 8, -4, 2, -1, 0, 0, 0, 0},
|
||||
{0, 0, -1, 1, -3, 6, -11, 28, 120, -17, 8, -4, 2, -1, 0, 0},
|
||||
},
|
||||
{
|
||||
{0, 1, -2, 4, -8, 18, 124, -12, 5, -3, 1, 0, 0, 0, 0, 0},
|
||||
{0, 0, 0, 1, -2, 4, -8, 18, 124, -12, 5, -3, 1, 0, 0, 0},
|
||||
},
|
||||
{
|
||||
{0, 0, -1, 2, -4, 8, 127, -7, 3, -1, 1, 0, 0, 0, 0, 0},
|
||||
{0, 0, 0, 0, -1, 2, -4, 8, 127, -7, 3, -1, 1, 0, 0, 0},
|
||||
},
|
||||
};
|
||||
#endif // USE_TEMPORALFILTER_12TAP
|
||||
|
||||
#if CONFIG_EXT_INTERP
|
||||
DECLARE_ALIGNED(16, const int8_t,
|
||||
sub_pel_filters_12sharp_signal_dir[15][2][16]) = {
|
||||
{
|
||||
{0, 1, -2, 3, -7, 127, 8, -4, 2, -1, 1, 0, 0, 0, 0, 0},
|
||||
{0, 0, 0, 1, -2, 3, -7, 127, 8, -4, 2, -1, 1, 0, 0, 0},
|
||||
},
|
||||
{
|
||||
{-1, 2, -3, 6, -13, 124, 18, -8, 4, -2, 2, -1, 0, 0, 0, 0},
|
||||
{0, 0, -1, 2, -3, 6, -13, 124, 18, -8, 4, -2, 2, -1, 0, 0},
|
||||
},
|
||||
{
|
||||
{-1, 3, -4, 8, -18, 120, 28, -12, 7, -4, 2, -1, 0, 0, 0, 0},
|
||||
{0, 0, -1, 3, -4, 8, -18, 120, 28, -12, 7, -4, 2, -1, 0, 0},
|
||||
},
|
||||
{
|
||||
{-1, 3, -6, 10, -21, 115, 38, -15, 8, -5, 3, -1, 0, 0, 0, 0},
|
||||
{0, 0, -1, 3, -6, 10, -21, 115, 38, -15, 8, -5, 3, -1, 0, 0},
|
||||
},
|
||||
{
|
||||
{-2, 4, -6, 12, -24, 108, 49, -18, 10, -6, 3, -2, 0, 0, 0, 0},
|
||||
{0, 0, -2, 4, -6, 12, -24, 108, 49, -18, 10, -6, 3, -2, 0, 0},
|
||||
},
|
||||
{
|
||||
{-2, 4, -7, 13, -25, 100, 60, -21, 11, -7, 4, -2, 0, 0, 0, 0},
|
||||
{0, 0, -2, 4, -7, 13, -25, 100, 60, -21, 11, -7, 4, -2, 0, 0},
|
||||
},
|
||||
{
|
||||
{-2, 4, -7, 13, -26, 91, 71, -24, 13, -7, 4, -2, 0, 0, 0, 0},
|
||||
{0, 0, -2, 4, -7, 13, -26, 91, 71, -24, 13, -7, 4, -2, 0, 0},
|
||||
},
|
||||
{
|
||||
{-2, 4, -7, 13, -25, 81, 81, -25, 13, -7, 4, -2, 0, 0, 0, 0},
|
||||
{0, 0, -2, 4, -7, 13, -25, 81, 81, -25, 13, -7, 4, -2, 0, 0},
|
||||
},
|
||||
{
|
||||
{-2, 4, -7, 13, -24, 71, 91, -26, 13, -7, 4, -2, 0, 0, 0, 0},
|
||||
{0, 0, -2, 4, -7, 13, -24, 71, 91, -26, 13, -7, 4, -2, 0, 0},
|
||||
},
|
||||
{
|
||||
{-2, 4, -7, 11, -21, 60, 100, -25, 13, -7, 4, -2, 0, 0, 0, 0},
|
||||
{0, 0, -2, 4, -7, 11, -21, 60, 100, -25, 13, -7, 4, -2, 0, 0},
|
||||
},
|
||||
{
|
||||
{-2, 3, -6, 10, -18, 49, 108, -24, 12, -6, 4, -2, 0, 0, 0, 0},
|
||||
{0, 0, -2, 3, -6, 10, -18, 49, 108, -24, 12, -6, 4, -2, 0, 0},
|
||||
},
|
||||
{
|
||||
{-1, 3, -5, 8, -15, 38, 115, -21, 10, -6, 3, -1, 0, 0, 0, 0},
|
||||
{0, 0, -1, 3, -5, 8, -15, 38, 115, -21, 10, -6, 3, -1, 0, 0},
|
||||
},
|
||||
{
|
||||
{-1, 2, -4, 7, -12, 28, 120, -18, 8, -4, 3, -1, 0, 0, 0, 0},
|
||||
{0, 0, -1, 2, -4, 7, -12, 28, 120, -18, 8, -4, 3, -1, 0, 0},
|
||||
},
|
||||
{
|
||||
{-1, 2, -2, 4, -8, 18, 124, -13, 6, -3, 2, -1, 0, 0, 0, 0},
|
||||
{0, 0, -1, 2, -2, 4, -8, 18, 124, -13, 6, -3, 2, -1, 0, 0},
|
||||
},
|
||||
{
|
||||
{0, 1, -1, 2, -4, 8, 127, -7, 3, -2, 1, 0, 0, 0, 0, 0},
|
||||
{0, 0, 0, 1, -1, 2, -4, 8, 127, -7, 3, -2, 1, 0, 0, 0},
|
||||
},
|
||||
};
|
||||
|
||||
// 10-tap filter padding:
|
||||
// {0, filter_coefficients, 0, 0, 0, 0, 0},
|
||||
// {0, 0, 0, filter_coefficients, 0, 0, 0),
|
||||
DECLARE_ALIGNED(16, const int8_t,
|
||||
sub_pel_filters_10sharp_signal_dir[15][2][16]) = {
|
||||
{
|
||||
{0, 0, -1, 3, -6, 127, 8, -4, 2, -1, 0, 0, 0, 0, 0, 0},
|
||||
{0, 0, 0, -1, 3, -6, 127, 8, -4, 2, -1, 0, 0, 0, 0},
|
||||
},
|
||||
{
|
||||
{0, 1, -2, 5, -12, 124, 18, -7, 3, -2, 0, 0, 0, 0, 0, 0},
|
||||
{0, 0, 1, -2, 5, -12, 124, 18, -7, 3, -2, 0, 0, 0, 0},
|
||||
},
|
||||
{
|
||||
{0, 1, -3, 7, -17, 119, 28, -11, 5, -2, 1, 0, 0, 0, 0, 0},
|
||||
{0, 0, 1, -3, 7, -17, 119, 28, -11, 5, -2, 1, 0, 0, 0},
|
||||
},
|
||||
{
|
||||
{0, 1, -4, 8, -20, 114, 38, -14, 7, -3, 1, 0, 0, 0, 0, 0},
|
||||
{0, 0, 1, -4, 8, -20, 114, 38, -14, 7, -3, 1, 0, 0, 0},
|
||||
},
|
||||
{
|
||||
{0, 1, -4, 9, -22, 107, 49, -17, 8, -4, 1, 0, 0, 0, 0, 0},
|
||||
{0, 0, 1, -4, 9, -22, 107, 49, -17, 8, -4, 1, 0, 0, 0},
|
||||
},
|
||||
{
|
||||
{0, 2, -5, 10, -24, 99, 59, -20, 9, -4, 2, 0, 0, 0, 0, 0},
|
||||
{0, 0, 2, -5, 10, -24, 99, 59, -20, 9, -4, 2, 0, 0, 0},
|
||||
},
|
||||
{
|
||||
{0, 2, -5, 10, -24, 90, 70, -22, 10, -5, 2, 0, 0, 0, 0, 0},
|
||||
{0, 0, 2, -5, 10, -24, 90, 70, -22, 10, -5, 2, 0, 0, 0},
|
||||
},
|
||||
{
|
||||
{0, 2, -5, 10, -23, 80, 80, -23, 10, -5, 2, 0, 0, 0, 0, 0},
|
||||
{0, 0, 2, -5, 10, -23, 80, 80, -23, 10, -5, 2, 0, 0, 0},
|
||||
},
|
||||
{
|
||||
{0, 2, -5, 10, -22, 70, 90, -24, 10, -5, 2, 0, 0, 0, 0, 0},
|
||||
{0, 0, 2, -5, 10, -22, 70, 90, -24, 10, -5, 2, 0, 0, 0},
|
||||
},
|
||||
{
|
||||
{0, 2, -4, 9, -20, 59, 99, -24, 10, -5, 2, 0, 0, 0, 0, 0},
|
||||
{0, 0, 2, -4, 9, -20, 59, 99, -24, 10, -5, 2, 0, 0, 0},
|
||||
},
|
||||
{
|
||||
{0, 1, -4, 8, -17, 49, 107, -22, 9, -4, 1, 0, 0, 0, 0, 0},
|
||||
{0, 0, 1, -4, 8, -17, 49, 107, -22, 9, -4, 1, 0, 0, 0},
|
||||
},
|
||||
{
|
||||
{0, 1, -3, 7, -14, 38, 114, -20, 8, -4, 1, 0, 0, 0, 0, 0},
|
||||
{0, 0, 1, -3, 7, -14, 38, 114, -20, 8, -4, 1, 0, 0, 0},
|
||||
},
|
||||
{
|
||||
{0, 1, -2, 5, -11, 28, 119, -17, 7, -3, 1, 0, 0, 0, 0, 0},
|
||||
{0, 0, 1, -2, 5, -11, 28, 119, -17, 7, -3, 1, 0, 0, 0},
|
||||
},
|
||||
{
|
||||
{0, 0, -2, 3, -7, 18, 124, -12, 5, -2, 1, 0, 0, 0, 0, 0},
|
||||
{0, 0, 0, -2, 3, -7, 18, 124, -12, 5, -2, 1, 0, 0, 0},
|
||||
},
|
||||
{
|
||||
{0, 0, -1, 2, -4, 8, 127, -6, 3, -1, 0, 0, 0, 0, 0, 0},
|
||||
{0, 0, 0, -1, 2, -4, 8, 127, -6, 3, -1, 0, 0, 0, 0},
|
||||
},
|
||||
};
|
||||
#endif // CONFIG_EXT_INTERP
|
||||
|
||||
// (2-2) Parallel filtering vertically to signal direction
|
||||
#if USE_TEMPORALFILTER_12TAP
|
||||
DECLARE_ALIGNED(16, const int8_t,
|
||||
sub_pel_filters_temporalfilter_12_ver_signal_dir[15][6][16]) = {
|
||||
{
|
||||
{0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1},
|
||||
{-1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3},
|
||||
{-7, 127, -7, 127, -7, 127, -7, 127, -7, 127, -7, 127,
|
||||
-7, 127, -7, 127},
|
||||
{8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4},
|
||||
{2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1},
|
||||
{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
|
||||
},
|
||||
{
|
||||
{0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1},
|
||||
{-3, 5, -3, 5, -3, 5, -3, 5, -3, 5, -3, 5, -3, 5, -3, 5},
|
||||
{-12, 124, -12, 124, -12, 124, -12, 124, -12, 124, -12, 124,
|
||||
-12, 124, -12, 124},
|
||||
{18, -8, 18, -8, 18, -8, 18, -8, 18, -8, 18, -8, 18, -8, 18, -8},
|
||||
{4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2},
|
||||
{1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0},
|
||||
},
|
||||
{
|
||||
{-1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2},
|
||||
{-4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8},
|
||||
{-17, 120, -17, 120, -17, 120, -17, 120, -17, 120, -17, 120,
|
||||
-17, 120, -17, 120},
|
||||
{28, -11, 28, -11, 28, -11, 28, -11, 28, -11, 28, -11, 28, -11, 28, -11},
|
||||
{6, -3, 6, -3, 6, -3, 6, -3, 6, -3, 6, -3, 6, -3, 6, -3},
|
||||
{1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1},
|
||||
},
|
||||
{
|
||||
{-1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2},
|
||||
{-4, 10, -4, 10, -4, 10, -4, 10, -4, 10, -4, 10, -4, 10, -4, 10},
|
||||
{-21, 114, -21, 114, -21, 114, -21, 114, -21, 114, -21, 114,
|
||||
-21, 114, -21, 114},
|
||||
{38, -15, 38, -15, 38, -15, 38, -15, 38, -15, 38, -15,
|
||||
38, -15, 38, -15},
|
||||
{8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4},
|
||||
{2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1},
|
||||
},
|
||||
{
|
||||
{-1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3},
|
||||
{-5, 11, -5, 11, -5, 11, -5, 11, -5, 11, -5, 11, -5, 11, -5, 11},
|
||||
{-23, 107, -23, 107, -23, 107, -23, 107, -23, 107, -23, 107,
|
||||
-23, 107, -23, 107},
|
||||
{49, -18, 49, -18, 49, -18, 49, -18, 49, -18, 49, -18, 49, -18, 49, -18},
|
||||
{9, -5, 9, -5, 9, -5, 9, -5, 9, -5, 9, -5, 9, -5, 9, -5},
|
||||
{2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1},
|
||||
},
|
||||
{
|
||||
{-1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3},
|
||||
{-6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12},
|
||||
{-25, 99, -25, 99, -25, 99, -25, 99, -25, 99, -25, 99, -25, 99, -25, 99},
|
||||
{60, -21, 60, -21, 60, -21, 60, -21, 60, -21, 60, -21, 60, -21, 60, -21},
|
||||
{11, -6, 11, -6, 11, -6, 11, -6, 11, -6, 11, -6, 11, -6, 11, -6},
|
||||
{3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1},
|
||||
},
|
||||
{
|
||||
{-1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3},
|
||||
{-6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12},
|
||||
{-25, 90, -25, 90, -25, 90, -25, 90, -25, 90, -25, 90, -25, 90, -25, 90},
|
||||
{70, -23, 70, -23, 70, -23, 70, -23, 70, -23, 70, -23, 70, -23, 70, -23},
|
||||
{12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6},
|
||||
{3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1},
|
||||
},
|
||||
{
|
||||
{-1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3},
|
||||
{-6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12},
|
||||
{-24, 80, -24, 80, -24, 80, -24, 80, -24, 80, -24, 80, -24, 80, -24, 80},
|
||||
{80, -24, 80, -24, 80, -24, 80, -24, 80, -24, 80, -24, 80, -24, 80, -24},
|
||||
{12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6},
|
||||
{3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1},
|
||||
},
|
||||
{
|
||||
{-1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3},
|
||||
{-6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12},
|
||||
{-23, 70, -23, 70, -23, 70, -23, 70, -23, 70, -23, 70, -23, 70, -23, 70},
|
||||
{90, -25, 90, -25, 90, -25, 90, -25, 90, -25, 90, -25, 90, -25, 90, -25},
|
||||
{12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6},
|
||||
{3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1},
|
||||
},
|
||||
{
|
||||
{-1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3},
|
||||
{-6, 11, -6, 11, -6, 11, -6, 11, -6, 11, -6, 11, -6, 11, -6, 11},
|
||||
{-21, 60, -21, 60, -21, 60, -21, 60, -21, 60, -21, 60, -21, 60, -21, 60},
|
||||
{99, -25, 99, -25, 99, -25, 99, -25, 99, -25, 99, -25, 99, -25, 99, -25},
|
||||
{12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6},
|
||||
{3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1},
|
||||
},
|
||||
{
|
||||
{-1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2},
|
||||
{-5, 9, -5, 9, -5, 9, -5, 9, -5, 9, -5, 9, -5, 9, -5, 9},
|
||||
{-18, 49, -18, 49, -18, 49, -18, 49, -18, 49, -18, 49, -18, 49, -18, 49},
|
||||
{107, -23, 107, -23, 107, -23, 107, -23, 107, -23, 107, -23, 107,
|
||||
-23, 107, -23},
|
||||
{11, -5, 11, -5, 11, -5, 11, -5, 11, -5, 11, -5, 11, -5, 11, -5},
|
||||
{3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1},
|
||||
},
|
||||
{
|
||||
{-1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2},
|
||||
{-4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8},
|
||||
{-15, 38, -15, 38, -15, 38, -15, 38, -15, 38, -15, 38, -15, 38, -15, 38},
|
||||
{114, -21, 114, -21, 114, -21, 114, -21, 114, -21, 114, -21,
|
||||
114, -21, 114, -21},
|
||||
{10, -4, 10, -4, 10, -4, 10, -4, 10, -4, 10, -4, 10, -4, 10, -4},
|
||||
{2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1},
|
||||
},
|
||||
{
|
||||
{-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1},
|
||||
{-3, 6, -3, 6, -3, 6, -3, 6, -3, 6, -3, 6, -3, 6, -3, 6},
|
||||
{-11, 28, -11, 28, -11, 28, -11, 28, -11, 28, -11, 28, -11, 28, -11, 28},
|
||||
{120, -17, 120, -17, 120, -17, 120, -17, 120, -17, 120, -17,
|
||||
120, -17, 120, -17},
|
||||
{8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4},
|
||||
{2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1},
|
||||
},
|
||||
{
|
||||
{0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1},
|
||||
{-2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4},
|
||||
{-8, 18, -8, 18, -8, 18, -8, 18, -8, 18, -8, 18, -8, 18, -8, 18},
|
||||
{124, -12, 124, -12, 124, -12, 124, -12, 124, -12, 124, -12,
|
||||
124, -12, 124, -12},
|
||||
{5, -3, 5, -3, 5, -3, 5, -3, 5, -3, 5, -3, 5, -3, 5, -3},
|
||||
{1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0},
|
||||
},
|
||||
{
|
||||
{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
|
||||
{-1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2},
|
||||
{-4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8},
|
||||
{127, -7, 127, -7, 127, -7, 127, -7, 127, -7, 127, -7,
|
||||
127, -7, 127, -7},
|
||||
{3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1},
|
||||
{1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0},
|
||||
},
|
||||
};
|
||||
#endif // USE_TEMPORALFILTER_12TAP
|
||||
|
||||
#if CONFIG_EXT_INTERP
|
||||
DECLARE_ALIGNED(16, const int8_t,
|
||||
sub_pel_filters_12sharp_ver_signal_dir[15][6][16]) = {
|
||||
{
|
||||
{0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1},
|
||||
{-2, 3, -2, 3, -2, 3, -2, 3, -2, 3, -2, 3, -2, 3, -2, 3},
|
||||
{-7, 127, -7, 127, -7, 127, -7, 127, -7, 127, -7, 127,
|
||||
-7, 127, -7, 127},
|
||||
{8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4},
|
||||
{2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1},
|
||||
{1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0},
|
||||
},
|
||||
{
|
||||
{-1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2},
|
||||
{-3, 6, -3, 6, -3, 6, -3, 6, -3, 6, -3, 6, -3, 6, -3, 6},
|
||||
{-13, 124, -13, 124, -13, 124, -13, 124, -13, 124, -13, 124,
|
||||
-13, 124, -13, 124},
|
||||
{18, -8, 18, -8, 18, -8, 18, -8, 18, -8, 18, -8, 18, -8, 18, -8},
|
||||
{4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2},
|
||||
{2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1},
|
||||
},
|
||||
{
|
||||
{-1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3},
|
||||
{-4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8},
|
||||
{-18, 120, -18, 120, -18, 120, -18, 120, -18, 120, -18, 120,
|
||||
-18, 120, -18, 120},
|
||||
{28, -12, 28, -12, 28, -12, 28, -12, 28, -12, 28, -12, 28, -12, 28, -12},
|
||||
{7, -4, 7, -4, 7, -4, 7, -4, 7, -4, 7, -4, 7, -4, 7, -4},
|
||||
{2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1},
|
||||
},
|
||||
{
|
||||
{-1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3},
|
||||
{-6, 10, -6, 10, -6, 10, -6, 10, -6, 10, -6, 10, -6, 10, -6, 10},
|
||||
{-21, 115, -21, 115, -21, 115, -21, 115, -21, 115, -21, 115,
|
||||
-21, 115, -21, 115},
|
||||
{38, -15, 38, -15, 38, -15, 38, -15, 38, -15, 38, -15,
|
||||
38, -15, 38, -15},
|
||||
{8, -5, 8, -5, 8, -5, 8, -5, 8, -5, 8, -5, 8, -5, 8, -5},
|
||||
{3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1},
|
||||
},
|
||||
{
|
||||
{-2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4},
|
||||
{-6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12},
|
||||
{-24, 108, -24, 108, -24, 108, -24, 108, -24, 108, -24, 108,
|
||||
-24, 108, -24, 108},
|
||||
{49, -18, 49, -18, 49, -18, 49, -18, 49, -18, 49, -18, 49, -18, 49, -18},
|
||||
{10, -6, 10, -6, 10, -6, 10, -6, 10, -6, 10, -6, 10, -6, 10, -6},
|
||||
{3, -2, 3, -2, 3, -2, 3, -2, 3, -2, 3, -2, 3, -2, 3, -2},
|
||||
},
|
||||
{
|
||||
{-2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4},
|
||||
{-7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13},
|
||||
{-25, 100, -25, 100, -25, 100, -25, 100, -25, 100, -25, 100,
|
||||
-25, 100, -25, 100},
|
||||
{60, -21, 60, -21, 60, -21, 60, -21, 60, -21, 60, -21, 60, -21, 60, -21},
|
||||
{11, -7, 11, -7, 11, -7, 11, -7, 11, -7, 11, -7, 11, -7, 11, -7},
|
||||
{4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2},
|
||||
},
|
||||
{
|
||||
{-2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4},
|
||||
{-7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13},
|
||||
{-26, 91, -26, 91, -26, 91, -26, 91, -26, 91, -26, 91, -26, 91, -26, 91},
|
||||
{71, -24, 71, -24, 71, -24, 71, -24, 71, -24, 71, -24, 71, -24, 71, -24},
|
||||
{13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7},
|
||||
{4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2},
|
||||
},
|
||||
{
|
||||
{-2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4},
|
||||
{-7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13},
|
||||
{-25, 81, -25, 81, -25, 81, -25, 81, -25, 81, -25, 81, -25, 81, -25, 81},
|
||||
{81, -25, 81, -25, 81, -25, 81, -25, 81, -25, 81, -25, 81, -25, 81, -25},
|
||||
{13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7},
|
||||
{4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2},
|
||||
},
|
||||
{
|
||||
{-2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4},
|
||||
{-7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13},
|
||||
{-24, 71, -24, 71, -24, 71, -24, 71, -24, 71, -24, 71, -24, 71, -24, 71},
|
||||
{91, -26, 91, -26, 91, -26, 91, -26, 91, -26, 91, -26, 91, -26, 91, -26},
|
||||
{13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7},
|
||||
{4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2},
|
||||
},
|
||||
{
|
||||
{-2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4},
|
||||
{-7, 11, -7, 11, -7, 11, -7, 11, -7, 11, -7, 11, -7, 11, -7, 11},
|
||||
{-21, 60, -21, 60, -21, 60, -21, 60, -21, 60, -21, 60, -21, 60, -21, 60},
|
||||
{100, -25, 100, -25, 100, -25, 100, -25, 100, -25, 100, -25,
|
||||
100, -25, 100, -25},
|
||||
{13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7},
|
||||
{4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2},
|
||||
},
|
||||
{
|
||||
{-2, 3, -2, 3, -2, 3, -2, 3, -2, 3, -2, 3, -2, 3, -2, 3},
|
||||
{-6, 10, -6, 10, -6, 10, -6, 10, -6, 10, -6, 10, -6, 10, -6, 10},
|
||||
{-18, 49, -18, 49, -18, 49, -18, 49, -18, 49, -18, 49, -18, 49, -18, 49},
|
||||
{108, -24, 108, -24, 108, -24, 108, -24, 108, -24, 108, -24,
|
||||
108, -24, 108, -24},
|
||||
{12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6},
|
||||
{4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2},
|
||||
},
|
||||
{
|
||||
{-1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3},
|
||||
{-5, 8, -5, 8, -5, 8, -5, 8, -5, 8, -5, 8, -5, 8, -5, 8},
|
||||
{-15, 38, -15, 38, -15, 38, -15, 38, -15, 38, -15, 38, -15, 38, -15, 38},
|
||||
{115, -21, 115, -21, 115, -21, 115, -21, 115, -21, 115, -21,
|
||||
115, -21, 115, -21},
|
||||
{10, -6, 10, -6, 10, -6, 10, -6, 10, -6, 10, -6, 10, -6, 10, -6},
|
||||
{3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1},
|
||||
},
|
||||
{
|
||||
{-1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2},
|
||||
{-4, 7, -4, 7, -4, 7, -4, 7, -4, 7, -4, 7, -4, 7, -4, 7},
|
||||
{-12, 28, -12, 28, -12, 28, -12, 28, -12, 28, -12, 28, -12, 28, -12, 28},
|
||||
{120, -18, 120, -18, 120, -18, 120, -18, 120, -18, 120, -18,
|
||||
120, -18, 120, -18},
|
||||
{8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4},
|
||||
{3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1},
|
||||
},
|
||||
{
|
||||
{-1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2},
|
||||
{-2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4},
|
||||
{-8, 18, -8, 18, -8, 18, -8, 18, -8, 18, -8, 18, -8, 18, -8, 18},
|
||||
{124, -13, 124, -13, 124, -13, 124, -13, 124, -13, 124, -13,
|
||||
124, -13, 124, -13},
|
||||
{6, -3, 6, -3, 6, -3, 6, -3, 6, -3, 6, -3, 6, -3, 6, -3},
|
||||
{2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1},
|
||||
},
|
||||
{
|
||||
{0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1},
|
||||
{-1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2},
|
||||
{-4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8},
|
||||
{127, -7, 127, -7, 127, -7, 127, -7, 127, -7, 127, -7,
|
||||
127, -7, 127, -7},
|
||||
{3, -2, 3, -2, 3, -2, 3, -2, 3, -2, 3, -2, 3, -2, 3, -2},
|
||||
{1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0},
|
||||
},
|
||||
};
|
||||
|
||||
DECLARE_ALIGNED(16, const int8_t,
|
||||
sub_pel_filters_10sharp_ver_signal_dir[15][6][16]) = {
|
||||
{
|
||||
{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
|
||||
{-1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3},
|
||||
{-6, 127, -6, 127, -6, 127, -6, 127, -6, 127, -6, 127, -6, 127, -6, 127},
|
||||
{8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4},
|
||||
{2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1},
|
||||
{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
|
||||
},
|
||||
{
|
||||
{0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1},
|
||||
{-2, 5, -2, 5, -2, 5, -2, 5, -2, 5, -2, 5, -2, 5, -2, 5},
|
||||
{-12, 124, -12, 124, -12, 124, -12, 124, -12, 124, -12, 124,
|
||||
-12, 124, -12, 124},
|
||||
{18, -7, 18, -7, 18, -7, 18, -7, 18, -7, 18, -7, 18, -7, 18, -7},
|
||||
{3, -2, 3, -2, 3, -2, 3, -2, 3, -2, 3, -2, 3, -2, 3, -2},
|
||||
{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
|
||||
},
|
||||
{
|
||||
{0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1},
|
||||
{-3, 7, -3, 7, -3, 7, -3, 7, -3, 7, -3, 7, -3, 7, -3, 7},
|
||||
{-17, 119, -17, 119, -17, 119, -17, 119, -17, 119, -17, 119,
|
||||
-17, 119, -17, 119},
|
||||
{28, -11, 28, -11, 28, -11, 28, -11, 28, -11, 28, -11, 28, -11, 28, -11},
|
||||
{5, -2, 5, -2, 5, -2, 5, -2, 5, -2, 5, -2, 5, -2, 5, -2},
|
||||
{1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0},
|
||||
},
|
||||
{
|
||||
{0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1},
|
||||
{-4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8},
|
||||
{-20, 114, -20, 114, -20, 114, -20, 114, -20, 114, -20, 114,
|
||||
-20, 114, -20, 114},
|
||||
{38, -14, 38, -14, 38, -14, 38, -14, 38, -14, 38, -14, 38, -14, 38, -14},
|
||||
{7, -3, 7, -3, 7, -3, 7, -3, 7, -3, 7, -3, 7, -3, 7, -3},
|
||||
{1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0},
|
||||
},
|
||||
{
|
||||
{0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1},
|
||||
{-4, 9, -4, 9, -4, 9, -4, 9, -4, 9, -4, 9, -4, 9, -4, 9},
|
||||
{-22, 107, -22, 107, -22, 107, -22, 107, -22, 107, -22, 107,
|
||||
-22, 107, -22, 107},
|
||||
{49, -17, 49, -17, 49, -17, 49, -17, 49, -17, 49, -17, 49, -17, 49, -17},
|
||||
{8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4},
|
||||
{1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0},
|
||||
},
|
||||
{
|
||||
{0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2},
|
||||
{-5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10},
|
||||
{-24, 99, -24, 99, -24, 99, -24, 99, -24, 99, -24, 99, -24, 99, -24, 99},
|
||||
{59, -20, 59, -20, 59, -20, 59, -20, 59, -20, 59, -20, 59, -20, 59, -20},
|
||||
{9, -4, 9, -4, 9, -4, 9, -4, 9, -4, 9, -4, 9, -4, 9, -4},
|
||||
{2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0},
|
||||
},
|
||||
{
|
||||
{0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2},
|
||||
{-5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10},
|
||||
{-24, 90, -24, 90, -24, 90, -24, 90, -24, 90, -24, 90, -24, 90, -24, 90},
|
||||
{70, -22, 70, -22, 70, -22, 70, -22, 70, -22, 70, -22, 70, -22, 70, -22},
|
||||
{10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5},
|
||||
{2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0},
|
||||
},
|
||||
{
|
||||
{0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2},
|
||||
{-5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10},
|
||||
{-23, 80, -23, 80, -23, 80, -23, 80, -23, 80, -23, 80, -23, 80, -23, 80},
|
||||
{80, -23, 80, -23, 80, -23, 80, -23, 80, -23, 80, -23, 80, -23, 80, -23},
|
||||
{10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5},
|
||||
{2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0},
|
||||
},
|
||||
{
|
||||
{0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2},
|
||||
{-5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10},
|
||||
{-22, 70, -22, 70, -22, 70, -22, 70, -22, 70, -22, 70, -22, 70, -22, 70},
|
||||
{90, -24, 90, -24, 90, -24, 90, -24, 90, -24, 90, -24, 90, -24, 90, -24},
|
||||
{10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5},
|
||||
{2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0},
|
||||
},
|
||||
{
|
||||
{0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2},
|
||||
{-4, 9, -4, 9, -4, 9, -4, 9, -4, 9, -4, 9, -4, 9, -4, 9},
|
||||
{-20, 59, -20, 59, -20, 59, -20, 59, -20, 59, -20, 59, -20, 59, -20, 59},
|
||||
{99, -24, 99, -24, 99, -24, 99, -24, 99, -24, 99, -24, 99, -24, 99, -24},
|
||||
{10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5},
|
||||
{2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0},
|
||||
},
|
||||
{
|
||||
{0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1},
|
||||
{-4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8},
|
||||
{-17, 49, -17, 49, -17, 49, -17, 49, -17, 49, -17, 49, -17, 49, -17, 49},
|
||||
{107, -22, 107, -22, 107, -22, 107, -22, 107, -22, 107, -22,
|
||||
107, -22, 107, -22},
|
||||
{9, -4, 9, -4, 9, -4, 9, -4, 9, -4, 9, -4, 9, -4, 9, -4},
|
||||
{1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0},
|
||||
},
|
||||
{
|
||||
{0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1},
|
||||
{-3, 7, -3, 7, -3, 7, -3, 7, -3, 7, -3, 7, -3, 7, -3, 7},
|
||||
{-14, 38, -14, 38, -14, 38, -14, 38, -14, 38, -14, 38, -14, 38, -14, 38},
|
||||
{114, -20, 114, -20, 114, -20, 114, -20, 114, -20, 114, -20,
|
||||
114, -20, 114, -20},
|
||||
{8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4},
|
||||
{1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0},
|
||||
},
|
||||
{
|
||||
{0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1},
|
||||
{-2, 5, -2, 5, -2, 5, -2, 5, -2, 5, -2, 5, -2, 5, -2, 5},
|
||||
{-11, 28, -11, 28, -11, 28, -11, 28, -11, 28, -11, 28, -11, 28, -11, 28},
|
||||
{119, -17, 119, -17, 119, -17, 119, -17, 119, -17, 119, -17,
|
||||
119, -17, 119, -17},
|
||||
{7, -3, 7, -3, 7, -3, 7, -3, 7, -3, 7, -3, 7, -3, 7, -3},
|
||||
{1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0},
|
||||
},
|
||||
{
|
||||
{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
|
||||
{-2, 3, -2, 3, -2, 3, -2, 3, -2, 3, -2, 3, -2, 3, -2, 3},
|
||||
{-7, 18, -7, 18, -7, 18, -7, 18, -7, 18, -7, 18, -7, 18, -7, 18},
|
||||
{124, -12, 124, -12, 124, -12, 124, -12, 124, -12, 124, -12,
|
||||
124, -12, 124, -12},
|
||||
{5, -2, 5, -2, 5, -2, 5, -2, 5, -2, 5, -2, 5, -2, 5, -2},
|
||||
{1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0},
|
||||
},
|
||||
{
|
||||
{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
|
||||
{-1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2},
|
||||
{-4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8},
|
||||
{127, -6, 127, -6, 127, -6, 127, -6, 127, -6, 127, -6,
|
||||
127, -6, 127, -6},
|
||||
{3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1},
|
||||
{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
|
||||
},
|
||||
};
|
||||
#endif // CONFIG_EXT_INTERP
|
716
vp10/common/x86/vp10_convolve_ssse3.c
Normal file
716
vp10/common/x86/vp10_convolve_ssse3.c
Normal file
@@ -0,0 +1,716 @@
|
||||
/*
|
||||
* Copyright (c) 2016 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include <assert.h>
|
||||
#include <tmmintrin.h>
|
||||
|
||||
#include "./vp10_rtcd.h"
|
||||
#include "vp10/common/filter.h"
|
||||
|
||||
// Block-dimension thresholds (in pixels).  Their use is outside this excerpt;
// NOTE(review): presumably blocks narrower/shorter than these bounds bypass
// the SSSE3 path -- confirm at the point of use.
#define WIDTH_BOUND (16)
|
||||
#define HEIGHT_BOUND (16)
|
||||
|
||||
// Transposes four vectors of eight 16-bit lanes.
//
// After the call, the low four lanes of out[k] (k = 0..5) hold lane k of
// in[0], in[1], in[2] and in[3], in that order.  For even k the upper four
// lanes additionally carry the lane (k + 1) column; for odd k they are zero.
// out[6] and out[7] are never written because the caller does not use them.
static INLINE void transpose_4x8(const __m128i *in, __m128i *out) {
  // Interleave lanes 0..3 of the row pairs (0,1) and (2,3).
  const __m128i lo01 = _mm_unpacklo_epi16(in[0], in[1]);
  const __m128i lo23 = _mm_unpacklo_epi16(in[2], in[3]);

  out[0] = _mm_unpacklo_epi32(lo01, lo23);
  out[1] = _mm_srli_si128(out[0], 8);
  out[2] = _mm_unpackhi_epi32(lo01, lo23);
  out[3] = _mm_srli_si128(out[2], 8);

  {
    // Same again for lanes 4..7; only columns 4 and 5 are produced.
    const __m128i hi01 = _mm_unpackhi_epi16(in[0], in[1]);
    const __m128i hi23 = _mm_unpackhi_epi16(in[2], in[3]);

    out[4] = _mm_unpacklo_epi32(hi01, hi23);
    out[5] = _mm_srli_si128(out[4], 8);
  }
}
|
||||
|
||||
// Callback that writes filtered pixels held in `x` (16-bit lanes) to `dst`.
// `src` points at the pixels to blend with; the plain-store variant below
// ignores it, the accumulate variant reads from it.
typedef void (*store_pixel_t)(__m128i x, uint8_t *src, uint8_t *dst);
|
||||
|
||||
// Plain store: saturates the low four 16-bit lanes of `x` to unsigned bytes
// and writes those 4 bytes to `dst`.  `src` is part of the store_pixel_t
// signature but is not used here.
static INLINE void store_4_pixel_only(__m128i x, uint8_t *src, uint8_t *dst) {
  const __m128i bytes = _mm_packus_epi16(x, x);
  (void)src;
  *(int *)dst = _mm_cvtsi128_si32(bytes);
}
|
||||
|
||||
// Averages filtered pixels with the pixels already stored at `src`.
//
// x:   filtered results, one per 16-bit lane; may lie outside [0, 255].
// src: existing pixels; 8 bytes are loaded from this address.
//
// Returns (clamp(x) + src + 1) >> 1 packed to unsigned bytes (the low 8 bytes
// of the result; the upper 8 bytes are a copy).
static INLINE __m128i accumulate_store(__m128i x, uint8_t *src) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i one = _mm_set1_epi16(1);
  __m128i y = _mm_loadl_epi64((__m128i const *)src);

  // Clamp the filter output to the 8-bit pixel range *before* averaging.
  // Averaging an unclamped overshoot/undershoot gives a different result than
  // averaging the clamped pixel, and the 4x4/8x8 accumulate-transpose helpers
  // in this file also pack to 8 bits first.
  x = _mm_packus_epi16(x, x);
  x = _mm_unpacklo_epi8(x, zero);

  y = _mm_unpacklo_epi8(y, zero);
  y = _mm_add_epi16(x, y);
  y = _mm_add_epi16(y, one);  // round to nearest when halving
  y = _mm_srai_epi16(y, 1);
  y = _mm_packus_epi16(y, y);
  return y;
}
|
||||
|
||||
// Accumulating store: averages the filtered lanes in `x` with the pixels at
// `src` (see accumulate_store) and writes the first 4 resulting bytes to
// `dst`.
static INLINE void accumulate_store_4_pixel(__m128i x, uint8_t *src,
                                            uint8_t *dst) {
  *(int *)dst = _mm_cvtsi128_si32(accumulate_store(x, src));
}
|
||||
|
||||
// 4-pixel store callbacks: [0] overwrites the destination, [1] averages with
// the existing pixels.  NOTE(review): presumably indexed by the caller's
// `avg` flag -- confirm at the call site.
static store_pixel_t store4pixelTab[2] = {
|
||||
store_4_pixel_only, accumulate_store_4_pixel};
|
||||
|
||||
// Filters one row and produces 4 horizontally adjacent output pixels.
//
// src:        first input pixel; 16 bytes are loaded from src and from
//             src + 1 (both one byte earlier when tapsNum is 10).
// f:          two coefficient vectors.  NOTE(review): their exact layout is
//             built outside this excerpt -- presumably the "horizontal signal
//             direction" form of the filter; confirm against the caller.
// tapsNum:    filter length; 10 shifts the read window left by one pixel.
// store_func: callback that writes the 4 results.
// dst, buf:   forwarded to store_func as its (src, dst) arguments, i.e. `dst`
//             is what the callback may read from and `buf` is where it
//             writes.
void horiz_w4_ssse3(const uint8_t *src, const __m128i *f,
                    int tapsNum, store_pixel_t store_func, uint8_t *dst,
                    uint8_t *buf) {
  // _mm_mulhrs_epi16(v, 1 << 8) evaluates (v + 64) >> 7.
  const __m128i round_shift7 = _mm_set1_epi16(1 << 8);
  const uint8_t *const in = (10 == tapsNum) ? src - 1 : src;
  __m128i partial[4];
  __m128i column[8];
  __m128i px, acc, lo, hi;

  // Rows 0 and 2 use the window at `in`, rows 1 and 3 the window at `in + 1`.
  // The f[1] products are shifted down by one 16-bit lane.
  px = _mm_loadu_si128((__m128i const *)in);
  partial[0] = _mm_maddubs_epi16(px, f[0]);
  partial[2] = _mm_srli_si128(_mm_maddubs_epi16(px, f[1]), 2);

  px = _mm_loadu_si128((__m128i const *)(in + 1));
  partial[1] = _mm_maddubs_epi16(px, f[0]);
  partial[3] = _mm_srli_si128(_mm_maddubs_epi16(px, f[1]), 2);

  // Turn per-row pair sums into per-pair columns so that the six pair sums
  // of each output pixel can be added lane-wise.
  transpose_4x8(partial, column);

  acc = _mm_adds_epi16(_mm_adds_epi16(column[0], column[1]),
                       _mm_adds_epi16(column[4], column[5]));

  // Columns 2 and 3 are added last, smaller one first.  Saturating adds are
  // order sensitive, so this sequence must be kept as is.
  lo = _mm_min_epi16(column[2], column[3]);
  hi = _mm_max_epi16(column[2], column[3]);
  acc = _mm_adds_epi16(acc, lo);
  acc = _mm_adds_epi16(acc, hi);

  store_func(_mm_mulhrs_epi16(acc, round_shift7), dst, buf);
}
|
||||
|
||||
void horiz_w8_ssse3(const uint8_t *src, const __m128i *f, int tapsNum,
|
||||
store_pixel_t store, uint8_t *dst, uint8_t *buf) {
|
||||
horiz_w4_ssse3(src, f, tapsNum, store, dst, buf);
|
||||
src += 4;
|
||||
buf += 4;
|
||||
horiz_w4_ssse3(src, f, tapsNum, store, dst, buf);
|
||||
}
|
||||
|
||||
void horiz_w16_ssse3(const uint8_t *src, const __m128i *f, int tapsNum,
|
||||
store_pixel_t store, uint8_t *dst, uint8_t *buf) {
|
||||
horiz_w8_ssse3(src, f, tapsNum, store, dst, buf);
|
||||
src += 8;
|
||||
buf += 8;
|
||||
horiz_w8_ssse3(src, f, tapsNum, store, dst, buf);
|
||||
}
|
||||
|
||||
void horiz_w32_ssse3(const uint8_t *src, const __m128i *f, int tapsNum,
|
||||
store_pixel_t store, uint8_t *dst, uint8_t *buf) {
|
||||
horiz_w16_ssse3(src, f, tapsNum, store, dst, buf);
|
||||
src += 16;
|
||||
buf += 16;
|
||||
horiz_w16_ssse3(src, f, tapsNum, store, dst, buf);
|
||||
}
|
||||
|
||||
void horiz_w64_ssse3(const uint8_t *src, const __m128i *f, int tapsNum,
|
||||
store_pixel_t store, uint8_t *dst, uint8_t *buf) {
|
||||
horiz_w32_ssse3(src, f, tapsNum, store, dst, buf);
|
||||
src += 32;
|
||||
buf += 32;
|
||||
horiz_w32_ssse3(src, f, tapsNum, store, dst, buf);
|
||||
}
|
||||
|
||||
void horiz_w128_ssse3(const uint8_t *src, const __m128i *f, int tapsNum,
|
||||
store_pixel_t store, uint8_t *dst, uint8_t *buf) {
|
||||
horiz_w64_ssse3(src, f, tapsNum, store, dst, buf);
|
||||
src += 64;
|
||||
buf += 64;
|
||||
horiz_w64_ssse3(src, f, tapsNum, store, dst, buf);
|
||||
}
|
||||
|
||||
// Single-row horizontal filter kernels ordered by output width; entry i
// handles a row of (4 << i) pixels.  filter_horiz_ssse3() selects the entry.
static void (*horizTab[6])(const uint8_t *, const __m128i *, int,
|
||||
store_pixel_t, uint8_t *, uint8_t *) = {
|
||||
horiz_w4_ssse3,  // [0]: 4 pixels (also used for width 2)
|
||||
horiz_w8_ssse3,  // [1]: 8 pixels
|
||||
horiz_w16_ssse3,  // [2]: 16 pixels
|
||||
horiz_w32_ssse3,  // [3]: 32 pixels
|
||||
horiz_w64_ssse3,  // [4]: 64 pixels
|
||||
horiz_w128_ssse3,  // [5]: 128 pixels
|
||||
};
|
||||
|
||||
void filter_horiz_ssse3(const uint8_t *src, __m128i *f, int tapsNum, int width,
|
||||
store_pixel_t store, uint8_t *dst, uint8_t *buffer) {
|
||||
switch (width) {
|
||||
case 2:
|
||||
case 4:
|
||||
horizTab[0](src, f, tapsNum, store, dst, buffer);
|
||||
break;
|
||||
case 8:
|
||||
horizTab[1](src, f, tapsNum, store, dst, buffer);
|
||||
break;
|
||||
case 16:
|
||||
horizTab[2](src, f, tapsNum, store, dst, buffer);
|
||||
break;
|
||||
case 32:
|
||||
horizTab[3](src, f, tapsNum, store, dst, buffer);
|
||||
break;
|
||||
case 64:
|
||||
horizTab[4](src, f, tapsNum, store, dst, buffer);
|
||||
break;
|
||||
case 128:
|
||||
horizTab[5](src, f, tapsNum, store, dst, buffer);
|
||||
break;
|
||||
default:
|
||||
assert(0);
|
||||
}
|
||||
}
|
||||
|
||||
// Vertical 8-pixel parallel
|
||||
// Callback that reads a square block of 16-bit intermediate sums from `src`
// (row pitch `src_stride`, in elements), rounds and transposes it, and writes
// 8-bit pixels to `dst` (row pitch `dst_stride`, in bytes).
typedef void (*transpose_to_dst_t)(const uint16_t *src, int src_stride,
|
||||
uint8_t *dst, int dst_stride);
|
||||
|
||||
static INLINE void transpose8x8_direct_to_dst(const uint16_t *src,
|
||||
int src_stride,
|
||||
uint8_t *dst,
|
||||
int dst_stride) {
|
||||
const __m128i k_256 = _mm_set1_epi16(1 << 8);
|
||||
__m128i v0, v1, v2, v3;
|
||||
|
||||
__m128i u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride));
|
||||
__m128i u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride));
|
||||
__m128i u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride));
|
||||
__m128i u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride));
|
||||
__m128i u4 = _mm_loadu_si128((__m128i const *)(src + 4 * src_stride));
|
||||
__m128i u5 = _mm_loadu_si128((__m128i const *)(src + 5 * src_stride));
|
||||
__m128i u6 = _mm_loadu_si128((__m128i const *)(src + 6 * src_stride));
|
||||
__m128i u7 = _mm_loadu_si128((__m128i const *)(src + 7 * src_stride));
|
||||
|
||||
u0 = _mm_mulhrs_epi16(u0, k_256);
|
||||
u1 = _mm_mulhrs_epi16(u1, k_256);
|
||||
u2 = _mm_mulhrs_epi16(u2, k_256);
|
||||
u3 = _mm_mulhrs_epi16(u3, k_256);
|
||||
u4 = _mm_mulhrs_epi16(u4, k_256);
|
||||
u5 = _mm_mulhrs_epi16(u5, k_256);
|
||||
u6 = _mm_mulhrs_epi16(u6, k_256);
|
||||
u7 = _mm_mulhrs_epi16(u7, k_256);
|
||||
|
||||
v0 = _mm_packus_epi16(u0, u1);
|
||||
v1 = _mm_packus_epi16(u2, u3);
|
||||
v2 = _mm_packus_epi16(u4, u5);
|
||||
v3 = _mm_packus_epi16(u6, u7);
|
||||
|
||||
u0 = _mm_unpacklo_epi8(v0, v1);
|
||||
u1 = _mm_unpackhi_epi8(v0, v1);
|
||||
u2 = _mm_unpacklo_epi8(v2, v3);
|
||||
u3 = _mm_unpackhi_epi8(v2, v3);
|
||||
|
||||
u4 = _mm_unpacklo_epi8(u0, u1);
|
||||
u5 = _mm_unpacklo_epi8(u2, u3);
|
||||
u6 = _mm_unpackhi_epi8(u0, u1);
|
||||
u7 = _mm_unpackhi_epi8(u2, u3);
|
||||
|
||||
u0 = _mm_unpacklo_epi32(u4, u5);
|
||||
u1 = _mm_unpackhi_epi32(u4, u5);
|
||||
u2 = _mm_unpacklo_epi32(u6, u7);
|
||||
u3 = _mm_unpackhi_epi32(u6, u7);
|
||||
|
||||
u4 = _mm_srli_si128(u0, 8);
|
||||
u5 = _mm_srli_si128(u1, 8);
|
||||
u6 = _mm_srli_si128(u2, 8);
|
||||
u7 = _mm_srli_si128(u3, 8);
|
||||
|
||||
_mm_storel_epi64((__m128i*)dst, u0);
|
||||
_mm_storel_epi64((__m128i*)(dst + dst_stride * 1), u4);
|
||||
_mm_storel_epi64((__m128i*)(dst + dst_stride * 2), u1);
|
||||
_mm_storel_epi64((__m128i*)(dst + dst_stride * 3), u5);
|
||||
_mm_storel_epi64((__m128i*)(dst + dst_stride * 4), u2);
|
||||
_mm_storel_epi64((__m128i*)(dst + dst_stride * 5), u6);
|
||||
_mm_storel_epi64((__m128i*)(dst + dst_stride * 6), u3);
|
||||
_mm_storel_epi64((__m128i*)(dst + dst_stride * 7), u7);
|
||||
}
|
||||
|
||||
static INLINE void transpose8x8_accumu_to_dst(const uint16_t *src,
|
||||
int src_stride,
|
||||
uint8_t *dst,
|
||||
int dst_stride) {
|
||||
const __m128i k_256 = _mm_set1_epi16(1 << 8);
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
const __m128i one = _mm_set1_epi16(1);
|
||||
__m128i v0, v1, v2, v3, v4, v5, v6, v7;
|
||||
|
||||
__m128i u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride));
|
||||
__m128i u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride));
|
||||
__m128i u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride));
|
||||
__m128i u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride));
|
||||
__m128i u4 = _mm_loadu_si128((__m128i const *)(src + 4 * src_stride));
|
||||
__m128i u5 = _mm_loadu_si128((__m128i const *)(src + 5 * src_stride));
|
||||
__m128i u6 = _mm_loadu_si128((__m128i const *)(src + 6 * src_stride));
|
||||
__m128i u7 = _mm_loadu_si128((__m128i const *)(src + 7 * src_stride));
|
||||
|
||||
u0 = _mm_mulhrs_epi16(u0, k_256);
|
||||
u1 = _mm_mulhrs_epi16(u1, k_256);
|
||||
u2 = _mm_mulhrs_epi16(u2, k_256);
|
||||
u3 = _mm_mulhrs_epi16(u3, k_256);
|
||||
u4 = _mm_mulhrs_epi16(u4, k_256);
|
||||
u5 = _mm_mulhrs_epi16(u5, k_256);
|
||||
u6 = _mm_mulhrs_epi16(u6, k_256);
|
||||
u7 = _mm_mulhrs_epi16(u7, k_256);
|
||||
|
||||
v0 = _mm_packus_epi16(u0, u1);
|
||||
v1 = _mm_packus_epi16(u2, u3);
|
||||
v2 = _mm_packus_epi16(u4, u5);
|
||||
v3 = _mm_packus_epi16(u6, u7);
|
||||
|
||||
u0 = _mm_unpacklo_epi8(v0, v1);
|
||||
u1 = _mm_unpackhi_epi8(v0, v1);
|
||||
u2 = _mm_unpacklo_epi8(v2, v3);
|
||||
u3 = _mm_unpackhi_epi8(v2, v3);
|
||||
|
||||
u4 = _mm_unpacklo_epi8(u0, u1);
|
||||
u5 = _mm_unpacklo_epi8(u2, u3);
|
||||
u6 = _mm_unpackhi_epi8(u0, u1);
|
||||
u7 = _mm_unpackhi_epi8(u2, u3);
|
||||
|
||||
u0 = _mm_unpacklo_epi32(u4, u5);
|
||||
u1 = _mm_unpackhi_epi32(u4, u5);
|
||||
u2 = _mm_unpacklo_epi32(u6, u7);
|
||||
u3 = _mm_unpackhi_epi32(u6, u7);
|
||||
|
||||
u4 = _mm_srli_si128(u0, 8);
|
||||
u5 = _mm_srli_si128(u1, 8);
|
||||
u6 = _mm_srli_si128(u2, 8);
|
||||
u7 = _mm_srli_si128(u3, 8);
|
||||
|
||||
v0 = _mm_loadl_epi64((__m128i const *)(dst + 0 * dst_stride));
|
||||
v1 = _mm_loadl_epi64((__m128i const *)(dst + 1 * dst_stride));
|
||||
v2 = _mm_loadl_epi64((__m128i const *)(dst + 2 * dst_stride));
|
||||
v3 = _mm_loadl_epi64((__m128i const *)(dst + 3 * dst_stride));
|
||||
v4 = _mm_loadl_epi64((__m128i const *)(dst + 4 * dst_stride));
|
||||
v5 = _mm_loadl_epi64((__m128i const *)(dst + 5 * dst_stride));
|
||||
v6 = _mm_loadl_epi64((__m128i const *)(dst + 6 * dst_stride));
|
||||
v7 = _mm_loadl_epi64((__m128i const *)(dst + 7 * dst_stride));
|
||||
|
||||
u0 = _mm_unpacklo_epi8(u0, zero);
|
||||
u1 = _mm_unpacklo_epi8(u1, zero);
|
||||
u2 = _mm_unpacklo_epi8(u2, zero);
|
||||
u3 = _mm_unpacklo_epi8(u3, zero);
|
||||
u4 = _mm_unpacklo_epi8(u4, zero);
|
||||
u5 = _mm_unpacklo_epi8(u5, zero);
|
||||
u6 = _mm_unpacklo_epi8(u6, zero);
|
||||
u7 = _mm_unpacklo_epi8(u7, zero);
|
||||
|
||||
v0 = _mm_unpacklo_epi8(v0, zero);
|
||||
v1 = _mm_unpacklo_epi8(v1, zero);
|
||||
v2 = _mm_unpacklo_epi8(v2, zero);
|
||||
v3 = _mm_unpacklo_epi8(v3, zero);
|
||||
v4 = _mm_unpacklo_epi8(v4, zero);
|
||||
v5 = _mm_unpacklo_epi8(v5, zero);
|
||||
v6 = _mm_unpacklo_epi8(v6, zero);
|
||||
v7 = _mm_unpacklo_epi8(v7, zero);
|
||||
|
||||
v0 = _mm_adds_epi16(u0, v0);
|
||||
v1 = _mm_adds_epi16(u4, v1);
|
||||
v2 = _mm_adds_epi16(u1, v2);
|
||||
v3 = _mm_adds_epi16(u5, v3);
|
||||
v4 = _mm_adds_epi16(u2, v4);
|
||||
v5 = _mm_adds_epi16(u6, v5);
|
||||
v6 = _mm_adds_epi16(u3, v6);
|
||||
v7 = _mm_adds_epi16(u7, v7);
|
||||
|
||||
v0 = _mm_adds_epi16(v0, one);
|
||||
v1 = _mm_adds_epi16(v1, one);
|
||||
v2 = _mm_adds_epi16(v2, one);
|
||||
v3 = _mm_adds_epi16(v3, one);
|
||||
v4 = _mm_adds_epi16(v4, one);
|
||||
v5 = _mm_adds_epi16(v5, one);
|
||||
v6 = _mm_adds_epi16(v6, one);
|
||||
v7 = _mm_adds_epi16(v7, one);
|
||||
|
||||
v0 = _mm_srai_epi16(v0, 1);
|
||||
v1 = _mm_srai_epi16(v1, 1);
|
||||
v2 = _mm_srai_epi16(v2, 1);
|
||||
v3 = _mm_srai_epi16(v3, 1);
|
||||
v4 = _mm_srai_epi16(v4, 1);
|
||||
v5 = _mm_srai_epi16(v5, 1);
|
||||
v6 = _mm_srai_epi16(v6, 1);
|
||||
v7 = _mm_srai_epi16(v7, 1);
|
||||
|
||||
u0 = _mm_packus_epi16(v0, v1);
|
||||
u1 = _mm_packus_epi16(v2, v3);
|
||||
u2 = _mm_packus_epi16(v4, v5);
|
||||
u3 = _mm_packus_epi16(v6, v7);
|
||||
|
||||
u4 = _mm_srli_si128(u0, 8);
|
||||
u5 = _mm_srli_si128(u1, 8);
|
||||
u6 = _mm_srli_si128(u2, 8);
|
||||
u7 = _mm_srli_si128(u3, 8);
|
||||
|
||||
_mm_storel_epi64((__m128i*)dst, u0);
|
||||
_mm_storel_epi64((__m128i*)(dst + dst_stride * 1), u4);
|
||||
_mm_storel_epi64((__m128i*)(dst + dst_stride * 2), u1);
|
||||
_mm_storel_epi64((__m128i*)(dst + dst_stride * 3), u5);
|
||||
_mm_storel_epi64((__m128i*)(dst + dst_stride * 4), u2);
|
||||
_mm_storel_epi64((__m128i*)(dst + dst_stride * 5), u6);
|
||||
_mm_storel_epi64((__m128i*)(dst + dst_stride * 6), u3);
|
||||
_mm_storel_epi64((__m128i*)(dst + dst_stride * 7), u7);
|
||||
}
|
||||
|
||||
// 8x8 round-and-transpose writers: [0] overwrites the destination, [1]
// averages with the existing pixels.  NOTE(review): presumably indexed by
// the caller's `avg` flag -- confirm at the call site.
static transpose_to_dst_t trans8x8Tab[2] = {
|
||||
transpose8x8_direct_to_dst, transpose8x8_accumu_to_dst
|
||||
};
|
||||
|
||||
// Transposes eight 16-byte rows at 16-bit (pixel pair) granularity.
//
// out[k], for k = 0..5, receives the k-th 16-bit unit of in[0] .. in[7], in
// that order.  Only the first six units (12 bytes) of every row are used, so
// out[6] and out[7] are never written.
static INLINE void transpose_8x16(const __m128i *in, __m128i *out) {
  __m128i w[4], d0, d1;
  int i;

  // Units 0..3 of each row.
  for (i = 0; i < 4; ++i) {
    w[i] = _mm_unpacklo_epi16(in[2 * i], in[2 * i + 1]);
  }

  d0 = _mm_unpacklo_epi32(w[0], w[1]);
  d1 = _mm_unpacklo_epi32(w[2], w[3]);
  out[0] = _mm_unpacklo_epi64(d0, d1);
  out[1] = _mm_unpackhi_epi64(d0, d1);

  d0 = _mm_unpackhi_epi32(w[0], w[1]);
  d1 = _mm_unpackhi_epi32(w[2], w[3]);
  out[2] = _mm_unpacklo_epi64(d0, d1);
  out[3] = _mm_unpackhi_epi64(d0, d1);

  // Units 4 and 5 of each row.
  for (i = 0; i < 4; ++i) {
    w[i] = _mm_unpackhi_epi16(in[2 * i], in[2 * i + 1]);
  }

  d0 = _mm_unpacklo_epi32(w[0], w[1]);
  d1 = _mm_unpacklo_epi32(w[2], w[3]);
  out[4] = _mm_unpacklo_epi64(d0, d1);
  out[5] = _mm_unpackhi_epi64(d0, d1);
}
|
||||
|
||||
// Filters one output column position for 8 consecutive rows at once.
//
// src_ptr, src_pitch: first input pixel of the top row and the row pitch in
//                     bytes; 16 bytes are loaded from each of 8 rows (one
//                     byte earlier when tapsNum is 10).
// f:                  six coefficient vectors, one per pair of adjacent
//                     taps.  NOTE(review): presumably loaded from the
//                     *_ver_signal_dir tables -- confirm against the caller.
// buf:                receives eight 16-bit sums, one per row.  The sums are
//                     not yet rounded or clamped.
static void filter_horiz_v8p_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
                                   __m128i *f, int tapsNum, uint16_t *buf) {
  const uint8_t *const in = (tapsNum == 10) ? src_ptr - 1 : src_ptr;
  __m128i rows[8], pairs[6], prod[6];
  __m128i lo, hi, acc;
  int i;

  for (i = 0; i < 8; ++i) {
    rows[i] = _mm_loadu_si128((const __m128i *)(in + src_pitch * i));
  }

  // Transpose so that each vector holds the same pixel pair of all 8 rows
  // instead of consecutive pixels of one row.
  transpose_8x16(rows, pairs);

  // Multiply each pixel pair by its two taps and add the two products.
  for (i = 0; i < 6; ++i) {
    prod[i] = _mm_maddubs_epi16(pairs[i], f[i]);
  }

  // Saturating accumulation.  The order (outer pairs first, then the two
  // middle pairs smaller one first) affects the result when an intermediate
  // sum saturates, so it must be kept as is.
  lo = _mm_min_epi16(prod[2], prod[3]);
  hi = _mm_max_epi16(prod[2], prod[3]);
  acc = _mm_adds_epi16(prod[0], prod[1]);
  acc = _mm_adds_epi16(acc, prod[5]);
  acc = _mm_adds_epi16(acc, prod[4]);
  acc = _mm_adds_epi16(acc, lo);
  acc = _mm_adds_epi16(acc, hi);

  _mm_storeu_si128((__m128i *)buf, acc);
}
|
||||
|
||||
// Vertical 4-pixel parallel
|
||||
static INLINE void transpose4x4_direct_to_dst(const uint16_t *src,
|
||||
int src_stride,
|
||||
uint8_t *dst,
|
||||
int dst_stride) {
|
||||
const __m128i k_256 = _mm_set1_epi16(1 << 8);
|
||||
__m128i v0, v1, v2, v3;
|
||||
|
||||
// TODO(luoyi): two loads, 8 elements per load (two bytes per element)
|
||||
__m128i u0 = _mm_loadl_epi64((__m128i const *)(src + 0 * src_stride));
|
||||
__m128i u1 = _mm_loadl_epi64((__m128i const *)(src + 1 * src_stride));
|
||||
__m128i u2 = _mm_loadl_epi64((__m128i const *)(src + 2 * src_stride));
|
||||
__m128i u3 = _mm_loadl_epi64((__m128i const *)(src + 3 * src_stride));
|
||||
|
||||
v0 = _mm_unpacklo_epi16(u0, u1);
|
||||
v1 = _mm_unpacklo_epi16(u2, u3);
|
||||
|
||||
v2 = _mm_unpacklo_epi32(v0, v1);
|
||||
v3 = _mm_unpackhi_epi32(v0, v1);
|
||||
|
||||
u0 = _mm_mulhrs_epi16(v2, k_256);
|
||||
u1 = _mm_mulhrs_epi16(v3, k_256);
|
||||
|
||||
u0 = _mm_packus_epi16(u0, u1);
|
||||
u1 = _mm_srli_si128(u0, 4);
|
||||
u2 = _mm_srli_si128(u0, 8);
|
||||
u3 = _mm_srli_si128(u0, 12);
|
||||
|
||||
*(int *)(dst) = _mm_cvtsi128_si32(u0);
|
||||
*(int *)(dst + dst_stride) = _mm_cvtsi128_si32(u1);
|
||||
*(int *)(dst + dst_stride * 2) = _mm_cvtsi128_si32(u2);
|
||||
*(int *)(dst + dst_stride * 3) = _mm_cvtsi128_si32(u3);
|
||||
}
|
||||
|
||||
// Rounds a 4x4 tile of 16-bit filter sums to 8-bit pixels, transposes it, and
// averages it with the pixels already in dst: dst = (dst + pixel + 1) >> 1.
// Element c of source row r is combined with dst row c, column r.
//   src        - first of four rows, four 16-bit values are read from each
//   src_stride - distance between source rows, in uint16_t elements
//   dst        - top-left pixel of the 4x4 destination tile (read and written)
//   dst_stride - distance between destination rows, in bytes
static INLINE void transpose4x4_accumu_to_dst(const uint16_t *src,
                                              int src_stride,
                                              uint8_t *dst,
                                              int dst_stride) {
  // _mm_mulhrs_epi16(x, 1 << 8) is a rounding right shift of x by 7 bits.
  const __m128i k_256 = _mm_set1_epi16(1 << 8);
  const __m128i zero = _mm_setzero_si128();
  const __m128i one = _mm_set1_epi16(1);

  __m128i v0, v1, v2, v3;

  __m128i u0 = _mm_loadl_epi64((__m128i const *)(src));
  __m128i u1 = _mm_loadl_epi64((__m128i const *)(src + src_stride));
  __m128i u2 = _mm_loadl_epi64((__m128i const *)(src + 2 * src_stride));
  __m128i u3 = _mm_loadl_epi64((__m128i const *)(src + 3 * src_stride));

  // 4x4 transpose of the 16-bit values.
  v0 = _mm_unpacklo_epi16(u0, u1);
  v1 = _mm_unpacklo_epi16(u2, u3);

  v2 = _mm_unpacklo_epi32(v0, v1);
  v3 = _mm_unpackhi_epi32(v0, v1);

  u0 = _mm_mulhrs_epi16(v2, k_256);
  u1 = _mm_mulhrs_epi16(v3, k_256);

  // Saturate to the 8-bit pixel range, then widen back to 16 bits so the
  // average below cannot overflow.
  u2 = _mm_packus_epi16(u0, u1);
  u0 = _mm_unpacklo_epi8(u2, zero);
  u1 = _mm_unpackhi_epi8(u2, zero);

  // load pixel values
  // Only the four pixels of each row that belong to this tile are read, with
  // the same 32-bit access the stores at the end use. A 64-bit load would
  // touch four bytes past the tile whose values are never used.
  v0 = _mm_cvtsi32_si128(*(const int *)(dst));
  v1 = _mm_cvtsi32_si128(*(const int *)(dst + dst_stride));
  v2 = _mm_cvtsi32_si128(*(const int *)(dst + 2 * dst_stride));
  v3 = _mm_cvtsi32_si128(*(const int *)(dst + 3 * dst_stride));

  v0 = _mm_unpacklo_epi8(v0, zero);
  v1 = _mm_unpacklo_epi8(v1, zero);
  v2 = _mm_unpacklo_epi8(v2, zero);
  v3 = _mm_unpacklo_epi8(v3, zero);

  // Rows 0/1 share one register and rows 2/3 the other, matching u0 / u1.
  v0 = _mm_unpacklo_epi64(v0, v1);
  v1 = _mm_unpacklo_epi64(v2, v3);

  // Rounded average of the new and the existing pixels.
  u0 = _mm_adds_epi16(u0, v0);
  u1 = _mm_adds_epi16(u1, v1);

  u0 = _mm_adds_epi16(u0, one);
  u1 = _mm_adds_epi16(u1, one);

  u0 = _mm_srai_epi16(u0, 1);
  u1 = _mm_srai_epi16(u1, 1);

  // saturation and pack to pixels
  u0 = _mm_packus_epi16(u0, u1);
  u1 = _mm_srli_si128(u0, 4);
  u2 = _mm_srli_si128(u0, 8);
  u3 = _mm_srli_si128(u0, 12);

  // Four pixels (one 32-bit word) per destination row.
  *(int *)(dst) = _mm_cvtsi128_si32(u0);
  *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(u1);
  *(int *)(dst + dst_stride * 2) = _mm_cvtsi128_si32(u2);
  *(int *)(dst + dst_stride * 3) = _mm_cvtsi128_si32(u3);
}
|
||||
|
||||
static transpose_to_dst_t trans4x4Tab[2] = {
|
||||
transpose4x4_direct_to_dst, transpose4x4_accumu_to_dst
|
||||
};
|
||||
|
||||
// Computes one horizontally filtered output sample for each of four
// consecutive rows and stores the four 16-bit sums to buf[0..3] (row order).
//   src_ptr   - first input pixel of the top row
//   src_pitch - distance between input rows, in bytes
//   f         - six coefficient vectors, one per pair of adjacent taps
//   tapsNum   - filter length; for 10 the window starts one pixel earlier
//   buf       - receives the four unrounded sums
static void filter_horiz_v4p_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
                                   __m128i *f, int tapsNum, uint16_t *buf) {
  const uint8_t *const first = (tapsNum == 10) ? src_ptr - 1 : src_ptr;

  const __m128i row0 = _mm_loadu_si128((const __m128i *)first);
  const __m128i row1 = _mm_loadu_si128((const __m128i *)(first + src_pitch));
  const __m128i row2 =
      _mm_loadu_si128((const __m128i *)(first + src_pitch * 2));
  const __m128i row3 =
      _mm_loadu_si128((const __m128i *)(first + src_pitch * 3));

  // TRANSPOSE...
  // Each vector holds column pixel pairs of the four rows instead of a row
  // (digits below are <row><column>).
  // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17
  const __m128i lo01 = _mm_unpacklo_epi16(row0, row1);
  // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37
  const __m128i lo23 = _mm_unpacklo_epi16(row2, row3);
  // Same layout for columns 8..15.
  const __m128i hi01 = _mm_unpackhi_epi16(row0, row1);
  const __m128i hi23 = _mm_unpackhi_epi16(row2, row3);

  // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33
  const __m128i px01 = _mm_unpacklo_epi32(lo01, lo23);
  // 02 03 12 13 22 23 32 33
  const __m128i px23 = _mm_srli_si128(px01, 8);
  // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37
  const __m128i px45 = _mm_unpackhi_epi32(lo01, lo23);
  // 06 07 16 17 26 27 36 37
  const __m128i px67 = _mm_srli_si128(px45, 8);
  // Columns 8/9 and 10/11 of the four rows.
  const __m128i px89 = _mm_unpacklo_epi32(hi01, hi23);
  const __m128i pxab = _mm_srli_si128(px89, 8);

  // multiply 2 adjacent elements with the filter and add the result
  const __m128i sum01 = _mm_maddubs_epi16(px01, f[0]);
  const __m128i sum23 = _mm_maddubs_epi16(px23, f[1]);
  const __m128i sum45 = _mm_maddubs_epi16(px45, f[2]);
  const __m128i sum67 = _mm_maddubs_epi16(px67, f[3]);
  const __m128i sum89 = _mm_maddubs_epi16(px89, f[4]);
  const __m128i sumab = _mm_maddubs_epi16(pxab, f[5]);

  // add and saturate the results together
  // The two middle pair-sums go in last, smaller one first; the order of the
  // saturating adds is significant and must be kept.
  const __m128i mid_lo = _mm_min_epi16(sum45, sum67);
  const __m128i mid_hi = _mm_max_epi16(sum45, sum67);
  __m128i acc = _mm_adds_epi16(sum01, sum23);

  acc = _mm_adds_epi16(acc, sumab);
  acc = _mm_adds_epi16(acc, sum89);
  acc = _mm_adds_epi16(acc, mid_lo);
  acc = _mm_adds_epi16(acc, mid_hi);

  // Only the low four lanes (one per row) carry results.
  _mm_storel_epi64((__m128i *)buf, acc);
}
|
||||
|
||||
// Note:
|
||||
// This function assumes:
|
||||
// (1) 10/12-taps filters
|
||||
// (2) x_step_q4 = 16 then filter is fixed at the call
|
||||
|
||||
void vp10_convolve_horiz_ssse3(const uint8_t *src, int src_stride, uint8_t *dst,
|
||||
int dst_stride, int w, int h,
|
||||
const InterpFilterParams filter_params,
|
||||
const int subpel_x_q4, int x_step_q4, int avg) {
|
||||
DECLARE_ALIGNED(16, uint16_t, temp[8 * 8]);
|
||||
__m128i verf[6];
|
||||
__m128i horf[2];
|
||||
SubpelFilterCoeffs hCoeffs, vCoeffs;
|
||||
const uint8_t *src_ptr;
|
||||
store_pixel_t store4p = store4pixelTab[avg];
|
||||
transpose_to_dst_t transpose_4x4 = trans4x4Tab[avg];
|
||||
transpose_to_dst_t transpose_8x8 = trans8x8Tab[avg];
|
||||
|
||||
const int tapsNum = filter_params.taps;
|
||||
int block_height, block_residu;
|
||||
int i, col, count;
|
||||
(void)x_step_q4;
|
||||
|
||||
if (0 == subpel_x_q4 || 16 != x_step_q4) {
|
||||
vp10_convolve_horiz_c(src, src_stride, dst, dst_stride, w, h, filter_params,
|
||||
subpel_x_q4, x_step_q4, avg);
|
||||
return;
|
||||
}
|
||||
|
||||
hCoeffs = vp10_get_subpel_filter_signal_dir(
|
||||
filter_params, subpel_x_q4 - 1);
|
||||
vCoeffs = vp10_get_subpel_filter_ver_signal_dir(
|
||||
filter_params, subpel_x_q4 - 1);
|
||||
|
||||
if (!hCoeffs || !vCoeffs) {
|
||||
vp10_convolve_horiz_c(src, src_stride, dst, dst_stride, w, h, filter_params,
|
||||
subpel_x_q4, x_step_q4, avg);
|
||||
return;
|
||||
}
|
||||
|
||||
verf[0] = *((const __m128i *)(vCoeffs));
|
||||
verf[1] = *((const __m128i *)(vCoeffs + 1));
|
||||
verf[2] = *((const __m128i *)(vCoeffs + 2));
|
||||
verf[3] = *((const __m128i *)(vCoeffs + 3));
|
||||
verf[4] = *((const __m128i *)(vCoeffs + 4));
|
||||
verf[5] = *((const __m128i *)(vCoeffs + 5));
|
||||
|
||||
horf[0] = *((const __m128i *)(hCoeffs));
|
||||
horf[1] = *((const __m128i *)(hCoeffs + 1));
|
||||
|
||||
count = 0;
|
||||
|
||||
// here tapsNum is filter size
|
||||
src -= (tapsNum >> 1) - 1;
|
||||
src_ptr = src;
|
||||
if (w > WIDTH_BOUND && h > HEIGHT_BOUND) {
|
||||
// 8-pixels parallel
|
||||
block_height = h >> 3;
|
||||
block_residu = h & 7;
|
||||
|
||||
do {
|
||||
for (col = 0; col < w; col += 8) {
|
||||
for (i = 0; i < 8; ++i) {
|
||||
filter_horiz_v8p_ssse3(src_ptr, src_stride, verf, tapsNum,
|
||||
temp + (i * 8));
|
||||
src_ptr += 1;
|
||||
}
|
||||
transpose_8x8(temp, 8, dst + col, dst_stride);
|
||||
}
|
||||
count++;
|
||||
src_ptr = src + count * src_stride * 8;
|
||||
dst += dst_stride * 8;
|
||||
} while (count < block_height);
|
||||
|
||||
for (i = 0; i < block_residu; ++i) {
|
||||
filter_horiz_ssse3(src_ptr, horf, tapsNum, w, store4p, dst, dst);
|
||||
src_ptr += src_stride;
|
||||
dst += dst_stride;
|
||||
}
|
||||
} else {
|
||||
// 4-pixels parallel
|
||||
block_height = h >> 2;
|
||||
block_residu = h & 3;
|
||||
|
||||
do {
|
||||
for (col = 0; col < w; col += 4) {
|
||||
for (i = 0; i < 4; ++i) {
|
||||
filter_horiz_v4p_ssse3(src_ptr, src_stride, verf, tapsNum,
|
||||
temp + (i * 4));
|
||||
src_ptr += 1;
|
||||
}
|
||||
transpose_4x4(temp, 4, dst + col, dst_stride);
|
||||
}
|
||||
count++;
|
||||
src_ptr = src + count * src_stride * 4;
|
||||
dst += dst_stride * 4;
|
||||
} while (count < block_height);
|
||||
|
||||
for (i = 0; i < block_residu; ++i) {
|
||||
filter_horiz_ssse3(src_ptr, horf, tapsNum, w, store4p, dst, dst);
|
||||
src_ptr += src_stride;
|
||||
dst += dst_stride;
|
||||
}
|
||||
}
|
||||
}
|
@@ -72,6 +72,8 @@ VP10_COMMON_SRCS-yes += common/vp10_fwd_txfm2d.c
|
||||
VP10_COMMON_SRCS-yes += common/vp10_fwd_txfm2d_cfg.h
|
||||
VP10_COMMON_SRCS-yes += common/vp10_inv_txfm2d.c
|
||||
VP10_COMMON_SRCS-yes += common/vp10_inv_txfm2d_cfg.h
|
||||
VP10_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp10_convolve_ssse3.c
|
||||
VP10_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp10_convolve_filters_ssse3.c
|
||||
VP10_COMMON_SRCS-yes += common/vp10_convolve.c
|
||||
VP10_COMMON_SRCS-yes += common/vp10_convolve.h
|
||||
VP10_COMMON_SRCS-$(CONFIG_ANS) += common/ans.h
|
||||
|
Reference in New Issue
Block a user