HBD convolution filtering (10/12 taps) SSE4.1 optimization
- For experiment EXT_INTERP under high bit depth. - Add unit test to verify bit-exactness. - Speed performance improvement: On Xeon E5-2680, park_joy_1080p_12.y4m, 50 frames, encoding time drops from 6682503 ms to 5390270 ms. Change-Id: Iea4debf5414f3accf1eb5672abeab56a0539ac77
This commit is contained in:
@@ -24,12 +24,25 @@ using libvpx_test::ACMRandom;
|
||||
typedef void (*conv_filter_t)(const uint8_t*, int, uint8_t*, int,
|
||||
int, int, const InterpFilterParams,
|
||||
const int, int, int);
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
typedef void (*hbd_conv_filter_t)(const uint16_t*, int, uint16_t*, int,
|
||||
int, int, const InterpFilterParams,
|
||||
const int, int, int, int);
|
||||
#endif
|
||||
|
||||
// Test parameter list:
|
||||
// <convolve_horiz_func, convolve_vert_func,
|
||||
// <width, height>, filter_params, subpel_x_q4, avg>
|
||||
typedef tuple<int, int> BlockDimension;
|
||||
typedef tuple<conv_filter_t, conv_filter_t, BlockDimension, INTERP_FILTER,
|
||||
int, int> ConvParams;
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
// Test parameter list:
|
||||
// <convolve_horiz_func, convolve_vert_func,
|
||||
// <width, height>, filter_params, subpel_x_q4, avg, bit_depth>
|
||||
typedef tuple<hbd_conv_filter_t, hbd_conv_filter_t, BlockDimension,
|
||||
INTERP_FILTER, int, int, int> HbdConvParams;
|
||||
#endif
|
||||
|
||||
// Note:
|
||||
// src_ and src_ref_ have special boundary requirement
|
||||
@@ -75,11 +88,8 @@ class VP10ConvolveOptimzTest : public ::testing::TestWithParam<ConvParams> {
|
||||
void RunVertFilterBitExactCheck();
|
||||
|
||||
private:
|
||||
void PrepFilterBuffer(uint8_t *src, uint8_t *src_ref,
|
||||
uint8_t *dst, uint8_t *dst_ref,
|
||||
int w, int h);
|
||||
void DiffFilterBuffer(const uint8_t *buf, const uint8_t *buf_ref,
|
||||
int w, int h, int fgroup, int findex);
|
||||
void PrepFilterBuffer(int w, int h);
|
||||
void DiffFilterBuffer();
|
||||
conv_filter_t conv_horiz_;
|
||||
conv_filter_t conv_vert_;
|
||||
uint8_t *alloc_;
|
||||
@@ -94,18 +104,16 @@ class VP10ConvolveOptimzTest : public ::testing::TestWithParam<ConvParams> {
|
||||
int avg_;
|
||||
};
|
||||
|
||||
void VP10ConvolveOptimzTest::PrepFilterBuffer(uint8_t *src, uint8_t *src_ref,
|
||||
uint8_t *dst, uint8_t *dst_ref,
|
||||
int w, int h) {
|
||||
void VP10ConvolveOptimzTest::PrepFilterBuffer(int w, int h) {
|
||||
int r, c;
|
||||
ACMRandom rnd(ACMRandom::DeterministicSeed());
|
||||
|
||||
memset(alloc_, 0, 4 * maxBlockSize * sizeof(alloc_[0]));
|
||||
|
||||
uint8_t *src_ptr = src;
|
||||
uint8_t *dst_ptr = dst;
|
||||
uint8_t *src_ref_ptr = src_ref;
|
||||
uint8_t *dst_ref_ptr = dst_ref;
|
||||
uint8_t *src_ptr = src_;
|
||||
uint8_t *dst_ptr = dst_;
|
||||
uint8_t *src_ref_ptr = src_ref_;
|
||||
uint8_t *dst_ref_ptr = dst_ref_;
|
||||
|
||||
for (r = 0; r < height_; ++r) {
|
||||
for (c = 0; c < width_; ++c) {
|
||||
@@ -121,21 +129,17 @@ void VP10ConvolveOptimzTest::PrepFilterBuffer(uint8_t *src, uint8_t *src_ref,
|
||||
}
|
||||
}
|
||||
|
||||
void VP10ConvolveOptimzTest::DiffFilterBuffer(const uint8_t *buf,
|
||||
const uint8_t *buf_ref,
|
||||
int w, int h,
|
||||
int filter_group,
|
||||
int filter_index) {
|
||||
void VP10ConvolveOptimzTest::DiffFilterBuffer() {
|
||||
int r, c;
|
||||
const uint8_t *dst_ptr = buf;
|
||||
const uint8_t *dst_ref_ptr = buf_ref;
|
||||
for (r = 0; r < h; ++r) {
|
||||
for (c = 0; c < w; ++c) {
|
||||
const uint8_t *dst_ptr = dst_;
|
||||
const uint8_t *dst_ref_ptr = dst_ref_;
|
||||
for (r = 0; r < height_; ++r) {
|
||||
for (c = 0; c < width_; ++c) {
|
||||
EXPECT_EQ((uint8_t)dst_ref_ptr[c], (uint8_t)dst_ptr[c])
|
||||
<< "Error at row: " << r << " col: " << c << " "
|
||||
<< "w = " << w << " " << "h = " << h << " "
|
||||
<< "filter group index = " << filter_group << " "
|
||||
<< "filter index = " << filter_index;
|
||||
<< "w = " << width_ << " " << "h = " << height_ << " "
|
||||
<< "filter group index = " << filter_ << " "
|
||||
<< "filter index = " << subpel_;
|
||||
}
|
||||
dst_ptr += stride;
|
||||
dst_ref_ptr += stride;
|
||||
@@ -143,7 +147,7 @@ void VP10ConvolveOptimzTest::DiffFilterBuffer(const uint8_t *buf,
|
||||
}
|
||||
|
||||
void VP10ConvolveOptimzTest::RunHorizFilterBitExactCheck() {
|
||||
PrepFilterBuffer(src_, src_ref_, dst_, dst_ref_, testMaxBlk, testMaxBlk);
|
||||
PrepFilterBuffer(testMaxBlk, testMaxBlk);
|
||||
|
||||
InterpFilterParams filter_params = vp10_get_interp_filter_params(filter_);
|
||||
|
||||
@@ -153,14 +157,14 @@ void VP10ConvolveOptimzTest::RunHorizFilterBitExactCheck() {
|
||||
conv_horiz_(src_, stride, dst_, stride, width_, height_,
|
||||
filter_params, subpel_, x_step_q4, avg_);
|
||||
|
||||
DiffFilterBuffer(dst_, dst_ref_, width_, height_, filter_, subpel_);
|
||||
DiffFilterBuffer();
|
||||
|
||||
// Note:
|
||||
// Here we need to calculate a height which is different from the specified one
|
||||
// and test again.
|
||||
int intermediate_height =
|
||||
(((height_ - 1) * 16 + subpel_) >> SUBPEL_BITS) + filter_params.taps;
|
||||
PrepFilterBuffer(src_, src_ref_, dst_, dst_ref_, testMaxBlk, testMaxBlk);
|
||||
PrepFilterBuffer(testMaxBlk, testMaxBlk);
|
||||
|
||||
vp10_convolve_horiz_c(src_ref_, stride, dst_ref_, stride, width_,
|
||||
intermediate_height, filter_params, subpel_, x_step_q4,
|
||||
@@ -170,12 +174,11 @@ void VP10ConvolveOptimzTest::RunHorizFilterBitExactCheck() {
|
||||
intermediate_height, filter_params, subpel_, x_step_q4,
|
||||
avg_);
|
||||
|
||||
DiffFilterBuffer(dst_, dst_ref_, width_, intermediate_height, filter_,
|
||||
subpel_);
|
||||
DiffFilterBuffer();
|
||||
}
|
||||
|
||||
void VP10ConvolveOptimzTest::RunVertFilterBitExactCheck() {
|
||||
PrepFilterBuffer(src_, src_ref_, dst_, dst_ref_, testMaxBlk, testMaxBlk);
|
||||
PrepFilterBuffer(testMaxBlk, testMaxBlk);
|
||||
|
||||
InterpFilterParams filter_params = vp10_get_interp_filter_params(filter_);
|
||||
|
||||
@@ -185,7 +188,7 @@ void VP10ConvolveOptimzTest::RunVertFilterBitExactCheck() {
|
||||
conv_vert_(src_, stride, dst_, stride, width_, height_,
|
||||
filter_params, subpel_, x_step_q4, avg_);
|
||||
|
||||
DiffFilterBuffer(dst_, dst_ref_, width_, height_, filter_, subpel_);
|
||||
DiffFilterBuffer();
|
||||
}
|
||||
|
||||
TEST_P(VP10ConvolveOptimzTest, HorizBitExactCheck) {
|
||||
@@ -197,7 +200,7 @@ TEST_P(VP10ConvolveOptimzTest, VerticalBitExactCheck) {
|
||||
|
||||
using std::tr1::make_tuple;
|
||||
|
||||
#if HAVE_SSSE3 && CONFIG_EXT_INTERP
|
||||
#if (HAVE_SSSE3 || HAVE_SSE4_1) && CONFIG_EXT_INTERP
|
||||
const BlockDimension kBlockDim[] = {
|
||||
make_tuple(2, 2),
|
||||
make_tuple(2, 4),
|
||||
@@ -225,7 +228,9 @@ const INTERP_FILTER kFilter[] = {6, 4, 2};
|
||||
const int kSubpelQ4[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
|
||||
|
||||
const int kAvg[] = {0, 1};
|
||||
#endif
|
||||
|
||||
#if HAVE_SSSE3 && CONFIG_EXT_INTERP
|
||||
INSTANTIATE_TEST_CASE_P(
|
||||
SSSE3, VP10ConvolveOptimzTest,
|
||||
::testing::Combine(
|
||||
@@ -236,4 +241,167 @@ INSTANTIATE_TEST_CASE_P(
|
||||
::testing::ValuesIn(kSubpelQ4),
|
||||
::testing::ValuesIn(kAvg)));
|
||||
#endif // HAVE_SSSE3 && CONFIG_EXT_INTERP
|
||||
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
typedef ::testing::TestWithParam<HbdConvParams> TestWithHbdConvParams;
|
||||
// Value-parameterized fixture for the high-bit-depth (uint16_t) convolve
// functions. Each instance holds one optimized horizontal and one optimized
// vertical function plus a block size, filter, sub-pixel position, averaging
// flag and bit depth; the Run*BitExactCheck() methods compare the optimized
// output against the C reference sample by sample.
class VP10HbdConvolveOptimzTest : public TestWithHbdConvParams {
|
||||
public:
|
||||
virtual ~VP10HbdConvolveOptimzTest() {}
|
||||
// Unpacks the test parameters (same order as the HbdConvParams tuple) and
// carves the four working buffers out of a single allocation.
virtual void SetUp() {
|
||||
conv_horiz_ = GET_PARAM(0);
|
||||
conv_vert_ = GET_PARAM(1);
|
||||
BlockDimension block = GET_PARAM(2);
|
||||
width_ = std::tr1::get<0>(block);
|
||||
height_ = std::tr1::get<1>(block);
|
||||
filter_ = GET_PARAM(3);
|
||||
subpel_ = GET_PARAM(4);
|
||||
avg_ = GET_PARAM(5);
|
||||
bit_depth_ = GET_PARAM(6);
|
||||
|
||||
// One allocation of 4 * maxBlockSize samples. src_ starts vertiOffset rows
// (of maxWidth samples) plus horizOffset samples into it.
// NOTE(review): presumably the offsets leave room for the filter taps that
// reach before the block origin -- confirm against the constants' definitions.
alloc_ = new uint16_t[maxBlockSize * 4];
|
||||
src_ = alloc_ + (vertiOffset * maxWidth);
|
||||
src_ += horizOffset;
|
||||
// NOTE(review): src_ref_ is initialized here but none of the high-bit-depth
// methods shown in this file read it; both the reference and the optimized
// call take src_ as input.
src_ref_ = src_ + maxBlockSize;
|
||||
|
||||
// Destination buffers for the optimized (dst_) and reference (dst_ref_) runs.
dst_ = alloc_ + 2 * maxBlockSize;
|
||||
dst_ref_ = alloc_ + 3 * maxBlockSize;
|
||||
}
|
||||
|
||||
// Releases the buffer allocated in SetUp().
virtual void TearDown() {
|
||||
delete[] alloc_;
|
||||
libvpx_test::ClearSystemState();
|
||||
}
|
||||
|
||||
protected:
|
||||
// Compare conv_horiz_ / conv_vert_ against the corresponding C function.
void RunHorizFilterBitExactCheck();
|
||||
void RunVertFilterBitExactCheck();
|
||||
|
||||
private:
|
||||
// Zeroes the whole allocation and refills src_/dst_/dst_ref_ with test data.
void PrepFilterBuffer(int w, int h);
|
||||
// Expects dst_ and dst_ref_ to match over the width_ x height_ block.
void DiffFilterBuffer();
|
||||
// Optimized functions under test.
hbd_conv_filter_t conv_horiz_;
|
||||
hbd_conv_filter_t conv_vert_;
|
||||
// Backing allocation; the four pointers below all point into it.
uint16_t *alloc_;
|
||||
uint16_t *src_;
|
||||
uint16_t *dst_;
|
||||
uint16_t *src_ref_;
|
||||
uint16_t *dst_ref_;
|
||||
// Test parameters captured in SetUp().
int width_;
|
||||
int height_;
|
||||
int filter_;
|
||||
int subpel_;
|
||||
int avg_;
|
||||
int bit_depth_;
|
||||
};
|
||||
|
||||
// Resets the test buffers: zeroes the whole allocation, then fills the
// width_ x height_ block of src_ and dst_ with pseudo-random samples masked to
// bit_depth_ bits and copies dst_ into dst_ref_ so both outputs start equal.
// The random generator uses a deterministic seed, so every call produces the
// same data.
// NOTE(review): the w and h parameters are never used; the loops below are
// bounded by width_ and height_ instead. Callers pass testMaxBlk for both, so
// either the parameters are redundant or the fill region is smaller than
// intended -- confirm which.
void VP10HbdConvolveOptimzTest::PrepFilterBuffer(int w, int h) {
|
||||
int r, c;
|
||||
ACMRandom rnd(ACMRandom::DeterministicSeed());
|
||||
|
||||
memset(alloc_, 0, 4 * maxBlockSize * sizeof(alloc_[0]));
|
||||
|
||||
uint16_t *src_ptr = src_;
|
||||
uint16_t *dst_ptr = dst_;
|
||||
uint16_t *dst_ref_ptr = dst_ref_;
|
||||
// Keeps only the low bit_depth_ bits of each 16-bit random value.
uint16_t hbd_mask = (1 << bit_depth_) - 1;
|
||||
|
||||
for (r = 0; r < height_; ++r) {
|
||||
for (c = 0; c < width_; ++c) {
|
||||
src_ptr[c] = rnd.Rand16() & hbd_mask;
|
||||
// dst is pre-filled too (presumably so the avg path has existing data to
// blend with -- TODO confirm), and mirrored into the reference buffer.
dst_ptr[c] = rnd.Rand16() & hbd_mask;
|
||||
dst_ref_ptr[c] = dst_ptr[c];
|
||||
}
|
||||
src_ptr += stride;
|
||||
dst_ptr += stride;
|
||||
dst_ref_ptr += stride;
|
||||
}
|
||||
}
|
||||
|
||||
// Compares dst_ (optimized output) against dst_ref_ (reference output) sample
// by sample over the width_ x height_ block, reporting the position and the
// test parameters on any mismatch. EXPECT_EQ is non-fatal, so every differing
// sample is reported.
void VP10HbdConvolveOptimzTest::DiffFilterBuffer() {
|
||||
int r, c;
|
||||
const uint16_t *dst_ptr = dst_;
|
||||
const uint16_t *dst_ref_ptr = dst_ref_;
|
||||
for (r = 0; r < height_; ++r) {
|
||||
for (c = 0; c < width_; ++c) {
|
||||
EXPECT_EQ((uint16_t)dst_ref_ptr[c], (uint16_t)dst_ptr[c])
|
||||
<< "Error at row: " << r << " col: " << c << " "
|
||||
<< "w = " << width_ << " " << "h = " << height_ << " "
|
||||
<< "filter group index = " << filter_ << " "
|
||||
<< "filter index = " << subpel_ << " "
|
||||
<< "bit depth = " << bit_depth_;
|
||||
}
|
||||
// Advance both buffers to the next row.
dst_ptr += stride;
|
||||
dst_ref_ptr += stride;
|
||||
}
|
||||
}
|
||||
|
||||
// Runs the horizontal filter twice -- once with the block height and once
// with a taller "intermediate" height -- and each time expects the optimized
// function (conv_horiz_) to match vp10_highbd_convolve_horiz_c.
void VP10HbdConvolveOptimzTest::RunHorizFilterBitExactCheck() {
|
||||
PrepFilterBuffer(testMaxBlk, testMaxBlk);
|
||||
|
||||
InterpFilterParams filter_params = vp10_get_interp_filter_params(filter_);
|
||||
|
||||
// Reference result goes to dst_ref_, optimized result to dst_.
vp10_highbd_convolve_horiz_c(src_, stride, dst_ref_, stride, width_,
|
||||
height_, filter_params, subpel_, x_step_q4,
|
||||
avg_, bit_depth_);
|
||||
|
||||
conv_horiz_(src_, stride, dst_, stride, width_, height_,
|
||||
filter_params, subpel_, x_step_q4, avg_, bit_depth_);
|
||||
|
||||
DiffFilterBuffer();
|
||||
|
||||
// Note:
|
||||
// Here we need to calculate a height which is different from the specified one
|
||||
// and test again.
|
||||
// NOTE(review): DiffFilterBuffer() and PrepFilterBuffer() are both bounded by
// height_, so in this second pass only the first height_ rows hold random
// input and only those rows are compared, although intermediate_height rows
// are filtered -- confirm whether the extra rows should be covered.
int intermediate_height =
|
||||
(((height_ - 1) * 16 + subpel_) >> SUBPEL_BITS) + filter_params.taps;
|
||||
PrepFilterBuffer(testMaxBlk, testMaxBlk);
|
||||
|
||||
vp10_highbd_convolve_horiz_c(src_, stride, dst_ref_, stride, width_,
|
||||
intermediate_height, filter_params, subpel_,
|
||||
x_step_q4, avg_, bit_depth_);
|
||||
|
||||
conv_horiz_(src_, stride, dst_, stride, width_, intermediate_height,
|
||||
filter_params, subpel_, x_step_q4, avg_, bit_depth_);
|
||||
|
||||
DiffFilterBuffer();
|
||||
}
|
||||
|
||||
// Runs the vertical filter once with the block dimensions and expects the
// optimized function (conv_vert_) to match vp10_highbd_convolve_vert_c.
void VP10HbdConvolveOptimzTest::RunVertFilterBitExactCheck() {
|
||||
PrepFilterBuffer(testMaxBlk, testMaxBlk);
|
||||
|
||||
InterpFilterParams filter_params = vp10_get_interp_filter_params(filter_);
|
||||
|
||||
// Reference result goes to dst_ref_, optimized result to dst_. The same
// x_step_q4 constant used by the horizontal test is passed as the step here.
vp10_highbd_convolve_vert_c(src_, stride, dst_ref_, stride, width_, height_,
|
||||
filter_params, subpel_, x_step_q4, avg_,
|
||||
bit_depth_);
|
||||
|
||||
conv_vert_(src_, stride, dst_, stride, width_, height_,
|
||||
filter_params, subpel_, x_step_q4, avg_, bit_depth_);
|
||||
|
||||
DiffFilterBuffer();
|
||||
}
|
||||
|
||||
// Horizontal high-bit-depth filter: optimized output must equal the C output.
TEST_P(VP10HbdConvolveOptimzTest, HorizBitExactCheck) {
|
||||
RunHorizFilterBitExactCheck();
|
||||
}
|
||||
// Vertical high-bit-depth filter: optimized output must equal the C output.
TEST_P(VP10HbdConvolveOptimzTest, VertBitExactCheck) {
|
||||
RunVertFilterBitExactCheck();
|
||||
}
|
||||
|
||||
// Instantiates the high-bit-depth tests for the SSE4.1 functions over every
// combination of block dimension, filter, sub-pixel position, averaging flag
// and bit depth (10 and 12).
#if HAVE_SSE4_1 && CONFIG_EXT_INTERP
|
||||
|
||||
const int kBitdepth[] = {10, 12};
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(
|
||||
SSE4_1, VP10HbdConvolveOptimzTest,
|
||||
::testing::Combine(
|
||||
::testing::Values(vp10_highbd_convolve_horiz_sse4_1),
|
||||
::testing::Values(vp10_highbd_convolve_vert_sse4_1),
|
||||
::testing::ValuesIn(kBlockDim),
|
||||
::testing::ValuesIn(kFilter),
|
||||
::testing::ValuesIn(kSubpelQ4),
|
||||
::testing::ValuesIn(kAvg),
|
||||
::testing::ValuesIn(kBitdepth)));
|
||||
#endif // HAVE_SSE4_1 && CONFIG_EXT_INTERP
|
||||
#endif // CONFIG_VP9_HIGHBITDEPTH
|
||||
} // namespace
|
||||
|
@@ -342,3 +342,25 @@ SubpelFilterCoeffs vp10_get_subpel_filter_ver_signal_dir(
|
||||
(void)index;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
/* Returns the high-bit-depth vertical "signal direction" coefficient rows for
 * the filter described by p at sub-pixel position index, or NULL when p does
 * not match any filter that has such a table in this build.
 *
 * The filter is identified by comparing p.filter_ptr with the known filter
 * arrays. On a match, the result points at the first int16_t[8] row of
 * table[index].
 *
 * NOTE(review): index is used unchecked; the tables are declared with 15
 * entries, so callers must pass 0..14 -- confirm at the call sites.
 */
HbdSubpelFilterCoeffs vp10_hbd_get_subpel_filter_ver_signal_dir(
|
||||
const InterpFilterParams p, int index) {
|
||||
#if CONFIG_EXT_INTERP && HAVE_SSE4_1
|
||||
if (p.filter_ptr == (const int16_t *)sub_pel_filters_12sharp) {
|
||||
return &sub_pel_filters_12sharp_highbd_ver_signal_dir[index][0];
|
||||
}
|
||||
if (p.filter_ptr == (const int16_t *)sub_pel_filters_10sharp) {
|
||||
return &sub_pel_filters_10sharp_highbd_ver_signal_dir[index][0];
|
||||
}
|
||||
#endif
|
||||
#if USE_TEMPORALFILTER_12TAP && HAVE_SSE4_1
|
||||
if (p.filter_ptr == (const int16_t *)sub_pel_filters_temporalfilter_12) {
|
||||
return &sub_pel_filters_temporalfilter_12_highbd_ver_signal_dir[index][0];
|
||||
}
|
||||
#endif
|
||||
/* Keep the parameters referenced when both conditional sections above are
 * compiled out. */
(void)p;
|
||||
(void)index;
|
||||
return NULL;
|
||||
}
|
||||
#endif
|
||||
|
@@ -95,6 +95,10 @@ static INLINE int vp10_is_interpolating_filter(
|
||||
#if USE_TEMPORALFILTER_12TAP
|
||||
extern const int8_t sub_pel_filters_temporalfilter_12_signal_dir[15][2][16];
|
||||
extern const int8_t sub_pel_filters_temporalfilter_12_ver_signal_dir[15][6][16];
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
extern const
|
||||
int16_t sub_pel_filters_temporalfilter_12_highbd_ver_signal_dir[15][6][8];
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if CONFIG_EXT_INTERP
|
||||
@@ -102,15 +106,26 @@ extern const int8_t sub_pel_filters_12sharp_signal_dir[15][2][16];
|
||||
extern const int8_t sub_pel_filters_10sharp_signal_dir[15][2][16];
|
||||
extern const int8_t sub_pel_filters_12sharp_ver_signal_dir[15][6][16];
|
||||
extern const int8_t sub_pel_filters_10sharp_ver_signal_dir[15][6][16];
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
extern const int16_t sub_pel_filters_12sharp_highbd_ver_signal_dir[15][6][8];
|
||||
extern const int16_t sub_pel_filters_10sharp_highbd_ver_signal_dir[15][6][8];
|
||||
#endif
|
||||
#endif
|
||||
|
||||
typedef const int8_t (*SubpelFilterCoeffs)[16];
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
typedef const int16_t (*HbdSubpelFilterCoeffs)[8];
|
||||
#endif
|
||||
|
||||
SubpelFilterCoeffs vp10_get_subpel_filter_signal_dir(
|
||||
const InterpFilterParams p, int index);
|
||||
|
||||
SubpelFilterCoeffs vp10_get_subpel_filter_ver_signal_dir(
|
||||
const InterpFilterParams p, int index);
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
HbdSubpelFilterCoeffs vp10_hbd_get_subpel_filter_ver_signal_dir(
|
||||
const InterpFilterParams p, int index);
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
|
@@ -182,7 +182,7 @@ void vp10_convolve(const uint8_t *src, int src_stride, uint8_t *dst,
|
||||
}
|
||||
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
static void highbd_convolve_horiz(const uint16_t *src, int src_stride,
|
||||
void vp10_highbd_convolve_horiz_c(const uint16_t *src, int src_stride,
|
||||
uint16_t *dst, int dst_stride, int w, int h,
|
||||
const InterpFilterParams filter_params,
|
||||
const int subpel_x_q4, int x_step_q4, int avg,
|
||||
@@ -213,7 +213,7 @@ static void highbd_convolve_horiz(const uint16_t *src, int src_stride,
|
||||
}
|
||||
}
|
||||
|
||||
static void highbd_convolve_vert(const uint16_t *src, int src_stride,
|
||||
void vp10_highbd_convolve_vert_c(const uint16_t *src, int src_stride,
|
||||
uint16_t *dst, int dst_stride, int w, int h,
|
||||
const InterpFilterParams filter_params,
|
||||
const int subpel_y_q4, int y_step_q4, int avg,
|
||||
@@ -300,8 +300,9 @@ void vp10_highbd_convolve(const uint8_t *src8, int src_stride, uint8_t *dst8,
|
||||
InterpFilterParams filter_params =
|
||||
vp10_get_interp_filter_params(interp_filter);
|
||||
#endif
|
||||
highbd_convolve_horiz(src, src_stride, dst, dst_stride, w, h, filter_params,
|
||||
subpel_x_q4, x_step_q4, ref_idx, bd);
|
||||
vp10_highbd_convolve_horiz(src, src_stride, dst, dst_stride, w, h,
|
||||
filter_params, subpel_x_q4, x_step_q4, ref_idx,
|
||||
bd);
|
||||
} else if (ignore_horiz) {
|
||||
#if CONFIG_DUAL_FILTER
|
||||
InterpFilterParams filter_params =
|
||||
@@ -310,8 +311,9 @@ void vp10_highbd_convolve(const uint8_t *src8, int src_stride, uint8_t *dst8,
|
||||
InterpFilterParams filter_params =
|
||||
vp10_get_interp_filter_params(interp_filter);
|
||||
#endif
|
||||
highbd_convolve_vert(src, src_stride, dst, dst_stride, w, h, filter_params,
|
||||
subpel_y_q4, y_step_q4, ref_idx, bd);
|
||||
vp10_highbd_convolve_vert(src, src_stride, dst, dst_stride, w, h,
|
||||
filter_params, subpel_y_q4, y_step_q4, ref_idx,
|
||||
bd);
|
||||
} else {
|
||||
// temp's size is set to (maximum possible intermediate_height) *
|
||||
// MAX_BLOCK_WIDTH
|
||||
@@ -336,9 +338,10 @@ void vp10_highbd_convolve(const uint8_t *src8, int src_stride, uint8_t *dst8,
|
||||
int intermediate_height =
|
||||
(((h - 1) * y_step_q4 + subpel_y_q4) >> SUBPEL_BITS) + filter_size;
|
||||
|
||||
highbd_convolve_horiz(src - src_stride * (filter_size / 2 - 1), src_stride,
|
||||
temp, temp_stride, w, intermediate_height,
|
||||
filter_params, subpel_x_q4, x_step_q4, 0, bd);
|
||||
vp10_highbd_convolve_horiz(src - src_stride * (filter_size / 2 - 1),
|
||||
src_stride, temp, temp_stride, w,
|
||||
intermediate_height, filter_params, subpel_x_q4,
|
||||
x_step_q4, 0, bd);
|
||||
|
||||
#if CONFIG_DUAL_FILTER
|
||||
filter_params = filter_params_y;
|
||||
@@ -346,9 +349,9 @@ void vp10_highbd_convolve(const uint8_t *src8, int src_stride, uint8_t *dst8,
|
||||
filter_size = filter_params.taps;
|
||||
assert(filter_params.taps <= MAX_FILTER_TAP);
|
||||
|
||||
highbd_convolve_vert(temp + temp_stride * (filter_size / 2 - 1),
|
||||
temp_stride, dst, dst_stride, w, h, filter_params,
|
||||
subpel_y_q4, y_step_q4, ref_idx, bd);
|
||||
vp10_highbd_convolve_vert(temp + temp_stride * (filter_size / 2 - 1),
|
||||
temp_stride, dst, dst_stride, w, h, filter_params,
|
||||
subpel_y_q4, y_step_q4, ref_idx, bd);
|
||||
}
|
||||
}
|
||||
#endif // CONFIG_VP9_HIGHBITDEPTH
|
||||
|
@@ -93,6 +93,13 @@ specialize qw/vp10_convolve_horiz ssse3/;
|
||||
add_proto qw/void vp10_convolve_vert/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, int avg";
|
||||
specialize qw/vp10_convolve_vert ssse3/;
|
||||
|
||||
# High-bit-depth convolve prototypes, declared only when
# CONFIG_VP9_HIGHBITDEPTH is enabled. Both functions get an sse4_1
# specialization.
# NOTE(review): the vert prototype names its parameters subpel_x_q4/x_step_q4
# while the C definition uses subpel_y_q4/y_step_q4; the types match, so this
# is cosmetic only.
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
|
||||
add_proto qw/void vp10_highbd_convolve_horiz/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, int avg, int bd";
|
||||
specialize qw/vp10_highbd_convolve_horiz sse4_1/;
|
||||
add_proto qw/void vp10_highbd_convolve_vert/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, int avg, int bd";
|
||||
specialize qw/vp10_highbd_convolve_vert sse4_1/;
|
||||
}
|
||||
|
||||
#
|
||||
# dct
|
||||
#
|
||||
|
File diff suppressed because it is too large
Load Diff
393
vp10/common/x86/vp10_highbd_convolve_filters_sse4.c
Normal file
393
vp10/common/x86/vp10_highbd_convolve_filters_sse4.c
Normal file
@@ -0,0 +1,393 @@
|
||||
/*
|
||||
* Copyright (c) 2016 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
#include "./vpx_config.h"
|
||||
#include "vp10/common/filter.h"
|
||||
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
#if CONFIG_EXT_INTERP
|
||||
DECLARE_ALIGNED(16, const int16_t,
|
||||
sub_pel_filters_10sharp_highbd_ver_signal_dir[15][6][8]) = {
|
||||
{
|
||||
{ 0, 0, 0, 0, 0, 0, 0, 0, },
|
||||
{ -1, 3, -1, 3, -1, 3, -1, 3, },
|
||||
{ -6, 127, -6, 127, -6, 127, -6, 127, },
|
||||
{ 8, -4, 8, -4, 8, -4, 8, -4, },
|
||||
{ 2, -1, 2, -1, 2, -1, 2, -1, },
|
||||
{ 0, 0, 0, 0, 0, 0, 0, 0, },
|
||||
},
|
||||
{
|
||||
{ 0, 1, 0, 1, 0, 1, 0, 1, },
|
||||
{ -2, 5, -2, 5, -2, 5, -2, 5, },
|
||||
{-12, 124, -12, 124, -12, 124, -12, 124, },
|
||||
{ 18, -7, 18, -7, 18, -7, 18, -7, },
|
||||
{ 3, -2, 3, -2, 3, -2, 3, -2, },
|
||||
{ 0, 0, 0, 0, 0, 0, 0, 0, },
|
||||
},
|
||||
{
|
||||
{ 0, 1, 0, 1, 0, 1, 0, 1, },
|
||||
{ -3, 7, -3, 7, -3, 7, -3, 7, },
|
||||
{-17, 119, -17, 119, -17, 119, -17, 119, },
|
||||
{ 28, -11, 28, -11, 28, -11, 28, -11, },
|
||||
{ 5, -2, 5, -2, 5, -2, 5, -2, },
|
||||
{ 1, 0, 1, 0, 1, 0, 1, 0, },
|
||||
},
|
||||
{
|
||||
{ 0, 1, 0, 1, 0, 1, 0, 1, },
|
||||
{ -4, 8, -4, 8, -4, 8, -4, 8, },
|
||||
{-20, 114, -20, 114, -20, 114, -20, 114, },
|
||||
{ 38, -14, 38, -14, 38, -14, 38, -14, },
|
||||
{ 7, -3, 7, -3, 7, -3, 7, -3, },
|
||||
{ 1, 0, 1, 0, 1, 0, 1, 0, },
|
||||
},
|
||||
{
|
||||
{ 0, 1, 0, 1, 0, 1, 0, 1, },
|
||||
{ -4, 9, -4, 9, -4, 9, -4, 9, },
|
||||
{-22, 107, -22, 107, -22, 107, -22, 107, },
|
||||
{ 49, -17, 49, -17, 49, -17, 49, -17, },
|
||||
{ 8, -4, 8, -4, 8, -4, 8, -4, },
|
||||
{ 1, 0, 1, 0, 1, 0, 1, 0, },
|
||||
},
|
||||
{
|
||||
{ 0, 2, 0, 2, 0, 2, 0, 2, },
|
||||
{ -5, 10, -5, 10, -5, 10, -5, 10, },
|
||||
{-24, 99, -24, 99, -24, 99, -24, 99, },
|
||||
{ 59, -20, 59, -20, 59, -20, 59, -20, },
|
||||
{ 9, -4, 9, -4, 9, -4, 9, -4, },
|
||||
{ 2, 0, 2, 0, 2, 0, 2, 0, },
|
||||
},
|
||||
{
|
||||
{ 0, 2, 0, 2, 0, 2, 0, 2, },
|
||||
{ -5, 10, -5, 10, -5, 10, -5, 10, },
|
||||
{-24, 90, -24, 90, -24, 90, -24, 90, },
|
||||
{ 70, -22, 70, -22, 70, -22, 70, -22, },
|
||||
{ 10, -5, 10, -5, 10, -5, 10, -5, },
|
||||
{ 2, 0, 2, 0, 2, 0, 2, 0, },
|
||||
},
|
||||
{
|
||||
{ 0, 2, 0, 2, 0, 2, 0, 2, },
|
||||
{ -5, 10, -5, 10, -5, 10, -5, 10, },
|
||||
{-23, 80, -23, 80, -23, 80, -23, 80, },
|
||||
{ 80, -23, 80, -23, 80, -23, 80, -23, },
|
||||
{ 10, -5, 10, -5, 10, -5, 10, -5, },
|
||||
{ 2, 0, 2, 0, 2, 0, 2, 0, },
|
||||
},
|
||||
{
|
||||
{ 0, 2, 0, 2, 0, 2, 0, 2, },
|
||||
{ -5, 10, -5, 10, -5, 10, -5, 10, },
|
||||
{-22, 70, -22, 70, -22, 70, -22, 70, },
|
||||
{ 90, -24, 90, -24, 90, -24, 90, -24, },
|
||||
{ 10, -5, 10, -5, 10, -5, 10, -5, },
|
||||
{ 2, 0, 2, 0, 2, 0, 2, 0, },
|
||||
},
|
||||
{
|
||||
{ 0, 2, 0, 2, 0, 2, 0, 2, },
|
||||
{ -4, 9, -4, 9, -4, 9, -4, 9, },
|
||||
{-20, 59, -20, 59, -20, 59, -20, 59, },
|
||||
{ 99, -24, 99, -24, 99, -24, 99, -24, },
|
||||
{ 10, -5, 10, -5, 10, -5, 10, -5, },
|
||||
{ 2, 0, 2, 0, 2, 0, 2, 0, },
|
||||
},
|
||||
{
|
||||
{ 0, 1, 0, 1, 0, 1, 0, 1, },
|
||||
{ -4, 8, -4, 8, -4, 8, -4, 8, },
|
||||
{-17, 49, -17, 49, -17, 49, -17, 49, },
|
||||
{107, -22, 107, -22, 107, -22, 107, -22, },
|
||||
{ 9, -4, 9, -4, 9, -4, 9, -4, },
|
||||
{ 1, 0, 1, 0, 1, 0, 1, 0, },
|
||||
},
|
||||
{
|
||||
{ 0, 1, 0, 1, 0, 1, 0, 1, },
|
||||
{ -3, 7, -3, 7, -3, 7, -3, 7, },
|
||||
{-14, 38, -14, 38, -14, 38, -14, 38, },
|
||||
{114, -20, 114, -20, 114, -20, 114, -20, },
|
||||
{ 8, -4, 8, -4, 8, -4, 8, -4, },
|
||||
{ 1, 0, 1, 0, 1, 0, 1, 0, },
|
||||
},
|
||||
{
|
||||
{ 0, 1, 0, 1, 0, 1, 0, 1, },
|
||||
{ -2, 5, -2, 5, -2, 5, -2, 5, },
|
||||
{-11, 28, -11, 28, -11, 28, -11, 28, },
|
||||
{119, -17, 119, -17, 119, -17, 119, -17, },
|
||||
{ 7, -3, 7, -3, 7, -3, 7, -3, },
|
||||
{ 1, 0, 1, 0, 1, 0, 1, 0, },
|
||||
},
|
||||
{
|
||||
{ 0, 0, 0, 0, 0, 0, 0, 0, },
|
||||
{ -2, 3, -2, 3, -2, 3, -2, 3, },
|
||||
{ -7, 18, -7, 18, -7, 18, -7, 18, },
|
||||
{124, -12, 124, -12, 124, -12, 124, -12, },
|
||||
{ 5, -2, 5, -2, 5, -2, 5, -2, },
|
||||
{ 1, 0, 1, 0, 1, 0, 1, 0, },
|
||||
},
|
||||
{
|
||||
{ 0, 0, 0, 0, 0, 0, 0, 0, },
|
||||
{ -1, 2, -1, 2, -1, 2, -1, 2, },
|
||||
{ -4, 8, -4, 8, -4, 8, -4, 8, },
|
||||
{127, -6, 127, -6, 127, -6, 127, -6, },
|
||||
{ 3, -1, 3, -1, 3, -1, 3, -1, },
|
||||
{ 0, 0, 0, 0, 0, 0, 0, 0, },
|
||||
},
|
||||
};
|
||||
#endif
|
||||
#endif
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
#if CONFIG_EXT_INTERP
|
||||
DECLARE_ALIGNED(16, const int16_t,
|
||||
sub_pel_filters_12sharp_highbd_ver_signal_dir[15][6][8]) = {
|
||||
{
|
||||
{ 0, 1, 0, 1, 0, 1, 0, 1, },
|
||||
{ -2, 3, -2, 3, -2, 3, -2, 3, },
|
||||
{ -7, 127, -7, 127, -7, 127, -7, 127, },
|
||||
{ 8, -4, 8, -4, 8, -4, 8, -4, },
|
||||
{ 2, -1, 2, -1, 2, -1, 2, -1, },
|
||||
{ 1, 0, 1, 0, 1, 0, 1, 0, },
|
||||
},
|
||||
{
|
||||
{ -1, 2, -1, 2, -1, 2, -1, 2, },
|
||||
{ -3, 6, -3, 6, -3, 6, -3, 6, },
|
||||
{-13, 124, -13, 124, -13, 124, -13, 124, },
|
||||
{ 18, -8, 18, -8, 18, -8, 18, -8, },
|
||||
{ 4, -2, 4, -2, 4, -2, 4, -2, },
|
||||
{ 2, -1, 2, -1, 2, -1, 2, -1, },
|
||||
},
|
||||
{
|
||||
{ -1, 3, -1, 3, -1, 3, -1, 3, },
|
||||
{ -4, 8, -4, 8, -4, 8, -4, 8, },
|
||||
{-18, 120, -18, 120, -18, 120, -18, 120, },
|
||||
{ 28, -12, 28, -12, 28, -12, 28, -12, },
|
||||
{ 7, -4, 7, -4, 7, -4, 7, -4, },
|
||||
{ 2, -1, 2, -1, 2, -1, 2, -1, },
|
||||
},
|
||||
{
|
||||
{ -1, 3, -1, 3, -1, 3, -1, 3, },
|
||||
{ -6, 10, -6, 10, -6, 10, -6, 10, },
|
||||
{-21, 115, -21, 115, -21, 115, -21, 115, },
|
||||
{ 38, -15, 38, -15, 38, -15, 38, -15, },
|
||||
{ 8, -5, 8, -5, 8, -5, 8, -5, },
|
||||
{ 3, -1, 3, -1, 3, -1, 3, -1, },
|
||||
},
|
||||
{
|
||||
{ -2, 4, -2, 4, -2, 4, -2, 4, },
|
||||
{ -6, 12, -6, 12, -6, 12, -6, 12, },
|
||||
{-24, 108, -24, 108, -24, 108, -24, 108, },
|
||||
{ 49, -18, 49, -18, 49, -18, 49, -18, },
|
||||
{ 10, -6, 10, -6, 10, -6, 10, -6, },
|
||||
{ 3, -2, 3, -2, 3, -2, 3, -2, },
|
||||
},
|
||||
{
|
||||
{ -2, 4, -2, 4, -2, 4, -2, 4, },
|
||||
{ -7, 13, -7, 13, -7, 13, -7, 13, },
|
||||
{-25, 100, -25, 100, -25, 100, -25, 100, },
|
||||
{ 60, -21, 60, -21, 60, -21, 60, -21, },
|
||||
{ 11, -7, 11, -7, 11, -7, 11, -7, },
|
||||
{ 4, -2, 4, -2, 4, -2, 4, -2, },
|
||||
},
|
||||
{
|
||||
{ -2, 4, -2, 4, -2, 4, -2, 4, },
|
||||
{ -7, 13, -7, 13, -7, 13, -7, 13, },
|
||||
{-26, 91, -26, 91, -26, 91, -26, 91, },
|
||||
{ 71, -24, 71, -24, 71, -24, 71, -24, },
|
||||
{ 13, -7, 13, -7, 13, -7, 13, -7, },
|
||||
{ 4, -2, 4, -2, 4, -2, 4, -2, },
|
||||
},
|
||||
{
|
||||
{ -2, 4, -2, 4, -2, 4, -2, 4, },
|
||||
{ -7, 13, -7, 13, -7, 13, -7, 13, },
|
||||
{-25, 81, -25, 81, -25, 81, -25, 81, },
|
||||
{ 81, -25, 81, -25, 81, -25, 81, -25, },
|
||||
{ 13, -7, 13, -7, 13, -7, 13, -7, },
|
||||
{ 4, -2, 4, -2, 4, -2, 4, -2, },
|
||||
},
|
||||
{
|
||||
{ -2, 4, -2, 4, -2, 4, -2, 4, },
|
||||
{ -7, 13, -7, 13, -7, 13, -7, 13, },
|
||||
{-24, 71, -24, 71, -24, 71, -24, 71, },
|
||||
{ 91, -26, 91, -26, 91, -26, 91, -26, },
|
||||
{ 13, -7, 13, -7, 13, -7, 13, -7, },
|
||||
{ 4, -2, 4, -2, 4, -2, 4, -2, },
|
||||
},
|
||||
{
|
||||
{ -2, 4, -2, 4, -2, 4, -2, 4, },
|
||||
{ -7, 11, -7, 11, -7, 11, -7, 11, },
|
||||
{-21, 60, -21, 60, -21, 60, -21, 60, },
|
||||
{100, -25, 100, -25, 100, -25, 100, -25, },
|
||||
{ 13, -7, 13, -7, 13, -7, 13, -7, },
|
||||
{ 4, -2, 4, -2, 4, -2, 4, -2, },
|
||||
},
|
||||
{
|
||||
{ -2, 3, -2, 3, -2, 3, -2, 3, },
|
||||
{ -6, 10, -6, 10, -6, 10, -6, 10, },
|
||||
{-18, 49, -18, 49, -18, 49, -18, 49, },
|
||||
{108, -24, 108, -24, 108, -24, 108, -24, },
|
||||
{ 12, -6, 12, -6, 12, -6, 12, -6, },
|
||||
{ 4, -2, 4, -2, 4, -2, 4, -2, },
|
||||
},
|
||||
{
|
||||
{ -1, 3, -1, 3, -1, 3, -1, 3, },
|
||||
{ -5, 8, -5, 8, -5, 8, -5, 8, },
|
||||
{-15, 38, -15, 38, -15, 38, -15, 38, },
|
||||
{115, -21, 115, -21, 115, -21, 115, -21, },
|
||||
{ 10, -6, 10, -6, 10, -6, 10, -6, },
|
||||
{ 3, -1, 3, -1, 3, -1, 3, -1, },
|
||||
},
|
||||
{
|
||||
{ -1, 2, -1, 2, -1, 2, -1, 2, },
|
||||
{ -4, 7, -4, 7, -4, 7, -4, 7, },
|
||||
{-12, 28, -12, 28, -12, 28, -12, 28, },
|
||||
{120, -18, 120, -18, 120, -18, 120, -18, },
|
||||
{ 8, -4, 8, -4, 8, -4, 8, -4, },
|
||||
{ 3, -1, 3, -1, 3, -1, 3, -1, },
|
||||
},
|
||||
{
|
||||
{ -1, 2, -1, 2, -1, 2, -1, 2, },
|
||||
{ -2, 4, -2, 4, -2, 4, -2, 4, },
|
||||
{ -8, 18, -8, 18, -8, 18, -8, 18, },
|
||||
{124, -13, 124, -13, 124, -13, 124, -13, },
|
||||
{ 6, -3, 6, -3, 6, -3, 6, -3, },
|
||||
{ 2, -1, 2, -1, 2, -1, 2, -1, },
|
||||
},
|
||||
{
|
||||
{ 0, 1, 0, 1, 0, 1, 0, 1, },
|
||||
{ -1, 2, -1, 2, -1, 2, -1, 2, },
|
||||
{ -4, 8, -4, 8, -4, 8, -4, 8, },
|
||||
{127, -7, 127, -7, 127, -7, 127, -7, },
|
||||
{ 3, -2, 3, -2, 3, -2, 3, -2, },
|
||||
{ 1, 0, 1, 0, 1, 0, 1, 0, },
|
||||
},
|
||||
};
|
||||
#endif
|
||||
#endif
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
#if USE_TEMPORALFILTER_12TAP
|
||||
DECLARE_ALIGNED(16, const int16_t,
|
||||
sub_pel_filters_temporalfilter_12_highbd_ver_signal_dir[15][6][8]) = {
|
||||
{
|
||||
{ 0, 1, 0, 1, 0, 1, 0, 1, },
|
||||
{ -1, 3, -1, 3, -1, 3, -1, 3, },
|
||||
{ -7, 127, -7, 127, -7, 127, -7, 127, },
|
||||
{ 8, -4, 8, -4, 8, -4, 8, -4, },
|
||||
{ 2, -1, 2, -1, 2, -1, 2, -1, },
|
||||
{ 0, 0, 0, 0, 0, 0, 0, 0, },
|
||||
},
|
||||
{
|
||||
{ 0, 1, 0, 1, 0, 1, 0, 1, },
|
||||
{ -3, 5, -3, 5, -3, 5, -3, 5, },
|
||||
{-12, 124, -12, 124, -12, 124, -12, 124, },
|
||||
{ 18, -8, 18, -8, 18, -8, 18, -8, },
|
||||
{ 4, -2, 4, -2, 4, -2, 4, -2, },
|
||||
{ 1, 0, 1, 0, 1, 0, 1, 0, },
|
||||
},
|
||||
{
|
||||
{ -1, 2, -1, 2, -1, 2, -1, 2, },
|
||||
{ -4, 8, -4, 8, -4, 8, -4, 8, },
|
||||
{-17, 120, -17, 120, -17, 120, -17, 120, },
|
||||
{ 28, -11, 28, -11, 28, -11, 28, -11, },
|
||||
{ 6, -3, 6, -3, 6, -3, 6, -3, },
|
||||
{ 1, -1, 1, -1, 1, -1, 1, -1, },
|
||||
},
|
||||
{
|
||||
{ -1, 2, -1, 2, -1, 2, -1, 2, },
|
||||
{ -4, 10, -4, 10, -4, 10, -4, 10, },
|
||||
{-21, 114, -21, 114, -21, 114, -21, 114, },
|
||||
{ 38, -15, 38, -15, 38, -15, 38, -15, },
|
||||
{ 8, -4, 8, -4, 8, -4, 8, -4, },
|
||||
{ 2, -1, 2, -1, 2, -1, 2, -1, },
|
||||
},
|
||||
{
|
||||
{ -1, 3, -1, 3, -1, 3, -1, 3, },
|
||||
{ -5, 11, -5, 11, -5, 11, -5, 11, },
|
||||
{-23, 107, -23, 107, -23, 107, -23, 107, },
|
||||
{ 49, -18, 49, -18, 49, -18, 49, -18, },
|
||||
{ 9, -5, 9, -5, 9, -5, 9, -5, },
|
||||
{ 2, -1, 2, -1, 2, -1, 2, -1, },
|
||||
},
|
||||
{
|
||||
{ -1, 3, -1, 3, -1, 3, -1, 3, },
|
||||
{ -6, 12, -6, 12, -6, 12, -6, 12, },
|
||||
{-25, 99, -25, 99, -25, 99, -25, 99, },
|
||||
{ 60, -21, 60, -21, 60, -21, 60, -21, },
|
||||
{ 11, -6, 11, -6, 11, -6, 11, -6, },
|
||||
{ 3, -1, 3, -1, 3, -1, 3, -1, },
|
||||
},
|
||||
{
|
||||
{ -1, 3, -1, 3, -1, 3, -1, 3, },
|
||||
{ -6, 12, -6, 12, -6, 12, -6, 12, },
|
||||
{-25, 90, -25, 90, -25, 90, -25, 90, },
|
||||
{ 70, -23, 70, -23, 70, -23, 70, -23, },
|
||||
{ 12, -6, 12, -6, 12, -6, 12, -6, },
|
||||
{ 3, -1, 3, -1, 3, -1, 3, -1, },
|
||||
},
|
||||
{
|
||||
{ -1, 3, -1, 3, -1, 3, -1, 3, },
|
||||
{ -6, 12, -6, 12, -6, 12, -6, 12, },
|
||||
{-24, 80, -24, 80, -24, 80, -24, 80, },
|
||||
{ 80, -24, 80, -24, 80, -24, 80, -24, },
|
||||
{ 12, -6, 12, -6, 12, -6, 12, -6, },
|
||||
{ 3, -1, 3, -1, 3, -1, 3, -1, },
|
||||
},
|
||||
{
|
||||
{ -1, 3, -1, 3, -1, 3, -1, 3, },
|
||||
{ -6, 12, -6, 12, -6, 12, -6, 12, },
|
||||
{-23, 70, -23, 70, -23, 70, -23, 70, },
|
||||
{ 90, -25, 90, -25, 90, -25, 90, -25, },
|
||||
{ 12, -6, 12, -6, 12, -6, 12, -6, },
|
||||
{ 3, -1, 3, -1, 3, -1, 3, -1, },
|
||||
},
|
||||
{
|
||||
{ -1, 3, -1, 3, -1, 3, -1, 3, },
|
||||
{ -6, 11, -6, 11, -6, 11, -6, 11, },
|
||||
{-21, 60, -21, 60, -21, 60, -21, 60, },
|
||||
{ 99, -25, 99, -25, 99, -25, 99, -25, },
|
||||
{ 12, -6, 12, -6, 12, -6, 12, -6, },
|
||||
{ 3, -1, 3, -1, 3, -1, 3, -1, },
|
||||
},
|
||||
{
|
||||
{ -1, 2, -1, 2, -1, 2, -1, 2, },
|
||||
{ -5, 9, -5, 9, -5, 9, -5, 9, },
|
||||
{-18, 49, -18, 49, -18, 49, -18, 49, },
|
||||
{107, -23, 107, -23, 107, -23, 107, -23, },
|
||||
{ 11, -5, 11, -5, 11, -5, 11, -5, },
|
||||
{ 3, -1, 3, -1, 3, -1, 3, -1, },
|
||||
},
|
||||
{
|
||||
{ -1, 2, -1, 2, -1, 2, -1, 2, },
|
||||
{ -4, 8, -4, 8, -4, 8, -4, 8, },
|
||||
{-15, 38, -15, 38, -15, 38, -15, 38, },
|
||||
{114, -21, 114, -21, 114, -21, 114, -21, },
|
||||
{ 10, -4, 10, -4, 10, -4, 10, -4, },
|
||||
{ 2, -1, 2, -1, 2, -1, 2, -1, },
|
||||
},
|
||||
{
|
||||
{ -1, 1, -1, 1, -1, 1, -1, 1, },
|
||||
{ -3, 6, -3, 6, -3, 6, -3, 6, },
|
||||
{-11, 28, -11, 28, -11, 28, -11, 28, },
|
||||
{120, -17, 120, -17, 120, -17, 120, -17, },
|
||||
{ 8, -4, 8, -4, 8, -4, 8, -4, },
|
||||
{ 2, -1, 2, -1, 2, -1, 2, -1, },
|
||||
},
|
||||
{
|
||||
{ 0, 1, 0, 1, 0, 1, 0, 1, },
|
||||
{ -2, 4, -2, 4, -2, 4, -2, 4, },
|
||||
{ -8, 18, -8, 18, -8, 18, -8, 18, },
|
||||
{124, -12, 124, -12, 124, -12, 124, -12, },
|
||||
{ 5, -3, 5, -3, 5, -3, 5, -3, },
|
||||
{ 1, 0, 1, 0, 1, 0, 1, 0, },
|
||||
},
|
||||
{
|
||||
{ 0, 0, 0, 0, 0, 0, 0, 0, },
|
||||
{ -1, 2, -1, 2, -1, 2, -1, 2, },
|
||||
{ -4, 8, -4, 8, -4, 8, -4, 8, },
|
||||
{127, -7, 127, -7, 127, -7, 127, -7, },
|
||||
{ 3, -1, 3, -1, 3, -1, 3, -1, },
|
||||
{ 1, 0, 1, 0, 1, 0, 1, 0, },
|
||||
},
|
||||
};
|
||||
#endif
|
||||
#endif
|
474
vp10/common/x86/vp10_highbd_convolve_sse4.c
Normal file
474
vp10/common/x86/vp10_highbd_convolve_sse4.c
Normal file
@@ -0,0 +1,474 @@
|
||||
/*
|
||||
* Copyright (c) 2016 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include <assert.h>
|
||||
#include <smmintrin.h>
|
||||
|
||||
#include "./vp10_rtcd.h"
|
||||
#include "vp10/common/filter.h"
|
||||
|
||||
typedef void (*TransposeSave)(const int width, int pixelsNum,
|
||||
uint32_t *src, int src_stride,
|
||||
uint16_t *dst, int dst_stride,
|
||||
int bd);
|
||||
|
||||
// pixelsNum 0: write all 4 pixels
|
||||
// 1/2/3: residual pixels 1/2/3
|
||||
static void writePixel(__m128i *u, int width, int pixelsNum,
|
||||
uint16_t *dst, int dst_stride) {
|
||||
if (2 == width) {
|
||||
if (0 == pixelsNum) {
|
||||
*(int *)dst = _mm_cvtsi128_si32(u[0]);
|
||||
*(int *)(dst + dst_stride) = _mm_cvtsi128_si32(u[1]);
|
||||
*(int *)(dst + 2 * dst_stride) = _mm_cvtsi128_si32(u[2]);
|
||||
*(int *)(dst + 3 * dst_stride) = _mm_cvtsi128_si32(u[3]);
|
||||
} else if (1 == pixelsNum) {
|
||||
*(int *)dst = _mm_cvtsi128_si32(u[0]);
|
||||
} else if (2 == pixelsNum) {
|
||||
*(int *)dst = _mm_cvtsi128_si32(u[0]);
|
||||
*(int *)(dst + dst_stride) = _mm_cvtsi128_si32(u[1]);
|
||||
} else if (3 == pixelsNum) {
|
||||
*(int *)dst = _mm_cvtsi128_si32(u[0]);
|
||||
*(int *)(dst + dst_stride) = _mm_cvtsi128_si32(u[1]);
|
||||
*(int *)(dst + 2 * dst_stride) = _mm_cvtsi128_si32(u[2]);
|
||||
}
|
||||
} else {
|
||||
if (0 == pixelsNum) {
|
||||
_mm_storel_epi64((__m128i *)dst, u[0]);
|
||||
_mm_storel_epi64((__m128i *)(dst + dst_stride), u[1]);
|
||||
_mm_storel_epi64((__m128i *)(dst + 2 * dst_stride), u[2]);
|
||||
_mm_storel_epi64((__m128i *)(dst + 3 * dst_stride), u[3]);
|
||||
} else if (1 == pixelsNum) {
|
||||
_mm_storel_epi64((__m128i *)dst, u[0]);
|
||||
} else if (2 == pixelsNum) {
|
||||
_mm_storel_epi64((__m128i *)dst, u[0]);
|
||||
_mm_storel_epi64((__m128i *)(dst + dst_stride), u[1]);
|
||||
} else if (3 == pixelsNum) {
|
||||
_mm_storel_epi64((__m128i *)dst, u[0]);
|
||||
_mm_storel_epi64((__m128i *)(dst + dst_stride), u[1]);
|
||||
_mm_storel_epi64((__m128i *)(dst + 2 * dst_stride), u[2]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Clamps every signed 16-bit lane of p[0..numVecs-1] to the pixel range of
// the given bit depth (10/12), i.e. to [0, (1 << bd) - 1].
static void highbd_clip(__m128i *p, int numVecs, int bd) {
  const __m128i lower = _mm_setzero_si128();
  const __m128i unit = _mm_set1_epi16(1);
  const __m128i upper = _mm_sub_epi16(_mm_slli_epi16(unit, bd), unit);
  int k = 0;

  while (k < numVecs) {
    // Signed min against the upper bound, then signed max against zero.
    p[k] = _mm_max_epi16(_mm_min_epi16(p[k], upper), lower);
    ++k;
  }
}
|
||||
|
||||
static void transClipPixel(uint32_t *src, int src_stride, __m128i *u, int bd) {
|
||||
__m128i v0, v1;
|
||||
__m128i rnd = _mm_set1_epi32(1 << (FILTER_BITS - 1));
|
||||
|
||||
u[0] = _mm_loadu_si128((__m128i const *)src);
|
||||
u[1] = _mm_loadu_si128((__m128i const *)(src + src_stride));
|
||||
u[2] = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride));
|
||||
u[3] = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride));
|
||||
|
||||
u[0] = _mm_add_epi32(u[0], rnd);
|
||||
u[1] = _mm_add_epi32(u[1], rnd);
|
||||
u[2] = _mm_add_epi32(u[2], rnd);
|
||||
u[3] = _mm_add_epi32(u[3], rnd);
|
||||
|
||||
u[0] = _mm_srai_epi32(u[0], FILTER_BITS);
|
||||
u[1] = _mm_srai_epi32(u[1], FILTER_BITS);
|
||||
u[2] = _mm_srai_epi32(u[2], FILTER_BITS);
|
||||
u[3] = _mm_srai_epi32(u[3], FILTER_BITS);
|
||||
|
||||
u[0] = _mm_packus_epi32(u[0], u[1]);
|
||||
u[1] = _mm_packus_epi32(u[2], u[3]);
|
||||
|
||||
highbd_clip(u, 2, bd);
|
||||
|
||||
v0 = _mm_unpacklo_epi16(u[0], u[1]);
|
||||
v1 = _mm_unpackhi_epi16(u[0], u[1]);
|
||||
|
||||
u[0] = _mm_unpacklo_epi16(v0, v1);
|
||||
u[2] = _mm_unpackhi_epi16(v0, v1);
|
||||
|
||||
u[1] = _mm_srli_si128(u[0], 8);
|
||||
u[3] = _mm_srli_si128(u[2], 8);
|
||||
}
|
||||
|
||||
// Rounds, clips and transposes a 4x4 tile of filter sums, then stores it,
// overwriting the destination.
// pixelsNum = 0     : all 4 rows of pixels will be saved.
// pixelsNum = 1/2/3 : only the residual 1/2/3 rows of pixels will be saved.
void trans_save_4x4(const int width, int pixelsNum,
                    uint32_t *src, int src_stride,
                    uint16_t *dst, int dst_stride,
                    int bd) {
  __m128i rows[4];

  transClipPixel(src, src_stride, rows, bd);
  writePixel(rows, width, pixelsNum, dst, dst_stride);
}
|
||||
|
||||
// Same as trans_save_4x4, but each new pixel is averaged with the pixel
// already in dst, rounding up: (new + old + 1) >> 1.
// NOTE(review): all four dst rows are loaded (8 bytes each) regardless of
// width and pixelsNum; only the rows/pixels selected by writePixel are
// written back.
void trans_accum_save_4x4(const int width, int pixelsNum,
                          uint32_t *src, int src_stride,
                          uint16_t *dst, int dst_stride,
                          int bd) {
  const __m128i one = _mm_set1_epi16(1);
  __m128i cur[4], prev[4];
  int k;

  transClipPixel(src, src_stride, cur, bd);

  for (k = 0; k < 4; ++k) {
    prev[k] = _mm_loadl_epi64((__m128i const *)(dst + k * dst_stride));
  }

  for (k = 0; k < 4; ++k) {
    const __m128i sum = _mm_add_epi16(_mm_add_epi16(cur[k], prev[k]), one);
    cur[k] = _mm_srai_epi16(sum, 1);
  }

  writePixel(cur, width, pixelsNum, dst, dst_stride);
}
|
||||
|
||||
static TransposeSave transSaveTab[2] = {
|
||||
trans_save_4x4, trans_accum_save_4x4};
|
||||
|
||||
// Gathers 32-bit lanes (i.e. pairs of adjacent 16-bit values) across four
// vectors at a time:
//   out[k], k = 0..3 : lane k of in[0], in[1], in[2], in[3]
//   out[4], out[5]   : lanes 0 and 1 of in[4], in[5], in[6], in[7]
static INLINE void transpose_pair(__m128i *in, __m128i *out) {
  __m128i a, b;

  // Lanes 0 and 1 of in[0..3].
  a = _mm_unpacklo_epi32(in[0], in[1]);
  b = _mm_unpacklo_epi32(in[2], in[3]);
  out[0] = _mm_unpacklo_epi64(a, b);
  out[1] = _mm_unpackhi_epi64(a, b);

  // Lanes 2 and 3 of in[0..3].
  a = _mm_unpackhi_epi32(in[0], in[1]);
  b = _mm_unpackhi_epi32(in[2], in[3]);
  out[2] = _mm_unpacklo_epi64(a, b);
  out[3] = _mm_unpackhi_epi64(a, b);

  // Lanes 0 and 1 of in[4..7].
  a = _mm_unpacklo_epi32(in[4], in[5]);
  b = _mm_unpacklo_epi32(in[6], in[7]);
  out[4] = _mm_unpacklo_epi64(a, b);
  out[5] = _mm_unpackhi_epi64(a, b);
}
|
||||
|
||||
// Filters one horizontal position for four consecutive rows.
// Loads 16 pixels from each of 4 rows starting at src (one pixel earlier when
// tapsNum is 10), regroups them into six vectors of pixel pairs, multiplies
// them by the six coefficient vectors in f and writes the four 32-bit sums
// (one per row) to buf. No rounding or clipping is done here.
static void highbd_filter_horiz(const uint16_t *src, int src_stride,
                                __m128i *f, int tapsNum, uint32_t *buf) {
  __m128i px[8], pairs[6], prod[6], acc;
  int k;

  if (10 == tapsNum) {
    src -= 1;
  }

  // First 8 pixels of each of the 4 rows, then the following 8 pixels.
  for (k = 0; k < 4; ++k) {
    px[k] = _mm_loadu_si128((__m128i const *)(src + k * src_stride));
  }
  for (k = 0; k < 4; ++k) {
    px[4 + k] = _mm_loadu_si128((__m128i const *)(src + k * src_stride + 8));
  }

  transpose_pair(px, pairs);

  for (k = 0; k < 6; ++k) {
    prod[k] = _mm_madd_epi16(pairs[k], f[k]);
  }

  // The two middle products are added as min then max; together they
  // contribute prod[2] + prod[3].
  acc = _mm_add_epi32(prod[0], prod[1]);
  acc = _mm_add_epi32(acc, prod[5]);
  acc = _mm_add_epi32(acc, prod[4]);
  acc = _mm_add_epi32(acc, _mm_min_epi32(prod[2], prod[3]));
  acc = _mm_add_epi32(acc, _mm_max_epi32(prod[2], prod[3]));

  _mm_storeu_si128((__m128i *)buf, acc);
}
|
||||
|
||||
void vp10_highbd_convolve_horiz_sse4_1(const uint16_t *src, int src_stride,
|
||||
uint16_t *dst, int dst_stride,
|
||||
int w, int h,
|
||||
const InterpFilterParams filter_params,
|
||||
const int subpel_x_q4, int x_step_q4,
|
||||
int avg, int bd) {
|
||||
DECLARE_ALIGNED(16, uint32_t, temp[4 * 4]);
|
||||
__m128i verf[6];
|
||||
HbdSubpelFilterCoeffs vCoeffs;
|
||||
const uint16_t *srcPtr;
|
||||
const int tapsNum = filter_params.taps;
|
||||
int i, col, count, blkResidu, blkHeight;
|
||||
TransposeSave transSave = transSaveTab[avg];
|
||||
(void)x_step_q4;
|
||||
|
||||
if (0 == subpel_x_q4 || 16 != x_step_q4) {
|
||||
vp10_highbd_convolve_horiz_c(src, src_stride, dst, dst_stride, w, h,
|
||||
filter_params, subpel_x_q4, x_step_q4, avg,
|
||||
bd);
|
||||
return;
|
||||
}
|
||||
|
||||
vCoeffs = vp10_hbd_get_subpel_filter_ver_signal_dir(
|
||||
filter_params, subpel_x_q4 - 1);
|
||||
if (!vCoeffs) {
|
||||
vp10_highbd_convolve_horiz_c(src, src_stride, dst, dst_stride, w, h,
|
||||
filter_params, subpel_x_q4, x_step_q4, avg,
|
||||
bd);
|
||||
return;
|
||||
}
|
||||
|
||||
verf[0] = *((const __m128i *)(vCoeffs));
|
||||
verf[1] = *((const __m128i *)(vCoeffs + 1));
|
||||
verf[2] = *((const __m128i *)(vCoeffs + 2));
|
||||
verf[3] = *((const __m128i *)(vCoeffs + 3));
|
||||
verf[4] = *((const __m128i *)(vCoeffs + 4));
|
||||
verf[5] = *((const __m128i *)(vCoeffs + 5));
|
||||
|
||||
src -= (tapsNum >> 1) - 1;
|
||||
srcPtr = src;
|
||||
|
||||
count = 0;
|
||||
blkHeight = h >> 2;
|
||||
blkResidu = h & 3;
|
||||
|
||||
while (blkHeight != 0) {
|
||||
for (col = 0; col < w; col += 4) {
|
||||
for (i = 0; i < 4; ++i) {
|
||||
highbd_filter_horiz(srcPtr, src_stride, verf, tapsNum, temp + (i * 4));
|
||||
srcPtr += 1;
|
||||
}
|
||||
transSave(w, 0, temp, 4, dst + col, dst_stride, bd);
|
||||
}
|
||||
count++;
|
||||
srcPtr = src + count * src_stride * 4;
|
||||
dst += dst_stride * 4;
|
||||
blkHeight--;
|
||||
}
|
||||
|
||||
for (col = 0; col < w; col += 4) {
|
||||
for (i = 0; i < 4; ++i) {
|
||||
highbd_filter_horiz(srcPtr, src_stride, verf, tapsNum, temp + (i * 4));
|
||||
srcPtr += 1;
|
||||
}
|
||||
transSave(w, blkResidu, temp, 4, dst + col, dst_stride, bd);
|
||||
}
|
||||
}
|
||||
|
||||
// Vertical convolutional filter
|
||||
|
||||
typedef void (*WritePixels)(__m128i *u, int bd, uint16_t *dst);
|
||||
|
||||
static void highbdRndingPacks(__m128i *u) {
|
||||
__m128i rnd = _mm_set1_epi32(1 << (FILTER_BITS - 1));
|
||||
u[0] = _mm_add_epi32(u[0], rnd);
|
||||
u[0] = _mm_srai_epi32(u[0], FILTER_BITS);
|
||||
u[0] = _mm_packus_epi32(u[0], u[0]);
|
||||
}
|
||||
|
||||
// Rounds, packs and clips the sums in u[0], then stores the first two pixels
// (32 bits) to dst, overwriting what was there.
static void write2pixelsOnly(__m128i *u, int bd, uint16_t *dst) {
  uint32_t *const out = (uint32_t *)dst;

  highbdRndingPacks(u);
  highbd_clip(u, 1, bd);
  *out = _mm_cvtsi128_si32(u[0]);
}
|
||||
|
||||
// Rounds, packs and clips the sums in u[0], then averages the first two
// pixels with the two pixels already at dst, rounding up
// ((new + old + 1) >> 1), and stores the result back to dst.
static void write2pixelsAccum(__m128i *u, int bd, uint16_t *dst) {
  // Only two pixels (32 bits) are produced and stored, so load exactly 32
  // bits of the destination. A 64-bit load would read 4 bytes beyond the two
  // pixels this routine owns.
  __m128i v = _mm_cvtsi32_si128(*(const int *)dst);
  const __m128i ones = _mm_set1_epi16(1);

  highbdRndingPacks(u);
  highbd_clip(u, 1, bd);

  v = _mm_add_epi16(v, u[0]);
  v = _mm_add_epi16(v, ones);
  v = _mm_srai_epi16(v, 1);
  *(uint32_t *)dst = _mm_cvtsi128_si32(v);
}
|
||||
|
||||
// Two-pixel store routines: entry 0 overwrites the destination, entry 1
// averages with the destination contents.
WritePixels write2pixelsTab[2] = {
  write2pixelsOnly,
  write2pixelsAccum
};
|
||||
|
||||
// Rounds, packs and clips the sums in u[0], then stores the four resulting
// pixels (64 bits) to dst, overwriting what was there.
static void write4pixelsOnly(__m128i *u, int bd, uint16_t *dst) {
  __m128i *const out = (__m128i *)dst;

  highbdRndingPacks(u);
  highbd_clip(u, 1, bd);
  _mm_storel_epi64(out, u[0]);
}
|
||||
|
||||
// Rounds, packs and clips the sums in u[0], then averages the four pixels
// with the four pixels already at dst, rounding up ((new + old + 1) >> 1),
// and stores the result back to dst.
static void write4pixelsAccum(__m128i *u, int bd, uint16_t *dst) {
  const __m128i one = _mm_set1_epi16(1);
  __m128i mix = _mm_loadl_epi64((__m128i const *)dst);

  highbdRndingPacks(u);
  highbd_clip(u, 1, bd);

  mix = _mm_add_epi16(_mm_add_epi16(mix, u[0]), one);
  mix = _mm_srai_epi16(mix, 1);
  _mm_storel_epi64((__m128i *)dst, mix);
}
|
||||
|
||||
// Four-pixel store routines: entry 0 overwrites the destination, entry 1
// averages with the destination contents.
WritePixels write4pixelsTab[2] = {
  write4pixelsOnly,
  write4pixelsAccum
};
|
||||
|
||||
// Vertically filters four horizontally adjacent pixels of one output row.
// Rows are loaded src_stride apart starting at src. With 12 taps, 12 rows
// fill s[0..11]; with 10 taps, s[0] is zero and 11 rows fill s[1..11].
// Adjacent rows are interleaved, multiplied by the six coefficient vectors
// in f and summed into s[0], which saveFunc then converts and stores to dst.
static void filter_vert_horiz_parallel(const uint16_t *src, int src_stride,
                                       const __m128i *f, int taps,
                                       uint16_t *dst, WritePixels saveFunc,
                                       int bd) {
  __m128i s[12];
  int slot = 0;
  int row, k;

  // TODO(luoyi) treat s[12] as a circular buffer in width = 2 case
  if (10 == taps) {
    s[0] = _mm_setzero_si128();
    slot = 1;
  }
  for (row = 0; slot < 12; ++slot, ++row) {
    s[slot] = _mm_loadu_si128((__m128i const *)(src + row * src_stride));
  }

  // Pair up neighbouring rows and apply the matching coefficient pair; the
  // products land in the even slots.
  for (k = 0; k < 6; ++k) {
    const __m128i rows = _mm_unpacklo_epi16(s[2 * k], s[2 * k + 1]);
    s[2 * k] = _mm_madd_epi16(rows, f[k]);
  }

  // The two middle products are added as min then max; together they
  // contribute s[4] + s[6].
  s[1] = _mm_min_epi32(s[4], s[6]);
  s[3] = _mm_max_epi32(s[4], s[6]);

  s[0] = _mm_add_epi32(s[0], s[2]);
  s[0] = _mm_add_epi32(s[0], s[10]);
  s[0] = _mm_add_epi32(s[0], s[8]);
  s[0] = _mm_add_epi32(s[0], s[1]);
  s[0] = _mm_add_epi32(s[0], s[3]);

  saveFunc(s, bd, dst);
}
|
||||
|
||||
static void highbd_filter_vert_compute_large(const uint16_t *src,
|
||||
int src_stride,
|
||||
const __m128i *f, int taps,
|
||||
int w, int h,
|
||||
uint16_t *dst, int dst_stride,
|
||||
int avg, int bd) {
|
||||
int col;
|
||||
int rowIndex = 0;
|
||||
const uint16_t *src_ptr = src;
|
||||
uint16_t *dst_ptr = dst;
|
||||
const int step = 4;
|
||||
WritePixels write4pixels = write4pixelsTab[avg];
|
||||
|
||||
do {
|
||||
for (col = 0; col < w; col += step) {
|
||||
filter_vert_horiz_parallel(src_ptr, src_stride, f, taps,
|
||||
dst_ptr, write4pixels, bd);
|
||||
src_ptr += step;
|
||||
dst_ptr += step;
|
||||
}
|
||||
rowIndex++;
|
||||
src_ptr = src + rowIndex * src_stride;
|
||||
dst_ptr = dst + rowIndex * dst_stride;
|
||||
} while (rowIndex < h);
|
||||
}
|
||||
|
||||
static void highbd_filter_vert_compute_small(const uint16_t *src,
|
||||
int src_stride,
|
||||
const __m128i *f, int taps,
|
||||
int w, int h,
|
||||
uint16_t *dst, int dst_stride,
|
||||
int avg, int bd) {
|
||||
int rowIndex = 0;
|
||||
WritePixels write2pixels = write2pixelsTab[avg];
|
||||
(void)w;
|
||||
|
||||
do {
|
||||
filter_vert_horiz_parallel(src, src_stride, f, taps, dst, write2pixels,
|
||||
bd);
|
||||
rowIndex++;
|
||||
src += src_stride;
|
||||
dst += dst_stride;
|
||||
} while (rowIndex < h);
|
||||
}
|
||||
|
||||
void vp10_highbd_convolve_vert_sse4_1(const uint16_t *src, int src_stride,
|
||||
uint16_t *dst, int dst_stride,
|
||||
int w, int h,
|
||||
const InterpFilterParams filter_params,
|
||||
const int subpel_y_q4, int y_step_q4,
|
||||
int avg, int bd) {
|
||||
__m128i verf[6];
|
||||
HbdSubpelFilterCoeffs vCoeffs;
|
||||
const int tapsNum = filter_params.taps;
|
||||
|
||||
if (0 == subpel_y_q4 || 16 != y_step_q4) {
|
||||
vp10_highbd_convolve_vert_c(src, src_stride, dst, dst_stride, w, h,
|
||||
filter_params, subpel_y_q4, y_step_q4, avg,
|
||||
bd);
|
||||
return;
|
||||
}
|
||||
|
||||
vCoeffs = vp10_hbd_get_subpel_filter_ver_signal_dir(
|
||||
filter_params, subpel_y_q4 - 1);
|
||||
if (!vCoeffs) {
|
||||
vp10_highbd_convolve_vert_c(src, src_stride, dst, dst_stride, w, h,
|
||||
filter_params, subpel_y_q4, y_step_q4, avg,
|
||||
bd);
|
||||
return;
|
||||
}
|
||||
|
||||
verf[0] = *((const __m128i *)(vCoeffs));
|
||||
verf[1] = *((const __m128i *)(vCoeffs + 1));
|
||||
verf[2] = *((const __m128i *)(vCoeffs + 2));
|
||||
verf[3] = *((const __m128i *)(vCoeffs + 3));
|
||||
verf[4] = *((const __m128i *)(vCoeffs + 4));
|
||||
verf[5] = *((const __m128i *)(vCoeffs + 5));
|
||||
|
||||
src -= src_stride * ((tapsNum >> 1) - 1);
|
||||
|
||||
if (w > 2) {
|
||||
highbd_filter_vert_compute_large(src, src_stride, verf, tapsNum, w, h,
|
||||
dst, dst_stride, avg, bd);
|
||||
} else {
|
||||
highbd_filter_vert_compute_small(src, src_stride, verf, tapsNum, w, h,
|
||||
dst, dst_stride, avg, bd);
|
||||
}
|
||||
}
|
@@ -74,6 +74,10 @@ VP10_COMMON_SRCS-yes += common/vp10_inv_txfm2d.c
|
||||
VP10_COMMON_SRCS-yes += common/vp10_inv_txfm2d_cfg.h
|
||||
VP10_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp10_convolve_ssse3.c
|
||||
VP10_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp10_convolve_filters_ssse3.c
|
||||
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
|
||||
VP10_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/vp10_highbd_convolve_sse4.c
|
||||
VP10_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/vp10_highbd_convolve_filters_sse4.c
|
||||
endif
|
||||
VP10_COMMON_SRCS-yes += common/vp10_convolve.c
|
||||
VP10_COMMON_SRCS-yes += common/vp10_convolve.h
|
||||
VP10_COMMON_SRCS-$(CONFIG_ANS) += common/ans.h
|
||||
|
Reference in New Issue
Block a user