Merge "Optimization for HBD filter intra predictors (SSE4.1)" into nextgenv2
This commit is contained in:
@@ -32,6 +32,20 @@ typedef void (*Predictor)(uint8_t *dst, ptrdiff_t stride, int bs,
|
||||
typedef tuple<Predictor, Predictor, int> PredFuncMode;
|
||||
typedef tuple<PredFuncMode, int> PredParams;
|
||||
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
// Common signature of the high-bit-depth predictors under test: 16-bit
// destination and edge buffers, plus the bit depth as the last argument.
typedef void (*HbdPredictor)(uint16_t *dst, ptrdiff_t stride, int bs,
                             const uint16_t *above, const uint16_t *left,
                             int bd);

// Test parameter layout:
//   HbdPredFuncMode = (reference predictor, optimized predictor,
//                      prediction mode)
//   HbdPredParams   = (HbdPredFuncMode, block size, bit depth)
typedef tuple<HbdPredictor, HbdPredictor, int> HbdPredFuncMode;
typedef tuple<HbdPredFuncMode, int, int> HbdPredParams;
#endif
|
||||
|
||||
const int MaxBlkSize = 32;
|
||||
|
||||
// By default, disable speed test
|
||||
@@ -136,6 +150,105 @@ class VP10IntraPredOptimzTest : public ::testing::TestWithParam<PredParams> {
|
||||
uint8_t *predRef_;
|
||||
};
|
||||
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
class VP10HbdIntraPredOptimzTest :
|
||||
public ::testing::TestWithParam<HbdPredParams> {
|
||||
public:
|
||||
virtual ~VP10HbdIntraPredOptimzTest() {}
|
||||
virtual void SetUp() {
|
||||
HbdPredFuncMode funcMode = GET_PARAM(0);
|
||||
predFuncRef_ = std::tr1::get<0>(funcMode);
|
||||
predFunc_ = std::tr1::get<1>(funcMode);
|
||||
mode_ = std::tr1::get<2>(funcMode);
|
||||
blockSize_ = GET_PARAM(1);
|
||||
bd_ = GET_PARAM(2);
|
||||
|
||||
alloc_ = (uint16_t *)malloc((3 * MaxBlkSize + 2) * sizeof(alloc_[0]));
|
||||
predRef_ =
|
||||
(uint16_t *)malloc(MaxBlkSize * MaxBlkSize * sizeof(predRef_[0]));
|
||||
pred_ = (uint16_t *)malloc(MaxBlkSize * MaxBlkSize * sizeof(pred_[0]));
|
||||
}
|
||||
|
||||
virtual void TearDown() {
|
||||
delete[] alloc_;
|
||||
delete[] predRef_;
|
||||
delete[] pred_;
|
||||
libvpx_test::ClearSystemState();
|
||||
}
|
||||
|
||||
protected:
|
||||
void RunTest() const {
|
||||
int tstIndex = 0;
|
||||
int stride = blockSize_;
|
||||
uint16_t *left = alloc_;
|
||||
uint16_t *above = alloc_ + MaxBlkSize + 1;
|
||||
while (tstIndex < MaxTestNum) {
|
||||
PrepareBuffer();
|
||||
predFuncRef_(predRef_, stride, blockSize_, &above[1], left, bd_);
|
||||
ASM_REGISTER_STATE_CHECK(
|
||||
predFunc_(pred_, stride, blockSize_, &above[1], left, bd_));
|
||||
DiffPred(tstIndex);
|
||||
tstIndex += 1;
|
||||
}
|
||||
}
|
||||
|
||||
void RunSpeedTestC() const {
|
||||
int tstIndex = 0;
|
||||
int stride = blockSize_;
|
||||
uint16_t *left = alloc_;
|
||||
uint16_t *above = alloc_ + MaxBlkSize + 1;
|
||||
PrepareBuffer();
|
||||
while (tstIndex < MaxTestNum) {
|
||||
predFuncRef_(predRef_, stride, blockSize_, &above[1], left, bd_);
|
||||
tstIndex += 1;
|
||||
}
|
||||
}
|
||||
|
||||
void RunSpeedTestSSE() const {
|
||||
int tstIndex = 0;
|
||||
int stride = blockSize_;
|
||||
uint16_t *left = alloc_;
|
||||
uint16_t *above = alloc_ + MaxBlkSize + 1;
|
||||
PrepareBuffer();
|
||||
while (tstIndex < MaxTestNum) {
|
||||
predFunc_(predRef_, stride, blockSize_, &above[1], left, bd_);
|
||||
tstIndex += 1;
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
void PrepareBuffer() const {
|
||||
ACMRandom rnd(ACMRandom::DeterministicSeed());
|
||||
int i = 0;
|
||||
while (i < (3 * MaxBlkSize + 2)) {
|
||||
alloc_[i] = rnd.Rand16() & ((1 << bd_) - 1);
|
||||
i += 1;
|
||||
}
|
||||
}
|
||||
|
||||
void DiffPred(int testNum) const {
|
||||
int i = 0;
|
||||
while (i < blockSize_ * blockSize_) {
|
||||
EXPECT_EQ(predRef_[i], pred_[i])
|
||||
<< "Error at position: " << i << " "
|
||||
<< "Block size: " << blockSize_ << " "
|
||||
<< "Bit depth: " << bd_ << " "
|
||||
<< "Test number: " << testNum;
|
||||
i += 1;
|
||||
}
|
||||
}
|
||||
|
||||
HbdPredictor predFunc_;
|
||||
HbdPredictor predFuncRef_;
|
||||
int mode_;
|
||||
int blockSize_;
|
||||
int bd_;
|
||||
uint16_t *alloc_;
|
||||
uint16_t *pred_;
|
||||
uint16_t *predRef_;
|
||||
};
|
||||
#endif // CONFIG_VP9_HIGHBITDEPTH
|
||||
|
||||
TEST_P(VP10IntraPredOptimzTest, BitExactCheck) {
|
||||
RunTest();
|
||||
}
|
||||
@@ -150,6 +263,22 @@ TEST_P(VP10IntraPredOptimzTest, SpeedCheckSSE) {
|
||||
}
|
||||
#endif
|
||||
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
// Optimized output must match the reference output exactly.
TEST_P(VP10HbdIntraPredOptimzTest, BitExactCheck) { RunTest(); }

#if PREDICTORS_SPEED_TEST
// Timing runs, compiled only when PREDICTORS_SPEED_TEST is non-zero.
TEST_P(VP10HbdIntraPredOptimzTest, SpeedCheckC) { RunSpeedTestC(); }

TEST_P(VP10HbdIntraPredOptimzTest, SpeedCheckSSE) { RunSpeedTestSSE(); }
#endif  // PREDICTORS_SPEED_TEST
#endif  // CONFIG_VP9_HIGHBITDEPTH
|
||||
|
||||
using std::tr1::make_tuple;
|
||||
|
||||
const PredFuncMode kPredFuncMdArray[] = {
|
||||
@@ -183,4 +312,38 @@ INSTANTIATE_TEST_CASE_P(
|
||||
::testing::ValuesIn(kPredFuncMdArray),
|
||||
::testing::ValuesIn(kBlkSize)));
|
||||
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
// One entry per filter-intra mode:
// (C reference predictor, SSE4.1 predictor, prediction mode).
const HbdPredFuncMode kHbdPredFuncMdArray[] = {
  make_tuple(vp10_highbd_dc_filter_predictor_c,
             vp10_highbd_dc_filter_predictor_sse4_1, DC_PRED),
  make_tuple(vp10_highbd_v_filter_predictor_c,
             vp10_highbd_v_filter_predictor_sse4_1, V_PRED),
  make_tuple(vp10_highbd_h_filter_predictor_c,
             vp10_highbd_h_filter_predictor_sse4_1, H_PRED),
  make_tuple(vp10_highbd_d45_filter_predictor_c,
             vp10_highbd_d45_filter_predictor_sse4_1, D45_PRED),
  make_tuple(vp10_highbd_d135_filter_predictor_c,
             vp10_highbd_d135_filter_predictor_sse4_1, D135_PRED),
  make_tuple(vp10_highbd_d117_filter_predictor_c,
             vp10_highbd_d117_filter_predictor_sse4_1, D117_PRED),
  make_tuple(vp10_highbd_d153_filter_predictor_c,
             vp10_highbd_d153_filter_predictor_sse4_1, D153_PRED),
  make_tuple(vp10_highbd_d207_filter_predictor_c,
             vp10_highbd_d207_filter_predictor_sse4_1, D207_PRED),
  make_tuple(vp10_highbd_d63_filter_predictor_c,
             vp10_highbd_d63_filter_predictor_sse4_1, D63_PRED),
  make_tuple(vp10_highbd_tm_filter_predictor_c,
             vp10_highbd_tm_filter_predictor_sse4_1, TM_PRED),
};

// Bit depths exercised by the high-bit-depth tests.
const int kBd[] = { 10, 12 };

// Every (predictor pair, block size, bit depth) combination is tested.
INSTANTIATE_TEST_CASE_P(
    SSE4_1, VP10HbdIntraPredOptimzTest,
    ::testing::Combine(::testing::ValuesIn(kHbdPredFuncMdArray),
                       ::testing::ValuesIn(kBlkSize),
                       ::testing::ValuesIn(kBd)));
#endif  // CONFIG_VP9_HIGHBITDEPTH
|
||||
|
||||
} // namespace
|
||||
|
||||
@@ -1071,85 +1071,115 @@ static void highbd_filter_intra_predictors_4tap(uint16_t *dst, ptrdiff_t stride,
|
||||
}
|
||||
}
|
||||
|
||||
static void highbd_dc_filter_predictor(uint16_t *dst, ptrdiff_t stride,
|
||||
void vp10_highbd_dc_filter_predictor_c(uint16_t *dst, ptrdiff_t stride,
|
||||
int bs, const uint16_t *above,
|
||||
const uint16_t *left, int bd) {
|
||||
highbd_filter_intra_predictors_4tap(dst, stride, bs, above, left, DC_PRED,
|
||||
bd);
|
||||
}
|
||||
|
||||
static void highbd_v_filter_predictor(uint16_t *dst, ptrdiff_t stride,
|
||||
void vp10_highbd_v_filter_predictor_c(uint16_t *dst, ptrdiff_t stride,
|
||||
int bs, const uint16_t *above,
|
||||
const uint16_t *left, int bd) {
|
||||
highbd_filter_intra_predictors_4tap(dst, stride, bs, above, left, V_PRED,
|
||||
bd);
|
||||
}
|
||||
|
||||
static void highbd_h_filter_predictor(uint16_t *dst, ptrdiff_t stride,
|
||||
void vp10_highbd_h_filter_predictor_c(uint16_t *dst, ptrdiff_t stride,
|
||||
int bs, const uint16_t *above,
|
||||
const uint16_t *left, int bd) {
|
||||
highbd_filter_intra_predictors_4tap(dst, stride, bs, above, left, H_PRED,
|
||||
bd);
|
||||
}
|
||||
|
||||
static void highbd_d45_filter_predictor(uint16_t *dst, ptrdiff_t stride,
|
||||
void vp10_highbd_d45_filter_predictor_c(uint16_t *dst, ptrdiff_t stride,
|
||||
int bs, const uint16_t *above,
|
||||
const uint16_t *left, int bd) {
|
||||
highbd_filter_intra_predictors_4tap(dst, stride, bs, above, left, D45_PRED,
|
||||
bd);
|
||||
}
|
||||
|
||||
static void highbd_d135_filter_predictor(uint16_t *dst, ptrdiff_t stride,
|
||||
void vp10_highbd_d135_filter_predictor_c(uint16_t *dst, ptrdiff_t stride,
|
||||
int bs, const uint16_t *above,
|
||||
const uint16_t *left, int bd) {
|
||||
highbd_filter_intra_predictors_4tap(dst, stride, bs, above, left, D135_PRED,
|
||||
bd);
|
||||
}
|
||||
|
||||
static void highbd_d117_filter_predictor(uint16_t *dst, ptrdiff_t stride,
|
||||
void vp10_highbd_d117_filter_predictor_c(uint16_t *dst, ptrdiff_t stride,
|
||||
int bs, const uint16_t *above,
|
||||
const uint16_t *left, int bd) {
|
||||
highbd_filter_intra_predictors_4tap(dst, stride, bs, above, left, D117_PRED,
|
||||
bd);
|
||||
}
|
||||
|
||||
static void highbd_d153_filter_predictor(uint16_t *dst, ptrdiff_t stride,
|
||||
void vp10_highbd_d153_filter_predictor_c(uint16_t *dst, ptrdiff_t stride,
|
||||
int bs, const uint16_t *above,
|
||||
const uint16_t *left, int bd) {
|
||||
highbd_filter_intra_predictors_4tap(dst, stride, bs, above, left, D153_PRED,
|
||||
bd);
|
||||
}
|
||||
|
||||
static void highbd_d207_filter_predictor(uint16_t *dst, ptrdiff_t stride,
|
||||
void vp10_highbd_d207_filter_predictor_c(uint16_t *dst, ptrdiff_t stride,
|
||||
int bs, const uint16_t *above,
|
||||
const uint16_t *left, int bd) {
|
||||
highbd_filter_intra_predictors_4tap(dst, stride, bs, above, left, D207_PRED,
|
||||
bd);
|
||||
}
|
||||
|
||||
static void highbd_d63_filter_predictor(uint16_t *dst, ptrdiff_t stride,
|
||||
void vp10_highbd_d63_filter_predictor_c(uint16_t *dst, ptrdiff_t stride,
|
||||
int bs, const uint16_t *above,
|
||||
const uint16_t *left, int bd) {
|
||||
highbd_filter_intra_predictors_4tap(dst, stride, bs, above, left, D63_PRED,
|
||||
bd);
|
||||
}
|
||||
|
||||
static void highbd_tm_filter_predictor(uint16_t *dst, ptrdiff_t stride,
|
||||
void vp10_highbd_tm_filter_predictor_c(uint16_t *dst, ptrdiff_t stride,
|
||||
int bs, const uint16_t *above,
|
||||
const uint16_t *left, int bd) {
|
||||
highbd_filter_intra_predictors_4tap(dst, stride, bs, above, left, TM_PRED,
|
||||
bd);
|
||||
}
|
||||
|
||||
static void (*highbd_filter_intra_predictors[EXT_INTRA_MODES])(uint16_t *dst,
|
||||
ptrdiff_t stride, int bs, const uint16_t *above, const uint16_t *left,
|
||||
int bd) = {
|
||||
highbd_dc_filter_predictor, highbd_v_filter_predictor,
|
||||
highbd_h_filter_predictor, highbd_d45_filter_predictor,
|
||||
highbd_d135_filter_predictor, highbd_d117_filter_predictor,
|
||||
highbd_d153_filter_predictor, highbd_d207_filter_predictor,
|
||||
highbd_d63_filter_predictor, highbd_tm_filter_predictor,
|
||||
};
|
||||
static void highbd_filter_intra_predictors(int mode, uint16_t *dst,
|
||||
ptrdiff_t stride, int bs,
|
||||
const uint16_t *above,
|
||||
const uint16_t *left, int bd) {
|
||||
switch (mode) {
|
||||
case DC_PRED:
|
||||
vp10_highbd_dc_filter_predictor(dst, stride, bs, above, left, bd);
|
||||
break;
|
||||
case V_PRED:
|
||||
vp10_highbd_v_filter_predictor(dst, stride, bs, above, left, bd);
|
||||
break;
|
||||
case H_PRED:
|
||||
vp10_highbd_h_filter_predictor(dst, stride, bs, above, left, bd);
|
||||
break;
|
||||
case D45_PRED:
|
||||
vp10_highbd_d45_filter_predictor(dst, stride, bs, above, left, bd);
|
||||
break;
|
||||
case D135_PRED:
|
||||
vp10_highbd_d135_filter_predictor(dst, stride, bs, above, left, bd);
|
||||
break;
|
||||
case D117_PRED:
|
||||
vp10_highbd_d117_filter_predictor(dst, stride, bs, above, left, bd);
|
||||
break;
|
||||
case D153_PRED:
|
||||
vp10_highbd_d153_filter_predictor(dst, stride, bs, above, left, bd);
|
||||
break;
|
||||
case D207_PRED:
|
||||
vp10_highbd_d207_filter_predictor(dst, stride, bs, above, left, bd);
|
||||
break;
|
||||
case D63_PRED:
|
||||
vp10_highbd_d63_filter_predictor(dst, stride, bs, above, left, bd);
|
||||
break;
|
||||
case TM_PRED:
|
||||
vp10_highbd_tm_filter_predictor(dst, stride, bs, above, left, bd);
|
||||
break;
|
||||
default:
|
||||
assert(0);
|
||||
}
|
||||
}
|
||||
#endif // CONFIG_VP9_HIGHBITDEPTH
|
||||
#endif // CONFIG_EXT_INTRA
|
||||
|
||||
@@ -1303,7 +1333,7 @@ static void build_intra_predictors_high(const MACROBLOCKD *xd,
|
||||
|
||||
#if CONFIG_EXT_INTRA
|
||||
if (ext_intra_mode_info->use_ext_intra_mode[plane != 0]) {
|
||||
highbd_filter_intra_predictors[ext_intra_mode](dst, dst_stride, bs,
|
||||
highbd_filter_intra_predictors(ext_intra_mode, dst, dst_stride, bs,
|
||||
const_above_row, left_col, xd->bd);
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -320,6 +320,29 @@ if (vpx_config("CONFIG_EXT_INTRA") eq "yes") {
|
||||
specialize qw/vp10_d63_filter_predictor sse4_1/;
|
||||
add_proto qw/void vp10_tm_filter_predictor/, "uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left";
|
||||
specialize qw/vp10_tm_filter_predictor sse4_1/;
|
||||
# High bitdepth functions
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
  # All high-bitdepth filter-intra predictors share one prototype and each
  # has an SSE4.1 specialization, so register them from a list of mode names.
  foreach my $mode (qw/dc v h d45 d135 d117 d153 d207 d63 tm/) {
    my $fn = "vp10_highbd_${mode}_filter_predictor";
    add_proto qw/void/, $fn, "uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above, const uint16_t *left, int bd";
    specialize $fn, qw/sse4_1/;
  }
}
|
||||
}
|
||||
|
||||
# High bitdepth functions
|
||||
|
||||
@@ -591,3 +591,323 @@ void vp10_tm_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs,
|
||||
GetIntraFilterParams(bs, TM_PRED, &prm[0]);
|
||||
FilterPrediction(above, left, bs, prm, dst, stride);
|
||||
}
|
||||
|
||||
// ============== High Bit Depth ==============
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
// Computes the rounded mean of the 4 above and 4 left reference pixels of a
// 4x4 block.
//
// above, left: reference pixels; exactly 4 entries of each are read.
// bd:          bit depth (unused for this block size).
// params:      receives the mean replicated into four 32-bit lanes.
// Returns the mean, (sum + 4) >> 3.
static INLINE int HighbdGetMeanValue4x4(const uint16_t *above,
                                        const uint16_t *left, const int bd,
                                        __m128i *params) {
  // Load only the 64 bits (4 pixels) each edge contributes. A full 128-bit
  // load would read 4 entries beyond what this block size needs; the upper
  // lanes are zero here, so the reduction below is unchanged.
  const __m128i a = _mm_loadl_epi64((const __m128i *)above);
  const __m128i l = _mm_loadl_epi64((const __m128i *)left);
  const __m128i zero = _mm_setzero_si128();
  __m128i sum_vector, u;
  uint16_t sum_value;
  (void)bd;

  sum_vector = _mm_add_epi16(a, l);  // lanes 0..3: above[i] + left[i]

  sum_vector = _mm_hadd_epi16(sum_vector, zero);  // still has 2 values
  u = _mm_srli_si128(sum_vector, 2);
  sum_vector = _mm_add_epi16(sum_vector, u);  // lane 0: sum of all 8 pixels

  sum_value = _mm_extract_epi16(sum_vector, 0);
  sum_value += 4;  // rounding offset for the divide by 8
  sum_value >>= 3;
  *params = _mm_set1_epi32(sum_value);
  return sum_value;
}
|
||||
|
||||
// Computes the rounded mean of the 8 above and 8 left reference pixels of an
// 8x8 block.
//
// above, left: reference pixels; 8 entries of each are read.
// bd:          bit depth (unused for this block size).
// params:      receives the mean replicated into four 32-bit lanes.
// Returns the mean, (sum + 8) >> 4, computed in 16-bit arithmetic.
static INLINE int HighbdGetMeanValue8x8(const uint16_t *above,
                                        const uint16_t *left, const int bd,
                                        __m128i *params) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i top = _mm_loadu_si128((const __m128i *)above);
  const __m128i side = _mm_loadu_si128((const __m128i *)left);
  __m128i totals = _mm_add_epi16(top, side);  // 8 pairwise sums
  uint16_t mean;
  (void)bd;

  // Three horizontal adds against zero fold 8 lanes -> 4 -> 2 -> 1, leaving
  // the total of all 16 pixels in lane 0.
  totals = _mm_hadd_epi16(totals, zero);
  totals = _mm_hadd_epi16(totals, zero);
  totals = _mm_hadd_epi16(totals, zero);

  mean = (uint16_t)_mm_extract_epi16(totals, 0);
  mean = (uint16_t)(mean + 8);  // rounding offset for the divide by 16
  mean = (uint16_t)(mean >> 4);
  *params = _mm_set1_epi32(mean);
  return mean;
}
|
||||
|
||||
// Note:
|
||||
// Process 16 pixels above and left, 10-bit depth
|
||||
// Add to the last 8 pixels sum
|
||||
static INLINE void AddPixels10bit(const uint16_t *above, const uint16_t *left,
|
||||
__m128i *sum) {
|
||||
__m128i a = _mm_loadu_si128((const __m128i *)above);
|
||||
__m128i l = _mm_loadu_si128((const __m128i *)left);
|
||||
sum[0] = _mm_add_epi16(a, l);
|
||||
a = _mm_loadu_si128((const __m128i *)(above + 8));
|
||||
l = _mm_loadu_si128((const __m128i *)(left + 8));
|
||||
sum[0] = _mm_add_epi16(sum[0], a);
|
||||
sum[0] = _mm_add_epi16(sum[0], l);
|
||||
}
|
||||
|
||||
// Note:
|
||||
// Process 16 pixels above and left, 12-bit depth
|
||||
// Add to the last 8 pixels sum
|
||||
static INLINE void AddPixels12bit(const uint16_t *above, const uint16_t *left,
|
||||
__m128i *sum) {
|
||||
__m128i a = _mm_loadu_si128((const __m128i *)above);
|
||||
__m128i l = _mm_loadu_si128((const __m128i *)left);
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
__m128i v0, v1;
|
||||
|
||||
v0 = _mm_unpacklo_epi16(a, zero);
|
||||
v1 = _mm_unpacklo_epi16(l, zero);
|
||||
sum[0] = _mm_add_epi32(v0, v1);
|
||||
|
||||
v0 = _mm_unpackhi_epi16(a, zero);
|
||||
v1 = _mm_unpackhi_epi16(l, zero);
|
||||
sum[0] = _mm_add_epi32(sum[0], v0);
|
||||
sum[0] = _mm_add_epi32(sum[0], v1);
|
||||
|
||||
a = _mm_loadu_si128((const __m128i *)(above + 8));
|
||||
l = _mm_loadu_si128((const __m128i *)(left + 8));
|
||||
|
||||
v0 = _mm_unpacklo_epi16(a, zero);
|
||||
v1 = _mm_unpacklo_epi16(l, zero);
|
||||
sum[0] = _mm_add_epi32(sum[0], v0);
|
||||
sum[0] = _mm_add_epi32(sum[0], v1);
|
||||
|
||||
v0 = _mm_unpackhi_epi16(a, zero);
|
||||
v1 = _mm_unpackhi_epi16(l, zero);
|
||||
sum[0] = _mm_add_epi32(sum[0], v0);
|
||||
sum[0] = _mm_add_epi32(sum[0], v1);
|
||||
}
|
||||
|
||||
// Computes the rounded mean of the 16 above and 16 left reference pixels of
// a 16x16 block.
//
// above, left: reference pixels; 16 entries of each are read.
// bd:          bit depth; selects the accumulator width (see below).
// params:      receives the mean replicated into four 32-bit lanes.
// Returns the mean, (sum + 16) >> 5.
static INLINE int HighbdGetMeanValue16x16(const uint16_t *above,
                                          const uint16_t *left, const int bd,
                                          __m128i *params) {
  const __m128i zero = _mm_setzero_si128();
  __m128i sum_vector, u;
  uint32_t sum_value = 0;

  // Up to 10 bits per sample, 32 pixels sum to at most 32 * 1023, which fits
  // in a 16-bit lane. Deeper samples need the 32-bit accumulation. Choosing
  // by range instead of testing for exactly 10 or 12 means every bit depth
  // takes one of the two paths; previously any other depth skipped both and
  // returned a mean of 0.
  if (bd <= 10) {
    AddPixels10bit(above, left, &sum_vector);
    sum_vector = _mm_hadd_epi16(sum_vector, zero);  // still has 4 values
    sum_vector = _mm_hadd_epi16(sum_vector, zero);  // still has 2 values

    u = _mm_srli_si128(sum_vector, 2);
    sum_vector = _mm_add_epi16(sum_vector, u);
    sum_value = _mm_extract_epi16(sum_vector, 0);
  } else {
    AddPixels12bit(above, left, &sum_vector);

    sum_vector = _mm_hadd_epi32(sum_vector, zero);  // still has 2 values
    u = _mm_srli_si128(sum_vector, 4);
    sum_vector = _mm_add_epi32(u, sum_vector);
    sum_value = _mm_extract_epi32(sum_vector, 0);
  }

  sum_value += 16;  // rounding offset for the divide by 32
  sum_value >>= 5;
  *params = _mm_set1_epi32(sum_value);
  return sum_value;
}
|
||||
|
||||
// Computes the rounded mean of the 32 above and 32 left reference pixels of
// a 32x32 block.
//
// above, left: reference pixels; 32 entries of each are read.
// bd:          bit depth; selects the accumulator width (see below).
// params:      receives the mean replicated into four 32-bit lanes.
// Returns the mean, (sum + 32) >> 6.
static INLINE int HighbdGetMeanValue32x32(const uint16_t *above,
                                          const uint16_t *left, const int bd,
                                          __m128i *params) {
  const __m128i zero = _mm_setzero_si128();
  __m128i sum_vector[2], u;
  uint32_t sum_value = 0;

  // Up to 10 bits per sample, 64 pixels sum to at most 64 * 1023 = 65472,
  // which still fits in an unsigned 16-bit lane. Deeper samples need the
  // 32-bit accumulation. Choosing by range instead of testing for exactly 10
  // or 12 means every bit depth takes one of the two paths; previously any
  // other depth skipped both and returned a mean of 0.
  if (bd <= 10) {
    AddPixels10bit(above, left, &sum_vector[0]);
    AddPixels10bit(above + 16, left + 16, &sum_vector[1]);

    sum_vector[0] = _mm_add_epi16(sum_vector[0], sum_vector[1]);
    sum_vector[0] = _mm_hadd_epi16(sum_vector[0], zero);  // still has 4 values
    sum_vector[0] = _mm_hadd_epi16(sum_vector[0], zero);  // still has 2 values

    u = _mm_srli_si128(sum_vector[0], 2);
    sum_vector[0] = _mm_add_epi16(sum_vector[0], u);
    sum_value = _mm_extract_epi16(sum_vector[0], 0);
  } else {
    AddPixels12bit(above, left, &sum_vector[0]);
    AddPixels12bit(above + 16, left + 16, &sum_vector[1]);

    sum_vector[0] = _mm_add_epi32(sum_vector[0], sum_vector[1]);
    sum_vector[0] = _mm_hadd_epi32(sum_vector[0], zero);  // still has 2 values
    u = _mm_srli_si128(sum_vector[0], 4);
    sum_vector[0] = _mm_add_epi32(u, sum_vector[0]);
    sum_value = _mm_extract_epi32(sum_vector[0], 0);
  }

  sum_value += 32;  // rounding offset for the divide by 64
  sum_value >>= 6;
  *params = _mm_set1_epi32(sum_value);
  return sum_value;
}
|
||||
|
||||
// Note:
|
||||
// params[4] : mean value, 4 int32_t repetition
|
||||
//
|
||||
static INLINE int HighbdCalcRefPixelsMeanValue(const uint16_t *above,
|
||||
const uint16_t *left, int bs,
|
||||
const int bd, __m128i *params) {
|
||||
int meanValue = 0;
|
||||
switch (bs) {
|
||||
case 4:
|
||||
meanValue = HighbdGetMeanValue4x4(above, left, bd, params);
|
||||
break;
|
||||
case 8:
|
||||
meanValue = HighbdGetMeanValue8x8(above, left, bd, params);
|
||||
break;
|
||||
case 16:
|
||||
meanValue = HighbdGetMeanValue16x16(above, left, bd, params);
|
||||
break;
|
||||
case 32:
|
||||
meanValue = HighbdGetMeanValue32x32(above, left, bd, params);
|
||||
break;
|
||||
default:
|
||||
assert(0);
|
||||
}
|
||||
return meanValue;
|
||||
}
|
||||
|
||||
// Builds the prediction block from mean-removed edge pixels.
//
// The work buffer has one extra row and column: row 0 holds the 2 * bs + 1
// pixels starting at above[-1], column 0 of rows 1..bs holds the left
// pixels, all with meanValue subtracted. ProducePixels() is then invoked
// row by row in groups of 4 columns (presumably filling the row below from
// the row it is given -- see its definition). Finally the bs x bs interior
// is written to dst with meanValue added back and clipped to the bit depth.
//
// Note:
//  At column index c of row r, the pixels remaining in the row are
//  R = 2 * bs + 1 - r - c; that count is what ProducePixels() receives.
static void HighbdGeneratePrediction(const uint16_t *above,
                                     const uint16_t *left,
                                     const int bs, const int bd,
                                     const __m128i *prm, int meanValue,
                                     uint16_t *dst,
                                     ptrdiff_t stride) {
  int pred[33][65];
  const uint16_t *const top = above - 1;  // row 0 starts one pixel earlier
  int row, col;

  for (row = 0; row < bs; ++row) {
    pred[row + 1][0] = (int)left[row] - meanValue;
  }
  for (col = 0; col < 2 * bs + 1; ++col) {
    pred[0][col] = (int)top[col] - meanValue;
  }

  for (row = 0; row < bs; ++row) {
    // Each successive row covers one column fewer.
    const int col_bound = (bs << 1) - row;
    for (col = 0; col < col_bound; col += 4) {
      ProducePixels(&pred[row][col], prm, col_bound - col + 1);
    }
  }

  for (row = 0; row < bs; ++row) {
    uint16_t *const out = dst + row * stride;
    for (col = 0; col < bs; ++col) {
      out[col] = clip_pixel_highbd(pred[row + 1][col + 1] + meanValue, bd);
    }
  }
}
|
||||
|
||||
// Shared body of the SSE4.1 high-bit-depth filter-intra predictors: computes
// the mean of the reference pixels, stores it in prm[4], then generates the
// prediction into dst.
static void HighbdFilterPrediction(const uint16_t *above, const uint16_t *left,
                                   int bs, const int bd, __m128i *prm,
                                   uint16_t *dst, ptrdiff_t stride) {
  const int mean = HighbdCalcRefPixelsMeanValue(above, left, bs, bd, prm + 4);
  HighbdGeneratePrediction(above, left, bs, bd, prm, mean, dst, stride);
}
|
||||
|
||||
void vp10_highbd_dc_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride,
|
||||
int bs, const uint16_t *above,
|
||||
const uint16_t *left, int bd) {
|
||||
__m128i prm[5];
|
||||
GetIntraFilterParams(bs, DC_PRED, &prm[0]);
|
||||
HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride);
|
||||
}
|
||||
|
||||
void vp10_highbd_v_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride,
|
||||
int bs, const uint16_t *above,
|
||||
const uint16_t *left, int bd) {
|
||||
__m128i prm[5];
|
||||
GetIntraFilterParams(bs, V_PRED, &prm[0]);
|
||||
HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride);
|
||||
}
|
||||
|
||||
void vp10_highbd_h_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride,
|
||||
int bs, const uint16_t *above,
|
||||
const uint16_t *left, int bd) {
|
||||
__m128i prm[5];
|
||||
GetIntraFilterParams(bs, H_PRED, &prm[0]);
|
||||
HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride);
|
||||
}
|
||||
|
||||
void vp10_highbd_d45_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride,
|
||||
int bs, const uint16_t *above,
|
||||
const uint16_t *left, int bd) {
|
||||
__m128i prm[5];
|
||||
GetIntraFilterParams(bs, D45_PRED, &prm[0]);
|
||||
HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride);
|
||||
}
|
||||
|
||||
void vp10_highbd_d135_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride,
|
||||
int bs, const uint16_t *above,
|
||||
const uint16_t *left, int bd) {
|
||||
__m128i prm[5];
|
||||
GetIntraFilterParams(bs, D135_PRED, &prm[0]);
|
||||
HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride);
|
||||
}
|
||||
|
||||
void vp10_highbd_d117_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride,
|
||||
int bs, const uint16_t *above,
|
||||
const uint16_t *left, int bd) {
|
||||
__m128i prm[5];
|
||||
GetIntraFilterParams(bs, D117_PRED, &prm[0]);
|
||||
HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride);
|
||||
}
|
||||
|
||||
void vp10_highbd_d153_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride,
|
||||
int bs, const uint16_t *above,
|
||||
const uint16_t *left, int bd) {
|
||||
__m128i prm[5];
|
||||
GetIntraFilterParams(bs, D153_PRED, &prm[0]);
|
||||
HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride);
|
||||
}
|
||||
|
||||
void vp10_highbd_d207_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride,
|
||||
int bs, const uint16_t *above,
|
||||
const uint16_t *left, int bd) {
|
||||
__m128i prm[5];
|
||||
GetIntraFilterParams(bs, D207_PRED, &prm[0]);
|
||||
HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride);
|
||||
}
|
||||
|
||||
void vp10_highbd_d63_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride,
|
||||
int bs, const uint16_t *above,
|
||||
const uint16_t *left, int bd) {
|
||||
__m128i prm[5];
|
||||
GetIntraFilterParams(bs, D63_PRED, &prm[0]);
|
||||
HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride);
|
||||
}
|
||||
|
||||
void vp10_highbd_tm_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride,
|
||||
int bs, const uint16_t *above,
|
||||
const uint16_t *left, int bd) {
|
||||
__m128i prm[5];
|
||||
GetIntraFilterParams(bs, TM_PRED, &prm[0]);
|
||||
HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride);
|
||||
}
|
||||
#endif // CONFIG_VP9_HIGHBITDEPTH
|
||||
|
||||
Reference in New Issue
Block a user