Merge "Integrate HBD row/column flip fwd txfm SSE4.1 optimization" into nextgenv2
This commit is contained in:
@@ -207,7 +207,19 @@ const HighbdHt16x16Param kArrayHBDHt16x16Param_sse4_1[] = {
|
|||||||
make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 2, 10),
|
make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 2, 10),
|
||||||
make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 2, 12),
|
make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 2, 12),
|
||||||
make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 3, 10),
|
make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 3, 10),
|
||||||
make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 3, 12)
|
make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 3, 12),
|
||||||
|
make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 4, 10),
|
||||||
|
make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 4, 12),
|
||||||
|
make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 5, 10),
|
||||||
|
make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 5, 12),
|
||||||
|
make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 6, 10),
|
||||||
|
make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 6, 12),
|
||||||
|
make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 7, 10),
|
||||||
|
make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 7, 12),
|
||||||
|
make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 8, 10),
|
||||||
|
make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 8, 12),
|
||||||
|
#if CONFIG_EXT_TX
|
||||||
|
#endif // CONFIG_EXT_TX
|
||||||
};
|
};
|
||||||
INSTANTIATE_TEST_CASE_P(
|
INSTANTIATE_TEST_CASE_P(
|
||||||
SSE4_1, VP10HighbdTrans16x16HT,
|
SSE4_1, VP10HighbdTrans16x16HT,
|
||||||
|
@@ -39,7 +39,9 @@ typedef void (*IhighbdHtFunc)(const tran_low_t *in, uint8_t *out, int stride,
|
|||||||
int tx_type, int bd);
|
int tx_type, int bd);
|
||||||
typedef void (*HBDFhtFunc)(const int16_t *input, int32_t *output, int stride,
|
typedef void (*HBDFhtFunc)(const int16_t *input, int32_t *output, int stride,
|
||||||
int tx_type, int bd);
|
int tx_type, int bd);
|
||||||
// Target optimized function, tx_type, bit depth
|
|
||||||
|
// HighbdHt4x4Param argument list:
|
||||||
|
// <Target optimized function, tx_type, bit depth>
|
||||||
typedef tuple<HBDFhtFunc, int, int> HighbdHt4x4Param;
|
typedef tuple<HBDFhtFunc, int, int> HighbdHt4x4Param;
|
||||||
|
|
||||||
void highbe_fht4x4_ref(const int16_t *in, int32_t *out, int stride,
|
void highbe_fht4x4_ref(const int16_t *in, int32_t *out, int stride,
|
||||||
@@ -96,12 +98,12 @@ class VP10HighbdTrans4x4HT : public ::testing::TestWithParam<HighbdHt4x4Param> {
|
|||||||
mask_ = (1 << bit_depth_) - 1;
|
mask_ = (1 << bit_depth_) - 1;
|
||||||
num_coeffs_ = 16;
|
num_coeffs_ = 16;
|
||||||
|
|
||||||
input_ = reinterpret_cast<int16_t *>
|
input_ = reinterpret_cast<int16_t *>(
|
||||||
(vpx_memalign(16, sizeof(int16_t) * num_coeffs_));
|
vpx_memalign(16, sizeof(int16_t) * num_coeffs_));
|
||||||
output_ = reinterpret_cast<int32_t *>
|
output_ = reinterpret_cast<int32_t *>(
|
||||||
(vpx_memalign(16, sizeof(int32_t) * num_coeffs_));
|
vpx_memalign(16, sizeof(int32_t) * num_coeffs_));
|
||||||
output_ref_ = reinterpret_cast<int32_t *>
|
output_ref_ = reinterpret_cast<int32_t *>(
|
||||||
(vpx_memalign(16, sizeof(int32_t) * num_coeffs_));
|
vpx_memalign(16, sizeof(int32_t) * num_coeffs_));
|
||||||
}
|
}
|
||||||
|
|
||||||
virtual void TearDown() {
|
virtual void TearDown() {
|
||||||
@@ -197,9 +199,7 @@ INSTANTIATE_TEST_CASE_P(
|
|||||||
#endif // HAVE_SSE2
|
#endif // HAVE_SSE2
|
||||||
|
|
||||||
#if HAVE_SSE4_1 && CONFIG_VP9_HIGHBITDEPTH
|
#if HAVE_SSE4_1 && CONFIG_VP9_HIGHBITDEPTH
|
||||||
INSTANTIATE_TEST_CASE_P(
|
const HighbdHt4x4Param kArrayHighbdHt4x4Param[] = {
|
||||||
SSE4_1, VP10HighbdTrans4x4HT,
|
|
||||||
::testing::Values(
|
|
||||||
make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 0, 10),
|
make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 0, 10),
|
||||||
make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 0, 12),
|
make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 0, 12),
|
||||||
make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 1, 10),
|
make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 1, 10),
|
||||||
@@ -207,7 +207,25 @@ INSTANTIATE_TEST_CASE_P(
|
|||||||
make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 2, 10),
|
make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 2, 10),
|
||||||
make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 2, 12),
|
make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 2, 12),
|
||||||
make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 3, 10),
|
make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 3, 10),
|
||||||
make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 3, 12)));
|
make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 3, 12),
|
||||||
|
#if CONFIG_EXT_TX
|
||||||
|
make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 4, 10),
|
||||||
|
make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 4, 12),
|
||||||
|
make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 5, 10),
|
||||||
|
make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 5, 12),
|
||||||
|
make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 6, 10),
|
||||||
|
make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 6, 12),
|
||||||
|
make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 7, 10),
|
||||||
|
make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 7, 12),
|
||||||
|
make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 8, 10),
|
||||||
|
make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 8, 12),
|
||||||
|
#endif // CONFIG_EXT_TX
|
||||||
|
};
|
||||||
|
|
||||||
|
INSTANTIATE_TEST_CASE_P(
|
||||||
|
SSE4_1, VP10HighbdTrans4x4HT,
|
||||||
|
::testing::ValuesIn(kArrayHighbdHt4x4Param));
|
||||||
|
|
||||||
#endif // HAVE_SSE4_1 && CONFIG_VP9_HIGHBITDEPTH
|
#endif // HAVE_SSE4_1 && CONFIG_VP9_HIGHBITDEPTH
|
||||||
|
|
||||||
} // namespace
|
} // namespace
|
||||||
|
@@ -207,7 +207,19 @@ const HighbdHt8x8Param kArrayHBDHt8x8Param_sse4_1[] = {
|
|||||||
make_tuple(&vp10_fwd_txfm2d_8x8_sse4_1, 2, 10),
|
make_tuple(&vp10_fwd_txfm2d_8x8_sse4_1, 2, 10),
|
||||||
make_tuple(&vp10_fwd_txfm2d_8x8_sse4_1, 2, 12),
|
make_tuple(&vp10_fwd_txfm2d_8x8_sse4_1, 2, 12),
|
||||||
make_tuple(&vp10_fwd_txfm2d_8x8_sse4_1, 3, 10),
|
make_tuple(&vp10_fwd_txfm2d_8x8_sse4_1, 3, 10),
|
||||||
make_tuple(&vp10_fwd_txfm2d_8x8_sse4_1, 3, 12)
|
make_tuple(&vp10_fwd_txfm2d_8x8_sse4_1, 3, 12),
|
||||||
|
#if CONFIG_EXT_TX
|
||||||
|
make_tuple(&vp10_fwd_txfm2d_8x8_sse4_1, 4, 10),
|
||||||
|
make_tuple(&vp10_fwd_txfm2d_8x8_sse4_1, 4, 12),
|
||||||
|
make_tuple(&vp10_fwd_txfm2d_8x8_sse4_1, 5, 10),
|
||||||
|
make_tuple(&vp10_fwd_txfm2d_8x8_sse4_1, 5, 12),
|
||||||
|
make_tuple(&vp10_fwd_txfm2d_8x8_sse4_1, 6, 10),
|
||||||
|
make_tuple(&vp10_fwd_txfm2d_8x8_sse4_1, 6, 12),
|
||||||
|
make_tuple(&vp10_fwd_txfm2d_8x8_sse4_1, 7, 10),
|
||||||
|
make_tuple(&vp10_fwd_txfm2d_8x8_sse4_1, 7, 12),
|
||||||
|
make_tuple(&vp10_fwd_txfm2d_8x8_sse4_1, 8, 10),
|
||||||
|
make_tuple(&vp10_fwd_txfm2d_8x8_sse4_1, 8, 12),
|
||||||
|
#endif // CONFIG_EXT_TX
|
||||||
};
|
};
|
||||||
INSTANTIATE_TEST_CASE_P(
|
INSTANTIATE_TEST_CASE_P(
|
||||||
SSE4_1, VP10HighbdTrans8x8HT,
|
SSE4_1, VP10HighbdTrans8x8HT,
|
||||||
|
@@ -195,7 +195,7 @@ void vp10_highbd_fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff,
|
|||||||
case FLIPADST_FLIPADST:
|
case FLIPADST_FLIPADST:
|
||||||
case ADST_FLIPADST:
|
case ADST_FLIPADST:
|
||||||
case FLIPADST_ADST:
|
case FLIPADST_ADST:
|
||||||
vp10_highbd_fht4x4_c(src_diff, coeff, diff_stride, tx_type);
|
vp10_fwd_txfm2d_4x4(src_diff, coeff, diff_stride, tx_type, bd);
|
||||||
break;
|
break;
|
||||||
case V_DCT:
|
case V_DCT:
|
||||||
case H_DCT:
|
case H_DCT:
|
||||||
@@ -211,7 +211,6 @@ void vp10_highbd_fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff,
|
|||||||
#endif // CONFIG_EXT_TX
|
#endif // CONFIG_EXT_TX
|
||||||
default:
|
default:
|
||||||
assert(0);
|
assert(0);
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -232,7 +231,7 @@ static void highbd_fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff,
|
|||||||
case FLIPADST_FLIPADST:
|
case FLIPADST_FLIPADST:
|
||||||
case ADST_FLIPADST:
|
case ADST_FLIPADST:
|
||||||
case FLIPADST_ADST:
|
case FLIPADST_ADST:
|
||||||
vp10_highbd_fht8x8(src_diff, coeff, diff_stride, tx_type);
|
vp10_fwd_txfm2d_8x8(src_diff, coeff, diff_stride, tx_type, bd);
|
||||||
break;
|
break;
|
||||||
case V_DCT:
|
case V_DCT:
|
||||||
case H_DCT:
|
case H_DCT:
|
||||||
@@ -249,7 +248,6 @@ static void highbd_fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff,
|
|||||||
#endif // CONFIG_EXT_TX
|
#endif // CONFIG_EXT_TX
|
||||||
default:
|
default:
|
||||||
assert(0);
|
assert(0);
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -270,7 +268,7 @@ static void highbd_fwd_txfm_16x16(const int16_t *src_diff, tran_low_t *coeff,
|
|||||||
case FLIPADST_FLIPADST:
|
case FLIPADST_FLIPADST:
|
||||||
case ADST_FLIPADST:
|
case ADST_FLIPADST:
|
||||||
case FLIPADST_ADST:
|
case FLIPADST_ADST:
|
||||||
vp10_highbd_fht16x16(src_diff, coeff, diff_stride, tx_type);
|
vp10_fwd_txfm2d_16x16(src_diff, coeff, diff_stride, tx_type, bd);
|
||||||
break;
|
break;
|
||||||
case V_DCT:
|
case V_DCT:
|
||||||
case H_DCT:
|
case H_DCT:
|
||||||
@@ -287,7 +285,6 @@ static void highbd_fwd_txfm_16x16(const int16_t *src_diff, tran_low_t *coeff,
|
|||||||
#endif // CONFIG_EXT_TX
|
#endif // CONFIG_EXT_TX
|
||||||
default:
|
default:
|
||||||
assert(0);
|
assert(0);
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -239,6 +239,43 @@ void vp10_fwd_txfm2d_4x4_sse4_1(const int16_t *input, int32_t *coeff,
|
|||||||
fadst4x4_sse4_1(in, cfg->cos_bit_row[2]);
|
fadst4x4_sse4_1(in, cfg->cos_bit_row[2]);
|
||||||
write_buffer_4x4(in, coeff);
|
write_buffer_4x4(in, coeff);
|
||||||
break;
|
break;
|
||||||
|
#if CONFIG_EXT_TX
|
||||||
|
case FLIPADST_DCT:
|
||||||
|
cfg = &fwd_txfm_2d_cfg_adst_dct_4;
|
||||||
|
load_buffer_4x4(input, in, input_stride, 1, 0, cfg->shift[0]);
|
||||||
|
fadst4x4_sse4_1(in, cfg->cos_bit_col[2]);
|
||||||
|
fdct4x4_sse4_1(in, cfg->cos_bit_row[2]);
|
||||||
|
write_buffer_4x4(in, coeff);
|
||||||
|
break;
|
||||||
|
case DCT_FLIPADST:
|
||||||
|
cfg = &fwd_txfm_2d_cfg_dct_adst_4;
|
||||||
|
load_buffer_4x4(input, in, input_stride, 0, 1, cfg->shift[0]);
|
||||||
|
fdct4x4_sse4_1(in, cfg->cos_bit_col[2]);
|
||||||
|
fadst4x4_sse4_1(in, cfg->cos_bit_row[2]);
|
||||||
|
write_buffer_4x4(in, coeff);
|
||||||
|
break;
|
||||||
|
case FLIPADST_FLIPADST:
|
||||||
|
cfg = &fwd_txfm_2d_cfg_adst_adst_4;
|
||||||
|
load_buffer_4x4(input, in, input_stride, 1, 1, cfg->shift[0]);
|
||||||
|
fadst4x4_sse4_1(in, cfg->cos_bit_col[2]);
|
||||||
|
fadst4x4_sse4_1(in, cfg->cos_bit_row[2]);
|
||||||
|
write_buffer_4x4(in, coeff);
|
||||||
|
break;
|
||||||
|
case ADST_FLIPADST:
|
||||||
|
cfg = &fwd_txfm_2d_cfg_adst_adst_4;
|
||||||
|
load_buffer_4x4(input, in, input_stride, 0, 1, cfg->shift[0]);
|
||||||
|
fadst4x4_sse4_1(in, cfg->cos_bit_col[2]);
|
||||||
|
fadst4x4_sse4_1(in, cfg->cos_bit_row[2]);
|
||||||
|
write_buffer_4x4(in, coeff);
|
||||||
|
break;
|
||||||
|
case FLIPADST_ADST:
|
||||||
|
cfg = &fwd_txfm_2d_cfg_adst_adst_4;
|
||||||
|
load_buffer_4x4(input, in, input_stride, 1, 0, cfg->shift[0]);
|
||||||
|
fadst4x4_sse4_1(in, cfg->cos_bit_col[2]);
|
||||||
|
fadst4x4_sse4_1(in, cfg->cos_bit_row[2]);
|
||||||
|
write_buffer_4x4(in, coeff);
|
||||||
|
break;
|
||||||
|
#endif
|
||||||
default:
|
default:
|
||||||
assert(0);
|
assert(0);
|
||||||
}
|
}
|
||||||
@@ -960,6 +997,56 @@ void vp10_fwd_txfm2d_8x8_sse4_1(const int16_t *input, int32_t *coeff,
|
|||||||
transpose_8x8(out, in);
|
transpose_8x8(out, in);
|
||||||
write_buffer_8x8(in, coeff);
|
write_buffer_8x8(in, coeff);
|
||||||
break;
|
break;
|
||||||
|
case FLIPADST_DCT:
|
||||||
|
cfg = &fwd_txfm_2d_cfg_adst_dct_8;
|
||||||
|
load_buffer_8x8(input, in, stride, 1, 0, cfg->shift[0]);
|
||||||
|
fadst8x8_sse4_1(in, out, cfg->cos_bit_col[2]);
|
||||||
|
col_txfm_8x8_rounding(out, -cfg->shift[1]);
|
||||||
|
transpose_8x8(out, in);
|
||||||
|
fdct8x8_sse4_1(in, out, cfg->cos_bit_row[2]);
|
||||||
|
transpose_8x8(out, in);
|
||||||
|
write_buffer_8x8(in, coeff);
|
||||||
|
break;
|
||||||
|
case DCT_FLIPADST:
|
||||||
|
cfg = &fwd_txfm_2d_cfg_dct_adst_8;
|
||||||
|
load_buffer_8x8(input, in, stride, 0, 1, cfg->shift[0]);
|
||||||
|
fdct8x8_sse4_1(in, out, cfg->cos_bit_col[2]);
|
||||||
|
col_txfm_8x8_rounding(out, -cfg->shift[1]);
|
||||||
|
transpose_8x8(out, in);
|
||||||
|
fadst8x8_sse4_1(in, out, cfg->cos_bit_row[2]);
|
||||||
|
transpose_8x8(out, in);
|
||||||
|
write_buffer_8x8(in, coeff);
|
||||||
|
break;
|
||||||
|
case FLIPADST_FLIPADST:
|
||||||
|
cfg = &fwd_txfm_2d_cfg_adst_adst_8;
|
||||||
|
load_buffer_8x8(input, in, stride, 1, 1, cfg->shift[0]);
|
||||||
|
fadst8x8_sse4_1(in, out, cfg->cos_bit_col[2]);
|
||||||
|
col_txfm_8x8_rounding(out, -cfg->shift[1]);
|
||||||
|
transpose_8x8(out, in);
|
||||||
|
fadst8x8_sse4_1(in, out, cfg->cos_bit_row[2]);
|
||||||
|
transpose_8x8(out, in);
|
||||||
|
write_buffer_8x8(in, coeff);
|
||||||
|
break;
|
||||||
|
case ADST_FLIPADST:
|
||||||
|
cfg = &fwd_txfm_2d_cfg_adst_adst_8;
|
||||||
|
load_buffer_8x8(input, in, stride, 0, 1, cfg->shift[0]);
|
||||||
|
fadst8x8_sse4_1(in, out, cfg->cos_bit_col[2]);
|
||||||
|
col_txfm_8x8_rounding(out, -cfg->shift[1]);
|
||||||
|
transpose_8x8(out, in);
|
||||||
|
fadst8x8_sse4_1(in, out, cfg->cos_bit_row[2]);
|
||||||
|
transpose_8x8(out, in);
|
||||||
|
write_buffer_8x8(in, coeff);
|
||||||
|
break;
|
||||||
|
case FLIPADST_ADST:
|
||||||
|
cfg = &fwd_txfm_2d_cfg_adst_adst_8;
|
||||||
|
load_buffer_8x8(input, in, stride, 1, 0, cfg->shift[0]);
|
||||||
|
fadst8x8_sse4_1(in, out, cfg->cos_bit_col[2]);
|
||||||
|
col_txfm_8x8_rounding(out, -cfg->shift[1]);
|
||||||
|
transpose_8x8(out, in);
|
||||||
|
fadst8x8_sse4_1(in, out, cfg->cos_bit_row[2]);
|
||||||
|
transpose_8x8(out, in);
|
||||||
|
write_buffer_8x8(in, coeff);
|
||||||
|
break;
|
||||||
default:
|
default:
|
||||||
assert(0);
|
assert(0);
|
||||||
}
|
}
|
||||||
@@ -1806,6 +1893,56 @@ void vp10_fwd_txfm2d_16x16_sse4_1(const int16_t *input, int32_t *coeff,
|
|||||||
transpose_16x16(out, in);
|
transpose_16x16(out, in);
|
||||||
write_buffer_16x16(in, coeff);
|
write_buffer_16x16(in, coeff);
|
||||||
break;
|
break;
|
||||||
|
case FLIPADST_DCT:
|
||||||
|
cfg = &fwd_txfm_2d_cfg_adst_dct_16;
|
||||||
|
load_buffer_16x16(input, in, stride, 1, 0, cfg->shift[0]);
|
||||||
|
fadst16x16_sse4_1(in, out, cfg->cos_bit_col[0]);
|
||||||
|
col_txfm_16x16_rounding(out, -cfg->shift[1]);
|
||||||
|
transpose_16x16(out, in);
|
||||||
|
fdct16x16_sse4_1(in, out, cfg->cos_bit_row[0]);
|
||||||
|
transpose_16x16(out, in);
|
||||||
|
write_buffer_16x16(in, coeff);
|
||||||
|
break;
|
||||||
|
case DCT_FLIPADST:
|
||||||
|
cfg = &fwd_txfm_2d_cfg_dct_adst_16;
|
||||||
|
load_buffer_16x16(input, in, stride, 0, 1, cfg->shift[0]);
|
||||||
|
fdct16x16_sse4_1(in, out, cfg->cos_bit_col[0]);
|
||||||
|
col_txfm_16x16_rounding(out, -cfg->shift[1]);
|
||||||
|
transpose_16x16(out, in);
|
||||||
|
fadst16x16_sse4_1(in, out, cfg->cos_bit_row[0]);
|
||||||
|
transpose_16x16(out, in);
|
||||||
|
write_buffer_16x16(in, coeff);
|
||||||
|
break;
|
||||||
|
case FLIPADST_FLIPADST:
|
||||||
|
cfg = &fwd_txfm_2d_cfg_adst_adst_16;
|
||||||
|
load_buffer_16x16(input, in, stride, 1, 1, cfg->shift[0]);
|
||||||
|
fadst16x16_sse4_1(in, out, cfg->cos_bit_col[0]);
|
||||||
|
col_txfm_16x16_rounding(out, -cfg->shift[1]);
|
||||||
|
transpose_16x16(out, in);
|
||||||
|
fadst16x16_sse4_1(in, out, cfg->cos_bit_row[0]);
|
||||||
|
transpose_16x16(out, in);
|
||||||
|
write_buffer_16x16(in, coeff);
|
||||||
|
break;
|
||||||
|
case ADST_FLIPADST:
|
||||||
|
cfg = &fwd_txfm_2d_cfg_adst_adst_16;
|
||||||
|
load_buffer_16x16(input, in, stride, 0, 1, cfg->shift[0]);
|
||||||
|
fadst16x16_sse4_1(in, out, cfg->cos_bit_col[0]);
|
||||||
|
col_txfm_16x16_rounding(out, -cfg->shift[1]);
|
||||||
|
transpose_16x16(out, in);
|
||||||
|
fadst16x16_sse4_1(in, out, cfg->cos_bit_row[0]);
|
||||||
|
transpose_16x16(out, in);
|
||||||
|
write_buffer_16x16(in, coeff);
|
||||||
|
break;
|
||||||
|
case FLIPADST_ADST:
|
||||||
|
cfg = &fwd_txfm_2d_cfg_adst_adst_16;
|
||||||
|
load_buffer_16x16(input, in, stride, 1, 0, cfg->shift[0]);
|
||||||
|
fadst16x16_sse4_1(in, out, cfg->cos_bit_col[0]);
|
||||||
|
col_txfm_16x16_rounding(out, -cfg->shift[1]);
|
||||||
|
transpose_16x16(out, in);
|
||||||
|
fadst16x16_sse4_1(in, out, cfg->cos_bit_row[0]);
|
||||||
|
transpose_16x16(out, in);
|
||||||
|
write_buffer_16x16(in, coeff);
|
||||||
|
break;
|
||||||
default:
|
default:
|
||||||
assert(0);
|
assert(0);
|
||||||
}
|
}
|
||||||
|
Reference in New Issue
Block a user