Fix idct8x8 SSSE3 SingleExtremeCoeff unit tests
- In SSSE3 optimization, 16-bit addition and subtraction would overflow when input coefficient is 16-bit signed extreme values. - Function-level speed becomes slower (unit ms): idct8x8_64: 284 -> 294 idct8x8_12: 145 -> 158. BUG=webm:1332 Change-Id: I1e4bf9d30a6d4112b8cac5823729565bf145e40b
This commit is contained in:
parent
3e7025022e
commit
1f8e8e5bf1
@ -55,35 +55,6 @@ typedef std::tr1::tuple<FwdTxfmFunc, InvTxfmWithBdFunc, InvTxfmWithBdFunc,
|
||||
const int kMaxNumCoeffs = 1024;
|
||||
const int kCountTestBlock = 1000;
|
||||
|
||||
// https://bugs.chromium.org/p/webm/issues/detail?id=1332
|
||||
// The functions specified do not pass with INT16_MIN/MAX. They fail at the
|
||||
// value specified, but pass when 1 is added/subtracted.
|
||||
int16_t MaxSupportedCoeff(InvTxfmWithBdFunc a) {
|
||||
#if HAVE_SSSE3 && ARCH_X86_64 && !CONFIG_EMULATE_HARDWARE && \
|
||||
!CONFIG_VP9_HIGHBITDEPTH
|
||||
if (a == &wrapper<vpx_idct8x8_64_add_ssse3> ||
|
||||
a == &wrapper<vpx_idct8x8_12_add_ssse3>) {
|
||||
return 23625 - 1;
|
||||
}
|
||||
#else
|
||||
(void)a;
|
||||
#endif
|
||||
return std::numeric_limits<int16_t>::max();
|
||||
}
|
||||
|
||||
int16_t MinSupportedCoeff(InvTxfmWithBdFunc a) {
|
||||
#if HAVE_SSSE3 && ARCH_X86_64 && !CONFIG_EMULATE_HARDWARE && \
|
||||
!CONFIG_VP9_HIGHBITDEPTH
|
||||
if (a == &wrapper<vpx_idct8x8_64_add_ssse3> ||
|
||||
a == &wrapper<vpx_idct8x8_12_add_ssse3>) {
|
||||
return -23625 + 1;
|
||||
}
|
||||
#else
|
||||
(void)a;
|
||||
#endif
|
||||
return std::numeric_limits<int16_t>::min();
|
||||
}
|
||||
|
||||
class PartialIDctTest : public ::testing::TestWithParam<PartialInvTxfmParam> {
|
||||
public:
|
||||
virtual ~PartialIDctTest() {}
|
||||
@ -261,8 +232,8 @@ TEST_P(PartialIDctTest, AddOutputBlock) {
|
||||
}
|
||||
|
||||
TEST_P(PartialIDctTest, SingleExtremeCoeff) {
|
||||
const int16_t max_coeff = MaxSupportedCoeff(partial_itxfm_);
|
||||
const int16_t min_coeff = MinSupportedCoeff(partial_itxfm_);
|
||||
const int16_t max_coeff = std::numeric_limits<int16_t>::max();
|
||||
const int16_t min_coeff = std::numeric_limits<int16_t>::min();
|
||||
for (int i = 0; i < last_nonzero_; ++i) {
|
||||
memset(input_block_, 0, sizeof(*input_block_) * input_block_size_);
|
||||
// Run once for min and once for max.
|
||||
@ -285,7 +256,7 @@ TEST_P(PartialIDctTest, SingleExtremeCoeff) {
|
||||
}
|
||||
}
|
||||
|
||||
TEST_P(PartialIDctTest, Speed) {
|
||||
TEST_P(PartialIDctTest, DISABLED_Speed) {
|
||||
// Keep runtime stable with transform size.
|
||||
const int kCountSpeedTestBlock = 500000000 / input_block_size_;
|
||||
InitMem();
|
||||
|
@ -23,7 +23,8 @@ void vpx_idct8x8_64_add_ssse3(const tran_low_t *input, uint8_t *dest,
|
||||
const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
|
||||
const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
|
||||
const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
|
||||
const __m128i stg2_0 = pair_set_epi16(2 * cospi_16_64, 2 * cospi_16_64);
|
||||
const __m128i stk2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
|
||||
const __m128i stk2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
|
||||
const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
|
||||
const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
|
||||
|
||||
@ -99,10 +100,26 @@ void vpx_idct8x8_64_add_ssse3(const tran_low_t *input, uint8_t *dest,
|
||||
const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6);
|
||||
|
||||
{
|
||||
tmp0 = _mm_add_epi16(in0, in4);
|
||||
tmp1 = _mm_sub_epi16(in0, in4);
|
||||
stp2_0 = _mm_mulhrs_epi16(tmp0, stg2_0);
|
||||
stp2_1 = _mm_mulhrs_epi16(tmp1, stg2_0);
|
||||
tmp0 = _mm_unpacklo_epi16(in0, in4);
|
||||
tmp1 = _mm_unpackhi_epi16(in0, in4);
|
||||
|
||||
tmp2 = _mm_madd_epi16(tmp0, stk2_0);
|
||||
tmp3 = _mm_madd_epi16(tmp1, stk2_0);
|
||||
tmp4 = _mm_madd_epi16(tmp0, stk2_1);
|
||||
tmp5 = _mm_madd_epi16(tmp1, stk2_1);
|
||||
|
||||
tmp2 = _mm_add_epi32(tmp2, rounding);
|
||||
tmp3 = _mm_add_epi32(tmp3, rounding);
|
||||
tmp4 = _mm_add_epi32(tmp4, rounding);
|
||||
tmp5 = _mm_add_epi32(tmp5, rounding);
|
||||
|
||||
tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
|
||||
tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
|
||||
tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
|
||||
tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
|
||||
|
||||
stp2_0 = _mm_packs_epi32(tmp2, tmp3);
|
||||
stp2_1 = _mm_packs_epi32(tmp4, tmp5);
|
||||
|
||||
tmp0 = _mm_madd_epi16(lo_26, stg2_2);
|
||||
tmp1 = _mm_madd_epi16(hi_26, stg2_2);
|
||||
@ -136,10 +153,26 @@ void vpx_idct8x8_64_add_ssse3(const tran_low_t *input, uint8_t *dest,
|
||||
stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);
|
||||
stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);
|
||||
|
||||
tmp0 = _mm_sub_epi16(stp2_6, stp2_5);
|
||||
tmp2 = _mm_add_epi16(stp2_6, stp2_5);
|
||||
stp1_5 = _mm_mulhrs_epi16(tmp0, stg2_0);
|
||||
stp1_6 = _mm_mulhrs_epi16(tmp2, stg2_0);
|
||||
tmp0 = _mm_unpacklo_epi16(stp2_6, stp2_5);
|
||||
tmp1 = _mm_unpackhi_epi16(stp2_6, stp2_5);
|
||||
|
||||
tmp2 = _mm_madd_epi16(tmp0, stk2_1);
|
||||
tmp3 = _mm_madd_epi16(tmp1, stk2_1);
|
||||
tmp4 = _mm_madd_epi16(tmp0, stk2_0);
|
||||
tmp5 = _mm_madd_epi16(tmp1, stk2_0);
|
||||
|
||||
tmp2 = _mm_add_epi32(tmp2, rounding);
|
||||
tmp3 = _mm_add_epi32(tmp3, rounding);
|
||||
tmp4 = _mm_add_epi32(tmp4, rounding);
|
||||
tmp5 = _mm_add_epi32(tmp5, rounding);
|
||||
|
||||
tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
|
||||
tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
|
||||
tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
|
||||
tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
|
||||
|
||||
stp1_5 = _mm_packs_epi32(tmp2, tmp3);
|
||||
stp1_6 = _mm_packs_epi32(tmp4, tmp5);
|
||||
}
|
||||
|
||||
/* Stage4 */
|
||||
@ -186,14 +219,18 @@ void vpx_idct8x8_64_add_ssse3(const tran_low_t *input, uint8_t *dest,
|
||||
void vpx_idct8x8_12_add_ssse3(const tran_low_t *input, uint8_t *dest,
|
||||
int stride) {
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
|
||||
const __m128i final_rounding = _mm_set1_epi16(1 << 4);
|
||||
const __m128i stg1_0 = pair_set_epi16(2 * cospi_28_64, 2 * cospi_28_64);
|
||||
const __m128i stg1_1 = pair_set_epi16(2 * cospi_4_64, 2 * cospi_4_64);
|
||||
const __m128i stg1_2 = pair_set_epi16(-2 * cospi_20_64, -2 * cospi_20_64);
|
||||
const __m128i stg1_3 = pair_set_epi16(2 * cospi_12_64, 2 * cospi_12_64);
|
||||
const __m128i stg2_0 = pair_set_epi16(2 * cospi_16_64, 2 * cospi_16_64);
|
||||
const __m128i stk2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
|
||||
const __m128i stk2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
|
||||
const __m128i stg2_2 = pair_set_epi16(2 * cospi_24_64, 2 * cospi_24_64);
|
||||
const __m128i stg2_3 = pair_set_epi16(2 * cospi_8_64, 2 * cospi_8_64);
|
||||
const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
|
||||
|
||||
__m128i in0, in1, in2, in3, in4, in5, in6, in7;
|
||||
__m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
|
||||
@ -233,6 +270,17 @@ void vpx_idct8x8_12_add_ssse3(const tran_low_t *input, uint8_t *dest,
|
||||
stp2_5 = _mm_unpacklo_epi64(tmp1, zero);
|
||||
stp2_6 = _mm_unpackhi_epi64(tmp1, zero);
|
||||
|
||||
tmp0 = _mm_unpacklo_epi16(stp2_5, stp2_6);
|
||||
tmp1 = _mm_madd_epi16(tmp0, stg3_0);
|
||||
tmp2 = _mm_madd_epi16(tmp0, stk2_0); // stg3_1 = stk2_0
|
||||
|
||||
tmp1 = _mm_add_epi32(tmp1, rounding);
|
||||
tmp2 = _mm_add_epi32(tmp2, rounding);
|
||||
tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
|
||||
tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
|
||||
|
||||
stp1_5 = _mm_packs_epi32(tmp1, tmp2);
|
||||
|
||||
// Stage3
|
||||
tmp2 = _mm_add_epi16(stp2_0, stp2_2);
|
||||
tmp3 = _mm_sub_epi16(stp2_0, stp2_2);
|
||||
@ -240,13 +288,6 @@ void vpx_idct8x8_12_add_ssse3(const tran_low_t *input, uint8_t *dest,
|
||||
stp1_2 = _mm_unpackhi_epi64(tmp3, tmp2);
|
||||
stp1_3 = _mm_unpacklo_epi64(tmp3, tmp2);
|
||||
|
||||
tmp0 = _mm_sub_epi16(stp2_6, stp2_5);
|
||||
tmp1 = _mm_add_epi16(stp2_6, stp2_5);
|
||||
|
||||
tmp2 = _mm_mulhrs_epi16(tmp0, stg2_0);
|
||||
tmp3 = _mm_mulhrs_epi16(tmp1, stg2_0);
|
||||
stp1_5 = _mm_unpacklo_epi64(tmp2, tmp3);
|
||||
|
||||
// Stage4
|
||||
tmp0 = _mm_add_epi16(stp1_3, stp2_4);
|
||||
tmp1 = _mm_add_epi16(stp1_2, stp1_5);
|
||||
@ -279,10 +320,24 @@ void vpx_idct8x8_12_add_ssse3(const tran_low_t *input, uint8_t *dest,
|
||||
stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);
|
||||
stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);
|
||||
|
||||
tmp0 = _mm_add_epi16(stp2_6, stp2_5);
|
||||
tmp1 = _mm_sub_epi16(stp2_6, stp2_5);
|
||||
stp1_6 = _mm_mulhrs_epi16(tmp0, stg2_0);
|
||||
stp1_5 = _mm_mulhrs_epi16(tmp1, stg2_0);
|
||||
tmp0 = _mm_unpacklo_epi16(stp2_6, stp2_5);
|
||||
tmp1 = _mm_unpackhi_epi16(stp2_6, stp2_5);
|
||||
|
||||
tmp2 = _mm_madd_epi16(tmp0, stk2_0);
|
||||
tmp3 = _mm_madd_epi16(tmp1, stk2_0);
|
||||
tmp2 = _mm_add_epi32(tmp2, rounding);
|
||||
tmp3 = _mm_add_epi32(tmp3, rounding);
|
||||
tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
|
||||
tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
|
||||
stp1_6 = _mm_packs_epi32(tmp2, tmp3);
|
||||
|
||||
tmp2 = _mm_madd_epi16(tmp0, stk2_1);
|
||||
tmp3 = _mm_madd_epi16(tmp1, stk2_1);
|
||||
tmp2 = _mm_add_epi32(tmp2, rounding);
|
||||
tmp3 = _mm_add_epi32(tmp3, rounding);
|
||||
tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
|
||||
tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
|
||||
stp1_5 = _mm_packs_epi32(tmp2, tmp3);
|
||||
|
||||
/* Stage4 */
|
||||
in0 = _mm_add_epi16(stp1_0, stp2_7);
|
||||
|
Loading…
x
Reference in New Issue
Block a user