From ab0e7a237a6e9796c15a8858caac04dea3593d62 Mon Sep 17 00:00:00 2001 From: Johann Date: Tue, 2 Aug 2016 15:59:35 -0700 Subject: [PATCH] Use shifted value for sinpi8sqrt2 The value 35468 changes sign when stored in int16_t: implicit conversion from 'int' to 'int16_t' (aka 'short') changes value from 35468 to -30068 This negation requires adding back the original value to compensate. Shifting the value keeps the value positive and saves a post-vqdmulh shift. This technique is used in webp and idct_dequant_full_2x_neon BUG=b/28027557 Change-Id: I0c5ce09bea170fe08061856c2af6f841a557e0c3 --- test/idct_test.cc | 4 ++++ vp8/common/arm/neon/dequant_idct_neon.c | 10 +++++----- vp8/common/arm/neon/shortidct4x4llm_neon.c | 10 +++++----- 3 files changed, 14 insertions(+), 10 deletions(-) diff --git a/test/idct_test.cc b/test/idct_test.cc index f54f2c005..700da77e3 100644 --- a/test/idct_test.cc +++ b/test/idct_test.cc @@ -115,6 +115,10 @@ TEST_P(IDCTTest, TestWithData) { } INSTANTIATE_TEST_CASE_P(C, IDCTTest, ::testing::Values(vp8_short_idct4x4llm_c)); +#if HAVE_NEON +INSTANTIATE_TEST_CASE_P(NEON, IDCTTest, + ::testing::Values(vp8_short_idct4x4llm_neon)); +#endif #if HAVE_MMX INSTANTIATE_TEST_CASE_P(MMX, IDCTTest, ::testing::Values(vp8_short_idct4x4llm_mmx)); diff --git a/vp8/common/arm/neon/dequant_idct_neon.c b/vp8/common/arm/neon/dequant_idct_neon.c index ff5981eaa..753051c77 100644 --- a/vp8/common/arm/neon/dequant_idct_neon.c +++ b/vp8/common/arm/neon/dequant_idct_neon.c @@ -11,7 +11,11 @@ #include <arm_neon.h> static const int16_t cospi8sqrt2minus1 = 20091; -static const int16_t sinpi8sqrt2 = 35468; +// 35468 exceeds INT16_MAX and gets converted to a negative number. Because of +// the way it is used in vqdmulh, where the result is doubled, it can be divided +// by 2 beforehand. This saves compensating for the negative value as well as +// shifting the result. 
+static const int16_t sinpi8sqrt2 = 35468 >> 1; void vp8_dequant_idct_add_neon(int16_t *input, int16_t *dq, unsigned char *dst, int stride) { @@ -60,10 +64,8 @@ void vp8_dequant_idct_add_neon(int16_t *input, int16_t *dq, unsigned char *dst, q3 = vqdmulhq_n_s16(q2, sinpi8sqrt2); q4 = vqdmulhq_n_s16(q2, cospi8sqrt2minus1); - q3 = vshrq_n_s16(q3, 1); q4 = vshrq_n_s16(q4, 1); - q3 = vqaddq_s16(q3, q2); q4 = vqaddq_s16(q4, q2); d10 = vqsub_s16(vget_low_s16(q3), vget_high_s16(q4)); @@ -90,10 +92,8 @@ void vp8_dequant_idct_add_neon(int16_t *input, int16_t *dq, unsigned char *dst, d12 = vqadd_s16(d2tmp2.val[0], d2tmp3.val[0]); d13 = vqsub_s16(d2tmp2.val[0], d2tmp3.val[0]); - q3 = vshrq_n_s16(q3, 1); q4 = vshrq_n_s16(q4, 1); - q3 = vqaddq_s16(q3, q2); q4 = vqaddq_s16(q4, q2); d10 = vqsub_s16(vget_low_s16(q3), vget_high_s16(q4)); diff --git a/vp8/common/arm/neon/shortidct4x4llm_neon.c b/vp8/common/arm/neon/shortidct4x4llm_neon.c index a36c0c1ca..1adb1c317 100644 --- a/vp8/common/arm/neon/shortidct4x4llm_neon.c +++ b/vp8/common/arm/neon/shortidct4x4llm_neon.c @@ -11,7 +11,11 @@ #include <arm_neon.h> static const int16_t cospi8sqrt2minus1 = 20091; -static const int16_t sinpi8sqrt2 = 35468; +// 35468 exceeds INT16_MAX and gets converted to a negative number. Because of +// the way it is used in vqdmulh, where the result is doubled, it can be divided +// by 2 beforehand. This saves compensating for the negative value as well as +// shifting the result. 
+static const int16_t sinpi8sqrt2 = 35468 >> 1; void vp8_short_idct4x4llm_neon(int16_t *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, @@ -40,10 +44,8 @@ void vp8_short_idct4x4llm_neon(int16_t *input, unsigned char *pred_ptr, d12 = vqadd_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // a1 d13 = vqsub_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // b1 - q3s16 = vshrq_n_s16(q3s16, 1); q4s16 = vshrq_n_s16(q4s16, 1); - q3s16 = vqaddq_s16(q3s16, q2s16); q4s16 = vqaddq_s16(q4s16, q2s16); d10 = vqsub_s16(vget_low_s16(q3s16), vget_high_s16(q4s16)); // c1 @@ -71,10 +73,8 @@ void vp8_short_idct4x4llm_neon(int16_t *input, unsigned char *pred_ptr, d12 = vqadd_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // a1 d13 = vqsub_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // b1 - q3s16 = vshrq_n_s16(q3s16, 1); q4s16 = vshrq_n_s16(q4s16, 1); - q3s16 = vqaddq_s16(q3s16, q2s16); q4s16 = vqaddq_s16(q4s16, q2s16); d10 = vqsub_s16(vget_low_s16(q3s16), vget_high_s16(q4s16)); // c1