From d586cdb4d471c37c733a6ccb9abe25678e4a6385 Mon Sep 17 00:00:00 2001
From: Linfeng Zhang <linfengz@google.com>
Date: Mon, 18 Sep 2017 09:33:31 -0700
Subject: [PATCH] Remove the unnecessary cast of (int16_t)cospi_{1...31}_64

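Drop the explicit (int16_t) casts on cospi_N_64 constants where they are
passed directly to _mm_set1_epi16(), _mm256_set1_epi16(), vmulq_n_s32(),
vmull_n_s16(), vmlal_n_s16() and vmlsl_n_s16() in the x86 and NEON
transform code. Casts on derived expressions such as
(int16_t)(2 * cospi_16_64) are left in place.

BUG=webm:1450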

Change-Id: If59743aafe99226e0ec67ab5d20678ce25f53ab8
---
 vp9/encoder/x86/vp9_dct_intrin_sse2.c | 14 +++----
 vp9/encoder/x86/vp9_dct_ssse3.c       |  2 +-
 vpx_dsp/arm/fdct_neon.c               | 14 +++----
 vpx_dsp/arm/fwd_txfm_neon.c           | 56 +++++++++++++--------------
 vpx_dsp/x86/fwd_dct32x32_impl_avx2.h  |  2 +-
 vpx_dsp/x86/fwd_dct32x32_impl_sse2.h  |  2 +-
 vpx_dsp/x86/fwd_txfm_impl_sse2.h      |  4 +-
 vpx_dsp/x86/inv_txfm_sse2.c           |  6 +--
 vpx_dsp/x86/inv_txfm_ssse3.h          |  2 +-
 9 files changed, 50 insertions(+), 52 deletions(-)

diff --git a/vp9/encoder/x86/vp9_dct_intrin_sse2.c b/vp9/encoder/x86/vp9_dct_intrin_sse2.c
index 969c60aba..dbd243ac1 100644
--- a/vp9/encoder/x86/vp9_dct_intrin_sse2.c
+++ b/vp9/encoder/x86/vp9_dct_intrin_sse2.c
@@ -72,7 +72,7 @@ static INLINE void transpose_4x4(__m128i *res) {
 }
 
 static void fdct4_sse2(__m128i *in) {
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
   const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
   const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
@@ -194,7 +194,7 @@ void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride,
   //    When we use them, in one case, they are all the same. In all others
   //    it's a pair of them that we need to repeat four times. This is done
   //    by constructing the 32 bit constant corresponding to that pair.
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
   const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
   const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
@@ -709,7 +709,7 @@ static INLINE void write_buffer_8x8(tran_low_t *output, __m128i *res,
 
 static void fdct8_sse2(__m128i *in) {
   // constants
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
   const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
   const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
@@ -861,7 +861,7 @@ static void fadst8_sse2(__m128i *in) {
   const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
   const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
   const __m128i k__const_0 = _mm_set1_epi16(0);
   const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
 
@@ -1142,7 +1142,7 @@ static INLINE void right_shift_16x16(__m128i *res0, __m128i *res1) {
 static void fdct16_8col(__m128i *in) {
   // perform 16x16 1-D DCT for 8 columns
   __m128i i[8], s[8], p[8], t[8], u[16], v[16];
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
   const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
   const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
@@ -1489,8 +1489,8 @@ static void fadst16_8col(__m128i *in) {
   const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
   const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
   const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
-  const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t)-cospi_16_64);
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+  const __m128i k__cospi_m16_m16 = _mm_set1_epi16(-cospi_16_64);
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
   const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
   const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
diff --git a/vp9/encoder/x86/vp9_dct_ssse3.c b/vp9/encoder/x86/vp9_dct_ssse3.c
index db57ee1f1..bf874a09e 100644
--- a/vp9/encoder/x86/vp9_dct_ssse3.c
+++ b/vp9/encoder/x86/vp9_dct_ssse3.c
@@ -31,7 +31,7 @@ void vp9_fdct8x8_quant_ssse3(
   //    it's a pair of them that we need to repeat four times. This is done
   //    by constructing the 32 bit constant corresponding to that pair.
   const __m128i k__dual_p16_p16 = dual_set_epi16(23170, 23170);
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
   const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
   const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
diff --git a/vpx_dsp/arm/fdct_neon.c b/vpx_dsp/arm/fdct_neon.c
index fe78f3f51..04646ed2e 100644
--- a/vpx_dsp/arm/fdct_neon.c
+++ b/vpx_dsp/arm/fdct_neon.c
@@ -50,8 +50,8 @@ void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *final_output,
     // Must expand all elements to s32. See 'needs32' comment in fwd_txfm.c.
     const int32x4_t s_0_p_s_1 = vaddl_s16(s_0, s_1);
     const int32x4_t s_0_m_s_1 = vsubl_s16(s_0, s_1);
-    const int32x4_t temp1 = vmulq_n_s32(s_0_p_s_1, (int16_t)cospi_16_64);
-    const int32x4_t temp2 = vmulq_n_s32(s_0_m_s_1, (int16_t)cospi_16_64);
+    const int32x4_t temp1 = vmulq_n_s32(s_0_p_s_1, cospi_16_64);
+    const int32x4_t temp2 = vmulq_n_s32(s_0_m_s_1, cospi_16_64);
 
     // fdct_round_shift
     int16x4_t out_0 = vrshrn_n_s32(temp1, DCT_CONST_BITS);
@@ -59,13 +59,11 @@ void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *final_output,
 
     // s_3 * cospi_8_64 + s_2 * cospi_24_64
     // s_3 * cospi_24_64 - s_2 * cospi_8_64
-    const int32x4_t s_3_cospi_8_64 = vmull_n_s16(s_3, (int16_t)cospi_8_64);
-    const int32x4_t s_3_cospi_24_64 = vmull_n_s16(s_3, (int16_t)cospi_24_64);
+    const int32x4_t s_3_cospi_8_64 = vmull_n_s16(s_3, cospi_8_64);
+    const int32x4_t s_3_cospi_24_64 = vmull_n_s16(s_3, cospi_24_64);
 
-    const int32x4_t temp3 =
-        vmlal_n_s16(s_3_cospi_8_64, s_2, (int16_t)cospi_24_64);
-    const int32x4_t temp4 =
-        vmlsl_n_s16(s_3_cospi_24_64, s_2, (int16_t)cospi_8_64);
+    const int32x4_t temp3 = vmlal_n_s16(s_3_cospi_8_64, s_2, cospi_24_64);
+    const int32x4_t temp4 = vmlsl_n_s16(s_3_cospi_24_64, s_2, cospi_8_64);
 
     // fdct_round_shift
     int16x4_t out_1 = vrshrn_n_s32(temp3, DCT_CONST_BITS);
diff --git a/vpx_dsp/arm/fwd_txfm_neon.c b/vpx_dsp/arm/fwd_txfm_neon.c
index bdee4d82a..8049277b1 100644
--- a/vpx_dsp/arm/fwd_txfm_neon.c
+++ b/vpx_dsp/arm/fwd_txfm_neon.c
@@ -48,14 +48,14 @@ void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *final_output,
     int32x4_t v_t0_hi = vaddl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1));
     int32x4_t v_t1_lo = vsubl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1));
     int32x4_t v_t1_hi = vsubl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1));
-    int32x4_t v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), (int16_t)cospi_24_64);
-    int32x4_t v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), (int16_t)cospi_24_64);
-    int32x4_t v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_24_64);
-    int32x4_t v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_24_64);
-    v_t2_lo = vmlal_n_s16(v_t2_lo, vget_low_s16(v_x3), (int16_t)cospi_8_64);
-    v_t2_hi = vmlal_n_s16(v_t2_hi, vget_high_s16(v_x3), (int16_t)cospi_8_64);
-    v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x2), (int16_t)cospi_8_64);
-    v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x2), (int16_t)cospi_8_64);
+    int32x4_t v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), cospi_24_64);
+    int32x4_t v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), cospi_24_64);
+    int32x4_t v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_24_64);
+    int32x4_t v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_24_64);
+    v_t2_lo = vmlal_n_s16(v_t2_lo, vget_low_s16(v_x3), cospi_8_64);
+    v_t2_hi = vmlal_n_s16(v_t2_hi, vget_high_s16(v_x3), cospi_8_64);
+    v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x2), cospi_8_64);
+    v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x2), cospi_8_64);
     v_t0_lo = vmulq_n_s32(v_t0_lo, cospi_16_64);
     v_t0_hi = vmulq_n_s32(v_t0_hi, cospi_16_64);
     v_t1_lo = vmulq_n_s32(v_t1_lo, cospi_16_64);
@@ -77,10 +77,10 @@ void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *final_output,
     // Stage 2
     v_x0 = vsubq_s16(v_s6, v_s5);
     v_x1 = vaddq_s16(v_s6, v_s5);
-    v_t0_lo = vmull_n_s16(vget_low_s16(v_x0), (int16_t)cospi_16_64);
-    v_t0_hi = vmull_n_s16(vget_high_s16(v_x0), (int16_t)cospi_16_64);
-    v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), (int16_t)cospi_16_64);
-    v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), (int16_t)cospi_16_64);
+    v_t0_lo = vmull_n_s16(vget_low_s16(v_x0), cospi_16_64);
+    v_t0_hi = vmull_n_s16(vget_high_s16(v_x0), cospi_16_64);
+    v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), cospi_16_64);
+    v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), cospi_16_64);
     {
       const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
       const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
@@ -95,22 +95,22 @@ void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *final_output,
       v_x3 = vaddq_s16(v_s7, cd);
     }
     // Stage 4
-    v_t0_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_4_64);
-    v_t0_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_4_64);
-    v_t0_lo = vmlal_n_s16(v_t0_lo, vget_low_s16(v_x0), (int16_t)cospi_28_64);
-    v_t0_hi = vmlal_n_s16(v_t0_hi, vget_high_s16(v_x0), (int16_t)cospi_28_64);
-    v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), (int16_t)cospi_12_64);
-    v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), (int16_t)cospi_12_64);
-    v_t1_lo = vmlal_n_s16(v_t1_lo, vget_low_s16(v_x2), (int16_t)cospi_20_64);
-    v_t1_hi = vmlal_n_s16(v_t1_hi, vget_high_s16(v_x2), (int16_t)cospi_20_64);
-    v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), (int16_t)cospi_12_64);
-    v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), (int16_t)cospi_12_64);
-    v_t2_lo = vmlsl_n_s16(v_t2_lo, vget_low_s16(v_x1), (int16_t)cospi_20_64);
-    v_t2_hi = vmlsl_n_s16(v_t2_hi, vget_high_s16(v_x1), (int16_t)cospi_20_64);
-    v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_28_64);
-    v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_28_64);
-    v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x0), (int16_t)cospi_4_64);
-    v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x0), (int16_t)cospi_4_64);
+    v_t0_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_4_64);
+    v_t0_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_4_64);
+    v_t0_lo = vmlal_n_s16(v_t0_lo, vget_low_s16(v_x0), cospi_28_64);
+    v_t0_hi = vmlal_n_s16(v_t0_hi, vget_high_s16(v_x0), cospi_28_64);
+    v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), cospi_12_64);
+    v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), cospi_12_64);
+    v_t1_lo = vmlal_n_s16(v_t1_lo, vget_low_s16(v_x2), cospi_20_64);
+    v_t1_hi = vmlal_n_s16(v_t1_hi, vget_high_s16(v_x2), cospi_20_64);
+    v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), cospi_12_64);
+    v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), cospi_12_64);
+    v_t2_lo = vmlsl_n_s16(v_t2_lo, vget_low_s16(v_x1), cospi_20_64);
+    v_t2_hi = vmlsl_n_s16(v_t2_hi, vget_high_s16(v_x1), cospi_20_64);
+    v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_28_64);
+    v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_28_64);
+    v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x0), cospi_4_64);
+    v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x0), cospi_4_64);
     {
       const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
       const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
diff --git a/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h b/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h
index 39d3a3f59..132e06523 100644
--- a/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h
+++ b/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h
@@ -51,7 +51,7 @@ void FDCT32x32_2D_AVX2(const int16_t *input, int16_t *output_org, int stride) {
   //    When we use them, in one case, they are all the same. In all others
   //    it's a pair of them that we need to repeat four times. This is done
   //    by constructing the 32 bit constant corresponding to that pair.
-  const __m256i k__cospi_p16_p16 = _mm256_set1_epi16((int16_t)cospi_16_64);
+  const __m256i k__cospi_p16_p16 = _mm256_set1_epi16(cospi_16_64);
   const __m256i k__cospi_p16_m16 =
       pair256_set_epi16(+cospi_16_64, -cospi_16_64);
   const __m256i k__cospi_m08_p24 = pair256_set_epi16(-cospi_8_64, cospi_24_64);
diff --git a/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h b/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h
index 374433390..32b9bd281 100644
--- a/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h
+++ b/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h
@@ -63,7 +63,7 @@ void FDCT32x32_2D(const int16_t *input, tran_low_t *output_org, int stride) {
   //    When we use them, in one case, they are all the same. In all others
   //    it's a pair of them that we need to repeat four times. This is done
   //    by constructing the 32 bit constant corresponding to that pair.
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
   const __m128i k__cospi_p16_m16 = pair_set_epi16(+cospi_16_64, -cospi_16_64);
   const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
   const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
diff --git a/vpx_dsp/x86/fwd_txfm_impl_sse2.h b/vpx_dsp/x86/fwd_txfm_impl_sse2.h
index 743e55e63..f9abaecf2 100644
--- a/vpx_dsp/x86/fwd_txfm_impl_sse2.h
+++ b/vpx_dsp/x86/fwd_txfm_impl_sse2.h
@@ -261,7 +261,7 @@ void FDCT8x8_2D(const int16_t *input, tran_low_t *output, int stride) {
   //    When we use them, in one case, they are all the same. In all others
   //    it's a pair of them that we need to repeat four times. This is done
   //    by constructing the 32 bit constant corresponding to that pair.
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
   const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
   const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
@@ -582,7 +582,7 @@ void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) {
   //    When we use them, in one case, they are all the same. In all others
   //    it's a pair of them that we need to repeat four times. This is done
   //    by constructing the 32 bit constant corresponding to that pair.
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
   const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
   const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64);
diff --git a/vpx_dsp/x86/inv_txfm_sse2.c b/vpx_dsp/x86/inv_txfm_sse2.c
index 19bd0f12e..64fe50be1 100644
--- a/vpx_dsp/x86/inv_txfm_sse2.c
+++ b/vpx_dsp/x86/inv_txfm_sse2.c
@@ -246,7 +246,7 @@ void iadst8_sse2(__m128i *const in) {
   const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
   const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
   const __m128i k__const_0 = _mm_set1_epi16(0);
   const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
 
@@ -587,8 +587,8 @@ static void iadst16_8col(__m128i *const in) {
   const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
   const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
   const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
-  const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t)-cospi_16_64);
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+  const __m128i k__cospi_m16_m16 = _mm_set1_epi16(-cospi_16_64);
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
   const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
   const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
diff --git a/vpx_dsp/x86/inv_txfm_ssse3.h b/vpx_dsp/x86/inv_txfm_ssse3.h
index d0bd5a565..e785c8eda 100644
--- a/vpx_dsp/x86/inv_txfm_ssse3.h
+++ b/vpx_dsp/x86/inv_txfm_ssse3.h
@@ -22,7 +22,7 @@ static INLINE void idct8x8_12_add_kernel_ssse3(__m128i *const io /* io[8] */) {
   const __m128i cp_28d_4d = dual_set_epi16(2 * cospi_28_64, 2 * cospi_4_64);
   const __m128i cp_n20d_12d = dual_set_epi16(-2 * cospi_20_64, 2 * cospi_12_64);
   const __m128i cp_8d_24d = dual_set_epi16(2 * cospi_8_64, 2 * cospi_24_64);
-  const __m128i cp_16_16 = _mm_set1_epi16((int16_t)cospi_16_64);
+  const __m128i cp_16_16 = _mm_set1_epi16(cospi_16_64);
   const __m128i cp_16_n16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
   const __m128i cospi_16_64d = _mm_set1_epi16((int16_t)(2 * cospi_16_64));
   const __m128i cospi_28_64d = _mm_set1_epi16((int16_t)(2 * cospi_28_64));