Modify idct code to use macro

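Small modification of the idct code: the MULTIPLICATION_AND_ADD macro is moved above IDCT8x8_1D and IDCT16x16_1D, which now use it in place of their repeated 8-way madd/round/shift/pack sequences. Change-Id: I5c4e3223944c68e4ccf762f6cf07c990250e4290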
2013-03-27 12:36:08 -07:00 · 2013-03-27 12:36:08 -07:00 · c6c0657c60
commit c6c0657c60
parent 0e91bec4b5
1 changed files with 159 additions and 383 deletions
--- a/vp9/common/x86/vp9_idct_x86.c
+++ b/vp9/common/x86/vp9_idct_x86.c
@ -298,129 +298,110 @@ void vp9_idct4_1d_sse2(int16_t *input, int16_t *output) {
    in3 = _mm_unpackhi_epi32(tr0_2, tr0_3);  /* i7 i6 */  \
  }

-#define IDCT8x8_1D                                             \
-  /* Stage1 */                                                 \
-  {                                                            \
-    const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7);        \
-    const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7);        \
-    const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5);        \
-    const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5);        \
-                                                               \
-    tmp0 = _mm_madd_epi16(lo_17, stg1_0);                      \
-    tmp1 = _mm_madd_epi16(hi_17, stg1_0);                      \
-    tmp2 = _mm_madd_epi16(lo_17, stg1_1);                      \
-    tmp3 = _mm_madd_epi16(hi_17, stg1_1);                      \
-    tmp4 = _mm_madd_epi16(lo_35, stg1_2);                      \
-    tmp5 = _mm_madd_epi16(hi_35, stg1_2);                      \
-    tmp6 = _mm_madd_epi16(lo_35, stg1_3);                      \
-    tmp7 = _mm_madd_epi16(hi_35, stg1_3);                      \
-                                                               \
-    tmp0 = _mm_add_epi32(tmp0, rounding);                      \
-    tmp1 = _mm_add_epi32(tmp1, rounding);                      \
-    tmp2 = _mm_add_epi32(tmp2, rounding);                      \
-    tmp3 = _mm_add_epi32(tmp3, rounding);                      \
-    tmp4 = _mm_add_epi32(tmp4, rounding);                      \
-    tmp5 = _mm_add_epi32(tmp5, rounding);                      \
-    tmp6 = _mm_add_epi32(tmp6, rounding);                      \
-    tmp7 = _mm_add_epi32(tmp7, rounding);                      \
-                                                               \
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);               \
-    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);               \
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);               \
-    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);               \
-    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);               \
-    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);               \
-    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);               \
-    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);               \
-                                                               \
-    stp1_4 = _mm_packs_epi32(tmp0, tmp1);                      \
-    stp1_7 = _mm_packs_epi32(tmp2, tmp3);                      \
-    stp1_5 = _mm_packs_epi32(tmp4, tmp5);                      \
-    stp1_6 = _mm_packs_epi32(tmp6, tmp7);                      \
-  }                                                            \
-                                                               \
-  /* Stage2 */                                                 \
-  {                                                            \
-    const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4);        \
-    const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4);        \
-    const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6);        \
-    const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6);        \
-                                                               \
-    tmp0 = _mm_madd_epi16(lo_04, stg2_0);                      \
-    tmp1 = _mm_madd_epi16(hi_04, stg2_0);                      \
-    tmp2 = _mm_madd_epi16(lo_04, stg2_1);                      \
-    tmp3 = _mm_madd_epi16(hi_04, stg2_1);                      \
-    tmp4 = _mm_madd_epi16(lo_26, stg2_2);                      \
-    tmp5 = _mm_madd_epi16(hi_26, stg2_2);                      \
-    tmp6 = _mm_madd_epi16(lo_26, stg2_3);                      \
-    tmp7 = _mm_madd_epi16(hi_26, stg2_3);                      \
-                                                               \
-    tmp0 = _mm_add_epi32(tmp0, rounding);                      \
-    tmp1 = _mm_add_epi32(tmp1, rounding);                      \
-    tmp2 = _mm_add_epi32(tmp2, rounding);                      \
-    tmp3 = _mm_add_epi32(tmp3, rounding);                      \
-    tmp4 = _mm_add_epi32(tmp4, rounding);                      \
-    tmp5 = _mm_add_epi32(tmp5, rounding);                      \
-    tmp6 = _mm_add_epi32(tmp6, rounding);                      \
-    tmp7 = _mm_add_epi32(tmp7, rounding);                      \
-                                                               \
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);               \
-    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);               \
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);               \
-    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);               \
-    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);               \
-    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);               \
-    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);               \
-    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);               \
-                                                               \
-    stp2_0 = _mm_packs_epi32(tmp0, tmp1);                      \
-    stp2_1 = _mm_packs_epi32(tmp2, tmp3);                      \
-    stp2_2 = _mm_packs_epi32(tmp4, tmp5);                      \
-    stp2_3 = _mm_packs_epi32(tmp6, tmp7);                      \
-                                                               \
-    stp2_4 = _mm_adds_epi16(stp1_4, stp1_5);                   \
-    stp2_5 = _mm_subs_epi16(stp1_4, stp1_5);                   \
-    stp2_6 = _mm_subs_epi16(stp1_7, stp1_6);                   \
-    stp2_7 = _mm_adds_epi16(stp1_7, stp1_6);                   \
-  }                                                            \
-                                                               \
-  /* Stage3 */                                                 \
-  {                                                            \
-    const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5);  \
-    const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5);  \
-                                                               \
-    stp1_0 = _mm_adds_epi16(stp2_0, stp2_3);                   \
-    stp1_1 = _mm_adds_epi16(stp2_1, stp2_2);                   \
-    stp1_2 = _mm_subs_epi16(stp2_1, stp2_2);                   \
-    stp1_3 = _mm_subs_epi16(stp2_0, stp2_3);                   \
-                                                               \
-    tmp0 = _mm_madd_epi16(lo_56, stg2_1);                      \
-    tmp1 = _mm_madd_epi16(hi_56, stg2_1);                      \
-    tmp2 = _mm_madd_epi16(lo_56, stg2_0);                      \
-    tmp3 = _mm_madd_epi16(hi_56, stg2_0);                      \
-                                                               \
-    tmp0 = _mm_add_epi32(tmp0, rounding);                      \
-    tmp1 = _mm_add_epi32(tmp1, rounding);                      \
-    tmp2 = _mm_add_epi32(tmp2, rounding);                      \
-    tmp3 = _mm_add_epi32(tmp3, rounding);                      \
-                                                               \
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);               \
-    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);               \
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);               \
-    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);               \
-                                                               \
-    stp1_5 = _mm_packs_epi32(tmp0, tmp1);                      \
-    stp1_6 = _mm_packs_epi32(tmp2, tmp3);                      \
-  }                                                            \
-                                                               \
-  /* Stage4  */                                                \
-  in0 = _mm_adds_epi16(stp1_0, stp2_7);                        \
-  in1 = _mm_adds_epi16(stp1_1, stp1_6);                        \
-  in2 = _mm_adds_epi16(stp1_2, stp1_5);                        \
-  in3 = _mm_adds_epi16(stp1_3, stp2_4);                        \
-  in4 = _mm_subs_epi16(stp1_3, stp2_4);                        \
-  in5 = _mm_subs_epi16(stp1_2, stp1_5);                        \
-  in6 = _mm_subs_epi16(stp1_1, stp1_6);                        \
+// Multiply-add input pairs by constants, then round, shift and pack to 16 bits.
+#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, \
+                               cst0, cst1, cst2, cst3, res0, res1, res2, res3) \
+  {   \
+      tmp0 = _mm_madd_epi16(lo_0, cst0); \
+      tmp1 = _mm_madd_epi16(hi_0, cst0); \
+      tmp2 = _mm_madd_epi16(lo_0, cst1); \
+      tmp3 = _mm_madd_epi16(hi_0, cst1); \
+      tmp4 = _mm_madd_epi16(lo_1, cst2); \
+      tmp5 = _mm_madd_epi16(hi_1, cst2); \
+      tmp6 = _mm_madd_epi16(lo_1, cst3); \
+      tmp7 = _mm_madd_epi16(hi_1, cst3); \
+      \
+      tmp0 = _mm_add_epi32(tmp0, rounding); \
+      tmp1 = _mm_add_epi32(tmp1, rounding); \
+      tmp2 = _mm_add_epi32(tmp2, rounding); \
+      tmp3 = _mm_add_epi32(tmp3, rounding); \
+      tmp4 = _mm_add_epi32(tmp4, rounding); \
+      tmp5 = _mm_add_epi32(tmp5, rounding); \
+      tmp6 = _mm_add_epi32(tmp6, rounding); \
+      tmp7 = _mm_add_epi32(tmp7, rounding); \
+      \
+      tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+      tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+      tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+      tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+      tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \
+      tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \
+      tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \
+      tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \
+      \
+      res0 = _mm_packs_epi32(tmp0, tmp1); \
+      res1 = _mm_packs_epi32(tmp2, tmp3); \
+      res2 = _mm_packs_epi32(tmp4, tmp5); \
+      res3 = _mm_packs_epi32(tmp6, tmp7); \
+  }
+
+#define IDCT8x8_1D  \
+  /* Stage1 */      \
+  { \
+    const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \
+    const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \
+    const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \
+    const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \
+    \
+    MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, \
+                          stg1_1, stg1_2, stg1_3, stp1_4,      \
+                          stp1_7, stp1_5, stp1_6)              \
+  } \
+    \
+  /* Stage2 */ \
+  { \
+    const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4); \
+    const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4); \
+    const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); \
+    const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); \
+    \
+    MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, \
+                           stg2_1, stg2_2, stg2_3, stp2_0,     \
+                           stp2_1, stp2_2, stp2_3)             \
+    \
+    stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); \
+    stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); \
+    stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); \
+    stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); \
+  } \
+    \
+  /* Stage3 */ \
+  { \
+    const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
+    const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
+    \
+    stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); \
+    stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); \
+    stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); \
+    stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); \
+    \
+    tmp0 = _mm_madd_epi16(lo_56, stg2_1); \
+    tmp1 = _mm_madd_epi16(hi_56, stg2_1); \
+    tmp2 = _mm_madd_epi16(lo_56, stg2_0); \
+    tmp3 = _mm_madd_epi16(hi_56, stg2_0); \
+    \
+    tmp0 = _mm_add_epi32(tmp0, rounding); \
+    tmp1 = _mm_add_epi32(tmp1, rounding); \
+    tmp2 = _mm_add_epi32(tmp2, rounding); \
+    tmp3 = _mm_add_epi32(tmp3, rounding); \
+    \
+    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+    \
+    stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
+    stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
+  } \
+  \
+  /* Stage4  */ \
+  in0 = _mm_adds_epi16(stp1_0, stp2_7); \
+  in1 = _mm_adds_epi16(stp1_1, stp1_6); \
+  in2 = _mm_adds_epi16(stp1_2, stp1_5); \
+  in3 = _mm_adds_epi16(stp1_3, stp2_4); \
+  in4 = _mm_subs_epi16(stp1_3, stp2_4); \
+  in5 = _mm_subs_epi16(stp1_2, stp1_5); \
+  in6 = _mm_subs_epi16(stp1_1, stp1_6); \
  in7 = _mm_subs_epi16(stp1_0, stp2_7);

 void vp9_short_idct8x8_sse2(int16_t *input, int16_t *output, int pitch) {
@ -643,9 +624,9 @@ void vp9_short_idct10_8x8_sse2(int16_t *input, int16_t *output, int pitch) {
  _mm_store_si128((__m128i *)(output + half_pitch * 7), in7);
 }

-#define IDCT16x16_1D                                       \
-  /* Stage2 */                                             \
-  {                                                        \
+#define IDCT16x16_1D \
+  /* Stage2 */ \
+  { \
    const __m128i lo_1_15 = _mm_unpacklo_epi16(in1, in15); \
    const __m128i hi_1_15 = _mm_unpackhi_epi16(in1, in15); \
    const __m128i lo_9_7 = _mm_unpacklo_epi16(in9, in7);   \
@ -654,250 +635,110 @@ void vp9_short_idct10_8x8_sse2(int16_t *input, int16_t *output, int pitch) {
    const __m128i hi_5_11 = _mm_unpackhi_epi16(in5, in11); \
    const __m128i lo_13_3 = _mm_unpacklo_epi16(in13, in3); \
    const __m128i hi_13_3 = _mm_unpackhi_epi16(in13, in3); \
-                                            \
-    tmp0 = _mm_madd_epi16(lo_1_15, stg2_0); \
-    tmp1 = _mm_madd_epi16(hi_1_15, stg2_0); \
-    tmp2 = _mm_madd_epi16(lo_1_15, stg2_1); \
-    tmp3 = _mm_madd_epi16(hi_1_15, stg2_1); \
-    tmp4 = _mm_madd_epi16(lo_9_7, stg2_2);  \
-    tmp5 = _mm_madd_epi16(hi_9_7, stg2_2);  \
-    tmp6 = _mm_madd_epi16(lo_9_7, stg2_3);  \
-    tmp7 = _mm_madd_epi16(hi_9_7, stg2_3);  \
-                                          \
-    tmp0 = _mm_add_epi32(tmp0, rounding); \
-    tmp1 = _mm_add_epi32(tmp1, rounding); \
-    tmp2 = _mm_add_epi32(tmp2, rounding); \
-    tmp3 = _mm_add_epi32(tmp3, rounding); \
-    tmp4 = _mm_add_epi32(tmp4, rounding); \
-    tmp5 = _mm_add_epi32(tmp5, rounding); \
-    tmp6 = _mm_add_epi32(tmp6, rounding); \
-    tmp7 = _mm_add_epi32(tmp7, rounding); \
-                                          \
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
-    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
-    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
-    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \
-    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \
-    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \
-    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \
-                                           \
-    stp2_8 = _mm_packs_epi32(tmp0, tmp1);  \
-    stp2_15 = _mm_packs_epi32(tmp2, tmp3); \
-    stp2_9 = _mm_packs_epi32(tmp4, tmp5);  \
-    stp2_14 = _mm_packs_epi32(tmp6, tmp7); \
-                                           \
-    tmp0 = _mm_madd_epi16(lo_5_11, stg2_4); \
-    tmp1 = _mm_madd_epi16(hi_5_11, stg2_4); \
-    tmp2 = _mm_madd_epi16(lo_5_11, stg2_5); \
-    tmp3 = _mm_madd_epi16(hi_5_11, stg2_5); \
-    tmp4 = _mm_madd_epi16(lo_13_3, stg2_6); \
-    tmp5 = _mm_madd_epi16(hi_13_3, stg2_6); \
-    tmp6 = _mm_madd_epi16(lo_13_3, stg2_7); \
-    tmp7 = _mm_madd_epi16(hi_13_3, stg2_7); \
-                                          \
-    tmp0 = _mm_add_epi32(tmp0, rounding); \
-    tmp1 = _mm_add_epi32(tmp1, rounding); \
-    tmp2 = _mm_add_epi32(tmp2, rounding); \
-    tmp3 = _mm_add_epi32(tmp3, rounding); \
-    tmp4 = _mm_add_epi32(tmp4, rounding); \
-    tmp5 = _mm_add_epi32(tmp5, rounding); \
-    tmp6 = _mm_add_epi32(tmp6, rounding); \
-    tmp7 = _mm_add_epi32(tmp7, rounding); \
-                                          \
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
-    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
-    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
-    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \
-    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \
-    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \
-    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \
-                                           \
-    stp2_10 = _mm_packs_epi32(tmp0, tmp1); \
-    stp2_13 = _mm_packs_epi32(tmp2, tmp3); \
-    stp2_11 = _mm_packs_epi32(tmp4, tmp5); \
-    stp2_12 = _mm_packs_epi32(tmp6, tmp7); \
-  }                                        \
-                                           \
-  /* Stage3 */                             \
-  {                                        \
+    \
+    MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, \
+                           stg2_0, stg2_1, stg2_2, stg2_3, \
+                           stp2_8, stp2_15, stp2_9, stp2_14) \
+    \
+    MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, \
+                           stg2_4, stg2_5, stg2_6, stg2_7, \
+                           stp2_10, stp2_13, stp2_11, stp2_12) \
+  } \
+    \
+  /* Stage3 */ \
+  { \
    const __m128i lo_2_14 = _mm_unpacklo_epi16(in2, in14); \
    const __m128i hi_2_14 = _mm_unpackhi_epi16(in2, in14); \
    const __m128i lo_10_6 = _mm_unpacklo_epi16(in10, in6); \
    const __m128i hi_10_6 = _mm_unpackhi_epi16(in10, in6); \
-                                            \
-    tmp0 = _mm_madd_epi16(lo_2_14, stg3_0); \
-    tmp1 = _mm_madd_epi16(hi_2_14, stg3_0); \
-    tmp2 = _mm_madd_epi16(lo_2_14, stg3_1); \
-    tmp3 = _mm_madd_epi16(hi_2_14, stg3_1); \
-    tmp4 = _mm_madd_epi16(lo_10_6, stg3_2); \
-    tmp5 = _mm_madd_epi16(hi_10_6, stg3_2); \
-    tmp6 = _mm_madd_epi16(lo_10_6, stg3_3); \
-    tmp7 = _mm_madd_epi16(hi_10_6, stg3_3); \
-                                          \
-    tmp0 = _mm_add_epi32(tmp0, rounding); \
-    tmp1 = _mm_add_epi32(tmp1, rounding); \
-    tmp2 = _mm_add_epi32(tmp2, rounding); \
-    tmp3 = _mm_add_epi32(tmp3, rounding); \
-    tmp4 = _mm_add_epi32(tmp4, rounding); \
-    tmp5 = _mm_add_epi32(tmp5, rounding); \
-    tmp6 = _mm_add_epi32(tmp6, rounding); \
-    tmp7 = _mm_add_epi32(tmp7, rounding); \
-                                          \
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
-    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
-    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
-    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \
-    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \
-    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \
-    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \
-                                          \
-    stp1_4 = _mm_packs_epi32(tmp0, tmp1); \
-    stp1_7 = _mm_packs_epi32(tmp2, tmp3); \
-    stp1_5 = _mm_packs_epi32(tmp4, tmp5); \
-    stp1_6 = _mm_packs_epi32(tmp6, tmp7); \
-                                          \
+    \
+    MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, \
+                           stg3_0, stg3_1, stg3_2, stg3_3, \
+                           stp1_4, stp1_7, stp1_5, stp1_6) \
+    \
    stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9);  \
    stp1_9 = _mm_sub_epi16(stp2_8, stp2_9);    \
    stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
    stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
-                                               \
+    \
    stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \
    stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
    stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
    stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
-  }                                            \
-                                               \
-  /* Stage4 */                                 \
-  {                                            \
+  } \
+  \
+  /* Stage4 */ \
+  { \
    const __m128i lo_0_8 = _mm_unpacklo_epi16(in0, in8); \
    const __m128i hi_0_8 = _mm_unpackhi_epi16(in0, in8); \
    const __m128i lo_4_12 = _mm_unpacklo_epi16(in4, in12); \
    const __m128i hi_4_12 = _mm_unpackhi_epi16(in4, in12); \
-                                                           \
+    \
    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
    const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
-                                           \
-    tmp0 = _mm_madd_epi16(lo_0_8, stg4_0); \
-    tmp1 = _mm_madd_epi16(hi_0_8, stg4_0); \
-    tmp2 = _mm_madd_epi16(lo_0_8, stg4_1); \
-    tmp3 = _mm_madd_epi16(hi_0_8, stg4_1); \
-    tmp4 = _mm_madd_epi16(lo_4_12, stg4_2); \
-    tmp5 = _mm_madd_epi16(hi_4_12, stg4_2); \
-    tmp6 = _mm_madd_epi16(lo_4_12, stg4_3); \
-    tmp7 = _mm_madd_epi16(hi_4_12, stg4_3); \
-                                          \
-    tmp0 = _mm_add_epi32(tmp0, rounding); \
-    tmp1 = _mm_add_epi32(tmp1, rounding); \
-    tmp2 = _mm_add_epi32(tmp2, rounding); \
-    tmp3 = _mm_add_epi32(tmp3, rounding); \
-    tmp4 = _mm_add_epi32(tmp4, rounding); \
-    tmp5 = _mm_add_epi32(tmp5, rounding); \
-    tmp6 = _mm_add_epi32(tmp6, rounding); \
-    tmp7 = _mm_add_epi32(tmp7, rounding); \
-                                          \
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
-    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
-    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
-    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \
-    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \
-    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \
-    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \
-                                          \
-    stp2_0 = _mm_packs_epi32(tmp0, tmp1); \
-    stp2_1 = _mm_packs_epi32(tmp2, tmp3); \
-    stp2_2 = _mm_packs_epi32(tmp4, tmp5); \
-    stp2_3 = _mm_packs_epi32(tmp6, tmp7); \
-                                          \
+    \
+    MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, \
+                           stg4_0, stg4_1, stg4_2, stg4_3, \
+                           stp2_0, stp2_1, stp2_2, stp2_3) \
+    \
    stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
    stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
    stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
    stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
-                                            \
-    tmp0 = _mm_madd_epi16(lo_9_14, stg4_4); \
-    tmp1 = _mm_madd_epi16(hi_9_14, stg4_4); \
-    tmp2 = _mm_madd_epi16(lo_9_14, stg4_5); \
-    tmp3 = _mm_madd_epi16(hi_9_14, stg4_5); \
-    tmp4 = _mm_madd_epi16(lo_10_13, stg4_6); \
-    tmp5 = _mm_madd_epi16(hi_10_13, stg4_6); \
-    tmp6 = _mm_madd_epi16(lo_10_13, stg4_7); \
-    tmp7 = _mm_madd_epi16(hi_10_13, stg4_7); \
-                                          \
-    tmp0 = _mm_add_epi32(tmp0, rounding); \
-    tmp1 = _mm_add_epi32(tmp1, rounding); \
-    tmp2 = _mm_add_epi32(tmp2, rounding); \
-    tmp3 = _mm_add_epi32(tmp3, rounding); \
-    tmp4 = _mm_add_epi32(tmp4, rounding); \
-    tmp5 = _mm_add_epi32(tmp5, rounding); \
-    tmp6 = _mm_add_epi32(tmp6, rounding); \
-    tmp7 = _mm_add_epi32(tmp7, rounding); \
-                                          \
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
-    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
-    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
-    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \
-    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \
-    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \
-    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \
-                                           \
-    stp2_9 = _mm_packs_epi32(tmp0, tmp1);  \
-    stp2_14 = _mm_packs_epi32(tmp2, tmp3); \
-    stp2_10 = _mm_packs_epi32(tmp4, tmp5); \
-    stp2_13 = _mm_packs_epi32(tmp6, tmp7); \
-  }                                        \
-                                           \
-  /* Stage5 */                             \
-  {                                        \
+    \
+    MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \
+                           stg4_4, stg4_5, stg4_6, stg4_7, \
+                           stp2_9, stp2_14, stp2_10, stp2_13) \
+  } \
+    \
+  /* Stage5 */ \
+  { \
    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
    const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
-                                            \
+    \
    stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
    stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
    stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
    stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
-                                           \
+    \
    tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
    tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
    tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
    tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
-                                          \
+    \
    tmp0 = _mm_add_epi32(tmp0, rounding); \
    tmp1 = _mm_add_epi32(tmp1, rounding); \
    tmp2 = _mm_add_epi32(tmp2, rounding); \
    tmp3 = _mm_add_epi32(tmp3, rounding); \
-                                          \
+    \
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
-                                          \
+    \
    stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
    stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
-                                          \
+    \
    stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11);  \
    stp1_9 = _mm_add_epi16(stp2_9, stp2_10);    \
    stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);   \
    stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
-                                                 \
+    \
    stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
    stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);   \
    stp1_14 = _mm_add_epi16(stp2_14, stp2_13);   \
    stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
-  }                                              \
-                                                 \
-  /* Stage6 */                                   \
-  {                                              \
+  } \
+    \
+  /* Stage6 */ \
+  { \
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
    const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
-                                            \
+    \
    stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
    stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
    stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
@ -906,38 +747,10 @@ void vp9_short_idct10_8x8_sse2(int16_t *input, int16_t *output, int pitch) {
    stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
    stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
    stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
-                                             \
-    tmp0 = _mm_madd_epi16(lo_10_13, stg6_0); \
-    tmp1 = _mm_madd_epi16(hi_10_13, stg6_0); \
-    tmp2 = _mm_madd_epi16(lo_10_13, stg4_0); \
-    tmp3 = _mm_madd_epi16(hi_10_13, stg4_0); \
-    tmp4 = _mm_madd_epi16(lo_11_12, stg6_0); \
-    tmp5 = _mm_madd_epi16(hi_11_12, stg6_0); \
-    tmp6 = _mm_madd_epi16(lo_11_12, stg4_0); \
-    tmp7 = _mm_madd_epi16(hi_11_12, stg4_0); \
-                                          \
-    tmp0 = _mm_add_epi32(tmp0, rounding); \
-    tmp1 = _mm_add_epi32(tmp1, rounding); \
-    tmp2 = _mm_add_epi32(tmp2, rounding); \
-    tmp3 = _mm_add_epi32(tmp3, rounding); \
-    tmp4 = _mm_add_epi32(tmp4, rounding); \
-    tmp5 = _mm_add_epi32(tmp5, rounding); \
-    tmp6 = _mm_add_epi32(tmp6, rounding); \
-    tmp7 = _mm_add_epi32(tmp7, rounding); \
-                                                 \
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
-    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
-    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
-    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \
-    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \
-    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \
-    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \
-                                           \
-    stp2_10 = _mm_packs_epi32(tmp0, tmp1); \
-    stp2_13 = _mm_packs_epi32(tmp2, tmp3); \
-    stp2_11 = _mm_packs_epi32(tmp4, tmp5); \
-    stp2_12 = _mm_packs_epi32(tmp6, tmp7); \
+    \
+    MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
+                           stg6_0, stg4_0, stg6_0, stg4_0, \
+                           stp2_10, stp2_13, stp2_11, stp2_12) \
  }

 void vp9_short_idct16x16_sse2(int16_t *input, int16_t *output, int pitch) {
@ -1507,43 +1320,6 @@ void vp9_short_idct10_16x16_sse2(int16_t *input, int16_t *output, int pitch) {
  }
 }

-// Define Macro for multiplying elements by constants and adding them together.
-#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, \
-                               cst0, cst1, cst2, cst3, res0, res1, res2, res3) \
-  {   \
-      tmp0 = _mm_madd_epi16(lo_0, cst0); \
-      tmp1 = _mm_madd_epi16(hi_0, cst0); \
-      tmp2 = _mm_madd_epi16(lo_0, cst1); \
-      tmp3 = _mm_madd_epi16(hi_0, cst1); \
-      tmp4 = _mm_madd_epi16(lo_1, cst2); \
-      tmp5 = _mm_madd_epi16(hi_1, cst2); \
-      tmp6 = _mm_madd_epi16(lo_1, cst3); \
-      tmp7 = _mm_madd_epi16(hi_1, cst3); \
-      \
-      tmp0 = _mm_add_epi32(tmp0, rounding); \
-      tmp1 = _mm_add_epi32(tmp1, rounding); \
-      tmp2 = _mm_add_epi32(tmp2, rounding); \
-      tmp3 = _mm_add_epi32(tmp3, rounding); \
-      tmp4 = _mm_add_epi32(tmp4, rounding); \
-      tmp5 = _mm_add_epi32(tmp5, rounding); \
-      tmp6 = _mm_add_epi32(tmp6, rounding); \
-      tmp7 = _mm_add_epi32(tmp7, rounding); \
-      \
-      tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
-      tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
-      tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
-      tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
-      tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \
-      tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \
-      tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \
-      tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \
-      \
-      res0 = _mm_packs_epi32(tmp0, tmp1); \
-      res1 = _mm_packs_epi32(tmp2, tmp3); \
-      res2 = _mm_packs_epi32(tmp4, tmp5); \
-      res3 = _mm_packs_epi32(tmp6, tmp7); \
-  }
-
 void vp9_short_idct32x32_sse2(int16_t *input, int16_t *output, int pitch) {
  const int half_pitch = pitch >> 1;
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);