diff --git a/src/dsp/dec_mips_dsp_r2.c b/src/dsp/dec_mips_dsp_r2.c
index b4651fac..dca15eb5 100644
--- a/src/dsp/dec_mips_dsp_r2.c
+++ b/src/dsp/dec_mips_dsp_r2.c
@@ -23,53 +23,11 @@ static const int kC2 = 35468;
 
 #define MUL(a, b) (((a) * (b)) >> 16)
 
-// temp0[31..16 | 15..0] = temp0[31..16 | 15..0] + temp8[31..16 | 15..0]
-// temp0[31..16 | 15..0] = temp0[31..16 <<(s) 7 | 15..0 <<(s) 7]
-// temp1..temp7 same as temp0
-// precrqu_s.qb.ph temp0, temp1, temp0:
-//   temp0 = temp1[31..24] | temp1[15..8] | temp0[31..24] | temp0[15..8]
-// store temp0 to dst
-// IO - input/output
-// I - input (macro doesn't change it)
-#define STORE_SAT_SUM_X2(IO0, IO1, IO2, IO3, IO4, IO5, IO6, IO7,               \
-                         I0, I1, I2, I3, I4, I5, I6, I7)                       \
-  "addq.ph          %["#IO0"],  %["#IO0"],  %["#I0"]          \n\t"            \
-  "addq.ph          %["#IO1"],  %["#IO1"],  %["#I1"]          \n\t"            \
-  "addq.ph          %["#IO2"],  %["#IO2"],  %["#I2"]          \n\t"            \
-  "addq.ph          %["#IO3"],  %["#IO3"],  %["#I3"]          \n\t"            \
-  "addq.ph          %["#IO4"],  %["#IO4"],  %["#I4"]          \n\t"            \
-  "addq.ph          %["#IO5"],  %["#IO5"],  %["#I5"]          \n\t"            \
-  "addq.ph          %["#IO6"],  %["#IO6"],  %["#I6"]          \n\t"            \
-  "addq.ph          %["#IO7"],  %["#IO7"],  %["#I7"]          \n\t"            \
-  "shll_s.ph        %["#IO0"],  %["#IO0"],  7                 \n\t"            \
-  "shll_s.ph        %["#IO1"],  %["#IO1"],  7                 \n\t"            \
-  "shll_s.ph        %["#IO2"],  %["#IO2"],  7                 \n\t"            \
-  "shll_s.ph        %["#IO3"],  %["#IO3"],  7                 \n\t"            \
-  "shll_s.ph        %["#IO4"],  %["#IO4"],  7                 \n\t"            \
-  "shll_s.ph        %["#IO5"],  %["#IO5"],  7                 \n\t"            \
-  "shll_s.ph        %["#IO6"],  %["#IO6"],  7                 \n\t"            \
-  "shll_s.ph        %["#IO7"],  %["#IO7"],  7                 \n\t"            \
-  "precrqu_s.qb.ph  %["#IO0"],  %["#IO1"],  %["#IO0"]         \n\t"            \
-  "precrqu_s.qb.ph  %["#IO2"],  %["#IO3"],  %["#IO2"]         \n\t"            \
-  "precrqu_s.qb.ph  %["#IO4"],  %["#IO5"],  %["#IO4"]         \n\t"            \
-  "precrqu_s.qb.ph  %["#IO6"],  %["#IO7"],  %["#IO6"]         \n\t"            \
-  "usw              %["#IO0"],  0(%[dst])                     \n\t"            \
-  "usw              %["#IO2"],  32(%[dst])                    \n\t"            \
-  "usw              %["#IO4"],  64(%[dst])                    \n\t"            \
-  "usw              %["#IO6"],  96(%[dst])                    \n\t"
-
-// O - output
-#define LOAD_DST(O0, O1, O2, O3)                                               \
-  "ulw              %["#O0"],  0(%[dst])                      \n\t"            \
-  "ulw              %["#O1"],  32(%[dst])                     \n\t"            \
-  "ulw              %["#O2"],  64(%[dst])                     \n\t"            \
-  "ulw              %["#O3"],  96(%[dst])                     \n\t"
-
 static void TransformDC(const int16_t* in, uint8_t* dst) {
   int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9, temp10;
 
   __asm__ volatile (
-    LOAD_DST(temp1, temp2, temp3, temp4)
+    LOAD_WITH_OFFSET_X4(temp1, temp2, temp3, temp4, dst, 0, 32, 64, 96)
     "lh               %[temp5],  0(%[in])               \n\t"
     "addiu            %[temp5],  %[temp5],  4           \n\t"
     "ins              %[temp5],  %[temp5],  16, 16      \n\t"
@@ -77,7 +35,8 @@ static void TransformDC(const int16_t* in, uint8_t* dst) {
     CONVERT_2_BYTES_TO_HALF(temp6, temp7, temp8, temp9, temp10, temp1, temp2,
                             temp3, temp1, temp2, temp3, temp4)
     STORE_SAT_SUM_X2(temp6, temp7, temp8, temp9, temp10, temp1, temp2, temp3,
-                     temp5, temp5, temp5, temp5, temp5, temp5, temp5, temp5)
+                     temp5, temp5, temp5, temp5, temp5, temp5, temp5, temp5,
+                     dst, 0, 32, 64, 96)
 
     OUTPUT_EARLY_CLOBBER_REGS_10()
     : [in]"r"(in), [dst]"r"(dst)
@@ -102,14 +61,14 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) {
     "replv.ph         %[temp5],   %[c1]                      \n\t"
     SHIFT_R_SUM_X2(temp1, temp6, temp7, temp8, temp2, temp9, temp10, temp4,
                    temp2, temp2, temp3, temp3, temp4, temp5, temp4, temp5)
-    LOAD_DST(temp3, temp5, temp11, temp12)
+    LOAD_WITH_OFFSET_X4(temp3, temp5, temp11, temp12, dst, 0, 32, 64, 96)
     CONVERT_2_BYTES_TO_HALF(temp13, temp14, temp3, temp15, temp5, temp16,
                             temp11, temp17, temp3, temp5, temp11, temp12)
     PACK_2_HALVES_TO_WORD(temp12, temp18, temp7, temp6, temp1, temp8, temp2,
                           temp4, temp7, temp6, temp10, temp9)
     STORE_SAT_SUM_X2(temp13, temp14, temp3, temp15, temp5, temp16, temp11,
                      temp17, temp12, temp18, temp1, temp8, temp2, temp4,
-                     temp7, temp6)
+                     temp7, temp6, dst, 0, 32, 64, 96)
 
     OUTPUT_EARLY_CLOBBER_REGS_18(),
       [c4]"+&r"(c4)
@@ -169,11 +128,12 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
                    temp6)
     PACK_2_HALVES_TO_WORD(temp1, temp2, temp3, temp4, temp9, temp12, temp13,
                           temp16, temp11, temp10, temp15, temp14)
-    LOAD_DST(temp10, temp11, temp14, temp15)
+    LOAD_WITH_OFFSET_X4(temp10, temp11, temp14, temp15, dst, 0, 32, 64, 96)
     CONVERT_2_BYTES_TO_HALF(temp5, temp6, temp7, temp8, temp17, temp18, temp10,
                             temp11, temp10, temp11, temp14, temp15)
     STORE_SAT_SUM_X2(temp5, temp6, temp7, temp8, temp17, temp18, temp10, temp11,
-                     temp9, temp12, temp1, temp2, temp13, temp16, temp3, temp4)
+                     temp9, temp12, temp1, temp2, temp13, temp16, temp3, temp4,
+                     dst, 0, 32, 64, 96)
 
     OUTPUT_EARLY_CLOBBER_REGS_18()
     : [dst]"r"(dst), [in]"r"(in), [kC1]"r"(kC1), [kC2]"r"(kC2)
@@ -510,8 +470,6 @@ static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
   FilterLoop24(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
 }
 
-#undef LOAD_DST
-#undef STORE_SAT_SUM_X2
 #undef MUL
 
 #endif  // WEBP_USE_MIPS_DSP_R2
diff --git a/src/dsp/enc_mips_dsp_r2.c b/src/dsp/enc_mips_dsp_r2.c
index 8a24a6ea..843d81e4 100644
--- a/src/dsp/enc_mips_dsp_r2.c
+++ b/src/dsp/enc_mips_dsp_r2.c
@@ -23,46 +23,54 @@
 static const int kC1 = 20091 + (1 << 16);
 static const int kC2 = 35468;
 
-#define LOAD_REF(O0, O1, O2, O3)                                               \
-  "ulw              %["#O0"],  0(%[ref])                      \n\t"            \
-  "ulw              %["#O1"],  16(%[ref])                     \n\t"            \
-  "ulw              %["#O2"],  32(%[ref])                     \n\t"            \
-  "ulw              %["#O3"],  48(%[ref])                     \n\t"
-
-// temp0[31..16 | 15..0] = temp0[31..16 | 15..0] + temp8[31..16 | 15..0]
-// temp0[31..16 | 15..0] = temp0[31..16 <<(s) 7 | 15..0 <<(s) 7]
-// temp1..temp7 same as temp0
-// precrqu_s.qb.ph temp0, temp1, temp0:
-//   temp0 = temp1[31..24] | temp1[15..8] | temp0[31..24] | temp0[15..8]
-// store temp0 to dst
-// IO - input/output
+// O - output: O0 = I0 + I1, O1 = I0 - I1, ..., O6 = I6 + I7, O7 = I6 - I7
 // I - input (macro doesn't change it)
-#define STORE_SAT_SUM_X2(IO0, IO1, IO2, IO3, IO4, IO5, IO6, IO7,               \
-                         I0, I1, I2, I3, I4, I5, I6, I7)                       \
-  "addq.ph          %["#IO0"],  %["#IO0"],  %["#I0"]          \n\t"            \
-  "addq.ph          %["#IO1"],  %["#IO1"],  %["#I1"]          \n\t"            \
-  "addq.ph          %["#IO2"],  %["#IO2"],  %["#I2"]          \n\t"            \
-  "addq.ph          %["#IO3"],  %["#IO3"],  %["#I3"]          \n\t"            \
-  "addq.ph          %["#IO4"],  %["#IO4"],  %["#I4"]          \n\t"            \
-  "addq.ph          %["#IO5"],  %["#IO5"],  %["#I5"]          \n\t"            \
-  "addq.ph          %["#IO6"],  %["#IO6"],  %["#I6"]          \n\t"            \
-  "addq.ph          %["#IO7"],  %["#IO7"],  %["#I7"]          \n\t"            \
-  "shll_s.ph        %["#IO0"],  %["#IO0"],  7                 \n\t"            \
-  "shll_s.ph        %["#IO1"],  %["#IO1"],  7                 \n\t"            \
-  "shll_s.ph        %["#IO2"],  %["#IO2"],  7                 \n\t"            \
-  "shll_s.ph        %["#IO3"],  %["#IO3"],  7                 \n\t"            \
-  "shll_s.ph        %["#IO4"],  %["#IO4"],  7                 \n\t"            \
-  "shll_s.ph        %["#IO5"],  %["#IO5"],  7                 \n\t"            \
-  "shll_s.ph        %["#IO6"],  %["#IO6"],  7                 \n\t"            \
-  "shll_s.ph        %["#IO7"],  %["#IO7"],  7                 \n\t"            \
-  "precrqu_s.qb.ph  %["#IO0"],  %["#IO1"],  %["#IO0"]         \n\t"            \
-  "precrqu_s.qb.ph  %["#IO2"],  %["#IO3"],  %["#IO2"]         \n\t"            \
-  "precrqu_s.qb.ph  %["#IO4"],  %["#IO5"],  %["#IO4"]         \n\t"            \
-  "precrqu_s.qb.ph  %["#IO6"],  %["#IO7"],  %["#IO6"]         \n\t"            \
-  "usw              %["#IO0"],  0(%[dst])                     \n\t"            \
-  "usw              %["#IO2"],  16(%[dst])                    \n\t"            \
-  "usw              %["#IO4"],  32(%[dst])                    \n\t"            \
-  "usw              %["#IO6"],  48(%[dst])                    \n\t"
+#define ADD_SUB_HALVES_X4(O0, O1, O2, O3, O4, O5, O6, O7,                      \
+                          I0, I1, I2, I3, I4, I5, I6, I7)                      \
+  "addq.ph          %["#O0"],   %["#I0"],  %["#I1"]           \n\t"            \
+  "subq.ph          %["#O1"],   %["#I0"],  %["#I1"]           \n\t"            \
+  "addq.ph          %["#O2"],   %["#I2"],  %["#I3"]           \n\t"            \
+  "subq.ph          %["#O3"],   %["#I2"],  %["#I3"]           \n\t"            \
+  "addq.ph          %["#O4"],   %["#I4"],  %["#I5"]           \n\t"            \
+  "subq.ph          %["#O5"],   %["#I4"],  %["#I5"]           \n\t"            \
+  "addq.ph          %["#O6"],   %["#I6"],  %["#I7"]           \n\t"            \
+  "subq.ph          %["#O7"],   %["#I6"],  %["#I7"]           \n\t"
+
+// IO - input/output (each register is replaced by absq_s.ph of itself)
+#define ABS_X8(IO0, IO1, IO2, IO3, IO4, IO5, IO6, IO7)                         \
+  "absq_s.ph        %["#IO0"],   %["#IO0"]                    \n\t"            \
+  "absq_s.ph        %["#IO1"],   %["#IO1"]                    \n\t"            \
+  "absq_s.ph        %["#IO2"],   %["#IO2"]                    \n\t"            \
+  "absq_s.ph        %["#IO3"],   %["#IO3"]                    \n\t"            \
+  "absq_s.ph        %["#IO4"],   %["#IO4"]                    \n\t"            \
+  "absq_s.ph        %["#IO5"],   %["#IO5"]                    \n\t"            \
+  "absq_s.ph        %["#IO6"],   %["#IO6"]                    \n\t"            \
+  "absq_s.ph        %["#IO7"],   %["#IO7"]                    \n\t"
+
+// dpa.w.ph $ac0, temp0, temp1:
+//   $ac0 += temp0[31..16] * temp1[31..16] + temp0[15..0] * temp1[15..0]
+// dpax.w.ph $ac0, temp0, temp1:
+//   $ac0 += temp0[31..16] * temp1[15..0] + temp0[15..0] * temp1[31..16]
+// O - output (low word of $ac0, which is zeroed before accumulating)
+// I - input (macro doesn't change it)
+#define MUL_HALF(O0, I0, I1, I2, I3, I4, I5, I6, I7,                           \
+                 I8, I9, I10, I11, I12, I13, I14, I15)                         \
+    "mult            $ac0,      $zero,     $zero              \n\t"            \
+    "dpa.w.ph        $ac0,      %["#I2"],  %["#I0"]           \n\t"            \
+    "dpax.w.ph       $ac0,      %["#I5"],  %["#I6"]           \n\t"            \
+    "dpa.w.ph        $ac0,      %["#I8"],  %["#I9"]           \n\t"            \
+    "dpax.w.ph       $ac0,      %["#I11"], %["#I4"]           \n\t"            \
+    "dpa.w.ph        $ac0,      %["#I12"], %["#I7"]           \n\t"            \
+    "dpax.w.ph       $ac0,      %["#I13"], %["#I1"]           \n\t"            \
+    "dpa.w.ph        $ac0,      %["#I14"], %["#I3"]           \n\t"            \
+    "dpax.w.ph       $ac0,      %["#I15"], %["#I10"]          \n\t"            \
+    "mflo            %["#O0"],  $ac0                          \n\t"
+
+#define OUTPUT_EARLY_CLOBBER_REGS_17()                                         \
+  OUTPUT_EARLY_CLOBBER_REGS_10(),                                              \
+  [temp11]"=&r"(temp11), [temp12]"=&r"(temp12), [temp13]"=&r"(temp13),         \
+  [temp14]"=&r"(temp14), [temp15]"=&r"(temp15), [temp16]"=&r"(temp16),         \
+  [temp17]"=&r"(temp17)
 
 static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
                                       uint8_t* dst) {
@@ -116,11 +124,12 @@ static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
                    temp6)
     PACK_2_HALVES_TO_WORD(temp1, temp2, temp3, temp4, temp9, temp12, temp13,
                           temp16, temp11, temp10, temp15, temp14)
-    LOAD_REF(temp10, temp11, temp14, temp15)
+    LOAD_WITH_OFFSET_X4(temp10, temp11, temp14, temp15, ref, 0, 16, 32, 48)
     CONVERT_2_BYTES_TO_HALF(temp5, temp6, temp7, temp8, temp17, temp18, temp10,
                             temp11, temp10, temp11, temp14, temp15)
     STORE_SAT_SUM_X2(temp5, temp6, temp7, temp8, temp17, temp18, temp10, temp11,
-                     temp9, temp12, temp1, temp2, temp13, temp16, temp3, temp4)
+                     temp9, temp12, temp1, temp2, temp13, temp16, temp3, temp4,
+                     dst, 0, 16, 32, 48)
 
     OUTPUT_EARLY_CLOBBER_REGS_18()
     : [dst]"r"(dst), [in]"r"(in), [kC1]"r"(kC1), [kC2]"r"(kC2), [ref]"r"(ref)
@@ -136,8 +145,71 @@ static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst,
   }
 }
 
-#undef LOAD_REF
-#undef STORE_SAT_SUM_X2
+static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
+                    const uint16_t* const w) {  // reads w[0..15]
+  int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
+  int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17;
+  // The sequence below runs twice: on a (sum -> temp17), then on b (-> temp3).
+  __asm__ volatile (
+    LOAD_WITH_OFFSET_X4(temp1, temp2, temp3, temp4, a, 0, 16, 32, 48)
+    CONVERT_2_BYTES_TO_HALF(temp5, temp6, temp7, temp8, temp9,temp10, temp11,
+                            temp12, temp1, temp2, temp3, temp4)
+    ADD_SUB_HALVES_X4(temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8,
+                      temp5, temp6, temp7, temp8, temp9, temp10, temp11, temp12)
+    PACK_2_HALVES_TO_WORD(temp9, temp10, temp11, temp12, temp1, temp3, temp5,
+                          temp7, temp2, temp4, temp6, temp8)
+    ADD_SUB_HALVES_X4(temp2, temp4, temp6, temp8, temp9, temp1, temp3, temp10,
+                      temp1, temp9, temp3, temp10, temp5, temp11, temp7, temp12)
+    ADD_SUB_HALVES_X4(temp5, temp11, temp7, temp2, temp9, temp3, temp6, temp12,
+                      temp2, temp9, temp6, temp3, temp4, temp1, temp8, temp10)
+    ADD_SUB_HALVES_X4(temp1, temp4, temp10, temp8, temp7, temp11, temp5, temp2,
+                      temp5, temp7, temp11, temp2, temp9, temp6, temp3, temp12)
+    ABS_X8(temp1, temp4, temp10, temp8, temp7, temp11, temp5, temp2)
+    LOAD_WITH_OFFSET_X4(temp3, temp6, temp9, temp12, w, 0, 4, 8, 12)
+    LOAD_WITH_OFFSET_X4(temp13, temp14, temp15, temp16, w, 16, 20, 24, 28)
+    MUL_HALF(temp17, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8,
+             temp9, temp10, temp11, temp12, temp13, temp14, temp15, temp16)
+    LOAD_WITH_OFFSET_X4(temp1, temp2, temp3, temp4, b, 0, 16, 32, 48)
+    CONVERT_2_BYTES_TO_HALF(temp5,temp6, temp7, temp8, temp9,temp10, temp11,
+                            temp12, temp1, temp2, temp3, temp4)
+    ADD_SUB_HALVES_X4(temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8,
+                      temp5, temp6, temp7, temp8, temp9, temp10, temp11, temp12)
+    PACK_2_HALVES_TO_WORD(temp9, temp10, temp11, temp12, temp1, temp3, temp5,
+                          temp7, temp2, temp4, temp6, temp8)
+    ADD_SUB_HALVES_X4(temp2, temp4, temp6, temp8, temp9, temp1, temp3, temp10,
+                      temp1, temp9, temp3, temp10, temp5, temp11, temp7, temp12)
+    ADD_SUB_HALVES_X4(temp5, temp11, temp7, temp2, temp9, temp3, temp6, temp12,
+                      temp2, temp9, temp6, temp3, temp4, temp1, temp8, temp10)
+    ADD_SUB_HALVES_X4(temp1, temp4, temp10, temp8, temp7, temp11, temp5, temp2,
+                      temp5, temp7, temp11, temp2, temp9, temp6, temp3, temp12)
+    ABS_X8(temp1, temp4, temp10, temp8, temp7, temp11, temp5, temp2)
+    LOAD_WITH_OFFSET_X4(temp3, temp6, temp9, temp12, w, 0, 4, 8, 12)
+    LOAD_WITH_OFFSET_X4(temp13, temp14, temp15, temp16, w, 16, 20, 24, 28)
+    MUL_HALF(temp3, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8,
+             temp9, temp10, temp11, temp12, temp13, temp14, temp15, temp16)
+    OUTPUT_EARLY_CLOBBER_REGS_17()
+    : [a]"r"(a), [b]"r"(b), [w]"r"(w)
+    : "memory", "hi", "lo"  // MUL_HALF accumulates in $ac0
+  );
+  return abs(temp3 - temp17) >> 5;  // |sum for b - sum for a| >> 5
+}
+
+static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
+                      const uint16_t* const w) {
+  int D = 0;  // sum of the Disto4x4() results
+  int x, y;
+  for (y = 0; y < 16 * BPS; y += 4 * BPS) {  // 4 steps of 4 * BPS
+    for (x = 0; x < 16; x += 4) {  // 4 steps of 4 bytes
+      D += Disto4x4(a + x + y, b + x + y, w);
+    }
+  }
+  return D;
+}
+
+#undef OUTPUT_EARLY_CLOBBER_REGS_17
+#undef MUL_HALF
+#undef ABS_X8
+#undef ADD_SUB_HALVES_X4
 
 #endif  // WEBP_USE_MIPS_DSP_R2
 
@@ -149,5 +221,7 @@ extern WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitMIPSdspR2(void);
 WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitMIPSdspR2(void) {
 #if defined(WEBP_USE_MIPS_DSP_R2)
   VP8ITransform = ITransform;
+  VP8TDisto4x4 = Disto4x4;
+  VP8TDisto16x16 = Disto16x16;
 #endif  // WEBP_USE_MIPS_DSP_R2
 }
diff --git a/src/dsp/mips_macro.h b/src/dsp/mips_macro.h
index 81f38fde..6d907afd 100644
--- a/src/dsp/mips_macro.h
+++ b/src/dsp/mips_macro.h
@@ -29,6 +29,15 @@
   "lh               %["#O0"],   "#I0"(%[in])                  \n\t"            \
   "lh               %["#O1"],   "#I1"(%[in])                  \n\t"
 
+// O0..O3 - outputs, I0 - location (name of the base address operand)
+// I1..I4 - offsets in bytes (unaligned word loads: O0 = *(I0 + I1), ...)
+#define LOAD_WITH_OFFSET_X4(O0, O1, O2, O3,                                    \
+                            I0, I1, I2, I3, I4)                                \
+  "ulw    %["#O0"],    "#I1"(%["#I0"])                        \n\t"            \
+  "ulw    %["#O1"],    "#I2"(%["#I0"])                        \n\t"            \
+  "ulw    %["#O2"],    "#I3"(%["#I0"])                        \n\t"            \
+  "ulw    %["#O3"],    "#I4"(%["#I0"])                        \n\t"
+
 // O - output
 // IO - input/output
 // I - input (macro doesn't change it)
@@ -133,6 +142,42 @@
   "preceu.ph.qbr    %["#O6"],   %["#I3"]                      \n\t"            \
   "preceu.ph.qbl    %["#O7"],   %["#I3"]                      \n\t"
 
+// temp0[31..16 | 15..0] = temp0[31..16 | 15..0] + temp8[31..16 | 15..0]
+// temp0[31..16 | 15..0] = temp0[31..16 <<(s) 7 | 15..0 <<(s) 7]
+// temp1..temp7 same as temp0
+// precrqu_s.qb.ph temp0, temp1, temp0:
+//   temp0 = temp1[31..24] | temp1[15..8] | temp0[31..24] | temp0[15..8]
+// store temp0, temp2, temp4, temp6 to location I8 at byte offsets I9..I12
+// IO - input/output
+// I0..I7 - input (macro doesn't change it), I8 - location, I9..I12 - offsets
+#define STORE_SAT_SUM_X2(IO0, IO1, IO2, IO3, IO4, IO5, IO6, IO7,               \
+                         I0, I1, I2, I3, I4, I5, I6, I7,                       \
+                         I8, I9, I10, I11, I12)                                \
+  "addq.ph          %["#IO0"],  %["#IO0"],  %["#I0"]          \n\t"            \
+  "addq.ph          %["#IO1"],  %["#IO1"],  %["#I1"]          \n\t"            \
+  "addq.ph          %["#IO2"],  %["#IO2"],  %["#I2"]          \n\t"            \
+  "addq.ph          %["#IO3"],  %["#IO3"],  %["#I3"]          \n\t"            \
+  "addq.ph          %["#IO4"],  %["#IO4"],  %["#I4"]          \n\t"            \
+  "addq.ph          %["#IO5"],  %["#IO5"],  %["#I5"]          \n\t"            \
+  "addq.ph          %["#IO6"],  %["#IO6"],  %["#I6"]          \n\t"            \
+  "addq.ph          %["#IO7"],  %["#IO7"],  %["#I7"]          \n\t"            \
+  "shll_s.ph        %["#IO0"],  %["#IO0"],  7                 \n\t"            \
+  "shll_s.ph        %["#IO1"],  %["#IO1"],  7                 \n\t"            \
+  "shll_s.ph        %["#IO2"],  %["#IO2"],  7                 \n\t"            \
+  "shll_s.ph        %["#IO3"],  %["#IO3"],  7                 \n\t"            \
+  "shll_s.ph        %["#IO4"],  %["#IO4"],  7                 \n\t"            \
+  "shll_s.ph        %["#IO5"],  %["#IO5"],  7                 \n\t"            \
+  "shll_s.ph        %["#IO6"],  %["#IO6"],  7                 \n\t"            \
+  "shll_s.ph        %["#IO7"],  %["#IO7"],  7                 \n\t"            \
+  "precrqu_s.qb.ph  %["#IO0"],  %["#IO1"],  %["#IO0"]         \n\t"            \
+  "precrqu_s.qb.ph  %["#IO2"],  %["#IO3"],  %["#IO2"]         \n\t"            \
+  "precrqu_s.qb.ph  %["#IO4"],  %["#IO5"],  %["#IO4"]         \n\t"            \
+  "precrqu_s.qb.ph  %["#IO6"],  %["#IO7"],  %["#IO6"]         \n\t"            \
+  "usw              %["#IO0"],  "#I9"(%["#I8"])               \n\t"            \
+  "usw              %["#IO2"],  "#I10"(%["#I8"])              \n\t"            \
+  "usw              %["#IO4"],  "#I11"(%["#I8"])              \n\t"            \
+  "usw              %["#IO6"],  "#I12"(%["#I8"])              \n\t"
+
 #define OUTPUT_EARLY_CLOBBER_REGS_10()                                         \
   : [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),             \
     [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [temp6]"=&r"(temp6),             \