MIPS: dspr2: added optimization for ITransform

Change-Id: I3534fca143535c53d18a3749b3a1b0c8a7563463
2014-10-10 12:31:00 +02:00
parent c94ed49efd
commit 829a8c19a0
8 changed files with 318 additions and 125 deletions
--- a/Android.mk
+++ b/Android.mk
@@ -45,6 +45,7 @@ LOCAL_SRC_FILES := \
    src/dsp/enc.c \
    src/dsp/enc_avx2.c \
    src/dsp/enc_mips32.c \
+    src/dsp/enc_mips_dsp_r2.c \
    src/dsp/enc_neon.$(NEON) \
    src/dsp/enc_sse2.c \
    src/dsp/filters_mips_dsp_r2.c \
--- a/Makefile.vc
+++ b/Makefile.vc
@@ -206,6 +206,7 @@ DSP_ENC_OBJS = \
    $(DIROBJ)\dsp\enc.obj \
    $(DIROBJ)\dsp\enc_avx2.obj \
    $(DIROBJ)\dsp\enc_mips32.obj \
+    $(DIROBJ)\dsp\enc_mips_dsp_r2.obj \
    $(DIROBJ)\dsp\enc_neon.obj \
    $(DIROBJ)\dsp\enc_sse2.obj \

--- a/makefile.unix
+++ b/makefile.unix
@@ -136,6 +136,7 @@ DSP_ENC_OBJS = \
    src/dsp/enc.o \
    src/dsp/enc_avx2.o \
    src/dsp/enc_mips32.o \
+    src/dsp/enc_mips_dsp_r2.o \
    src/dsp/enc_neon.o \
    src/dsp/enc_sse2.o \

@@ -218,6 +219,7 @@ HDRS = \
    src/dec/webpi.h \
    src/dsp/dsp.h \
    src/dsp/lossless.h \
+    src/dsp/mips_macro.h \
    src/dsp/neon.h \
    src/dsp/yuv.h \
    src/dsp/yuv_tables_sse2.h \
--- a/src/dsp/Makefile.am
+++ b/src/dsp/Makefile.am
@@ -24,6 +24,7 @@ COMMON_SOURCES += lossless.h
 COMMON_SOURCES += lossless_mips32.c
 COMMON_SOURCES += lossless_mips_dsp_r2.c
 COMMON_SOURCES += lossless_neon.c
+COMMON_SOURCES += mips_macro.h
 COMMON_SOURCES += neon.h
 COMMON_SOURCES += upsampling.c
 COMMON_SOURCES += upsampling_mips_dsp_r2.c
@@ -36,6 +37,7 @@ COMMON_SOURCES += yuv_mips_dsp_r2.c
 ENC_SOURCES =
 ENC_SOURCES += enc.c
 ENC_SOURCES += enc_mips32.c
+ENC_SOURCES += enc_mips_dsp_r2.c
 ENC_SOURCES += enc_neon.c

 libwebpdsp_avx2_la_SOURCES =
--- a/src/dsp/dec_mips_dsp_r2.c
+++ b/src/dsp/dec_mips_dsp_r2.c
@@ -16,6 +16,8 @@

 #if defined(WEBP_USE_MIPS_DSP_R2)

+#include "./mips_macro.h"
+
 static const int kC1 = 20091 + (1 << 16);
 static const int kC2 = 35468;

@@ -56,48 +58,6 @@ static const int kC2 = 35468;
  "usw              %["#IO4"],  64(%[dst])                    \n\t"            \
  "usw              %["#IO6"],  96(%[dst])                    \n\t"

-// temp0[31..16 | 15..0] = temp8[31..16 | 15..0] + temp12[31..16 | 15..0]
-// temp1[31..16 | 15..0] = temp8[31..16 | 15..0] - temp12[31..16 | 15..0]
-// temp0[31..16 | 15..0] = temp0[31..16 >> 3 | 15..0 >> 3]
-// temp1[31..16 | 15..0] = temp1[31..16 >> 3 | 15..0 >> 3]
-// O - output
-// I - input (macro doesn't change it)
-#define SHIFT_R_SUM_X2(O0, O1, O2, O3, O4, O5, O6, O7,                         \
-                       I0, I1, I2, I3, I4, I5, I6, I7)                         \
-  "addq.ph          %["#O0"],   %["#I0"],   %["#I4"]          \n\t"            \
-  "subq.ph          %["#O1"],   %["#I0"],   %["#I4"]          \n\t"            \
-  "addq.ph          %["#O2"],   %["#I1"],   %["#I5"]          \n\t"            \
-  "subq.ph          %["#O3"],   %["#I1"],   %["#I5"]          \n\t"            \
-  "addq.ph          %["#O4"],   %["#I2"],   %["#I6"]          \n\t"            \
-  "subq.ph          %["#O5"],   %["#I2"],   %["#I6"]          \n\t"            \
-  "addq.ph          %["#O6"],   %["#I3"],   %["#I7"]          \n\t"            \
-  "subq.ph          %["#O7"],   %["#I3"],   %["#I7"]          \n\t"            \
-  "shra.ph          %["#O0"],   %["#O0"],   3                 \n\t"            \
-  "shra.ph          %["#O1"],   %["#O1"],   3                 \n\t"            \
-  "shra.ph          %["#O2"],   %["#O2"],   3                 \n\t"            \
-  "shra.ph          %["#O3"],   %["#O3"],   3                 \n\t"            \
-  "shra.ph          %["#O4"],   %["#O4"],   3                 \n\t"            \
-  "shra.ph          %["#O5"],   %["#O5"],   3                 \n\t"            \
-  "shra.ph          %["#O6"],   %["#O6"],   3                 \n\t"            \
-  "shra.ph          %["#O7"],   %["#O7"],   3                 \n\t"
-
-// preceu.ph.qbr temp0, temp8
-//   temp0 = 0 | 0 | temp8[23..16] | temp8[7..0]
-// preceu.ph.qbl temp1, temp8
-//   temp1 = temp8[23..16] | temp8[7..0] | 0 | 0
-// O - output
-// I - input (macro doesn't change it)
-#define CONVERT_2_BYTES_TO_HALF(O0, O1, O2, O3, O4, O5, O6, O7,                \
-                                I0, I1, I2, I3)                                \
-  "preceu.ph.qbr    %["#O0"],   %["#I0"]                      \n\t"            \
-  "preceu.ph.qbl    %["#O1"],   %["#I0"]                      \n\t"            \
-  "preceu.ph.qbr    %["#O2"],   %["#I1"]                      \n\t"            \
-  "preceu.ph.qbl    %["#O3"],   %["#I1"]                      \n\t"            \
-  "preceu.ph.qbr    %["#O4"],   %["#I2"]                      \n\t"            \
-  "preceu.ph.qbl    %["#O5"],   %["#I2"]                      \n\t"            \
-  "preceu.ph.qbr    %["#O6"],   %["#I3"]                      \n\t"            \
-  "preceu.ph.qbl    %["#O7"],   %["#I3"]                      \n\t"
-
 // O - output
 #define LOAD_DST(O0, O1, O2, O3)                                               \
  "ulw              %["#O0"],  0(%[dst])                      \n\t"            \
@@ -105,52 +65,6 @@ static const int kC2 = 35468;
  "ulw              %["#O2"],  64(%[dst])                     \n\t"            \
  "ulw              %["#O3"],  96(%[dst])                     \n\t"

-// precrq.ph.w temp0, temp8, temp2
-//   temp0 = temp8[31..16] | temp2[31..16]
-// ins temp2, temp8, 16, 16
-//   temp2 = temp8[31..16] | temp2[15..0]
-// O - output
-// IO - input/output
-// I - input (macro doesn't change it)
-#define PACK_2_HALVES_TO_WORD(O0, O1, O2, O3,                                  \
-                              IO0, IO1, IO2, IO3,                              \
-                              I0, I1, I2, I3)                                  \
-  "precrq.ph.w      %["#O0"],    %["#I0"],  %["#IO0"]         \n\t"            \
-  "precrq.ph.w      %["#O1"],    %["#I1"],  %["#IO1"]         \n\t"            \
-  "ins              %["#IO0"],   %["#I0"],  16,    16         \n\t"            \
-  "ins              %["#IO1"],   %["#I1"],  16,    16         \n\t"            \
-  "precrq.ph.w      %["#O2"],    %["#I2"],  %["#IO2"]         \n\t"            \
-  "precrq.ph.w      %["#O3"],    %["#I3"],  %["#IO3"]         \n\t"            \
-  "ins              %["#IO2"],   %["#I2"],  16,    16         \n\t"            \
-  "ins              %["#IO3"],   %["#I3"],  16,    16         \n\t"
-
-// O - output
-// IO - input/output
-// I - input (macro doesn't change it)
-#define MUL_SHIFT_SUM(O0, O1, O2, O3, O4, O5, O6, O7,                          \
-                      IO0, IO1, IO2, IO3,                                      \
-                      I0, I1, I2, I3, I4, I5, I6, I7)                          \
-  "mul              %["#O0"],   %["#I0"],   %[kC2]            \n\t"            \
-  "mul              %["#O1"],   %["#I0"],   %[kC1]            \n\t"            \
-  "mul              %["#O2"],   %["#I1"],   %[kC2]            \n\t"            \
-  "mul              %["#O3"],   %["#I1"],   %[kC1]            \n\t"            \
-  "mul              %["#O4"],   %["#I2"],   %[kC2]            \n\t"            \
-  "mul              %["#O5"],   %["#I2"],   %[kC1]            \n\t"            \
-  "mul              %["#O6"],   %["#I3"],   %[kC2]            \n\t"            \
-  "mul              %["#O7"],   %["#I3"],   %[kC1]            \n\t"            \
-  "sra              %["#O0"],   %["#O0"],   16                \n\t"            \
-  "sra              %["#O1"],   %["#O1"],   16                \n\t"            \
-  "sra              %["#O2"],   %["#O2"],   16                \n\t"            \
-  "sra              %["#O3"],   %["#O3"],   16                \n\t"            \
-  "sra              %["#O4"],   %["#O4"],   16                \n\t"            \
-  "sra              %["#O5"],   %["#O5"],   16                \n\t"            \
-  "sra              %["#O6"],   %["#O6"],   16                \n\t"            \
-  "sra              %["#O7"],   %["#O7"],   16                \n\t"            \
-  "addu             %["#IO0"],  %["#IO0"],  %["#I4"]          \n\t"            \
-  "addu             %["#IO1"],  %["#IO1"],  %["#I5"]          \n\t"            \
-  "subu             %["#IO2"],  %["#IO2"],  %["#I6"]          \n\t"            \
-  "subu             %["#IO3"],  %["#IO3"],  %["#I7"]          \n\t"
-
 // O - output
 // I - input (macro doesn't change it)
 #define ADD_SUB_HALVES(O0, O1,                                                 \
@@ -158,42 +72,6 @@ static const int kC2 = 35468;
  "addq.ph          %["#O0"],   %["#I0"],  %["#I1"]           \n\t"            \
  "subq.ph          %["#O1"],   %["#I0"],  %["#I1"]           \n\t"

-// O - output
-// I - input (macro doesn't change it)
-// I[0/1] - offset in bytes
-#define LOAD_IN_X2(O0, O1,                                                     \
-                   I0, I1)                                                     \
-  "lh               %["#O0"],   "#I0"(%[in])                  \n\t"            \
-  "lh               %["#O1"],   "#I1"(%[in])                  \n\t"
-
-// O - output
-// I - input (macro doesn't change it)
-#define SRA_16(O0, O1, O2, O3,                                                 \
-               I0, I1, I2, I3)                                                 \
-  "sra              %["#O0"],  %["#I0"],  16                  \n\t"            \
-  "sra              %["#O1"],  %["#I1"],  16                  \n\t"            \
-  "sra              %["#O2"],  %["#I2"],  16                  \n\t"            \
-  "sra              %["#O3"],  %["#I3"],  16                  \n\t"
-
-// O - output
-// I - input (macro doesn't change it)
-#define INSERT_HALF_X2(O0, O1,                                                 \
-                       I0, I1)                                                 \
-  "ins              %["#O0"],   %["#I0"], 16,    16           \n\t"            \
-  "ins              %["#O1"],   %["#I1"], 16,    16           \n\t"
-
-#define OUTPUT_EARLY_CLOBBER_REGS_10()                                         \
-  : [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),             \
-    [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [temp6]"=&r"(temp6),             \
-    [temp7]"=&r"(temp7), [temp8]"=&r"(temp8), [temp9]"=&r"(temp9),             \
-    [temp10]"=&r"(temp10)
-
-#define OUTPUT_EARLY_CLOBBER_REGS_18()                                         \
-  OUTPUT_EARLY_CLOBBER_REGS_10(),                                              \
-  [temp11]"=&r"(temp11), [temp12]"=&r"(temp12), [temp13]"=&r"(temp13),         \
-  [temp14]"=&r"(temp14), [temp15]"=&r"(temp15), [temp16]"=&r"(temp16),         \
-  [temp17]"=&r"(temp17), [temp18]"=&r"(temp18)
-
 static void TransformDC(const int16_t* in, uint8_t* dst) {
  int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9, temp10;

--- a/src/dsp/enc.c
+++ b/src/dsp/enc.c
@@ -700,6 +700,7 @@ extern void VP8EncDspInitSSE2(void);
 extern void VP8EncDspInitAVX2(void);
 extern void VP8EncDspInitNEON(void);
 extern void VP8EncDspInitMIPS32(void);
+extern void VP8EncDspInitMIPSdspR2(void);

 WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInit(void) {
  VP8DspInit();  // common inverse transforms
@@ -745,7 +746,11 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInit(void) {
    if (VP8GetCPUInfo(kMIPS32)) {
      VP8EncDspInitMIPS32();
    }
+#endif
+#if defined(WEBP_USE_MIPS_DSP_R2)
+    if (VP8GetCPUInfo(kMIPSdspR2)) {
+      VP8EncDspInitMIPSdspR2();
+    }
 #endif
  }
 }
-
--- a/src/dsp/enc_mips_dsp_r2.c
+++ b/src/dsp/enc_mips_dsp_r2.c
@@ -0,0 +1,156 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// MIPS DSPr2 version of speed-critical encoding functions.
+//
+// Author(s): Darko Laus (darko.laus@imgtec.com)
+//            Mirko Raus (mirko.raus@imgtec.com)
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_MIPS_DSP_R2)
+
+#include "./mips_macro.h"
+#include "../enc/cost.h"
+#include "../enc/vp8enci.h"
+
+static const int kC1 = 20091 + (1 << 16);  // (x * kC1) >> 16 ~= 1.3066 * x
+static const int kC2 = 35468;              // (x * kC2) >> 16 ~= 0.5412 * x
+
+#define LOAD_REF(O0, O1, O2, O3)                                               \
+  "ulw              %["#O0"],  0(%[ref])                      \n\t"            \
+  "ulw              %["#O1"],  16(%[ref])                     \n\t"            \
+  "ulw              %["#O2"],  32(%[ref])                     \n\t"            \
+  "ulw              %["#O3"],  48(%[ref])                     \n\t"
+
+#define STORE_SAT_SUM_X2(IO0, IO1, IO2, IO3, IO4, IO5, IO6, IO7,               \
+                         I0, I1, I2, I3, I4, I5, I6, I7)                       \
+  "addq.ph          %["#IO0"],  %["#IO0"],  %["#I0"]          \n\t"            \
+  "addq.ph          %["#IO1"],  %["#IO1"],  %["#I1"]          \n\t"            \
+  "addq.ph          %["#IO2"],  %["#IO2"],  %["#I2"]          \n\t"            \
+  "addq.ph          %["#IO3"],  %["#IO3"],  %["#I3"]          \n\t"            \
+  "addq.ph          %["#IO4"],  %["#IO4"],  %["#I4"]          \n\t"            \
+  "addq.ph          %["#IO5"],  %["#IO5"],  %["#I5"]          \n\t"            \
+  "addq.ph          %["#IO6"],  %["#IO6"],  %["#I6"]          \n\t"            \
+  "addq.ph          %["#IO7"],  %["#IO7"],  %["#I7"]          \n\t"            \
+  "shll_s.ph        %["#IO0"],  %["#IO0"],  7                 \n\t"            \
+  "shll_s.ph        %["#IO1"],  %["#IO1"],  7                 \n\t"            \
+  "shll_s.ph        %["#IO2"],  %["#IO2"],  7                 \n\t"            \
+  "shll_s.ph        %["#IO3"],  %["#IO3"],  7                 \n\t"            \
+  "shll_s.ph        %["#IO4"],  %["#IO4"],  7                 \n\t"            \
+  "shll_s.ph        %["#IO5"],  %["#IO5"],  7                 \n\t"            \
+  "shll_s.ph        %["#IO6"],  %["#IO6"],  7                 \n\t"            \
+  "shll_s.ph        %["#IO7"],  %["#IO7"],  7                 \n\t"            \
+  "precrqu_s.qb.ph  %["#IO0"],  %["#IO1"],  %["#IO0"]         \n\t"            \
+  "precrqu_s.qb.ph  %["#IO2"],  %["#IO3"],  %["#IO2"]         \n\t"            \
+  "precrqu_s.qb.ph  %["#IO4"],  %["#IO5"],  %["#IO4"]         \n\t"            \
+  "precrqu_s.qb.ph  %["#IO6"],  %["#IO7"],  %["#IO6"]         \n\t"            \
+  "usw              %["#IO0"],  0(%[dst])                     \n\t"            \
+  "usw              %["#IO2"],  16(%[dst])                    \n\t"            \
+  "usw              %["#IO4"],  32(%[dst])                    \n\t"            \
+  "usw              %["#IO6"],  48(%[dst])                    \n\t"
+
+static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
+                                      uint8_t* dst) {
+  int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
+  int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17, temp18;
+
+  __asm__ volatile (
+    "ulw              %[temp1],   0(%[in])                 \n\t"  // in[0], in[1]
+    "ulw              %[temp2],   16(%[in])                \n\t"  // in[8], in[9]
+    LOAD_IN_X2(temp5, temp6, 24, 26)
+    ADD_SUB_HALVES(temp3, temp4, temp1, temp2)
+    LOAD_IN_X2(temp1, temp2, 8, 10)
+    MUL_SHIFT_SUM(temp7, temp8, temp9, temp10, temp11, temp12, temp13, temp14,
+                  temp10, temp8, temp9, temp7, temp1, temp2, temp5, temp6,
+                  temp13, temp11, temp14, temp12)
+    INSERT_HALF_X2(temp8, temp7, temp10, temp9)
+    "ulw              %[temp17],  4(%[in])                 \n\t"  // in[2], in[3]
+    "ulw              %[temp18],  20(%[in])                \n\t"  // in[10], in[11]
+    ADD_SUB_HALVES(temp1, temp2, temp3, temp8)
+    ADD_SUB_HALVES(temp5, temp6, temp4, temp7)
+    ADD_SUB_HALVES(temp7, temp8, temp17, temp18)
+    LOAD_IN_X2(temp17, temp18, 12, 14)
+    LOAD_IN_X2(temp9, temp10, 28, 30)
+    MUL_SHIFT_SUM(temp11, temp12, temp13, temp14, temp15, temp16, temp4, temp17,
+                  temp12, temp14, temp11, temp13, temp17, temp18, temp9, temp10,
+                  temp15, temp4, temp16, temp17)
+    INSERT_HALF_X2(temp11, temp12, temp13, temp14)
+    ADD_SUB_HALVES(temp17, temp8, temp8, temp11)
+    ADD_SUB_HALVES(temp3, temp4, temp7, temp12)
+
+    // horizontal pass: +4 rounding bias (temp2), result >> 3 in SHIFT_R_SUM_X2
+    SRA_16(temp9, temp10, temp11, temp12, temp1, temp2, temp5, temp6)
+    INSERT_HALF_X2(temp1, temp6, temp5, temp2)
+    SRA_16(temp13, temp14, temp15, temp16, temp3, temp4, temp17, temp8)
+    "repl.ph          %[temp2],   0x4                      \n\t"  // 4 in both halves
+    INSERT_HALF_X2(temp3, temp8, temp17, temp4)
+    "addq.ph          %[temp1],   %[temp1],  %[temp2]      \n\t"
+    "addq.ph          %[temp6],   %[temp6],  %[temp2]      \n\t"
+    ADD_SUB_HALVES(temp2, temp4, temp1, temp3)
+    ADD_SUB_HALVES(temp5, temp7, temp6, temp8)
+    MUL_SHIFT_SUM(temp1, temp3, temp6, temp8, temp9, temp13, temp17, temp18,
+                  temp3, temp13, temp1, temp9, temp9, temp13, temp11, temp15,
+                  temp6, temp17, temp8, temp18)
+    MUL_SHIFT_SUM(temp6, temp8, temp18, temp17, temp11, temp15, temp12, temp16,
+                  temp8, temp15, temp6, temp11, temp12, temp16, temp10, temp14,
+                  temp18, temp12, temp17, temp16)
+    INSERT_HALF_X2(temp1, temp3, temp9, temp13)
+    INSERT_HALF_X2(temp6, temp8, temp11, temp15)
+    SHIFT_R_SUM_X2(temp9, temp10, temp11, temp12, temp13, temp14, temp15,
+                   temp16, temp2, temp4, temp5, temp7, temp3, temp1, temp8,
+                   temp6)
+    PACK_2_HALVES_TO_WORD(temp1, temp2, temp3, temp4, temp9, temp12, temp13,
+                          temp16, temp11, temp10, temp15, temp14)
+    LOAD_REF(temp10, temp11, temp14, temp15)
+    CONVERT_2_BYTES_TO_HALF(temp5, temp6, temp7, temp8, temp17, temp18, temp10,
+                            temp11, temp10, temp11, temp14, temp15)
+    STORE_SAT_SUM_X2(temp5, temp6, temp7, temp8, temp17, temp18, temp10, temp11,
+                     temp9, temp12, temp1, temp2, temp13, temp16, temp3, temp4)
+
+    OUTPUT_EARLY_CLOBBER_REGS_18()
+    : [dst]"r"(dst), [in]"r"(in), [kC1]"r"(kC1), [kC2]"r"(kC2), [ref]"r"(ref)
+    : "memory", "hi", "lo"
+  );
+}
+
+static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst,
+                       int do_two) {
+  ITransformOne(ref, in, dst);
+  if (do_two) {
+    ITransformOne(ref + 4, in + 16, dst + 4);  // adjacent block, next 16 coeffs
+  }
+}
+
+#undef OUTPUT_EARLY_CLOBBER_REGS_18
+#undef OUTPUT_EARLY_CLOBBER_REGS_10
+#undef INSERT_HALF_X2
+#undef SRA_16
+#undef LOAD_IN_X2
+#undef ADD_SUB_HALVES
+#undef MUL_SHIFT_SUM
+#undef PACK_2_HALVES_TO_WORD
+#undef LOAD_REF
+#undef CONVERT_2_BYTES_TO_HALF
+#undef SHIFT_R_SUM_X2
+#undef STORE_SAT_SUM_X2
+#undef MUL  // NOTE(review): MUL is not defined here or in mips_macro.h; confirm
+
+#endif  // WEBP_USE_MIPS_DSP_R2
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitMIPSdspR2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitMIPSdspR2(void) {
+#if defined(WEBP_USE_MIPS_DSP_R2)
+  VP8ITransform = ITransform;
+#endif  // WEBP_USE_MIPS_DSP_R2
+}
--- a/src/dsp/mips_macro.h
+++ b/src/dsp/mips_macro.h
@@ -0,0 +1,148 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// MIPS common macros
+
+#ifndef WEBP_DSP_MIPS_MACRO_H_
+#define WEBP_DSP_MIPS_MACRO_H_
+
+// O0[31..16 | 15..0] = I0[31..16 | 15..0] + I1[31..16 | 15..0]
+// O1[31..16 | 15..0] = I0[31..16 | 15..0] - I1[31..16 | 15..0]
+// O - output
+// I - input (macro doesn't change it)
+#define ADD_SUB_HALVES(O0, O1,                                                 \
+                       I0, I1)                                                 \
+  "addq.ph          %["#O0"],   %["#I0"],  %["#I1"]           \n\t"            \
+  "subq.ph          %["#O1"],   %["#I0"],  %["#I1"]           \n\t"
+
+// Loads two sign-extended halfwords (lh) from %[in] into O0 and O1.
+// O - output
+// I[0/1] - offset in bytes from %[in] (literal, pasted into the instruction)
+#define LOAD_IN_X2(O0, O1,                                                     \
+                   I0, I1)                                                     \
+  "lh               %["#O0"],   "#I0"(%[in])                  \n\t"            \
+  "lh               %["#O1"],   "#I1"(%[in])                  \n\t"
+
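+// O[2k] = (I[k] * kC2) >> 16, O[2k+1] = (I[k] * kC1) >> 16, for k = 0..3;
+// then IO0 += I4, IO1 += I5, IO2 -= I6, IO3 -= I7. Needs operands kC1, kC2.
+// O - output, IO - input/output, I - input (macro doesn't change it)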
+#define MUL_SHIFT_SUM(O0, O1, O2, O3, O4, O5, O6, O7,                          \
+                      IO0, IO1, IO2, IO3,                                      \
+                      I0, I1, I2, I3, I4, I5, I6, I7)                          \
+  "mul              %["#O0"],   %["#I0"],   %[kC2]            \n\t"            \
+  "mul              %["#O1"],   %["#I0"],   %[kC1]            \n\t"            \
+  "mul              %["#O2"],   %["#I1"],   %[kC2]            \n\t"            \
+  "mul              %["#O3"],   %["#I1"],   %[kC1]            \n\t"            \
+  "mul              %["#O4"],   %["#I2"],   %[kC2]            \n\t"            \
+  "mul              %["#O5"],   %["#I2"],   %[kC1]            \n\t"            \
+  "mul              %["#O6"],   %["#I3"],   %[kC2]            \n\t"            \
+  "mul              %["#O7"],   %["#I3"],   %[kC1]            \n\t"            \
+  "sra              %["#O0"],   %["#O0"],   16                \n\t"            \
+  "sra              %["#O1"],   %["#O1"],   16                \n\t"            \
+  "sra              %["#O2"],   %["#O2"],   16                \n\t"            \
+  "sra              %["#O3"],   %["#O3"],   16                \n\t"            \
+  "sra              %["#O4"],   %["#O4"],   16                \n\t"            \
+  "sra              %["#O5"],   %["#O5"],   16                \n\t"            \
+  "sra              %["#O6"],   %["#O6"],   16                \n\t"            \
+  "sra              %["#O7"],   %["#O7"],   16                \n\t"            \
+  "addu             %["#IO0"],  %["#IO0"],  %["#I4"]          \n\t"            \
+  "addu             %["#IO1"],  %["#IO1"],  %["#I5"]          \n\t"            \
+  "subu             %["#IO2"],  %["#IO2"],  %["#I6"]          \n\t"            \
+  "subu             %["#IO3"],  %["#IO3"],  %["#I7"]          \n\t"
+
+// O[31..16] = I[15..0]; O[15..0] is left unchanged.
+// O - input/output, I - input (macro doesn't change it)
+#define INSERT_HALF_X2(O0, O1,                                                 \
+                       I0, I1)                                                 \
+  "ins              %["#O0"],   %["#I0"], 16,    16           \n\t"            \
+  "ins              %["#O1"],   %["#I1"], 16,    16           \n\t"
+
+// O = I >> 16 (arithmetic shift) for each of the four register pairs.
+// O - output, I - input (macro doesn't change it)
+#define SRA_16(O0, O1, O2, O3,                                                 \
+               I0, I1, I2, I3)                                                 \
+  "sra              %["#O0"],  %["#I0"],  16                  \n\t"            \
+  "sra              %["#O1"],  %["#I1"],  16                  \n\t"            \
+  "sra              %["#O2"],  %["#I2"],  16                  \n\t"            \
+  "sra              %["#O3"],  %["#I3"],  16                  \n\t"
+
+// For k = 0..3, on both 16-bit halves of each register independently:
+//   O[2k]   = (I[k] + I[k+4]) >> 3
+//   O[2k+1] = (I[k] - I[k+4]) >> 3
+// The shift is arithmetic (shra.ph).
+// O - output
+// I - input (macro doesn't change it)
+#define SHIFT_R_SUM_X2(O0, O1, O2, O3, O4, O5, O6, O7,                         \
+                       I0, I1, I2, I3, I4, I5, I6, I7)                         \
+  "addq.ph          %["#O0"],   %["#I0"],   %["#I4"]          \n\t"            \
+  "subq.ph          %["#O1"],   %["#I0"],   %["#I4"]          \n\t"            \
+  "addq.ph          %["#O2"],   %["#I1"],   %["#I5"]          \n\t"            \
+  "subq.ph          %["#O3"],   %["#I1"],   %["#I5"]          \n\t"            \
+  "addq.ph          %["#O4"],   %["#I2"],   %["#I6"]          \n\t"            \
+  "subq.ph          %["#O5"],   %["#I2"],   %["#I6"]          \n\t"            \
+  "addq.ph          %["#O6"],   %["#I3"],   %["#I7"]          \n\t"            \
+  "subq.ph          %["#O7"],   %["#I3"],   %["#I7"]          \n\t"            \
+  "shra.ph          %["#O0"],   %["#O0"],   3                 \n\t"            \
+  "shra.ph          %["#O1"],   %["#O1"],   3                 \n\t"            \
+  "shra.ph          %["#O2"],   %["#O2"],   3                 \n\t"            \
+  "shra.ph          %["#O3"],   %["#O3"],   3                 \n\t"            \
+  "shra.ph          %["#O4"],   %["#O4"],   3                 \n\t"            \
+  "shra.ph          %["#O5"],   %["#O5"],   3                 \n\t"            \
+  "shra.ph          %["#O6"],   %["#O6"],   3                 \n\t"            \
+  "shra.ph          %["#O7"],   %["#O7"],   3                 \n\t"
+
+// precrq.ph.w temp0, temp8, temp2
+//   temp0 = temp8[31..16] | temp2[31..16]
+// ins temp2, temp8, 16, 16
+//   temp2 = temp8[15..0] | temp2[15..0]
+// O - output
+// IO - input/output
+// I - input (macro doesn't change it)
+#define PACK_2_HALVES_TO_WORD(O0, O1, O2, O3,                                  \
+                              IO0, IO1, IO2, IO3,                              \
+                              I0, I1, I2, I3)                                  \
+  "precrq.ph.w      %["#O0"],    %["#I0"],  %["#IO0"]         \n\t"            \
+  "precrq.ph.w      %["#O1"],    %["#I1"],  %["#IO1"]         \n\t"            \
+  "ins              %["#IO0"],   %["#I0"],  16,    16         \n\t"            \
+  "ins              %["#IO1"],   %["#I1"],  16,    16         \n\t"            \
+  "precrq.ph.w      %["#O2"],    %["#I2"],  %["#IO2"]         \n\t"            \
+  "precrq.ph.w      %["#O3"],    %["#I3"],  %["#IO3"]         \n\t"            \
+  "ins              %["#IO2"],   %["#I2"],  16,    16         \n\t"            \
+  "ins              %["#IO3"],   %["#I3"],  16,    16         \n\t"
+
+// preceu.ph.qbr temp0, temp8
+//   temp0 = 0 | temp8[15..8] | 0 | temp8[7..0]
+// preceu.ph.qbl temp1, temp8
+//   temp1 = 0 | temp8[31..24] | 0 | temp8[23..16]
+// O - output
+// I - input (macro doesn't change it)
+#define CONVERT_2_BYTES_TO_HALF(O0, O1, O2, O3, O4, O5, O6, O7,                \
+                                I0, I1, I2, I3)                                \
+  "preceu.ph.qbr    %["#O0"],   %["#I0"]                      \n\t"            \
+  "preceu.ph.qbl    %["#O1"],   %["#I0"]                      \n\t"            \
+  "preceu.ph.qbr    %["#O2"],   %["#I1"]                      \n\t"            \
+  "preceu.ph.qbl    %["#O3"],   %["#I1"]                      \n\t"            \
+  "preceu.ph.qbr    %["#O4"],   %["#I2"]                      \n\t"            \
+  "preceu.ph.qbl    %["#O5"],   %["#I2"]                      \n\t"            \
+  "preceu.ph.qbr    %["#O6"],   %["#I3"]                      \n\t"            \
+  "preceu.ph.qbl    %["#O7"],   %["#I3"]                      \n\t"
+
+#define OUTPUT_EARLY_CLOBBER_REGS_10()                                         \
+  : [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),             \
+    [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [temp6]"=&r"(temp6),             \
+    [temp7]"=&r"(temp7), [temp8]"=&r"(temp8), [temp9]"=&r"(temp9),             \
+    [temp10]"=&r"(temp10)
+
+#define OUTPUT_EARLY_CLOBBER_REGS_18()                                         \
+  OUTPUT_EARLY_CLOBBER_REGS_10(),                                              \
+  [temp11]"=&r"(temp11), [temp12]"=&r"(temp12), [temp13]"=&r"(temp13),         \
+  [temp14]"=&r"(temp14), [temp15]"=&r"(temp15), [temp16]"=&r"(temp16),         \
+  [temp17]"=&r"(temp17), [temp18]"=&r"(temp18)
+
+#endif  // WEBP_DSP_MIPS_MACRO_H_