vp9: [loongson] optimize vpx_convolve8 with mmi.

1. vpx_convolve8_vert_mmi 2. vpx_convolve8_horiz_mmi 3. vpx_convolve8_mmi 4. vpx_convolve8_avg_mmi 5. vpx_convolve8_avg_vert_mmi Change-Id: I41a6b3b4f327d6b67d282e0163cfa0aee8648abe
2018-01-25 09:38:28 +08:00 · 2018-01-25 09:38:28 +08:00 · 25d9adb74b
commit 25d9adb74b
parent 7b9984b386
4 changed files with 605 additions and 5 deletions
--- a/test/convolve_test.cc
+++ b/test/convolve_test.cc
@ -1379,4 +1379,16 @@ const ConvolveParam kArrayConvolve_vsx[] = { ALL_SIZES(convolve8_vsx) };
 INSTANTIATE_TEST_CASE_P(VSX, ConvolveTest,
                        ::testing::ValuesIn(kArrayConvolve_vsx));
 #endif  // HAVE_VSX
+
+#if HAVE_MMI
+const ConvolveFunctions convolve8_mmi(
+    vpx_convolve_copy_c, vpx_convolve_avg_c, vpx_convolve8_horiz_mmi,
+    vpx_convolve8_avg_horiz_c, vpx_convolve8_vert_mmi,
+    vpx_convolve8_avg_vert_mmi, vpx_convolve8_mmi, vpx_convolve8_avg_mmi,
+    vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c, vpx_scaled_vert_c,
+    vpx_scaled_avg_vert_c, vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0);
+const ConvolveParam kArrayConvolve_mmi[] = { ALL_SIZES(convolve8_mmi) };
+INSTANTIATE_TEST_CASE_P(MMI, ConvolveTest,
+                        ::testing::ValuesIn(kArrayConvolve_mmi));
+#endif  // HAVE_MMI
 }  // namespace
--- a/vpx_dsp/mips/vpx_convolve8_mmi.c
+++ b/vpx_dsp/mips/vpx_convolve8_mmi.c
@ -0,0 +1,587 @@
+/*
+ *  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <string.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/vpx_filter.h"
+#include "vpx_ports/asmdefs_mmi.h"
+#include "vpx_ports/mem.h"
+
+#define GET_DATA_H_MMI                                     \
+  "pmaddhw    %[ftmp4],    %[ftmp4],   %[filter1]    \n\t" \
+  "pmaddhw    %[ftmp5],    %[ftmp5],   %[filter2]    \n\t" \
+  "paddw      %[ftmp4],    %[ftmp4],   %[ftmp5]      \n\t" \
+  "punpckhwd  %[ftmp5],    %[ftmp4],   %[ftmp0]      \n\t" \
+  "paddw      %[ftmp4],    %[ftmp4],   %[ftmp5]      \n\t" \
+  "pmaddhw    %[ftmp6],    %[ftmp6],   %[filter1]    \n\t" \
+  "pmaddhw    %[ftmp7],    %[ftmp7],   %[filter2]    \n\t" \
+  "paddw      %[ftmp6],    %[ftmp6],   %[ftmp7]      \n\t" \
+  "punpckhwd  %[ftmp7],    %[ftmp6],   %[ftmp0]      \n\t" \
+  "paddw      %[ftmp6],    %[ftmp6],   %[ftmp7]      \n\t" \
+  "punpcklwd  %[srcl],     %[ftmp4],   %[ftmp6]      \n\t" \
+  "pmaddhw    %[ftmp8],    %[ftmp8],   %[filter1]    \n\t" \
+  "pmaddhw    %[ftmp9],    %[ftmp9],   %[filter2]    \n\t" \
+  "paddw      %[ftmp8],    %[ftmp8],   %[ftmp9]      \n\t" \
+  "punpckhwd  %[ftmp9],    %[ftmp8],   %[ftmp0]      \n\t" \
+  "paddw      %[ftmp8],    %[ftmp8],   %[ftmp9]      \n\t" \
+  "pmaddhw    %[ftmp10],   %[ftmp10],  %[filter1]    \n\t" \
+  "pmaddhw    %[ftmp11],   %[ftmp11],  %[filter2]    \n\t" \
+  "paddw      %[ftmp10],   %[ftmp10],  %[ftmp11]     \n\t" \
+  "punpckhwd  %[ftmp11],   %[ftmp10],  %[ftmp0]      \n\t" \
+  "paddw      %[ftmp10],   %[ftmp10],  %[ftmp11]     \n\t" \
+  "punpcklwd  %[srch],     %[ftmp8],   %[ftmp10]     \n\t"
+
+#define GET_DATA_V_MMI                                     \
+  "punpcklhw  %[srcl],     %[ftmp4],   %[ftmp5]      \n\t" \
+  "pmaddhw    %[srcl],     %[srcl],    %[filter10]   \n\t" \
+  "punpcklhw  %[ftmp12],   %[ftmp6],   %[ftmp7]      \n\t" \
+  "pmaddhw    %[ftmp12],   %[ftmp12],  %[filter32]   \n\t" \
+  "paddw      %[srcl],     %[srcl],    %[ftmp12]     \n\t" \
+  "punpcklhw  %[ftmp12],   %[ftmp8],   %[ftmp9]      \n\t" \
+  "pmaddhw    %[ftmp12],   %[ftmp12],  %[filter54]   \n\t" \
+  "paddw      %[srcl],     %[srcl],    %[ftmp12]     \n\t" \
+  "punpcklhw  %[ftmp12],   %[ftmp10],  %[ftmp11]     \n\t" \
+  "pmaddhw    %[ftmp12],   %[ftmp12],  %[filter76]   \n\t" \
+  "paddw      %[srcl],     %[srcl],    %[ftmp12]     \n\t" \
+  "punpckhhw  %[srch],     %[ftmp4],   %[ftmp5]      \n\t" \
+  "pmaddhw    %[srch],     %[srch],    %[filter10]   \n\t" \
+  "punpckhhw  %[ftmp12],   %[ftmp6],   %[ftmp7]      \n\t" \
+  "pmaddhw    %[ftmp12],   %[ftmp12],  %[filter32]   \n\t" \
+  "paddw      %[srch],     %[srch],    %[ftmp12]     \n\t" \
+  "punpckhhw  %[ftmp12],   %[ftmp8],   %[ftmp9]      \n\t" \
+  "pmaddhw    %[ftmp12],   %[ftmp12],  %[filter54]   \n\t" \
+  "paddw      %[srch],     %[srch],    %[ftmp12]     \n\t" \
+  "punpckhhw  %[ftmp12],   %[ftmp10],  %[ftmp11]     \n\t" \
+  "pmaddhw    %[ftmp12],   %[ftmp12],  %[filter76]   \n\t" \
+  "paddw      %[srch],     %[srch],    %[ftmp12]     \n\t"
+
+/* clang-format off */
+#define ROUND_POWER_OF_TWO_MMI                             \
+  /* Add para[0] */                                        \
+  "lw         %[tmp0],     0x00(%[para])             \n\t" \
+  MMI_MTC1(%[tmp0],     %[ftmp6])                          \
+  "punpcklwd  %[ftmp6],    %[ftmp6],    %[ftmp6]     \n\t" \
+  "paddw      %[srcl],     %[srcl],     %[ftmp6]     \n\t" \
+  "paddw      %[srch],     %[srch],     %[ftmp6]     \n\t" \
+  /* Arithmetic right shift para[1] bits */                \
+  "lw         %[tmp0],     0x04(%[para])             \n\t" \
+  MMI_MTC1(%[tmp0],     %[ftmp5])                          \
+  "psraw      %[srcl],     %[srcl],     %[ftmp5]     \n\t" \
+  "psraw      %[srch],     %[srch],     %[ftmp5]     \n\t"
+/* clang-format on */
+
+#define CLIP_PIXEL_MMI                                     \
+  /* Staturated operation */                               \
+  "packsswh   %[srcl],     %[srcl],     %[srch]      \n\t" \
+  "packushb   %[ftmp12],   %[srcl],     %[ftmp0]     \n\t"
+
+static void convolve_horiz_mmi(const uint8_t *src, ptrdiff_t src_stride,
+                               uint8_t *dst, ptrdiff_t dst_stride,
+                               const InterpKernel *filter, int x0_q4,
+                               int x_step_q4, int32_t w, int32_t h) {
+  const int16_t *filter_x = filter[x0_q4];
+  double ftmp[14];
+  uint32_t tmp[2];
+  uint32_t para[5];
+  para[0] = (1 << ((FILTER_BITS)-1));
+  para[1] = FILTER_BITS;
+  src -= SUBPEL_TAPS / 2 - 1;
+  src_stride -= w;
+  dst_stride -= w;
+  (void)x_step_q4;
+
+  /* clang-format off */
+  __asm__ volatile(
+    "move       %[tmp1],    %[width]                   \n\t"
+    "xor        %[ftmp0],   %[ftmp0],    %[ftmp0]      \n\t"
+    "gsldlc1    %[filter1], 0x03(%[filter])            \n\t"
+    "gsldrc1    %[filter1], 0x00(%[filter])            \n\t"
+    "gsldlc1    %[filter2], 0x0b(%[filter])            \n\t"
+    "gsldrc1    %[filter2], 0x08(%[filter])            \n\t"
+    "1:                                                \n\t"
+    /* Get 8 data per row */
+    "gsldlc1    %[ftmp5],   0x07(%[src])               \n\t"
+    "gsldrc1    %[ftmp5],   0x00(%[src])               \n\t"
+    "gsldlc1    %[ftmp7],   0x08(%[src])               \n\t"
+    "gsldrc1    %[ftmp7],   0x01(%[src])               \n\t"
+    "gsldlc1    %[ftmp9],   0x09(%[src])               \n\t"
+    "gsldrc1    %[ftmp9],   0x02(%[src])               \n\t"
+    "gsldlc1    %[ftmp11],  0x0A(%[src])               \n\t"
+    "gsldrc1    %[ftmp11],  0x03(%[src])               \n\t"
+    "punpcklbh  %[ftmp4],   %[ftmp5],    %[ftmp0]      \n\t"
+    "punpckhbh  %[ftmp5],   %[ftmp5],    %[ftmp0]      \n\t"
+    "punpcklbh  %[ftmp6],   %[ftmp7],    %[ftmp0]      \n\t"
+    "punpckhbh  %[ftmp7],   %[ftmp7],    %[ftmp0]      \n\t"
+    "punpcklbh  %[ftmp8],   %[ftmp9],    %[ftmp0]      \n\t"
+    "punpckhbh  %[ftmp9],   %[ftmp9],    %[ftmp0]      \n\t"
+    "punpcklbh  %[ftmp10],  %[ftmp11],   %[ftmp0]      \n\t"
+    "punpckhbh  %[ftmp11],  %[ftmp11],   %[ftmp0]      \n\t"
+    MMI_ADDIU(%[width],   %[width],    -0x04)
+    /* Get raw data */
+    GET_DATA_H_MMI
+    ROUND_POWER_OF_TWO_MMI
+    CLIP_PIXEL_MMI
+    "swc1       %[ftmp12],  0x00(%[dst])               \n\t"
+    MMI_ADDIU(%[dst],     %[dst],      0x04)
+    MMI_ADDIU(%[src],     %[src],      0x04)
+    /* Loop count */
+    "bnez       %[width],   1b                         \n\t"
+    "move       %[width],   %[tmp1]                    \n\t"
+    MMI_ADDU(%[src],      %[src],      %[src_stride])
+    MMI_ADDU(%[dst],      %[dst],      %[dst_stride])
+    MMI_ADDIU(%[height],  %[height],   -0x01)
+    "bnez       %[height],  1b                         \n\t"
+    : [srcl]"=&f"(ftmp[0]),     [srch]"=&f"(ftmp[1]),
+      [filter1]"=&f"(ftmp[2]),  [filter2]"=&f"(ftmp[3]),
+      [ftmp0]"=&f"(ftmp[4]),    [ftmp4]"=&f"(ftmp[5]),
+      [ftmp5]"=&f"(ftmp[6]),    [ftmp6]"=&f"(ftmp[7]),
+      [ftmp7]"=&f"(ftmp[8]),    [ftmp8]"=&f"(ftmp[9]),
+      [ftmp9]"=&f"(ftmp[10]),   [ftmp10]"=&f"(ftmp[11]),
+      [ftmp11]"=&f"(ftmp[12]),  [ftmp12]"=&f"(ftmp[13]),
+      [tmp0]"=&r"(tmp[0]),      [tmp1]"=&r"(tmp[1]),
+      [src]"+&r"(src),          [width]"+&r"(w),
+      [dst]"+&r"(dst),          [height]"+&r"(h)
+    : [filter]"r"(filter_x),    [para]"r"(para),
+      [src_stride]"r"((mips_reg)src_stride),
+      [dst_stride]"r"((mips_reg)dst_stride)
+    : "memory"
+  );
+  /* clang-format on */
+}
+
+static void convolve_vert_mmi(const uint8_t *src, ptrdiff_t src_stride,
+                              uint8_t *dst, ptrdiff_t dst_stride,
+                              const InterpKernel *filter, int y0_q4,
+                              int y_step_q4, int32_t w, int32_t h) {
+  const int16_t *filter_y = filter[y0_q4];
+  double ftmp[16];
+  uint32_t tmp[1];
+  uint32_t para[2];
+  ptrdiff_t addr = src_stride;
+  para[0] = (1 << ((FILTER_BITS)-1));
+  para[1] = FILTER_BITS;
+  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+  src_stride -= w;
+  dst_stride -= w;
+  (void)y_step_q4;
+
+  __asm__ volatile(
+    "xor        %[ftmp0],    %[ftmp0],   %[ftmp0]      \n\t"
+    "gsldlc1    %[ftmp4],    0x03(%[filter])           \n\t"
+    "gsldrc1    %[ftmp4],    0x00(%[filter])           \n\t"
+    "gsldlc1    %[ftmp5],    0x0b(%[filter])           \n\t"
+    "gsldrc1    %[ftmp5],    0x08(%[filter])           \n\t"
+    "punpcklwd  %[filter10], %[ftmp4],   %[ftmp4]      \n\t"
+    "punpckhwd  %[filter32], %[ftmp4],   %[ftmp4]      \n\t"
+    "punpcklwd  %[filter54], %[ftmp5],   %[ftmp5]      \n\t"
+    "punpckhwd  %[filter76], %[ftmp5],   %[ftmp5]      \n\t"
+    "1:                                                \n\t"
+    /* Get 8 data per column */
+    "gsldlc1    %[ftmp4],    0x07(%[src])              \n\t"
+    "gsldrc1    %[ftmp4],    0x00(%[src])              \n\t"
+    MMI_ADDU(%[tmp0],     %[src],     %[addr])
+    "gsldlc1    %[ftmp5],    0x07(%[tmp0])             \n\t"
+    "gsldrc1    %[ftmp5],    0x00(%[tmp0])             \n\t"
+    MMI_ADDU(%[tmp0],     %[tmp0],    %[addr])
+    "gsldlc1    %[ftmp6],    0x07(%[tmp0])             \n\t"
+    "gsldrc1    %[ftmp6],    0x00(%[tmp0])             \n\t"
+    MMI_ADDU(%[tmp0],     %[tmp0],    %[addr])
+    "gsldlc1    %[ftmp7],    0x07(%[tmp0])             \n\t"
+    "gsldrc1    %[ftmp7],    0x00(%[tmp0])             \n\t"
+    MMI_ADDU(%[tmp0],     %[tmp0],    %[addr])
+    "gsldlc1    %[ftmp8],    0x07(%[tmp0])             \n\t"
+    "gsldrc1    %[ftmp8],    0x00(%[tmp0])             \n\t"
+    MMI_ADDU(%[tmp0],     %[tmp0],    %[addr])
+    "gsldlc1    %[ftmp9],    0x07(%[tmp0])             \n\t"
+    "gsldrc1    %[ftmp9],    0x00(%[tmp0])             \n\t"
+    MMI_ADDU(%[tmp0],     %[tmp0],    %[addr])
+    "gsldlc1    %[ftmp10],   0x07(%[tmp0])             \n\t"
+    "gsldrc1    %[ftmp10],   0x00(%[tmp0])             \n\t"
+    MMI_ADDU(%[tmp0],     %[tmp0],    %[addr])
+    "gsldlc1    %[ftmp11],   0x07(%[tmp0])             \n\t"
+    "gsldrc1    %[ftmp11],   0x00(%[tmp0])             \n\t"
+    "punpcklbh  %[ftmp4],    %[ftmp4],   %[ftmp0]      \n\t"
+    "punpcklbh  %[ftmp5],    %[ftmp5],   %[ftmp0]      \n\t"
+    "punpcklbh  %[ftmp6],    %[ftmp6],   %[ftmp0]      \n\t"
+    "punpcklbh  %[ftmp7],    %[ftmp7],   %[ftmp0]      \n\t"
+    "punpcklbh  %[ftmp8],    %[ftmp8],   %[ftmp0]      \n\t"
+    "punpcklbh  %[ftmp9],    %[ftmp9],   %[ftmp0]      \n\t"
+    "punpcklbh  %[ftmp10],   %[ftmp10],  %[ftmp0]      \n\t"
+    "punpcklbh  %[ftmp11],   %[ftmp11],  %[ftmp0]      \n\t"
+    MMI_ADDIU(%[width],   %[width],   -0x04)
+    /* Get raw data */
+    GET_DATA_V_MMI
+    ROUND_POWER_OF_TWO_MMI
+    CLIP_PIXEL_MMI
+    "swc1       %[ftmp12],   0x00(%[dst])              \n\t"
+    MMI_ADDIU(%[dst],     %[dst],      0x04)
+    MMI_ADDIU(%[src],     %[src],      0x04)
+    /* Loop count */
+    "bnez       %[width],    1b                        \n\t"
+    MMI_SUBU(%[width],    %[addr],     %[src_stride])
+    MMI_ADDU(%[src],      %[src],      %[src_stride])
+    MMI_ADDU(%[dst],      %[dst],      %[dst_stride])
+    MMI_ADDIU(%[height],  %[height],   -0x01)
+    "bnez       %[height],   1b                        \n\t"
+    : [srcl]"=&f"(ftmp[0]),     [srch]"=&f"(ftmp[1]),
+      [filter10]"=&f"(ftmp[2]), [filter32]"=&f"(ftmp[3]),
+      [filter54]"=&f"(ftmp[4]), [filter76]"=&f"(ftmp[5]),
+      [ftmp0]"=&f"(ftmp[6]),    [ftmp4]"=&f"(ftmp[7]),
+      [ftmp5]"=&f"(ftmp[8]),    [ftmp6]"=&f"(ftmp[9]),
+      [ftmp7]"=&f"(ftmp[10]),   [ftmp8]"=&f"(ftmp[11]),
+      [ftmp9]"=&f"(ftmp[12]),   [ftmp10]"=&f"(ftmp[13]),
+      [ftmp11]"=&f"(ftmp[14]),  [ftmp12]"=&f"(ftmp[15]),
+      [src]"+&r"(src),          [dst]"+&r"(dst),
+      [width]"+&r"(w),          [height]"+&r"(h),
+      [tmp0]"=&r"(tmp[0])
+    : [filter]"r"(filter_y),    [para]"r"(para),
+      [src_stride]"r"((mips_reg)src_stride),
+      [dst_stride]"r"((mips_reg)dst_stride),
+      [addr]"r"((mips_reg)addr)
+    : "memory"
+  );
+}
+
+static void convolve_avg_vert_mmi(const uint8_t *src, ptrdiff_t src_stride,
+                                  uint8_t *dst, ptrdiff_t dst_stride,
+                                  const InterpKernel *filter, int y0_q4,
+                                  int y_step_q4, int32_t w, int32_t h) {
+  const int16_t *filter_y = filter[y0_q4];
+  double ftmp[16];
+  uint32_t tmp[1];
+  uint32_t para[2];
+  ptrdiff_t addr = src_stride;
+  para[0] = (1 << ((FILTER_BITS)-1));
+  para[1] = FILTER_BITS;
+  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+  src_stride -= w;
+  dst_stride -= w;
+  (void)y_step_q4;
+
+  __asm__ volatile(
+    "xor        %[ftmp0],    %[ftmp0],   %[ftmp0]      \n\t"
+    "gsldlc1    %[ftmp4],    0x03(%[filter])           \n\t"
+    "gsldrc1    %[ftmp4],    0x00(%[filter])           \n\t"
+    "gsldlc1    %[ftmp5],    0x0b(%[filter])           \n\t"
+    "gsldrc1    %[ftmp5],    0x08(%[filter])           \n\t"
+    "punpcklwd  %[filter10], %[ftmp4],   %[ftmp4]      \n\t"
+    "punpckhwd  %[filter32], %[ftmp4],   %[ftmp4]      \n\t"
+    "punpcklwd  %[filter54], %[ftmp5],   %[ftmp5]      \n\t"
+    "punpckhwd  %[filter76], %[ftmp5],   %[ftmp5]      \n\t"
+    "1:                                                \n\t"
+    /* Get 8 data per column */
+    "gsldlc1    %[ftmp4],    0x07(%[src])              \n\t"
+    "gsldrc1    %[ftmp4],    0x00(%[src])              \n\t"
+    MMI_ADDU(%[tmp0],     %[src],     %[addr])
+    "gsldlc1    %[ftmp5],    0x07(%[tmp0])             \n\t"
+    "gsldrc1    %[ftmp5],    0x00(%[tmp0])             \n\t"
+    MMI_ADDU(%[tmp0],     %[tmp0],    %[addr])
+    "gsldlc1    %[ftmp6],    0x07(%[tmp0])             \n\t"
+    "gsldrc1    %[ftmp6],    0x00(%[tmp0])             \n\t"
+    MMI_ADDU(%[tmp0],     %[tmp0],    %[addr])
+    "gsldlc1    %[ftmp7],    0x07(%[tmp0])             \n\t"
+    "gsldrc1    %[ftmp7],    0x00(%[tmp0])             \n\t"
+    MMI_ADDU(%[tmp0],     %[tmp0],    %[addr])
+    "gsldlc1    %[ftmp8],    0x07(%[tmp0])             \n\t"
+    "gsldrc1    %[ftmp8],    0x00(%[tmp0])             \n\t"
+    MMI_ADDU(%[tmp0],     %[tmp0],    %[addr])
+    "gsldlc1    %[ftmp9],    0x07(%[tmp0])             \n\t"
+    "gsldrc1    %[ftmp9],    0x00(%[tmp0])             \n\t"
+    MMI_ADDU(%[tmp0],     %[tmp0],    %[addr])
+    "gsldlc1    %[ftmp10],   0x07(%[tmp0])             \n\t"
+    "gsldrc1    %[ftmp10],   0x00(%[tmp0])             \n\t"
+    MMI_ADDU(%[tmp0],     %[tmp0],    %[addr])
+    "gsldlc1    %[ftmp11],   0x07(%[tmp0])             \n\t"
+    "gsldrc1    %[ftmp11],   0x00(%[tmp0])             \n\t"
+    "punpcklbh  %[ftmp4],    %[ftmp4],   %[ftmp0]      \n\t"
+    "punpcklbh  %[ftmp5],    %[ftmp5],   %[ftmp0]      \n\t"
+    "punpcklbh  %[ftmp6],    %[ftmp6],   %[ftmp0]      \n\t"
+    "punpcklbh  %[ftmp7],    %[ftmp7],   %[ftmp0]      \n\t"
+    "punpcklbh  %[ftmp8],    %[ftmp8],   %[ftmp0]      \n\t"
+    "punpcklbh  %[ftmp9],    %[ftmp9],   %[ftmp0]      \n\t"
+    "punpcklbh  %[ftmp10],   %[ftmp10],  %[ftmp0]      \n\t"
+    "punpcklbh  %[ftmp11],   %[ftmp11],  %[ftmp0]      \n\t"
+    MMI_ADDIU(%[width],   %[width],   -0x04)
+    /* Get raw data */
+    GET_DATA_V_MMI
+    ROUND_POWER_OF_TWO_MMI
+    CLIP_PIXEL_MMI
+    "punpcklbh  %[ftmp12],   %[ftmp12],  %[ftmp0]      \n\t"
+    "gsldlc1    %[ftmp4],    0x07(%[dst])              \n\t"
+    "gsldrc1    %[ftmp4],    0x00(%[dst])              \n\t"
+    "punpcklbh  %[ftmp4],    %[ftmp4],   %[ftmp0]      \n\t"
+    "paddh      %[ftmp12],   %[ftmp12],  %[ftmp4]      \n\t"
+    "li         %[tmp0],     0x10001                   \n\t"
+    MMI_MTC1(%[tmp0],     %[ftmp5])
+    "punpcklhw  %[ftmp5],    %[ftmp5],   %[ftmp5]      \n\t"
+    "paddh      %[ftmp12],   %[ftmp12],  %[ftmp5]      \n\t"
+    "psrah      %[ftmp12],   %[ftmp12],  %[ftmp5]      \n\t"
+    "packushb   %[ftmp12],   %[ftmp12],  %[ftmp0]      \n\t"
+    "swc1       %[ftmp12],   0x00(%[dst])              \n\t"
+    MMI_ADDIU(%[dst],     %[dst],      0x04)
+    MMI_ADDIU(%[src],     %[src],      0x04)
+    /* Loop count */
+    "bnez       %[width],    1b                        \n\t"
+    MMI_SUBU(%[width],    %[addr],     %[src_stride])
+    MMI_ADDU(%[src],      %[src],      %[src_stride])
+    MMI_ADDU(%[dst],      %[dst],      %[dst_stride])
+    MMI_ADDIU(%[height],  %[height],   -0x01)
+    "bnez       %[height],   1b                        \n\t"
+    : [srcl]"=&f"(ftmp[0]),     [srch]"=&f"(ftmp[1]),
+      [filter10]"=&f"(ftmp[2]), [filter32]"=&f"(ftmp[3]),
+      [filter54]"=&f"(ftmp[4]), [filter76]"=&f"(ftmp[5]),
+      [ftmp0]"=&f"(ftmp[6]),    [ftmp4]"=&f"(ftmp[7]),
+      [ftmp5]"=&f"(ftmp[8]),    [ftmp6]"=&f"(ftmp[9]),
+      [ftmp7]"=&f"(ftmp[10]),   [ftmp8]"=&f"(ftmp[11]),
+      [ftmp9]"=&f"(ftmp[12]),   [ftmp10]"=&f"(ftmp[13]),
+      [ftmp11]"=&f"(ftmp[14]),  [ftmp12]"=&f"(ftmp[15]),
+      [src]"+&r"(src),          [dst]"+&r"(dst),
+      [width]"+&r"(w),          [height]"+&r"(h),
+      [tmp0]"=&r"(tmp[0])
+    : [filter]"r"(filter_y),    [para]"r"(para),
+      [src_stride]"r"((mips_reg)src_stride),
+      [dst_stride]"r"((mips_reg)dst_stride),
+      [addr]"r"((mips_reg)addr)
+    : "memory"
+  );
+}
+
+void vpx_convolve_avg_mmi(const uint8_t *src, ptrdiff_t src_stride,
+                          uint8_t *dst, ptrdiff_t dst_stride,
+                          const InterpKernel *filter, int x0_q4, int x_step_q4,
+                          int y0_q4, int y_step_q4, int w, int h) {
+  double ftmp[4];
+  uint32_t tmp[2];
+  src_stride -= w;
+  dst_stride -= w;
+  (void)filter;
+  (void)x0_q4;
+  (void)x_step_q4;
+  (void)y0_q4;
+  (void)y_step_q4;
+
+  __asm__ volatile(
+    "move       %[tmp1],    %[width]                  \n\t"
+    "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]      \n\t"
+    "li         %[tmp0],    0x10001                   \n\t"
+    MMI_MTC1(%[tmp0],    %[ftmp3])
+    "punpcklhw  %[ftmp3],   %[ftmp3],   %[ftmp3]      \n\t"
+    "1:                                               \n\t"
+    "gsldlc1    %[ftmp1],   0x07(%[src])              \n\t"
+    "gsldrc1    %[ftmp1],   0x00(%[src])              \n\t"
+    "gsldlc1    %[ftmp2],   0x07(%[dst])              \n\t"
+    "gsldrc1    %[ftmp2],   0x00(%[dst])              \n\t"
+    "punpcklbh  %[ftmp1],   %[ftmp1],   %[ftmp0]      \n\t"
+    "punpcklbh  %[ftmp2],   %[ftmp2],   %[ftmp0]      \n\t"
+    "paddh      %[ftmp1],   %[ftmp1],   %[ftmp2]      \n\t"
+    "paddh      %[ftmp1],   %[ftmp1],   %[ftmp3]      \n\t"
+    "psrah      %[ftmp1],   %[ftmp1],   %[ftmp3]      \n\t"
+    "packushb   %[ftmp1],   %[ftmp1],   %[ftmp0]      \n\t"
+    "swc1       %[ftmp1],   0x00(%[dst])              \n\t"
+    MMI_ADDIU(%[width],  %[width],   -0x04)
+    MMI_ADDIU(%[dst],    %[dst],     0x04)
+    MMI_ADDIU(%[src],    %[src],     0x04)
+    "bnez       %[width],   1b                        \n\t"
+    "move       %[width],   %[tmp1]                   \n\t"
+    MMI_ADDU(%[dst],     %[dst],     %[dst_stride])
+    MMI_ADDU(%[src],     %[src],     %[src_stride])
+    MMI_ADDIU(%[height], %[height],  -0x01)
+    "bnez       %[height],  1b                        \n\t"
+    : [ftmp0]"=&f"(ftmp[0]),  [ftmp1]"=&f"(ftmp[1]),
+      [ftmp2]"=&f"(ftmp[2]),  [ftmp3]"=&f"(ftmp[3]),
+      [tmp0]"=&r"(tmp[0]),    [tmp1]"=&r"(tmp[1]),
+      [src]"+&r"(src),        [dst]"+&r"(dst),
+      [width]"+&r"(w),        [height]"+&r"(h)
+    : [src_stride]"r"((mips_reg)src_stride),
+      [dst_stride]"r"((mips_reg)dst_stride)
+    : "memory"
+  );
+}
+
+static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride,
+                           uint8_t *dst, ptrdiff_t dst_stride,
+                           const InterpKernel *x_filters, int x0_q4,
+                           int x_step_q4, int w, int h) {
+  int x, y;
+  src -= SUBPEL_TAPS / 2 - 1;
+
+  for (y = 0; y < h; ++y) {
+    int x_q4 = x0_q4;
+    for (x = 0; x < w; ++x) {
+      const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
+      int k, sum = 0;
+      for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
+      dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+      x_q4 += x_step_q4;
+    }
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+static void convolve_vert(const uint8_t *src, ptrdiff_t src_stride,
+                          uint8_t *dst, ptrdiff_t dst_stride,
+                          const InterpKernel *y_filters, int y0_q4,
+                          int y_step_q4, int w, int h) {
+  int x, y;
+  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+
+  for (x = 0; x < w; ++x) {
+    int y_q4 = y0_q4;
+    for (y = 0; y < h; ++y) {
+      const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
+      int k, sum = 0;
+      for (k = 0; k < SUBPEL_TAPS; ++k)
+        sum += src_y[k * src_stride] * y_filter[k];
+      dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+      y_q4 += y_step_q4;
+    }
+    ++src;
+    ++dst;
+  }
+}
+
+static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride,
+                              uint8_t *dst, ptrdiff_t dst_stride,
+                              const InterpKernel *y_filters, int y0_q4,
+                              int y_step_q4, int w, int h) {
+  int x, y;
+  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+
+  for (x = 0; x < w; ++x) {
+    int y_q4 = y0_q4;
+    for (y = 0; y < h; ++y) {
+      const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
+      int k, sum = 0;
+      for (k = 0; k < SUBPEL_TAPS; ++k)
+        sum += src_y[k * src_stride] * y_filter[k];
+      dst[y * dst_stride] = ROUND_POWER_OF_TWO(
+          dst[y * dst_stride] +
+              clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)),
+          1);
+      y_q4 += y_step_q4;
+    }
+    ++src;
+    ++dst;
+  }
+}
+
+void vpx_convolve8_mmi(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+                       ptrdiff_t dst_stride, const InterpKernel *filter,
+                       int x0_q4, int32_t x_step_q4, int y0_q4,
+                       int32_t y_step_q4, int32_t w, int32_t h) {
+  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
+  // 2d filtering proceeds in 2 steps:
+  //   (1) Interpolate horizontally into an intermediate buffer, temp.
+  //   (2) Interpolate temp vertically to derive the sub-pixel result.
+  // Deriving the maximum number of rows in the temp buffer (135):
+  // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
+  // --Largest block size is 64x64 pixels.
+  // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
+  //   original frame (in 1/16th pixel units).
+  // --Must round-up because block may be located at sub-pixel position.
+  // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
+  // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
+  // When calling in frame scaling function, the smallest scaling factor is x1/4
+  // ==> y_step_q4 = 64. Since w and h are at most 16, the temp buffer is still
+  // big enough.
+  uint8_t temp[64 * 135];
+  const int intermediate_height =
+      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
+
+  assert(w <= 64);
+  assert(h <= 64);
+  assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32));
+  assert(x_step_q4 <= 64);
+
+  if (w & 0x03) {
+    convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp,
+                   64, filter, x0_q4, x_step_q4, w, intermediate_height);
+    convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride,
+                  filter, y0_q4, y_step_q4, w, h);
+  } else {
+    convolve_horiz_mmi(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride,
+                       temp, 64, filter, x0_q4, x_step_q4, w,
+                       intermediate_height);
+    convolve_vert_mmi(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride,
+                      filter, y0_q4, y_step_q4, w, h);
+  }
+}
+
+void vpx_convolve8_horiz_mmi(const uint8_t *src, ptrdiff_t src_stride,
+                             uint8_t *dst, ptrdiff_t dst_stride,
+                             const InterpKernel *filter, int x0_q4,
+                             int32_t x_step_q4, int y0_q4, int32_t y_step_q4,
+                             int32_t w, int32_t h) {
+  (void)y0_q4;
+  (void)y_step_q4;
+  if (w & 0x03)
+    convolve_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4,
+                   w, h);
+  else
+    convolve_horiz_mmi(src, src_stride, dst, dst_stride, filter, x0_q4,
+                       x_step_q4, w, h);
+}
+
+void vpx_convolve8_vert_mmi(const uint8_t *src, ptrdiff_t src_stride,
+                            uint8_t *dst, ptrdiff_t dst_stride,
+                            const InterpKernel *filter, int x0_q4,
+                            int32_t x_step_q4, int y0_q4, int y_step_q4, int w,
+                            int h) {
+  (void)x0_q4;
+  (void)x_step_q4;
+  if (w & 0x03)
+    convolve_vert(src, src_stride, dst, dst_stride, filter, y0_q4, y_step_q4, w,
+                  h);
+  else
+    convolve_vert_mmi(src, src_stride, dst, dst_stride, filter, y0_q4,
+                      y_step_q4, w, h);
+}
+
+void vpx_convolve8_avg_vert_mmi(const uint8_t *src, ptrdiff_t src_stride,
+                                uint8_t *dst, ptrdiff_t dst_stride,
+                                const InterpKernel *filter, int x0_q4,
+                                int32_t x_step_q4, int y0_q4, int y_step_q4,
+                                int w, int h) {
+  (void)x0_q4;
+  (void)x_step_q4;
+  if (w & 0x03)
+    convolve_avg_vert(src, src_stride, dst, dst_stride, filter, y0_q4,
+                      y_step_q4, w, h);
+  else
+    convolve_avg_vert_mmi(src, src_stride, dst, dst_stride, filter, y0_q4,
+                          y_step_q4, w, h);
+}
+
+void vpx_convolve8_avg_mmi(const uint8_t *src, ptrdiff_t src_stride,
+                           uint8_t *dst, ptrdiff_t dst_stride,
+                           const InterpKernel *filter, int x0_q4,
+                           int32_t x_step_q4, int y0_q4, int32_t y_step_q4,
+                           int32_t w, int32_t h) {
+  // Fixed size intermediate buffer places limits on parameters.
+  DECLARE_ALIGNED(16, uint8_t, temp[64 * 64]);
+  assert(w <= 64);
+  assert(h <= 64);
+
+  vpx_convolve8_mmi(src, src_stride, temp, 64, filter, x0_q4, x_step_q4, y0_q4,
+                    y_step_q4, w, h);
+  if (w & 0x03)
+    vpx_convolve_avg_c(temp, 64, dst, dst_stride, NULL, 0, 0, 0, 0, w, h);
+  else
+    vpx_convolve_avg_mmi(temp, 64, dst, dst_stride, NULL, 0, 0, 0, 0, w, h);
+}
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@ -134,6 +134,7 @@ DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve8_vert_msa.c
 DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve_avg_msa.c
 DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve_copy_msa.c
 DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve_msa.h
+DSP_SRCS-$(HAVE_MMI) += mips/vpx_convolve8_mmi.c

 # common (dspr2)
 DSP_SRCS-$(HAVE_DSPR2)  += mips/convolve_common_dspr2.h
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@ -366,22 +366,22 @@ add_proto qw/void vpx_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride,
 specialize qw/vpx_convolve_avg neon dspr2 msa sse2 vsx/;

 add_proto qw/void vpx_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve8 sse2 ssse3 avx2 neon dspr2 msa vsx/;
+specialize qw/vpx_convolve8 sse2 ssse3 avx2 neon dspr2 msa vsx mmi/;

 add_proto qw/void vpx_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve8_horiz sse2 ssse3 avx2 neon dspr2 msa vsx/;
+specialize qw/vpx_convolve8_horiz sse2 ssse3 avx2 neon dspr2 msa vsx mmi/;

 add_proto qw/void vpx_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve8_vert sse2 ssse3 avx2 neon dspr2 msa vsx/;
+specialize qw/vpx_convolve8_vert sse2 ssse3 avx2 neon dspr2 msa vsx mmi/;

 add_proto qw/void vpx_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve8_avg sse2 ssse3 avx2 neon dspr2 msa vsx/;
+specialize qw/vpx_convolve8_avg sse2 ssse3 avx2 neon dspr2 msa vsx mmi/;

 add_proto qw/void vpx_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
 specialize qw/vpx_convolve8_avg_horiz sse2 ssse3 avx2 neon dspr2 msa vsx/;

 add_proto qw/void vpx_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve8_avg_vert sse2 ssse3 avx2 neon dspr2 msa vsx/;
+specialize qw/vpx_convolve8_avg_vert sse2 ssse3 avx2 neon dspr2 msa vsx mmi/;

 add_proto qw/void vpx_scaled_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
 specialize qw/vpx_scaled_2d ssse3 neon msa/;