From 9966cc8d1269c8ac105b08d9332727bdb5c110bb Mon Sep 17 00:00:00 2001 From: Shiyou Yin Date: Wed, 29 Nov 2017 11:58:38 +0800 Subject: [PATCH 1/2] vp8: [loongson] optimize vp8_short_fdct4x4_mmi v2. Optimize the calculate process of a,b,c,d. Change-Id: I81717e47bc988ace1412d478513e7dd3cb6b0cc9 --- vp8/encoder/mips/mmi/dct_mmi.c | 135 ++++++++++++++++----------------- 1 file changed, 67 insertions(+), 68 deletions(-) diff --git a/vp8/encoder/mips/mmi/dct_mmi.c b/vp8/encoder/mips/mmi/dct_mmi.c index 7e45a1278..1f60a692d 100644 --- a/vp8/encoder/mips/mmi/dct_mmi.c +++ b/vp8/encoder/mips/mmi/dct_mmi.c @@ -13,9 +13,13 @@ #include "vpx_ports/asmdefs_mmi.h" /* clang-format off */ +/* TRANSPOSE_4H: transpose 4x4 matrix. + Input: ftmp1,ftmp2,ftmp3,ftmp4 + Output: ftmp1,ftmp2,ftmp3,ftmp4 + Note: ftmp0 always be 0, ftmp5~9 used for temporary value. + */ #define TRANSPOSE_4H \ MMI_LI(%[tmp0], 0x93) \ - "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" \ "mtc1 %[tmp0], %[ftmp10] \n\t" \ "punpcklhw %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \ "punpcklhw %[ftmp9], %[ftmp2], %[ftmp0] \n\t" \ @@ -40,8 +44,8 @@ /* clang-format on */ void vp8_short_fdct4x4_mmi(int16_t *input, int16_t *output, int pitch) { - int pitch_half = pitch / 2; uint64_t tmp[1]; + int16_t *ip = input; #if _MIPS_SIM == _ABIO32 register double ftmp0 asm("$f0"); @@ -81,52 +85,51 @@ void vp8_short_fdct4x4_mmi(int16_t *input, int16_t *output, int pitch) { DECLARE_ALIGNED(8, const uint64_t, ff_pw_7500) = { 0x00001d4c00001d4cULL }; DECLARE_ALIGNED(8, const uint64_t, ff_ph_op1) = { 0x14e808a914e808a9ULL }; DECLARE_ALIGNED(8, const uint64_t, ff_ph_op3) = { 0xeb1808a9eb1808a9ULL }; - - DECLARE_ALIGNED(16, int, a[4]); - DECLARE_ALIGNED(16, int, b[4]); - DECLARE_ALIGNED(16, int, c[4]); - DECLARE_ALIGNED(16, int, d[4]); - - // stage1 - a[0] = (input[0] + input[3]) * 8; - a[1] = (input[0 + pitch_half] + input[3 + pitch_half]) * 8; - a[2] = (input[0 + 2 * pitch_half] + input[3 + 2 * pitch_half]) * 8; - a[3] = (input[0 + 3 * pitch_half] + input[3 + 3 * pitch_half]) * 8; - - b[0] = (input[1] + input[2]) * 8; - b[1] = (input[1 + pitch_half] + input[2 + pitch_half]) * 8; - b[2] = (input[1 + 2 * pitch_half] + input[2 + 2 * pitch_half]) * 8; - b[3] = (input[1 + 3 * pitch_half] + input[2 + 3 * pitch_half]) * 8; - - c[0] = (input[1] - input[2]) * 8; - c[1] = (input[1 + pitch_half] - input[2 + pitch_half]) * 8; - c[2] = (input[1 + 2 * pitch_half] - input[2 + 2 * pitch_half]) * 8; - c[3] = (input[1 + 3 * pitch_half] - input[2 + 3 * pitch_half]) * 8; - - d[0] = (input[0] - input[3]) * 8; - d[1] = (input[0 + pitch_half] - input[3 + pitch_half]) * 8; - d[2] = (input[0 + 2 * pitch_half] - input[3 + 2 * pitch_half]) * 8; - d[3] = (input[0 + 3 * pitch_half] - input[3 + 3 * pitch_half]) * 8; + DECLARE_ALIGNED(8, const uint64_t, ff_pw_5352) = { 0x000014e8000014e8ULL }; + DECLARE_ALIGNED(8, const uint64_t, ff_pw_2217) = { 0x000008a9000008a9ULL }; + DECLARE_ALIGNED(8, const uint64_t, ff_ph_8) = { 0x0008000800080008ULL }; __asm__ volatile ( - "gslqc1 %[ftmp2], %[ftmp1], 0x00(%[a]) \n\t" - "gslqc1 %[ftmp4], %[ftmp3], 0x00(%[b]) \n\t" - "gslqc1 %[ftmp6], %[ftmp5], 0x00(%[c]) \n\t" - "gslqc1 %[ftmp8], %[ftmp7], 0x00(%[d]) \n\t" + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "gsldlc1 %[ftmp1], 0x07(%[ip]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[ip]) \n\t" + MMI_ADDU(%[ip], %[ip], %[pitch]) + "gsldlc1 %[ftmp2], 0x07(%[ip]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[ip]) \n\t" + MMI_ADDU(%[ip], %[ip], %[pitch]) + "gsldlc1 %[ftmp3], 0x07(%[ip]) \n\t" + "gsldrc1 %[ftmp3], 0x00(%[ip]) \n\t" + MMI_ADDU(%[ip], %[ip], 
%[pitch]) + "gsldlc1 %[ftmp4], 0x07(%[ip]) \n\t" + "gsldrc1 %[ftmp4], 0x00(%[ip]) \n\t" + MMI_ADDU(%[ip], %[ip], %[pitch]) + TRANSPOSE_4H - "paddw %[ftmp9], %[ftmp1], %[ftmp3] \n\t" - "paddw %[ftmp10], %[ftmp2], %[ftmp4] \n\t" - "psubw %[ftmp11], %[ftmp1], %[ftmp3] \n\t" - "psubw %[ftmp12], %[ftmp2], %[ftmp4] \n\t" - "packsswh %[ftmp1], %[ftmp9], %[ftmp10] \n\t" - "packsswh %[ftmp3], %[ftmp11], %[ftmp12] \n\t" - "packsswh %[ftmp2], %[ftmp5], %[ftmp6] \n\t" - "packsswh %[ftmp4], %[ftmp7], %[ftmp8] \n\t" + "ldc1 %[ftmp11], %[ff_ph_8] \n\t" + // f1 + f4 + "paddh %[ftmp5], %[ftmp1], %[ftmp4] \n\t" + // a1 + "pmullh %[ftmp5], %[ftmp5], %[ftmp11] \n\t" + // f2 + f3 + "paddh %[ftmp6], %[ftmp2], %[ftmp3] \n\t" + // b1 + "pmullh %[ftmp6], %[ftmp6], %[ftmp11] \n\t" + // f2 - f3 + "psubh %[ftmp7], %[ftmp2], %[ftmp3] \n\t" + // c1 + "pmullh %[ftmp7], %[ftmp7], %[ftmp11] \n\t" + // f1 - f4 + "psubh %[ftmp8], %[ftmp1], %[ftmp4] \n\t" + // d1 + "pmullh %[ftmp8], %[ftmp8], %[ftmp11] \n\t" + // op[0] = a1 + b1 + "paddh %[ftmp1], %[ftmp5], %[ftmp6] \n\t" + // op[2] = a1 - b1 + "psubh %[ftmp3], %[ftmp5], %[ftmp6] \n\t" + + // op[1] = (c1 * 2217 + d1 * 5352 + 14500) >> 12 MMI_LI(%[tmp0], 0x0c) - "mov.d %[ftmp7], %[ftmp2] \n\t" - "mov.d %[ftmp8], %[ftmp4] \n\t" "mtc1 %[tmp0], %[ftmp11] \n\t" - "ldc1 %[ftmp12], %[ff_pw_14500] \n\t" "punpcklhw %[ftmp9], %[ftmp7], %[ftmp8] \n\t" "pmaddhw %[ftmp5], %[ftmp9], %[ff_ph_op1] \n\t" @@ -138,6 +141,7 @@ void vp8_short_fdct4x4_mmi(int16_t *input, int16_t *output, int pitch) { "psraw %[ftmp6], %[ftmp6], %[ftmp11] \n\t" "packsswh %[ftmp2], %[ftmp5], %[ftmp6] \n\t" + // op[3] = (d1 * 2217 - c1 * 5352 + 7500) >> 12 "ldc1 %[ftmp12], %[ff_pw_7500] \n\t" "punpcklhw %[ftmp9], %[ftmp8], %[ftmp7] \n\t" "pmaddhw %[ftmp5], %[ftmp9], %[ff_ph_op3] \n\t" @@ -150,7 +154,6 @@ void vp8_short_fdct4x4_mmi(int16_t *input, int16_t *output, int pitch) { "packsswh %[ftmp4], %[ftmp5], %[ftmp6] \n\t" TRANSPOSE_4H - "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" "paddh %[ftmp5], %[ftmp1], %[ftmp4] \n\t" "paddh %[ftmp6], %[ftmp2], %[ftmp3] \n\t" "psubh %[ftmp7], %[ftmp2], %[ftmp3] \n\t" @@ -163,17 +166,16 @@ void vp8_short_fdct4x4_mmi(int16_t *input, int16_t *output, int pitch) { "paddh %[ftmp1], %[ftmp5], %[ftmp6] \n\t" "psubh %[ftmp2], %[ftmp5], %[ftmp6] \n\t" "ldc1 %[ftmp9], %[ff_ph_07] \n\t" - MMI_LI(%[tmp0], 0x04) "paddh %[ftmp1], %[ftmp1], %[ftmp9] \n\t" "paddh %[ftmp2], %[ftmp2], %[ftmp9] \n\t" + MMI_LI(%[tmp0], 0x04) "mtc1 %[tmp0], %[ftmp9] \n\t" "psrah %[ftmp1], %[ftmp1], %[ftmp9] \n\t" "psrah %[ftmp2], %[ftmp2], %[ftmp9] \n\t" MMI_LI(%[tmp0], 0x10) - "ldc1 %[ftmp12], %[ff_pw_12000] \n\t" "mtc1 %[tmp0], %[ftmp9] \n\t" - + "ldc1 %[ftmp12], %[ff_pw_12000] \n\t" "punpcklhw %[ftmp5], %[ftmp7], %[ftmp8] \n\t" "pmaddhw %[ftmp10], %[ftmp5], %[ff_ph_op1] \n\t" "punpckhhw %[ftmp5], %[ftmp7], %[ftmp8] \n\t" @@ -196,31 +198,28 @@ void vp8_short_fdct4x4_mmi(int16_t *input, int16_t *output, int pitch) { "psraw %[ftmp11], %[ftmp11], %[ftmp9] \n\t" "packsswh %[ftmp4], %[ftmp10], %[ftmp11] \n\t" + "gssdlc1 %[ftmp1], 0x07(%[output]) \n\t" + "gssdrc1 %[ftmp1], 0x00(%[output]) \n\t" + "gssdlc1 %[ftmp3], 0x0f(%[output]) \n\t" + "gssdrc1 %[ftmp3], 0x08(%[output]) \n\t" + "gssdlc1 %[ftmp2], 0x17(%[output]) \n\t" + "gssdrc1 %[ftmp2], 0x10(%[output]) \n\t" + "gssdlc1 %[ftmp4], 0x1f(%[output]) \n\t" + "gssdrc1 %[ftmp4], 0x18(%[output]) \n\t" + : [ftmp0] "=&f"(ftmp0), [ftmp1] "=&f"(ftmp1), [ftmp2] "=&f"(ftmp2), [ftmp3] "=&f"(ftmp3), [ftmp4] "=&f"(ftmp4), [ftmp5] "=&f"(ftmp5), [ftmp6] "=&f"(ftmp6), [ftmp7] "=&f"(ftmp7), 
[ftmp8] "=&f"(ftmp8), [ftmp9] "=&f"(ftmp9), [ftmp10] "=&f"(ftmp10), [ftmp11] "=&f"(ftmp11), - [ftmp12] "=&f"(ftmp12), [tmp0] "=&r"(tmp[0]) - : [ff_ph_01] "m"(ff_ph_01), [ff_ph_07] "m"(ff_ph_07), [a] "r"(a), - [b] "r"(b), [c] "r"(c), [d] "r"(d), [ff_ph_op1] "f"(ff_ph_op1), - [ff_ph_op3] "f"(ff_ph_op3), [ff_pw_14500] "m"(ff_pw_14500), - [ff_pw_7500] "m"(ff_pw_7500), [ff_pw_12000] "m"(ff_pw_12000), - [ff_pw_51000] "m"(ff_pw_51000) + [ftmp12] "=&f"(ftmp12), [tmp0] "=&r"(tmp[0]), [ip]"+&r"(ip) + : [ff_ph_01] "m"(ff_ph_01), [ff_ph_07] "m"(ff_ph_07), + [ff_ph_op1] "f"(ff_ph_op1), [ff_ph_op3] "f"(ff_ph_op3), + [ff_pw_14500] "m"(ff_pw_14500), [ff_pw_7500] "m"(ff_pw_7500), + [ff_pw_12000] "m"(ff_pw_12000), [ff_pw_51000] "m"(ff_pw_51000), + [ff_pw_5352]"m"(ff_pw_5352), [ff_pw_2217]"m"(ff_pw_2217), + [ff_ph_8]"m"(ff_ph_8), [pitch]"r"(pitch), [output] "r"(output) + : "memory" ); - - __asm__ volatile( - "gssdlc1 %[ftmp1], 0x07(%[output]) \n\t" - "gssdrc1 %[ftmp1], 0x00(%[output]) \n\t" - "gssdlc1 %[ftmp3], 0x0f(%[output]) \n\t" - "gssdrc1 %[ftmp3], 0x08(%[output]) \n\t" - "gssdlc1 %[ftmp2], 0x17(%[output]) \n\t" - "gssdrc1 %[ftmp2], 0x10(%[output]) \n\t" - "gssdlc1 %[ftmp4], 0x1f(%[output]) \n\t" - "gssdrc1 %[ftmp4], 0x18(%[output]) \n\t" - : - : [ftmp1] "f"(ftmp1), [ftmp2] "f"(ftmp2), [ftmp3] "f"(ftmp3), - [ftmp4] "f"(ftmp4), [output] "r"(output) - : "memory"); } void vp8_short_fdct8x4_mmi(int16_t *input, int16_t *output, int pitch) { @@ -277,7 +276,7 @@ void vp8_short_walsh4x4_mmi(int16_t *input, int16_t *output, int pitch) { "psubh %[ftmp4], %[ftmp5], %[ftmp6] \n\t" "pcmpeqh %[ftmp6], %[ftmp5], %[ftmp0] \n\t" - "paddh %[ftmp6], %[ftmp6], %[ff_ph_01] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ff_ph_01] \n\t" "paddh %[ftmp1], %[ftmp1], %[ftmp6] \n\t" TRANSPOSE_4H From d49bf26b1c076d167cc9488faaa7d766bd5120a3 Mon Sep 17 00:00:00 2001 From: Shiyou Yin Date: Wed, 29 Nov 2017 16:59:22 +0800 Subject: [PATCH 2/2] vp8: [loongson] optimize regular quantize v2. 1. Optimize the memset with mmi. 2. Optimize macro REGULAR_SELECT_EOB. 
Change-Id: Icd9c866b0e6aef08874b2f123e9b0e09919445ff --- vp8/encoder/mips/mmi/vp8_quantize_mmi.c | 77 ++++++++++++++++--------- 1 file changed, 51 insertions(+), 26 deletions(-) diff --git a/vp8/encoder/mips/mmi/vp8_quantize_mmi.c b/vp8/encoder/mips/mmi/vp8_quantize_mmi.c index 22b12bbab..3ccb196ff 100644 --- a/vp8/encoder/mips/mmi/vp8_quantize_mmi.c +++ b/vp8/encoder/mips/mmi/vp8_quantize_mmi.c @@ -18,13 +18,14 @@ z = coeff_ptr[rc]; \ sz = (z >> 31); \ x = (z ^ sz) - sz; \ - if (x >= (zbin_ptr[rc] + *(zbin_boost_ptr++) + zbin_oq_value)) { \ + zbin = zbin_ptr[rc] + *(zbin_boost_ptr++) + zbin_oq_value; \ + if (x >= zbin) { \ x += round_ptr[rc]; \ y = ((((x * quant_ptr[rc]) >> 16) + x) * quant_shift_ptr[rc]) >> 16; \ - x = (y ^ sz) - sz; \ - qcoeff_ptr[rc] = x; \ - dqcoeff_ptr[rc] = x * dequant_ptr[rc]; \ if (y) { \ + x = (y ^ sz) - sz; \ + qcoeff_ptr[rc] = x; \ + dqcoeff_ptr[rc] = x * dequant_ptr[rc]; \ eob = i; \ zbin_boost_ptr = b->zrun_zbin_boost; \ } \ @@ -198,8 +199,8 @@ void vp8_fast_quantize_b_mmi(BLOCK *b, BLOCKD *d) { } void vp8_regular_quantize_b_mmi(BLOCK *b, BLOCKD *d) { - int eob; - int x, y, z, sz; + int eob = 0; + int x, y, z, sz, zbin; const int16_t *zbin_boost_ptr = b->zrun_zbin_boost; const int16_t *coeff_ptr = b->coeff; const int16_t *zbin_ptr = b->zbin; @@ -210,28 +211,52 @@ void vp8_regular_quantize_b_mmi(BLOCK *b, BLOCKD *d) { int16_t *dqcoeff_ptr = d->dqcoeff; const int16_t *dequant_ptr = d->dequant; const int16_t zbin_oq_value = b->zbin_extra; + register double ftmp0 asm("$f0"); - memset(qcoeff_ptr, 0, 32); - memset(dqcoeff_ptr, 0, 32); + // memset(qcoeff_ptr, 0, 32); + // memset(dqcoeff_ptr, 0, 32); + /* clang-format off */ + __asm__ volatile ( + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "gssdlc1 %[ftmp0], 0x07(%[qcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x00(%[qcoeff_ptr]) \n\t" + "gssdlc1 %[ftmp0], 0x0f(%[qcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x08(%[qcoeff_ptr]) \n\t" + "gssdlc1 %[ftmp0], 0x17(%[qcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x10(%[qcoeff_ptr]) \n\t" + "gssdlc1 %[ftmp0], 0x1f(%[qcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x18(%[qcoeff_ptr]) \n\t" - eob = -1; + "gssdlc1 %[ftmp0], 0x07(%[dqcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x00(%[dqcoeff_ptr]) \n\t" + "gssdlc1 %[ftmp0], 0x0f(%[dqcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x08(%[dqcoeff_ptr]) \n\t" + "gssdlc1 %[ftmp0], 0x17(%[dqcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x10(%[dqcoeff_ptr]) \n\t" + "gssdlc1 %[ftmp0], 0x1f(%[dqcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x18(%[dqcoeff_ptr]) \n\t" + : [ftmp0]"=&f"(ftmp0) + : [qcoeff_ptr]"r"(qcoeff_ptr), [dqcoeff_ptr]"r"(dqcoeff_ptr) + : "memory" + ); + /* clang-format on */ - REGULAR_SELECT_EOB(0, 0); - REGULAR_SELECT_EOB(1, 1); - REGULAR_SELECT_EOB(2, 4); - REGULAR_SELECT_EOB(3, 8); - REGULAR_SELECT_EOB(4, 5); - REGULAR_SELECT_EOB(5, 2); - REGULAR_SELECT_EOB(6, 3); - REGULAR_SELECT_EOB(7, 6); - REGULAR_SELECT_EOB(8, 9); - REGULAR_SELECT_EOB(9, 12); - REGULAR_SELECT_EOB(10, 13); - REGULAR_SELECT_EOB(11, 10); - REGULAR_SELECT_EOB(12, 7); - REGULAR_SELECT_EOB(13, 11); - REGULAR_SELECT_EOB(14, 14); - REGULAR_SELECT_EOB(15, 15); + REGULAR_SELECT_EOB(1, 0); + REGULAR_SELECT_EOB(2, 1); + REGULAR_SELECT_EOB(3, 4); + REGULAR_SELECT_EOB(4, 8); + REGULAR_SELECT_EOB(5, 5); + REGULAR_SELECT_EOB(6, 2); + REGULAR_SELECT_EOB(7, 3); + REGULAR_SELECT_EOB(8, 6); + REGULAR_SELECT_EOB(9, 9); + REGULAR_SELECT_EOB(10, 12); + REGULAR_SELECT_EOB(11, 13); + REGULAR_SELECT_EOB(12, 10); + REGULAR_SELECT_EOB(13, 7); + REGULAR_SELECT_EOB(14, 11); + REGULAR_SELECT_EOB(15, 14); + 
REGULAR_SELECT_EOB(16, 15); - *d->eob = (char)(eob + 1); + *d->eob = (char)eob; }
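
Note on the eob handling in patch 2: the original code initialized eob to -1, ran REGULAR_SELECT_EOB with i = 0..15, and stored eob + 1; the patched code initializes eob to 0, runs i = 1..16, and stores eob directly, which yields the same value in every case. Moving the qcoeff/dqcoeff stores inside the if (y) test is also safe because both buffers are cleared (now via the MMI zero stores instead of memset) before the scan, so zero-quantized positions no longer need an explicit store.

For cross-checking the constants and shifts used in patch 1 (ff_ph_8, ff_pw_14500, ff_pw_7500, ff_ph_07, ff_ph_01, ff_pw_12000, ff_pw_51000, and the 12/4/16-bit shift amounts), the transform the MMI code implements is the scalar VP8 4x4 forward DCT, i.e. vp8_short_fdct4x4_c in vp8/encoder/dct.c. The condensed C sketch below is reference material only and is not part of either patch; the helper name short_fdct4x4_ref is illustrative.

  #include <stdint.h>

  /* Reference-only sketch of the 4x4 forward DCT that the MMI code
     vectorizes; pitch is in bytes, as in the MMI version. */
  void short_fdct4x4_ref(const int16_t *input, int16_t *output, int pitch) {
    int i, a1, b1, c1, d1;
    const int16_t *ip = input;
    int16_t *op = output;

    /* Pass 1: rows. a1/b1/c1/d1 match the ff_ph_8-scaled values in the asm. */
    for (i = 0; i < 4; ++i) {
      a1 = (ip[0] + ip[3]) * 8;
      b1 = (ip[1] + ip[2]) * 8;
      c1 = (ip[1] - ip[2]) * 8;
      d1 = (ip[0] - ip[3]) * 8;

      op[0] = a1 + b1;
      op[2] = a1 - b1;
      op[1] = (c1 * 2217 + d1 * 5352 + 14500) >> 12;  /* ff_ph_op1, ff_pw_14500 */
      op[3] = (d1 * 2217 - c1 * 5352 + 7500) >> 12;   /* ff_ph_op3, ff_pw_7500  */

      ip += pitch / 2;  /* pitch is in bytes, ip points to int16_t */
      op += 4;
    }

    /* Pass 2: columns of the intermediate result, in place. */
    op = output;
    for (i = 0; i < 4; ++i) {
      a1 = op[0] + op[12];
      b1 = op[4] + op[8];
      c1 = op[4] - op[8];
      d1 = op[0] - op[12];

      op[0] = (a1 + b1 + 7) >> 4;                                  /* ff_ph_07 */
      op[8] = (a1 - b1 + 7) >> 4;
      op[4] = ((c1 * 2217 + d1 * 5352 + 12000) >> 16) + (d1 != 0); /* ff_ph_01 */
      op[12] = (d1 * 2217 - c1 * 5352 + 51000) >> 16;

      op++;
    }
  }

The (d1 != 0) term corresponds to the pcmpeqh/ff_ph_01 sequence in the asm: pcmpeqh produces -1 where d1 == 0 and 0 elsewhere, and adding ff_ph_01 turns that into the 0/1 correction added to op[4].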