VPX: removed step checks from mips convolve code

The check is handled by the predictor table.

Change-Id: I5e5084ebb46be8087c8c9d80b5f76e919a1cd05b
2015-08-13 11:27:04 -07:00
parent 1aa84e03fd
commit aeea00cc4f
15 changed files with 337 additions and 420 deletions
--- a/vpx_dsp/mips/convolve2_avg_dspr2.c
+++ b/vpx_dsp/mips/convolve2_avg_dspr2.c
@@ -233,47 +233,41 @@ void vpx_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                                  const int16_t *filter_x, int x_step_q4,
                                  const int16_t *filter_y, int y_step_q4,
                                  int w, int h) {
-  if (16 == y_step_q4) {
+  uint32_t pos = 38;
    uint32_t pos = 38;
-    /* bit positon for extract from acc */
+  assert(y_step_q4 == 16);
    __asm__ __volatile__ (
      "wrdsp      %[pos],     1           \n\t"
      :
      : [pos] "r" (pos)
    );
-    prefetch_store(dst);
+  /* bit positon for extract from acc */
  __asm__ __volatile__ (
    "wrdsp      %[pos],     1           \n\t"
    :
    : [pos] "r" (pos)
  );
-    switch (w) {
+  prefetch_store(dst);
-      case 4:
+
-      case 8:
+  switch (w) {
-      case 16:
+    case 4:
-      case 32:
+    case 8:
-        convolve_bi_avg_vert_4_dspr2(src, src_stride,
+    case 16:
-                                     dst, dst_stride,
+    case 32:
-                                     filter_y, w, h);
+      convolve_bi_avg_vert_4_dspr2(src, src_stride,
-        break;
+                                   dst, dst_stride,
-      case 64:
+                                   filter_y, w, h);
-        prefetch_store(dst + 32);
+      break;
-        convolve_bi_avg_vert_64_dspr2(src, src_stride,
+    case 64:
-                                      dst, dst_stride,
+      prefetch_store(dst + 32);
-                                      filter_y, h);
+      convolve_bi_avg_vert_64_dspr2(src, src_stride,
-        break;
+                                    dst, dst_stride,
-      default:
+                                    filter_y, h);
-        vpx_convolve8_avg_vert_c(src, src_stride,
+      break;
-                                 dst, dst_stride,
+    default:
-                                 filter_x, x_step_q4,
+      vpx_convolve8_avg_vert_c(src, src_stride,
-                                 filter_y, y_step_q4,
+                               dst, dst_stride,
-                                 w, h);
+                               filter_x, x_step_q4,
-        break;
+                               filter_y, y_step_q4,
-    }
+                               w, h);
-  } else {
+      break;
    vpx_convolve8_avg_vert_c(src, src_stride,
                             dst, dst_stride,
                             filter_x, x_step_q4,
                             filter_y, y_step_q4,
                             w, h);
  }
 }
 #endif
--- a/vpx_dsp/mips/convolve2_avg_horiz_dspr2.c
+++ b/vpx_dsp/mips/convolve2_avg_horiz_dspr2.c
@@ -768,64 +768,58 @@ void vpx_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                                   const int16_t *filter_x, int x_step_q4,
                                   const int16_t *filter_y, int y_step_q4,
                                   int w, int h) {
-  if (16 == x_step_q4) {
+  uint32_t pos = 38;
    uint32_t pos = 38;
-    /* bit positon for extract from acc */
+  assert(x_step_q4 == 16);
    __asm__ __volatile__ (
      "wrdsp      %[pos],     1           \n\t"
      :
      : [pos] "r" (pos)
    );
-    /* prefetch data to cache memory */
+  /* bit positon for extract from acc */
-    prefetch_load(src);
+  __asm__ __volatile__ (
-    prefetch_load(src + 32);
+    "wrdsp      %[pos],     1           \n\t"
-    prefetch_store(dst);
+    :
    : [pos] "r" (pos)
  );
-    switch (w) {
+  /* prefetch data to cache memory */
-      case 4:
+  prefetch_load(src);
-        convolve_bi_avg_horiz_4_dspr2(src, src_stride,
+  prefetch_load(src + 32);
-                                     dst, dst_stride,
+  prefetch_store(dst);
                                     filter_x, h);
        break;
      case 8:
        convolve_bi_avg_horiz_8_dspr2(src, src_stride,
                                     dst, dst_stride,
                                     filter_x, h);
        break;
      case 16:
        convolve_bi_avg_horiz_16_dspr2(src, src_stride,
                                      dst, dst_stride,
                                      filter_x, h, 1);
        break;
      case 32:
        convolve_bi_avg_horiz_16_dspr2(src, src_stride,
                                      dst, dst_stride,
                                      filter_x, h, 2);
        break;
      case 64:
        prefetch_load(src + 64);
        prefetch_store(dst + 32);
-        convolve_bi_avg_horiz_64_dspr2(src, src_stride,
+  switch (w) {
-                                      dst, dst_stride,
+    case 4:
-                                      filter_x, h);
+      convolve_bi_avg_horiz_4_dspr2(src, src_stride,
-        break;
+                                   dst, dst_stride,
-      default:
+                                   filter_x, h);
-        vpx_convolve8_avg_horiz_c(src, src_stride,
+      break;
-                                  dst, dst_stride,
+    case 8:
-                                  filter_x, x_step_q4,
+      convolve_bi_avg_horiz_8_dspr2(src, src_stride,
-                                  filter_y, y_step_q4,
+                                   dst, dst_stride,
-                                  w, h);
+                                   filter_x, h);
-        break;
+      break;
-    }
+    case 16:
-  } else {
+      convolve_bi_avg_horiz_16_dspr2(src, src_stride,
-    vpx_convolve8_avg_horiz_c(src, src_stride,
+                                    dst, dst_stride,
-                              dst, dst_stride,
+                                    filter_x, h, 1);
-                              filter_x, x_step_q4,
+      break;
-                              filter_y, y_step_q4,
+    case 32:
-                              w, h);
+      convolve_bi_avg_horiz_16_dspr2(src, src_stride,
                                    dst, dst_stride,
                                    filter_x, h, 2);
      break;
    case 64:
      prefetch_load(src + 64);
      prefetch_store(dst + 32);
      convolve_bi_avg_horiz_64_dspr2(src, src_stride,
                                    dst, dst_stride,
                                    filter_x, h);
      break;
    default:
      vpx_convolve8_avg_horiz_c(src, src_stride,
                                dst, dst_stride,
                                filter_x, x_step_q4,
                                filter_y, y_step_q4,
                                w, h);
      break;
  }
 }
 #endif
--- a/vpx_dsp/mips/convolve2_horiz_dspr2.c
+++ b/vpx_dsp/mips/convolve2_horiz_dspr2.c
@@ -646,66 +646,60 @@ void vpx_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                               const int16_t *filter_x, int x_step_q4,
                               const int16_t *filter_y, int y_step_q4,
                               int w, int h) {
-  if (16 == x_step_q4) {
+  uint32_t pos = 38;
    uint32_t pos = 38;
-    prefetch_load((const uint8_t *)filter_x);
+  assert(x_step_q4 == 16);
-    /* bit positon for extract from acc */
+  prefetch_load((const uint8_t *)filter_x);
    __asm__ __volatile__ (
      "wrdsp      %[pos],     1           \n\t"
      :
      : [pos] "r" (pos)
    );
-    /* prefetch data to cache memory */
+  /* bit positon for extract from acc */
-    prefetch_load(src);
+  __asm__ __volatile__ (
-    prefetch_load(src + 32);
+    "wrdsp      %[pos],     1           \n\t"
-    prefetch_store(dst);
+    :
    : [pos] "r" (pos)
  );
-    switch (w) {
+  /* prefetch data to cache memory */
-      case 4:
+  prefetch_load(src);
-        convolve_bi_horiz_4_dspr2(src, (int32_t)src_stride,
+  prefetch_load(src + 32);
-                                  dst, (int32_t)dst_stride,
+  prefetch_store(dst);
                                  filter_x, (int32_t)h);
        break;
      case 8:
        convolve_bi_horiz_8_dspr2(src, (int32_t)src_stride,
                                  dst, (int32_t)dst_stride,
                                  filter_x, (int32_t)h);
        break;
      case 16:
        convolve_bi_horiz_16_dspr2(src, (int32_t)src_stride,
                                   dst, (int32_t)dst_stride,
                                   filter_x, (int32_t)h, 1);
        break;
      case 32:
        convolve_bi_horiz_16_dspr2(src, (int32_t)src_stride,
                                   dst, (int32_t)dst_stride,
                                   filter_x, (int32_t)h, 2);
        break;
      case 64:
        prefetch_load(src + 64);
        prefetch_store(dst + 32);
-        convolve_bi_horiz_64_dspr2(src, (int32_t)src_stride,
+  switch (w) {
-                                   dst, (int32_t)dst_stride,
+    case 4:
-                                   filter_x, (int32_t)h);
+      convolve_bi_horiz_4_dspr2(src, (int32_t)src_stride,
-        break;
+                                dst, (int32_t)dst_stride,
-      default:
+                                filter_x, (int32_t)h);
-        vpx_convolve8_horiz_c(src, src_stride,
+      break;
-                              dst, dst_stride,
+    case 8:
-                              filter_x, x_step_q4,
+      convolve_bi_horiz_8_dspr2(src, (int32_t)src_stride,
-                              filter_y, y_step_q4,
+                                dst, (int32_t)dst_stride,
-                              w, h);
+                                filter_x, (int32_t)h);
-        break;
+      break;
-    }
+    case 16:
-  } else {
+      convolve_bi_horiz_16_dspr2(src, (int32_t)src_stride,
-    vpx_convolve8_horiz_c(src, src_stride,
+                                 dst, (int32_t)dst_stride,
-                          dst, dst_stride,
+                                 filter_x, (int32_t)h, 1);
-                          filter_x, x_step_q4,
+      break;
-                          filter_y, y_step_q4,
+    case 32:
-                          w, h);
+      convolve_bi_horiz_16_dspr2(src, (int32_t)src_stride,
                                 dst, (int32_t)dst_stride,
                                 filter_x, (int32_t)h, 2);
      break;
    case 64:
      prefetch_load(src + 64);
      prefetch_store(dst + 32);
      convolve_bi_horiz_64_dspr2(src, (int32_t)src_stride,
                                 dst, (int32_t)dst_stride,
                                 filter_x, (int32_t)h);
      break;
    default:
      vpx_convolve8_horiz_c(src, src_stride,
                            dst, dst_stride,
                            filter_x, x_step_q4,
                            filter_y, y_step_q4,
                            w, h);
      break;
  }
 }
 #endif
--- a/vpx_dsp/mips/convolve2_vert_dspr2.c
+++ b/vpx_dsp/mips/convolve2_vert_dspr2.c
@@ -218,47 +218,41 @@ void vpx_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                              const int16_t *filter_x, int x_step_q4,
                              const int16_t *filter_y, int y_step_q4,
                              int w, int h) {
-  if (16 == y_step_q4) {
+  uint32_t pos = 38;
    uint32_t pos = 38;
-    /* bit positon for extract from acc */
+  assert(y_step_q4 == 16);
    __asm__ __volatile__ (
      "wrdsp      %[pos],     1           \n\t"
      :
      : [pos] "r" (pos)
    );
-    prefetch_store(dst);
+  /* bit positon for extract from acc */
  __asm__ __volatile__ (
    "wrdsp      %[pos],     1           \n\t"
    :
    : [pos] "r" (pos)
  );
-    switch (w) {
+  prefetch_store(dst);
-      case 4 :
+
-      case 8 :
+  switch (w) {
-      case 16 :
+    case 4 :
-      case 32 :
+    case 8 :
-        convolve_bi_vert_4_dspr2(src, src_stride,
+    case 16 :
-                                 dst, dst_stride,
+    case 32 :
-                                 filter_y, w, h);
+      convolve_bi_vert_4_dspr2(src, src_stride,
-        break;
+                               dst, dst_stride,
-      case 64 :
+                               filter_y, w, h);
-        prefetch_store(dst + 32);
+      break;
-        convolve_bi_vert_64_dspr2(src, src_stride,
+    case 64 :
-                                  dst, dst_stride,
+      prefetch_store(dst + 32);
-                                  filter_y, h);
+      convolve_bi_vert_64_dspr2(src, src_stride,
-        break;
+                                dst, dst_stride,
-      default:
+                                filter_y, h);
-        vpx_convolve8_vert_c(src, src_stride,
+      break;
-                             dst, dst_stride,
+    default:
-                             filter_x, x_step_q4,
+      vpx_convolve8_vert_c(src, src_stride,
-                             filter_y, y_step_q4,
+                           dst, dst_stride,
-                             w, h);
+                           filter_x, x_step_q4,
-        break;
+                           filter_y, y_step_q4,
-    }
+                           w, h);
-  } else {
+      break;
    vpx_convolve8_vert_c(src, src_stride,
                         dst, dst_stride,
                         filter_x, x_step_q4,
                         filter_y, y_step_q4,
                         w, h);
  }
 }
 #endif
--- a/vpx_dsp/mips/convolve8_avg_dspr2.c
+++ b/vpx_dsp/mips/convolve8_avg_dspr2.c
@@ -347,6 +347,7 @@ void vpx_convolve8_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                                  const int16_t *filter_x, int x_step_q4,
                                  const int16_t *filter_y, int y_step_q4,
                                  int w, int h) {
  assert(y_step_q4 == 16);
  if (((const int32_t *)filter_y)[1] == 0x800000) {
    vpx_convolve_avg(src, src_stride,
                     dst, dst_stride,
@@ -360,47 +361,39 @@ void vpx_convolve8_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                                 filter_y, y_step_q4,
                                 w, h);
  } else {
-    if (16 == y_step_q4) {
+    uint32_t pos = 38;
      uint32_t pos = 38;
-      /* bit positon for extract from acc */
+    /* bit positon for extract from acc */
-      __asm__ __volatile__ (
+    __asm__ __volatile__ (
-        "wrdsp      %[pos],     1           \n\t"
+      "wrdsp      %[pos],     1           \n\t"
-        :
+      :
-        : [pos] "r" (pos)
+      : [pos] "r" (pos)
-      );
+    );
-      prefetch_store(dst);
+    prefetch_store(dst);
-      switch (w) {
+    switch (w) {
-        case 4:
+      case 4:
-        case 8:
+      case 8:
-        case 16:
+      case 16:
-        case 32:
+      case 32:
-          convolve_avg_vert_4_dspr2(src, src_stride,
+        convolve_avg_vert_4_dspr2(src, src_stride,
-                                    dst, dst_stride,
+                                  dst, dst_stride,
-                                    filter_y, w, h);
+                                  filter_y, w, h);
-          break;
+        break;
-        case 64:
+      case 64:
-          prefetch_store(dst + 32);
+        prefetch_store(dst + 32);
-          convolve_avg_vert_64_dspr2(src, src_stride,
+        convolve_avg_vert_64_dspr2(src, src_stride,
                                     dst, dst_stride,
                                     filter_y, h);
          break;
        default:
          vpx_convolve8_avg_vert_c(src, src_stride,
                                   dst, dst_stride,
-                                   filter_x, x_step_q4,
+                                   filter_y, h);
-                                   filter_y, y_step_q4,
+        break;
-                                   w, h);
+      default:
-          break;
+        vpx_convolve8_avg_vert_c(src, src_stride,
-      }
+                                 dst, dst_stride,
-    } else {
+                                 filter_x, x_step_q4,
-      vpx_convolve8_avg_vert_c(src, src_stride,
+                                 filter_y, y_step_q4,
-                               dst, dst_stride,
+                                 w, h);
-                               filter_x, x_step_q4,
+        break;
                               filter_y, y_step_q4,
                               w, h);
    }
  }
 }
@@ -416,17 +409,12 @@ void vpx_convolve8_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
  assert(w <= 64);
  assert(h <= 64);
  assert(x_step_q4 == 16);
  assert(y_step_q4 == 16);
  if (intermediate_height < h)
    intermediate_height = h;
  if (x_step_q4 != 16 || y_step_q4 != 16)
    return vpx_convolve8_avg_c(src, src_stride,
                               dst, dst_stride,
                               filter_x, x_step_q4,
                               filter_y, y_step_q4,
                               w, h);
  vpx_convolve8_horiz(src - (src_stride * 3), src_stride,
                      temp, 64,
                      filter_x, x_step_q4,
--- a/vpx_dsp/mips/convolve8_avg_horiz_dspr2.c
+++ b/vpx_dsp/mips/convolve8_avg_horiz_dspr2.c
@@ -957,6 +957,7 @@ void vpx_convolve8_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                                   const int16_t *filter_x, int x_step_q4,
                                   const int16_t *filter_y, int y_step_q4,
                                   int w, int h) {
  assert(x_step_q4 == 16);
  if (((const int32_t *)filter_x)[1] == 0x800000) {
    vpx_convolve_avg(src, src_stride,
                     dst, dst_stride,
@@ -970,66 +971,58 @@ void vpx_convolve8_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                                  filter_y, y_step_q4,
                                  w, h);
  } else {
-    if (16 == x_step_q4) {
+    uint32_t pos = 38;
      uint32_t pos = 38;
-      src -= 3;
+    src -= 3;
-      /* bit positon for extract from acc */
+    /* bit positon for extract from acc */
-      __asm__ __volatile__ (
+    __asm__ __volatile__ (
-        "wrdsp      %[pos],     1           \n\t"
+      "wrdsp      %[pos],     1           \n\t"
-        :
+      :
-        : [pos] "r" (pos)
+      : [pos] "r" (pos)
-      );
+    );
-      /* prefetch data to cache memory */
+    /* prefetch data to cache memory */
-      prefetch_load(src);
+    prefetch_load(src);
-      prefetch_load(src + 32);
+    prefetch_load(src + 32);
-      prefetch_store(dst);
+    prefetch_store(dst);
-      switch (w) {
+    switch (w) {
-        case 4:
+      case 4:
-          convolve_avg_horiz_4_dspr2(src, src_stride,
+        convolve_avg_horiz_4_dspr2(src, src_stride,
-                                     dst, dst_stride,
+                                   dst, dst_stride,
-                                     filter_x, h);
+                                   filter_x, h);
-          break;
+        break;
-        case 8:
+      case 8:
-          convolve_avg_horiz_8_dspr2(src, src_stride,
+        convolve_avg_horiz_8_dspr2(src, src_stride,
-                                     dst, dst_stride,
+                                   dst, dst_stride,
-                                     filter_x, h);
+                                   filter_x, h);
-          break;
+        break;
-        case 16:
+      case 16:
-          convolve_avg_horiz_16_dspr2(src, src_stride,
+        convolve_avg_horiz_16_dspr2(src, src_stride,
                                      dst, dst_stride,
                                      filter_x, h, 1);
          break;
        case 32:
          convolve_avg_horiz_16_dspr2(src, src_stride,
                                      dst, dst_stride,
                                      filter_x, h, 2);
          break;
        case 64:
          prefetch_load(src + 64);
          prefetch_store(dst + 32);
          convolve_avg_horiz_64_dspr2(src, src_stride,
                                      dst, dst_stride,
                                      filter_x, h);
          break;
        default:
          vpx_convolve8_avg_horiz_c(src + 3, src_stride,
                                    dst, dst_stride,
-                                    filter_x, x_step_q4,
+                                    filter_x, h, 1);
-                                    filter_y, y_step_q4,
+        break;
-                                    w, h);
+      case 32:
-          break;
+        convolve_avg_horiz_16_dspr2(src, src_stride,
-      }
+                                    dst, dst_stride,
-    } else {
+                                    filter_x, h, 2);
-      vpx_convolve8_avg_horiz_c(src, src_stride,
+        break;
-                                dst, dst_stride,
+      case 64:
-                                filter_x, x_step_q4,
+        prefetch_load(src + 64);
-                                filter_y, y_step_q4,
+        prefetch_store(dst + 32);
-                                w, h);
+
        convolve_avg_horiz_64_dspr2(src, src_stride,
                                    dst, dst_stride,
                                    filter_x, h);
        break;
      default:
        vpx_convolve8_avg_horiz_c(src + 3, src_stride,
                                  dst, dst_stride,
                                  filter_x, x_step_q4,
                                  filter_y, y_step_q4,
                                  w, h);
        break;
    }
  }
 }
--- a/vpx_dsp/mips/convolve8_dspr2.c
+++ b/vpx_dsp/mips/convolve8_dspr2.c
@@ -936,6 +936,9 @@ void vpx_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride,
  int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7;
  uint32_t pos = 38;
  assert(x_step_q4 == 16);
  assert(y_step_q4 == 16);
  /* bit positon for extract from acc */
  __asm__ __volatile__ (
    "wrdsp      %[pos],     1           \n\t"
@@ -946,13 +949,6 @@ void vpx_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride,
  if (intermediate_height < h)
    intermediate_height = h;
  if (x_step_q4 != 16 || y_step_q4 != 16)
    return vpx_convolve8_c(src, src_stride,
                           dst, dst_stride,
                           filter_x, x_step_q4,
                           filter_y, y_step_q4,
                           w, h);
  if ((((const int32_t *)filter_x)[1] == 0x800000)
      && (((const int32_t *)filter_y)[1] == 0x800000))
    return vpx_convolve_copy(src, src_stride,
--- a/vpx_dsp/mips/convolve8_horiz_dspr2.c
+++ b/vpx_dsp/mips/convolve8_horiz_dspr2.c
@@ -841,6 +841,7 @@ void vpx_convolve8_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                               const int16_t *filter_x, int x_step_q4,
                               const int16_t *filter_y, int y_step_q4,
                               int w, int h) {
  assert(x_step_q4 == 16);
  if (((const int32_t *)filter_x)[1] == 0x800000) {
    vpx_convolve_copy(src, src_stride,
                      dst, dst_stride,
@@ -854,67 +855,59 @@ void vpx_convolve8_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                              filter_y, y_step_q4,
                              w, h);
  } else {
-    if (16 == x_step_q4) {
+    uint32_t pos = 38;
      uint32_t pos = 38;
-      prefetch_load((const uint8_t *)filter_x);
+    prefetch_load((const uint8_t *)filter_x);
-      src -= 3;
+    src -= 3;
-      /* bit positon for extract from acc */
+    /* bit positon for extract from acc */
-      __asm__ __volatile__ (
+    __asm__ __volatile__ (
-        "wrdsp      %[pos],     1           \n\t"
+      "wrdsp      %[pos],     1           \n\t"
-        :
+      :
-        : [pos] "r" (pos)
+      : [pos] "r" (pos)
-      );
+    );
-      /* prefetch data to cache memory */
+    /* prefetch data to cache memory */
-      prefetch_load(src);
+    prefetch_load(src);
-      prefetch_load(src + 32);
+    prefetch_load(src + 32);
-      prefetch_store(dst);
+    prefetch_store(dst);
-      switch (w) {
+    switch (w) {
-        case 4:
+      case 4:
-          convolve_horiz_4_dspr2(src, (int32_t)src_stride,
+        convolve_horiz_4_dspr2(src, (int32_t)src_stride,
-                                 dst, (int32_t)dst_stride,
+                               dst, (int32_t)dst_stride,
-                                 filter_x, (int32_t)h);
+                               filter_x, (int32_t)h);
-          break;
+        break;
-        case 8:
+      case 8:
-          convolve_horiz_8_dspr2(src, (int32_t)src_stride,
+        convolve_horiz_8_dspr2(src, (int32_t)src_stride,
-                                 dst, (int32_t)dst_stride,
+                               dst, (int32_t)dst_stride,
-                                 filter_x, (int32_t)h);
+                               filter_x, (int32_t)h);
-          break;
+        break;
-        case 16:
+      case 16:
-          convolve_horiz_16_dspr2(src, (int32_t)src_stride,
+        convolve_horiz_16_dspr2(src, (int32_t)src_stride,
-                                  dst, (int32_t)dst_stride,
+                                dst, (int32_t)dst_stride,
-                                  filter_x, (int32_t)h, 1);
+                                filter_x, (int32_t)h, 1);
-          break;
+        break;
-        case 32:
+      case 32:
-          convolve_horiz_16_dspr2(src, (int32_t)src_stride,
+        convolve_horiz_16_dspr2(src, (int32_t)src_stride,
-                                  dst, (int32_t)dst_stride,
+                                dst, (int32_t)dst_stride,
-                                  filter_x, (int32_t)h, 2);
+                                filter_x, (int32_t)h, 2);
-          break;
+        break;
-        case 64:
+      case 64:
-          prefetch_load(src + 64);
+        prefetch_load(src + 64);
-          prefetch_store(dst + 32);
+        prefetch_store(dst + 32);
-          convolve_horiz_64_dspr2(src, (int32_t)src_stride,
+        convolve_horiz_64_dspr2(src, (int32_t)src_stride,
-                                  dst, (int32_t)dst_stride,
+                                dst, (int32_t)dst_stride,
-                                  filter_x, (int32_t)h);
+                                filter_x, (int32_t)h);
-          break;
+        break;
-        default:
+      default:
-          vpx_convolve8_horiz_c(src + 3, src_stride,
+        vpx_convolve8_horiz_c(src + 3, src_stride,
-                                dst, dst_stride,
+                              dst, dst_stride,
-                                filter_x, x_step_q4,
+                              filter_x, x_step_q4,
-                                filter_y, y_step_q4,
+                              filter_y, y_step_q4,
-                                w, h);
+                              w, h);
-          break;
+        break;
      }
    } else {
      vpx_convolve8_horiz_c(src, src_stride,
                            dst, dst_stride,
                            filter_x, x_step_q4,
                            filter_y, y_step_q4,
                            w, h);
    }
  }
 }
--- a/vpx_dsp/mips/convolve8_vert_dspr2.c
+++ b/vpx_dsp/mips/convolve8_vert_dspr2.c
@@ -333,6 +333,7 @@ void vpx_convolve8_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                              const int16_t *filter_x, int x_step_q4,
                              const int16_t *filter_y, int y_step_q4,
                              int w, int h) {
  assert(y_step_q4 == 16);
  if (((const int32_t *)filter_y)[1] == 0x800000) {
    vpx_convolve_copy(src, src_stride,
                      dst, dst_stride,
@@ -346,47 +347,39 @@ void vpx_convolve8_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                             filter_y, y_step_q4,
                             w, h);
  } else {
-    if (16 == y_step_q4) {
+    uint32_t pos = 38;
      uint32_t pos = 38;
-      /* bit positon for extract from acc */
+    /* bit positon for extract from acc */
-      __asm__ __volatile__ (
+    __asm__ __volatile__ (
-        "wrdsp      %[pos],     1           \n\t"
+      "wrdsp      %[pos],     1           \n\t"
-        :
+      :
-        : [pos] "r" (pos)
+      : [pos] "r" (pos)
-      );
+    );
-      prefetch_store(dst);
+    prefetch_store(dst);
-      switch (w) {
+    switch (w) {
-        case 4 :
+      case 4 :
-        case 8 :
+      case 8 :
-        case 16 :
+      case 16 :
-        case 32 :
+      case 32 :
-          convolve_vert_4_dspr2(src, src_stride,
+        convolve_vert_4_dspr2(src, src_stride,
-                                dst, dst_stride,
+                              dst, dst_stride,
-                                filter_y, w, h);
+                              filter_y, w, h);
-          break;
+        break;
-        case 64 :
+      case 64 :
-          prefetch_store(dst + 32);
+        prefetch_store(dst + 32);
-          convolve_vert_64_dspr2(src, src_stride,
+        convolve_vert_64_dspr2(src, src_stride,
                                 dst, dst_stride,
                                 filter_y, h);
          break;
        default:
          vpx_convolve8_vert_c(src, src_stride,
                               dst, dst_stride,
-                               filter_x, x_step_q4,
+                               filter_y, h);
-                               filter_y, y_step_q4,
+        break;
-                               w, h);
+      default:
-          break;
+        vpx_convolve8_vert_c(src, src_stride,
-      }
+                             dst, dst_stride,
-    } else {
+                             filter_x, x_step_q4,
-      vpx_convolve8_vert_c(src, src_stride,
+                             filter_y, y_step_q4,
-                           dst, dst_stride,
+                             w, h);
-                           filter_x, x_step_q4,
+        break;
                           filter_y, y_step_q4,
                           w, h);
    }
  }
 }
--- a/vpx_dsp/mips/vpx_convolve8_avg_horiz_msa.c
+++ b/vpx_dsp/mips/vpx_convolve8_avg_horiz_msa.c
@@ -8,6 +8,7 @@
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #include <assert.h>
 #include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/mips/vpx_convolve_msa.h"
@@ -665,12 +666,7 @@ void vpx_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride,
                                 int w, int h) {
  int8_t cnt, filt_hor[8];
-  if (16 != x_step_q4) {
+  assert(x_step_q4 == 16);
    vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
                              filter_x, x_step_q4, filter_y, y_step_q4,
                              w, h);
    return;
  }
  if (((const int32_t *)filter_x)[1] == 0x800000) {
    vpx_convolve_avg(src, src_stride, dst, dst_stride,
--- a/vpx_dsp/mips/vpx_convolve8_avg_msa.c
+++ b/vpx_dsp/mips/vpx_convolve8_avg_msa.c
@@ -8,6 +8,7 @@
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #include <assert.h>
 #include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/mips/vpx_convolve_msa.h"
@@ -574,12 +575,8 @@ void vpx_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride,
                           int w, int h) {
  int8_t cnt, filt_hor[8], filt_ver[8];
-  if (16 != x_step_q4 || 16 != y_step_q4) {
+  assert(x_step_q4 == 16);
-    vpx_convolve8_avg_c(src, src_stride, dst, dst_stride,
+  assert(y_step_q4 == 16);
                        filter_x, x_step_q4, filter_y, y_step_q4,
                        w, h);
    return;
  }
  if (((const int32_t *)filter_x)[1] == 0x800000 &&
      ((const int32_t *)filter_y)[1] == 0x800000) {
--- a/vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c
+++ b/vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c
@@ -8,6 +8,7 @@
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #include <assert.h>
 #include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/mips/vpx_convolve_msa.h"
@@ -639,12 +640,7 @@ void vpx_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride,
                                int w, int h) {
  int8_t cnt, filt_ver[8];
-  if (16 != y_step_q4) {
+  assert(y_step_q4 == 16);
    vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
                             filter_x, x_step_q4, filter_y, y_step_q4,
                             w, h);
    return;
  }
  if (((const int32_t *)filter_y)[1] == 0x800000) {
    vpx_convolve_avg(src, src_stride, dst, dst_stride,
--- a/vpx_dsp/mips/vpx_convolve8_horiz_msa.c
+++ b/vpx_dsp/mips/vpx_convolve8_horiz_msa.c
@@ -8,6 +8,7 @@
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #include <assert.h>
 #include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/mips/vpx_convolve_msa.h"
@@ -625,12 +626,7 @@ void vpx_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride,
                             int w, int h) {
  int8_t cnt, filt_hor[8];
-  if (16 != x_step_q4) {
+  assert(x_step_q4 == 16);
    vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride,
                          filter_x, x_step_q4, filter_y, y_step_q4,
                          w, h);
    return;
  }
  if (((const int32_t *)filter_x)[1] == 0x800000) {
    vpx_convolve_copy(src, src_stride, dst, dst_stride,
--- a/vpx_dsp/mips/vpx_convolve8_msa.c
+++ b/vpx_dsp/mips/vpx_convolve8_msa.c
@@ -8,6 +8,7 @@
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #include <assert.h>
 #include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/mips/vpx_convolve_msa.h"
@@ -548,12 +549,8 @@ void vpx_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride,
                       int32_t w, int32_t h) {
  int8_t cnt, filt_hor[8], filt_ver[8];
-  if (16 != x_step_q4 || 16 != y_step_q4) {
+  assert(x_step_q4 == 16);
-    vpx_convolve8_c(src, src_stride, dst, dst_stride,
+  assert(y_step_q4 == 16);
                    filter_x, x_step_q4, filter_y, y_step_q4,
                    w, h);
    return;
  }
  if (((const int32_t *)filter_x)[1] == 0x800000 &&
      ((const int32_t *)filter_y)[1] == 0x800000) {
--- a/vpx_dsp/mips/vpx_convolve8_vert_msa.c
+++ b/vpx_dsp/mips/vpx_convolve8_vert_msa.c
@@ -8,6 +8,7 @@
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #include <assert.h>
 #include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/mips/vpx_convolve_msa.h"
@@ -632,12 +633,7 @@ void vpx_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride,
                            int w, int h) {
  int8_t cnt, filt_ver[8];
-  if (16 != y_step_q4) {
+  assert(y_step_q4 == 16);
    vpx_convolve8_vert_c(src, src_stride, dst, dst_stride,
                         filter_x, x_step_q4, filter_y, y_step_q4,
                         w, h);
    return;
  }
  if (((const int32_t *)filter_y)[1] == 0x800000) {
    vpx_convolve_copy(src, src_stride, dst, dst_stride,