Remove get_filter_base() and get_filter_offset() in convolve

so that the convolve functions are independent of table alignment.

Change-Id: Ieab132a30d72c6e75bbe9473544fbe2cf51541ee
2017-08-28 10:35:43 -07:00
parent d49a1a5329
commit d331e7a1c0
48 changed files with 760 additions and 816 deletions
--- a/test/convolve_test.cc
+++ b/test/convolve_test.cc
@@ -33,9 +33,9 @@ static const unsigned int kMaxDimension = 64;

 typedef void (*ConvolveFunc)(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
-                             const int16_t *filter_x, int filter_x_stride,
-                             const int16_t *filter_y, int filter_y_stride,
-                             int w, int h);
+                             const InterpKernel *filter, int x0_q4,
+                             int x_step_q4, int y0_q4, int y_step_q4, int w,
+                             int h);

 typedef void (*WrapperFilterBlock2d8Func)(
    const uint8_t *src_ptr, const unsigned int src_stride,
@@ -550,7 +550,7 @@ TEST_P(ConvolveTest, DISABLED_Copy_Speed) {

  vpx_usec_timer_start(&timer);
  for (int n = 0; n < kNumTests; ++n) {
-    UUT_->copy_[0](in, kInputStride, out, kOutputStride, NULL, 0, NULL, 0,
+    UUT_->copy_[0](in, kInputStride, out, kOutputStride, NULL, 0, 0, 0, 0,
                   width, height);
  }
  vpx_usec_timer_mark(&timer);
@@ -570,7 +570,7 @@ TEST_P(ConvolveTest, DISABLED_Avg_Speed) {

  vpx_usec_timer_start(&timer);
  for (int n = 0; n < kNumTests; ++n) {
-    UUT_->copy_[1](in, kInputStride, out, kOutputStride, NULL, 0, NULL, 0,
+    UUT_->copy_[1](in, kInputStride, out, kOutputStride, NULL, 0, 0, 0, 0,
                   width, height);
  }
  vpx_usec_timer_mark(&timer);
@@ -585,7 +585,7 @@ TEST_P(ConvolveTest, Copy) {
  uint8_t *const out = output();

  ASM_REGISTER_STATE_CHECK(UUT_->copy_[0](in, kInputStride, out, kOutputStride,
-                                          NULL, 0, NULL, 0, Width(), Height()));
+                                          NULL, 0, 0, 0, 0, Width(), Height()));

  CheckGuardBlocks();

@@ -604,7 +604,7 @@ TEST_P(ConvolveTest, Avg) {
  CopyOutputToRef();

  ASM_REGISTER_STATE_CHECK(UUT_->copy_[1](in, kInputStride, out, kOutputStride,
-                                          NULL, 0, NULL, 0, Width(), Height()));
+                                          NULL, 0, 0, 0, 0, Width(), Height()));

  CheckGuardBlocks();

@@ -621,12 +621,10 @@ TEST_P(ConvolveTest, Avg) {
 TEST_P(ConvolveTest, CopyHoriz) {
  uint8_t *const in = input();
  uint8_t *const out = output();
-  DECLARE_ALIGNED(256, const int16_t,
-                  filter8[8]) = { 0, 0, 0, 128, 0, 0, 0, 0 };

  ASM_REGISTER_STATE_CHECK(UUT_->sh8_[0](in, kInputStride, out, kOutputStride,
-                                         filter8, 16, filter8, 16, Width(),
-                                         Height()));
+                                         vp9_filter_kernels[0], 0, 16, 0, 16,
+                                         Width(), Height()));

  CheckGuardBlocks();

@@ -641,12 +639,10 @@ TEST_P(ConvolveTest, CopyHoriz) {
 TEST_P(ConvolveTest, CopyVert) {
  uint8_t *const in = input();
  uint8_t *const out = output();
-  DECLARE_ALIGNED(256, const int16_t,
-                  filter8[8]) = { 0, 0, 0, 128, 0, 0, 0, 0 };

  ASM_REGISTER_STATE_CHECK(UUT_->sv8_[0](in, kInputStride, out, kOutputStride,
-                                         filter8, 16, filter8, 16, Width(),
-                                         Height()));
+                                         vp9_filter_kernels[0], 0, 16, 0, 16,
+                                         Width(), Height()));

  CheckGuardBlocks();

@@ -661,12 +657,10 @@ TEST_P(ConvolveTest, CopyVert) {
 TEST_P(ConvolveTest, Copy2D) {
  uint8_t *const in = input();
  uint8_t *const out = output();
-  DECLARE_ALIGNED(256, const int16_t,
-                  filter8[8]) = { 0, 0, 0, 128, 0, 0, 0, 0 };

  ASM_REGISTER_STATE_CHECK(UUT_->shv8_[0](in, kInputStride, out, kOutputStride,
-                                          filter8, 16, filter8, 16, Width(),
-                                          Height()));
+                                          vp9_filter_kernels[0], 0, 16, 0, 16,
+                                          Width(), Height()));

  CheckGuardBlocks();

@@ -702,7 +696,6 @@ TEST(ConvolveTest, FiltersWontSaturateWhenAddedPairwise) {
  }
 }

-const int16_t kInvalidFilter[8] = { 0 };
 const WrapperFilterBlock2d8Func wrapper_filter_block2d_8[2] = {
  wrapper_filter_block2d_8_c, wrapper_filter_average_block2d_8_c
 };
@@ -755,21 +748,21 @@ TEST_P(ConvolveTest, MatchesReferenceSubpixelFilter) {
                                      Width(), Height(), UUT_->use_highbd_);

          if (filter_x && filter_y)
-            ASM_REGISTER_STATE_CHECK(UUT_->hv8_[i](
-                in, kInputStride, out, kOutputStride, filters[filter_x], 16,
-                filters[filter_y], 16, Width(), Height()));
+            ASM_REGISTER_STATE_CHECK(
+                UUT_->hv8_[i](in, kInputStride, out, kOutputStride, filters,
+                              filter_x, 16, filter_y, 16, Width(), Height()));
          else if (filter_y)
-            ASM_REGISTER_STATE_CHECK(UUT_->v8_[i](
-                in, kInputStride, out, kOutputStride, kInvalidFilter, 16,
-                filters[filter_y], 16, Width(), Height()));
+            ASM_REGISTER_STATE_CHECK(
+                UUT_->v8_[i](in, kInputStride, out, kOutputStride, filters, 0,
+                             16, filter_y, 16, Width(), Height()));
          else if (filter_x)
-            ASM_REGISTER_STATE_CHECK(UUT_->h8_[i](
-                in, kInputStride, out, kOutputStride, filters[filter_x], 16,
-                kInvalidFilter, 16, Width(), Height()));
+            ASM_REGISTER_STATE_CHECK(
+                UUT_->h8_[i](in, kInputStride, out, kOutputStride, filters,
+                             filter_x, 16, 0, 16, Width(), Height()));
          else
-            ASM_REGISTER_STATE_CHECK(UUT_->copy_[i](
-                in, kInputStride, out, kOutputStride, kInvalidFilter, 0,
-                kInvalidFilter, 0, Width(), Height()));
+            ASM_REGISTER_STATE_CHECK(UUT_->copy_[i](in, kInputStride, out,
+                                                    kOutputStride, NULL, 0, 0,
+                                                    0, 0, Width(), Height()));

          CheckGuardBlocks();

@@ -853,21 +846,21 @@ TEST_P(ConvolveTest, FilterExtremes) {
                                       filters[filter_y], ref, kOutputStride,
                                       Width(), Height(), UUT_->use_highbd_);
            if (filter_x && filter_y)
-              ASM_REGISTER_STATE_CHECK(UUT_->hv8_[0](
-                  in, kInputStride, out, kOutputStride, filters[filter_x], 16,
-                  filters[filter_y], 16, Width(), Height()));
+              ASM_REGISTER_STATE_CHECK(
+                  UUT_->hv8_[0](in, kInputStride, out, kOutputStride, filters,
+                                filter_x, 16, filter_y, 16, Width(), Height()));
            else if (filter_y)
-              ASM_REGISTER_STATE_CHECK(UUT_->v8_[0](
-                  in, kInputStride, out, kOutputStride, kInvalidFilter, 16,
-                  filters[filter_y], 16, Width(), Height()));
+              ASM_REGISTER_STATE_CHECK(
+                  UUT_->v8_[0](in, kInputStride, out, kOutputStride, filters, 0,
+                               16, filter_y, 16, Width(), Height()));
            else if (filter_x)
-              ASM_REGISTER_STATE_CHECK(UUT_->h8_[0](
-                  in, kInputStride, out, kOutputStride, filters[filter_x], 16,
-                  kInvalidFilter, 16, Width(), Height()));
+              ASM_REGISTER_STATE_CHECK(
+                  UUT_->h8_[0](in, kInputStride, out, kOutputStride, filters,
+                               filter_x, 16, 0, 16, Width(), Height()));
            else
-              ASM_REGISTER_STATE_CHECK(UUT_->copy_[0](
-                  in, kInputStride, out, kOutputStride, kInvalidFilter, 0,
-                  kInvalidFilter, 0, Width(), Height()));
+              ASM_REGISTER_STATE_CHECK(UUT_->copy_[0](in, kInputStride, out,
+                                                      kOutputStride, NULL, 0, 0,
+                                                      0, 0, Width(), Height()));

            for (int y = 0; y < Height(); ++y) {
              for (int x = 0; x < Width(); ++x)
@@ -897,8 +890,8 @@ TEST_P(ConvolveTest, CheckScalingFiltering) {
    for (int step = 1; step <= 32; ++step) {
      /* Test the horizontal and vertical filters in combination. */
      ASM_REGISTER_STATE_CHECK(
-          UUT_->shv8_[0](in, kInputStride, out, kOutputStride, eighttap[frac],
-                         step, eighttap[frac], step, Width(), Height()));
+          UUT_->shv8_[0](in, kInputStride, out, kOutputStride, eighttap, frac,
+                         step, frac, step, Width(), Height()));

      CheckGuardBlocks();

@@ -917,14 +910,14 @@ TEST_P(ConvolveTest, CheckScalingFiltering) {
 using std::tr1::make_tuple;

 #if CONFIG_VP9_HIGHBITDEPTH
-#define WRAP(func, bd)                                                         \
-  void wrap_##func##_##bd(                                                     \
-      const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,                  \
-      ptrdiff_t dst_stride, const int16_t *filter_x, int filter_x_stride,      \
-      const int16_t *filter_y, int filter_y_stride, int w, int h) {            \
-    vpx_highbd_##func(reinterpret_cast<const uint16_t *>(src), src_stride,     \
-                      reinterpret_cast<uint16_t *>(dst), dst_stride, filter_x, \
-                      filter_x_stride, filter_y, filter_y_stride, w, h, bd);   \
+#define WRAP(func, bd)                                                       \
+  void wrap_##func##_##bd(                                                   \
+      const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,                \
+      ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4,           \
+      int x_step_q4, int y0_q4, int y_step_q4, int w, int h) {               \
+    vpx_highbd_##func(reinterpret_cast<const uint16_t *>(src), src_stride,   \
+                      reinterpret_cast<uint16_t *>(dst), dst_stride, filter, \
+                      x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd);         \
  }

 #if HAVE_SSE2 && ARCH_X86_64
--- a/vp9/common/vp9_reconinter.h
+++ b/vp9/common/vp9_reconinter.h
@@ -26,9 +26,9 @@ static INLINE void inter_predictor(const uint8_t *src, int src_stride,
                                   const struct scale_factors *sf, int w, int h,
                                   int ref, const InterpKernel *kernel, int xs,
                                   int ys) {
-  sf->predict[subpel_x != 0][subpel_y != 0][ref](
-      src, src_stride, dst, dst_stride, kernel[subpel_x], xs, kernel[subpel_y],
-      ys, w, h);
+  sf->predict[subpel_x != 0][subpel_y != 0][ref](src, src_stride, dst,
+                                                 dst_stride, kernel, subpel_x,
+                                                 xs, subpel_y, ys, w, h);
 }

 #if CONFIG_VP9_HIGHBITDEPTH
@@ -37,8 +37,8 @@ static INLINE void highbd_inter_predictor(
    const int subpel_x, const int subpel_y, const struct scale_factors *sf,
    int w, int h, int ref, const InterpKernel *kernel, int xs, int ys, int bd) {
  sf->highbd_predict[subpel_x != 0][subpel_y != 0][ref](
-      src, src_stride, dst, dst_stride, kernel[subpel_x], xs, kernel[subpel_y],
-      ys, w, h, bd);
+      src, src_stride, dst, dst_stride, kernel, subpel_x, xs, subpel_y, ys, w,
+      h, bd);
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH

--- a/vp9/encoder/vp9_denoiser.c
+++ b/vp9/encoder/vp9_denoiser.c
@@ -390,12 +390,12 @@ void vp9_denoiser_denoise(VP9_COMP *cpi, MACROBLOCK *mb, int mi_row, int mi_col,
  }

  if (decision == FILTER_BLOCK) {
-    vpx_convolve_copy(avg_start, avg.y_stride, src.buf, src.stride, NULL, 0,
-                      NULL, 0, num_4x4_blocks_wide_lookup[bs] << 2,
+    vpx_convolve_copy(avg_start, avg.y_stride, src.buf, src.stride, NULL, 0, 0,
+                      0, 0, num_4x4_blocks_wide_lookup[bs] << 2,
                      num_4x4_blocks_high_lookup[bs] << 2);
  } else {  // COPY_BLOCK
-    vpx_convolve_copy(src.buf, src.stride, avg_start, avg.y_stride, NULL, 0,
-                      NULL, 0, num_4x4_blocks_wide_lookup[bs] << 2,
+    vpx_convolve_copy(src.buf, src.stride, avg_start, avg.y_stride, NULL, 0, 0,
+                      0, 0, num_4x4_blocks_wide_lookup[bs] << 2,
                      num_4x4_blocks_high_lookup[bs] << 2);
  }
  *denoiser_decision = decision;
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -2645,15 +2645,14 @@ static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src,

        if (src->flags & YV12_FLAG_HIGHBITDEPTH) {
          vpx_highbd_convolve8(CONVERT_TO_SHORTPTR(src_ptr), src_stride,
-                               CONVERT_TO_SHORTPTR(dst_ptr), dst_stride,
-                               kernel[x_q4 & 0xf], 16 * src_w / dst_w,
-                               kernel[y_q4 & 0xf], 16 * src_h / dst_h,
-                               16 / factor, 16 / factor, bd);
+                               CONVERT_TO_SHORTPTR(dst_ptr), dst_stride, kernel,
+                               x_q4 & 0xf, 16 * src_w / dst_w, y_q4 & 0xf,
+                               16 * src_h / dst_h, 16 / factor, 16 / factor,
+                               bd);
        } else {
-          vpx_scaled_2d(src_ptr, src_stride, dst_ptr, dst_stride,
-                        kernel[x_q4 & 0xf], 16 * src_w / dst_w,
-                        kernel[y_q4 & 0xf], 16 * src_h / dst_h, 16 / factor,
-                        16 / factor);
+          vpx_scaled_2d(src_ptr, src_stride, dst_ptr, dst_stride, kernel,
+                        x_q4 & 0xf, 16 * src_w / dst_w, y_q4 & 0xf,
+                        16 * src_h / dst_h, 16 / factor, 16 / factor);
        }
      }
    }
--- a/vp9/encoder/vp9_frame_scale.c
+++ b/vp9/encoder/vp9_frame_scale.c
@@ -43,10 +43,9 @@ void vp9_scale_and_extend_frame_c(const YV12_BUFFER_CONFIG *src,
                                 (x / factor) * src_w / dst_w;
        uint8_t *dst_ptr = dsts[i] + (y / factor) * dst_stride + (x / factor);

-        vpx_scaled_2d(src_ptr, src_stride, dst_ptr, dst_stride,
-                      kernel[x_q4 & 0xf], 16 * src_w / dst_w,
-                      kernel[y_q4 & 0xf], 16 * src_h / dst_h, 16 / factor,
-                      16 / factor);
+        vpx_scaled_2d(src_ptr, src_stride, dst_ptr, dst_stride, kernel,
+                      x_q4 & 0xf, 16 * src_w / dst_w, y_q4 & 0xf,
+                      16 * src_h / dst_h, 16 / factor, 16 / factor);
      }
    }
  }
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -2162,15 +2162,15 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
          vpx_highbd_convolve_copy(
              CONVERT_TO_SHORTPTR(best_pred->data), best_pred->stride,
              CONVERT_TO_SHORTPTR(this_mode_pred->data), this_mode_pred->stride,
-              NULL, 0, NULL, 0, bw, bh, xd->bd);
+              NULL, 0, 0, 0, 0, bw, bh, xd->bd);
        else
          vpx_convolve_copy(best_pred->data, best_pred->stride,
                            this_mode_pred->data, this_mode_pred->stride, NULL,
-                            0, NULL, 0, bw, bh);
+                            0, 0, 0, 0, bw, bh);
 #else
        vpx_convolve_copy(best_pred->data, best_pred->stride,
                          this_mode_pred->data, this_mode_pred->stride, NULL, 0,
-                          NULL, 0, bw, bh);
+                          0, 0, 0, bw, bh);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
        best_pred = this_mode_pred;
      }
@@ -2264,14 +2264,14 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
      if (cm->use_highbitdepth)
        vpx_highbd_convolve_copy(
            CONVERT_TO_SHORTPTR(best_pred->data), best_pred->stride,
-            CONVERT_TO_SHORTPTR(pd->dst.buf), pd->dst.stride, NULL, 0, NULL, 0,
+            CONVERT_TO_SHORTPTR(pd->dst.buf), pd->dst.stride, NULL, 0, 0, 0, 0,
            bw, bh, xd->bd);
      else
        vpx_convolve_copy(best_pred->data, best_pred->stride, pd->dst.buf,
-                          pd->dst.stride, NULL, 0, NULL, 0, bw, bh);
+                          pd->dst.stride, NULL, 0, 0, 0, 0, bw, bh);
 #else
      vpx_convolve_copy(best_pred->data, best_pred->stride, pd->dst.buf,
-                        pd->dst.stride, NULL, 0, NULL, 0, bw, bh);
+                        pd->dst.stride, NULL, 0, 0, 0, 0, bw, bh);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
    }
  }
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -600,7 +600,7 @@ static void dist_block(const VP9_COMP *cpi, MACROBLOCK *x, int plane,
 #if CONFIG_VP9_HIGHBITDEPTH
      if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
        vpx_highbd_convolve_copy(CONVERT_TO_SHORTPTR(dst), dst_stride, recon16,
-                                 32, NULL, 0, NULL, 0, bs, bs, xd->bd);
+                                 32, NULL, 0, 0, 0, 0, bs, bs, xd->bd);
        if (xd->lossless) {
          vp9_highbd_iwht4x4_add(dqcoeff, recon16, 32, *eob, xd->bd);
        } else {
@@ -623,7 +623,7 @@ static void dist_block(const VP9_COMP *cpi, MACROBLOCK *x, int plane,
        recon = CONVERT_TO_BYTEPTR(recon16);
      } else {
 #endif  // CONFIG_VP9_HIGHBITDEPTH
-        vpx_convolve_copy(dst, dst_stride, recon, 32, NULL, 0, NULL, 0, bs, bs);
+        vpx_convolve_copy(dst, dst_stride, recon, 32, NULL, 0, 0, 0, 0, bs, bs);
        switch (tx_size) {
          case TX_32X32: vp9_idct32x32_add(dqcoeff, recon, 32, *eob); break;
          case TX_16X16: vp9_idct16x16_add(dqcoeff, recon, 32, *eob); break;
--- a/vpx_dsp/arm/highbd_vpx_convolve8_neon.c
+++ b/vpx_dsp/arm/highbd_vpx_convolve8_neon.c
@@ -137,15 +137,14 @@ static INLINE uint16x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1,

 void vpx_highbd_convolve8_horiz_neon(const uint16_t *src, ptrdiff_t src_stride,
                                     uint16_t *dst, ptrdiff_t dst_stride,
-                                     const int16_t *filter_x, int x_step_q4,
-                                     const int16_t *filter_y,  // unused
-                                     int y_step_q4,            // unused
+                                     const InterpKernel *filter, int x0_q4,
+                                     int x_step_q4, int y0_q4, int y_step_q4,
                                     int w, int h, int bd) {
  if (x_step_q4 != 16) {
-    vpx_highbd_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x,
-                                 x_step_q4, filter_y, y_step_q4, w, h, bd);
+    vpx_highbd_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter,
+                                 x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd);
  } else {
-    const int16x8_t filters = vld1q_s16(filter_x);
+    const int16x8_t filters = vld1q_s16(filter[x0_q4]);
    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
    uint16x8_t t0, t1, t2, t3;

@@ -337,15 +336,15 @@ void vpx_highbd_convolve8_horiz_neon(const uint16_t *src, ptrdiff_t src_stride,
 void vpx_highbd_convolve8_avg_horiz_neon(const uint16_t *src,
                                         ptrdiff_t src_stride, uint16_t *dst,
                                         ptrdiff_t dst_stride,
-                                         const int16_t *filter_x, int x_step_q4,
-                                         const int16_t *filter_y,  // unused
-                                         int y_step_q4,            // unused
-                                         int w, int h, int bd) {
+                                         const InterpKernel *filter, int x0_q4,
+                                         int x_step_q4, int y0_q4,
+                                         int y_step_q4, int w, int h, int bd) {
  if (x_step_q4 != 16) {
-    vpx_highbd_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x,
-                                     x_step_q4, filter_y, y_step_q4, w, h, bd);
+    vpx_highbd_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter,
+                                     x0_q4, x_step_q4, y0_q4, y_step_q4, w, h,
+                                     bd);
  } else {
-    const int16x8_t filters = vld1q_s16(filter_x);
+    const int16x8_t filters = vld1q_s16(filter[x0_q4]);
    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
    uint16x8_t t0, t1, t2, t3;

@@ -566,15 +565,14 @@ void vpx_highbd_convolve8_avg_horiz_neon(const uint16_t *src,

 void vpx_highbd_convolve8_vert_neon(const uint16_t *src, ptrdiff_t src_stride,
                                    uint16_t *dst, ptrdiff_t dst_stride,
-                                    const int16_t *filter_x,  // unused
-                                    int x_step_q4,            // unused
-                                    const int16_t *filter_y, int y_step_q4,
+                                    const InterpKernel *filter, int x0_q4,
+                                    int x_step_q4, int y0_q4, int y_step_q4,
                                    int w, int h, int bd) {
  if (y_step_q4 != 16) {
-    vpx_highbd_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x,
-                                x_step_q4, filter_y, y_step_q4, w, h, bd);
+    vpx_highbd_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+                                x_step_q4, y0_q4, y_step_q4, w, h, bd);
  } else {
-    const int16x8_t filters = vld1q_s16(filter_y);
+    const int16x8_t filters = vld1q_s16(filter[y0_q4]);
    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);

    assert(!((intptr_t)dst & 3));
@@ -732,15 +730,15 @@ void vpx_highbd_convolve8_vert_neon(const uint16_t *src, ptrdiff_t src_stride,
 void vpx_highbd_convolve8_avg_vert_neon(const uint16_t *src,
                                        ptrdiff_t src_stride, uint16_t *dst,
                                        ptrdiff_t dst_stride,
-                                        const int16_t *filter_x,  // unused
-                                        int x_step_q4,            // unused
-                                        const int16_t *filter_y, int y_step_q4,
+                                        const InterpKernel *filter, int x0_q4,
+                                        int x_step_q4, int y0_q4, int y_step_q4,
                                        int w, int h, int bd) {
  if (y_step_q4 != 16) {
-    vpx_highbd_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x,
-                                    x_step_q4, filter_y, y_step_q4, w, h, bd);
+    vpx_highbd_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter,
+                                    x0_q4, x_step_q4, y0_q4, y_step_q4, w, h,
+                                    bd);
  } else {
-    const int16x8_t filters = vld1q_s16(filter_y);
+    const int16x8_t filters = vld1q_s16(filter[y0_q4]);
    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);

    assert(!((intptr_t)dst & 3));
--- a/vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c
+++ b/vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c
@@ -15,13 +15,14 @@

 void vpx_highbd_convolve_avg_neon(const uint16_t *src, ptrdiff_t src_stride,
                                  uint16_t *dst, ptrdiff_t dst_stride,
-                                  const int16_t *filter_x, int filter_x_stride,
-                                  const int16_t *filter_y, int filter_y_stride,
+                                  const InterpKernel *filter, int x0_q4,
+                                  int x_step_q4, int y0_q4, int y_step_q4,
                                  int w, int h, int bd) {
-  (void)filter_x;
-  (void)filter_x_stride;
-  (void)filter_y;
-  (void)filter_y_stride;
+  (void)filter;
+  (void)x0_q4;
+  (void)x_step_q4;
+  (void)y0_q4;
+  (void)y_step_q4;
  (void)bd;

  if (w < 8) {  // avg4
--- a/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c
+++ b/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c
@@ -15,13 +15,14 @@

 void vpx_highbd_convolve_copy_neon(const uint16_t *src, ptrdiff_t src_stride,
                                   uint16_t *dst, ptrdiff_t dst_stride,
-                                   const int16_t *filter_x, int filter_x_stride,
-                                   const int16_t *filter_y, int filter_y_stride,
+                                   const InterpKernel *filter, int x0_q4,
+                                   int x_step_q4, int y0_q4, int y_step_q4,
                                   int w, int h, int bd) {
-  (void)filter_x;
-  (void)filter_x_stride;
-  (void)filter_y;
-  (void)filter_y_stride;
+  (void)filter;
+  (void)x0_q4;
+  (void)x_step_q4;
+  (void)y0_q4;
+  (void)y_step_q4;
  (void)bd;

  if (w < 8) {  // copy4
--- a/vpx_dsp/arm/highbd_vpx_convolve_neon.c
+++ b/vpx_dsp/arm/highbd_vpx_convolve_neon.c
@@ -15,10 +15,9 @@

 void vpx_highbd_convolve8_neon(const uint16_t *src, ptrdiff_t src_stride,
                               uint16_t *dst, ptrdiff_t dst_stride,
-                               const int16_t *filter_x, int x_step_q4,
-                               const int16_t *filter_y, int y_step_q4, int w,
+                               const InterpKernel *filter, int x0_q4,
+                               int x_step_q4, int y0_q4, int y_step_q4, int w,
                               int h, int bd) {
-  const int y0_q4 = get_filter_offset(filter_y, get_filter_base(filter_y));
  // + 1 to make it divisible by 4
  uint16_t temp[64 * 136];
  const int intermediate_height =
@@ -29,20 +28,19 @@ void vpx_highbd_convolve8_neon(const uint16_t *src, ptrdiff_t src_stride,
   * buffer which has lots of extra room and is subsequently discarded this is
   * safe if somewhat less than ideal.   */
  vpx_highbd_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w,
-                                  filter_x, x_step_q4, filter_y, y_step_q4, w,
+                                  filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w,
                                  intermediate_height, bd);

  /* Step into the temp buffer 3 lines to get the actual frame data */
-  vpx_highbd_convolve8_vert_neon(temp + w * 3, w, dst, dst_stride, filter_x,
-                                 x_step_q4, filter_y, y_step_q4, w, h, bd);
+  vpx_highbd_convolve8_vert_neon(temp + w * 3, w, dst, dst_stride, filter,
+                                 x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd);
 }

 void vpx_highbd_convolve8_avg_neon(const uint16_t *src, ptrdiff_t src_stride,
                                   uint16_t *dst, ptrdiff_t dst_stride,
-                                   const int16_t *filter_x, int x_step_q4,
-                                   const int16_t *filter_y, int y_step_q4,
+                                   const InterpKernel *filter, int x0_q4,
+                                   int x_step_q4, int y0_q4, int y_step_q4,
                                   int w, int h, int bd) {
-  const int y0_q4 = get_filter_offset(filter_y, get_filter_base(filter_y));
  // + 1 to make it divisible by 4
  uint16_t temp[64 * 136];
  const int intermediate_height =
@@ -52,8 +50,9 @@ void vpx_highbd_convolve8_avg_neon(const uint16_t *src, ptrdiff_t src_stride,
   * to average the values after both passes.
   */
  vpx_highbd_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w,
-                                  filter_x, x_step_q4, filter_y, y_step_q4, w,
+                                  filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w,
                                  intermediate_height, bd);
-  vpx_highbd_convolve8_avg_vert_neon(temp + w * 3, w, dst, dst_stride, filter_x,
-                                     x_step_q4, filter_y, y_step_q4, w, h, bd);
+  vpx_highbd_convolve8_avg_vert_neon(temp + w * 3, w, dst, dst_stride, filter,
+                                     x0_q4, x_step_q4, y0_q4, y_step_q4, w, h,
+                                     bd);
 }
--- a/vpx_dsp/arm/vpx_convolve8_avg_neon_asm.asm
+++ b/vpx_dsp/arm/vpx_convolve8_avg_neon_asm.asm
@@ -42,10 +42,11 @@
 ; r1    int src_stride
 ; r2    uint8_t *dst
 ; r3    int dst_stride
-; sp[]const int16_t *filter_x
-; sp[]int x_step_q4
-; sp[]const int16_t *filter_y ; unused
-; sp[]int y_step_q4           ; unused
+; sp[]const int16_t *filter
+; sp[]int x0_q4
+; sp[]int x_step_q4 ; unused
+; sp[]int y0_q4
+; sp[]int y_step_q4 ; unused
 ; sp[]int w
 ; sp[]int h

@@ -54,11 +55,11 @@

    sub             r0, r0, #3              ; adjust for taps

-    ldr             r5, [sp, #32]           ; filter_x
-    ldr             r6, [sp, #48]           ; w
-    ldr             r7, [sp, #52]           ; h
+    ldrd            r4, r5, [sp, #32]       ; filter, x0_q4
+    add             r4, r5, lsl #4
+    ldrd            r6, r7, [sp, #52]       ; w, h

-    vld1.s16        {q0}, [r5]              ; filter_x
+    vld1.s16        {q0}, [r4]              ; filter

    sub             r8, r1, r1, lsl #2      ; -src_stride * 3
    add             r8, r8, #4              ; -src_stride * 3 + 4
@@ -127,7 +128,7 @@ vpx_convolve8_avg_loop_horiz

    sub             r2, r2, r3, lsl #2      ; reset for store

-    ; src[] * filter_x
+    ; src[] * filter
    MULTIPLY_BY_Q0  q1,  d16, d17, d20, d22, d18, d19, d23, d24
    MULTIPLY_BY_Q0  q2,  d17, d20, d22, d18, d19, d23, d24, d26
    MULTIPLY_BY_Q0  q14, d20, d22, d18, d19, d23, d24, d26, d27
@@ -184,11 +185,13 @@ vpx_convolve8_avg_loop_horiz
    sub             r0, r0, r1
    sub             r0, r0, r1, lsl #1

-    ldr             r4, [sp, #32]           ; filter_y
-    ldr             r6, [sp, #40]           ; w
-    ldr             lr, [sp, #44]           ; h
+    ldr             r4, [sp, #24]           ; filter
+    ldr             r5, [sp, #36]           ; y0_q4
+    add             r4, r5, lsl #4
+    ldr             r6, [sp, #44]           ; w
+    ldr             lr, [sp, #48]           ; h

-    vld1.s16        {q0}, [r4]              ; filter_y
+    vld1.s16        {q0}, [r4]              ; filter

    lsl             r1, r1, #1
    lsl             r3, r3, #1
@@ -232,7 +235,7 @@ vpx_convolve8_avg_loop_vert
    pld             [r7]
    pld             [r4]

-    ; src[] * filter_y
+    ; src[] * filter
    MULTIPLY_BY_Q0  q1,  d16, d17, d18, d19, d20, d21, d22, d24

    pld             [r7, r1]
--- a/vpx_dsp/arm/vpx_convolve8_neon.c
+++ b/vpx_dsp/arm/vpx_convolve8_neon.c
@@ -125,11 +125,10 @@ static INLINE int16x8_t convolve8_8(int16x8_t s0, int16x8_t s1, int16x8_t s2,

 void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
-                              const int16_t *filter_x, int x_step_q4,
-                              const int16_t *filter_y,  // unused
-                              int y_step_q4,            // unused
-                              int w, int h) {
-  const int16x8_t filters = vld1q_s16(filter_x);
+                              const InterpKernel *filter, int x0_q4,
+                              int x_step_q4, int y0_q4, int y_step_q4, int w,
+                              int h) {
+  const int16x8_t filters = vld1q_s16(filter[x0_q4]);
  uint8x8_t t0, t1, t2, t3;

  assert(!((intptr_t)dst & 3));
@@ -137,8 +136,8 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
  assert(x_step_q4 == 16);

  (void)x_step_q4;
+  (void)y0_q4;
  (void)y_step_q4;
-  (void)filter_y;

  src -= 3;

@@ -390,11 +389,10 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,

 void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
                                  uint8_t *dst, ptrdiff_t dst_stride,
-                                  const int16_t *filter_x, int x_step_q4,
-                                  const int16_t *filter_y,  // unused
-                                  int y_step_q4,            // unused
+                                  const InterpKernel *filter, int x0_q4,
+                                  int x_step_q4, int y0_q4, int y_step_q4,
                                  int w, int h) {
-  const int16x8_t filters = vld1q_s16(filter_x);
+  const int16x8_t filters = vld1q_s16(filter[x0_q4]);
  uint8x8_t t0, t1, t2, t3;

  assert(!((intptr_t)dst & 3));
@@ -402,8 +400,8 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
  assert(x_step_q4 == 16);

  (void)x_step_q4;
+  (void)y0_q4;
  (void)y_step_q4;
-  (void)filter_y;

  src -= 3;

@@ -692,19 +690,18 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,

 void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
-                             const int16_t *filter_x,  // unused
-                             int x_step_q4,            // unused
-                             const int16_t *filter_y, int y_step_q4, int w,
+                             const InterpKernel *filter, int x0_q4,
+                             int x_step_q4, int y0_q4, int y_step_q4, int w,
                             int h) {
-  const int16x8_t filters = vld1q_s16(filter_y);
+  const int16x8_t filters = vld1q_s16(filter[y0_q4]);

  assert(!((intptr_t)dst & 3));
  assert(!(dst_stride & 3));
  assert(y_step_q4 == 16);

+  (void)x0_q4;
  (void)x_step_q4;
  (void)y_step_q4;
-  (void)filter_x;

  src -= 3 * src_stride;

@@ -864,19 +861,18 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,

 void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
                                 uint8_t *dst, ptrdiff_t dst_stride,
-                                 const int16_t *filter_x,  // unused
-                                 int x_step_q4,            // unused
-                                 const int16_t *filter_y, int y_step_q4, int w,
+                                 const InterpKernel *filter, int x0_q4,
+                                 int x_step_q4, int y0_q4, int y_step_q4, int w,
                                 int h) {
-  const int16x8_t filters = vld1q_s16(filter_y);
+  const int16x8_t filters = vld1q_s16(filter[y0_q4]);

  assert(!((intptr_t)dst & 3));
  assert(!(dst_stride & 3));
  assert(y_step_q4 == 16);

+  (void)x0_q4;
  (void)x_step_q4;
  (void)y_step_q4;
-  (void)filter_x;

  src -= 3 * src_stride;

--- a/vpx_dsp/arm/vpx_convolve8_neon_asm.asm
+++ b/vpx_dsp/arm/vpx_convolve8_neon_asm.asm
@@ -42,10 +42,11 @@
 ; r1    int src_stride
 ; r2    uint8_t *dst
 ; r3    int dst_stride
-; sp[]const int16_t *filter_x
-; sp[]int x_step_q4
-; sp[]const int16_t *filter_y ; unused
-; sp[]int y_step_q4           ; unused
+; sp[]const InterpKernel *filter
+; sp[]int x0_q4
+; sp[]int x_step_q4 ; unused
+; sp[]int y0_q4
+; sp[]int y_step_q4 ; unused
 ; sp[]int w
 ; sp[]int h

@@ -54,11 +55,11 @@

    sub             r0, r0, #3              ; adjust for taps

-    ldr             r5, [sp, #32]           ; filter_x
-    ldr             r6, [sp, #48]           ; w
-    ldr             r7, [sp, #52]           ; h
+    ldrd            r4, r5, [sp, #32]       ; filter, x0_q4
+    add             r4, r5, lsl #4
+    ldrd            r6, r7, [sp, #52]       ; w, h

-    vld1.s16        {q0}, [r5]              ; filter_x
+    vld1.s16        {q0}, [r4]              ; filter

    sub             r8, r1, r1, lsl #2      ; -src_stride * 3
    add             r8, r8, #4              ; -src_stride * 3 + 4
@@ -119,7 +120,7 @@ vpx_convolve8_loop_horiz

    pld             [r5, r1, lsl #1]

-    ; src[] * filter_x
+    ; src[] * filter
    MULTIPLY_BY_Q0  q1,  d16, d17, d20, d22, d18, d19, d23, d24
    MULTIPLY_BY_Q0  q2,  d17, d20, d22, d18, d19, d23, d24, d26
    MULTIPLY_BY_Q0  q14, d20, d22, d18, d19, d23, d24, d26, d27
@@ -173,11 +174,13 @@ vpx_convolve8_loop_horiz
    sub             r0, r0, r1
    sub             r0, r0, r1, lsl #1

-    ldr             r4, [sp, #32]           ; filter_y
-    ldr             r6, [sp, #40]           ; w
-    ldr             lr, [sp, #44]           ; h
+    ldr             r4, [sp, #24]           ; filter
+    ldr             r5, [sp, #36]           ; y0_q4
+    add             r4, r5, lsl #4
+    ldr             r6, [sp, #44]           ; w
+    ldr             lr, [sp, #48]           ; h

-    vld1.s16        {q0}, [r4]              ; filter_y
+    vld1.s16        {q0}, [r4]              ; filter

    lsl             r1, r1, #1
    lsl             r3, r3, #1
@@ -216,7 +219,7 @@ vpx_convolve8_loop_vert
    pld             [r5]
    pld             [r8]

-    ; src[] * filter_y
+    ; src[] * filter
    MULTIPLY_BY_Q0  q1,  d16, d17, d18, d19, d20, d21, d22, d24

    pld             [r5, r3]
--- a/vpx_dsp/arm/vpx_convolve_avg_neon.c
+++ b/vpx_dsp/arm/vpx_convolve_avg_neon.c
@@ -15,13 +15,13 @@

 void vpx_convolve_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
-                           const int16_t *filter_x, int filter_x_stride,
-                           const int16_t *filter_y, int filter_y_stride, int w,
-                           int h) {
-  (void)filter_x;
-  (void)filter_x_stride;
-  (void)filter_y;
-  (void)filter_y_stride;
+                           const InterpKernel *filter, int x0_q4, int x_step_q4,
+                           int y0_q4, int y_step_q4, int w, int h) {
+  (void)filter;
+  (void)x0_q4;
+  (void)x_step_q4;
+  (void)y0_q4;
+  (void)y_step_q4;

  if (w < 8) {  // avg4
    uint8x8_t s0, s1;
--- a/vpx_dsp/arm/vpx_convolve_avg_neon_asm.asm
+++ b/vpx_dsp/arm/vpx_convolve_avg_neon_asm.asm
@@ -17,7 +17,7 @@

 |vpx_convolve_avg_neon| PROC
    push                {r4-r6, lr}
-    ldrd                r4, r5, [sp, #32]
+    ldrd                r4, r5, [sp, #36]
    mov                 r6, r2

    cmp                 r4, #32
--- a/vpx_dsp/arm/vpx_convolve_copy_neon.c
+++ b/vpx_dsp/arm/vpx_convolve_copy_neon.c
@@ -15,13 +15,14 @@

 void vpx_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride,
-                            const int16_t *filter_x, int filter_x_stride,
-                            const int16_t *filter_y, int filter_y_stride, int w,
+                            const InterpKernel *filter, int x0_q4,
+                            int x_step_q4, int y0_q4, int y_step_q4, int w,
                            int h) {
-  (void)filter_x;
-  (void)filter_x_stride;
-  (void)filter_y;
-  (void)filter_y_stride;
+  (void)filter;
+  (void)x0_q4;
+  (void)x_step_q4;
+  (void)y0_q4;
+  (void)y_step_q4;

  if (w < 8) {  // copy4
    do {
--- a/vpx_dsp/arm/vpx_convolve_copy_neon_asm.asm
+++ b/vpx_dsp/arm/vpx_convolve_copy_neon_asm.asm
@@ -17,7 +17,7 @@

 |vpx_convolve_copy_neon| PROC
    push                {r4-r5, lr}
-    ldrd                r4, r5, [sp, #28]
+    ldrd                r4, r5, [sp, #32]

    cmp                 r4, #32
    bgt                 copy64
--- a/vpx_dsp/arm/vpx_convolve_neon.c
+++ b/vpx_dsp/arm/vpx_convolve_neon.c
@@ -15,8 +15,8 @@
 #include "vpx_ports/mem.h"

 void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
-                        ptrdiff_t dst_stride, const int16_t *filter_x,
-                        int x_step_q4, const int16_t *filter_y, int y_step_q4,
+                        ptrdiff_t dst_stride, const InterpKernel *filter,
+                        int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
                        int w, int h) {
  /* Given our constraints: w <= 64, h <= 64, taps == 8 we can reduce the
   * maximum buffer size to 64 * 64 + 7 (+ 1 to make it divisible by 4).
@@ -33,19 +33,19 @@ void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
   * height and filter a multiple of 4 lines. Since this goes in to the temp
   * buffer which has lots of extra room and is subsequently discarded this is
   * safe if somewhat less than ideal.   */
-  vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w, filter_x,
-                           x_step_q4, filter_y, y_step_q4, w,
+  vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w, filter,
+                           x0_q4, x_step_q4, y0_q4, y_step_q4, w,
                           intermediate_height);

  /* Step into the temp buffer 3 lines to get the actual frame data */
-  vpx_convolve8_vert_neon(temp + w * 3, w, dst, dst_stride, filter_x, x_step_q4,
-                          filter_y, y_step_q4, w, h);
+  vpx_convolve8_vert_neon(temp + w * 3, w, dst, dst_stride, filter, x0_q4,
+                          x_step_q4, y0_q4, y_step_q4, w, h);
 }

 void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride,
-                            const int16_t *filter_x, int x_step_q4,
-                            const int16_t *filter_y, int y_step_q4, int w,
+                            const InterpKernel *filter, int x0_q4,
+                            int x_step_q4, int y0_q4, int y_step_q4, int w,
                            int h) {
  uint8_t temp[64 * 72];
  const int intermediate_height = h + 7;
@@ -56,9 +56,9 @@ void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
  /* This implementation has the same issues as above. In addition, we only want
   * to average the values after both passes.
   */
-  vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w, filter_x,
-                           x_step_q4, filter_y, y_step_q4, w,
+  vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w, filter,
+                           x0_q4, x_step_q4, y0_q4, y_step_q4, w,
                           intermediate_height);
-  vpx_convolve8_avg_vert_neon(temp + w * 3, w, dst, dst_stride, filter_x,
-                              x_step_q4, filter_y, y_step_q4, w, h);
+  vpx_convolve8_avg_vert_neon(temp + w * 3, w, dst, dst_stride, filter, x0_q4,
+                              x_step_q4, y0_q4, y_step_q4, w, h);
 }
--- a/vpx_dsp/mips/convolve2_avg_dspr2.c
+++ b/vpx_dsp/mips/convolve2_avg_dspr2.c
@@ -219,9 +219,10 @@ static void convolve_bi_avg_vert_64_dspr2(const uint8_t *src,

 void vpx_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                                  uint8_t *dst, ptrdiff_t dst_stride,
-                                  const int16_t *filter_x, int x_step_q4,
-                                  const int16_t *filter_y, int y_step_q4, int w,
-                                  int h) {
+                                  const InterpKernel *filter, int x0_q4,
+                                  int32_t x_step_q4, int y0_q4, int y_step_q4,
+                                  int w, int h) {
+  const int16_t *const filter_y = filter[y0_q4];
  uint32_t pos = 38;

  assert(y_step_q4 == 16);
@@ -247,8 +248,8 @@ void vpx_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                                    h);
      break;
    default:
-      vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x,
-                               x_step_q4, filter_y, y_step_q4, w, h);
+      vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+                               x_step_q4, y0_q4, y_step_q4, w, h);
      break;
  }
 }
--- a/vpx_dsp/mips/convolve2_avg_horiz_dspr2.c
+++ b/vpx_dsp/mips/convolve2_avg_horiz_dspr2.c
@@ -751,9 +751,10 @@ static void convolve_bi_avg_horiz_64_dspr2(const uint8_t *src_ptr,

 void vpx_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                                   uint8_t *dst, ptrdiff_t dst_stride,
-                                   const int16_t *filter_x, int x_step_q4,
-                                   const int16_t *filter_y, int y_step_q4,
+                                   const InterpKernel *filter, int x0_q4,
+                                   int32_t x_step_q4, int y0_q4, int y_step_q4,
                                   int w, int h) {
+  const int16_t *const filter_x = filter[x0_q4];
  uint32_t pos = 38;

  assert(x_step_q4 == 16);
@@ -793,8 +794,8 @@ void vpx_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                                     h);
      break;
    default:
-      vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x,
-                                x_step_q4, filter_y, y_step_q4, w, h);
+      vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+                                x_step_q4, y0_q4, y_step_q4, w, h);
      break;
  }
 }
--- a/vpx_dsp/mips/convolve2_horiz_dspr2.c
+++ b/vpx_dsp/mips/convolve2_horiz_dspr2.c
@@ -628,9 +628,10 @@ static void convolve_bi_horiz_64_dspr2(const uint8_t *src_ptr,

 void vpx_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                               uint8_t *dst, ptrdiff_t dst_stride,
-                               const int16_t *filter_x, int x_step_q4,
-                               const int16_t *filter_y, int y_step_q4, int w,
-                               int h) {
+                               const InterpKernel *filter, int x0_q4,
+                               int32_t x_step_q4, int y0_q4, int y_step_q4,
+                               int w, int h) {
+  const int16_t *const filter_x = filter[x0_q4];
  uint32_t pos = 38;

  assert(x_step_q4 == 16);
@@ -672,8 +673,8 @@ void vpx_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                                 (int32_t)dst_stride, filter_x, (int32_t)h);
      break;
    default:
-      vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x,
-                            x_step_q4, filter_y, y_step_q4, w, h);
+      vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+                            x_step_q4, y0_q4, y_step_q4, w, h);
      break;
  }
 }
--- a/vpx_dsp/mips/convolve2_vert_dspr2.c
+++ b/vpx_dsp/mips/convolve2_vert_dspr2.c
@@ -201,9 +201,10 @@ static void convolve_bi_vert_64_dspr2(const uint8_t *src, int32_t src_stride,

 void vpx_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
-                              const int16_t *filter_x, int x_step_q4,
-                              const int16_t *filter_y, int y_step_q4, int w,
-                              int h) {
+                              const InterpKernel *filter, int x0_q4,
+                              int32_t x_step_q4, int y0_q4, int y_step_q4,
+                              int w, int h) {
+  const int16_t *const filter_y = filter[y0_q4];
  uint32_t pos = 38;

  assert(y_step_q4 == 16);
@@ -228,8 +229,8 @@ void vpx_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
      convolve_bi_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y, h);
      break;
    default:
-      vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x,
-                           x_step_q4, filter_y, y_step_q4, w, h);
+      vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+                           x_step_q4, y0_q4, y_step_q4, w, h);
      break;
  }
 }
--- a/vpx_dsp/mips/convolve8_avg_dspr2.c
+++ b/vpx_dsp/mips/convolve8_avg_dspr2.c
@@ -334,15 +334,16 @@ static void convolve_avg_vert_64_dspr2(const uint8_t *src, int32_t src_stride,

 void vpx_convolve8_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                                  uint8_t *dst, ptrdiff_t dst_stride,
-                                  const int16_t *filter_x, int x_step_q4,
-                                  const int16_t *filter_y, int y_step_q4, int w,
-                                  int h) {
+                                  const InterpKernel *filter, int x0_q4,
+                                  int32_t x_step_q4, int y0_q4, int y_step_q4,
+                                  int w, int h) {
+  const int16_t *const filter_y = filter[y0_q4];
  assert(y_step_q4 == 16);
  assert(((const int32_t *)filter_y)[1] != 0x800000);

  if (((const int32_t *)filter_y)[0] == 0) {
-    vpx_convolve2_avg_vert_dspr2(src, src_stride, dst, dst_stride, filter_x,
-                                 x_step_q4, filter_y, y_step_q4, w, h);
+    vpx_convolve2_avg_vert_dspr2(src, src_stride, dst, dst_stride, filter,
+                                 x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
  } else {
    uint32_t pos = 38;

@@ -367,8 +368,8 @@ void vpx_convolve8_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                                   h);
        break;
      default:
-        vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x,
-                                 x_step_q4, filter_y, y_step_q4, w, h);
+        vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter,
+                                 x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
        break;
    }
  }
@@ -376,8 +377,8 @@ void vpx_convolve8_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,

 void vpx_convolve8_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
-                             const int16_t *filter_x, int x_step_q4,
-                             const int16_t *filter_y, int y_step_q4, int w,
+                             const InterpKernel *filter, int x0_q4,
+                             int32_t x_step_q4, int y0_q4, int y_step_q4, int w,
                             int h) {
  /* Fixed size intermediate buffer places limits on parameters. */
  DECLARE_ALIGNED(32, uint8_t, temp[64 * 135]);
@@ -390,24 +391,26 @@ void vpx_convolve8_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,

  if (intermediate_height < h) intermediate_height = h;

-  vpx_convolve8_horiz(src - (src_stride * 3), src_stride, temp, 64, filter_x,
-                      x_step_q4, filter_y, y_step_q4, w, intermediate_height);
+  vpx_convolve8_horiz(src - (src_stride * 3), src_stride, temp, 64, filter,
+                      x0_q4, x_step_q4, y0_q4, y_step_q4, w,
+                      intermediate_height);

-  vpx_convolve8_avg_vert(temp + 64 * 3, 64, dst, dst_stride, filter_x,
-                         x_step_q4, filter_y, y_step_q4, w, h);
+  vpx_convolve8_avg_vert(temp + 64 * 3, 64, dst, dst_stride, filter, x0_q4,
+                         x_step_q4, y0_q4, y_step_q4, w, h);
 }

 void vpx_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride,
-                            const int16_t *filter_x, int filter_x_stride,
-                            const int16_t *filter_y, int filter_y_stride, int w,
+                            const InterpKernel *filter, int x0_q4,
+                            int32_t x_step_q4, int y0_q4, int y_step_q4, int w,
                            int h) {
  int x, y;
  uint32_t tp1, tp2, tn1, tp3, tp4, tn2;
-  (void)filter_x;
-  (void)filter_x_stride;
-  (void)filter_y;
-  (void)filter_y_stride;
+  (void)filter;
+  (void)x0_q4;
+  (void)x_step_q4;
+  (void)y0_q4;
+  (void)y_step_q4;

  /* prefetch data to cache memory */
  prefetch_load(src);
--- a/vpx_dsp/mips/convolve8_avg_horiz_dspr2.c
+++ b/vpx_dsp/mips/convolve8_avg_horiz_dspr2.c
@@ -938,15 +938,16 @@ static void convolve_avg_horiz_64_dspr2(const uint8_t *src_ptr,

 void vpx_convolve8_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                                   uint8_t *dst, ptrdiff_t dst_stride,
-                                   const int16_t *filter_x, int x_step_q4,
-                                   const int16_t *filter_y, int y_step_q4,
+                                   const InterpKernel *filter, int x0_q4,
+                                   int32_t x_step_q4, int y0_q4, int y_step_q4,
                                   int w, int h) {
+  const int16_t *const filter_x = filter[x0_q4];
  assert(x_step_q4 == 16);
  assert(((const int32_t *)filter_x)[1] != 0x800000);

  if (((const int32_t *)filter_x)[0] == 0) {
-    vpx_convolve2_avg_horiz_dspr2(src, src_stride, dst, dst_stride, filter_x,
-                                  x_step_q4, filter_y, y_step_q4, w, h);
+    vpx_convolve2_avg_horiz_dspr2(src, src_stride, dst, dst_stride, filter,
+                                  x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
  } else {
    uint32_t pos = 38;

@@ -987,9 +988,8 @@ void vpx_convolve8_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                                    h);
        break;
      default:
-        vpx_convolve8_avg_horiz_c(src + 3, src_stride, dst, dst_stride,
-                                  filter_x, x_step_q4, filter_y, y_step_q4, w,
-                                  h);
+        vpx_convolve8_avg_horiz_c(src + 3, src_stride, dst, dst_stride, filter,
+                                  x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
        break;
    }
  }
--- a/vpx_dsp/mips/convolve8_dspr2.c
+++ b/vpx_dsp/mips/convolve8_dspr2.c
@@ -1296,9 +1296,11 @@ void copy_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride,
 }

 void vpx_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
-                         ptrdiff_t dst_stride, const int16_t *filter_x,
-                         int x_step_q4, const int16_t *filter_y, int y_step_q4,
+                         ptrdiff_t dst_stride, const InterpKernel *filter,
+                         int x0_q4, int32_t x_step_q4, int y0_q4, int y_step_q4,
                         int w, int h) {
+  const int16_t *const filter_x = filter[x0_q4];
+  const int16_t *const filter_y = filter[y0_q4];
  DECLARE_ALIGNED(32, uint8_t, temp[64 * 135]);
  int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7;
  uint32_t pos = 38;
@@ -1395,14 +1397,15 @@ void vpx_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,

 void vpx_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
-                             const int16_t *filter_x, int filter_x_stride,
-                             const int16_t *filter_y, int filter_y_stride,
-                             int w, int h) {
+                             const InterpKernel *filter, int x0_q4,
+                             int x_step_q4, int y0_q4, int y_step_q4, int w,
+                             int h) {
  int x, y;
-  (void)filter_x;
-  (void)filter_x_stride;
-  (void)filter_y;
-  (void)filter_y_stride;
+  (void)filter;
+  (void)x0_q4;
+  (void)x_step_q4;
+  (void)y0_q4;
+  (void)y_step_q4;

  /* prefetch data to cache memory */
  prefetch_load(src);
--- a/vpx_dsp/mips/convolve8_horiz_dspr2.c
+++ b/vpx_dsp/mips/convolve8_horiz_dspr2.c
@@ -818,15 +818,16 @@ static void convolve_horiz_64_dspr2(const uint8_t *src_ptr, int32_t src_stride,

 void vpx_convolve8_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                               uint8_t *dst, ptrdiff_t dst_stride,
-                               const int16_t *filter_x, int x_step_q4,
-                               const int16_t *filter_y, int y_step_q4, int w,
+                               const InterpKernel *filter, int x0_q4,
+                               int x_step_q4, int y0_q4, int y_step_q4, int w,
                               int h) {
+  const int16_t *const filter_x = filter[x0_q4];
  assert(x_step_q4 == 16);
  assert(((const int32_t *)filter_x)[1] != 0x800000);

  if (((const int32_t *)filter_x)[0] == 0) {
-    vpx_convolve2_horiz_dspr2(src, src_stride, dst, dst_stride, filter_x,
-                              x_step_q4, filter_y, y_step_q4, w, h);
+    vpx_convolve2_horiz_dspr2(src, src_stride, dst, dst_stride, filter, x0_q4,
+                              x_step_q4, y0_q4, y_step_q4, w, h);
  } else {
    uint32_t pos = 38;

@@ -868,8 +869,8 @@ void vpx_convolve8_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                                (int32_t)dst_stride, filter_x, (int32_t)h);
        break;
      default:
-        vpx_convolve8_horiz_c(src + 3, src_stride, dst, dst_stride, filter_x,
-                              x_step_q4, filter_y, y_step_q4, w, h);
+        vpx_convolve8_horiz_c(src + 3, src_stride, dst, dst_stride, filter,
+                              x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
        break;
    }
  }
--- a/vpx_dsp/mips/convolve8_vert_dspr2.c
+++ b/vpx_dsp/mips/convolve8_vert_dspr2.c
@@ -318,15 +318,16 @@ static void convolve_vert_64_dspr2(const uint8_t *src, int32_t src_stride,

 void vpx_convolve8_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
-                              const int16_t *filter_x, int x_step_q4,
-                              const int16_t *filter_y, int y_step_q4, int w,
+                              const InterpKernel *filter, int x0_q4,
+                              int x_step_q4, int y0_q4, int y_step_q4, int w,
                              int h) {
+  const int16_t *const filter_y = filter[y0_q4];
  assert(y_step_q4 == 16);
  assert(((const int32_t *)filter_y)[1] != 0x800000);

  if (((const int32_t *)filter_y)[0] == 0) {
-    vpx_convolve2_vert_dspr2(src, src_stride, dst, dst_stride, filter_x,
-                             x_step_q4, filter_y, y_step_q4, w, h);
+    vpx_convolve2_vert_dspr2(src, src_stride, dst, dst_stride, filter, x0_q4,
+                             x_step_q4, y0_q4, y_step_q4, w, h);
  } else {
    uint32_t pos = 38;

@@ -349,8 +350,8 @@ void vpx_convolve8_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
        convolve_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y, h);
        break;
      default:
-        vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x,
-                             x_step_q4, filter_y, y_step_q4, w, h);
+        vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+                             x_step_q4, y0_q4, y_step_q4, w, h);
        break;
    }
  }
--- a/vpx_dsp/mips/convolve_common_dspr2.h
+++ b/vpx_dsp/mips/convolve_common_dspr2.h
@@ -24,21 +24,21 @@ extern "C" {
 #if HAVE_DSPR2
 void vpx_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                               uint8_t *dst, ptrdiff_t dst_stride,
-                               const int16_t *filter_x, int x_step_q4,
-                               const int16_t *filter_y, int y_step_q4, int w,
-                               int h);
+                               const InterpKernel *filter, int x0_q4,
+                               int32_t x_step_q4, int y0_q4, int y_step_q4,
+                               int w, int h);

 void vpx_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                                   uint8_t *dst, ptrdiff_t dst_stride,
-                                   const int16_t *filter_x, int x_step_q4,
-                                   const int16_t *filter_y, int y_step_q4,
+                                   const InterpKernel *filter, int x0_q4,
+                                   int32_t x_step_q4, int y0_q4, int y_step_q4,
                                   int w, int h);

 void vpx_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                                  uint8_t *dst, ptrdiff_t dst_stride,
-                                  const int16_t *filter_x, int x_step_q4,
-                                  const int16_t *filter_y, int y_step_q4, int w,
-                                  int h);
+                                  const InterpKernel *filter, int x0_q4,
+                                  int32_t x_step_q4, int y0_q4, int y_step_q4,
+                                  int w, int h);

 void vpx_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                         ptrdiff_t dst_stride, const int16_t *filter, int w,
@@ -46,9 +46,9 @@ void vpx_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,

 void vpx_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
-                              const int16_t *filter_x, int x_step_q4,
-                              const int16_t *filter_y, int y_step_q4, int w,
-                              int h);
+                              const InterpKernel *filter, int x0_q4,
+                              int32_t x_step_q4, int y0_q4, int y_step_q4,
+                              int w, int h);

 #endif  // #if HAVE_DSPR2
 #ifdef __cplusplus
--- a/vpx_dsp/mips/vpx_convolve8_avg_horiz_msa.c
+++ b/vpx_dsp/mips/vpx_convolve8_avg_horiz_msa.c
@@ -633,9 +633,10 @@ static void common_hz_2t_and_aver_dst_64w_msa(const uint8_t *src,

 void vpx_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride,
                                 uint8_t *dst, ptrdiff_t dst_stride,
-                                 const int16_t *filter_x, int x_step_q4,
-                                 const int16_t *filter_y, int y_step_q4, int w,
+                                 const InterpKernel *filter, int x0_q4,
+                                 int x_step_q4, int y0_q4, int y_step_q4, int w,
                                 int h) {
+  const int16_t *const filter_x = filter[x0_q4];
  int8_t cnt, filt_hor[8];

  assert(x_step_q4 == 16);
@@ -668,8 +669,8 @@ void vpx_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride,
                                          (int32_t)dst_stride, &filt_hor[3], h);
        break;
      default:
-        vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x,
-                                  x_step_q4, filter_y, y_step_q4, w, h);
+        vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter,
+                                  x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
        break;
    }
  } else {
@@ -695,8 +696,8 @@ void vpx_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride,
                                          (int32_t)dst_stride, filt_hor, h);
        break;
      default:
-        vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x,
-                                  x_step_q4, filter_y, y_step_q4, w, h);
+        vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter,
+                                  x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
        break;
    }
  }
--- a/vpx_dsp/mips/vpx_convolve8_avg_msa.c
+++ b/vpx_dsp/mips/vpx_convolve8_avg_msa.c
@@ -516,9 +516,10 @@ static void common_hv_2ht_2vt_and_aver_dst_64w_msa(

 void vpx_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
-                           const int16_t *filter_x, int x_step_q4,
-                           const int16_t *filter_y, int y_step_q4, int w,
-                           int h) {
+                           const InterpKernel *filter, int x0_q4, int x_step_q4,
+                           int y0_q4, int y_step_q4, int w, int h) {
+  const int16_t *const filter_x = filter[x0_q4];
+  const int16_t *const filter_y = filter[y0_q4];
  int8_t cnt, filt_hor[8], filt_ver[8];

  assert(x_step_q4 == 16);
@@ -560,14 +561,14 @@ void vpx_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride,
                                               &filt_hor[3], &filt_ver[3], h);
        break;
      default:
-        vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x,
-                            x_step_q4, filter_y, y_step_q4, w, h);
+        vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+                            x_step_q4, y0_q4, y_step_q4, w, h);
        break;
    }
  } else if (((const int32_t *)filter_x)[0] == 0 ||
             ((const int32_t *)filter_y)[0] == 0) {
-    vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
-                        filter_y, y_step_q4, w, h);
+    vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+                        x_step_q4, y0_q4, y_step_q4, w, h);
  } else {
    switch (w) {
      case 4:
@@ -596,8 +597,8 @@ void vpx_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride,
                                               filt_ver, h);
        break;
      default:
-        vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x,
-                            x_step_q4, filter_y, y_step_q4, w, h);
+        vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+                            x_step_q4, y0_q4, y_step_q4, w, h);
        break;
    }
  }
--- a/vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c
+++ b/vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c
@@ -605,9 +605,10 @@ static void common_vt_2t_and_aver_dst_64w_msa(const uint8_t *src,

 void vpx_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride,
                                uint8_t *dst, ptrdiff_t dst_stride,
-                                const int16_t *filter_x, int x_step_q4,
-                                const int16_t *filter_y, int y_step_q4, int w,
+                                const InterpKernel *filter, int x0_q4,
+                                int x_step_q4, int y0_q4, int y_step_q4, int w,
                                int h) {
+  const int16_t *const filter_y = filter[y0_q4];
  int8_t cnt, filt_ver[8];

  assert(y_step_q4 == 16);
@@ -640,8 +641,8 @@ void vpx_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride,
                                          (int32_t)dst_stride, &filt_ver[3], h);
        break;
      default:
-        vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x,
-                                 x_step_q4, filter_y, y_step_q4, w, h);
+        vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter,
+                                 x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
        break;
    }
  } else {
@@ -668,8 +669,8 @@ void vpx_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride,
                                          (int32_t)dst_stride, filt_ver, h);
        break;
      default:
-        vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x,
-                                 x_step_q4, filter_y, y_step_q4, w, h);
+        vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter,
+                                 x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
        break;
    }
  }
--- a/vpx_dsp/mips/vpx_convolve8_horiz_msa.c
+++ b/vpx_dsp/mips/vpx_convolve8_horiz_msa.c
@@ -621,9 +621,10 @@ static void common_hz_2t_64w_msa(const uint8_t *src, int32_t src_stride,

 void vpx_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
-                             const int16_t *filter_x, int x_step_q4,
-                             const int16_t *filter_y, int y_step_q4, int w,
+                             const InterpKernel *filter, int x0_q4,
+                             int x_step_q4, int y0_q4, int y_step_q4, int w,
                             int h) {
+  const int16_t *const filter_x = filter[x0_q4];
  int8_t cnt, filt_hor[8];

  assert(x_step_q4 == 16);
@@ -656,8 +657,8 @@ void vpx_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride,
                             &filt_hor[3], h);
        break;
      default:
-        vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x,
-                              x_step_q4, filter_y, y_step_q4, w, h);
+        vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+                              x_step_q4, y0_q4, y_step_q4, w, h);
        break;
    }
  } else {
@@ -683,8 +684,8 @@ void vpx_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride,
                             filt_hor, h);
        break;
      default:
-        vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x,
-                              x_step_q4, filter_y, y_step_q4, w, h);
+        vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+                              x_step_q4, y0_q4, y_step_q4, w, h);
        break;
    }
  }
--- a/vpx_dsp/mips/vpx_convolve8_msa.c
+++ b/vpx_dsp/mips/vpx_convolve8_msa.c
@@ -541,9 +541,11 @@ static void common_hv_2ht_2vt_64w_msa(const uint8_t *src, int32_t src_stride,
 }

 void vpx_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
-                       ptrdiff_t dst_stride, const int16_t *filter_x,
-                       int32_t x_step_q4, const int16_t *filter_y,
+                       ptrdiff_t dst_stride, const InterpKernel *filter,
+                       int x0_q4, int32_t x_step_q4, int y0_q4,
                       int32_t y_step_q4, int32_t w, int32_t h) {
+  const int16_t *const filter_x = filter[x0_q4];
+  const int16_t *const filter_y = filter[y0_q4];
  int8_t cnt, filt_hor[8], filt_ver[8];

  assert(x_step_q4 == 16);
@@ -585,14 +587,14 @@ void vpx_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                                  &filt_ver[3], (int32_t)h);
        break;
      default:
-        vpx_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
-                        filter_y, y_step_q4, w, h);
+        vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+                        x_step_q4, y0_q4, y_step_q4, w, h);
        break;
    }
  } else if (((const int32_t *)filter_x)[0] == 0 ||
             ((const int32_t *)filter_y)[0] == 0) {
-    vpx_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
-                    filter_y, y_step_q4, w, h);
+    vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4,
+                    y0_q4, y_step_q4, w, h);
  } else {
    switch (w) {
      case 4:
@@ -621,8 +623,8 @@ void vpx_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                                  (int32_t)h);
        break;
      default:
-        vpx_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
-                        filter_y, y_step_q4, w, h);
+        vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+                        x_step_q4, y0_q4, y_step_q4, w, h);
        break;
    }
  }
--- a/vpx_dsp/mips/vpx_convolve8_vert_msa.c
+++ b/vpx_dsp/mips/vpx_convolve8_vert_msa.c
@@ -628,9 +628,10 @@ static void common_vt_2t_64w_msa(const uint8_t *src, int32_t src_stride,

 void vpx_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride,
-                            const int16_t *filter_x, int x_step_q4,
-                            const int16_t *filter_y, int y_step_q4, int w,
+                            const InterpKernel *filter, int x0_q4,
+                            int32_t x_step_q4, int y0_q4, int y_step_q4, int w,
                            int h) {
+  const int16_t *const filter_y = filter[y0_q4];
  int8_t cnt, filt_ver[8];

  assert(y_step_q4 == 16);
@@ -663,8 +664,8 @@ void vpx_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride,
                             &filt_ver[3], h);
        break;
      default:
-        vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x,
-                             x_step_q4, filter_y, y_step_q4, w, h);
+        vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+                             x_step_q4, y0_q4, y_step_q4, w, h);
        break;
    }
  } else {
@@ -690,8 +691,8 @@ void vpx_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride,
                             filt_ver, h);
        break;
      default:
-        vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x,
-                             x_step_q4, filter_y, y_step_q4, w, h);
+        vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+                             x_step_q4, y0_q4, y_step_q4, w, h);
        break;
    }
  }
--- a/vpx_dsp/mips/vpx_convolve_avg_msa.c
+++ b/vpx_dsp/mips/vpx_convolve_avg_msa.c
@@ -189,13 +189,14 @@ static void avg_width64_msa(const uint8_t *src, int32_t src_stride,

 void vpx_convolve_avg_msa(const uint8_t *src, ptrdiff_t src_stride,
                          uint8_t *dst, ptrdiff_t dst_stride,
-                          const int16_t *filter_x, int32_t filter_x_stride,
-                          const int16_t *filter_y, int32_t filter_y_stride,
+                          const InterpKernel *filter, int x0_q4,
+                          int32_t x_step_q4, int y0_q4, int32_t y_step_q4,
                          int32_t w, int32_t h) {
-  (void)filter_x;
-  (void)filter_y;
-  (void)filter_x_stride;
-  (void)filter_y_stride;
+  (void)filter;
+  (void)x0_q4;
+  (void)x_step_q4;
+  (void)y0_q4;
+  (void)y_step_q4;

  switch (w) {
    case 4: {
--- a/vpx_dsp/mips/vpx_convolve_copy_msa.c
+++ b/vpx_dsp/mips/vpx_convolve_copy_msa.c
@@ -199,13 +199,14 @@ static void copy_width64_msa(const uint8_t *src, int32_t src_stride,

 void vpx_convolve_copy_msa(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
-                           const int16_t *filter_x, int32_t filter_x_stride,
-                           const int16_t *filter_y, int32_t filter_y_stride,
+                           const InterpKernel *filter, int x0_q4,
+                           int32_t x_step_q4, int y0_q4, int32_t y_step_q4,
                           int32_t w, int32_t h) {
-  (void)filter_x;
-  (void)filter_y;
-  (void)filter_x_stride;
-  (void)filter_y_stride;
+  (void)filter;
+  (void)x0_q4;
+  (void)x_step_q4;
+  (void)y0_q4;
+  (void)y_step_q4;

  switch (w) {
    case 4: {
--- a/vpx_dsp/ppc/vpx_convolve_vsx.c
+++ b/vpx_dsp/ppc/vpx_convolve_vsx.c
@@ -53,13 +53,13 @@ static inline void copy_w64(const uint8_t *src, ptrdiff_t src_stride,

 void vpx_convolve_copy_vsx(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
-                           const int16_t *filter_x, int32_t filter_x_stride,
-                           const int16_t *filter_y, int32_t filter_y_stride,
-                           int32_t w, int32_t h) {
-  (void)filter_x;
-  (void)filter_y;
-  (void)filter_x_stride;
-  (void)filter_y_stride;
+                           const InterpKernel *filter, int x0_q4, int x_step_q4,
+                           int y0_q4, int32_t y_step_q4, int32_t w, int32_t h) {
+  (void)filter;
+  (void)x0_q4;
+  (void)x_step_q4;
+  (void)y0_q4;
+  (void)y_step_q4;

  switch (w) {
    case 16: {
@@ -132,14 +132,8 @@ static inline void avg_w64(const uint8_t *src, ptrdiff_t src_stride,

 void vpx_convolve_avg_vsx(const uint8_t *src, ptrdiff_t src_stride,
                          uint8_t *dst, ptrdiff_t dst_stride,
-                          const int16_t *filter_x, int32_t filter_x_stride,
-                          const int16_t *filter_y, int32_t filter_y_stride,
-                          int32_t w, int32_t h) {
-  (void)filter_x;
-  (void)filter_y;
-  (void)filter_x_stride;
-  (void)filter_y_stride;
-
+                          const InterpKernel *filter, int x0_q4, int x_step_q4,
+                          int y0_q4, int32_t y_step_q4, int32_t w, int32_t h) {
  switch (w) {
    case 16: {
      avg_w16(src, src_stride, dst, dst_stride, h);
@@ -154,8 +148,8 @@ void vpx_convolve_avg_vsx(const uint8_t *src, ptrdiff_t src_stride,
      break;
    }
    default: {
-      vpx_convolve_avg_c(src, src_stride, dst, dst_stride, filter_x,
-                         filter_x_stride, filter_y, filter_y_stride, w, h);
+      vpx_convolve_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+                         x_step_q4, y0_q4, y_step_q4, w, h);
      break;
    }
  }
@@ -299,9 +293,9 @@ static inline void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride,

 static inline void convolve(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride,
-                            const InterpKernel *const x_filters, int x0_q4,
-                            int x_step_q4, const InterpKernel *const y_filters,
-                            int y0_q4, int y_step_q4, int w, int h) {
+                            const InterpKernel *const filter, int x0_q4,
+                            int x_step_q4, int y0_q4, int y_step_q4, int w,
+                            int h) {
  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
  // 2d filtering proceeds in 2 steps:
  //   (1) Interpolate horizontally into an intermediate buffer, temp.
@@ -324,95 +318,77 @@ static inline void convolve(const uint8_t *src, ptrdiff_t src_stride,
  assert(x_step_q4 <= 32);

  convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, 64,
-                 x_filters, x0_q4, x_step_q4, w, intermediate_height);
-  convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride,
-                y_filters, y0_q4, y_step_q4, w, h);
+                 filter, x0_q4, x_step_q4, w, intermediate_height);
+  convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride, filter,
+                y0_q4, y_step_q4, w, h);
 }

 void vpx_convolve8_horiz_vsx(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
-                             const int16_t *filter_x, int x_step_q4,
-                             const int16_t *filter_y, int y_step_q4, int w,
+                             const InterpKernel *filter, int x0_q4,
+                             int x_step_q4, int y0_q4, int y_step_q4, int w,
                             int h) {
-  const InterpKernel *const filters_x = get_filter_base(filter_x);
-  const int x0_q4 = get_filter_offset(filter_x, filters_x);
-
-  (void)filter_y;
+  (void)y0_q4;
  (void)y_step_q4;

-  convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4,
-                 w, h);
+  convolve_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, w,
+                 h);
 }

 void vpx_convolve8_avg_horiz_vsx(const uint8_t *src, ptrdiff_t src_stride,
                                 uint8_t *dst, ptrdiff_t dst_stride,
-                                 const int16_t *filter_x, int x_step_q4,
-                                 const int16_t *filter_y, int y_step_q4, int w,
+                                 const InterpKernel *filter, int x0_q4,
+                                 int x_step_q4, int y0_q4, int y_step_q4, int w,
                                 int h) {
-  const InterpKernel *const filters_x = get_filter_base(filter_x);
-  const int x0_q4 = get_filter_offset(filter_x, filters_x);
-
-  (void)filter_y;
+  (void)y0_q4;
  (void)y_step_q4;

-  convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4,
-                     x_step_q4, w, h);
+  convolve_avg_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4,
+                     w, h);
 }

 void vpx_convolve8_vert_vsx(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride,
-                            const int16_t *filter_x, int x_step_q4,
-                            const int16_t *filter_y, int y_step_q4, int w,
+                            const InterpKernel *filter, int x0_q4,
+                            int x_step_q4, int y0_q4, int y_step_q4, int w,
                            int h) {
-  const InterpKernel *const filters_y = get_filter_base(filter_y);
-  const int y0_q4 = get_filter_offset(filter_y, filters_y);
-
-  (void)filter_x;
+  (void)x0_q4;
  (void)x_step_q4;

-  convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, y_step_q4,
-                w, h);
+  convolve_vert(src, src_stride, dst, dst_stride, filter, y0_q4, y_step_q4, w,
+                h);
 }

 void vpx_convolve8_avg_vert_vsx(const uint8_t *src, ptrdiff_t src_stride,
                                uint8_t *dst, ptrdiff_t dst_stride,
-                                const int16_t *filter_x, int x_step_q4,
-                                const int16_t *filter_y, int y_step_q4, int w,
+                                const InterpKernel *filter, int x0_q4,
+                                int x_step_q4, int y0_q4, int y_step_q4, int w,
                                int h) {
-  const InterpKernel *const filters_y = get_filter_base(filter_y);
-  const int y0_q4 = get_filter_offset(filter_y, filters_y);
-
-  (void)filter_x;
+  (void)x0_q4;
  (void)x_step_q4;

-  convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4,
-                    y_step_q4, w, h);
+  convolve_avg_vert(src, src_stride, dst, dst_stride, filter, y0_q4, y_step_q4,
+                    w, h);
 }

 void vpx_convolve8_vsx(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
-                       ptrdiff_t dst_stride, const int16_t *filter_x,
-                       int x_step_q4, const int16_t *filter_y, int y_step_q4,
+                       ptrdiff_t dst_stride, const InterpKernel *filter,
+                       int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
                       int w, int h) {
-  const InterpKernel *const filters_x = get_filter_base(filter_x);
-  const int x0_q4 = get_filter_offset(filter_x, filters_x);
-  const InterpKernel *const filters_y = get_filter_base(filter_y);
-  const int y0_q4 = get_filter_offset(filter_y, filters_y);
-
-  convolve(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4,
-           filters_y, y0_q4, y_step_q4, w, h);
+  convolve(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, y0_q4,
+           y_step_q4, w, h);
 }

 void vpx_convolve8_avg_vsx(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
-                           const int16_t *filter_x, int x_step_q4,
-                           const int16_t *filter_y, int y_step_q4, int w,
-                           int h) {
+                           const InterpKernel *filter, int x0_q4, int x_step_q4,
+                           int y0_q4, int y_step_q4, int w, int h) {
  // Fixed size intermediate buffer places limits on parameters.
  DECLARE_ALIGNED(16, uint8_t, temp[64 * 64]);
  assert(w <= 64);
  assert(h <= 64);

-  vpx_convolve8_vsx(src, src_stride, temp, 64, filter_x, x_step_q4, filter_y,
+  vpx_convolve8_vsx(src, src_stride, temp, 64, filter, x0_q4, x_step_q4, y0_q4,
                    y_step_q4, w, h);
-  vpx_convolve_avg_vsx(temp, 64, dst, dst_stride, NULL, 0, NULL, 0, w, h);
+  vpx_convolve_avg_vsx(temp, 64, dst, dst_stride, NULL, 0, 0, 0, 0, w, h);
 }
--- a/vpx_dsp/vpx_convolve.c
+++ b/vpx_dsp/vpx_convolve.c
@@ -114,10 +114,9 @@ static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride,
 }

 static void convolve(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
-                     ptrdiff_t dst_stride, const InterpKernel *const x_filters,
-                     int x0_q4, int x_step_q4,
-                     const InterpKernel *const y_filters, int y0_q4,
-                     int y_step_q4, int w, int h) {
+                     ptrdiff_t dst_stride, const InterpKernel *filter,
+                     int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w,
+                     int h) {
  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
  // 2d filtering proceeds in 2 steps:
  //   (1) Interpolate horizontally into an intermediate buffer, temp.
@@ -140,108 +139,86 @@ static void convolve(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
  assert(x_step_q4 <= 32);

  convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, 64,
-                 x_filters, x0_q4, x_step_q4, w, intermediate_height);
-  convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride,
-                y_filters, y0_q4, y_step_q4, w, h);
+                 filter, x0_q4, x_step_q4, w, intermediate_height);
+  convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride, filter,
+                y0_q4, y_step_q4, w, h);
 }

 void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
-                           const int16_t *filter_x, int x_step_q4,
-                           const int16_t *filter_y, int y_step_q4, int w,
-                           int h) {
-  const InterpKernel *const filters_x = get_filter_base(filter_x);
-  const int x0_q4 = get_filter_offset(filter_x, filters_x);
-
-  (void)filter_y;
+                           const InterpKernel *filter, int x0_q4, int x_step_q4,
+                           int y0_q4, int y_step_q4, int w, int h) {
+  (void)y0_q4;
  (void)y_step_q4;
-
-  convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4,
-                 w, h);
+  convolve_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, w,
+                 h);
 }

 void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
                               uint8_t *dst, ptrdiff_t dst_stride,
-                               const int16_t *filter_x, int x_step_q4,
-                               const int16_t *filter_y, int y_step_q4, int w,
+                               const InterpKernel *filter, int x0_q4,
+                               int x_step_q4, int y0_q4, int y_step_q4, int w,
                               int h) {
-  const InterpKernel *const filters_x = get_filter_base(filter_x);
-  const int x0_q4 = get_filter_offset(filter_x, filters_x);
-
-  (void)filter_y;
+  (void)y0_q4;
  (void)y_step_q4;
-
-  convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4,
-                     x_step_q4, w, h);
+  convolve_avg_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4,
+                     w, h);
 }

 void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
                          uint8_t *dst, ptrdiff_t dst_stride,
-                          const int16_t *filter_x, int x_step_q4,
-                          const int16_t *filter_y, int y_step_q4, int w,
-                          int h) {
-  const InterpKernel *const filters_y = get_filter_base(filter_y);
-  const int y0_q4 = get_filter_offset(filter_y, filters_y);
-
-  (void)filter_x;
+                          const InterpKernel *filter, int x0_q4, int x_step_q4,
+                          int y0_q4, int y_step_q4, int w, int h) {
+  (void)x0_q4;
  (void)x_step_q4;
-
-  convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, y_step_q4,
-                w, h);
+  convolve_vert(src, src_stride, dst, dst_stride, filter, y0_q4, y_step_q4, w,
+                h);
 }

 void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
-                              const int16_t *filter_x, int x_step_q4,
-                              const int16_t *filter_y, int y_step_q4, int w,
+                              const InterpKernel *filter, int x0_q4,
+                              int x_step_q4, int y0_q4, int y_step_q4, int w,
                              int h) {
-  const InterpKernel *const filters_y = get_filter_base(filter_y);
-  const int y0_q4 = get_filter_offset(filter_y, filters_y);
-
-  (void)filter_x;
+  (void)x0_q4;
  (void)x_step_q4;
-
-  convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4,
-                    y_step_q4, w, h);
+  convolve_avg_vert(src, src_stride, dst, dst_stride, filter, y0_q4, y_step_q4,
+                    w, h);
 }

 void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
-                     ptrdiff_t dst_stride, const int16_t *filter_x,
-                     int x_step_q4, const int16_t *filter_y, int y_step_q4,
-                     int w, int h) {
-  const InterpKernel *const filters_x = get_filter_base(filter_x);
-  const int x0_q4 = get_filter_offset(filter_x, filters_x);
-  const InterpKernel *const filters_y = get_filter_base(filter_y);
-  const int y0_q4 = get_filter_offset(filter_y, filters_y);
-
-  convolve(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4,
-           filters_y, y0_q4, y_step_q4, w, h);
+                     ptrdiff_t dst_stride, const InterpKernel *filter,
+                     int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w,
+                     int h) {
+  convolve(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, y0_q4,
+           y_step_q4, w, h);
 }

 void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
-                         ptrdiff_t dst_stride, const int16_t *filter_x,
-                         int x_step_q4, const int16_t *filter_y, int y_step_q4,
+                         ptrdiff_t dst_stride, const InterpKernel *filter,
+                         int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
                         int w, int h) {
  // Fixed size intermediate buffer places limits on parameters.
  DECLARE_ALIGNED(16, uint8_t, temp[64 * 64]);
  assert(w <= 64);
  assert(h <= 64);

-  vpx_convolve8_c(src, src_stride, temp, 64, filter_x, x_step_q4, filter_y,
+  vpx_convolve8_c(src, src_stride, temp, 64, filter, x0_q4, x_step_q4, y0_q4,
                  y_step_q4, w, h);
-  vpx_convolve_avg_c(temp, 64, dst, dst_stride, NULL, 0, NULL, 0, w, h);
+  vpx_convolve_avg_c(temp, 64, dst, dst_stride, NULL, 0, 0, 0, 0, w, h);
 }

 void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
-                         ptrdiff_t dst_stride, const int16_t *filter_x,
-                         int filter_x_stride, const int16_t *filter_y,
-                         int filter_y_stride, int w, int h) {
+                         ptrdiff_t dst_stride, const InterpKernel *filter,
+                         int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
+                         int w, int h) {
  int r;

-  (void)filter_x;
-  (void)filter_x_stride;
-  (void)filter_y;
-  (void)filter_y_stride;
+  (void)filter;
+  (void)x0_q4;
+  (void)x_step_q4;
+  (void)y0_q4;
+  (void)y_step_q4;

  for (r = h; r > 0; --r) {
    memcpy(dst, src, w);
@@ -251,15 +228,16 @@ void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
 }

 void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
-                        ptrdiff_t dst_stride, const int16_t *filter_x,
-                        int filter_x_stride, const int16_t *filter_y,
-                        int filter_y_stride, int w, int h) {
+                        ptrdiff_t dst_stride, const InterpKernel *filter,
+                        int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
+                        int w, int h) {
  int x, y;

-  (void)filter_x;
-  (void)filter_x_stride;
-  (void)filter_y;
-  (void)filter_y_stride;
+  (void)filter;
+  (void)x0_q4;
+  (void)x_step_q4;
+  (void)y0_q4;
+  (void)y_step_q4;

  for (y = 0; y < h; ++y) {
    for (x = 0; x < w; ++x) dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1);
@@ -269,53 +247,52 @@ void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
 }

 void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
-                        ptrdiff_t dst_stride, const int16_t *filter_x,
-                        int x_step_q4, const int16_t *filter_y, int y_step_q4,
+                        ptrdiff_t dst_stride, const InterpKernel *filter,
+                        int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
                        int w, int h) {
-  vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
-                        filter_y, y_step_q4, w, h);
+  vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+                        x_step_q4, y0_q4, y_step_q4, w, h);
 }

 void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
-                       ptrdiff_t dst_stride, const int16_t *filter_x,
-                       int x_step_q4, const int16_t *filter_y, int y_step_q4,
+                       ptrdiff_t dst_stride, const InterpKernel *filter,
+                       int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
                       int w, int h) {
-  vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
-                       filter_y, y_step_q4, w, h);
+  vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+                       x_step_q4, y0_q4, y_step_q4, w, h);
 }

 void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
-                     ptrdiff_t dst_stride, const int16_t *filter_x,
-                     int x_step_q4, const int16_t *filter_y, int y_step_q4,
-                     int w, int h) {
-  vpx_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
-                  filter_y, y_step_q4, w, h);
+                     ptrdiff_t dst_stride, const InterpKernel *filter,
+                     int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w,
+                     int h) {
+  vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4,
+                  y0_q4, y_step_q4, w, h);
 }

 void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride,
-                            const int16_t *filter_x, int x_step_q4,
-                            const int16_t *filter_y, int y_step_q4, int w,
+                            const InterpKernel *filter, int x0_q4,
+                            int x_step_q4, int y0_q4, int y_step_q4, int w,
                            int h) {
-  vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x,
-                            x_step_q4, filter_y, y_step_q4, w, h);
+  vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+                            x_step_q4, y0_q4, y_step_q4, w, h);
 }

 void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
-                           const int16_t *filter_x, int x_step_q4,
-                           const int16_t *filter_y, int y_step_q4, int w,
-                           int h) {
-  vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x,
-                           x_step_q4, filter_y, y_step_q4, w, h);
+                           const InterpKernel *filter, int x0_q4, int x_step_q4,
+                           int y0_q4, int y_step_q4, int w, int h) {
+  vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+                           x_step_q4, y0_q4, y_step_q4, w, h);
 }

 void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
-                         ptrdiff_t dst_stride, const int16_t *filter_x,
-                         int x_step_q4, const int16_t *filter_y, int y_step_q4,
+                         ptrdiff_t dst_stride, const InterpKernel *filter,
+                         int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
                         int w, int h) {
-  vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
-                      filter_y, y_step_q4, w, h);
+  vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+                      x_step_q4, y0_q4, y_step_q4, w, h);
 }

 #if CONFIG_VP9_HIGHBITDEPTH
@@ -417,9 +394,9 @@ static void highbd_convolve_avg_vert(const uint16_t *src, ptrdiff_t src_stride,

 static void highbd_convolve(const uint16_t *src, ptrdiff_t src_stride,
                            uint16_t *dst, ptrdiff_t dst_stride,
-                            const InterpKernel *const x_filters, int x0_q4,
-                            int x_step_q4, const InterpKernel *const y_filters,
-                            int y0_q4, int y_step_q4, int w, int h, int bd) {
+                            const InterpKernel *filter, int x0_q4,
+                            int x_step_q4, int y0_q4, int y_step_q4, int w,
+                            int h, int bd) {
  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
  // 2d filtering proceeds in 2 steps:
  //   (1) Interpolate horizontally into an intermediate buffer, temp.
@@ -442,113 +419,97 @@ static void highbd_convolve(const uint16_t *src, ptrdiff_t src_stride,
  assert(x_step_q4 <= 32);

  highbd_convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride,
-                        temp, 64, x_filters, x0_q4, x_step_q4, w,
+                        temp, 64, filter, x0_q4, x_step_q4, w,
                        intermediate_height, bd);
  highbd_convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride,
-                       y_filters, y0_q4, y_step_q4, w, h, bd);
+                       filter, y0_q4, y_step_q4, w, h, bd);
 }

 void vpx_highbd_convolve8_horiz_c(const uint16_t *src, ptrdiff_t src_stride,
                                  uint16_t *dst, ptrdiff_t dst_stride,
-                                  const int16_t *filter_x, int x_step_q4,
-                                  const int16_t *filter_y, int y_step_q4, int w,
-                                  int h, int bd) {
-  const InterpKernel *const filters_x = get_filter_base(filter_x);
-  const int x0_q4 = get_filter_offset(filter_x, filters_x);
-
-  (void)filter_y;
+                                  const InterpKernel *filter, int x0_q4,
+                                  int x_step_q4, int y0_q4, int y_step_q4,
+                                  int w, int h, int bd) {
+  (void)y0_q4;
  (void)y_step_q4;

-  highbd_convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4,
+  highbd_convolve_horiz(src, src_stride, dst, dst_stride, filter, x0_q4,
                        x_step_q4, w, h, bd);
 }

 void vpx_highbd_convolve8_avg_horiz_c(const uint16_t *src, ptrdiff_t src_stride,
                                      uint16_t *dst, ptrdiff_t dst_stride,
-                                      const int16_t *filter_x, int x_step_q4,
-                                      const int16_t *filter_y, int y_step_q4,
+                                      const InterpKernel *filter, int x0_q4,
+                                      int x_step_q4, int y0_q4, int y_step_q4,
                                      int w, int h, int bd) {
-  const InterpKernel *const filters_x = get_filter_base(filter_x);
-  const int x0_q4 = get_filter_offset(filter_x, filters_x);
-
-  (void)filter_y;
+  (void)y0_q4;
  (void)y_step_q4;

-  highbd_convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4,
+  highbd_convolve_avg_horiz(src, src_stride, dst, dst_stride, filter, x0_q4,
                            x_step_q4, w, h, bd);
 }

 void vpx_highbd_convolve8_vert_c(const uint16_t *src, ptrdiff_t src_stride,
                                 uint16_t *dst, ptrdiff_t dst_stride,
-                                 const int16_t *filter_x, int x_step_q4,
-                                 const int16_t *filter_y, int y_step_q4, int w,
+                                 const InterpKernel *filter, int x0_q4,
+                                 int x_step_q4, int y0_q4, int y_step_q4, int w,
                                 int h, int bd) {
-  const InterpKernel *const filters_y = get_filter_base(filter_y);
-  const int y0_q4 = get_filter_offset(filter_y, filters_y);
-
-  (void)filter_x;
+  (void)x0_q4;
  (void)x_step_q4;

-  highbd_convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4,
+  highbd_convolve_vert(src, src_stride, dst, dst_stride, filter, y0_q4,
                       y_step_q4, w, h, bd);
 }

 void vpx_highbd_convolve8_avg_vert_c(const uint16_t *src, ptrdiff_t src_stride,
                                     uint16_t *dst, ptrdiff_t dst_stride,
-                                     const int16_t *filter_x, int x_step_q4,
-                                     const int16_t *filter_y, int y_step_q4,
+                                     const InterpKernel *filter, int x0_q4,
+                                     int x_step_q4, int y0_q4, int y_step_q4,
                                     int w, int h, int bd) {
-  const InterpKernel *const filters_y = get_filter_base(filter_y);
-  const int y0_q4 = get_filter_offset(filter_y, filters_y);
-
-  (void)filter_x;
+  (void)x0_q4;
  (void)x_step_q4;

-  highbd_convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4,
+  highbd_convolve_avg_vert(src, src_stride, dst, dst_stride, filter, y0_q4,
                           y_step_q4, w, h, bd);
 }

 void vpx_highbd_convolve8_c(const uint16_t *src, ptrdiff_t src_stride,
                            uint16_t *dst, ptrdiff_t dst_stride,
-                            const int16_t *filter_x, int x_step_q4,
-                            const int16_t *filter_y, int y_step_q4, int w,
+                            const InterpKernel *filter, int x0_q4,
+                            int x_step_q4, int y0_q4, int y_step_q4, int w,
                            int h, int bd) {
-  const InterpKernel *const filters_x = get_filter_base(filter_x);
-  const int x0_q4 = get_filter_offset(filter_x, filters_x);
-  const InterpKernel *const filters_y = get_filter_base(filter_y);
-  const int y0_q4 = get_filter_offset(filter_y, filters_y);
-
-  highbd_convolve(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4,
-                  filters_y, y0_q4, y_step_q4, w, h, bd);
+  highbd_convolve(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4,
+                  y0_q4, y_step_q4, w, h, bd);
 }

 void vpx_highbd_convolve8_avg_c(const uint16_t *src, ptrdiff_t src_stride,
                                uint16_t *dst, ptrdiff_t dst_stride,
-                                const int16_t *filter_x, int x_step_q4,
-                                const int16_t *filter_y, int y_step_q4, int w,
+                                const InterpKernel *filter, int x0_q4,
+                                int x_step_q4, int y0_q4, int y_step_q4, int w,
                                int h, int bd) {
  // Fixed size intermediate buffer places limits on parameters.
  DECLARE_ALIGNED(16, uint16_t, temp[64 * 64]);
  assert(w <= 64);
  assert(h <= 64);

-  vpx_highbd_convolve8_c(src, src_stride, temp, 64, filter_x, x_step_q4,
-                         filter_y, y_step_q4, w, h, bd);
-  vpx_highbd_convolve_avg_c(temp, 64, dst, dst_stride, NULL, 0, NULL, 0, w, h,
+  vpx_highbd_convolve8_c(src, src_stride, temp, 64, filter, x0_q4, x_step_q4,
+                         y0_q4, y_step_q4, w, h, bd);
+  vpx_highbd_convolve_avg_c(temp, 64, dst, dst_stride, NULL, 0, 0, 0, 0, w, h,
                            bd);
 }

 void vpx_highbd_convolve_copy_c(const uint16_t *src, ptrdiff_t src_stride,
                                uint16_t *dst, ptrdiff_t dst_stride,
-                                const int16_t *filter_x, int filter_x_stride,
-                                const int16_t *filter_y, int filter_y_stride,
-                                int w, int h, int bd) {
+                                const InterpKernel *filter, int x0_q4,
+                                int x_step_q4, int y0_q4, int y_step_q4, int w,
+                                int h, int bd) {
  int r;

-  (void)filter_x;
-  (void)filter_x_stride;
-  (void)filter_y;
-  (void)filter_y_stride;
+  (void)filter;
+  (void)x0_q4;
+  (void)x_step_q4;
+  (void)y0_q4;
+  (void)y_step_q4;
  (void)bd;

  for (r = h; r > 0; --r) {
@@ -560,15 +521,16 @@ void vpx_highbd_convolve_copy_c(const uint16_t *src, ptrdiff_t src_stride,

 void vpx_highbd_convolve_avg_c(const uint16_t *src, ptrdiff_t src_stride,
                               uint16_t *dst, ptrdiff_t dst_stride,
-                               const int16_t *filter_x, int filter_x_stride,
-                               const int16_t *filter_y, int filter_y_stride,
-                               int w, int h, int bd) {
+                               const InterpKernel *filter, int x0_q4,
+                               int x_step_q4, int y0_q4, int y_step_q4, int w,
+                               int h, int bd) {
  int x, y;

-  (void)filter_x;
-  (void)filter_x_stride;
-  (void)filter_y;
-  (void)filter_y_stride;
+  (void)filter;
+  (void)x0_q4;
+  (void)x_step_q4;
+  (void)y0_q4;
+  (void)y_step_q4;
  (void)bd;

  for (y = 0; y < h; ++y) {
--- a/vpx_dsp/vpx_convolve.h
+++ b/vpx_dsp/vpx_convolve.h
@@ -19,15 +19,15 @@ extern "C" {

 typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
-                              const int16_t *filter_x, int x_step_q4,
-                              const int16_t *filter_y, int y_step_q4, int w,
+                              const InterpKernel *filter, int x0_q4,
+                              int x_step_q4, int y0_q4, int y_step_q4, int w,
                              int h);

 #if CONFIG_VP9_HIGHBITDEPTH
 typedef void (*highbd_convolve_fn_t)(const uint16_t *src, ptrdiff_t src_stride,
                                     uint16_t *dst, ptrdiff_t dst_stride,
-                                     const int16_t *filter_x, int x_step_q4,
-                                     const int16_t *filter_y, int y_step_q4,
+                                     const InterpKernel *filter, int x0_q4,
+                                     int x_step_q4, int y0_q4, int y_step_q4,
                                     int w, int h, int bd);
 #endif

--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -6,6 +6,7 @@ print <<EOF

 #include "vpx/vpx_integer.h"
 #include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/vpx_filter.h"

 EOF
 }
@@ -331,69 +332,69 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
 #
 # Sub Pixel Filters
 #
-add_proto qw/void vpx_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void vpx_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
 specialize qw/vpx_convolve_copy neon dspr2 msa sse2 vsx/;

-add_proto qw/void vpx_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void vpx_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
 specialize qw/vpx_convolve_avg neon dspr2 msa sse2 vsx/;

-add_proto qw/void vpx_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void vpx_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
 specialize qw/vpx_convolve8 sse2 ssse3 avx2 neon dspr2 msa vsx/;

-add_proto qw/void vpx_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void vpx_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
 specialize qw/vpx_convolve8_horiz sse2 ssse3 avx2 neon dspr2 msa vsx/;

-add_proto qw/void vpx_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void vpx_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
 specialize qw/vpx_convolve8_vert sse2 ssse3 avx2 neon dspr2 msa vsx/;

-add_proto qw/void vpx_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void vpx_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
 specialize qw/vpx_convolve8_avg sse2 ssse3 neon dspr2 msa vsx/;

-add_proto qw/void vpx_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void vpx_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
 specialize qw/vpx_convolve8_avg_horiz sse2 ssse3 neon dspr2 msa vsx/;

-add_proto qw/void vpx_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void vpx_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
 specialize qw/vpx_convolve8_avg_vert sse2 ssse3 neon dspr2 msa vsx/;

-add_proto qw/void vpx_scaled_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void vpx_scaled_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
 specialize qw/vpx_scaled_2d ssse3/;

-add_proto qw/void vpx_scaled_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void vpx_scaled_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";

-add_proto qw/void vpx_scaled_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void vpx_scaled_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";

-add_proto qw/void vpx_scaled_avg_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void vpx_scaled_avg_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";

-add_proto qw/void vpx_scaled_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void vpx_scaled_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";

-add_proto qw/void vpx_scaled_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void vpx_scaled_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";

 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
  #
  # Sub Pixel Filters
  #
-  add_proto qw/void vpx_highbd_convolve_copy/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+  add_proto qw/void vpx_highbd_convolve_copy/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps";
  specialize qw/vpx_highbd_convolve_copy sse2 avx2 neon/;

-  add_proto qw/void vpx_highbd_convolve_avg/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+  add_proto qw/void vpx_highbd_convolve_avg/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps";
  specialize qw/vpx_highbd_convolve_avg sse2 avx2 neon/;

-  add_proto qw/void vpx_highbd_convolve8/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+  add_proto qw/void vpx_highbd_convolve8/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps";
  specialize qw/vpx_highbd_convolve8 avx2 neon/, "$sse2_x86_64";

-  add_proto qw/void vpx_highbd_convolve8_horiz/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+  add_proto qw/void vpx_highbd_convolve8_horiz/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps";
  specialize qw/vpx_highbd_convolve8_horiz avx2 neon/, "$sse2_x86_64";

-  add_proto qw/void vpx_highbd_convolve8_vert/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+  add_proto qw/void vpx_highbd_convolve8_vert/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps";
  specialize qw/vpx_highbd_convolve8_vert avx2 neon/, "$sse2_x86_64";

-  add_proto qw/void vpx_highbd_convolve8_avg/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+  add_proto qw/void vpx_highbd_convolve8_avg/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps";
  specialize qw/vpx_highbd_convolve8_avg avx2 neon/, "$sse2_x86_64";

-  add_proto qw/void vpx_highbd_convolve8_avg_horiz/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+  add_proto qw/void vpx_highbd_convolve8_avg_horiz/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps";
  specialize qw/vpx_highbd_convolve8_avg_horiz avx2 neon/, "$sse2_x86_64";

-  add_proto qw/void vpx_highbd_convolve8_avg_vert/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+  add_proto qw/void vpx_highbd_convolve8_avg_vert/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps";
  specialize qw/vpx_highbd_convolve8_avg_vert avx2 neon/, "$sse2_x86_64";
 }  # CONFIG_VP9_HIGHBITDEPTH

--- a/vpx_dsp/vpx_filter.h
+++ b/vpx_dsp/vpx_filter.h
@@ -26,17 +26,6 @@ extern "C" {

 typedef int16_t InterpKernel[SUBPEL_TAPS];

-static INLINE const InterpKernel *get_filter_base(const int16_t *filter) {
-  // NOTE: This assumes that the filter table is 256-byte aligned.
-  // TODO(agrange) Modify to make independent of table alignment.
-  return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF));
-}
-
-static INLINE int get_filter_offset(const int16_t *f,
-                                    const InterpKernel *base) {
-  return (int)((const InterpKernel *)(intptr_t)f - base);
-}
-
 #ifdef __cplusplus
 }  // extern "C"
 #endif
--- a/vpx_dsp/x86/convolve.h
+++ b/vpx_dsp/x86/convolve.h
@@ -20,14 +20,15 @@ typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch,
                                uint8_t *output_ptr, ptrdiff_t out_pitch,
                                uint32_t output_height, const int16_t *filter);

-#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt)         \
+#define FUN_CONV_1D(name, offset, step_q4, dir, src_start, avg, opt)         \
  void vpx_convolve8_##name##_##opt(                                         \
      const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,                \
-      ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,          \
-      const int16_t *filter_y, int y_step_q4, int w, int h) {                \
-    (void)filter_x;                                                          \
+      ptrdiff_t dst_stride, const InterpKernel *filter_kernel, int x0_q4,    \
+      int x_step_q4, int y0_q4, int y_step_q4, int w, int h) {               \
+    const int16_t *filter = filter_kernel[offset];                           \
+    (void)x0_q4;                                                             \
    (void)x_step_q4;                                                         \
-    (void)filter_y;                                                          \
+    (void)y0_q4;                                                             \
    (void)y_step_q4;                                                         \
    assert(filter[3] != 128);                                                \
    assert(step_q4 == 16);                                                   \
@@ -64,32 +65,36 @@ typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch,
    }                                                                        \
  }

-#define FUN_CONV_2D(avg, opt)                                                 \
-  void vpx_convolve8_##avg##opt(                                              \
-      const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,                 \
-      ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,           \
-      const int16_t *filter_y, int y_step_q4, int w, int h) {                 \
-    assert(filter_x[3] != 128);                                               \
-    assert(filter_y[3] != 128);                                               \
-    assert(w <= 64);                                                          \
-    assert(h <= 64);                                                          \
-    assert(x_step_q4 == 16);                                                  \
-    assert(y_step_q4 == 16);                                                  \
-    if (filter_x[0] | filter_x[1] | filter_x[2]) {                            \
-      DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 71]);                          \
-      vpx_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, \
-                                filter_x, x_step_q4, filter_y, y_step_q4, w,  \
-                                h + 7);                                       \
-      vpx_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride,   \
-                                      filter_x, x_step_q4, filter_y,          \
-                                      y_step_q4, w, h);                       \
-    } else {                                                                  \
-      DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 65]);                          \
-      vpx_convolve8_horiz_##opt(src, src_stride, fdata2, 64, filter_x,        \
-                                x_step_q4, filter_y, y_step_q4, w, h + 1);    \
-      vpx_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, filter_x,  \
-                                      x_step_q4, filter_y, y_step_q4, w, h);  \
-    }                                                                         \
+#define FUN_CONV_2D(avg, opt)                                                  \
+  void vpx_convolve8_##avg##opt(                                               \
+      const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,                  \
+      ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4,             \
+      int x_step_q4, int y0_q4, int y_step_q4, int w, int h) {                 \
+    const int16_t *filter_x = filter[x0_q4];                                   \
+    const int16_t *filter_y = filter[y0_q4];                                   \
+    (void)filter_y;                                                            \
+    assert(filter_x[3] != 128);                                                \
+    assert(filter_y[3] != 128);                                                \
+    assert(w <= 64);                                                           \
+    assert(h <= 64);                                                           \
+    assert(x_step_q4 == 16);                                                   \
+    assert(y_step_q4 == 16);                                                   \
+    if (filter_x[0] | filter_x[1] | filter_x[2]) {                             \
+      DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 71]);                           \
+      vpx_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64,  \
+                                filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, \
+                                h + 7);                                        \
+      vpx_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride,    \
+                                      filter, x0_q4, x_step_q4, y0_q4,         \
+                                      y_step_q4, w, h);                        \
+    } else {                                                                   \
+      DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 65]);                           \
+      vpx_convolve8_horiz_##opt(src, src_stride, fdata2, 64, filter, x0_q4,    \
+                                x_step_q4, y0_q4, y_step_q4, w, h + 1);        \
+      vpx_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, filter,     \
+                                      x0_q4, x_step_q4, y0_q4, y_step_q4, w,   \
+                                      h);                                      \
+    }                                                                          \
  }

 #if CONFIG_VP9_HIGHBITDEPTH
@@ -101,95 +106,97 @@ typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr,
                                       unsigned int output_height,
                                       const int16_t *filter, int bd);

-#define HIGH_FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \
-  void vpx_highbd_convolve8_##name##_##opt(                               \
-      const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,           \
-      ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,       \
-      const int16_t *filter_y, int y_step_q4, int w, int h, int bd) {     \
-    if (step_q4 == 16 && filter[3] != 128) {                              \
-      if (filter[0] | filter[1] | filter[2]) {                            \
-        while (w >= 16) {                                                 \
-          vpx_highbd_filter_block1d16_##dir##8_##avg##opt(                \
-              src_start, src_stride, dst, dst_stride, h, filter, bd);     \
-          src += 16;                                                      \
-          dst += 16;                                                      \
-          w -= 16;                                                        \
-        }                                                                 \
-        while (w >= 8) {                                                  \
-          vpx_highbd_filter_block1d8_##dir##8_##avg##opt(                 \
-              src_start, src_stride, dst, dst_stride, h, filter, bd);     \
-          src += 8;                                                       \
-          dst += 8;                                                       \
-          w -= 8;                                                         \
-        }                                                                 \
-        while (w >= 4) {                                                  \
-          vpx_highbd_filter_block1d4_##dir##8_##avg##opt(                 \
-              src_start, src_stride, dst, dst_stride, h, filter, bd);     \
-          src += 4;                                                       \
-          dst += 4;                                                       \
-          w -= 4;                                                         \
-        }                                                                 \
-      } else {                                                            \
-        while (w >= 16) {                                                 \
-          vpx_highbd_filter_block1d16_##dir##2_##avg##opt(                \
-              src, src_stride, dst, dst_stride, h, filter, bd);           \
-          src += 16;                                                      \
-          dst += 16;                                                      \
-          w -= 16;                                                        \
-        }                                                                 \
-        while (w >= 8) {                                                  \
-          vpx_highbd_filter_block1d8_##dir##2_##avg##opt(                 \
-              src, src_stride, dst, dst_stride, h, filter, bd);           \
-          src += 8;                                                       \
-          dst += 8;                                                       \
-          w -= 8;                                                         \
-        }                                                                 \
-        while (w >= 4) {                                                  \
-          vpx_highbd_filter_block1d4_##dir##2_##avg##opt(                 \
-              src, src_stride, dst, dst_stride, h, filter, bd);           \
-          src += 4;                                                       \
-          dst += 4;                                                       \
-          w -= 4;                                                         \
-        }                                                                 \
-      }                                                                   \
-    }                                                                     \
-    if (w) {                                                              \
-      vpx_highbd_convolve8_##name##_c(src, src_stride, dst, dst_stride,   \
-                                      filter_x, x_step_q4, filter_y,      \
-                                      y_step_q4, w, h, bd);               \
-    }                                                                     \
+#define HIGH_FUN_CONV_1D(name, offset, step_q4, dir, src_start, avg, opt)     \
+  void vpx_highbd_convolve8_##name##_##opt(                                   \
+      const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,               \
+      ptrdiff_t dst_stride, const InterpKernel *filter_kernel, int x0_q4,     \
+      int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) {        \
+    const int16_t *filter = filter_kernel[offset];                            \
+    if (step_q4 == 16 && filter[3] != 128) {                                  \
+      if (filter[0] | filter[1] | filter[2]) {                                \
+        while (w >= 16) {                                                     \
+          vpx_highbd_filter_block1d16_##dir##8_##avg##opt(                    \
+              src_start, src_stride, dst, dst_stride, h, filter, bd);         \
+          src += 16;                                                          \
+          dst += 16;                                                          \
+          w -= 16;                                                            \
+        }                                                                     \
+        while (w >= 8) {                                                      \
+          vpx_highbd_filter_block1d8_##dir##8_##avg##opt(                     \
+              src_start, src_stride, dst, dst_stride, h, filter, bd);         \
+          src += 8;                                                           \
+          dst += 8;                                                           \
+          w -= 8;                                                             \
+        }                                                                     \
+        while (w >= 4) {                                                      \
+          vpx_highbd_filter_block1d4_##dir##8_##avg##opt(                     \
+              src_start, src_stride, dst, dst_stride, h, filter, bd);         \
+          src += 4;                                                           \
+          dst += 4;                                                           \
+          w -= 4;                                                             \
+        }                                                                     \
+      } else {                                                                \
+        while (w >= 16) {                                                     \
+          vpx_highbd_filter_block1d16_##dir##2_##avg##opt(                    \
+              src, src_stride, dst, dst_stride, h, filter, bd);               \
+          src += 16;                                                          \
+          dst += 16;                                                          \
+          w -= 16;                                                            \
+        }                                                                     \
+        while (w >= 8) {                                                      \
+          vpx_highbd_filter_block1d8_##dir##2_##avg##opt(                     \
+              src, src_stride, dst, dst_stride, h, filter, bd);               \
+          src += 8;                                                           \
+          dst += 8;                                                           \
+          w -= 8;                                                             \
+        }                                                                     \
+        while (w >= 4) {                                                      \
+          vpx_highbd_filter_block1d4_##dir##2_##avg##opt(                     \
+              src, src_stride, dst, dst_stride, h, filter, bd);               \
+          src += 4;                                                           \
+          dst += 4;                                                           \
+          w -= 4;                                                             \
+        }                                                                     \
+      }                                                                       \
+    }                                                                         \
+    if (w) {                                                                  \
+      vpx_highbd_convolve8_##name##_c(src, src_stride, dst, dst_stride,       \
+                                      filter_kernel, x0_q4, x_step_q4, y0_q4, \
+                                      y_step_q4, w, h, bd);                   \
+    }                                                                         \
  }

-#define HIGH_FUN_CONV_2D(avg, opt)                                            \
-  void vpx_highbd_convolve8_##avg##opt(                                       \
-      const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,               \
-      ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,           \
-      const int16_t *filter_y, int y_step_q4, int w, int h, int bd) {         \
-    assert(w <= 64);                                                          \
-    assert(h <= 64);                                                          \
-    if (x_step_q4 == 16 && y_step_q4 == 16) {                                 \
-      if ((filter_x[0] | filter_x[1] | filter_x[2]) || filter_x[3] == 128) {  \
-        DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71]);                       \
-        vpx_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride,    \
-                                         fdata2, 64, filter_x, x_step_q4,     \
-                                         filter_y, y_step_q4, w, h + 7, bd);  \
-        vpx_highbd_convolve8_##avg##vert_##opt(                               \
-            fdata2 + 192, 64, dst, dst_stride, filter_x, x_step_q4, filter_y, \
-            y_step_q4, w, h, bd);                                             \
-      } else {                                                                \
-        DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 65]);                       \
-        vpx_highbd_convolve8_horiz_##opt(src, src_stride, fdata2, 64,         \
-                                         filter_x, x_step_q4, filter_y,       \
-                                         y_step_q4, w, h + 1, bd);            \
-        vpx_highbd_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride,   \
-                                               filter_x, x_step_q4, filter_y, \
-                                               y_step_q4, w, h, bd);          \
-      }                                                                       \
-    } else {                                                                  \
-      vpx_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride,         \
-                                    filter_x, x_step_q4, filter_y, y_step_q4, \
-                                    w, h, bd);                                \
-    }                                                                         \
+#define HIGH_FUN_CONV_2D(avg, opt)                                             \
+  void vpx_highbd_convolve8_##avg##opt(                                        \
+      const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,                \
+      ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4,             \
+      int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) {         \
+    const int16_t *filter_x = filter[x0_q4];                                   \
+    assert(w <= 64);                                                           \
+    assert(h <= 64);                                                           \
+    if (x_step_q4 == 16 && y_step_q4 == 16) {                                  \
+      if ((filter_x[0] | filter_x[1] | filter_x[2]) || filter_x[3] == 128) {   \
+        DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71]);                        \
+        vpx_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride,     \
+                                         fdata2, 64, filter, x0_q4, x_step_q4, \
+                                         y0_q4, y_step_q4, w, h + 7, bd);      \
+        vpx_highbd_convolve8_##avg##vert_##opt(                                \
+            fdata2 + 192, 64, dst, dst_stride, filter, x0_q4, x_step_q4,       \
+            y0_q4, y_step_q4, w, h, bd);                                       \
+      } else {                                                                 \
+        DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 65]);                        \
+        vpx_highbd_convolve8_horiz_##opt(src, src_stride, fdata2, 64, filter,  \
+                                         x0_q4, x_step_q4, y0_q4, y_step_q4,   \
+                                         w, h + 1, bd);                        \
+        vpx_highbd_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride,    \
+                                               filter, x0_q4, x_step_q4,       \
+                                               y0_q4, y_step_q4, w, h, bd);    \
+      }                                                                        \
+    } else {                                                                   \
+      vpx_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride, filter,  \
+                                    x0_q4, x_step_q4, y0_q4, y_step_q4, w, h,  \
+                                    bd);                                       \
+    }                                                                          \
  }
 #endif  // CONFIG_VP9_HIGHBITDEPTH

--- a/vpx_dsp/x86/highbd_convolve_avx2.c
+++ b/vpx_dsp/x86/highbd_convolve_avx2.c
@@ -18,13 +18,14 @@

 void vpx_highbd_convolve_copy_avx2(const uint16_t *src, ptrdiff_t src_stride,
                                   uint16_t *dst, ptrdiff_t dst_stride,
-                                   const int16_t *filter_x, int filter_x_stride,
-                                   const int16_t *filter_y, int filter_y_stride,
+                                   const InterpKernel *filter, int x0_q4,
+                                   int x_step_q4, int y0_q4, int y_step_q4,
                                   int width, int h, int bd) {
-  (void)filter_x;
-  (void)filter_y;
-  (void)filter_x_stride;
-  (void)filter_y_stride;
+  (void)filter;
+  (void)x0_q4;
+  (void)x_step_q4;
+  (void)y0_q4;
+  (void)y_step_q4;
  (void)bd;

  assert(width % 4 == 0);
@@ -99,13 +100,14 @@ void vpx_highbd_convolve_copy_avx2(const uint16_t *src, ptrdiff_t src_stride,

 void vpx_highbd_convolve_avg_avx2(const uint16_t *src, ptrdiff_t src_stride,
                                  uint16_t *dst, ptrdiff_t dst_stride,
-                                  const int16_t *filter_x, int filter_x_stride,
-                                  const int16_t *filter_y, int filter_y_stride,
+                                  const InterpKernel *filter, int x0_q4,
+                                  int x_step_q4, int y0_q4, int y_step_q4,
                                  int width, int h, int bd) {
-  (void)filter_x;
-  (void)filter_y;
-  (void)filter_x_stride;
-  (void)filter_y_stride;
+  (void)filter;
+  (void)x0_q4;
+  (void)x_step_q4;
+  (void)y0_q4;
+  (void)y_step_q4;
  (void)bd;

  assert(width % 4 == 0);
@@ -1073,8 +1075,8 @@ void vpx_highbd_filter_block1d4_v2_sse2(const uint16_t *, ptrdiff_t, uint16_t *,
 #define vpx_highbd_filter_block1d4_v8_avx2 vpx_highbd_filter_block1d4_v8_sse2
 #define vpx_highbd_filter_block1d4_v2_avx2 vpx_highbd_filter_block1d4_v2_sse2

-HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2);
-HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2);
+HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2);
+HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , avx2);
 HIGH_FUN_CONV_2D(, avx2);

 void vpx_highbd_filter_block1d4_h8_avg_sse2(const uint16_t *, ptrdiff_t,
@@ -1098,8 +1100,8 @@ void vpx_highbd_filter_block1d4_v2_avg_sse2(const uint16_t *, ptrdiff_t,
 #define vpx_highbd_filter_block1d4_v2_avg_avx2 \
  vpx_highbd_filter_block1d4_v2_avg_sse2

-HIGH_FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, avx2);
-HIGH_FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,
+HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2);
+HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_,
                 avx2);
 HIGH_FUN_CONV_2D(avg_, avx2);

--- a/vpx_dsp/x86/vpx_asm_stubs.c
+++ b/vpx_dsp/x86/vpx_asm_stubs.c
@@ -41,38 +41,38 @@ filter8_1dfunction vpx_filter_block1d4_h2_avg_sse2;

 // void vpx_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
 //                               uint8_t *dst, ptrdiff_t dst_stride,
-//                               const int16_t *filter_x, int x_step_q4,
-//                               const int16_t *filter_y, int y_step_q4,
+//                               const InterpKernel *filter, int x0_q4,
+//                               int x_step_q4, int y0_q4, int y_step_q4,
 //                               int w, int h);
 // void vpx_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
 //                              uint8_t *dst, ptrdiff_t dst_stride,
-//                              const int16_t *filter_x, int x_step_q4,
-//                              const int16_t *filter_y, int y_step_q4,
+//                              const InterpKernel *filter, int x0_q4,
+//                              int x_step_q4, int y0_q4, int y_step_q4,
 //                              int w, int h);
 // void vpx_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
 //                                   uint8_t *dst, ptrdiff_t dst_stride,
-//                                   const int16_t *filter_x, int x_step_q4,
-//                                   const int16_t *filter_y, int y_step_q4,
-//                                   int w, int h);
+//                                   const InterpKernel *filter, int x0_q4,
+//                                   int x_step_q4, int y0_q4,
+//                                   int y_step_q4, int w, int h);
 // void vpx_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
 //                                  uint8_t *dst, ptrdiff_t dst_stride,
-//                                  const int16_t *filter_x, int x_step_q4,
-//                                  const int16_t *filter_y, int y_step_q4,
+//                                  const InterpKernel *filter, int x0_q4,
+//                                  int x_step_q4, int y0_q4, int y_step_q4,
 //                                  int w, int h);
-FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2);
-FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2);
-FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2);
-FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, sse2);
+FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2);
+FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , sse2);
+FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, sse2);
+FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_, sse2);

 // void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
 //                         uint8_t *dst, ptrdiff_t dst_stride,
-//                         const int16_t *filter_x, int x_step_q4,
-//                         const int16_t *filter_y, int y_step_q4,
+//                         const InterpKernel *filter, int x0_q4,
+//                         int x_step_q4, int y0_q4, int y_step_q4,
 //                         int w, int h);
 // void vpx_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride,
 //                             uint8_t *dst, ptrdiff_t dst_stride,
-//                             const int16_t *filter_x, int x_step_q4,
-//                             const int16_t *filter_y, int y_step_q4,
+//                             const InterpKernel *filter, int x0_q4,
+//                             int x_step_q4, int y0_q4, int y_step_q4,
 //                             int w, int h);
 FUN_CONV_2D(, sse2);
 FUN_CONV_2D(avg_, sse2);
@@ -140,22 +140,22 @@ highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_avg_sse2;
 //                                         const int16_t *filter_y,
 //                                         int y_step_q4,
 //                                         int w, int h, int bd);
-HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2);
-HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2);
-HIGH_FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2);
-HIGH_FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,
+HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2);
+HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , sse2);
+HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, sse2);
+HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_,
                 sse2);

 // void vpx_highbd_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
 //                                uint8_t *dst, ptrdiff_t dst_stride,
-//                                const int16_t *filter_x, int x_step_q4,
-//                                const int16_t *filter_y, int y_step_q4,
+//                                const InterpKernel *filter, int x0_q4,
+//                                int x_step_q4, int y0_q4, int y_step_q4,
 //                                int w, int h, int bd);
 // void vpx_highbd_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride,
 //                                    uint8_t *dst, ptrdiff_t dst_stride,
-//                                    const int16_t *filter_x, int x_step_q4,
-//                                    const int16_t *filter_y, int y_step_q4,
-//                                    int w, int h, int bd);
+//                                    const InterpKernel *filter, int x0_q4,
+//                                    int x_step_q4, int y0_q4,
+//                                    int y_step_q4, int w, int h, int bd);
 HIGH_FUN_CONV_2D(, sse2);
 HIGH_FUN_CONV_2D(avg_, sse2);
 #endif  // CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64
--- a/vpx_dsp/x86/vpx_convolve_copy_sse2.asm
+++ b/vpx_dsp/x86/vpx_convolve_copy_sse2.asm
@@ -20,14 +20,14 @@ SECTION .text
 %endif
 %ifidn %2, highbd
 %define pavg pavgw
-cglobal %2_convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \
+cglobal %2_convolve_%1, 4, 8, 4+AUX_XMM_REGS, src, src_stride, \
                                              dst, dst_stride, \
-                                              fx, fxs, fy, fys, w, h, bd
+                                              f, fxo, fxs, fyo, fys, w, h, bd
 %else
 %define pavg pavgb
-cglobal convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \
+cglobal convolve_%1, 4, 8, 4+AUX_XMM_REGS, src, src_stride, \
                                           dst, dst_stride, \
-                                           fx, fxs, fy, fys, w, h
+                                           f, fxo, fxs, fyo, fys, w, h
 %endif
  mov r4d, dword wm
 %ifidn %2, highbd
--- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
+++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
@@ -554,21 +554,21 @@ filter8_1dfunction vpx_filter_block1d4_h2_ssse3;
 #define vpx_filter_block1d4_h2_avx2 vpx_filter_block1d4_h2_ssse3
 // void vpx_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride,
 //                                uint8_t *dst, ptrdiff_t dst_stride,
-//                                const int16_t *filter_x, int x_step_q4,
-//                                const int16_t *filter_y, int y_step_q4,
+//                                const InterpKernel *filter, int x0_q4,
+//                                int x_step_q4, int y0_q4, int y_step_q4,
 //                                int w, int h);
 // void vpx_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride,
 //                               uint8_t *dst, ptrdiff_t dst_stride,
-//                               const int16_t *filter_x, int x_step_q4,
-//                               const int16_t *filter_y, int y_step_q4,
+//                               const InterpKernel *filter, int x0_q4,
+//                               int x_step_q4, int y0_q4, int y_step_q4,
 //                               int w, int h);
-FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2);
-FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2);
+FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2);
+FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , avx2);

 // void vpx_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride,
 //                          uint8_t *dst, ptrdiff_t dst_stride,
-//                          const int16_t *filter_x, int x_step_q4,
-//                          const int16_t *filter_y, int y_step_q4,
+//                          const InterpKernel *filter, int x0_q4,
+//                          int x_step_q4, int y0_q4, int y_step_q4,
 //                          int w, int h);
 FUN_CONV_2D(, avx2);
 #endif  // HAVE_AX2 && HAVE_SSSE3
--- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
+++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
@@ -306,29 +306,28 @@ filter8_1dfunction vpx_filter_block1d4_h2_avg_ssse3;

 // void vpx_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
 //                                uint8_t *dst, ptrdiff_t dst_stride,
-//                                const int16_t *filter_x, int x_step_q4,
-//                                const int16_t *filter_y, int y_step_q4,
+//                                const InterpKernel *filter, int x0_q4,
+//                                int x_step_q4, int y0_q4, int y_step_q4,
 //                                int w, int h);
 // void vpx_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
 //                               uint8_t *dst, ptrdiff_t dst_stride,
-//                               const int16_t *filter_x, int x_step_q4,
-//                               const int16_t *filter_y, int y_step_q4,
+//                               const InterpKernel *filter, int x0_q4,
+//                               int x_step_q4, int y0_q4, int y_step_q4,
 //                               int w, int h);
 // void vpx_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
 //                                    uint8_t *dst, ptrdiff_t dst_stride,
-//                                    const int16_t *filter_x, int x_step_q4,
-//                                    const int16_t *filter_y, int y_step_q4,
-//                                    int w, int h);
+//                                    const InterpKernel *filter, int x0_q4,
+//                                    int x_step_q4, int y0_q4,
+//                                    int y_step_q4, int w, int h);
 // void vpx_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
 //                                   uint8_t *dst, ptrdiff_t dst_stride,
-//                                   const int16_t *filter_x, int x_step_q4,
-//                                   const int16_t *filter_y, int y_step_q4,
-//                                   int w, int h);
-FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3);
-FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3);
-FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3);
-FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,
-            ssse3);
+//                                   const InterpKernel *filter, int x0_q4,
+//                                   int x_step_q4, int y0_q4,
+//                                   int y_step_q4, int w, int h);
+FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , ssse3);
+FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , ssse3);
+FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, ssse3);
+FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_, ssse3);

 #define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                      out2, out3, out4, out5, out6, out7)                 \
@@ -813,9 +812,9 @@ static void scaledconvolve_vert_w16(const uint8_t *src, ptrdiff_t src_stride,

 static void scaledconvolve2d(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
-                             const InterpKernel *const x_filters, int x0_q4,
-                             int x_step_q4, const InterpKernel *const y_filters,
-                             int y0_q4, int y_step_q4, int w, int h) {
+                             const InterpKernel *const filter, int x0_q4,
+                             int x_step_q4, int y0_q4, int y_step_q4, int w,
+                             int h) {
  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
  // 2d filtering proceeds in 2 steps:
  //   (1) Interpolate horizontally into an intermediate buffer, temp.
@@ -840,49 +839,43 @@ static void scaledconvolve2d(const uint8_t *src, ptrdiff_t src_stride,

  if (w >= 8) {
    scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1),
-                            src_stride, temp, 64, x_filters, x0_q4, x_step_q4,
-                            w, intermediate_height);
+                            src_stride, temp, 64, filter, x0_q4, x_step_q4, w,
+                            intermediate_height);
  } else {
    scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1),
-                            src_stride, temp, 64, x_filters, x0_q4, x_step_q4,
-                            w, intermediate_height);
+                            src_stride, temp, 64, filter, x0_q4, x_step_q4, w,
+                            intermediate_height);
  }

  if (w >= 16) {
    scaledconvolve_vert_w16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
-                            dst_stride, y_filters, y0_q4, y_step_q4, w, h);
+                            dst_stride, filter, y0_q4, y_step_q4, w, h);
  } else if (w == 8) {
    scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
-                           dst_stride, y_filters, y0_q4, y_step_q4, w, h);
+                           dst_stride, filter, y0_q4, y_step_q4, w, h);
  } else {
    scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
-                           dst_stride, y_filters, y0_q4, y_step_q4, w, h);
+                           dst_stride, filter, y0_q4, y_step_q4, w, h);
  }
 }

 void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
-                         ptrdiff_t dst_stride, const int16_t *filter_x,
-                         int x_step_q4, const int16_t *filter_y, int y_step_q4,
+                         ptrdiff_t dst_stride, const InterpKernel *filter,
+                         int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
                         int w, int h) {
-  const InterpKernel *const filters_x = get_filter_base(filter_x);
-  const int x0_q4 = get_filter_offset(filter_x, filters_x);
-
-  const InterpKernel *const filters_y = get_filter_base(filter_y);
-  const int y0_q4 = get_filter_offset(filter_y, filters_y);
-
-  scaledconvolve2d(src, src_stride, dst, dst_stride, filters_x, x0_q4,
-                   x_step_q4, filters_y, y0_q4, y_step_q4, w, h);
+  scaledconvolve2d(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4,
+                   y0_q4, y_step_q4, w, h);
 }

 // void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride,
 //                          uint8_t *dst, ptrdiff_t dst_stride,
-//                          const int16_t *filter_x, int x_step_q4,
-//                          const int16_t *filter_y, int y_step_q4,
+//                          const InterpKernel *filter, int x0_q4,
+//                          int x_step_q4, int y0_q4, int y_step_q4,
 //                          int w, int h);
 // void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride,
 //                              uint8_t *dst, ptrdiff_t dst_stride,
-//                              const int16_t *filter_x, int x_step_q4,
-//                              const int16_t *filter_y, int y_step_q4,
+//                              const InterpKernel *filter, int x0_q4,
+//                              int x_step_q4, int y0_q4, int y_step_q4,
 //                              int w, int h);
 FUN_CONV_2D(, ssse3);
 FUN_CONV_2D(avg_, ssse3);