Merge branch 'master' into nextgenv2

Change-Id: I6f8b540854ddc78fc4a2a8045b194a888749a3cb
Yaowu Xu
2015-12-09 08:06:25 -08:00
17 changed files with 600 additions and 66 deletions


@@ -1,5 +1,4 @@
Adrian Grange <agrange@google.com>
Adrian Grange <agrange@google.com> <agrange@agrange-macbookpro.roam.corp.google.com>
Aex Converse <aconverse@google.com>
Aex Converse <aconverse@google.com> <alex.converse@gmail.com>
Alexis Ballier <aballier@gentoo.org> <alexis.ballier@gmail.com>
@@ -8,13 +7,11 @@ Deb Mukherjee <debargha@google.com>
Erik Niemeyer <erik.a.niemeyer@intel.com> <erik.a.niemeyer@gmail.com>
Guillaume Martres <gmartres@google.com> <smarter3@gmail.com>
Hangyu Kuang <hkuang@google.com>
Hangyu Kuang <hkuang@google.com> <hkuang@hkuang-macbookpro.roam.corp.google.com>
Hui Su <huisu@google.com>
Jacky Chen <jackychen@google.com>
Jim Bankoski <jimbankoski@google.com>
Johann Koenig <johannkoenig@google.com>
Johann Koenig <johannkoenig@google.com> <johann.koenig@duck.com>
Johann Koenig <johannkoenig@google.com> <johannkoenig@dhcp-172-19-7-52.mtv.corp.google.com>
Johann Koenig <johannkoenig@google.com> <johann.koenig@gmail.com>
John Koleszar <jkoleszar@google.com>
Joshua Litt <joshualitt@google.com> <joshualitt@chromium.org>
@@ -33,4 +30,3 @@ Timothy B. Terriberry <tterribe@xiph.org> Tim Terriberry <tterriberry@mozilla.co
Tom Finegan <tomfinegan@google.com>
Tom Finegan <tomfinegan@google.com> <tomfinegan@chromium.org>
Yaowu Xu <yaowu@google.com> <yaowu@xuyaowu.com>
Yaowu Xu <yaowu@google.com> <yaowu@YAOWU2-W.ad.corp.google.com>


@@ -92,10 +92,9 @@ endif
## shared library builds don't make these functions accessible.
##
ifeq ($(CONFIG_SHARED),)
LIBVPX_TEST_SRCS-$(CONFIG_VP9) += lpf_8_test.cc
## VP8
ifneq ($(CONFIG_VP8_ENCODER)$(CONFIG_VP8_DECODER),)
ifeq ($(CONFIG_VP8),yes)
# These tests require both the encoder and decoder to be built.
ifeq ($(CONFIG_VP8_ENCODER)$(CONFIG_VP8_DECODER),yesyes)
@@ -105,10 +104,10 @@ endif
LIBVPX_TEST_SRCS-$(CONFIG_POSTPROC) += pp_filter_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP8_DECODER) += vp8_decrypt_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += quantize_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += set_roi.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += variance_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += vp8_fdct4x4_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += quantize_test.cc
LIBVPX_TEST_SRCS-yes += idct_test.cc
LIBVPX_TEST_SRCS-yes += sixtap_predict_test.cc
@@ -121,7 +120,7 @@ endif
endif # VP8
## VP9
ifneq ($(CONFIG_VP9_ENCODER)$(CONFIG_VP9_DECODER),)
ifeq ($(CONFIG_VP9),yes)
# These tests require both the encoder and decoder to be built.
ifeq ($(CONFIG_VP9_ENCODER)$(CONFIG_VP9_DECODER),yesyes)
@@ -134,25 +133,25 @@ LIBVPX_TEST_SRCS-yes += vp9_boolcoder_test.cc
LIBVPX_TEST_SRCS-yes += vp9_encoder_parms_get_to_decoder.cc
endif
LIBVPX_TEST_SRCS-$(CONFIG_VP9) += convolve_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += vp9_thread_test.cc
LIBVPX_TEST_SRCS-yes += convolve_test.cc
LIBVPX_TEST_SRCS-yes += lpf_8_test.cc
LIBVPX_TEST_SRCS-yes += vp9_intrapred_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += vp9_decrypt_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += vp9_thread_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += dct16x16_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += dct32x32_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct4x4_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct8x8_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += variance_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_subtract_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_avg_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_error_block_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_quantize_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9) += vp9_intrapred_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_subtract_test.cc
ifeq ($(CONFIG_VP9_ENCODER),yes)
LIBVPX_TEST_SRCS-$(CONFIG_SPATIAL_SVC) += svc_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_INTERNAL_STATS) += blockiness_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_INTERNAL_STATS) += consistency_test.cc
endif
ifeq ($(CONFIG_VP9_ENCODER)$(CONFIG_VP9_TEMPORAL_DENOISING),yesyes)
@@ -162,20 +161,25 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_arf_freq_test.cc
endif # VP9
LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += sad_test.cc
TEST_INTRA_PRED_SPEED_SRCS-$(CONFIG_VP9) := test_intra_pred_speed.cc
TEST_INTRA_PRED_SPEED_SRCS-$(CONFIG_VP9) += ../md5_utils.h ../md5_utils.c
## VP10
ifeq ($(CONFIG_VP10),yes)
LIBVPX_TEST_SRCS-yes += vp10_inv_txfm_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += vp10_dct_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP10) += vp10_inv_txfm_test.cc
endif # VP10
## Multi-codec / unconditional whitebox tests.
LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += sad_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP10) += vp10_txfm_test.h
LIBVPX_TEST_SRCS-$(CONFIG_VP10) += vp10_fwd_txfm1d_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP10) += vp10_inv_txfm1d_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP10) += vp10_fwd_txfm2d_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP10) += vp10_inv_txfm2d_test.cc
TEST_INTRA_PRED_SPEED_SRCS-yes := test_intra_pred_speed.cc
TEST_INTRA_PRED_SPEED_SRCS-yes += ../md5_utils.h ../md5_utils.c
endif # CONFIG_SHARED
include $(SRC_PATH_BARE)/test/test-data.mk


@@ -187,15 +187,10 @@ INTRA_PRED_TEST(C, TestIntraPred4, vpx_dc_predictor_4x4_c,
vpx_d153_predictor_4x4_c, vpx_d207_predictor_4x4_c,
vpx_d63_predictor_4x4_c, vpx_tm_predictor_4x4_c)
#if HAVE_SSE && CONFIG_USE_X86INC
INTRA_PRED_TEST(SSE, TestIntraPred4, vpx_dc_predictor_4x4_sse,
vpx_dc_left_predictor_4x4_sse, vpx_dc_top_predictor_4x4_sse,
vpx_dc_128_predictor_4x4_sse, vpx_v_predictor_4x4_sse, NULL,
NULL, NULL, NULL, NULL, NULL, NULL, NULL)
#endif // HAVE_SSE && CONFIG_USE_X86INC
#if HAVE_SSE2 && CONFIG_USE_X86INC
INTRA_PRED_TEST(SSE2, TestIntraPred4, NULL, NULL, NULL, NULL, NULL,
INTRA_PRED_TEST(SSE2, TestIntraPred4, vpx_dc_predictor_4x4_sse2,
vpx_dc_left_predictor_4x4_sse2, vpx_dc_top_predictor_4x4_sse2,
vpx_dc_128_predictor_4x4_sse2, vpx_v_predictor_4x4_sse2,
vpx_h_predictor_4x4_sse2, NULL, NULL, NULL, NULL, NULL, NULL,
vpx_tm_predictor_4x4_sse2)
#endif // HAVE_SSE2 && CONFIG_USE_X86INC


@@ -372,7 +372,10 @@ INSTANTIATE_TEST_CASE_P(
::testing::Values(
make_tuple(16, 16, 0, 8, &vp9_avg_8x8_neon),
make_tuple(16, 16, 5, 8, &vp9_avg_8x8_neon),
make_tuple(32, 32, 15, 8, &vp9_avg_8x8_neon)));
make_tuple(32, 32, 15, 8, &vp9_avg_8x8_neon),
make_tuple(16, 16, 0, 4, &vp9_avg_4x4_neon),
make_tuple(16, 16, 5, 4, &vp9_avg_4x4_neon),
make_tuple(32, 32, 15, 4, &vp9_avg_4x4_neon)));
INSTANTIATE_TEST_CASE_P(
NEON, IntProRowTest, ::testing::Values(


@@ -6,7 +6,7 @@ cat <<EOF
# This file is automatically generated from the git commit history
# by tools/gen_authors.sh.
$(git log --pretty=format:"%aN <%aE>" | sort | uniq)
$(git log --pretty=format:"%aN <%aE>" | sort | uniq | grep -v corp.google)
Google Inc.
The Mozilla Foundation
The Xiph.Org Foundation


@@ -141,8 +141,8 @@ void vp8_loop_filter_frame_init(VP8_COMMON *cm,
else /* Delta Value */
{
lvl_seg += mbd->segment_feature_data[MB_LVL_ALT_LF][seg];
lvl_seg = (lvl_seg > 0) ? ((lvl_seg > 63) ? 63: lvl_seg) : 0;
}
lvl_seg = (lvl_seg > 0) ? ((lvl_seg > 63) ? 63: lvl_seg) : 0;
}
if (!mbd->mode_ref_lf_delta_enabled)


@@ -73,10 +73,9 @@ void vp8_mb_init_dequantizer(VP8D_COMP *pbi, MACROBLOCKD *xd)
/* Delta Value */
else
{
QIndex = pc->base_qindex + xd->segment_feature_data[MB_LVL_ALT_Q][mbmi->segment_id];
QIndex = (QIndex >= 0) ? ((QIndex <= MAXQ) ? QIndex : MAXQ) : 0; /* Clamp to valid range */
}
QIndex = (QIndex >= 0) ? ((QIndex <= MAXQ) ? QIndex : MAXQ) : 0; /* Clamp to valid range */
}
else
QIndex = pc->base_qindex;
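Both this hunk and the loop-filter hunk above make the same fix: the clamp moves out of the delta branch, so the value is clamped whether the segment feature is an absolute value or a delta. The pattern in isolation, as a C sketch with hypothetical names:

/* Hypothetical condensation of the two fixes: clamp after both branches. */
static int segment_value(int base, int feature, int is_delta, int max_val) {
  int v = is_delta ? base + feature : feature;
  if (v < 0) v = 0;            /* previously only the delta branch clamped */
  else if (v > max_val) v = max_val;
  return v;
}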


@@ -174,6 +174,9 @@ void vp9_idct32x32_add(const tran_low_t *input, uint8_t *dest, int stride,
else if (eob <= 34)
// non-zero coeff only in upper-left 8x8
vpx_idct32x32_34_add(input, dest, stride);
else if (eob <= 135)
// non-zero coeff only in upper-left 16x16
vpx_idct32x32_135_add(input, dest, stride);
else
vpx_idct32x32_1024_add(input, dest, stride);
}
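Here eob is the index of the last nonzero coefficient in scan order; the first 34 positions of the 32x32 scan fall inside the upper-left 8x8 and, per the comment above, the first 135 inside the upper-left 16x16. A self-contained sketch of the dispatch (hypothetical names; the eob == 1 DC-only branch sits just above this hunk):

/* Pick the cheapest 32x32 inverse transform that still covers all
 * nonzero coefficients (sketch; names are hypothetical). */
enum idct32_variant { IDCT32_DC, IDCT32_34, IDCT32_135, IDCT32_1024 };

static enum idct32_variant pick_idct32(int eob) {
  if (eob == 1) return IDCT32_DC;     /* DC-only block */
  if (eob <= 34) return IDCT32_34;    /* nonzeros within upper-left 8x8 */
  if (eob <= 135) return IDCT32_135;  /* nonzeros within upper-left 16x16 */
  return IDCT32_1024;                 /* general case */
}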


@@ -198,7 +198,7 @@ add_proto qw/unsigned int vp9_avg_8x8/, "const uint8_t *, int p";
specialize qw/vp9_avg_8x8 sse2 neon msa/;
add_proto qw/unsigned int vp9_avg_4x4/, "const uint8_t *, int p";
specialize qw/vp9_avg_4x4 sse2 msa/;
specialize qw/vp9_avg_4x4 sse2 neon msa/;
add_proto qw/void vp9_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
specialize qw/vp9_minmax_8x8 sse2/;


@@ -24,6 +24,18 @@ static INLINE unsigned int horizontal_add_u16x8(const uint16x8_t v_16x8) {
return vget_lane_u32(c, 0);
}
unsigned int vp9_avg_4x4_neon(const uint8_t *s, int p) {
uint16x8_t v_sum;
uint32x2_t v_s0 = vdup_n_u32(0);
uint32x2_t v_s1 = vdup_n_u32(0);
v_s0 = vld1_lane_u32((const uint32_t *)s, v_s0, 0);
v_s0 = vld1_lane_u32((const uint32_t *)(s + p), v_s0, 1);
v_s1 = vld1_lane_u32((const uint32_t *)(s + 2 * p), v_s1, 0);
v_s1 = vld1_lane_u32((const uint32_t *)(s + 3 * p), v_s1, 1);
v_sum = vaddl_u8(vreinterpret_u8_u32(v_s0), vreinterpret_u8_u32(v_s1));
return (horizontal_add_u16x8(v_sum) + 8) >> 4;
}
unsigned int vp9_avg_8x8_neon(const uint8_t *s, int p) {
uint8x8_t v_s0 = vld1_u8(s);
const uint8x8_t v_s1 = vld1_u8(s + p);
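For reference, a scalar equivalent of the new vp9_avg_4x4_neon above: sum the sixteen pixels and divide by 16 with rounding. A sketch (the in-tree vp9_avg_4x4_c should behave the same):

#include <stdint.h>

/* Rounded mean of a 4x4 pixel block; p is the row stride in bytes. */
static unsigned int avg_4x4(const uint8_t *s, int p) {
  unsigned int sum = 0;
  int r, c;
  for (r = 0; r < 4; ++r)
    for (c = 0; c < 4; ++c) sum += s[r * p + c];
  return (sum + 8) >> 4;  /* +8 is half of 16, so the divide rounds */
}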


@@ -194,7 +194,8 @@ static VP9_DENOISER_DECISION perform_motion_compensation(VP9_DENOISER *denoiser,
int mi_col,
PICK_MODE_CONTEXT *ctx,
int motion_magnitude,
int is_skin) {
int is_skin,
int *zeromv_filter) {
int mv_col, mv_row;
int sse_diff = ctx->zeromv_sse - ctx->newmv_sse;
MV_REFERENCE_FRAME frame;
@@ -237,6 +238,7 @@ static VP9_DENOISER_DECISION perform_motion_compensation(VP9_DENOISER *denoiser,
mbmi->mv[0].as_int = 0;
ctx->best_sse_inter_mode = ZEROMV;
ctx->best_sse_mv.as_int = 0;
*zeromv_filter = 1;
}
if (ctx->newmv_sse > sse_thresh(bs, increase_denoising)) {
@@ -320,6 +322,7 @@ void vp9_denoiser_denoise(VP9_DENOISER *denoiser, MACROBLOCK *mb,
VP9_DENOISER_DECISION *denoiser_decision) {
int mv_col, mv_row;
int motion_magnitude = 0;
int zeromv_filter = 0;
VP9_DENOISER_DECISION decision = COPY_BLOCK;
YV12_BUFFER_CONFIG avg = denoiser->running_avg_y[INTRA_FRAME];
YV12_BUFFER_CONFIG mc_avg = denoiser->mc_running_avg_y;
@@ -360,7 +363,8 @@ void vp9_denoiser_denoise(VP9_DENOISER *denoiser, MACROBLOCK *mb,
denoiser->increase_denoising,
mi_row, mi_col, ctx,
motion_magnitude,
is_skin);
is_skin,
&zeromv_filter);
if (decision == FILTER_BLOCK) {
decision = vp9_denoiser_filter(src.buf, src.stride,
@@ -382,6 +386,8 @@ void vp9_denoiser_denoise(VP9_DENOISER *denoiser, MACROBLOCK *mb,
num_4x4_blocks_high_lookup[bs] << 2);
}
*denoiser_decision = decision;
if (decision == FILTER_BLOCK && zeromv_filter == 1)
*denoiser_decision = FILTER_ZEROMV_BLOCK;
}
static void copy_frame(YV12_BUFFER_CONFIG * const dest,
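The new zeromv_filter out-parameter carries one extra bit from perform_motion_compensation back to vp9_denoiser_denoise, which widens the returned decision from a FILTER/COPY pair into a tri-state. The widening in isolation, as a hypothetical sketch:

/* Hypothetical condensation: report whether the block was filtered and, if
 * so, whether the denoiser also forced the zero motion vector. */
typedef enum { COPY, FILTER, FILTER_ZEROMV } decision_t;

static decision_t final_decision(int filtered, int zeromv_forced) {
  if (!filtered) return COPY;
  return zeromv_forced ? FILTER_ZEROMV : FILTER;
}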


@@ -23,7 +23,8 @@ extern "C" {
typedef enum vp9_denoiser_decision {
COPY_BLOCK,
FILTER_BLOCK
FILTER_BLOCK,
FILTER_ZEROMV_BLOCK
} VP9_DENOISER_DECISION;
typedef enum vp9_denoiser_level {


@@ -1696,11 +1696,11 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
VP9_DENOISER_DECISION decision = COPY_BLOCK;
vp9_denoiser_denoise(&cpi->denoiser, x, mi_row, mi_col,
VPXMAX(BLOCK_8X8, bsize), ctx, &decision);
// If INTRA mode was selected, re-evaluate ZEROMV on denoised result.
// Only do this under noise conditions, and if rdcost of ZEROMV on
// original source is not significantly higher than rdcost of INTRA MODE.
if (best_ref_frame == INTRA_FRAME &&
decision == FILTER_BLOCK &&
// If INTRA or GOLDEN reference was selected, re-evaluate ZEROMV on denoised
// result. Only do this under noise conditions, and if rdcost of ZEROMV on
// original source is not significantly higher than rdcost of best mode.
if (((best_ref_frame == INTRA_FRAME && decision >= FILTER_BLOCK) ||
(best_ref_frame == GOLDEN_FRAME && decision == FILTER_ZEROMV_BLOCK)) &&
cpi->noise_estimate.enabled &&
cpi->noise_estimate.level > kLow &&
zero_last_cost_orig < (best_rdc.rdcost << 3)) {
@@ -1721,13 +1721,21 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
this_rdc.dist = dist;
this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, rate, dist);
// Switch to ZEROMV if the rdcost for ZEROMV on denoised source
// is lower than INTRA (on original source).
// is lower than best_ref mode (on original source).
if (this_rdc.rdcost > best_rdc.rdcost) {
this_rdc = best_rdc;
mbmi->mode = best_mode;
mbmi->ref_frame[0] = best_ref_frame;
mbmi->mv[0].as_int = INVALID_MV;
mbmi->interp_filter = best_pred_filter;
if (best_ref_frame == INTRA_FRAME)
mbmi->mv[0].as_int = INVALID_MV;
else if (best_ref_frame == GOLDEN_FRAME) {
mbmi->mv[0].as_int = frame_mv[best_mode][best_ref_frame].as_int;
if (reuse_inter_pred) {
xd->plane[0].pre[0] = yv12_mb[GOLDEN_FRAME][0];
vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
}
}
mbmi->tx_size = best_tx_size;
x->skip_txfm[0] = best_mode_skip_txfm;
} else {
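A condensed reading of the new gate, as a hypothetical helper (the flags stand in for the checks in the hunk): re-evaluate ZEROMV on the denoised block when the best mode was INTRA and the block was filtered at all, or GOLDEN and the denoiser itself forced ZEROMV, and only while noise is above kLow and ZEROMV on the original source cost less than 8x the best RD cost:

#include <stdint.h>

static int should_recheck_zeromv(int best_is_intra, int best_is_golden,
                                 int filtered, int filtered_zeromv,
                                 int noise_above_low,
                                 int64_t zeromv_cost_orig, int64_t best_cost) {
  const int ref_ok = (best_is_intra && filtered) ||
                     (best_is_golden && filtered_zeromv);
  return ref_ok && noise_above_low && zeromv_cost_orig < (best_cost << 3);
}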


@@ -1194,6 +1194,33 @@ void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
}
}
void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest,
int stride) {
tran_low_t out[32 * 32] = {0};
tran_low_t *outptr = out;
int i, j;
tran_low_t temp_in[32], temp_out[32];
// Rows
// only upper-left 16x16 has non-zero coeff
for (i = 0; i < 16; ++i) {
idct32_c(input, outptr);
input += 32;
outptr += 32;
}
// Columns
for (i = 0; i < 32; ++i) {
for (j = 0; j < 32; ++j)
temp_in[j] = out[j * 32 + i];
idct32_c(temp_in, temp_out);
for (j = 0; j < 32; ++j) {
dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
ROUND_POWER_OF_TWO(temp_out[j], 6));
}
}
}
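Only the row pass differs from the 1024-coefficient version: with eob <= 135 every nonzero coefficient sits in the first 16 rows, so idct32_c runs on 16 rows and rows 16-31 of out stay zero from the initializer. The rounding helper used in the column pass is libvpx's usual round-to-nearest shift:

/* From vpx_dsp/vpx_dsp_common.h: round-to-nearest right shift by n. */
#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n)-1))) >> (n))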
void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest,
int stride) {
tran_low_t out[32 * 32] = {0};


@@ -91,7 +91,7 @@ add_proto qw/void vpx_d153_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, co
specialize qw/vpx_d153_predictor_4x4/, "$ssse3_x86inc";
add_proto qw/void vpx_v_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/vpx_v_predictor_4x4 neon msa/, "$sse_x86inc";
specialize qw/vpx_v_predictor_4x4 neon msa/, "$sse2_x86inc";
add_proto qw/void vpx_ve_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/vpx_ve_predictor_4x4/;
@@ -100,16 +100,16 @@ add_proto qw/void vpx_tm_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, cons
specialize qw/vpx_tm_predictor_4x4 neon dspr2 msa/, "$sse2_x86inc";
add_proto qw/void vpx_dc_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/vpx_dc_predictor_4x4 dspr2 msa neon/, "$sse_x86inc";
specialize qw/vpx_dc_predictor_4x4 dspr2 msa neon/, "$sse2_x86inc";
add_proto qw/void vpx_dc_top_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/vpx_dc_top_predictor_4x4 msa neon/, "$sse_x86inc";
specialize qw/vpx_dc_top_predictor_4x4 msa neon/, "$sse2_x86inc";
add_proto qw/void vpx_dc_left_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/vpx_dc_left_predictor_4x4 msa neon/, "$sse_x86inc";
specialize qw/vpx_dc_left_predictor_4x4 msa neon/, "$sse2_x86inc";
add_proto qw/void vpx_dc_128_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/vpx_dc_128_predictor_4x4 msa neon/, "$sse_x86inc";
specialize qw/vpx_dc_128_predictor_4x4 msa neon/, "$sse2_x86inc";
add_proto qw/void vpx_d207_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/vpx_d207_predictor_8x8/, "$ssse3_x86inc";
@@ -754,6 +754,9 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct32x32_1024_add/;
add_proto qw/void vpx_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct32x32_135_add/;
add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct32x32_34_add/;
@@ -802,6 +805,10 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct32x32_1024_add sse2/;
add_proto qw/void vpx_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct32x32_135_add sse2/;
$vpx_idct32x32_135_add_sse2=vpx_idct32x32_1024_add_sse2;
add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct32x32_34_add sse2/;
@@ -853,6 +860,9 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct32x32_1024_add/;
add_proto qw/void vpx_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct32x32_135_add/;
add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct32x32_34_add/;
@@ -892,6 +902,14 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct32x32_1024_add sse2 neon dspr2 msa/, "$ssse3_x86_64_x86inc";
add_proto qw/void vpx_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct32x32_135_add sse2 neon dspr2 msa/, "$ssse3_x86_64_x86inc";
# Need to add 135 eob idct32x32 implementations.
$vpx_idct32x32_135_add_sse2=vpx_idct32x32_1024_add_sse2;
$vpx_idct32x32_135_add_neon=vpx_idct32x32_1024_add_neon;
$vpx_idct32x32_135_add_dspr2=vpx_idct32x32_1024_add_dspr2;
$vpx_idct32x32_135_add_msa=vpx_idct32x32_1024_add_msa;
add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct32x32_34_add sse2 neon_asm dspr2 msa/, "$ssse3_x86_64_x86inc";
# Need to add 34 eob idct32x32 neon implementation.
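The $vpx_idct32x32_135_add_<isa>=vpx_idct32x32_1024_add_<isa> assignments alias the not-yet-written 135-eob kernels to the full 1024-coefficient ones, which is safe since the full transform handles a superset of the coefficients. In effect the generated RTCD setup behaves like this C sketch (hypothetical names and stubs, not the generated code):

#include <stdint.h>

typedef int32_t tran_low_t;  /* assumption: int16_t in non-highbitdepth builds */

/* Stubs standing in for the real kernels. */
static void idct32x32_135_add_c(const tran_low_t *in, uint8_t *dst, int stride) {
  (void)in; (void)dst; (void)stride;
}
static void idct32x32_1024_add_sse2(const tran_low_t *in, uint8_t *dst, int stride) {
  (void)in; (void)dst; (void)stride;
}

/* The RTCD pointer for the 135-eob transform, wired up at init time. */
static void (*idct32x32_135_add)(const tran_low_t *, uint8_t *, int);

static void setup_rtcd(int has_sse2) {
  idct32x32_135_add = idct32x32_135_add_c;
  /* No dedicated SSE2 135-eob kernel yet: substitute the full 1024 one. */
  if (has_sse2) idct32x32_135_add = idct32x32_1024_add_sse2;
}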


@@ -23,17 +23,18 @@ pw2_32: times 8 dw 16
SECTION .text
INIT_MMX sse
cglobal dc_predictor_4x4, 4, 5, 2, dst, stride, above, left, goffset
INIT_XMM sse2
cglobal dc_predictor_4x4, 4, 5, 3, dst, stride, above, left, goffset
GET_GOT goffsetq
pxor m1, m1
movd m2, [leftq]
movd m0, [aboveq]
punpckldq m0, [leftq]
pxor m1, m1
punpckldq m0, m2
psadbw m0, m1
paddw m0, [GLOBAL(pw_4)]
psraw m0, 3
pshufw m0, m0, 0x0
pshuflw m0, m0, 0x0
packuswb m0, m0
movd [dstq ], m0
movd [dstq+strideq], m0
@@ -44,16 +45,17 @@ cglobal dc_predictor_4x4, 4, 5, 2, dst, stride, above, left, goffset
RESTORE_GOT
RET
INIT_MMX sse
cglobal dc_left_predictor_4x4, 4, 5, 2, dst, stride, above, left, goffset
INIT_XMM sse2
cglobal dc_left_predictor_4x4, 2, 5, 2, dst, stride, above, left, goffset
GET_GOT goffsetq
movifnidn leftq, leftmp
pxor m1, m1
movd m0, [leftq]
psadbw m0, m1
paddw m0, [GLOBAL(pw2_4)]
psraw m0, 2
pshufw m0, m0, 0x0
pshuflw m0, m0, 0x0
packuswb m0, m0
movd [dstq ], m0
movd [dstq+strideq], m0
@@ -64,8 +66,8 @@ cglobal dc_left_predictor_4x4, 4, 5, 2, dst, stride, above, left, goffset
RESTORE_GOT
RET
INIT_MMX sse
cglobal dc_top_predictor_4x4, 4, 5, 2, dst, stride, above, left, goffset
INIT_XMM sse2
cglobal dc_top_predictor_4x4, 3, 5, 2, dst, stride, above, left, goffset
GET_GOT goffsetq
pxor m1, m1
@@ -73,7 +75,7 @@ cglobal dc_top_predictor_4x4, 4, 5, 2, dst, stride, above, left, goffset
psadbw m0, m1
paddw m0, [GLOBAL(pw2_4)]
psraw m0, 2
pshufw m0, m0, 0x0
pshuflw m0, m0, 0x0
packuswb m0, m0
movd [dstq ], m0
movd [dstq+strideq], m0
@@ -166,8 +168,8 @@ cglobal dc_left_predictor_8x8, 2, 5, 2, dst, stride, above, left, goffset
RESTORE_GOT
RET
INIT_MMX sse
cglobal dc_128_predictor_4x4, 4, 5, 3, dst, stride, above, left, goffset
INIT_XMM sse2
cglobal dc_128_predictor_4x4, 2, 5, 1, dst, stride, above, left, goffset
GET_GOT goffsetq
DEFINE_ARGS dst, stride, stride3
@@ -453,7 +455,7 @@ cglobal dc_128_predictor_32x32, 4, 5, 3, dst, stride, above, left, goffset
RESTORE_GOT
RET
INIT_MMX sse
INIT_XMM sse2
cglobal v_predictor_4x4, 3, 3, 1, dst, stride, above
movd m0, [aboveq]
movd [dstq ], m0
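These kernels move from MMX registers (INIT_MMX sse with pshufw) to XMM registers (INIT_XMM sse2 with pshuflw); psadbw against a zero register still does the byte summing. In scalar terms, dc_predictor_4x4 computes the following (a sketch consistent with the paddw pw_4 / psraw 3 sequence above):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Scalar model: DC prediction fills the 4x4 block with the rounded mean of
 * the 4 above and 4 left neighbor pixels: (sum + 4) >> 3. */
static void dc_predictor_4x4_model(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  int i, sum = 0;
  uint8_t dc;
  for (i = 0; i < 4; ++i) sum += above[i] + left[i];
  dc = (uint8_t)((sum + 4) >> 3);
  for (i = 0; i < 4; ++i) memset(dst + i * stride, dc, 4);
}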


@@ -18,9 +18,13 @@ SECTION_RODATA
pw_11585x2: times 8 dw 23170
pw_m2404x2: times 8 dw -2404*2
pw_m4756x2: times 8 dw -4756*2
pw_m5520x2: times 8 dw -5520*2
pw_m2404x2: times 8 dw -2404*2
pw_m4756x2: times 8 dw -4756*2
pw_m5520x2: times 8 dw -5520*2
pw_m8423x2: times 8 dw -8423*2
pw_m9102x2: times 8 dw -9102*2
pw_m10394x2: times 8 dw -10394*2
pw_m11003x2: times 8 dw -11003*2
pw_16364x2: times 8 dw 16364*2
pw_16305x2: times 8 dw 16305*2
@@ -29,6 +33,18 @@ pw_16069x2: times 8 dw 16069*2
pw_15893x2: times 8 dw 15893*2
pw_15679x2: times 8 dw 15679*2
pw_15426x2: times 8 dw 15426*2
pw_15137x2: times 8 dw 15137*2
pw_14811x2: times 8 dw 14811*2
pw_14449x2: times 8 dw 14449*2
pw_14053x2: times 8 dw 14053*2
pw_13623x2: times 8 dw 13623*2
pw_13160x2: times 8 dw 13160*2
pw_12665x2: times 8 dw 12665*2
pw_12140x2: times 8 dw 12140*2
pw__9760x2: times 8 dw 9760*2
pw__7723x2: times 8 dw 7723*2
pw__7005x2: times 8 dw 7005*2
pw__6270x2: times 8 dw 6270*2
pw__3981x2: times 8 dw 3981*2
pw__3196x2: times 8 dw 3196*2
pw__1606x2: times 8 dw 1606*2
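These tables hold twice the Q14 cosine constants because pmulhrsw computes round(a*b/32768) per 16-bit lane; with b = 2*cospi the result is round(a*cospi/16384), i.e. a Q14 multiply with correct rounding. A C model of the instruction:

#include <stdint.h>

/* Scalar model of SSSE3 pmulhrsw: ((a*b >> 14) + 1) >> 1 per lane. */
static int16_t pmulhrsw_model(int16_t a, int16_t b) {
  return (int16_t)((((int32_t)a * b) + (1 << 14)) >> 15);
}
/* Example: cospi_16_64 = 11585 ~ 16384*cos(pi/4); storing 23170 (pw_11585x2)
 * makes pmulhrsw_model(x, 23170) == round(x * 11585 / 16384). */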
@@ -793,6 +809,450 @@ idct32x32_34_transpose_2:
RET
%macro IDCT32X32_135 4
; BLOCK A STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
mova m1, [rsp + transposed_in + 16 * 1]
mova m11, m1
pmulhrsw m1, [pw___804x2] ; stp1_16
pmulhrsw m11, [pw_16364x2] ; stp2_31
mova m7, [rsp + transposed_in + 16 * 7]
mova m12, m7
pmulhrsw m7, [pw_15426x2] ; stp1_28
pmulhrsw m12, [pw_m5520x2] ; stp2_19
mova m3, [rsp + transposed_in + 16 * 9]
mova m4, m3
pmulhrsw m3, [pw__7005x2] ; stp1_18
pmulhrsw m4, [pw_14811x2] ; stp2_29
mova m0, [rsp + transposed_in + 16 * 15]
mova m2, m0
pmulhrsw m0, [pw_12140x2] ; stp1_30
pmulhrsw m2, [pw_m11003x2] ; stp2_17
; BLOCK A STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
SUM_SUB 1, 2, 9 ; stp2_16, stp2_17
SUM_SUB 12, 3, 9 ; stp2_19, stp2_18
SUM_SUB 7, 4, 9 ; stp2_28, stp2_29
SUM_SUB 11, 0, 9 ; stp2_31, stp2_30
; BLOCK A STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
BUTTERFLY_4X 0, 2, 3196, 16069, m8, 9, 10 ; stp1_17, stp1_30
BUTTERFLY_4Xmm 4, 3, 3196, 16069, m8, 9, 10 ; stp1_29, stp1_18
; BLOCK A STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
SUM_SUB 1, 12, 9 ; stp2_16, stp2_19
SUM_SUB 0, 3, 9 ; stp2_17, stp2_18
SUM_SUB 11, 7, 9 ; stp2_31, stp2_28
SUM_SUB 2, 4, 9 ; stp2_30, stp2_29
; BLOCK A STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
BUTTERFLY_4X 4, 3, 6270, 15137, m8, 9, 10 ; stp1_18, stp1_29
BUTTERFLY_4X 7, 12, 6270, 15137, m8, 9, 10 ; stp1_19, stp1_28
mova [stp + %3 + idx16], m1
mova [stp + %3 + idx17], m0
mova [stp + %3 + idx18], m4
mova [stp + %3 + idx19], m7
mova [stp + %4 + idx28], m12
mova [stp + %4 + idx29], m3
mova [stp + %4 + idx30], m2
mova [stp + %4 + idx31], m11
; BLOCK B STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
mova m2, [rsp + transposed_in + 16 * 3]
mova m3, m2
pmulhrsw m3, [pw_m2404x2] ; stp1_23
pmulhrsw m2, [pw_16207x2] ; stp2_24
mova m5, [rsp + transposed_in + 16 * 5]
mova m6, m5
pmulhrsw m5, [pw__3981x2] ; stp1_20
pmulhrsw m6, [pw_15893x2] ; stp2_27
mova m14, [rsp + transposed_in + 16 * 11]
mova m13, m14
pmulhrsw m13, [pw_m8423x2] ; stp1_21
pmulhrsw m14, [pw_14053x2] ; stp2_26
mova m0, [rsp + transposed_in + 16 * 13]
mova m1, m0
pmulhrsw m0, [pw__9760x2] ; stp1_22
pmulhrsw m1, [pw_13160x2] ; stp2_25
; BLOCK B STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
SUM_SUB 5, 13, 9 ; stp2_20, stp2_21
SUM_SUB 3, 0, 9 ; stp2_23, stp2_22
SUM_SUB 2, 1, 9 ; stp2_24, stp2_25
SUM_SUB 6, 14, 9 ; stp2_27, stp2_26
; BLOCK B STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
BUTTERFLY_4X 14, 13, 13623, 9102, m8, 9, 10 ; stp1_21, stp1_26
BUTTERFLY_4Xmm 1, 0, 13623, 9102, m8, 9, 10 ; stp1_25, stp1_22
; BLOCK B STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
SUM_SUB 3, 5, 9 ; stp2_23, stp2_20
SUM_SUB 0, 14, 9 ; stp2_22, stp2_21
SUM_SUB 2, 6, 9 ; stp2_24, stp2_27
SUM_SUB 1, 13, 9 ; stp2_25, stp2_26
; BLOCK B STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
BUTTERFLY_4Xmm 6, 5, 6270, 15137, m8, 9, 10 ; stp1_27, stp1_20
BUTTERFLY_4Xmm 13, 14, 6270, 15137, m8, 9, 10 ; stp1_26, stp1_21
; BLOCK B STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
mova m4, [stp + %3 + idx16]
mova m7, [stp + %3 + idx17]
mova m11, [stp + %3 + idx18]
mova m12, [stp + %3 + idx19]
SUM_SUB 4, 3, 9 ; stp2_16, stp2_23
SUM_SUB 7, 0, 9 ; stp2_17, stp2_22
SUM_SUB 11, 14, 9 ; stp2_18, stp2_21
SUM_SUB 12, 5, 9 ; stp2_19, stp2_20
mova [stp + %3 + idx16], m4
mova [stp + %3 + idx17], m7
mova [stp + %3 + idx18], m11
mova [stp + %3 + idx19], m12
mova m4, [stp + %4 + idx28]
mova m7, [stp + %4 + idx29]
mova m11, [stp + %4 + idx30]
mova m12, [stp + %4 + idx31]
SUM_SUB 4, 6, 9 ; stp2_28, stp2_27
SUM_SUB 7, 13, 9 ; stp2_29, stp2_26
SUM_SUB 11, 1, 9 ; stp2_30, stp2_25
SUM_SUB 12, 2, 9 ; stp2_31, stp2_24
mova [stp + %4 + idx28], m4
mova [stp + %4 + idx29], m7
mova [stp + %4 + idx30], m11
mova [stp + %4 + idx31], m12
; BLOCK B STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
%if 0 ; overflow occurs in SUM_SUB when using test streams
mova m10, [pw_11585x2]
SUM_SUB 6, 5, 9
pmulhrsw m6, m10 ; stp1_27
pmulhrsw m5, m10 ; stp1_20
SUM_SUB 13, 14, 9
pmulhrsw m13, m10 ; stp1_26
pmulhrsw m14, m10 ; stp1_21
SUM_SUB 1, 0, 9
pmulhrsw m1, m10 ; stp1_25
pmulhrsw m0, m10 ; stp1_22
SUM_SUB 2, 3, 9
pmulhrsw m2, m10 ; stp1_25
pmulhrsw m3, m10 ; stp1_22
%else
BUTTERFLY_4X 6, 5, 11585, 11585, m8, 9, 10 ; stp1_20, stp1_27
SWAP 6, 5
BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_21, stp1_26
SWAP 13, 14
BUTTERFLY_4X 1, 0, 11585, 11585, m8, 9, 10 ; stp1_22, stp1_25
SWAP 1, 0
BUTTERFLY_4X 2, 3, 11585, 11585, m8, 9, 10 ; stp1_23, stp1_24
SWAP 2, 3
%endif
mova [stp + %3 + idx20], m5
mova [stp + %3 + idx21], m14
mova [stp + %3 + idx22], m0
mova [stp + %3 + idx23], m3
mova [stp + %4 + idx24], m2
mova [stp + %4 + idx25], m1
mova [stp + %4 + idx26], m13
mova [stp + %4 + idx27], m6
; BLOCK C STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
;
; BLOCK C STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
mova m0, [rsp + transposed_in + 16 * 2]
mova m1, m0
pmulhrsw m0, [pw__1606x2] ; stp1_8
pmulhrsw m1, [pw_16305x2] ; stp2_15
mova m6, [rsp + transposed_in + 16 * 6]
mova m7, m6
pmulhrsw m7, [pw_m4756x2] ; stp2_11
pmulhrsw m6, [pw_15679x2] ; stp1_12
mova m4, [rsp + transposed_in + 16 * 10]
mova m5, m4
pmulhrsw m4, [pw__7723x2] ; stp1_10
pmulhrsw m5, [pw_14449x2] ; stp2_13
mova m2, [rsp + transposed_in + 16 * 14]
mova m3, m2
pmulhrsw m3, [pw_m10394x2] ; stp1_9
pmulhrsw m2, [pw_12665x2] ; stp2_14
; BLOCK C STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
SUM_SUB 0, 3, 9 ; stp1_8, stp1_9
SUM_SUB 7, 4, 9 ; stp1_11, stp1_10
SUM_SUB 6, 5, 9 ; stp1_12, stp1_13
SUM_SUB 1, 2, 9 ; stp1_15, stp1_14
; BLOCK C STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
BUTTERFLY_4X 2, 3, 6270, 15137, m8, 9, 10 ; stp1_9, stp1_14
BUTTERFLY_4Xmm 5, 4, 6270, 15137, m8, 9, 10 ; stp1_13, stp1_10
; BLOCK C STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
SUM_SUB 0, 7, 9 ; stp1_8, stp1_11
SUM_SUB 2, 4, 9 ; stp1_9, stp1_10
SUM_SUB 1, 6, 9 ; stp1_15, stp1_12
SUM_SUB 3, 5, 9 ; stp1_14, stp1_13
; BLOCK C STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
%if 0 ; overflow occurs in SUM_SUB when using test streams
mova m10, [pw_11585x2]
SUM_SUB 5, 4, 9
pmulhrsw m5, m10 ; stp1_13
pmulhrsw m4, m10 ; stp1_10
SUM_SUB 6, 7, 9
pmulhrsw m6, m10 ; stp1_12
pmulhrsw m7, m10 ; stp1_11
%else
BUTTERFLY_4X 5, 4, 11585, 11585, m8, 9, 10 ; stp1_10, stp1_13
SWAP 5, 4
BUTTERFLY_4X 6, 7, 11585, 11585, m8, 9, 10 ; stp1_11, stp1_12
SWAP 6, 7
%endif
; BLOCK C STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
mova [stp + %2 + idx8], m0
mova [stp + %2 + idx9], m2
mova [stp + %2 + idx10], m4
mova [stp + %2 + idx11], m7
mova [stp + %2 + idx12], m6
mova [stp + %2 + idx13], m5
mova [stp + %2 + idx14], m3
mova [stp + %2 + idx15], m1
; BLOCK D STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
;
; BLOCK D STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
;
; BLOCK D STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
mova m11, [rsp + transposed_in + 16 * 4]
mova m12, m11
pmulhrsw m11, [pw__3196x2] ; stp1_4
pmulhrsw m12, [pw_16069x2] ; stp1_7
mova m13, [rsp + transposed_in + 16 * 12]
mova m14, m13
pmulhrsw m13, [pw_13623x2] ; stp1_6
pmulhrsw m14, [pw_m9102x2] ; stp1_5
; BLOCK D STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
mova m0, [rsp + transposed_in + 16 * 0]
mova m2, [rsp + transposed_in + 16 * 8]
pmulhrsw m0, [pw_11585x2] ; stp1_1
mova m3, m2
pmulhrsw m2, [pw__6270x2] ; stp1_2
pmulhrsw m3, [pw_15137x2] ; stp1_3
SUM_SUB 11, 14, 9 ; stp1_4, stp1_5
SUM_SUB 12, 13, 9 ; stp1_7, stp1_6
; BLOCK D STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
%if 0 ; overflow occurs in SUM_SUB when using test streams
mova m10, [pw_11585x2]
SUM_SUB 13, 14, 9
pmulhrsw m13, m10 ; stp1_6
pmulhrsw m14, m10 ; stp1_5
%else
BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_5, stp1_6
SWAP 13, 14
%endif
mova m1, m0 ; stp1_0 = stp1_1
SUM_SUB 0, 3, 9 ; stp1_0, stp1_3
SUM_SUB 1, 2, 9 ; stp1_1, stp1_2
; BLOCK D STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
SUM_SUB 0, 12, 9 ; stp1_0, stp1_7
SUM_SUB 1, 13, 9 ; stp1_1, stp1_6
SUM_SUB 2, 14, 9 ; stp1_2, stp1_5
SUM_SUB 3, 11, 9 ; stp1_3, stp1_4
; BLOCK D STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
mova m4, [stp + %2 + idx12]
mova m5, [stp + %2 + idx13]
mova m6, [stp + %2 + idx14]
mova m7, [stp + %2 + idx15]
SUM_SUB 0, 7, 9 ; stp1_0, stp1_15
SUM_SUB 1, 6, 9 ; stp1_1, stp1_14
SUM_SUB 2, 5, 9 ; stp1_2, stp1_13
SUM_SUB 3, 4, 9 ; stp1_3, stp1_12
; 0-3, 28-31 final stage
mova m10, [stp + %4 + idx31]
mova m15, [stp + %4 + idx30]
SUM_SUB 0, 10, 9 ; stp1_0, stp1_31
SUM_SUB 1, 15, 9 ; stp1_1, stp1_30
mova [stp + %1 + idx0], m0
mova [stp + %1 + idx1], m1
mova [stp + %4 + idx31], m10
mova [stp + %4 + idx30], m15
mova m0, [stp + %4 + idx29]
mova m1, [stp + %4 + idx28]
SUM_SUB 2, 0, 9 ; stp1_2, stp1_29
SUM_SUB 3, 1, 9 ; stp1_3, stp1_28
mova [stp + %1 + idx2], m2
mova [stp + %1 + idx3], m3
mova [stp + %4 + idx29], m0
mova [stp + %4 + idx28], m1
; 12-15, 16-19 final stage
mova m0, [stp + %3 + idx16]
mova m1, [stp + %3 + idx17]
mova m2, [stp + %3 + idx18]
mova m3, [stp + %3 + idx19]
SUM_SUB 7, 0, 9 ; stp1_15, stp1_16
SUM_SUB 6, 1, 9 ; stp1_14, stp1_17
SUM_SUB 5, 2, 9 ; stp1_13, stp1_18
SUM_SUB 4, 3, 9 ; stp1_12, stp1_19
mova [stp + %2 + idx12], m4
mova [stp + %2 + idx13], m5
mova [stp + %2 + idx14], m6
mova [stp + %2 + idx15], m7
mova [stp + %3 + idx16], m0
mova [stp + %3 + idx17], m1
mova [stp + %3 + idx18], m2
mova [stp + %3 + idx19], m3
mova m4, [stp + %2 + idx8]
mova m5, [stp + %2 + idx9]
mova m6, [stp + %2 + idx10]
mova m7, [stp + %2 + idx11]
SUM_SUB 11, 7, 9 ; stp1_4, stp1_11
SUM_SUB 14, 6, 9 ; stp1_5, stp1_10
SUM_SUB 13, 5, 9 ; stp1_6, stp1_9
SUM_SUB 12, 4, 9 ; stp1_7, stp1_8
; 4-7, 24-27 final stage
mova m3, [stp + %4 + idx24]
mova m2, [stp + %4 + idx25]
mova m1, [stp + %4 + idx26]
mova m0, [stp + %4 + idx27]
SUM_SUB 12, 3, 9 ; stp1_7, stp1_24
SUM_SUB 13, 2, 9 ; stp1_6, stp1_25
SUM_SUB 14, 1, 9 ; stp1_5, stp1_26
SUM_SUB 11, 0, 9 ; stp1_4, stp1_27
mova [stp + %4 + idx24], m3
mova [stp + %4 + idx25], m2
mova [stp + %4 + idx26], m1
mova [stp + %4 + idx27], m0
mova [stp + %1 + idx4], m11
mova [stp + %1 + idx5], m14
mova [stp + %1 + idx6], m13
mova [stp + %1 + idx7], m12
; 8-11, 20-23 final stage
mova m0, [stp + %3 + idx20]
mova m1, [stp + %3 + idx21]
mova m2, [stp + %3 + idx22]
mova m3, [stp + %3 + idx23]
SUM_SUB 7, 0, 9 ; stp1_11, stp_20
SUM_SUB 6, 1, 9 ; stp1_10, stp_21
SUM_SUB 5, 2, 9 ; stp1_9, stp_22
SUM_SUB 4, 3, 9 ; stp1_8, stp_23
mova [stp + %2 + idx8], m4
mova [stp + %2 + idx9], m5
mova [stp + %2 + idx10], m6
mova [stp + %2 + idx11], m7
mova [stp + %3 + idx20], m0
mova [stp + %3 + idx21], m1
mova [stp + %3 + idx22], m2
mova [stp + %3 + idx23], m3
%endmacro
INIT_XMM ssse3
cglobal idct32x32_135_add, 3, 11, 16, i32x32_size, input, output, stride
mova m8, [pd_8192]
mov r6, 2
lea stp, [rsp + pass_one_start]
idct32x32_135:
mov r3, inputq
lea r4, [rsp + transposed_in]
mov r7, 2
idct32x32_135_transpose:
mova m0, [r3 + 0]
mova m1, [r3 + 16 * 4]
mova m2, [r3 + 16 * 8]
mova m3, [r3 + 16 * 12]
mova m4, [r3 + 16 * 16]
mova m5, [r3 + 16 * 20]
mova m6, [r3 + 16 * 24]
mova m7, [r3 + 16 * 28]
TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
mova [r4 + 0], m0
mova [r4 + 16 * 1], m1
mova [r4 + 16 * 2], m2
mova [r4 + 16 * 3], m3
mova [r4 + 16 * 4], m4
mova [r4 + 16 * 5], m5
mova [r4 + 16 * 6], m6
mova [r4 + 16 * 7], m7
add r3, 16
add r4, 16 * 8
dec r7
jne idct32x32_135_transpose
IDCT32X32_135 16*0, 16*32, 16*64, 16*96
lea stp, [stp + 16 * 8]
lea inputq, [inputq + 16 * 32]
dec r6
jnz idct32x32_135
mov r6, 4
lea stp, [rsp + pass_one_start]
lea r9, [rsp + pass_one_start]
idct32x32_135_2:
lea r4, [rsp + transposed_in]
mov r3, r9
mov r7, 2
idct32x32_135_transpose_2:
mova m0, [r3 + 0]
mova m1, [r3 + 16 * 1]
mova m2, [r3 + 16 * 2]
mova m3, [r3 + 16 * 3]
mova m4, [r3 + 16 * 4]
mova m5, [r3 + 16 * 5]
mova m6, [r3 + 16 * 6]
mova m7, [r3 + 16 * 7]
TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
mova [r4 + 0], m0
mova [r4 + 16 * 1], m1
mova [r4 + 16 * 2], m2
mova [r4 + 16 * 3], m3
mova [r4 + 16 * 4], m4
mova [r4 + 16 * 5], m5
mova [r4 + 16 * 6], m6
mova [r4 + 16 * 7], m7
add r3, 16 * 8
add r4, 16 * 8
dec r7
jne idct32x32_135_transpose_2
IDCT32X32_135 16*0, 16*8, 16*16, 16*24
lea stp, [stp + 16 * 32]
add r9, 16 * 32
dec r6
jnz idct32x32_135_2
RECON_AND_STORE pass_two_start
RET
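On the "%if 0 ; overflow occurs in SUM_SUB" blocks above: the disabled path forms a+b and a-b with SUM_SUB in 16 bits and then scales by pw_11585x2 with pmulhrsw, but the intermediate sum can leave int16 range on conforming streams, so the enabled path uses BUTTERFLY_4X, which multiplies by 11585 and rounds with pd_8192 in 32-bit lanes before narrowing. A C illustration of the difference (a sketch, not the tree's code):

#include <stdint.h>

/* Disabled path: the 16-bit sum can wrap before the multiply. */
static int16_t scale_cos16_16bit(int16_t a, int16_t b) {
  int16_t s = (int16_t)(a + b);  /* wraps (implementation-defined) past 32767 */
  return (int16_t)((((int32_t)s * 23170) + (1 << 14)) >> 15);
}

/* Enabled path: widen first, multiply and round in 32 bits. */
static int16_t scale_cos16_32bit(int16_t a, int16_t b) {
  int32_t s = (int32_t)a + b;    /* exact 17-bit sum */
  return (int16_t)((s * 11585 + (1 << 13)) >> 14);
}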
%macro IDCT32X32_1024 4
; BLOCK A STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
mova m1, [rsp + transposed_in + 16 * 1]