Merge branch 'master' into nextgenv2

2016-01-30 05:00:05 -08:00
parent af99a61697 49bf2e2ffe
commit 8dc6f3f5c2
25 changed files with 382 additions and 164 deletions
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -647,7 +647,7 @@ process_common_toolchain() {
      armv6*)
        tgt_isa=armv6
        ;;
-      armv7*-hardfloat*)
+      armv7*-hardfloat* | armv7*-gnueabihf | arm-*-gnueabihf)
        tgt_isa=armv7
        float_abi=hard
        ;;
@@ -877,7 +877,6 @@ process_common_toolchain() {
      case ${tgt_cc} in
        gcc)
          CROSS=${CROSS:-arm-none-linux-gnueabi-}
          link_with_cc=gcc
          setup_gnu_toolchain
          arch_int=${tgt_isa##armv}
@@ -1135,7 +1134,7 @@ EOF
          CC=${CC:-${CROSS}gcc}
          CXX=${CXX:-${CROSS}g++}
          LD=${LD:-${CROSS}gcc}
-          CROSS=${CROSS:-g}
+          CROSS=${CROSS-g}
          ;;
        os2)
          disable_feature pic
--- a/test/encode_api_test.cc
+++ b/test/encode_api_test.cc
@@ -0,0 +1,68 @@
 /*
 *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #include "third_party/googletest/src/include/gtest/gtest.h"
 #include "./vpx_config.h"
 #include "vpx/vp8cx.h"
 #include "vpx/vpx_encoder.h"
 namespace {
 // Element count of a statically sized array, converted to int so it can be
 // compared against a signed loop index.
 #define NELEMENTS(x) static_cast<int>(sizeof(x) / sizeof(x[0]))
 // Exercises the public encoder entry points with NULL / invalid arguments and
 // expects VPX_CODEC_INVALID_PARAM from each, then checks that a minimal
 // default-config init / encode / destroy sequence succeeds for every encoder
 // interface compiled into the build.
 TEST(EncodeAPI, InvalidParams) {
  // Encoder interfaces enabled by the build configuration.
  static const vpx_codec_iface_t *kCodecs[] = {
 #if CONFIG_VP8_ENCODER
    &vpx_codec_vp8_cx_algo,
 #endif
 #if CONFIG_VP9_ENCODER
    &vpx_codec_vp9_cx_algo,
 #endif
 #if CONFIG_VP10_ENCODER
    &vpx_codec_vp10_cx_algo,
 #endif
  };
  uint8_t buf[1] = {0};
  vpx_image_t img;
  vpx_codec_ctx_t enc;
  vpx_codec_enc_cfg_t cfg;
  // Wrap buf as an I420 image; the call is expected to hand back &img.
  EXPECT_EQ(&img, vpx_img_wrap(&img, VPX_IMG_FMT_I420, 1, 1, 1, buf));
  // Calls made without a codec interface and/or with NULL context or config
  // pointers must all be rejected.
  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, vpx_codec_enc_init(NULL, NULL, NULL, 0));
  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, vpx_codec_enc_init(&enc, NULL, NULL, 0));
  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, vpx_codec_encode(NULL, NULL, 0, 0, 0, 0));
  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, vpx_codec_encode(NULL, &img, 0, 0, 0, 0));
  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, vpx_codec_destroy(NULL));
  EXPECT_EQ(VPX_CODEC_INVALID_PARAM,
            vpx_codec_enc_config_default(NULL, NULL, 0));
  EXPECT_EQ(VPX_CODEC_INVALID_PARAM,
            vpx_codec_enc_config_default(NULL, &cfg, 0));
  // Even with a NULL context an error string must be returned.
  EXPECT_TRUE(vpx_codec_error(NULL) != NULL);
  for (int i = 0; i < NELEMENTS(kCodecs); ++i) {
    // Tag any failure in this iteration with the interface name.
    SCOPED_TRACE(vpx_codec_iface_name(kCodecs[i]));
    // A valid interface alone is not enough: NULL context or NULL config
    // must still be rejected.
    EXPECT_EQ(VPX_CODEC_INVALID_PARAM,
              vpx_codec_enc_init(NULL, kCodecs[i], NULL, 0));
    EXPECT_EQ(VPX_CODEC_INVALID_PARAM,
              vpx_codec_enc_init(&enc, kCodecs[i], NULL, 0));
    // A last argument of 1 is expected to be rejected; 0 must succeed.
    EXPECT_EQ(VPX_CODEC_INVALID_PARAM,
              vpx_codec_enc_config_default(kCodecs[i], &cfg, 1));
    EXPECT_EQ(VPX_CODEC_OK, vpx_codec_enc_config_default(kCodecs[i], &cfg, 0));
    // Happy path: init with the default config, one encode call with a NULL
    // image, then destroy.
    EXPECT_EQ(VPX_CODEC_OK, vpx_codec_enc_init(&enc, kCodecs[i], &cfg, 0));
    EXPECT_EQ(VPX_CODEC_OK, vpx_codec_encode(&enc, NULL, 0, 0, 0, 0));
    EXPECT_EQ(VPX_CODEC_OK, vpx_codec_destroy(&enc));
  }
 }
 }  // namespace
--- a/test/test-data.mk
+++ b/test/test-data.mk
@@ -562,6 +562,8 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x224.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x224.webm.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x226.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x226.webm.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-352x288.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-352x288.webm.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-05-resize.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-05-resize.ivf.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-06-bilinear.webm
--- a/test/test-data.sha1
+++ b/test/test-data.sha1
@@ -550,6 +550,8 @@ d17bc08eedfc60c4c23d576a6c964a21bf854d1f *vp90-2-03-size-226x202.webm
 83c6d8f2969b759e10e5c6542baca1265c874c29 *vp90-2-03-size-226x224.webm.md5
 fe0af2ee47b1e5f6a66db369e2d7e9d870b38dce *vp90-2-03-size-226x226.webm
 94ad19b8b699cea105e2ff18f0df2afd7242bcf7 *vp90-2-03-size-226x226.webm.md5
 52bc1dfd3a97b24d922eb8a31d07527891561f2a *vp90-2-03-size-352x288.webm
 3084d6d0a1eec22e85a394422fbc8faae58930a5 *vp90-2-03-size-352x288.webm.md5
 b6524e4084d15b5d0caaa3d3d1368db30cbee69c *vp90-2-03-deltaq.webm
 65f45ec9a55537aac76104818278e0978f94a678 *vp90-2-03-deltaq.webm.md5
 4dbb87494c7f565ffc266c98d17d0d8c7a5c5aba *vp90-2-05-resize.ivf
--- a/test/test.mk
+++ b/test/test.mk
@@ -20,6 +20,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += ivf_video_source.h
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += ../y4minput.h ../y4minput.c
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += aq_segment_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += datarate_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += encode_api_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += error_resilience_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += i420_video_source.h
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += resize_test.cc
--- a/test/test_vectors.cc
+++ b/test/test_vectors.cc
@@ -180,7 +180,8 @@ const char *const kVP9TestVectors[] = {
  "vp90-2-03-size-226x198.webm", "vp90-2-03-size-226x200.webm",
  "vp90-2-03-size-226x202.webm", "vp90-2-03-size-226x208.webm",
  "vp90-2-03-size-226x210.webm", "vp90-2-03-size-226x224.webm",
-  "vp90-2-03-size-226x226.webm", "vp90-2-03-deltaq.webm",
+  "vp90-2-03-size-226x226.webm", "vp90-2-03-size-352x288.webm",
  "vp90-2-03-deltaq.webm",
  "vp90-2-05-resize.ivf", "vp90-2-06-bilinear.webm",
  "vp90-2-07-frame_parallel.webm", "vp90-2-08-tile_1x2_frame_parallel.webm",
  "vp90-2-08-tile_1x2.webm", "vp90-2-08-tile_1x4_frame_parallel.webm",
--- a/vp10/encoder/firstpass.c
+++ b/vp10/encoder/firstpass.c
@@ -1629,7 +1629,6 @@ static void allocate_gf_group_bits(VP10_COMP *cpi, int64_t gf_group_bits,
  int mid_boost_bits = 0;
  int mid_frame_idx;
  unsigned char arf_buffer_indices[MAX_ACTIVE_ARFS];
  int alt_frame_index = frame_index;
  key_frame = cpi->common.frame_type == KEY_FRAME;
@@ -1642,15 +1641,13 @@ static void allocate_gf_group_bits(VP10_COMP *cpi, int64_t gf_group_bits,
      gf_group->update_type[0] = OVERLAY_UPDATE;
      gf_group->rf_level[0] = INTER_NORMAL;
      gf_group->bit_allocation[0] = 0;
      gf_group->arf_update_idx[0] = arf_buffer_indices[0];
      gf_group->arf_ref_idx[0] = arf_buffer_indices[0];
    } else {
      gf_group->update_type[0] = GF_UPDATE;
      gf_group->rf_level[0] = GF_ARF_STD;
      gf_group->bit_allocation[0] = gf_arf_bits;
    }
    gf_group->arf_update_idx[0] = arf_buffer_indices[0];
    gf_group->arf_ref_idx[0] = arf_buffer_indices[0];
    }
    // Step over the golden frame / overlay frame
    if (EOF == input_stats(twopass, &frame_stats))
@@ -1664,15 +1661,15 @@ static void allocate_gf_group_bits(VP10_COMP *cpi, int64_t gf_group_bits,
  // Store the bits to spend on the ARF if there is one.
  if (rc->source_alt_ref_pending) {
-    gf_group->update_type[alt_frame_index] = ARF_UPDATE;
+    gf_group->update_type[frame_index] = ARF_UPDATE;
-    gf_group->rf_level[alt_frame_index] = GF_ARF_STD;
+    gf_group->rf_level[frame_index] = GF_ARF_STD;
-    gf_group->bit_allocation[alt_frame_index] = gf_arf_bits;
+    gf_group->bit_allocation[frame_index] = gf_arf_bits;
-    gf_group->arf_src_offset[alt_frame_index] =
+    gf_group->arf_src_offset[frame_index] =
        (unsigned char)(rc->baseline_gf_interval - 1);
-    gf_group->arf_update_idx[alt_frame_index] = arf_buffer_indices[0];
+    gf_group->arf_update_idx[frame_index] = arf_buffer_indices[0];
-    gf_group->arf_ref_idx[alt_frame_index] =
+    gf_group->arf_ref_idx[frame_index] =
      arf_buffer_indices[cpi->multi_arf_last_grp_enabled &&
                         rc->source_alt_ref_active];
    ++frame_index;
--- a/vp10/encoder/lookahead.c
+++ b/vp10/encoder/lookahead.c
@@ -207,7 +207,7 @@ struct lookahead_entry *vp10_lookahead_pop(struct lookahead_ctx *ctx,
                                          int drain) {
  struct lookahead_entry *buf = NULL;
-  if (ctx->sz && (drain || ctx->sz == ctx->max_sz - MAX_PRE_FRAMES)) {
+  if (ctx && ctx->sz && (drain || ctx->sz == ctx->max_sz - MAX_PRE_FRAMES)) {
    buf = pop(ctx, &ctx->read_idx);
    ctx->sz--;
  }
--- a/vp10/encoder/subexp.c
+++ b/vp10/encoder/subexp.c
@@ -142,8 +142,10 @@ int vp10_prob_diff_update_savings_search_model(const unsigned int *ct,
                                              vpx_prob *bestp,
                                              vpx_prob upd,
                                              int stepsize) {
-  int i, old_b, new_b, update_b, savings, bestsavings, step;
+  int i, old_b, new_b, update_b, savings, bestsavings;
  int newp;
  const int step_sign = *bestp > oldp[PIVOT_NODE] ? -1 : 1;
  const int step = stepsize * step_sign;
  vpx_prob bestnewp, newplist[ENTROPY_NODES], oldplist[ENTROPY_NODES];
  vp10_model_to_full_probs(oldp, oldplist);
  memcpy(newplist, oldp, sizeof(vpx_prob) * UNCONSTRAINED_NODES);
@@ -154,9 +156,10 @@ int vp10_prob_diff_update_savings_search_model(const unsigned int *ct,
  bestsavings = 0;
  bestnewp = oldp[PIVOT_NODE];
-  if (*bestp > oldp[PIVOT_NODE]) {
+  assert(stepsize > 0);
-    step = -stepsize;
+
-    for (newp = *bestp; newp > oldp[PIVOT_NODE]; newp += step) {
+  for (newp = *bestp; (newp - oldp[PIVOT_NODE]) * step_sign < 0;
      newp += step) {
    if (newp < 1 || newp > 255)
      continue;
    newplist[PIVOT_NODE] = newp;
@@ -172,25 +175,6 @@ int vp10_prob_diff_update_savings_search_model(const unsigned int *ct,
      bestnewp = newp;
    }
  }
  } else {
    step = stepsize;
    for (newp = *bestp; newp < oldp[PIVOT_NODE]; newp += step) {
      if (newp < 1 || newp > 255)
        continue;
      newplist[PIVOT_NODE] = newp;
      vp10_model_to_full_probs(newplist, newplist);
      for (i = UNCONSTRAINED_NODES, new_b = 0; i < ENTROPY_NODES; ++i)
        new_b += cost_branch256(ct + 2 * i, newplist[i]);
      new_b += cost_branch256(ct + 2 * PIVOT_NODE, newplist[PIVOT_NODE]);
      update_b = prob_diff_update_cost(newp, oldp[PIVOT_NODE]) +
          vp10_cost_upd256;
      savings = old_b - new_b - update_b;
      if (savings > bestsavings) {
        bestsavings = savings;
        bestnewp = newp;
      }
    }
  }
  *bestp = bestnewp;
  return bestsavings;
--- a/vp8/common/reconintra4x4.h
+++ b/vp8/common/reconintra4x4.h
@@ -17,7 +17,7 @@
 extern "C" {
 #endif
-static void intra_prediction_down_copy(MACROBLOCKD *xd,
+static INLINE void intra_prediction_down_copy(MACROBLOCKD *xd,
                                              unsigned char *above_right_src)
 {
    int dst_stride = xd->dst.y_stride;
--- a/vp8/encoder/lookahead.c
+++ b/vp8/encoder/lookahead.c
@@ -181,6 +181,7 @@ vp8_lookahead_pop(struct lookahead_ctx *ctx,
 {
    struct lookahead_entry* buf = NULL;
    assert(ctx != NULL);
    if(ctx->sz && (drain || ctx->sz == ctx->max_sz - 1))
    {
        buf = pop(ctx, &ctx->read_idx);
--- a/vp8/encoder/pickinter.c
+++ b/vp8/encoder/pickinter.c
@@ -36,6 +36,8 @@
 extern unsigned int cnt_pm;
 #endif
 #define MODEL_MODE 0
 extern const int vp8_ref_frame_order[MAX_MODES];
 extern const MB_PREDICTION_MODE vp8_mode_order[MAX_MODES];
@@ -45,18 +47,21 @@ extern const MB_PREDICTION_MODE vp8_mode_order[MAX_MODES];
 // skin color classifier is defined.
 // Fixed-point skin color model parameters.
-static const int skin_mean[2] = {7463, 9614};                 // q6
+static const int skin_mean[5][2] =
    {{7463, 9614}, {6400, 10240}, {7040, 10240}, {8320, 9280}, {6800, 9614}};
 static const int skin_inv_cov[4] = {4107, 1663, 1663, 2157};  // q16
-static const int skin_threshold = 1570636;                    // q18
+static const int skin_threshold[2] = {1570636, 800000};       // q18
 // Evaluates the Mahalanobis distance measure for the input CbCr values.
-static int evaluate_skin_color_difference(int cb, int cr)
+static int evaluate_skin_color_difference(int cb, int cr, int idx) {
 {
  const int cb_q6 = cb << 6;
  const int cr_q6 = cr << 6;
-  const int cb_diff_q12 = (cb_q6 - skin_mean[0]) * (cb_q6 - skin_mean[0]);
+  const int cb_diff_q12 =
-  const int cbcr_diff_q12 = (cb_q6 - skin_mean[0]) * (cr_q6 - skin_mean[1]);
+      (cb_q6 - skin_mean[idx][0]) * (cb_q6 - skin_mean[idx][0]);
-  const int cr_diff_q12 = (cr_q6 - skin_mean[1]) * (cr_q6 - skin_mean[1]);
+  const int cbcr_diff_q12 =
      (cb_q6 - skin_mean[idx][0]) * (cr_q6 - skin_mean[idx][1]);
  const int cr_diff_q12 =
      (cr_q6 - skin_mean[idx][1]) * (cr_q6 - skin_mean[idx][1]);
  const int cb_diff_q2 = (cb_diff_q12 + (1 << 9)) >> 10;
  const int cbcr_diff_q2 = (cbcr_diff_q12 + (1 << 9)) >> 10;
  const int cr_diff_q2 = (cr_diff_q12 + (1 << 9)) >> 10;
@@ -67,6 +72,34 @@ static int evaluate_skin_color_difference(int cb, int cr)
  return skin_diff;
 }
 // Returns 1 if the input yCbCr values correspond to skin color, 0 otherwise.
 // Luma outside [40, 220] is rejected outright. With MODEL_MODE == 0 only
 // model 0 is tested against skin_threshold[0]; otherwise the sample is
 // accepted as soon as any of the 5 models falls below skin_threshold[1].
 static int is_skin_color(int y, int cb, int cr)
 {
  int model;

  if (y < 40 || y > 220)
  {
    return 0;
  }

  if (MODEL_MODE == 0)
  {
    return (evaluate_skin_color_difference(cb, cr, 0) < skin_threshold[0]);
  }

  for (model = 0; model < 5; model++)
  {
    if (evaluate_skin_color_difference(cb, cr, model) < skin_threshold[1])
    {
      return 1;
    }
  }
  return 0;
 }
 static int macroblock_corner_grad(unsigned char* signal, int stride,
                                  int offsetx, int offsety, int sgnx, int sgny)
 {
@@ -157,16 +190,6 @@ static int check_dot_artifact_candidate(VP8_COMP *cpi,
  return 0;
 }
 // Returns 1 if the input yCbCr values correspond to skin color, 0 otherwise.
 // The chroma distance is only evaluated when luma lies within [40, 220].
 static int is_skin_color(int y, int cb, int cr)
 {
  const int luma_in_range = (y >= 40 && y <= 220);

  return luma_in_range &&
         (evaluate_skin_color_difference(cb, cr) < skin_threshold);
 }
 int vp8_skip_fractional_mv_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d,
                                int_mv *bestmv, int_mv *ref_mv,
                                int error_per_bit,
--- a/vp8/vp8_cx_iface.c
+++ b/vp8/vp8_cx_iface.c
@@ -1162,31 +1162,6 @@ static vpx_image_t *vp8e_get_preview(vpx_codec_alg_priv_t *ctx)
        return NULL;
 }
 /* Control handler: pulls a single int from args and forwards it, along with
  * ctx->cpi, to vp8_update_entropy(). The callee's result is not inspected;
  * VPX_CODEC_OK is always returned. */
 static vpx_codec_err_t vp8e_update_entropy(vpx_codec_alg_priv_t *ctx,
                                            va_list args)
 {
    vp8_update_entropy(ctx->cpi, va_arg(args, int));

    return VPX_CODEC_OK;
 }
 /* Control handler: pulls a single int from args and forwards it, along with
  * ctx->cpi, to vp8_update_reference(). The callee's result is not inspected;
  * VPX_CODEC_OK is always returned. */
 static vpx_codec_err_t vp8e_update_reference(vpx_codec_alg_priv_t *ctx,
                                              va_list args)
 {
    vp8_update_reference(ctx->cpi, va_arg(args, int));

    return VPX_CODEC_OK;
 }
 /* Control handler: pulls a single int from args and forwards it, along with
  * ctx->cpi, to vp8_use_as_reference(). The callee's result is not inspected;
  * VPX_CODEC_OK is always returned. */
 static vpx_codec_err_t vp8e_use_reference(vpx_codec_alg_priv_t *ctx,
                                           va_list args)
 {
    vp8_use_as_reference(ctx->cpi, va_arg(args, int));

    return VPX_CODEC_OK;
 }
 static vpx_codec_err_t vp8e_set_frame_flags(vpx_codec_alg_priv_t *ctx,
                                            va_list args)
 {
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -137,6 +137,8 @@ struct macroblock {
  // the visual quality at the boundary of moving color objects.
  uint8_t color_sensitivity[2];
  uint8_t sb_is_skin;
  void (*fwd_txm4x4)(const int16_t *input, tran_low_t *output, int stride);
  void (*itxm_add)(const tran_low_t *input, uint8_t *dest, int stride, int eob);
 #if CONFIG_VP9_HIGHBITDEPTH
--- a/vp9/encoder/vp9_denoiser.c
+++ b/vp9/encoder/vp9_denoiser.c
@@ -344,7 +344,9 @@ void vp9_denoiser_denoise(VP9_DENOISER *denoiser, MACROBLOCK *mb,
  mv_col = ctx->best_sse_mv.as_mv.col;
  mv_row = ctx->best_sse_mv.as_mv.row;
  motion_magnitude = mv_row * mv_row + mv_col * mv_col;
-  if (denoiser->denoising_level == kDenHigh && motion_magnitude < 16) {
+  if (!is_skin &&
      denoiser->denoising_level == kDenHigh &&
      motion_magnitude < 16) {
    denoiser->increase_denoising = 1;
  } else {
    denoiser->increase_denoising = 0;
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -714,6 +714,10 @@ static int choose_partitioning(VP9_COMP *cpi,
  s = x->plane[0].src.buf;
  sp = x->plane[0].src.stride;
  // Index for force_split: 0 for 64x64, 1-4 for 32x32 blocks,
  // 5-20 for the 16x16 blocks.
  force_split[0] = 0;
  if (!is_key_frame) {
    // In the case of spatial/temporal scalable coding, the assumption here is
    // that the temporal reference frame will always be of type LAST_FRAME.
@@ -768,6 +772,49 @@ static int choose_partitioning(VP9_COMP *cpi,
    vp9_build_inter_predictors_sb(xd, mi_row, mi_col, BLOCK_64X64);
    // Check if most of the superblock is skin content, and if so, force split
    // to 32x32. Avoid checking superblocks on/near boundary at high resolution.
    // Note superblock may still pick 64X64 if y_sad is very small
    // (i.e., y_sad < cpi->vbp_threshold_sad) below. For now leave this as is.
    x->sb_is_skin = 0;
 #if !CONFIG_VP9_HIGHBITDEPTH
    if (cpi->oxcf.content != VP9E_CONTENT_SCREEN && (low_res || (mi_col >= 8 &&
        mi_col + 8 < cm->mi_cols && mi_row >= 8 && mi_row + 8 < cm->mi_rows))) {
      int num_16x16_skin = 0;
      int num_16x16_nonskin = 0;
      uint8_t *ysignal = x->plane[0].src.buf;
      uint8_t *usignal = x->plane[1].src.buf;
      uint8_t *vsignal = x->plane[2].src.buf;
      int spuv = x->plane[1].src.stride;
      for (i = 0; i < 4; i++) {
        for (j = 0; j < 4; j++) {
          int is_skin = vp9_compute_skin_block(ysignal,
                                               usignal,
                                               vsignal,
                                               sp,
                                               spuv,
                                               BLOCK_16X16);
          num_16x16_skin += is_skin;
          num_16x16_nonskin += (1 - is_skin);
          if (num_16x16_nonskin > 3) {
            // Exit loop if at least 4 of the 16x16 blocks are not skin.
            i = 4;
            j = 4;
          }
          ysignal += 16;
          usignal += 8;
          vsignal += 8;
        }
        ysignal += (sp << 4) - 64;
        usignal += (spuv << 3) - 32;
        vsignal += (spuv << 3) - 32;
      }
      if (num_16x16_skin > 12) {
        x->sb_is_skin = 1;
        force_split[0] = 1;
      }
    }
 #endif
    for (i = 1; i <= 2; ++i) {
      struct macroblock_plane  *p = &x->plane[i];
      struct macroblockd_plane *pd = &xd->plane[i];
@@ -779,6 +826,8 @@ static int choose_partitioning(VP9_COMP *cpi,
        uv_sad = cpi->fn_ptr[bs].sdf(p->src.buf, p->src.stride,
                                     pd->dst.buf, pd->dst.stride);
        // TODO(marpan): Investigate if we should lower this threshold if
        // superblock is detected as skin.
        x->color_sensitivity[i - 1] = uv_sad > (y_sad >> 2);
    }
@@ -818,9 +867,6 @@ static int choose_partitioning(VP9_COMP *cpi,
 #endif  // CONFIG_VP9_HIGHBITDEPTH
  }
  // Index for force_split: 0 for 64x64, 1-4 for 32x32 blocks,
  // 5-20 for the 16x16 blocks.
  force_split[0] = 0;
  // Fill in the entire tree of 8x8 (or 4x4 under some conditions) variances
  // for splits.
  for (i = 0; i < 4; i++) {
@@ -3629,6 +3675,7 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi,
    vp9_rd_cost_init(&dummy_rdc);
    x->color_sensitivity[0] = 0;
    x->color_sensitivity[1] = 0;
    x->sb_is_skin = 0;
    if (seg->enabled) {
      const uint8_t *const map = seg->update_map ? cpi->segmentation_map
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -1720,15 +1720,13 @@ static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits,
      gf_group->update_type[0] = OVERLAY_UPDATE;
      gf_group->rf_level[0] = INTER_NORMAL;
      gf_group->bit_allocation[0] = 0;
      gf_group->arf_update_idx[0] = arf_buffer_indices[0];
      gf_group->arf_ref_idx[0] = arf_buffer_indices[0];
    } else {
      gf_group->update_type[0] = GF_UPDATE;
      gf_group->rf_level[0] = GF_ARF_STD;
      gf_group->bit_allocation[0] = gf_arf_bits;
    }
    gf_group->arf_update_idx[0] = arf_buffer_indices[0];
    gf_group->arf_ref_idx[0] = arf_buffer_indices[0];
    }
    // Step over the golden frame / overlay frame
    if (EOF == input_stats(twopass, &frame_stats))
--- a/vp9/encoder/vp9_lookahead.c
+++ b/vp9/encoder/vp9_lookahead.c
@@ -207,7 +207,7 @@ struct lookahead_entry *vp9_lookahead_pop(struct lookahead_ctx *ctx,
                                          int drain) {
  struct lookahead_entry *buf = NULL;
-  if (ctx->sz && (drain || ctx->sz == ctx->max_sz - MAX_PRE_FRAMES)) {
+  if (ctx && ctx->sz && (drain || ctx->sz == ctx->max_sz - MAX_PRE_FRAMES)) {
    buf = pop(ctx, &ctx->read_idx);
    ctx->sz--;
  }
--- a/vp9/encoder/vp9_noise_estimate.c
+++ b/vp9/encoder/vp9_noise_estimate.c
@@ -66,6 +66,7 @@ int enable_noise_estimation(VP9_COMP *const cpi) {
    return 0;
 }
 #if CONFIG_VP9_TEMPORAL_DENOISING
 static void copy_frame(YV12_BUFFER_CONFIG * const dest,
                       const YV12_BUFFER_CONFIG * const src) {
  int r;
@@ -81,6 +82,7 @@ static void copy_frame(YV12_BUFFER_CONFIG * const dest,
    srcbuf += src->y_stride;
  }
 }
 #endif  // CONFIG_VP9_TEMPORAL_DENOISING
 NOISE_LEVEL vp9_noise_estimate_extract_level(NOISE_ESTIMATE *const ne) {
  int noise_level = kLowLow;
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -852,6 +852,12 @@ static void encode_breakout_test(VP9_COMP *cpi, MACROBLOCK *x,
  if (var <= thresh_ac && (sse - var) <= thresh_dc) {
    unsigned int sse_u, sse_v;
    unsigned int var_u, var_v;
    unsigned int thresh_ac_uv = thresh_ac;
    unsigned int thresh_dc_uv = thresh_dc;
    if (x->sb_is_skin) {
      thresh_ac_uv = 0;
      thresh_dc_uv = 0;
    }
    // Skip UV prediction unless breakout is zero (lossless) to save
    // computation with low impact on the result
@@ -867,14 +873,14 @@ static void encode_breakout_test(VP9_COMP *cpi, MACROBLOCK *x,
                                    xd->plane[1].dst.stride, &sse_u);
    // U skipping condition checking
-    if (((var_u << 2) <= thresh_ac) && (sse_u - var_u <= thresh_dc)) {
+    if (((var_u << 2) <= thresh_ac_uv) && (sse_u - var_u <= thresh_dc_uv)) {
      var_v = cpi->fn_ptr[uv_size].vf(x->plane[2].src.buf,
                                      x->plane[2].src.stride,
                                      xd->plane[2].dst.buf,
                                      xd->plane[2].dst.stride, &sse_v);
      // V skipping condition checking
-      if (((var_v << 2) <= thresh_ac) && (sse_v - var_v <= thresh_dc)) {
+      if (((var_v << 2) <= thresh_ac_uv) && (sse_v - var_v <= thresh_dc_uv)) {
        x->skip = 1;
        // The cost of skip bit needs to be added.
@@ -1585,7 +1591,8 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
    this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, this_rdc.rate, this_rdc.dist);
    if (cpi->oxcf.speed >= 5 &&
-        cpi->oxcf.content != VP9E_CONTENT_SCREEN) {
+        cpi->oxcf.content != VP9E_CONTENT_SCREEN &&
        !x->sb_is_skin) {
      // Bias against non-zero (above some threshold) motion for large blocks.
      // This is temporary fix to avoid selection of large mv for big blocks.
      if (frame_mv[this_mode][ref_frame].as_mv.row > 64 ||
--- a/vp9/encoder/vp9_skin_detection.c
+++ b/vp9/encoder/vp9_skin_detection.c
@@ -15,22 +15,28 @@
 #include "vp9/encoder/vp9_encoder.h"
 #include "vp9/encoder/vp9_skin_detection.h"
 #define MODEL_MODE 0
 // Fixed-point skin color model parameters.
-static const int skin_mean[2] = {7463, 9614};                 // q6
+static const int skin_mean[5][2] = {
    {7463, 9614}, {6400, 10240}, {7040, 10240}, {8320, 9280}, {6800, 9614}};
 static const int skin_inv_cov[4] = {4107, 1663, 1663, 2157};  // q16
-static const int skin_threshold = 1570636;                    // q18
+static const int skin_threshold[2] = {1570636, 800000};       // q18
 // Thresholds on luminance.
 static const int y_low = 20;
 static const int y_high = 220;
 // Evaluates the Mahalanobis distance measure for the input CbCr values.
-static int evaluate_skin_color_difference(int cb, int cr) {
+static int evaluate_skin_color_difference(int cb, int cr, int idx) {
  const int cb_q6 = cb << 6;
  const int cr_q6 = cr << 6;
-  const int cb_diff_q12 = (cb_q6 - skin_mean[0]) * (cb_q6 - skin_mean[0]);
+  const int cb_diff_q12 =
-  const int cbcr_diff_q12 = (cb_q6 - skin_mean[0]) * (cr_q6 - skin_mean[1]);
+      (cb_q6 - skin_mean[idx][0]) * (cb_q6 - skin_mean[idx][0]);
-  const int cr_diff_q12 = (cr_q6 - skin_mean[1]) * (cr_q6 - skin_mean[1]);
+  const int cbcr_diff_q12 =
      (cb_q6 - skin_mean[idx][0]) * (cr_q6 - skin_mean[idx][1]);
  const int cr_diff_q12 =
      (cr_q6 - skin_mean[idx][1]) * (cr_q6 - skin_mean[idx][1]);
  const int cb_diff_q2 = (cb_diff_q12 + (1 << 9)) >> 10;
  const int cbcr_diff_q2 = (cbcr_diff_q12 + (1 << 9)) >> 10;
  const int cr_diff_q2 = (cr_diff_q12 + (1 << 9)) >> 10;
@@ -42,10 +48,21 @@ static int evaluate_skin_color_difference(int cb, int cr) {
 }
 int vp9_skin_pixel(const uint8_t y, const uint8_t cb, const uint8_t cr) {
-  if (y < y_low || y > y_high)
+  if (y < y_low || y > y_high) {
    return 0;
-  else
+  } else {
-    return (evaluate_skin_color_difference(cb, cr) < skin_threshold);
+    if (MODEL_MODE == 0) {
      return (evaluate_skin_color_difference(cb, cr, 0) < skin_threshold[0]);
    } else {
      int i = 0;
      for (; i < 5; i++) {
        if (evaluate_skin_color_difference(cb, cr, i) < skin_threshold[1]) {
          return 1;
        }
      }
      return 0;
    }
  }
 }
 int vp9_compute_skin_block(const uint8_t *y, const uint8_t *u, const uint8_t *v,
--- a/vp9/encoder/vp9_subexp.c
+++ b/vp9/encoder/vp9_subexp.c
@@ -142,8 +142,10 @@ int vp9_prob_diff_update_savings_search_model(const unsigned int *ct,
                                              vpx_prob *bestp,
                                              vpx_prob upd,
                                              int stepsize) {
-  int i, old_b, new_b, update_b, savings, bestsavings, step;
+  int i, old_b, new_b, update_b, savings, bestsavings;
  int newp;
  const int step_sign = *bestp > oldp[PIVOT_NODE] ? -1 : 1;
  const int step = stepsize * step_sign;
  vpx_prob bestnewp, newplist[ENTROPY_NODES], oldplist[ENTROPY_NODES];
  vp9_model_to_full_probs(oldp, oldplist);
  memcpy(newplist, oldp, sizeof(vpx_prob) * UNCONSTRAINED_NODES);
@@ -154,9 +156,10 @@ int vp9_prob_diff_update_savings_search_model(const unsigned int *ct,
  bestsavings = 0;
  bestnewp = oldp[PIVOT_NODE];
-  if (*bestp > oldp[PIVOT_NODE]) {
+  assert(stepsize > 0);
-    step = -stepsize;
+
-    for (newp = *bestp; newp > oldp[PIVOT_NODE]; newp += step) {
+  for (newp = *bestp; (newp - oldp[PIVOT_NODE]) * step_sign < 0;
      newp += step) {
    if (newp < 1 || newp > 255)
      continue;
    newplist[PIVOT_NODE] = newp;
@@ -172,25 +175,6 @@ int vp9_prob_diff_update_savings_search_model(const unsigned int *ct,
      bestnewp = newp;
    }
  }
  } else {
    step = stepsize;
    for (newp = *bestp; newp < oldp[PIVOT_NODE]; newp += step) {
      if (newp < 1 || newp > 255)
        continue;
      newplist[PIVOT_NODE] = newp;
      vp9_model_to_full_probs(newplist, newplist);
      for (i = UNCONSTRAINED_NODES, new_b = 0; i < ENTROPY_NODES; ++i)
        new_b += cost_branch256(ct + 2 * i, newplist[i]);
      new_b += cost_branch256(ct + 2 * PIVOT_NODE, newplist[PIVOT_NODE]);
      update_b = prob_diff_update_cost(newp, oldp[PIVOT_NODE]) +
          vp9_cost_upd256;
      savings = old_b - new_b - update_b;
      if (savings > bestsavings) {
        bestsavings = savings;
        bestnewp = newp;
      }
    }
  }
  *bestp = bestnewp;
  return bestsavings;
--- a/vpx_dsp/intrapred.c
+++ b/vpx_dsp/intrapred.c
@@ -44,6 +44,7 @@ static INLINE void d207_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
      dst[r * stride + c] = dst[(r + 1) * stride + c - 2];
 }
 #if CONFIG_MISC_FIXES
 static INLINE void d207e_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
                                   const uint8_t *above, const uint8_t *left) {
  int r, c;
@@ -58,6 +59,7 @@ static INLINE void d207e_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
    dst += stride;
  }
 }
 #endif  // CONFIG_MISC_FIXES
 static INLINE void d63_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
                                 const uint8_t *above, const uint8_t *left) {
@@ -76,6 +78,7 @@ static INLINE void d63_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
  }
 }
 #if CONFIG_MISC_FIXES
 static INLINE void d63e_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
                                  const uint8_t *above, const uint8_t *left) {
  int r, c;
@@ -89,6 +92,7 @@ static INLINE void d63e_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
    dst += stride;
  }
 }
 #endif  // CONFIG_MISC_FIXES
 static INLINE void d45_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
                                 const uint8_t *above, const uint8_t *left) {
@@ -109,6 +113,7 @@ static INLINE void d45_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
  }
 }
 #if CONFIG_MISC_FIXES
 static INLINE void d45e_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
                                  const uint8_t *above, const uint8_t *left) {
  int r, c;
@@ -121,6 +126,7 @@ static INLINE void d45e_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
    dst += stride;
  }
 }
 #endif  // CONFIG_MISC_FIXES
 static INLINE void d117_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
                                  const uint8_t *above, const uint8_t *left) {
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -785,10 +785,10 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
    specialize qw/vpx_idct4x4_1_add sse2/;
    add_proto qw/void vpx_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/vpx_idct8x8_64_add sse2/;
+    specialize qw/vpx_idct8x8_64_add sse2/, "$ssse3_x86_64_x86inc";
    add_proto qw/void vpx_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/vpx_idct8x8_12_add sse2/;
+    specialize qw/vpx_idct8x8_12_add sse2/, "$ssse3_x86_64_x86inc";
    add_proto qw/void vpx_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
    specialize qw/vpx_idct8x8_1_add sse2/;
@@ -803,14 +803,15 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
    specialize qw/vpx_idct16x16_1_add sse2/;
    add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/vpx_idct32x32_1024_add sse2/;
+    specialize qw/vpx_idct32x32_1024_add sse2/, "$ssse3_x86_64_x86inc";
    add_proto qw/void vpx_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/vpx_idct32x32_135_add sse2/;
+    specialize qw/vpx_idct32x32_135_add sse2/, "$ssse3_x86_64_x86inc";
    # Need to add 135 eob idct32x32 implementations.
    $vpx_idct32x32_135_add_sse2=vpx_idct32x32_1024_add_sse2;
    add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/vpx_idct32x32_34_add sse2/;
+    specialize qw/vpx_idct32x32_34_add sse2/, "$ssse3_x86_64_x86inc";
    add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
    specialize qw/vpx_idct32x32_1_add sse2/;
--- a/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm
+++ b/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm
@@ -220,7 +220,24 @@ cglobal idct8x8_64_add, 3, 5, 13, input, output, stride
  mova    m12, [pw_11585x2]
  lea      r3, [2 * strideq]
-
+%if CONFIG_VP9_HIGHBITDEPTH
  mova     m0, [inputq +   0]
  packssdw m0, [inputq +  16]
  mova     m1, [inputq +  32]
  packssdw m1, [inputq +  48]
  mova     m2, [inputq +  64]
  packssdw m2, [inputq +  80]
  mova     m3, [inputq +  96]
  packssdw m3, [inputq + 112]
  mova     m4, [inputq + 128]
  packssdw m4, [inputq + 144]
  mova     m5, [inputq + 160]
  packssdw m5, [inputq + 176]
  mova     m6, [inputq + 192]
  packssdw m6, [inputq + 208]
  mova     m7, [inputq + 224]
  packssdw m7, [inputq + 240]
 %else
  mova     m0, [inputq +   0]
  mova     m1, [inputq +  16]
  mova     m2, [inputq +  32]
@@ -229,7 +246,7 @@ cglobal idct8x8_64_add, 3, 5, 13, input, output, stride
  mova     m5, [inputq +  80]
  mova     m6, [inputq +  96]
  mova     m7, [inputq + 112]
-
+%endif
  TRANSPOSE8X8  0, 1, 2, 3, 4, 5, 6, 7, 9
  IDCT8_1D
  TRANSPOSE8X8  0, 1, 2, 3, 4, 5, 6, 7, 9
@@ -254,10 +271,21 @@ cglobal idct8x8_12_add, 3, 5, 13, input, output, stride
  lea        r3, [2 * strideq]
 %if CONFIG_VP9_HIGHBITDEPTH
  mova       m0, [inputq +   0]
  packssdw   m0, [inputq +  16]
  mova       m1, [inputq +  32]
  packssdw   m1, [inputq +  48]
  mova       m2, [inputq +  64]
  packssdw   m2, [inputq +  80]
  mova       m3, [inputq +  96]
  packssdw   m3, [inputq + 112]
 %else
  mova       m0, [inputq +  0]
  mova       m1, [inputq + 16]
  mova       m2, [inputq + 32]
  mova       m3, [inputq + 48]
 %endif
  punpcklwd  m0, m1
  punpcklwd  m2, m3
@@ -765,6 +793,24 @@ idct32x32_34:
  lea             r4, [rsp + transposed_in]
 idct32x32_34_transpose:
 %if CONFIG_VP9_HIGHBITDEPTH
  mova            m0, [r3 +       0]
  packssdw        m0, [r3 +      16]
  mova            m1, [r3 + 32 *  4]
  packssdw        m1, [r3 + 32 *  4 + 16]
  mova            m2, [r3 + 32 *  8]
  packssdw        m2, [r3 + 32 *  8 + 16]
  mova            m3, [r3 + 32 * 12]
  packssdw        m3, [r3 + 32 * 12 + 16]
  mova            m4, [r3 + 32 * 16]
  packssdw        m4, [r3 + 32 * 16 + 16]
  mova            m5, [r3 + 32 * 20]
  packssdw        m5, [r3 + 32 * 20 + 16]
  mova            m6, [r3 + 32 * 24]
  packssdw        m6, [r3 + 32 * 24 + 16]
  mova            m7, [r3 + 32 * 28]
  packssdw        m7, [r3 + 32 * 28 + 16]
 %else
  mova            m0, [r3 +       0]
  mova            m1, [r3 + 16 *  4]
  mova            m2, [r3 + 16 *  8]
@@ -773,6 +819,7 @@ idct32x32_34_transpose:
  mova            m5, [r3 + 16 * 20]
  mova            m6, [r3 + 16 * 24]
  mova            m7, [r3 + 16 * 28]
 %endif
  TRANSPOSE8X8  0, 1, 2, 3, 4, 5, 6, 7, 9
@@ -1176,6 +1223,24 @@ idct32x32_135:
  mov             r7, 2
 idct32x32_135_transpose:
 %if CONFIG_VP9_HIGHBITDEPTH
  mova            m0, [r3 +       0]
  packssdw        m0, [r3 +      16]
  mova            m1, [r3 + 32 *  4]
  packssdw        m1, [r3 + 32 *  4 + 16]
  mova            m2, [r3 + 32 *  8]
  packssdw        m2, [r3 + 32 *  8 + 16]
  mova            m3, [r3 + 32 * 12]
  packssdw        m3, [r3 + 32 * 12 + 16]
  mova            m4, [r3 + 32 * 16]
  packssdw        m4, [r3 + 32 * 16 + 16]
  mova            m5, [r3 + 32 * 20]
  packssdw        m5, [r3 + 32 * 20 + 16]
  mova            m6, [r3 + 32 * 24]
  packssdw        m6, [r3 + 32 * 24 + 16]
  mova            m7, [r3 + 32 * 28]
  packssdw        m7, [r3 + 32 * 28 + 16]
 %else
  mova            m0, [r3 +       0]
  mova            m1, [r3 + 16 *  4]
  mova            m2, [r3 + 16 *  8]
@@ -1184,7 +1249,7 @@ idct32x32_135_transpose:
  mova            m5, [r3 + 16 * 20]
  mova            m6, [r3 + 16 * 24]
  mova            m7, [r3 + 16 * 28]
-
+%endif
  TRANSPOSE8X8  0, 1, 2, 3, 4, 5, 6, 7, 9
  mova [r4 +      0], m0
@@ -1196,14 +1261,22 @@ idct32x32_135_transpose:
  mova [r4 + 16 * 6], m6
  mova [r4 + 16 * 7], m7
 %if CONFIG_VP9_HIGHBITDEPTH
  add             r3, 32
 %else
  add             r3, 16
 %endif
  add             r4, 16 * 8
  dec             r7
  jne idct32x32_135_transpose
  IDCT32X32_135 16*0, 16*32, 16*64, 16*96
  lea            stp, [stp + 16 * 8]
 %if CONFIG_VP9_HIGHBITDEPTH
  lea         inputq, [inputq + 32 * 32]
 %else
  lea         inputq, [inputq + 16 * 32]
 %endif
  dec             r6
  jnz idct32x32_135
@@ -1614,6 +1687,24 @@ idct32x32_1024:
  mov             r7, 4
 idct32x32_1024_transpose:
 %if CONFIG_VP9_HIGHBITDEPTH
  mova            m0, [r3 +       0]
  packssdw        m0, [r3 +      16]
  mova            m1, [r3 + 32 *  4]
  packssdw        m1, [r3 + 32 *  4 + 16]
  mova            m2, [r3 + 32 *  8]
  packssdw        m2, [r3 + 32 *  8 + 16]
  mova            m3, [r3 + 32 * 12]
  packssdw        m3, [r3 + 32 * 12 + 16]
  mova            m4, [r3 + 32 * 16]
  packssdw        m4, [r3 + 32 * 16 + 16]
  mova            m5, [r3 + 32 * 20]
  packssdw        m5, [r3 + 32 * 20 + 16]
  mova            m6, [r3 + 32 * 24]
  packssdw        m6, [r3 + 32 * 24 + 16]
  mova            m7, [r3 + 32 * 28]
  packssdw        m7, [r3 + 32 * 28 + 16]
 %else
  mova            m0, [r3 +       0]
  mova            m1, [r3 + 16 *  4]
  mova            m2, [r3 + 16 *  8]
@@ -1622,6 +1713,7 @@ idct32x32_1024_transpose:
  mova            m5, [r3 + 16 * 20]
  mova            m6, [r3 + 16 * 24]
  mova            m7, [r3 + 16 * 28]
 %endif
  TRANSPOSE8X8  0, 1, 2, 3, 4, 5, 6, 7, 9
@@ -1633,8 +1725,11 @@ idct32x32_1024_transpose:
  mova [r4 + 16 * 5], m5
  mova [r4 + 16 * 6], m6
  mova [r4 + 16 * 7], m7
-
+%if CONFIG_VP9_HIGHBITDEPTH
  add             r3, 32
 %else
  add             r3, 16
 %endif
  add             r4, 16 * 8
  dec             r7
  jne idct32x32_1024_transpose
@@ -1642,7 +1737,11 @@ idct32x32_1024_transpose:
  IDCT32X32_1024 16*0, 16*32, 16*64, 16*96
  lea            stp, [stp + 16 * 8]
 %if CONFIG_VP9_HIGHBITDEPTH
  lea         inputq, [inputq + 32 * 32]
 %else
  lea         inputq, [inputq + 16 * 32]
 %endif
  dec             r6
  jnz idct32x32_1024