Merge changes Ic3a68557,Ib1dbe41a,I0da09270,Ibdbd720d into nextgenv2
* changes:
  - Deringing cleanup: remove DERING_REFINEMENT (always on now)
  - Don't run the deringing filter on skipped blocks within a superblock
  - Don't dering skipped superblocks
  - On x86 use _mm_set_epi32 when _mm_cvtsi64_si128 isn't available
This commit is contained in:
@@ -162,7 +162,11 @@ SIMD_INLINE v128 v128_unziphi_8(v128 a, v128 b) {
|
||||
|
||||
SIMD_INLINE v128 v128_unziplo_8(v128 a, v128 b) {
|
||||
#if defined(__SSSE3__)
|
||||
#ifdef __x86_64__
|
||||
v128 order = _mm_cvtsi64_si128(0x0e0c0a0806040200LL);
|
||||
#else
|
||||
v128 order = _mm_set_epi32(0, 0, 0x0e0c0a08, 0x06040200);
|
||||
#endif
|
||||
return _mm_unpacklo_epi64(_mm_shuffle_epi8(b, order),
|
||||
_mm_shuffle_epi8(a, order));
|
||||
#else
|
||||
@@ -176,7 +180,11 @@ SIMD_INLINE v128 v128_unziphi_16(v128 a, v128 b) {
|
||||
|
||||
SIMD_INLINE v128 v128_unziplo_16(v128 a, v128 b) {
|
||||
#if defined(__SSSE3__)
|
||||
#ifdef __x86_64__
|
||||
v128 order = _mm_cvtsi64_si128(0x0d0c090805040100LL);
|
||||
#else
|
||||
v128 order = _mm_set_epi32(0, 0, 0x0d0c0908, 0x05040100);
|
||||
#endif
|
||||
return _mm_unpacklo_epi64(_mm_shuffle_epi8(b, order),
|
||||
_mm_shuffle_epi8(a, order));
|
||||
#else
|
||||
|
@@ -47,7 +47,11 @@ SIMD_INLINE v64 v64_from_32(uint32_t x, uint32_t y) {
|
||||
}
|
||||
|
||||
/* Builds a 64-bit SIMD vector (v64) from a scalar uint64_t.
   On x86-64 the 64-bit GPR->XMM move intrinsic is used directly; on 32-bit
   x86 targets _mm_cvtsi64_si128 is unavailable, so the value is assembled
   from its two 32-bit halves instead (this is the point of the commit). */
SIMD_INLINE v64 v64_from_64(uint64_t x) {
|
||||
#ifdef __x86_64__
|
||||
return _mm_cvtsi64_si128(x);
|
||||
#else
|
||||
/* Low half in lane 0, high half in lane 1; upper lanes zeroed,
   matching the zero-extension behavior of _mm_cvtsi64_si128. */
return _mm_set_epi32(0, 0, x >> 32, (uint32_t)x);
|
||||
#endif
|
||||
}
|
||||
|
||||
SIMD_INLINE uint64_t v64_u64(v64 x) {
|
||||
@@ -168,7 +172,7 @@ SIMD_INLINE v64 v64_pack_s16_s8(v64 a, v64 b) {
|
||||
SIMD_INLINE v64 v64_unziphi_8(v64 a, v64 b) {
|
||||
#if defined(__SSSE3__)
|
||||
return _mm_shuffle_epi8(_mm_unpacklo_epi64(b, a),
|
||||
_mm_cvtsi64_si128(0x0f0d0b0907050301LL));
|
||||
v64_from_64(0x0f0d0b0907050301LL));
|
||||
#else
|
||||
return _mm_packus_epi16(
|
||||
_mm_unpacklo_epi64(_mm_srli_epi16(b, 8), _mm_srli_epi16(a, 8)),
|
||||
@@ -179,7 +183,7 @@ SIMD_INLINE v64 v64_unziphi_8(v64 a, v64 b) {
|
||||
SIMD_INLINE v64 v64_unziplo_8(v64 a, v64 b) {
|
||||
#if defined(__SSSE3__)
|
||||
return _mm_shuffle_epi8(_mm_unpacklo_epi64(b, a),
|
||||
_mm_cvtsi64_si128(0x0e0c0a0806040200LL));
|
||||
v64_from_64(0x0e0c0a0806040200LL));
|
||||
#else
|
||||
return v64_unziphi_8(_mm_slli_si128(a, 1), _mm_slli_si128(b, 1));
|
||||
#endif
|
||||
@@ -188,7 +192,7 @@ SIMD_INLINE v64 v64_unziplo_8(v64 a, v64 b) {
|
||||
SIMD_INLINE v64 v64_unziphi_16(v64 a, v64 b) {
|
||||
#if defined(__SSSE3__)
|
||||
return _mm_shuffle_epi8(_mm_unpacklo_epi64(b, a),
|
||||
_mm_cvtsi64_si128(0x0f0e0b0a07060302LL));
|
||||
v64_from_64(0x0f0e0b0a07060302LL));
|
||||
#else
|
||||
return _mm_packs_epi32(
|
||||
_mm_unpacklo_epi64(_mm_srai_epi32(b, 16), _mm_srai_epi32(a, 16)),
|
||||
@@ -199,7 +203,7 @@ SIMD_INLINE v64 v64_unziphi_16(v64 a, v64 b) {
|
||||
SIMD_INLINE v64 v64_unziplo_16(v64 a, v64 b) {
|
||||
#if defined(__SSSE3__)
|
||||
return _mm_shuffle_epi8(_mm_unpacklo_epi64(b, a),
|
||||
_mm_cvtsi64_si128(0x0d0c090805040100LL));
|
||||
v64_from_64(0x0d0c090805040100LL));
|
||||
#else
|
||||
return v64_unziphi_16(_mm_slli_si128(a, 2), _mm_slli_si128(b, 2));
|
||||
#endif
|
||||
|
@@ -101,19 +101,15 @@ void av1_dering_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
|
||||
for (pli = 0; pli < 3; pli++) {
|
||||
int16_t dst[MAX_MIB_SIZE * MAX_MIB_SIZE * 8 * 8];
|
||||
int threshold;
|
||||
#if DERING_REFINEMENT
|
||||
level = compute_level_from_index(
|
||||
global_level,
|
||||
cm->mi_grid_visible[MAX_MIB_SIZE * sbr * cm->mi_stride +
|
||||
MAX_MIB_SIZE * sbc]
|
||||
->mbmi.dering_gain);
|
||||
#else
|
||||
level = global_level;
|
||||
#endif
|
||||
/* FIXME: This is a temporary hack that uses more conservative
|
||||
deringing for chroma. */
|
||||
if (pli) level = (level * 5 + 4) >> 3;
|
||||
if (sb_all_skip(cm, sbr * MAX_MIB_SIZE, sbc * MAX_MIB_SIZE)) level = 0;
|
||||
if (sb_all_skip(cm, sbr * MAX_MIB_SIZE, sbc * MAX_MIB_SIZE)) continue;
|
||||
threshold = level << coeff_shift;
|
||||
od_dering(&OD_DERING_VTBL_C, dst, MAX_MIB_SIZE * bsize[pli],
|
||||
&src[pli][sbr * stride * bsize[pli] * MAX_MIB_SIZE +
|
||||
|
@@ -24,7 +24,6 @@ extern "C" {
|
||||
#define DERING_LEVEL_BITS 6
|
||||
#define MAX_DERING_LEVEL (1 << DERING_LEVEL_BITS)
|
||||
|
||||
#define DERING_REFINEMENT 1
|
||||
#define DERING_REFINEMENT_BITS 2
|
||||
#define DERING_REFINEMENT_LEVELS 4
|
||||
|
||||
|
@@ -275,6 +275,13 @@ void od_dering(const od_dering_opt_vtbl *vtbl, int16_t *y, int ystride,
|
||||
in[i * OD_FILT_BSTRIDE + j] = x[i * xstride + j];
|
||||
}
|
||||
}
|
||||
/* Assume deringing filter is sparsely applied, so do one large copy rather
|
||||
than small copies later if deringing is skipped. */
|
||||
for (i = 0; i < nvb << bsize; i++) {
|
||||
for (j = 0; j < nhb << bsize; j++) {
|
||||
y[i * ystride + j] = in[i * OD_FILT_BSTRIDE + j];
|
||||
}
|
||||
}
|
||||
if (pli == 0) {
|
||||
for (by = 0; by < nvb; by++) {
|
||||
for (bx = 0; bx < nhb; bx++) {
|
||||
@@ -325,6 +332,7 @@ void od_dering(const od_dering_opt_vtbl *vtbl, int16_t *y, int ystride,
|
||||
}
|
||||
for (by = 0; by < nvb; by++) {
|
||||
for (bx = 0; bx < nhb; bx++) {
|
||||
if (thresh[by][bx] == 0) continue;
|
||||
(vtbl->filter_dering_direction[bsize - OD_LOG_BSIZE0])(
|
||||
&y[(by * ystride << bsize) + (bx << bsize)], ystride,
|
||||
&in[(by * OD_FILT_BSTRIDE << bsize) + (bx << bsize)], thresh[by][bx],
|
||||
@@ -338,6 +346,7 @@ void od_dering(const od_dering_opt_vtbl *vtbl, int16_t *y, int ystride,
|
||||
}
|
||||
for (by = 0; by < nvb; by++) {
|
||||
for (bx = 0; bx < nhb; bx++) {
|
||||
if (thresh[by][bx] == 0) continue;
|
||||
(vtbl->filter_dering_orthogonal[bsize - OD_LOG_BSIZE0])(
|
||||
&y[(by * ystride << bsize) + (bx << bsize)], ystride,
|
||||
&in[(by * OD_FILT_BSTRIDE << bsize) + (bx << bsize)],
|
||||
|
@@ -1772,7 +1772,7 @@ static void decode_partition(AV1Decoder *const pbi, MACROBLOCKD *const xd,
|
||||
if (bsize >= BLOCK_8X8 &&
|
||||
(bsize == BLOCK_8X8 || partition != PARTITION_SPLIT))
|
||||
dec_update_partition_context(xd, mi_row, mi_col, subsize, num_8x8_wh);
|
||||
#if DERING_REFINEMENT
|
||||
#if CONFIG_DERING
|
||||
if (bsize == BLOCK_64X64) {
|
||||
if (cm->dering_level != 0 && !sb_all_skip(cm, mi_row, mi_col)) {
|
||||
cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col]->mbmi.dering_gain =
|
||||
@@ -1782,7 +1782,7 @@ static void decode_partition(AV1Decoder *const pbi, MACROBLOCKD *const xd,
|
||||
0;
|
||||
}
|
||||
}
|
||||
#endif // DERING_REFINEMENT
|
||||
#endif
|
||||
#endif // CONFIG_EXT_PARTITION_TYPES
|
||||
}
|
||||
|
||||
|
@@ -1869,7 +1869,7 @@ static void write_modes_sb(AV1_COMP *const cpi, const TileInfo *const tile,
|
||||
(bsize == BLOCK_8X8 || partition != PARTITION_SPLIT))
|
||||
update_partition_context(xd, mi_row, mi_col, subsize, bsize);
|
||||
|
||||
#if DERING_REFINEMENT
|
||||
#if CONFIG_DERING
|
||||
if (bsize == BLOCK_64X64 && cm->dering_level != 0 &&
|
||||
!sb_all_skip(cm, mi_row, mi_col)) {
|
||||
aom_write_literal(
|
||||
|
@@ -96,6 +96,7 @@ int av1_dering_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref,
|
||||
int16_t dst[MAX_MIB_SIZE * MAX_MIB_SIZE * 8 * 8];
|
||||
nhb = AOMMIN(MAX_MIB_SIZE, cm->mi_cols - MAX_MIB_SIZE * sbc);
|
||||
nvb = AOMMIN(MAX_MIB_SIZE, cm->mi_rows - MAX_MIB_SIZE * sbr);
|
||||
if (sb_all_skip(cm, sbr * MAX_MIB_SIZE, sbc * MAX_MIB_SIZE)) continue;
|
||||
for (level = 0; level < 64; level++) {
|
||||
int cur_mse;
|
||||
int threshold;
|
||||
@@ -117,7 +118,6 @@ int av1_dering_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref,
|
||||
}
|
||||
}
|
||||
}
|
||||
#if DERING_REFINEMENT
|
||||
best_level = 0;
|
||||
/* Search for the best global level one value at a time. */
|
||||
for (global_level = 2; global_level < MAX_DERING_LEVEL; global_level++) {
|
||||
@@ -126,6 +126,7 @@ int av1_dering_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref,
|
||||
for (sbc = 0; sbc < nhsb; sbc++) {
|
||||
int gi;
|
||||
int best_mse = mse[nhsb * sbr + sbc][0];
|
||||
if (sb_all_skip(cm, sbr * MAX_MIB_SIZE, sbc * MAX_MIB_SIZE)) continue;
|
||||
for (gi = 1; gi < 4; gi++) {
|
||||
level = compute_level_from_index(global_level, gi);
|
||||
if (mse[nhsb * sbr + sbc][level] < best_mse) {
|
||||
@@ -145,6 +146,7 @@ int av1_dering_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref,
|
||||
int gi;
|
||||
int best_gi;
|
||||
int best_mse = mse[nhsb * sbr + sbc][0];
|
||||
if (sb_all_skip(cm, sbr * MAX_MIB_SIZE, sbc * MAX_MIB_SIZE)) continue;
|
||||
best_gi = 0;
|
||||
for (gi = 1; gi < DERING_REFINEMENT_LEVELS; gi++) {
|
||||
level = compute_level_from_index(best_level, gi);
|
||||
@@ -158,12 +160,6 @@ int av1_dering_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref,
|
||||
->mbmi.dering_gain = best_gi;
|
||||
}
|
||||
}
|
||||
#else
|
||||
best_level = 0;
|
||||
for (level = 0; level < MAX_DERING_LEVEL; level++) {
|
||||
if (tot_mse[level] < tot_mse[best_level]) best_level = level;
|
||||
}
|
||||
#endif
|
||||
aom_free(src);
|
||||
aom_free(ref_coeff);
|
||||
aom_free(bskip);
|
||||
|
Reference in New Issue
Block a user