Merge changes Ic3a68557,Ib1dbe41a,I0da09270,Ibdbd720d into nextgenv2
* changes:
  - Deringing cleanup: remove DERING_REFINEMENT (always on now)
  - Don't run the deringing filter on skipped blocks within a superblock
  - Don't dering skipped superblocks
  - On x86 use _mm_set_epi32 when _mm_cvtsi64_si128 isn't available
This commit is contained in:
@@ -162,7 +162,11 @@ SIMD_INLINE v128 v128_unziphi_8(v128 a, v128 b) {
|
||||
|
||||
SIMD_INLINE v128 v128_unziplo_8(v128 a, v128 b) {
|
||||
#if defined(__SSSE3__)
|
||||
#ifdef __x86_64__
|
||||
v128 order = _mm_cvtsi64_si128(0x0e0c0a0806040200LL);
|
||||
#else
|
||||
v128 order = _mm_set_epi32(0, 0, 0x0e0c0a08, 0x06040200);
|
||||
#endif
|
||||
return _mm_unpacklo_epi64(_mm_shuffle_epi8(b, order),
|
||||
_mm_shuffle_epi8(a, order));
|
||||
#else
|
||||
@@ -176,7 +180,11 @@ SIMD_INLINE v128 v128_unziphi_16(v128 a, v128 b) {
|
||||
|
||||
SIMD_INLINE v128 v128_unziplo_16(v128 a, v128 b) {
|
||||
#if defined(__SSSE3__)
|
||||
#ifdef __x86_64__
|
||||
v128 order = _mm_cvtsi64_si128(0x0d0c090805040100LL);
|
||||
#else
|
||||
v128 order = _mm_set_epi32(0, 0, 0x0d0c0908, 0x05040100);
|
||||
#endif
|
||||
return _mm_unpacklo_epi64(_mm_shuffle_epi8(b, order),
|
||||
_mm_shuffle_epi8(a, order));
|
||||
#else
|
||||
|
@@ -47,7 +47,11 @@ SIMD_INLINE v64 v64_from_32(uint32_t x, uint32_t y) {
|
||||
}
|
||||
|
||||
/* Builds a 64-bit SIMD vector (v64) from a scalar uint64_t.
   On x86-64 the 64-bit GPR->XMM move intrinsic is used directly; on 32-bit
   x86 targets _mm_cvtsi64_si128 is unavailable, so the value is assembled
   from its two 32-bit halves instead (this is the point of the commit). */
SIMD_INLINE v64 v64_from_64(uint64_t x) {
|
||||
#ifdef __x86_64__
|
||||
return _mm_cvtsi64_si128(x);
|
||||
#else
|
||||
/* Low half in lane 0, high half in lane 1; upper lanes zeroed,
   matching the zero-extension behavior of _mm_cvtsi64_si128. */
return _mm_set_epi32(0, 0, x >> 32, (uint32_t)x);
|
||||
#endif
|
||||
}
|
||||
|
||||
SIMD_INLINE uint64_t v64_u64(v64 x) {
|
||||
@@ -168,7 +172,7 @@ SIMD_INLINE v64 v64_pack_s16_s8(v64 a, v64 b) {
|
||||
SIMD_INLINE v64 v64_unziphi_8(v64 a, v64 b) {
|
||||
#if defined(__SSSE3__)
|
||||
return _mm_shuffle_epi8(_mm_unpacklo_epi64(b, a),
|
||||
_mm_cvtsi64_si128(0x0f0d0b0907050301LL));
|
||||
v64_from_64(0x0f0d0b0907050301LL));
|
||||
#else
|
||||
return _mm_packus_epi16(
|
||||
_mm_unpacklo_epi64(_mm_srli_epi16(b, 8), _mm_srli_epi16(a, 8)),
|
||||
@@ -179,7 +183,7 @@ SIMD_INLINE v64 v64_unziphi_8(v64 a, v64 b) {
|
||||
SIMD_INLINE v64 v64_unziplo_8(v64 a, v64 b) {
|
||||
#if defined(__SSSE3__)
|
||||
return _mm_shuffle_epi8(_mm_unpacklo_epi64(b, a),
|
||||
_mm_cvtsi64_si128(0x0e0c0a0806040200LL));
|
||||
v64_from_64(0x0e0c0a0806040200LL));
|
||||
#else
|
||||
return v64_unziphi_8(_mm_slli_si128(a, 1), _mm_slli_si128(b, 1));
|
||||
#endif
|
||||
@@ -188,7 +192,7 @@ SIMD_INLINE v64 v64_unziplo_8(v64 a, v64 b) {
|
||||
SIMD_INLINE v64 v64_unziphi_16(v64 a, v64 b) {
|
||||
#if defined(__SSSE3__)
|
||||
return _mm_shuffle_epi8(_mm_unpacklo_epi64(b, a),
|
||||
_mm_cvtsi64_si128(0x0f0e0b0a07060302LL));
|
||||
v64_from_64(0x0f0e0b0a07060302LL));
|
||||
#else
|
||||
return _mm_packs_epi32(
|
||||
_mm_unpacklo_epi64(_mm_srai_epi32(b, 16), _mm_srai_epi32(a, 16)),
|
||||
@@ -199,7 +203,7 @@ SIMD_INLINE v64 v64_unziphi_16(v64 a, v64 b) {
|
||||
SIMD_INLINE v64 v64_unziplo_16(v64 a, v64 b) {
|
||||
#if defined(__SSSE3__)
|
||||
return _mm_shuffle_epi8(_mm_unpacklo_epi64(b, a),
|
||||
_mm_cvtsi64_si128(0x0d0c090805040100LL));
|
||||
v64_from_64(0x0d0c090805040100LL));
|
||||
#else
|
||||
return v64_unziphi_16(_mm_slli_si128(a, 2), _mm_slli_si128(b, 2));
|
||||
#endif
|
||||
|
@@ -101,19 +101,15 @@ void av1_dering_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
|
||||
for (pli = 0; pli < 3; pli++) {
|
||||
int16_t dst[MAX_MIB_SIZE * MAX_MIB_SIZE * 8 * 8];
|
||||
int threshold;
|
||||
#if DERING_REFINEMENT
|
||||
level = compute_level_from_index(
|
||||
global_level,
|
||||
cm->mi_grid_visible[MAX_MIB_SIZE * sbr * cm->mi_stride +
|
||||
MAX_MIB_SIZE * sbc]
|
||||
->mbmi.dering_gain);
|
||||
#else
|
||||
level = global_level;
|
||||
#endif
|
||||
/* FIXME: This is a temporary hack that uses more conservative
|
||||
deringing for chroma. */
|
||||
if (pli) level = (level * 5 + 4) >> 3;
|
||||
if (sb_all_skip(cm, sbr * MAX_MIB_SIZE, sbc * MAX_MIB_SIZE)) level = 0;
|
||||
if (sb_all_skip(cm, sbr * MAX_MIB_SIZE, sbc * MAX_MIB_SIZE)) continue;
|
||||
threshold = level << coeff_shift;
|
||||
od_dering(&OD_DERING_VTBL_C, dst, MAX_MIB_SIZE * bsize[pli],
|
||||
&src[pli][sbr * stride * bsize[pli] * MAX_MIB_SIZE +
|
||||
|
@@ -24,7 +24,6 @@ extern "C" {
|
||||
#define DERING_LEVEL_BITS 6
|
||||
#define MAX_DERING_LEVEL (1 << DERING_LEVEL_BITS)
|
||||
|
||||
#define DERING_REFINEMENT 1
|
||||
#define DERING_REFINEMENT_BITS 2
|
||||
#define DERING_REFINEMENT_LEVELS 4
|
||||
|
||||
|
@@ -275,6 +275,13 @@ void od_dering(const od_dering_opt_vtbl *vtbl, int16_t *y, int ystride,
|
||||
in[i * OD_FILT_BSTRIDE + j] = x[i * xstride + j];
|
||||
}
|
||||
}
|
||||
/* Assume deringing filter is sparsely applied, so do one large copy rather
|
||||
than small copies later if deringing is skipped. */
|
||||
for (i = 0; i < nvb << bsize; i++) {
|
||||
for (j = 0; j < nhb << bsize; j++) {
|
||||
y[i * ystride + j] = in[i * OD_FILT_BSTRIDE + j];
|
||||
}
|
||||
}
|
||||
if (pli == 0) {
|
||||
for (by = 0; by < nvb; by++) {
|
||||
for (bx = 0; bx < nhb; bx++) {
|
||||
@@ -325,6 +332,7 @@ void od_dering(const od_dering_opt_vtbl *vtbl, int16_t *y, int ystride,
|
||||
}
|
||||
for (by = 0; by < nvb; by++) {
|
||||
for (bx = 0; bx < nhb; bx++) {
|
||||
if (thresh[by][bx] == 0) continue;
|
||||
(vtbl->filter_dering_direction[bsize - OD_LOG_BSIZE0])(
|
||||
&y[(by * ystride << bsize) + (bx << bsize)], ystride,
|
||||
&in[(by * OD_FILT_BSTRIDE << bsize) + (bx << bsize)], thresh[by][bx],
|
||||
@@ -338,6 +346,7 @@ void od_dering(const od_dering_opt_vtbl *vtbl, int16_t *y, int ystride,
|
||||
}
|
||||
for (by = 0; by < nvb; by++) {
|
||||
for (bx = 0; bx < nhb; bx++) {
|
||||
if (thresh[by][bx] == 0) continue;
|
||||
(vtbl->filter_dering_orthogonal[bsize - OD_LOG_BSIZE0])(
|
||||
&y[(by * ystride << bsize) + (bx << bsize)], ystride,
|
||||
&in[(by * OD_FILT_BSTRIDE << bsize) + (bx << bsize)],
|
||||
|
@@ -1772,7 +1772,7 @@ static void decode_partition(AV1Decoder *const pbi, MACROBLOCKD *const xd,
|
||||
if (bsize >= BLOCK_8X8 &&
|
||||
(bsize == BLOCK_8X8 || partition != PARTITION_SPLIT))
|
||||
dec_update_partition_context(xd, mi_row, mi_col, subsize, num_8x8_wh);
|
||||
#if DERING_REFINEMENT
|
||||
#if CONFIG_DERING
|
||||
if (bsize == BLOCK_64X64) {
|
||||
if (cm->dering_level != 0 && !sb_all_skip(cm, mi_row, mi_col)) {
|
||||
cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col]->mbmi.dering_gain =
|
||||
@@ -1782,7 +1782,7 @@ static void decode_partition(AV1Decoder *const pbi, MACROBLOCKD *const xd,
|
||||
0;
|
||||
}
|
||||
}
|
||||
#endif // DERING_REFINEMENT
|
||||
#endif
|
||||
#endif // CONFIG_EXT_PARTITION_TYPES
|
||||
}
|
||||
|
||||
|
@@ -1869,7 +1869,7 @@ static void write_modes_sb(AV1_COMP *const cpi, const TileInfo *const tile,
|
||||
(bsize == BLOCK_8X8 || partition != PARTITION_SPLIT))
|
||||
update_partition_context(xd, mi_row, mi_col, subsize, bsize);
|
||||
|
||||
#if DERING_REFINEMENT
|
||||
#if CONFIG_DERING
|
||||
if (bsize == BLOCK_64X64 && cm->dering_level != 0 &&
|
||||
!sb_all_skip(cm, mi_row, mi_col)) {
|
||||
aom_write_literal(
|
||||
|
@@ -96,6 +96,7 @@ int av1_dering_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref,
|
||||
int16_t dst[MAX_MIB_SIZE * MAX_MIB_SIZE * 8 * 8];
|
||||
nhb = AOMMIN(MAX_MIB_SIZE, cm->mi_cols - MAX_MIB_SIZE * sbc);
|
||||
nvb = AOMMIN(MAX_MIB_SIZE, cm->mi_rows - MAX_MIB_SIZE * sbr);
|
||||
if (sb_all_skip(cm, sbr * MAX_MIB_SIZE, sbc * MAX_MIB_SIZE)) continue;
|
||||
for (level = 0; level < 64; level++) {
|
||||
int cur_mse;
|
||||
int threshold;
|
||||
@@ -117,7 +118,6 @@ int av1_dering_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref,
|
||||
}
|
||||
}
|
||||
}
|
||||
#if DERING_REFINEMENT
|
||||
best_level = 0;
|
||||
/* Search for the best global level one value at a time. */
|
||||
for (global_level = 2; global_level < MAX_DERING_LEVEL; global_level++) {
|
||||
@@ -126,6 +126,7 @@ int av1_dering_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref,
|
||||
for (sbc = 0; sbc < nhsb; sbc++) {
|
||||
int gi;
|
||||
int best_mse = mse[nhsb * sbr + sbc][0];
|
||||
if (sb_all_skip(cm, sbr * MAX_MIB_SIZE, sbc * MAX_MIB_SIZE)) continue;
|
||||
for (gi = 1; gi < 4; gi++) {
|
||||
level = compute_level_from_index(global_level, gi);
|
||||
if (mse[nhsb * sbr + sbc][level] < best_mse) {
|
||||
@@ -145,6 +146,7 @@ int av1_dering_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref,
|
||||
int gi;
|
||||
int best_gi;
|
||||
int best_mse = mse[nhsb * sbr + sbc][0];
|
||||
if (sb_all_skip(cm, sbr * MAX_MIB_SIZE, sbc * MAX_MIB_SIZE)) continue;
|
||||
best_gi = 0;
|
||||
for (gi = 1; gi < DERING_REFINEMENT_LEVELS; gi++) {
|
||||
level = compute_level_from_index(best_level, gi);
|
||||
@@ -158,12 +160,6 @@ int av1_dering_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref,
|
||||
->mbmi.dering_gain = best_gi;
|
||||
}
|
||||
}
|
||||
#else
|
||||
best_level = 0;
|
||||
for (level = 0; level < MAX_DERING_LEVEL; level++) {
|
||||
if (tot_mse[level] < tot_mse[best_level]) best_level = level;
|
||||
}
|
||||
#endif
|
||||
aom_free(src);
|
||||
aom_free(ref_coeff);
|
||||
aom_free(bskip);
|
||||
|
Reference in New Issue
Block a user