Merge "De-sparsifying the deringing output buffer" into nextgenv2

This commit is contained in:
Yaowu Xu
2016-11-02 22:35:04 +00:00
committed by Gerrit Code Review
4 changed files with 20 additions and 17 deletions

View File

@@ -90,7 +90,7 @@ static INLINE void copy_4x4_16_8bit(uint8_t *dst, int dstride, int16_t *src, int
} }
/* TODO: Optimize this function for SSE. */ /* TODO: Optimize this function for SSE. */
void copy_blocks_16_8bit(uint8_t *dst, int dstride, int16_t *src, int sstride, void copy_blocks_16_8bit(uint8_t *dst, int dstride, int16_t *src,
unsigned char (*bskip)[2], int dering_count, int bsize) unsigned char (*bskip)[2], int dering_count, int bsize)
{ {
int bi, bx, by; int bi, bx, by;
@@ -100,7 +100,7 @@ void copy_blocks_16_8bit(uint8_t *dst, int dstride, int16_t *src, int sstride,
bx = bskip[bi][1]; bx = bskip[bi][1];
copy_8x8_16_8bit(&dst[(by << 3) * dstride + (bx << 3)], copy_8x8_16_8bit(&dst[(by << 3) * dstride + (bx << 3)],
dstride, dstride,
&src[(by << 3) * sstride + (bx << 3)], sstride); &src[bi << 2*bsize], 1 << bsize);
} }
} else { } else {
for (bi = 0; bi < dering_count; bi++) { for (bi = 0; bi < dering_count; bi++) {
@@ -108,7 +108,7 @@ void copy_blocks_16_8bit(uint8_t *dst, int dstride, int16_t *src, int sstride,
bx = bskip[bi][1]; bx = bskip[bi][1];
copy_4x4_16_8bit(&dst[(by << 2) * dstride + (bx << 2)], copy_4x4_16_8bit(&dst[(by << 2) * dstride + (bx << 2)],
dstride, dstride,
&src[(by << 2) * sstride + (bx << 2)], sstride); &src[bi << 2*bsize], 1 << bsize);
} }
} }
} }
@@ -182,7 +182,7 @@ void av1_dering_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
else else
threshold = level << coeff_shift; threshold = level << coeff_shift;
if (threshold == 0) continue; if (threshold == 0) continue;
od_dering(dst, MAX_MIB_SIZE * bsize[pli], od_dering(dst,
&src[pli][sbr * stride * bsize[pli] * MAX_MIB_SIZE + &src[pli][sbr * stride * bsize[pli] * MAX_MIB_SIZE +
sbc * bsize[pli] * MAX_MIB_SIZE], sbc * bsize[pli] * MAX_MIB_SIZE],
stride, nhb, nvb, sbc, sbr, nhsb, nvsb, dec[pli], dir, pli, stride, nhb, nvb, sbc, sbr, nhsb, nvsb, dec[pli], dir, pli,
@@ -194,7 +194,7 @@ void av1_dering_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
xd->plane[pli].dst.buf)[xd->plane[pli].dst.stride * xd->plane[pli].dst.buf)[xd->plane[pli].dst.stride *
(bsize[pli] * MAX_MIB_SIZE * sbr) + (bsize[pli] * MAX_MIB_SIZE * sbr) +
sbc * bsize[pli] * MAX_MIB_SIZE], sbc * bsize[pli] * MAX_MIB_SIZE],
xd->plane[pli].dst.stride, dst, MAX_MIB_SIZE * bsize[pli], bskip, xd->plane[pli].dst.stride, dst, bskip,
dering_count, 3 - dec[pli]); dering_count, 3 - dec[pli]);
} else { } else {
#endif #endif
@@ -202,7 +202,7 @@ void av1_dering_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
&xd->plane[pli].dst.buf[xd->plane[pli].dst.stride * &xd->plane[pli].dst.buf[xd->plane[pli].dst.stride *
(bsize[pli] * MAX_MIB_SIZE * sbr) + (bsize[pli] * MAX_MIB_SIZE * sbr) +
sbc * bsize[pli] * MAX_MIB_SIZE], sbc * bsize[pli] * MAX_MIB_SIZE],
xd->plane[pli].dst.stride, dst, MAX_MIB_SIZE * bsize[pli], bskip, xd->plane[pli].dst.stride, dst, bskip,
dering_count, 3 - dec[pli]); dering_count, 3 - dec[pli]);
#if CONFIG_AOM_HIGHBITDEPTH #if CONFIG_AOM_HIGHBITDEPTH
} }

View File

@@ -277,7 +277,7 @@ static INLINE void copy_4x4_16bit(int16_t *dst, int dstride, int16_t *src, int s
} }
/* TODO: Optimize this function for SSE. */ /* TODO: Optimize this function for SSE. */
void copy_blocks_16bit(int16_t *dst, int dstride, int16_t *src, int sstride, void copy_blocks_16bit(int16_t *dst, int dstride, int16_t *src,
unsigned char (*bskip)[2], int dering_count, int bsize) unsigned char (*bskip)[2], int dering_count, int bsize)
{ {
int bi, bx, by; int bi, bx, by;
@@ -287,7 +287,7 @@ void copy_blocks_16bit(int16_t *dst, int dstride, int16_t *src, int sstride,
bx = bskip[bi][1]; bx = bskip[bi][1];
copy_8x8_16bit(&dst[(by << 3) * dstride + (bx << 3)], copy_8x8_16bit(&dst[(by << 3) * dstride + (bx << 3)],
dstride, dstride,
&src[(by << 3) * sstride + (bx << 3)], sstride); &src[bi << 2*bsize], 1 << bsize);
} }
} else { } else {
for (bi = 0; bi < dering_count; bi++) { for (bi = 0; bi < dering_count; bi++) {
@@ -295,12 +295,12 @@ void copy_blocks_16bit(int16_t *dst, int dstride, int16_t *src, int sstride,
bx = bskip[bi][1]; bx = bskip[bi][1];
copy_4x4_16bit(&dst[(by << 2) * dstride + (bx << 2)], copy_4x4_16bit(&dst[(by << 2) * dstride + (bx << 2)],
dstride, dstride,
&src[(by << 2) * sstride + (bx << 2)], sstride); &src[bi << 2*bsize], 1 << bsize);
} }
} }
} }
void od_dering(int16_t *y, int ystride, const od_dering_in *x, int xstride, void od_dering(int16_t *y, const od_dering_in *x, int xstride,
int nhb, int nvb, int sbx, int sby, int nhsb, int nvsb, int xdec, int nhb, int nvb, int sbx, int sby, int nhsb, int nvsb, int xdec,
int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS], int pli, int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS], int pli,
unsigned char (*bskip)[2], int dering_count, int threshold, unsigned char (*bskip)[2], int dering_count, int threshold,
@@ -349,7 +349,7 @@ void od_dering(int16_t *y, int ystride, const od_dering_in *x, int xstride,
since the ringing there tends to be directional, so it doesn't since the ringing there tends to be directional, so it doesn't
get removed by the directional filtering. */ get removed by the directional filtering. */
filter2_thresh[by][bx] = (filter_dering_direction[bsize - OD_LOG_BSIZE0])( filter2_thresh[by][bx] = (filter_dering_direction[bsize - OD_LOG_BSIZE0])(
&y[(by * ystride << bsize) + (bx << bsize)], ystride, &y[bi << 2*bsize], 1 << bsize,
&in[(by * OD_FILT_BSTRIDE << bsize) + (bx << bsize)], &in[(by * OD_FILT_BSTRIDE << bsize) + (bx << bsize)],
od_adjust_thresh(threshold, var), dir[by][bx]); od_adjust_thresh(threshold, var), dir[by][bx]);
} }
@@ -358,19 +358,19 @@ void od_dering(int16_t *y, int ystride, const od_dering_in *x, int xstride,
by = bskip[bi][0]; by = bskip[bi][0];
bx = bskip[bi][1]; bx = bskip[bi][1];
filter2_thresh[by][bx] = (filter_dering_direction[bsize - OD_LOG_BSIZE0])( filter2_thresh[by][bx] = (filter_dering_direction[bsize - OD_LOG_BSIZE0])(
&y[(by * ystride << bsize) + (bx << bsize)], ystride, &y[bi << 2*bsize], 1 << bsize,
&in[(by * OD_FILT_BSTRIDE << bsize) + (bx << bsize)], threshold, &in[(by * OD_FILT_BSTRIDE << bsize) + (bx << bsize)], threshold,
dir[by][bx]); dir[by][bx]);
} }
} }
copy_blocks_16bit(in, OD_FILT_BSTRIDE, y, ystride, bskip, dering_count, copy_blocks_16bit(in, OD_FILT_BSTRIDE, y, bskip, dering_count,
bsize); bsize);
for (bi = 0; bi < dering_count; bi++) { for (bi = 0; bi < dering_count; bi++) {
by = bskip[bi][0]; by = bskip[bi][0];
bx = bskip[bi][1]; bx = bskip[bi][1];
if (filter2_thresh[by][bx] == 0) continue; if (filter2_thresh[by][bx] == 0) continue;
(filter_dering_orthogonal[bsize - OD_LOG_BSIZE0])( (filter_dering_orthogonal[bsize - OD_LOG_BSIZE0])(
&y[(by * ystride << bsize) + (bx << bsize)], ystride, &y[bi << 2*bsize], 1 << bsize,
&in[(by * OD_FILT_BSTRIDE << bsize) + (bx << bsize)], filter2_thresh[by][bx], &in[(by * OD_FILT_BSTRIDE << bsize) + (bx << bsize)], filter2_thresh[by][bx],
dir[by][bx]); dir[by][bx]);
} }

View File

@@ -36,10 +36,10 @@ typedef int (*od_filter_dering_direction_func)(int16_t *y, int ystride,
typedef void (*od_filter_dering_orthogonal_func)(int16_t *y, int ystride, typedef void (*od_filter_dering_orthogonal_func)(int16_t *y, int ystride,
const int16_t *in, const int16_t *in,
int threshold, int dir); int threshold, int dir);
void copy_blocks_16bit(int16_t *dst, int dstride, int16_t *src, int sstride, void copy_blocks_16bit(int16_t *dst, int dstride, int16_t *src,
unsigned char (*bskip)[2], int dering_count, int bsize); unsigned char (*bskip)[2], int dering_count, int bsize);
void od_dering(int16_t *y, int ystride, const od_dering_in *x, int xstride, void od_dering(int16_t *y, const od_dering_in *x, int xstride,
int nvb, int nhb, int sbx, int sby, int nhsb, int nvsb, int xdec, int nvb, int nhb, int sbx, int sby, int nhsb, int nvsb, int xdec,
int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS], int pli, int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS], int pli,
unsigned char (*bskip)[2], int skip_stride, int threshold, unsigned char (*bskip)[2], int skip_stride, int threshold,

View File

@@ -98,6 +98,7 @@ int av1_dering_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref,
int best_gi; int best_gi;
int32_t best_mse = INT32_MAX; int32_t best_mse = INT32_MAX;
int16_t dst[MAX_MIB_SIZE * MAX_MIB_SIZE * 8 * 8]; int16_t dst[MAX_MIB_SIZE * MAX_MIB_SIZE * 8 * 8];
int16_t tmp_dst[MAX_MIB_SIZE * MAX_MIB_SIZE * 8 * 8];
nhb = AOMMIN(MAX_MIB_SIZE, cm->mi_cols - MAX_MIB_SIZE * sbc); nhb = AOMMIN(MAX_MIB_SIZE, cm->mi_cols - MAX_MIB_SIZE * sbc);
nvb = AOMMIN(MAX_MIB_SIZE, cm->mi_rows - MAX_MIB_SIZE * sbr); nvb = AOMMIN(MAX_MIB_SIZE, cm->mi_rows - MAX_MIB_SIZE * sbr);
if (sb_all_skip_out(cm, sbr * MAX_MIB_SIZE, sbc * MAX_MIB_SIZE, bskip, &dering_count)) if (sb_all_skip_out(cm, sbr * MAX_MIB_SIZE, sbc * MAX_MIB_SIZE, bskip, &dering_count))
@@ -115,13 +116,15 @@ int av1_dering_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref,
sbc * bsize[0] * MAX_MIB_SIZE + c]; sbc * bsize[0] * MAX_MIB_SIZE + c];
} }
} }
od_dering(dst, MAX_MIB_SIZE * bsize[0], od_dering(tmp_dst,
&src[sbr * stride * bsize[0] * MAX_MIB_SIZE + &src[sbr * stride * bsize[0] * MAX_MIB_SIZE +
sbc * bsize[0] * MAX_MIB_SIZE], sbc * bsize[0] * MAX_MIB_SIZE],
cm->mi_cols * bsize[0], nhb, nvb, sbc, sbr, nhsb, nvsb, 0, cm->mi_cols * bsize[0], nhb, nvb, sbc, sbr, nhsb, nvsb, 0,
dir, 0, dir, 0,
bskip, bskip,
dering_count, threshold, coeff_shift); dering_count, threshold, coeff_shift);
copy_blocks_16bit(dst, MAX_MIB_SIZE * bsize[0], tmp_dst, bskip,
dering_count, 3);
cur_mse = (int)compute_dist( cur_mse = (int)compute_dist(
dst, MAX_MIB_SIZE * bsize[0], dst, MAX_MIB_SIZE * bsize[0],
&ref_coeff[sbr * stride * bsize[0] * MAX_MIB_SIZE + &ref_coeff[sbr * stride * bsize[0] * MAX_MIB_SIZE +