avcodec/hevc: reduce memory for SAO

cherry picked from commit 5d9f79edef2c11b915bdac3a025b59a32082f409

SAO edge filter uses pre-SAO pixel data on the left and top of the CTB, so
this data must be kept available. This was done previously by having 2
copies of the frame, one before and one after SAO.

This commit reduces the storage to just the border rows and columns of each
CTB, instead of the previous whole frame.

Commit message taken from patch by Christophe Gisquet <christophe.gisquet@gmail.com>

Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
This commit is contained in:
Fabrice Bellard 2015-01-12 23:09:23 +01:00 committed by Michael Niedermayer
parent b737a2c528
commit da81cc38e8
3 changed files with 224 additions and 14 deletions

View File

@ -104,7 +104,8 @@ static int pic_arrays_init(HEVCContext *s, const HEVCSPS *sps)
s->cbf_luma = av_malloc_array(sps->min_tb_width, sps->min_tb_height); s->cbf_luma = av_malloc_array(sps->min_tb_width, sps->min_tb_height);
s->tab_ipm = av_mallocz(min_pu_size); s->tab_ipm = av_mallocz(min_pu_size);
s->is_pcm = av_malloc_array(sps->min_pu_width + 1, sps->min_pu_height + 1); s->is_pcm = av_mallocz_array(sps->min_pu_width + 1, sps->min_pu_height + 1);
if (!s->tab_ipm || !s->cbf_luma || !s->is_pcm) if (!s->tab_ipm || !s->cbf_luma || !s->is_pcm)
goto fail; goto fail;
@ -353,9 +354,34 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps)
ff_videodsp_init (&s->vdsp, sps->bit_depth); ff_videodsp_init (&s->vdsp, sps->bit_depth);
if (sps->sao_enabled && !s->avctx->hwaccel) { if (sps->sao_enabled && !s->avctx->hwaccel) {
#ifdef USE_SAO_SMALL_BUFFER
{
int ctb_size = 1 << sps->log2_ctb_size;
int c_count = (sps->chroma_format_idc != 0) ? 3 : 1;
int c_idx, i;
for (i = 0; i < s->threads_number ; i++) {
HEVCLocalContext *lc = s->HEVClcList[i];
lc->sao_pixel_buffer =
av_malloc(((ctb_size + 2) * (ctb_size + 2)) <<
sps->pixel_shift);
}
for(c_idx = 0; c_idx < c_count; c_idx++) {
int w = sps->width >> sps->hshift[c_idx];
int h = sps->height >> sps->vshift[c_idx];
s->sao_pixel_buffer_h[c_idx] =
av_malloc((w * 2 * sps->ctb_height) <<
sps->pixel_shift);
s->sao_pixel_buffer_v[c_idx] =
av_malloc((h * 2 * sps->ctb_width) <<
sps->pixel_shift);
}
}
#else
av_frame_unref(s->tmp_frame); av_frame_unref(s->tmp_frame);
ret = get_buffer_sao(s, s->tmp_frame, sps); ret = get_buffer_sao(s, s->tmp_frame, sps);
s->sao_frame = s->tmp_frame; s->sao_frame = s->tmp_frame;
#endif
} }
s->sps = sps; s->sps = sps;
@ -3186,7 +3212,17 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx)
av_freep(&s->cabac_state); av_freep(&s->cabac_state);
#ifdef USE_SAO_SMALL_BUFFER
for (i = 0; i < s->threads_number; i++) {
av_freep(&s->HEVClcList[i]->sao_pixel_buffer);
}
for (i = 0; i < 3; i++) {
av_freep(&s->sao_pixel_buffer_h[i]);
av_freep(&s->sao_pixel_buffer_v[i]);
}
#else
av_frame_free(&s->tmp_frame); av_frame_free(&s->tmp_frame);
#endif
av_frame_free(&s->output_frame); av_frame_free(&s->output_frame);
for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
@ -3246,9 +3282,11 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
if (!s->cabac_state) if (!s->cabac_state)
goto fail; goto fail;
#ifndef USE_SAO_SMALL_BUFFER
s->tmp_frame = av_frame_alloc(); s->tmp_frame = av_frame_alloc();
if (!s->tmp_frame) if (!s->tmp_frame)
goto fail; goto fail;
#endif
s->output_frame = av_frame_alloc(); s->output_frame = av_frame_alloc();
if (!s->output_frame) if (!s->output_frame)

View File

@ -36,6 +36,8 @@
#include "thread.h" #include "thread.h"
#include "videodsp.h" #include "videodsp.h"
//#define USE_SAO_SMALL_BUFFER /* reduce the memory used by SAO */
#define MAX_DPB_SIZE 16 // A.4.1 #define MAX_DPB_SIZE 16 // A.4.1
#define MAX_REFS 16 #define MAX_REFS 16
@ -745,6 +747,9 @@ typedef struct HEVCNAL {
} HEVCNAL; } HEVCNAL;
typedef struct HEVCLocalContext { typedef struct HEVCLocalContext {
#ifdef USE_SAO_SMALL_BUFFER
uint8_t *sao_pixel_buffer;
#endif
uint8_t cabac_state[HEVC_CONTEXTS]; uint8_t cabac_state[HEVC_CONTEXTS];
uint8_t stat_coeff[4]; uint8_t stat_coeff[4];
@ -807,9 +812,14 @@ typedef struct HEVCContext {
uint8_t slice_initialized; uint8_t slice_initialized;
AVFrame *frame; AVFrame *frame;
AVFrame *sao_frame;
AVFrame *tmp_frame;
AVFrame *output_frame; AVFrame *output_frame;
#ifdef USE_SAO_SMALL_BUFFER
uint8_t *sao_pixel_buffer_h[3];
uint8_t *sao_pixel_buffer_v[3];
#else
AVFrame *tmp_frame;
AVFrame *sao_frame;
#endif
const HEVCVPS *vps; const HEVCVPS *vps;
const HEVCSPS *sps; const HEVCSPS *sps;

View File

@ -139,7 +139,7 @@ static int get_qPy(HEVCContext *s, int xC, int yC)
return s->qp_y_tab[x + y * s->sps->min_cb_width]; return s->qp_y_tab[x + y * s->sps->min_cb_width];
} }
static void copy_CTB(uint8_t *dst, uint8_t *src, int width, int height, static void copy_CTB(uint8_t *dst, const uint8_t *src, int width, int height,
intptr_t stride_dst, intptr_t stride_src) intptr_t stride_dst, intptr_t stride_src)
{ {
int i, j; int i, j;
@ -161,13 +161,65 @@ int i, j;
} }
} }
static void restore_tqb_pixels(HEVCContext *s, int x0, int y0, int width, int height, int c_idx) #if defined(USE_SAO_SMALL_BUFFER)
/**
 * Copy a single pixel from src to dst.
 *
 * @param dst         destination of the pixel
 * @param src         source of the pixel
 * @param pixel_shift 0 for one-byte pixels, non-zero for two-byte pixels
 */
static void copy_pixel(uint8_t *dst, const uint8_t *src, int pixel_shift)
{
    /* A two-byte memcpy instead of a uint16_t cast: src keeps its const
     * qualifier and the copy is valid for any alignment. */
    if (pixel_shift)
        memcpy(dst, src, sizeof(uint16_t));
    else
        *dst = *src;
}
/**
 * Copy `height` pixels, one per step, advancing dst and src by their own
 * byte strides after each pixel.
 *
 * @param dst         first destination pixel
 * @param src         first source pixel
 * @param pixel_shift 0 for one-byte pixels, non-zero for two-byte pixels
 * @param height      number of pixels to copy; nothing is copied if <= 0
 * @param stride_dst  byte distance between consecutive destination pixels
 * @param stride_src  byte distance between consecutive source pixels
 */
static void copy_vert(uint8_t *dst, const uint8_t *src,
                      int pixel_shift, int height,
                      int stride_dst, int stride_src)
{
    int i;

    if (pixel_shift == 0) {
        for (i = 0; i < height; i++) {
            *dst = *src;
            dst += stride_dst;
            src += stride_src;
        }
    } else {
        for (i = 0; i < height; i++) {
            /* A two-byte memcpy instead of a uint16_t cast: src keeps its
             * const qualifier and the copy is valid for any alignment. */
            memcpy(dst, src, sizeof(uint16_t));
            dst += stride_dst;
            src += stride_src;
        }
    }
}
/**
 * Save the border pixels of one block of plane c_idx into the SAO edge
 * buffers of the context.
 *
 * The first and last rows of the block are written to rows 2 * y_ctb and
 * 2 * y_ctb + 1 of s->sao_pixel_buffer_h[c_idx], starting at column x.
 * The first and last columns of the block are written to columns 2 * x_ctb
 * and 2 * x_ctb + 1 of s->sao_pixel_buffer_v[c_idx], starting at row y.
 *
 * @param s          decoder context that owns the edge buffers
 * @param src        top-left pixel of the block
 * @param stride_src byte stride between rows of src
 * @param x          horizontal position of the block in the plane, in pixels
 * @param y          vertical position of the block in the plane, in pixels
 * @param width      block width in pixels
 * @param height     block height in pixels
 * @param c_idx      plane index
 * @param x_ctb      horizontal CTB index of the block
 * @param y_ctb      vertical CTB index of the block
 */
static void copy_CTB_to_hv(HEVCContext *s, const uint8_t *src,
                           int stride_src, int x, int y, int width, int height,
                           int c_idx, int x_ctb, int y_ctb)
{
    const int shift      = s->sps->pixel_shift;
    const int plane_w    = s->sps->width  >> s->sps->hshift[c_idx];
    const int plane_h    = s->sps->height >> s->sps->vshift[c_idx];
    const int pixel_size = 1 << shift;
    const int row_bytes  = width << shift;
    uint8_t *rows        = s->sao_pixel_buffer_h[c_idx];
    uint8_t *cols        = s->sao_pixel_buffer_v[c_idx];
    const uint8_t *last_row = src + stride_src * (height - 1);
    const uint8_t *last_col = src + ((width - 1) << shift);

    /* top and bottom rows of the block */
    memcpy(rows + (((2 * y_ctb) * plane_w + x) << shift),
           src, row_bytes);
    memcpy(rows + (((2 * y_ctb + 1) * plane_w + x) << shift),
           last_row, row_bytes);

    /* left and right columns of the block, stored contiguously */
    copy_vert(cols + (((2 * x_ctb) * plane_h + y) << shift),
              src, shift, height, pixel_size, stride_src);
    copy_vert(cols + (((2 * x_ctb + 1) * plane_h + y) << shift),
              last_col, shift, height, pixel_size, stride_src);
}
#endif
static void restore_tqb_pixels(HEVCContext *s,
uint8_t *src1, const uint8_t *dst1,
ptrdiff_t stride_src, ptrdiff_t stride_dst,
int x0, int y0, int width, int height, int c_idx)
{ {
if ( s->pps->transquant_bypass_enable_flag || if ( s->pps->transquant_bypass_enable_flag ||
(s->sps->pcm.loop_filter_disable_flag && s->sps->pcm_enabled_flag)) { (s->sps->pcm.loop_filter_disable_flag && s->sps->pcm_enabled_flag)) {
int x, y; int x, y;
ptrdiff_t stride_dst = s->sao_frame->linesize[c_idx];
ptrdiff_t stride_src = s->frame->linesize[c_idx];
int min_pu_size = 1 << s->sps->log2_min_pu_size; int min_pu_size = 1 << s->sps->log2_min_pu_size;
int hshift = s->sps->hshift[c_idx]; int hshift = s->sps->hshift[c_idx];
int vshift = s->sps->vshift[c_idx]; int vshift = s->sps->vshift[c_idx];
@ -175,13 +227,13 @@ static void restore_tqb_pixels(HEVCContext *s, int x0, int y0, int width, int he
int y_min = ((y0 ) >> s->sps->log2_min_pu_size); int y_min = ((y0 ) >> s->sps->log2_min_pu_size);
int x_max = ((x0 + width ) >> s->sps->log2_min_pu_size); int x_max = ((x0 + width ) >> s->sps->log2_min_pu_size);
int y_max = ((y0 + height) >> s->sps->log2_min_pu_size); int y_max = ((y0 + height) >> s->sps->log2_min_pu_size);
int len = min_pu_size >> hshift; int len = (min_pu_size >> hshift) << s->sps->pixel_shift;
for (y = y_min; y < y_max; y++) { for (y = y_min; y < y_max; y++) {
for (x = x_min; x < x_max; x++) { for (x = x_min; x < x_max; x++) {
if (s->is_pcm[y * s->sps->min_pu_width + x]) { if (s->is_pcm[y * s->sps->min_pu_width + x]) {
int n; int n;
uint8_t *src = &s->frame->data[c_idx][ ((y << s->sps->log2_min_pu_size) >> vshift) * stride_src + (((x << s->sps->log2_min_pu_size) >> hshift) << s->sps->pixel_shift)]; uint8_t *src = src1 + (((y << s->sps->log2_min_pu_size) - y0) >> vshift) * stride_src + ((((x << s->sps->log2_min_pu_size) - x0) >> hshift) << s->sps->pixel_shift);
uint8_t *dst = &s->sao_frame->data[c_idx][((y << s->sps->log2_min_pu_size) >> vshift) * stride_dst + (((x << s->sps->log2_min_pu_size) >> hshift) << s->sps->pixel_shift)]; const uint8_t *dst = dst1 + (((y << s->sps->log2_min_pu_size) - y0) >> vshift) * stride_dst + ((((x << s->sps->log2_min_pu_size) - x0) >> hshift) << s->sps->pixel_shift);
for (n = 0; n < (min_pu_size >> vshift); n++) { for (n = 0; n < (min_pu_size >> vshift); n++) {
memcpy(src, dst, len); memcpy(src, dst, len);
src += stride_src; src += stride_src;
@ -198,6 +250,7 @@ static void restore_tqb_pixels(HEVCContext *s, int x0, int y0, int width, int he
static void sao_filter_CTB(HEVCContext *s, int x, int y) static void sao_filter_CTB(HEVCContext *s, int x, int y)
{ {
static const uint8_t band_tab[8] = { 0, 1, 2, 2, 3, 3, 4, 4 }; static const uint8_t band_tab[8] = { 0, 1, 2, 2, 3, 3, 4, 4 };
HEVCLocalContext *lc = s->HEVClc;
int c_idx; int c_idx;
int edges[4]; // 0 left 1 top 2 right 3 bottom int edges[4]; // 0 left 1 top 2 right 3 bottom
int x_ctb = x >> s->sps->log2_ctb_size; int x_ctb = x >> s->sps->log2_ctb_size;
@ -258,27 +311,132 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y)
int x0 = x >> s->sps->hshift[c_idx]; int x0 = x >> s->sps->hshift[c_idx];
int y0 = y >> s->sps->vshift[c_idx]; int y0 = y >> s->sps->vshift[c_idx];
int stride_src = s->frame->linesize[c_idx]; int stride_src = s->frame->linesize[c_idx];
int stride_dst = s->sao_frame->linesize[c_idx];
int ctb_size_h = (1 << (s->sps->log2_ctb_size)) >> s->sps->hshift[c_idx]; int ctb_size_h = (1 << (s->sps->log2_ctb_size)) >> s->sps->hshift[c_idx];
int ctb_size_v = (1 << (s->sps->log2_ctb_size)) >> s->sps->vshift[c_idx]; int ctb_size_v = (1 << (s->sps->log2_ctb_size)) >> s->sps->vshift[c_idx];
int width = FFMIN(ctb_size_h, (s->sps->width >> s->sps->hshift[c_idx]) - x0); int width = FFMIN(ctb_size_h, (s->sps->width >> s->sps->hshift[c_idx]) - x0);
int height = FFMIN(ctb_size_v, (s->sps->height >> s->sps->vshift[c_idx]) - y0); int height = FFMIN(ctb_size_v, (s->sps->height >> s->sps->vshift[c_idx]) - y0);
int tab = band_tab[(FFALIGN(width, 8) >> 3) - 1]; int tab = band_tab[(FFALIGN(width, 8) >> 3) - 1];
uint8_t *src = &s->frame->data[c_idx][y0 * stride_src + (x0 << s->sps->pixel_shift)]; uint8_t *src = &s->frame->data[c_idx][y0 * stride_src + (x0 << s->sps->pixel_shift)];
#if defined(USE_SAO_SMALL_BUFFER)
int stride_dst = ((1 << (s->sps->log2_ctb_size)) + 2) << s->sps->pixel_shift;
uint8_t *dst = lc->sao_pixel_buffer + (1 * stride_dst) + (1 << s->sps->pixel_shift);
#else
int stride_dst = s->sao_frame->linesize[c_idx];
uint8_t *dst = &s->sao_frame->data[c_idx][y0 * stride_dst + (x0 << s->sps->pixel_shift)]; uint8_t *dst = &s->sao_frame->data[c_idx][y0 * stride_dst + (x0 << s->sps->pixel_shift)];
#endif
switch (sao->type_idx[c_idx]) { switch (sao->type_idx[c_idx]) {
case SAO_BAND: case SAO_BAND:
copy_CTB(dst, src, width << s->sps->pixel_shift, height, stride_dst, stride_src); copy_CTB(dst, src, width << s->sps->pixel_shift, height, stride_dst, stride_src);
#if defined(USE_SAO_SMALL_BUFFER)
copy_CTB_to_hv(s, src, stride_src, x0, y0, width, height, c_idx,
x_ctb, y_ctb);
#endif
s->hevcdsp.sao_band_filter[tab](src, dst, stride_src, stride_dst, s->hevcdsp.sao_band_filter[tab](src, dst, stride_src, stride_dst,
sao->offset_val[c_idx], sao->band_position[c_idx], sao->offset_val[c_idx], sao->band_position[c_idx],
width, height); width, height);
restore_tqb_pixels(s, x, y, width, height, c_idx); restore_tqb_pixels(s, src, dst, stride_src, stride_dst,
x, y, width, height, c_idx);
sao->type_idx[c_idx] = SAO_APPLIED; sao->type_idx[c_idx] = SAO_APPLIED;
break; break;
case SAO_EDGE: case SAO_EDGE:
{ {
uint8_t left_pixels = !edges[0] && (CTB(s->sao, x_ctb-1, y_ctb).type_idx[c_idx] != SAO_APPLIED); #if defined(USE_SAO_SMALL_BUFFER)
int w = s->sps->width >> s->sps->hshift[c_idx];
int h = s->sps->height >> s->sps->vshift[c_idx];
int left_edge = edges[0];
int top_edge = edges[1];
int right_edge = edges[2];
int bottom_edge = edges[3];
int sh = s->sps->pixel_shift;
int left_pixels, right_pixels;
if (!top_edge) {
int left = 1 - left_edge;
int right = 1 - right_edge;
const uint8_t *src1[2];
uint8_t *dst1;
int src_idx, pos;
dst1 = dst - stride_dst - (left << sh);
src1[0] = src - stride_src - (left << sh);
src1[1] = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb - 1) * w + x0 - left) << sh);
pos = 0;
if (left) {
src_idx = (CTB(s->sao, x_ctb-1, y_ctb-1).type_idx[c_idx] ==
SAO_APPLIED);
copy_pixel(dst1, src1[src_idx], sh);
pos += (1 << sh);
}
src_idx = (CTB(s->sao, x_ctb, y_ctb-1).type_idx[c_idx] ==
SAO_APPLIED);
memcpy(dst1 + pos, src1[src_idx] + pos, width << sh);
if (right) {
pos += width << sh;
src_idx = (CTB(s->sao, x_ctb+1, y_ctb-1).type_idx[c_idx] ==
SAO_APPLIED);
copy_pixel(dst1 + pos, src1[src_idx] + pos, sh);
}
}
if (!bottom_edge) {
int left = 1 - left_edge;
int right = 1 - right_edge;
const uint8_t *src1[2];
uint8_t *dst1;
int src_idx, pos;
dst1 = dst + height * stride_dst - (left << sh);
src1[0] = src + height * stride_src - (left << sh);
src1[1] = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb + 2) * w + x0 - left) << sh);
pos = 0;
if (left) {
src_idx = (CTB(s->sao, x_ctb-1, y_ctb+1).type_idx[c_idx] ==
SAO_APPLIED);
copy_pixel(dst1, src1[src_idx], sh);
pos += (1 << sh);
}
src_idx = (CTB(s->sao, x_ctb, y_ctb+1).type_idx[c_idx] ==
SAO_APPLIED);
memcpy(dst1 + pos, src1[src_idx] + pos, width << sh);
if (right) {
pos += width << sh;
src_idx = (CTB(s->sao, x_ctb+1, y_ctb+1).type_idx[c_idx] ==
SAO_APPLIED);
copy_pixel(dst1 + pos, src1[src_idx] + pos, sh);
}
}
left_pixels = 0;
if (!left_edge) {
if (CTB(s->sao, x_ctb-1, y_ctb).type_idx[c_idx] == SAO_APPLIED) {
copy_vert(dst - (1 << sh),
s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb - 1) * h + y0) << sh),
sh, height, stride_dst, 1 << sh);
} else {
left_pixels = 1;
}
}
right_pixels = 0;
if (!right_edge) {
if (CTB(s->sao, x_ctb+1, y_ctb).type_idx[c_idx] == SAO_APPLIED) {
copy_vert(dst + (width << sh),
s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb + 2) * h + y0) << sh),
sh, height, stride_dst, 1 << sh);
} else {
right_pixels = 1;
}
}
copy_CTB(dst - (left_pixels << sh),
src - (left_pixels << sh),
(width + left_pixels + right_pixels) << sh,
height, stride_dst, stride_src);
copy_CTB_to_hv(s, src, stride_src, x0, y0, width, height, c_idx,
x_ctb, y_ctb);
#else
uint8_t left_pixels;
/* get the CTB edge pixels from the SAO pixel buffer */
left_pixels = !edges[0] && (CTB(s->sao, x_ctb-1, y_ctb).type_idx[c_idx] != SAO_APPLIED);
if (!edges[1]) { if (!edges[1]) {
uint8_t top_left = !edges[0] && (CTB(s->sao, x_ctb-1, y_ctb-1).type_idx[c_idx] != SAO_APPLIED); uint8_t top_left = !edges[0] && (CTB(s->sao, x_ctb-1, y_ctb-1).type_idx[c_idx] != SAO_APPLIED);
uint8_t top_right = !edges[2] && (CTB(s->sao, x_ctb+1, y_ctb-1).type_idx[c_idx] != SAO_APPLIED); uint8_t top_right = !edges[2] && (CTB(s->sao, x_ctb+1, y_ctb-1).type_idx[c_idx] != SAO_APPLIED);
@ -306,6 +464,9 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y)
copy_CTB(dst - (left_pixels << s->sps->pixel_shift), copy_CTB(dst - (left_pixels << s->sps->pixel_shift),
src - (left_pixels << s->sps->pixel_shift), src - (left_pixels << s->sps->pixel_shift),
(width + 1 + left_pixels) << s->sps->pixel_shift, height, stride_dst, stride_src); (width + 1 + left_pixels) << s->sps->pixel_shift, height, stride_dst, stride_src);
#endif
/* XXX: could handle the restoration here to simplify the
DSP functions */
s->hevcdsp.sao_edge_filter[restore](src, dst, s->hevcdsp.sao_edge_filter[restore](src, dst,
stride_src, stride_dst, stride_src, stride_dst,
sao, sao,
@ -314,7 +475,8 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y)
vert_edge, vert_edge,
horiz_edge, horiz_edge,
diag_edge); diag_edge);
restore_tqb_pixels(s, x, y, width, height, c_idx); restore_tqb_pixels(s, src, dst, stride_src, stride_dst,
x, y, width, height, c_idx);
sao->type_idx[c_idx] = SAO_APPLIED; sao->type_idx[c_idx] = SAO_APPLIED;
break; break;
} }