From 2d0bea4719588aa9caa3f452596b9748ba13059e Mon Sep 17 00:00:00 2001
From: "Ronald S. Bultje" <rsbultje@gmail.com>
Date: Fri, 24 Apr 2015 11:04:00 -0400
Subject: [PATCH] vp9: increase buffer sizes for non-420 chroma subsamplings.

---
 libavcodec/vp9.c | 67 ++++++++++++++++++++++++++----------------------
 1 file changed, 37 insertions(+), 30 deletions(-)

diff --git a/libavcodec/vp9.c b/libavcodec/vp9.c
index bd852d4245..0b11bdc177 100644
--- a/libavcodec/vp9.c
+++ b/libavcodec/vp9.c
@@ -215,7 +215,7 @@ typedef struct VP9Context {
     DECLARE_ALIGNED(16, uint8_t, left_y_nnz_ctx)[16];
     DECLARE_ALIGNED(16, uint8_t, left_mode_ctx)[16];
     DECLARE_ALIGNED(16, VP56mv, left_mv_ctx)[16][2];
-    DECLARE_ALIGNED(8, uint8_t, left_uv_nnz_ctx)[2][8];
+    DECLARE_ALIGNED(16, uint8_t, left_uv_nnz_ctx)[2][16];
     DECLARE_ALIGNED(8, uint8_t, left_partition_ctx)[8];
     DECLARE_ALIGNED(8, uint8_t, left_skip_ctx)[8];
     DECLARE_ALIGNED(8, uint8_t, left_txfm_ctx)[8];
@@ -248,8 +248,8 @@ typedef struct VP9Context {
     int16_t *block_base, *block, *uvblock_base[2], *uvblock[2];
     uint8_t *eob_base, *uveob_base[2], *eob, *uveob[2];
     struct { int x, y; } min_mv, max_mv;
-    DECLARE_ALIGNED(32, uint8_t, tmp_y)[64*64];
-    DECLARE_ALIGNED(32, uint8_t, tmp_uv)[2][32*32];
+    DECLARE_ALIGNED(32, uint8_t, tmp_y)[64 * 64];
+    DECLARE_ALIGNED(32, uint8_t, tmp_uv)[2][64 * 64];
     uint16_t mvscale[3][2];
     uint8_t mvstep[3][2];
 } VP9Context;
@@ -307,39 +307,42 @@ static int vp9_ref_frame(AVCodecContext *ctx, VP9Frame *dst, VP9Frame *src)
     return 0;
 }
 
-static int update_size(AVCodecContext *ctx, int w, int h)
+static int update_size(AVCodecContext *ctx, int w, int h, enum AVPixelFormat fmt)
 {
     VP9Context *s = ctx->priv_data;
     uint8_t *p;
 
     av_assert0(w > 0 && h > 0);
 
-    if (s->intra_pred_data[0] && w == ctx->width && h == ctx->height)
+    if (s->intra_pred_data[0] && w == ctx->width && h == ctx->height && ctx->pix_fmt == fmt)
         return 0;
 
-    ctx->width  = w;
-    ctx->height = h;
-    s->sb_cols  = (w + 63) >> 6;
-    s->sb_rows  = (h + 63) >> 6;
-    s->cols     = (w + 7) >> 3;
-    s->rows     = (h + 7) >> 3;
+    ctx->width   = w;
+    ctx->height  = h;
+    ctx->pix_fmt = fmt;
+    s->sb_cols   = (w + 63) >> 6;
+    s->sb_rows   = (h + 63) >> 6;
+    s->cols      = (w + 7) >> 3;
+    s->rows      = (h + 7) >> 3;
 
 #define assign(var, type, n) var = (type) p; p += s->sb_cols * (n) * sizeof(*var)
     av_freep(&s->intra_pred_data[0]);
-    p = av_malloc(s->sb_cols * (240 + sizeof(*s->lflvl) + 16 * sizeof(*s->above_mv_ctx)));
+    // FIXME: chroma buffers are sized for 4:4:4, so we slightly
+    // over-allocate for subsampled chroma; this shouldn't affect performance.
+    p = av_malloc(s->sb_cols * (320 + sizeof(*s->lflvl) + 16 * sizeof(*s->above_mv_ctx)));
     if (!p)
         return AVERROR(ENOMEM);
     assign(s->intra_pred_data[0],  uint8_t *,             64);
-    assign(s->intra_pred_data[1],  uint8_t *,             32);
-    assign(s->intra_pred_data[2],  uint8_t *,             32);
+    assign(s->intra_pred_data[1],  uint8_t *,             64);
+    assign(s->intra_pred_data[2],  uint8_t *,             64);
     assign(s->above_y_nnz_ctx,     uint8_t *,             16);
     assign(s->above_mode_ctx,      uint8_t *,             16);
     assign(s->above_mv_ctx,        VP56mv(*)[2],          16);
+    assign(s->above_uv_nnz_ctx[0], uint8_t *,             16);
+    assign(s->above_uv_nnz_ctx[1], uint8_t *,             16);
     assign(s->above_partition_ctx, uint8_t *,              8);
     assign(s->above_skip_ctx,      uint8_t *,              8);
     assign(s->above_txfm_ctx,      uint8_t *,              8);
-    assign(s->above_uv_nnz_ctx[0], uint8_t *,              8);
-    assign(s->above_uv_nnz_ctx[1], uint8_t *,              8);
     assign(s->above_segpred_ctx,   uint8_t *,              8);
     assign(s->above_intra_ctx,     uint8_t *,              8);
     assign(s->above_comp_ctx,      uint8_t *,              8);
@@ -358,34 +361,39 @@ static int update_size(AVCodecContext *ctx, int w, int h)
 static int update_block_buffers(AVCodecContext *ctx)
 {
     VP9Context *s = ctx->priv_data;
+    int chroma_blocks, chroma_eobs;
 
     if (s->b_base && s->block_base && s->block_alloc_using_2pass == s->frames[CUR_FRAME].uses_2pass)
         return 0;
 
     av_free(s->b_base);
     av_free(s->block_base);
+    chroma_blocks = 64 * 64 >> (s->ss_h + s->ss_v);
+    chroma_eobs   = 16 * 16 >> (s->ss_h + s->ss_v);
     if (s->frames[CUR_FRAME].uses_2pass) {
         int sbs = s->sb_cols * s->sb_rows;
 
         s->b_base = av_malloc_array(s->cols * s->rows, sizeof(VP9Block));
-        s->block_base = av_mallocz((64 * 64 + 128) * sbs * 3);
+        s->block_base = av_mallocz(((64 * 64 + 2 * chroma_blocks) * sizeof(int16_t) +
+                                    16 * 16 + 2 * chroma_eobs) * sbs);
         if (!s->b_base || !s->block_base)
             return AVERROR(ENOMEM);
         s->uvblock_base[0] = s->block_base + sbs * 64 * 64;
-        s->uvblock_base[1] = s->uvblock_base[0] + sbs * 32 * 32;
-        s->eob_base = (uint8_t *) (s->uvblock_base[1] + sbs * 32 * 32);
-        s->uveob_base[0] = s->eob_base + 256 * sbs;
-        s->uveob_base[1] = s->uveob_base[0] + 64 * sbs;
+        s->uvblock_base[1] = s->uvblock_base[0] + sbs * chroma_blocks;
+        s->eob_base = (uint8_t *) (s->uvblock_base[1] + sbs * chroma_blocks);
+        s->uveob_base[0] = s->eob_base + 16 * 16 * sbs;
+        s->uveob_base[1] = s->uveob_base[0] + chroma_eobs * sbs;
     } else {
         s->b_base = av_malloc(sizeof(VP9Block));
-        s->block_base = av_mallocz((64 * 64 + 128) * 3);
+        s->block_base = av_mallocz((64 * 64 + 2 * chroma_blocks) * sizeof(int16_t) +
+                                   16 * 16 + 2 * chroma_eobs);
         if (!s->b_base || !s->block_base)
             return AVERROR(ENOMEM);
         s->uvblock_base[0] = s->block_base + 64 * 64;
-        s->uvblock_base[1] = s->uvblock_base[0] + 32 * 32;
-        s->eob_base = (uint8_t *) (s->uvblock_base[1] + 32 * 32);
-        s->uveob_base[0] = s->eob_base + 256;
-        s->uveob_base[1] = s->uveob_base[0] + 64;
+        s->uvblock_base[1] = s->uvblock_base[0] + chroma_blocks;
+        s->eob_base = (uint8_t *) (s->uvblock_base[1] + chroma_blocks);
+        s->uveob_base[0] = s->eob_base + 16 * 16;
+        s->uveob_base[1] = s->uveob_base[0] + chroma_eobs;
     }
     s->block_alloc_using_2pass = s->frames[CUR_FRAME].uses_2pass;
 
@@ -772,8 +780,8 @@ static int decode_frame_header(AVCodecContext *ctx,
     }
 
     /* tiling info */
-    if ((res = update_size(ctx, w, h)) < 0) {
-        av_log(ctx, AV_LOG_ERROR, "Failed to initialize decoder for %dx%d\n", w, h);
+    if ((res = update_size(ctx, w, h, fmt)) < 0) {
+        av_log(ctx, AV_LOG_ERROR, "Failed to initialize decoder for %dx%d @ %d\n", w, h, fmt);
         return res;
     }
     for (s->tiling.log2_tile_cols = 0;
@@ -3960,7 +3968,7 @@ static int vp9_decode_frame(AVCodecContext *ctx, void *frame,
                             memset(s->left_mode_ctx, NEARESTMV, 8);
                         }
                         memset(s->left_y_nnz_ctx, 0, 16);
-                        memset(s->left_uv_nnz_ctx, 0, 16);
+                        memset(s->left_uv_nnz_ctx, 0, 32);
                         memset(s->left_segpred_ctx, 0, 8);
 
                         memcpy(&s->c, &s->c_b[tile_col], sizeof(s->c));
@@ -4089,7 +4097,6 @@ static av_cold int vp9_decode_init(AVCodecContext *ctx)
     VP9Context *s = ctx->priv_data;
 
     ctx->internal->allocate_progress = 1;
-    ctx->pix_fmt = AV_PIX_FMT_YUV420P;
     ff_vp9dsp_init(&s->dsp);
     ff_videodsp_init(&s->vdsp, 8);
     s->filter.sharpness = -1;