vpx/vp9/decoder/vp9_dthread.h
Yunqing Wang 903801f1ef vp9 decoder: row-based multi-threaded loopfilter
Implemented parallel loopfiltering, which uses existing tile-
decoding threads. Each thread works on one row, and when that row
is loopfiltered, it moves to next unattended row. To ensure the
correct filtering order, threads are synchronized and one
superblock is filtered only if the superblocks it depends on are
filtered already.

To reduce synchronization overhead and speed up the decoder, we use
nsync > 1 for high resolution.

Performance tests:
1. on desktop:
8-tile 4k video using 8 threads, speedup: 70% - 80%
4-tile HD video using 4 threads, speedup: ~35%
2. on mobile device(Nexus 7):
4-tile 1080p video using 4 threads, speedup: 18% - 25%
4-tile 1080p video using 2 threads, speedup: 10% - 15%

Change-Id: If54b4a11960dd706c22d5ad145ad94156031f36a
2014-01-31 14:44:53 -08:00

61 lines
2.0 KiB
C

/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP9_DECODER_VP9_DTHREAD_H_
#define VP9_DECODER_VP9_DTHREAD_H_
#include "./vpx_config.h"
#include "vp9/common/vp9_loopfilter.h"
#include "vp9/decoder/vp9_reader.h"
#include "vp9/decoder/vp9_thread.h"
struct macroblockd;
struct VP9Common;
struct VP9Decompressor;
typedef struct TileWorkerData {
struct VP9Common *cm;
vp9_reader bit_reader;
DECLARE_ALIGNED(16, struct macroblockd, xd);
DECLARE_ALIGNED(16, int16_t, dqcoeff[MAX_MB_PLANE][64 * 64]);
// Row-based parallel loopfilter data
LFWorkerData lfdata;
} TileWorkerData;
// Loopfilter row synchronization
typedef struct VP9LfSyncData {
#if CONFIG_MULTITHREAD
pthread_mutex_t *mutex_;
pthread_cond_t *cond_;
#endif
// Allocate memory to store the loop-filtered superblock index in each row.
int *cur_sb_col;
// The optimal sync_range for different resolution and platform should be
// determined by testing. Currently, it is chosen to be a power-of-2 number.
int sync_range;
} VP9LfSync;
// Allocate memory for loopfilter row synchronization.
void vp9_loop_filter_alloc(struct VP9Common *cm, struct VP9LfSyncData *lf_sync,
int rows, int width);
// Deallocate loopfilter synchronization related mutex and data.
void vp9_loop_filter_dealloc(struct VP9LfSyncData *lf_sync, int rows);
// Multi-threaded loopfilter that uses the tile threads.
void vp9_loop_filter_frame_mt(struct VP9Decompressor *pbi,
struct VP9Common *cm,
struct macroblockd *xd,
int frame_filter_level,
int y_only, int partial);
#endif // VP9_DECODER_VP9_DTHREAD_H_