vpx/vp8/decoder/threading.c
John Koleszar 3085025fa1 Remove secondary mv clamping from decode stage
This patch removes the secondary MV clamping from the MV decoder. This
behavior was consistent with limits placed on non-split MVs by the
reference encoder, but was inconsistent with the MVs generated in the
split case.

The purpose of this secondary clamping was only to prevent crashes on
invalid data. It was not intended to be a behaviour an encoder could or
should rely on. Instead of doing additional clamping in a way that
changes the entropy context, the secondary clamp is removed and the
border handling is made implementation specific. With respect to the
spec, the border is treated as essentially infinite, limited only by
the clamping performed on the near/nearest reference and the maximum
encodable magnitude of the residual MV.

This does not affect any currently produced streams.

Change-Id: I68d35a2fbb51570d6569eab4ad233961405230a3
2010-06-09 11:47:24 -04:00

600 lines
19 KiB
C

/*
* Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef WIN32
# include <unistd.h>
#endif
#include "onyxd_int.h"
#include "vpx_mem/vpx_mem.h"
#include "threading.h"
#include "loopfilter.h"
#include "extend.h"
#include "vpx_ports/vpx_timer.h"
/* Prototypes for decoder routines that this file calls but does not define:
 * vp8_decode_mb_row is called from vp8_mtdecode_mb_rows, while
 * vp8_build_uvmvs and vp8_decode_macroblock are called from
 * vp8_thread_decoding_proc. They are declared extern here instead of coming
 * from an included header; presumably they live in the main frame decoder
 * sources -- TODO confirm. */
extern void vp8_decode_mb_row(VP8D_COMP *pbi,
VP8_COMMON *pc,
int mb_row,
MACROBLOCKD *xd);
extern void vp8_build_uvmvs(MACROBLOCKD *x, int fullpixel);
extern void vp8_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd);
/*
 * Prime the MACROBLOCKD of each of the first `count` entries of `mbrd` so a
 * worker thread can decode a macroblock row with it.
 *
 *   pbi   - decoder instance; frame-level state is read from pbi->common.
 *   xd    - the main thread's macroblock descriptor; prediction function
 *           pointers, segmentation settings and per-block dequant pointers
 *           are copied from it.
 *   mbrd  - array of per-worker row decoding contexts to initialise.
 *   count - number of entries of `mbrd` to initialise.
 *
 * Entry i has its mode_info_context pointed (i + 1) mode-info rows below
 * pc->mi. Does nothing when CONFIG_MULTITHREAD is disabled.
 */
void vp8_setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd, MB_ROW_DEC *mbrd, int count)
{
#if CONFIG_MULTITHREAD
    VP8_COMMON *const cm = &pbi->common;
    int t;

    for (t = 0; t < count; ++t)
    {
        MACROBLOCKD *const worker_xd = &mbrd[t].mbd;
        int b;

#if CONFIG_RUNTIME_CPU_DETECT
        worker_xd->rtcd = xd->rtcd;
#endif

        /* State inherited from the main thread's descriptor. */
        worker_xd->subpixel_predict      = xd->subpixel_predict;
        worker_xd->subpixel_predict8x4   = xd->subpixel_predict8x4;
        worker_xd->subpixel_predict8x8   = xd->subpixel_predict8x8;
        worker_xd->subpixel_predict16x16 = xd->subpixel_predict16x16;
        worker_xd->gf_active_ptr         = xd->gf_active_ptr;

        /* Frame-level state taken from the common structure. */
        worker_xd->mode_info_stride  = cm->mode_info_stride;
        worker_xd->mode_info         = cm->mi - 1;
        worker_xd->mode_info_context = cm->mi + cm->mode_info_stride * (t + 1);

        worker_xd->frame_type                = cm->frame_type;
        worker_xd->frames_since_golden       = cm->frames_since_golden;
        worker_xd->frames_till_alt_ref_frame = cm->frames_till_alt_ref_frame;

        worker_xd->pre = cm->last_frame;
        worker_xd->dst = cm->new_frame;

        /* Called only after pre/dst have been filled in, as in the
         * main-thread setup order. */
        vp8_setup_block_dptrs(worker_xd);
        vp8_build_block_doffsets(worker_xd);

        /* Segmentation configuration. */
        worker_xd->segmentation_enabled  = xd->segmentation_enabled;
        worker_xd->mb_segement_abs_delta = xd->mb_segement_abs_delta;
        vpx_memcpy(worker_xd->segment_feature_data, xd->segment_feature_data, sizeof(xd->segment_feature_data));

        worker_xd->mbmi.mode    = DC_PRED;
        worker_xd->mbmi.uv_mode = DC_PRED;

        worker_xd->current_bc = &pbi->bc2;

        /* Share the dequantisation pointers of all 25 blocks. */
        for (b = 0; b < 25; ++b)
            worker_xd->block[b].dequant = xd->block[b].dequant;
    }
#else
    (void) pbi;
    (void) xd;
    (void) mbrd;
    (void) count;
#endif
}
/*
 * Worker thread entry point: decodes one macroblock row each time it is woken.
 *
 * p_data points to a DECODETHREAD_DATA whose `ithread` is this worker's index,
 * `ptr1` is the VP8D_COMP decoder instance and `ptr2` is this worker's
 * MB_ROW_DEC. The thread blocks on pbi->h_event_mbrdecoding[ithread]; each
 * time the semaphore is posted it decodes row mbrd->mb_row. It leaves the
 * loop as soon as it observes pbi->b_multithreaded_rd == 0.
 *
 * Always returns 0. Does nothing when CONFIG_MULTITHREAD is disabled.
 */
THREAD_FUNCTION vp8_thread_decoding_proc(void *p_data)
{
#if CONFIG_MULTITHREAD
    int ithread = ((DECODETHREAD_DATA *)p_data)->ithread;
    VP8D_COMP *pbi = (VP8D_COMP *)(((DECODETHREAD_DATA *)p_data)->ptr1);
    MB_ROW_DEC *mbrd = (MB_ROW_DEC *)(((DECODETHREAD_DATA *)p_data)->ptr2);
    ENTROPY_CONTEXT mb_row_left_context[4][4];

    while (1)
    {
        if (pbi->b_multithreaded_rd == 0)
            break;

        if (sem_wait(&pbi->h_event_mbrdecoding[ithread]) == 0)
        {
            // The semaphore is also posted at shutdown; re-check the flag.
            if (pbi->b_multithreaded_rd == 0)
                break;
            else
            {
                VP8_COMMON *pc = &pbi->common;
                int mb_row = mbrd->mb_row;
                MACROBLOCKD *xd = &mbrd->mbd;
                int i;
                int recon_yoffset, recon_uvoffset;
                int mb_col;
                int recon_y_stride = pc->last_frame.y_stride;
                int recon_uv_stride = pc->last_frame.uv_stride;
                volatile int *last_row_current_mb_col;

                // Column progress of the row above: written by the previous
                // worker, or tracked in current_mb_col_main for worker 0.
                if (ithread > 0)
                    last_row_current_mb_col = &pbi->mb_row_di[ithread-1].current_mb_col;
                else
                    last_row_current_mb_col = &pbi->current_mb_col_main;

                recon_yoffset = mb_row * recon_y_stride * 16;
                recon_uvoffset = mb_row * recon_uv_stride * 8;

                // reset above block coeffs
                xd->above_context[Y1CONTEXT] = pc->above_context[Y1CONTEXT];
                xd->above_context[UCONTEXT ] = pc->above_context[UCONTEXT];
                xd->above_context[VCONTEXT ] = pc->above_context[VCONTEXT];
                xd->above_context[Y2CONTEXT] = pc->above_context[Y2CONTEXT];
                xd->left_context = mb_row_left_context;
                vpx_memset(mb_row_left_context, 0, sizeof(mb_row_left_context));

                xd->up_available = (mb_row != 0);

                // Shift the non-negative product first and negate afterwards:
                // left-shifting a negative value is undefined behaviour in C.
                // This matches the form used for mb_to_left_edge below.
                xd->mb_to_top_edge = -((mb_row * 16) << 3);
                xd->mb_to_bottom_edge = ((pc->mb_rows - 1 - mb_row) * 16) << 3;

                for (mb_col = 0; mb_col < pc->mb_cols; mb_col++)
                {
                    // Spin until the row above has progressed past this
                    // column, or has reached its last column.
                    while (mb_col > (*last_row_current_mb_col - 1) && *last_row_current_mb_col != pc->mb_cols - 1)
                    {
                        x86_pause_hint();
                        thread_sleep(0);
                    }

                    // Take a copy of the mode and Mv information for this macroblock into the xd->mbmi
                    // the partition_bmi array is unused in the decoder, so don't copy it.
                    vpx_memcpy(&xd->mbmi, &xd->mode_info_context->mbmi,
                               sizeof(MB_MODE_INFO) - sizeof(xd->mbmi.partition_bmi));

                    if (xd->mbmi.mode == SPLITMV || xd->mbmi.mode == B_PRED)
                    {
                        for (i = 0; i < 16; i++)
                        {
                            BLOCKD *d = &xd->block[i];
                            vpx_memcpy(&d->bmi, &xd->mode_info_context->bmi[i], sizeof(B_MODE_INFO));
                        }
                    }

                    // Distance of Mb to the various image edges.
                    // These specified to 8th pel as they are always compared to values that are in 1/8th pel units
                    xd->mb_to_left_edge = -((mb_col * 16) << 3);
                    xd->mb_to_right_edge = ((pc->mb_cols - 1 - mb_col) * 16) << 3;

                    xd->dst.y_buffer = pc->new_frame.y_buffer + recon_yoffset;
                    xd->dst.u_buffer = pc->new_frame.u_buffer + recon_uvoffset;
                    xd->dst.v_buffer = pc->new_frame.v_buffer + recon_uvoffset;

                    xd->left_available = (mb_col != 0);

                    // Select the appropriate reference frame for this MB
                    if (xd->mbmi.ref_frame == LAST_FRAME)
                    {
                        xd->pre.y_buffer = pc->last_frame.y_buffer + recon_yoffset;
                        xd->pre.u_buffer = pc->last_frame.u_buffer + recon_uvoffset;
                        xd->pre.v_buffer = pc->last_frame.v_buffer + recon_uvoffset;
                    }
                    else if (xd->mbmi.ref_frame == GOLDEN_FRAME)
                    {
                        // Golden frame reconstruction buffer
                        xd->pre.y_buffer = pc->golden_frame.y_buffer + recon_yoffset;
                        xd->pre.u_buffer = pc->golden_frame.u_buffer + recon_uvoffset;
                        xd->pre.v_buffer = pc->golden_frame.v_buffer + recon_uvoffset;
                    }
                    else
                    {
                        // Alternate reference frame reconstruction buffer
                        xd->pre.y_buffer = pc->alt_ref_frame.y_buffer + recon_yoffset;
                        xd->pre.u_buffer = pc->alt_ref_frame.u_buffer + recon_uvoffset;
                        xd->pre.v_buffer = pc->alt_ref_frame.v_buffer + recon_uvoffset;
                    }

                    vp8_build_uvmvs(xd, pc->full_pixel);

                    vp8dx_bool_decoder_fill(xd->current_bc);
                    vp8_decode_macroblock(pbi, xd);

                    // Advance the reconstruction offsets by one macroblock
                    // (16 luma / 8 chroma pixels).
                    recon_yoffset += 16;
                    recon_uvoffset += 8;

                    ++xd->mode_info_context;  /* next mb */

                    xd->gf_active_ptr++;      // GF usage flag for next MB

                    xd->above_context[Y1CONTEXT] += 4;
                    xd->above_context[UCONTEXT ] += 2;
                    xd->above_context[VCONTEXT ] += 2;
                    xd->above_context[Y2CONTEXT] ++;

                    // Publish progress for the thread decoding the row below.
                    pbi->mb_row_di[ithread].current_mb_col = mb_col;
                }

                // adjust to the next row of mbs
                vp8_extend_mb_row(
                    &pc->new_frame,
                    xd->dst.y_buffer + 16, xd->dst.u_buffer + 8, xd->dst.v_buffer + 8
                );

                ++xd->mode_info_context;      /* skip prediction column */

                // Rows are handed out in groups, so also step over the
                // mode-info rows that belong to the other threads.
                xd->mode_info_context += xd->mode_info_stride * pbi->decoding_thread_count;

                // Only odd rows update the progress marker that the loop
                // filter thread polls.
                if ((mb_row & 1) == 1)
                {
                    pbi->last_mb_row_decoded = mb_row;
                }

                // The last worker, or whichever worker decoded the final row
                // of the frame, wakes the main thread.
                if (ithread == (pbi->decoding_thread_count - 1) || mb_row == pc->mb_rows - 1)
                {
                    sem_post(&pbi->h_event_main);
                }
            }
        }
    }
#else
    (void) p_data;
#endif

    return 0 ;
}
/*
 * Loop-filter thread entry point.
 *
 * p_data is the VP8D_COMP decoder instance. The thread blocks on
 * pbi->h_event_start_lpf; each time it is woken it filters every macroblock
 * of cm->new_frame row by row, waiting on pbi->last_mb_row_decoded before
 * touching a row, and then posts pbi->h_event_lpf. It leaves the loop once
 * it observes pbi->b_multithreaded_lf == 0.
 *
 * Always returns 0. Does nothing when CONFIG_MULTITHREAD is disabled.
 */
THREAD_FUNCTION vp8_thread_loop_filter(void *p_data)
{
#if CONFIG_MULTITHREAD
    VP8D_COMP *pbi = (VP8D_COMP *)p_data;

    for (;;)
    {
        VP8_COMMON *cm;
        MACROBLOCKD *mbd;
        YV12_BUFFER_CONFIG *post;
        loop_filter_info *lfi;
        volatile int *rows_done;
        unsigned char *y_ptr, *u_ptr, *v_ptr;
        int baseline_filter_level[MAX_MB_SEGMENTS];
        int default_filt_lvl;
        int alt_flt_enabled;
        int filter_level;
        int mb_row, mb_col;
        int seg;

        if (pbi->b_multithreaded_lf == 0)
            break;

        // A failed wait simply goes round the loop again.
        if (sem_wait(&pbi->h_event_start_lpf) != 0)
            continue;

        // The semaphore is also posted at shutdown; re-check the flag.
        if (pbi->b_multithreaded_lf == 0)
            break;

        cm = &pbi->common;
        mbd = &pbi->lpfmb;
        default_filt_lvl = cm->filter_level;
        post = &cm->new_frame;
        lfi = cm->lf_info;
        alt_flt_enabled = mbd->segmentation_enabled;
        rows_done = &pbi->last_mb_row_decoded;

        // Start at the base of the MB MODE_INFO list.
        mbd->mode_info_context = cm->mi;

        // Work out the baseline filter level of every segment.
        for (seg = 0; seg < MAX_MB_SEGMENTS; seg++)
        {
            if (!alt_flt_enabled)
            {
                baseline_filter_level[seg] = default_filt_lvl;
            }
            else if (mbd->mb_segement_abs_delta == SEGMENT_ABSDATA)
            {
                // Absolute mode: the segment value is used as-is.
                baseline_filter_level[seg] = mbd->segment_feature_data[MB_LVL_ALT_LF][seg];
            }
            else
            {
                // Delta mode: offset the frame level, then clamp to
                // [0, MAX_LOOP_FILTER].
                baseline_filter_level[seg] = default_filt_lvl + mbd->segment_feature_data[MB_LVL_ALT_LF][seg];

                if (baseline_filter_level[seg] < 0)
                    baseline_filter_level[seg] = 0;
                else if (baseline_filter_level[seg] > MAX_LOOP_FILTER)
                    baseline_filter_level[seg] = MAX_LOOP_FILTER;
            }
        }

        // Initialize the loop filter for this frame.
        vp8_init_loop_filter(cm);

        // Point at the top-left macroblock of each plane.
        y_ptr = post->y_buffer;
        u_ptr = post->u_buffer;
        v_ptr = post->v_buffer;

        for (mb_row = 0; mb_row < cm->mb_rows; mb_row++)
        {
            // Spin until the decoder's progress marker has moved past
            // this row.
            while (mb_row >= *rows_done)
            {
                x86_pause_hint();
                thread_sleep(0);
            }

            for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
            {
                int segment_index = 0;

                if (alt_flt_enabled)
                    segment_index = mbd->mode_info_context->mbmi.segment_id;

                filter_level = baseline_filter_level[segment_index];

                // Apply any context driven MB level adjustment
                vp8_adjust_mb_lf_value(mbd, &filter_level);

                if (filter_level)
                {
                    loop_filter_info *mb_lfi = lfi + filter_level;

                    // Left macroblock edge, skipped for the first column.
                    if (mb_col > 0)
                        cm->lf_mbv(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, mb_lfi, cm->simpler_lpf);

                    if (mbd->mode_info_context->mbmi.dc_diff > 0)
                        cm->lf_bv(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, mb_lfi, cm->simpler_lpf);

                    // don't apply across umv border
                    if (mb_row > 0)
                        cm->lf_mbh(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, mb_lfi, cm->simpler_lpf);

                    if (mbd->mode_info_context->mbmi.dc_diff > 0)
                        cm->lf_bh(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, mb_lfi, cm->simpler_lpf);
                }

                // Step to the next macroblock in this row.
                y_ptr += 16;
                u_ptr += 8;
                v_ptr += 8;
                mbd->mode_info_context++;
            }

            // Move the plane pointers to the start of the next MB row.
            y_ptr += post->y_stride * 16 - post->y_width;
            u_ptr += post->uv_stride * 8 - post->uv_width;
            v_ptr += post->uv_stride * 8 - post->uv_width;

            mbd->mode_info_context++;         // Skip border mb
        }

        // Tell the waiter on h_event_lpf that this frame has been filtered.
        sem_post(&pbi->h_event_lpf);
    }
#else
    (void) p_data;
#endif
    return 0;
}
/*
 * Create the decoder's helper threads.
 *
 * The thread budget is pbi->max_threads capped at 16. With a budget of more
 * than one, a loop-filter thread and (budget - 1) row-decoding worker threads
 * are started together with their semaphores and per-thread arrays;
 * otherwise the multithreading flags are simply left cleared.
 * Does nothing when CONFIG_MULTITHREAD is disabled.
 */
void vp8_decoder_create_threads(VP8D_COMP *pbi)
{
#if CONFIG_MULTITHREAD
    int core_count = 0;
    int ithread;

    pbi->b_multithreaded_rd = 0;
    pbi->b_multithreaded_lf = 0;
    pbi->allocated_decoding_thread_count = 0;

    if (pbi->max_threads > 16)
        core_count = 16;
    else
        core_count = pbi->max_threads;

    // Nothing to spawn for a single-threaded budget.
    if (core_count <= 1)
        return;

    /* Loop filter thread and its start / done semaphores. */
    sem_init(&pbi->h_event_lpf, 0, 0);
    sem_init(&pbi->h_event_start_lpf, 0, 0);
    pbi->b_multithreaded_lf = 1;
    pthread_create(&pbi->h_thread_lpf, 0, vp8_thread_loop_filter, (pbi));

    /* Row decoding workers: one fewer than the budget. */
    pbi->b_multithreaded_rd = 1;
    pbi->decoding_thread_count = core_count - 1;

    CHECK_MEM_ERROR(pbi->h_decoding_thread, vpx_malloc(sizeof(pthread_t) * pbi->decoding_thread_count));
    CHECK_MEM_ERROR(pbi->h_event_mbrdecoding, vpx_malloc(sizeof(sem_t) * pbi->decoding_thread_count));
    CHECK_MEM_ERROR(pbi->mb_row_di, vpx_memalign(32, sizeof(MB_ROW_DEC) * pbi->decoding_thread_count));
    vpx_memset(pbi->mb_row_di, 0, sizeof(MB_ROW_DEC) * pbi->decoding_thread_count);
    CHECK_MEM_ERROR(pbi->de_thread_data, vpx_malloc(sizeof(DECODETHREAD_DATA) * pbi->decoding_thread_count));

    for (ithread = 0; ithread < pbi->decoding_thread_count; ++ithread)
    {
        DECODETHREAD_DATA *const td = &pbi->de_thread_data[ithread];

        sem_init(&pbi->h_event_mbrdecoding[ithread], 0, 0);

        /* Argument block handed to the worker's entry point. */
        td->ithread = ithread;
        td->ptr1 = (void *)pbi;
        td->ptr2 = (void *) &pbi->mb_row_di[ithread];

        pthread_create(&pbi->h_decoding_thread[ithread], 0, vp8_thread_decoding_proc, (td));
    }

    sem_init(&pbi->h_event_main, 0, 0);
    pbi->allocated_decoding_thread_count = pbi->decoding_thread_count;
#else
    (void) pbi;
#endif
}
/*
 * Stop and tear down the threads started by vp8_decoder_create_threads.
 *
 * For the loop-filter thread and for each row-decoding worker, the
 * corresponding b_multithreaded_* flag is cleared, the thread's semaphore is
 * posted so it wakes and sees the flag, and the thread is joined. The
 * semaphores are then destroyed and the per-thread arrays freed and set to
 * NULL. Does nothing when CONFIG_MULTITHREAD is disabled.
 */
void vp8_decoder_remove_threads(VP8D_COMP *pbi)
{
#if CONFIG_MULTITHREAD

    if (pbi->b_multithreaded_lf)
    {
        pbi->b_multithreaded_lf = 0;
        sem_post(&pbi->h_event_start_lpf);
        pthread_join(pbi->h_thread_lpf, 0);
        sem_destroy(&pbi->h_event_start_lpf);
        // h_event_lpf is initialised together with h_event_start_lpf, so it
        // has to be released here as well; the thread that posts it has
        // already been joined.
        sem_destroy(&pbi->h_event_lpf);
    }

    //shutdown MB Decoding thread;
    if (pbi->b_multithreaded_rd)
    {
        int i;

        pbi->b_multithreaded_rd = 0;

        // allow all threads to exit
        for (i = 0; i < pbi->allocated_decoding_thread_count; i++)
        {
            sem_post(&pbi->h_event_mbrdecoding[i]);
            pthread_join(pbi->h_decoding_thread[i], NULL);
        }

        // Every worker has been joined, so nothing waits on these any more.
        for (i = 0; i < pbi->allocated_decoding_thread_count; i++)
        {
            sem_destroy(&pbi->h_event_mbrdecoding[i]);
        }

        sem_destroy(&pbi->h_event_main);

        if (pbi->h_decoding_thread)
        {
            vpx_free(pbi->h_decoding_thread);
            pbi->h_decoding_thread = NULL;
        }

        if (pbi->h_event_mbrdecoding)
        {
            vpx_free(pbi->h_event_mbrdecoding);
            pbi->h_event_mbrdecoding = NULL;
        }

        if (pbi->mb_row_di)
        {
            vpx_free(pbi->mb_row_di);
            pbi->mb_row_di = NULL ;
        }

        if (pbi->de_thread_data)
        {
            vpx_free(pbi->de_thread_data);
            pbi->de_thread_data = NULL;
        }
    }

#else
    (void) pbi;
#endif
}
/*
 * Kick off loop filtering of the current frame on the loop-filter thread.
 *
 * Snapshots pbi->mb into pbi->lpfmb (the descriptor the loop-filter thread
 * works from), resets the decoded-row progress marker to 0 and posts
 * pbi->h_event_start_lpf. Does nothing when CONFIG_MULTITHREAD is disabled.
 */
void vp8_start_lfthread(VP8D_COMP *pbi)
{
#if CONFIG_MULTITHREAD
    // Use the vpx_mem wrapper like the other copies in this file, instead of
    // depending on <string.h> being pulled in indirectly.
    vpx_memcpy(&pbi->lpfmb, &pbi->mb, sizeof(pbi->mb));
    pbi->last_mb_row_decoded = 0;
    sem_post(&pbi->h_event_start_lpf);
#else
    (void) pbi;
#endif
}
/*
 * Block until the loop-filter thread posts pbi->h_event_lpf.
 *
 * The time spent waiting, as measured by a vpx_usec_timer, is added to
 * pbi->time_loop_filtering. Does nothing when CONFIG_MULTITHREAD is disabled.
 */
void vp8_stop_lfthread(VP8D_COMP *pbi)
{
#if CONFIG_MULTITHREAD
    struct vpx_usec_timer wait_timer;

    /* Time only the wait itself. */
    vpx_usec_timer_start(&wait_timer);
    sem_wait(&pbi->h_event_lpf);
    vpx_usec_timer_mark(&wait_timer);

    pbi->time_loop_filtering += vpx_usec_timer_elapsed(&wait_timer);
#else
    (void) pbi;
#endif
}
/*
 * Decode all macroblock rows of the current frame using the worker threads.
 *
 * Rows are processed in groups of (decoding_thread_count + 1): the main
 * thread decodes the first row of each group with `xd`, and each following
 * row of the group is handed to worker i via pbi->mb_row_di[i]. Every row is
 * given the next entry of pbi->mbc as its bool decoder, cycling through the
 * (1 << multi_token_partition) partitions. After its own row, the main thread
 * waits on pbi->h_event_main unless that row was the last row of the frame.
 * On exit pbi->last_mb_row_decoded holds the final loop counter value.
 * Does nothing when CONFIG_MULTITHREAD is disabled.
 */
void vp8_mtdecode_mb_rows(VP8D_COMP *pbi,
                          MACROBLOCKD *xd)
{
#if CONFIG_MULTITHREAD
    VP8_COMMON *pc = &pbi->common;
    int num_part = 1 << pbi->common.multi_token_partition;
    int ibc = 0;
    int mb_row = 0;

    vp8_setup_decoding_thread_data(pbi, xd, pbi->mb_row_di, pbi->decoding_thread_count);

    while (mb_row < pc->mb_rows)
    {
        int i;

        pbi->current_mb_col_main = -1;

        /* Partition for the row decoded by the main thread. */
        xd->current_bc = &pbi->mbc[ibc];

        if (++ibc == num_part)
            ibc = 0;

        /* Dispatch the following rows, stopping at the bottom of the frame. */
        for (i = 0; i < pbi->decoding_thread_count && (mb_row + i + 1) < pc->mb_rows; i++)
        {
            MB_ROW_DEC *const worker = &pbi->mb_row_di[i];

            worker->mb_row = mb_row + i + 1;
            worker->mbd.current_bc = &pbi->mbc[ibc];

            if (++ibc == num_part)
                ibc = 0;

            worker->current_mb_col = -1;
            sem_post(&pbi->h_event_mbrdecoding[i]);
        }

        vp8_decode_mb_row(pbi, pc, mb_row, xd);

        /* Step over the mode-info rows that were given to the workers. */
        xd->mode_info_context += xd->mode_info_stride * pbi->decoding_thread_count;

        /* Workers were dispatched only if this was not the last row. */
        if (mb_row < pc->mb_rows - 1)
        {
            sem_wait(&pbi->h_event_main);
        }

        mb_row += (pbi->decoding_thread_count + 1);
    }

    pbi->last_mb_row_decoded = mb_row;
#else
    (void) pbi;
    (void) xd;
#endif
}