diff --git a/vp8/common/threading.h b/vp8/common/threading.h
index ece64f3fb..b082bf109 100644
--- a/vp8/common/threading.h
+++ b/vp8/common/threading.h
@@ -191,47 +191,18 @@ static inline int sem_destroy(sem_t *sem) {
 #define x86_pause_hint()
 #endif
 
-#if defined(__has_feature)
-#if __has_feature(thread_sanitizer)
-#define USE_MUTEX_LOCK 1
-#endif
-#endif
-
 #include "vpx_util/vpx_thread.h"
+#include "vpx_util/vpx_atomics.h"
 
-static INLINE int protected_read(pthread_mutex_t *const mutex, const int *p) {
-  (void)mutex;
-#if defined(USE_MUTEX_LOCK)
-  int ret;
-  pthread_mutex_lock(mutex);
-  ret = *p;
-  pthread_mutex_unlock(mutex);
-  return ret;
-#endif
-  return *p;
-}
-
-static INLINE void sync_read(pthread_mutex_t *const mutex, int mb_col,
-                             const int *last_row_current_mb_col,
-                             const int nsync) {
-  while (mb_col > (protected_read(mutex, last_row_current_mb_col) - nsync)) {
+static INLINE void vp8_atomic_spin_wait(
+    int mb_col, const vpx_atomic_int *last_row_current_mb_col,
+    const int nsync) {
+  while (mb_col > (vpx_atomic_load_acquire(last_row_current_mb_col) - nsync)) {
     x86_pause_hint();
     thread_sleep(0);
   }
 }
 
-static INLINE void protected_write(pthread_mutex_t *mutex, int *p, int v) {
-  (void)mutex;
-#if defined(USE_MUTEX_LOCK)
-  pthread_mutex_lock(mutex);
-  *p = v;
-  pthread_mutex_unlock(mutex);
-  return;
-#endif
-  *p = v;
-}
-
-#undef USE_MUTEX_LOCK
 #endif /* CONFIG_OS_SUPPORT && CONFIG_MULTITHREAD */
 
 #ifdef __cplusplus
diff --git a/vp8/decoder/decodeframe.c b/vp8/decoder/decodeframe.c
index d900b670d..077bd3da2 100644
--- a/vp8/decoder/decodeframe.c
+++ b/vp8/decoder/decodeframe.c
@@ -1205,7 +1205,8 @@ int vp8_decode_frame(VP8D_COMP *pbi) {
   pbi->frame_corrupt_residual = 0;
 
 #if CONFIG_MULTITHREAD
-  if (pbi->b_multithreaded_rd && pc->multi_token_partition != ONE_PARTITION) {
+  if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd) &&
+      pc->multi_token_partition != ONE_PARTITION) {
     unsigned int thread;
     vp8mt_decode_mb_rows(pbi, xd);
     vp8_yv12_extend_frame_borders(yv12_fb_new);
diff --git a/vp8/decoder/onyxd_int.h b/vp8/decoder/onyxd_int.h
index d05368544..5ecacdbb9 100644
--- a/vp8/decoder/onyxd_int.h
+++ b/vp8/decoder/onyxd_int.h
@@ -68,7 +68,7 @@ typedef struct VP8D_COMP {
 
 #if CONFIG_MULTITHREAD
   /* variable for threading */
-  int b_multithreaded_rd;
+  vpx_atomic_int b_multithreaded_rd;
   int max_threads;
   int current_mb_col_main;
   unsigned int decoding_thread_count;
@@ -76,9 +76,8 @@ typedef struct VP8D_COMP {
 
   int mt_baseline_filter_level[MAX_MB_SEGMENTS];
   int sync_range;
-  int *mt_current_mb_col; /* Each row remembers its already decoded column. */
-  pthread_mutex_t *pmutex;
-  pthread_mutex_t mt_mutex; /* mutex for b_multithreaded_rd */
+  /* Each row remembers its already decoded column. */
+  vpx_atomic_int *mt_current_mb_col;
 
   unsigned char **mt_yabove_row; /* mb_rows x width */
   unsigned char **mt_uabove_row;
diff --git a/vp8/decoder/threading.c b/vp8/decoder/threading.c
index f5bdae493..aadc8dc71 100644
--- a/vp8/decoder/threading.c
+++ b/vp8/decoder/threading.c
@@ -79,7 +79,8 @@ static void setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd,
     if (pc->full_pixel) mbd->fullpixel_mask = 0xfffffff8;
   }
 
-  for (i = 0; i < pc->mb_rows; ++i) pbi->mt_current_mb_col[i] = -1;
+  for (i = 0; i < pc->mb_rows; ++i)
+    vpx_atomic_store_release(&pbi->mt_current_mb_col[i], -1);
 }
 
 static void mt_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
@@ -247,12 +248,13 @@ static void mt_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
 
 static void mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd,
                               int start_mb_row) {
-  const int *last_row_current_mb_col;
-  int *current_mb_col;
+  const vpx_atomic_int *last_row_current_mb_col;
+  vpx_atomic_int *current_mb_col;
   int mb_row;
   VP8_COMMON *pc = &pbi->common;
   const int nsync = pbi->sync_range;
-  const int first_row_no_sync_above = pc->mb_cols + nsync;
+  const vpx_atomic_int first_row_no_sync_above =
+      VPX_ATOMIC_INIT(pc->mb_cols + nsync);
   int num_part = 1 << pbi->common.multi_token_partition;
   int last_mb_row = start_mb_row;
 
@@ -356,13 +358,11 @@ static void mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd,
 
     for (mb_col = 0; mb_col < pc->mb_cols; ++mb_col) {
       if (((mb_col - 1) % nsync) == 0) {
-        pthread_mutex_t *mutex = &pbi->pmutex[mb_row];
-        protected_write(mutex, current_mb_col, mb_col - 1);
+        vpx_atomic_store_release(current_mb_col, mb_col - 1);
       }
 
       if (mb_row && !(mb_col & (nsync - 1))) {
-        pthread_mutex_t *mutex = &pbi->pmutex[mb_row - 1];
-        sync_read(mutex, mb_col, last_row_current_mb_col, nsync);
+        vp8_atomic_spin_wait(mb_col, last_row_current_mb_col, nsync);
       }
 
       /* Distance of MB to the various image edges.
@@ -548,7 +548,7 @@ static void mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd,
     }
 
     /* last MB of row is ready just after extension is done */
-    protected_write(&pbi->pmutex[mb_row], current_mb_col, mb_col + nsync);
+    vpx_atomic_store_release(current_mb_col, mb_col + nsync);
 
     ++xd->mode_info_context; /* skip prediction column */
     xd->up_available = 1;
@@ -568,10 +568,10 @@ static THREAD_FUNCTION thread_decoding_proc(void *p_data) {
   ENTROPY_CONTEXT_PLANES mb_row_left_context;
 
   while (1) {
-    if (protected_read(&pbi->mt_mutex, &pbi->b_multithreaded_rd) == 0) break;
+    if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd) == 0) break;
 
     if (sem_wait(&pbi->h_event_start_decoding[ithread]) == 0) {
-      if (protected_read(&pbi->mt_mutex, &pbi->b_multithreaded_rd) == 0) {
+      if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd) == 0) {
         break;
       } else {
         MACROBLOCKD *xd = &mbrd->mbd;
@@ -589,9 +589,8 @@ void vp8_decoder_create_threads(VP8D_COMP *pbi) {
   int core_count = 0;
   unsigned int ithread;
 
-  pbi->b_multithreaded_rd = 0;
+  vpx_atomic_init(&pbi->b_multithreaded_rd, 0);
   pbi->allocated_decoding_thread_count = 0;
-  pthread_mutex_init(&pbi->mt_mutex, NULL);
 
   /* limit decoding threads to the max number of token partitions */
   core_count = (pbi->max_threads > 8) ? 8 : pbi->max_threads;
@@ -602,7 +601,7 @@ void vp8_decoder_create_threads(VP8D_COMP *pbi) {
   }
 
   if (core_count > 1) {
-    pbi->b_multithreaded_rd = 1;
+    vpx_atomic_init(&pbi->b_multithreaded_rd, 1);
     pbi->decoding_thread_count = core_count - 1;
 
     CALLOC_ARRAY(pbi->h_decoding_thread, pbi->decoding_thread_count);
@@ -648,16 +647,6 @@ void vp8_decoder_create_threads(VP8D_COMP *pbi) {
 void vp8mt_de_alloc_temp_buffers(VP8D_COMP *pbi, int mb_rows) {
   int i;
 
-  /* De-allocate mutex */
-  if (pbi->pmutex != NULL) {
-    for (i = 0; i < mb_rows; ++i) {
-      pthread_mutex_destroy(&pbi->pmutex[i]);
-    }
-
-    vpx_free(pbi->pmutex);
-    pbi->pmutex = NULL;
-  }
-
   vpx_free(pbi->mt_current_mb_col);
   pbi->mt_current_mb_col = NULL;
 
@@ -723,7 +712,7 @@ void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows) {
   int i;
   int uv_width;
 
-  if (protected_read(&pbi->mt_mutex, &pbi->b_multithreaded_rd)) {
+  if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd)) {
     vp8mt_de_alloc_temp_buffers(pbi, prev_mb_rows);
 
     /* our internal buffers are always multiples of 16 */
@@ -741,17 +730,11 @@ void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows) {
 
     uv_width = width >> 1;
 
-    /* Allocate mutex */
-    CHECK_MEM_ERROR(pbi->pmutex,
-                    vpx_malloc(sizeof(*pbi->pmutex) * pc->mb_rows));
-    if (pbi->pmutex) {
-      for (i = 0; i < pc->mb_rows; ++i) {
-        pthread_mutex_init(&pbi->pmutex[i], NULL);
-      }
-    }
-
-    /* Allocate an int for each mb row. */
-    CALLOC_ARRAY(pbi->mt_current_mb_col, pc->mb_rows);
+    /* Allocate a vpx_atomic_int for each mb row. */
+    CHECK_MEM_ERROR(pbi->mt_current_mb_col,
+                    vpx_malloc(sizeof(*pbi->mt_current_mb_col) * pc->mb_rows));
+    for (i = 0; i < pc->mb_rows; ++i)
+      vpx_atomic_init(&pbi->mt_current_mb_col[i], 0);
 
     /* Allocate memory for above_row buffers. */
     CALLOC_ARRAY(pbi->mt_yabove_row, pc->mb_rows);
@@ -792,9 +775,9 @@ void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows) {
 
 void vp8_decoder_remove_threads(VP8D_COMP *pbi) {
   /* shutdown MB Decoding thread; */
-  if (protected_read(&pbi->mt_mutex, &pbi->b_multithreaded_rd)) {
+  if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd)) {
     int i;
-    protected_write(&pbi->mt_mutex, &pbi->b_multithreaded_rd, 0);
+    vpx_atomic_store_release(&pbi->b_multithreaded_rd, 0);
 
     /* allow all threads to exit */
     for (i = 0; i < pbi->allocated_decoding_thread_count; ++i) {
@@ -824,7 +807,6 @@ void vp8_decoder_remove_threads(VP8D_COMP *pbi) {
 
     vp8mt_de_alloc_temp_buffers(pbi, pbi->common.mb_rows);
   }
-  pthread_mutex_destroy(&pbi->mt_mutex);
 }
 
 void vp8mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd) {
diff --git a/vp8/encoder/bitstream.c b/vp8/encoder/bitstream.c
index 7086faae9..d7a17b749 100644
--- a/vp8/encoder/bitstream.c
+++ b/vp8/encoder/bitstream.c
@@ -1416,7 +1416,7 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest,
     vp8_start_encode(&cpi->bc[1], cx_data, cx_data_end);
 
 #if CONFIG_MULTITHREAD
-    if (cpi->b_multi_threaded) {
+    if (vpx_atomic_load_acquire(&cpi->b_multi_threaded)) {
       pack_mb_row_tokens(cpi, &cpi->bc[1]);
     } else {
       vp8_pack_tokens(&cpi->bc[1], cpi->tok, cpi->tok_count);
diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c
index b867f6cb1..9bb0df72d 100644
--- a/vp8/encoder/encodeframe.c
+++ b/vp8/encoder/encodeframe.c
@@ -341,11 +341,11 @@ static void encode_mb_row(VP8_COMP *cpi, VP8_COMMON *cm, int mb_row,
 
 #if CONFIG_MULTITHREAD
   const int nsync = cpi->mt_sync_range;
-  const int rightmost_col = cm->mb_cols + nsync;
-  const int *last_row_current_mb_col;
-  int *current_mb_col = &cpi->mt_current_mb_col[mb_row];
+  vpx_atomic_int rightmost_col = VPX_ATOMIC_INIT(cm->mb_cols + nsync);
+  const vpx_atomic_int *last_row_current_mb_col;
+  vpx_atomic_int *current_mb_col = &cpi->mt_current_mb_col[mb_row];
 
-  if ((cpi->b_multi_threaded != 0) && (mb_row != 0)) {
+  if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) != 0 && mb_row != 0) {
     last_row_current_mb_col = &cpi->mt_current_mb_col[mb_row - 1];
   } else {
     last_row_current_mb_col = &rightmost_col;
@@ -415,15 +415,13 @@ static void encode_mb_row(VP8_COMP *cpi, VP8_COMMON *cm, int mb_row,
     vp8_copy_mem16x16(x->src.y_buffer, x->src.y_stride, x->thismb, 16);
 
 #if CONFIG_MULTITHREAD
-    if (cpi->b_multi_threaded != 0) {
+    if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) != 0) {
       if (((mb_col - 1) % nsync) == 0) {
-        pthread_mutex_t *mutex = &cpi->pmutex[mb_row];
-        protected_write(mutex, current_mb_col, mb_col - 1);
+        vpx_atomic_store_release(current_mb_col, mb_col - 1);
       }
 
      if (mb_row && !(mb_col & (nsync - 1))) {
-        pthread_mutex_t *mutex = &cpi->pmutex[mb_row - 1];
-        sync_read(mutex, mb_col, last_row_current_mb_col, nsync);
+        vp8_atomic_spin_wait(mb_col, last_row_current_mb_col, nsync);
      }
    }
 #endif
@@ -563,8 +561,9 @@ static void encode_mb_row(VP8_COMP *cpi, VP8_COMMON *cm, int mb_row,
                     xd->dst.u_buffer + 8, xd->dst.v_buffer + 8);
 
 #if CONFIG_MULTITHREAD
-  if (cpi->b_multi_threaded != 0) {
-    protected_write(&cpi->pmutex[mb_row], current_mb_col, rightmost_col);
+  if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) != 0) {
+    vpx_atomic_store_release(current_mb_col,
+                             vpx_atomic_load_acquire(&rightmost_col));
   }
 #endif
 
@@ -749,13 +748,14 @@ void vp8_encode_frame(VP8_COMP *cpi) {
     vpx_usec_timer_start(&emr_timer);
 
 #if CONFIG_MULTITHREAD
-    if (cpi->b_multi_threaded) {
+    if (vpx_atomic_load_acquire(&cpi->b_multi_threaded)) {
      int i;
 
      vp8cx_init_mbrthread_data(cpi, x, cpi->mb_row_ei,
                                cpi->encoding_thread_count);
 
-      for (i = 0; i < cm->mb_rows; ++i) cpi->mt_current_mb_col[i] = -1;
+      for (i = 0; i < cm->mb_rows; ++i)
+        vpx_atomic_store_release(&cpi->mt_current_mb_col[i], -1);
 
      for (i = 0; i < cpi->encoding_thread_count; ++i) {
        sem_post(&cpi->h_event_start_encoding[i]);
diff --git a/vp8/encoder/ethreading.c b/vp8/encoder/ethreading.c
index 3e5b709e0..55a1528b1 100644
--- a/vp8/encoder/ethreading.c
+++ b/vp8/encoder/ethreading.c
@@ -26,11 +26,11 @@ static THREAD_FUNCTION thread_loopfilter(void *p_data) {
   VP8_COMMON *cm = &cpi->common;
 
   while (1) {
-    if (protected_read(&cpi->mt_mutex, &cpi->b_multi_threaded) == 0) break;
+    if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) == 0) break;
 
     if (sem_wait(&cpi->h_event_start_lpf) == 0) {
       /* we're shutting down */
-      if (protected_read(&cpi->mt_mutex, &cpi->b_multi_threaded) == 0) break;
+      if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) == 0) break;
 
       vp8_loopfilter_frame(cpi, cm);
 
@@ -48,7 +48,7 @@ static THREAD_FUNCTION thread_encoding_proc(void *p_data) {
   ENTROPY_CONTEXT_PLANES mb_row_left_context;
 
   while (1) {
-    if (protected_read(&cpi->mt_mutex, &cpi->b_multi_threaded) == 0) break;
+    if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) == 0) break;
 
     if (sem_wait(&cpi->h_event_start_encoding[ithread]) == 0) {
       const int nsync = cpi->mt_sync_range;
@@ -66,7 +66,7 @@ static THREAD_FUNCTION thread_encoding_proc(void *p_data) {
       int *totalrate = &mbri->totalrate;
 
       /* we're shutting down */
-      if (protected_read(&cpi->mt_mutex, &cpi->b_multi_threaded) == 0) break;
+      if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) == 0) break;
 
       xd->mode_info_context = cm->mi + cm->mode_info_stride * (ithread + 1);
       xd->mode_info_stride = cm->mode_info_stride;
@@ -80,8 +80,8 @@ static THREAD_FUNCTION thread_encoding_proc(void *p_data) {
         int recon_y_stride = cm->yv12_fb[ref_fb_idx].y_stride;
         int recon_uv_stride = cm->yv12_fb[ref_fb_idx].uv_stride;
         int map_index = (mb_row * cm->mb_cols);
-        const int *last_row_current_mb_col;
-        int *current_mb_col = &cpi->mt_current_mb_col[mb_row];
+        const vpx_atomic_int *last_row_current_mb_col;
+        vpx_atomic_int *current_mb_col = &cpi->mt_current_mb_col[mb_row];
 
 #if (CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING)
         vp8_writer *w = &cpi->bc[1 + (mb_row % num_part)];
@@ -108,13 +108,11 @@ static THREAD_FUNCTION thread_encoding_proc(void *p_data) {
         /* for each macroblock col in image */
         for (mb_col = 0; mb_col < cm->mb_cols; ++mb_col) {
           if (((mb_col - 1) % nsync) == 0) {
-            pthread_mutex_t *mutex = &cpi->pmutex[mb_row];
-            protected_write(mutex, current_mb_col, mb_col - 1);
+            vpx_atomic_store_release(current_mb_col, mb_col - 1);
           }
 
           if (mb_row && !(mb_col & (nsync - 1))) {
-            pthread_mutex_t *mutex = &cpi->pmutex[mb_row - 1];
-            sync_read(mutex, mb_col, last_row_current_mb_col, nsync);
+            vp8_atomic_spin_wait(mb_col, last_row_current_mb_col, nsync);
          }
 
 #if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
@@ -286,7 +284,7 @@ static THREAD_FUNCTION thread_encoding_proc(void *p_data) {
         vp8_extend_mb_row(&cm->yv12_fb[dst_fb_idx], xd->dst.y_buffer + 16,
                           xd->dst.u_buffer + 8, xd->dst.v_buffer + 8);
 
-        protected_write(&cpi->pmutex[mb_row], current_mb_col, mb_col + nsync);
+        vpx_atomic_store_release(current_mb_col, mb_col + nsync);
 
         /* this is to account for the border */
         xd->mode_info_context++;
@@ -490,12 +488,10 @@ void vp8cx_init_mbrthread_data(VP8_COMP *cpi, MACROBLOCK *x,
 int vp8cx_create_encoder_threads(VP8_COMP *cpi) {
   const VP8_COMMON *cm = &cpi->common;
 
-  cpi->b_multi_threaded = 0;
+  vpx_atomic_init(&cpi->b_multi_threaded, 0);
   cpi->encoding_thread_count = 0;
   cpi->b_lpf_running = 0;
 
-  pthread_mutex_init(&cpi->mt_mutex, NULL);
-
   if (cm->processor_core_count > 1 && cpi->oxcf.multi_threaded > 1) {
     int ithread;
     int th_count = cpi->oxcf.multi_threaded - 1;
@@ -526,7 +522,7 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) {
     CHECK_MEM_ERROR(cpi->en_thread_data,
                     vpx_malloc(sizeof(ENCODETHREAD_DATA) * th_count));
 
-    cpi->b_multi_threaded = 1;
+    vpx_atomic_store_release(&cpi->b_multi_threaded, 1);
     cpi->encoding_thread_count = th_count;
 
     /*
@@ -555,7 +551,7 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) {
 
       if (rc) {
         /* shutdown other threads */
-        protected_write(&cpi->mt_mutex, &cpi->b_multi_threaded, 0);
+        vpx_atomic_store_release(&cpi->b_multi_threaded, 0);
         for (--ithread; ithread >= 0; ithread--) {
           pthread_join(cpi->h_encoding_thread[ithread], 0);
           sem_destroy(&cpi->h_event_start_encoding[ithread]);
@@ -569,8 +565,6 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) {
         vpx_free(cpi->mb_row_ei);
         vpx_free(cpi->en_thread_data);
 
-        pthread_mutex_destroy(&cpi->mt_mutex);
-
         return -1;
       }
 
@@ -585,7 +579,7 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) {
 
     if (rc) {
       /* shutdown other threads */
-      protected_write(&cpi->mt_mutex, &cpi->b_multi_threaded, 0);
+      vpx_atomic_store_release(&cpi->b_multi_threaded, 0);
      for (--ithread; ithread >= 0; ithread--) {
        sem_post(&cpi->h_event_start_encoding[ithread]);
        sem_post(&cpi->h_event_end_encoding[ithread]);
@@ -603,8 +597,6 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) {
      vpx_free(cpi->mb_row_ei);
      vpx_free(cpi->en_thread_data);
 
-      pthread_mutex_destroy(&cpi->mt_mutex);
-
      return -2;
    }
  }
@@ -613,9 +605,9 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) {
 }
 
 void vp8cx_remove_encoder_threads(VP8_COMP *cpi) {
-  if (protected_read(&cpi->mt_mutex, &cpi->b_multi_threaded)) {
+  if (vpx_atomic_load_acquire(&cpi->b_multi_threaded)) {
     /* shutdown other threads */
-    protected_write(&cpi->mt_mutex, &cpi->b_multi_threaded, 0);
+    vpx_atomic_store_release(&cpi->b_multi_threaded, 0);
 
     {
       int i;
@@ -643,6 +635,5 @@ void vp8cx_remove_encoder_threads(VP8_COMP *cpi) {
     vpx_free(cpi->mb_row_ei);
     vpx_free(cpi->en_thread_data);
   }
-  pthread_mutex_destroy(&cpi->mt_mutex);
 }
 #endif
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index f68fa22af..725e000e2 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -451,18 +451,6 @@ static void dealloc_compressor_data(VP8_COMP *cpi) {
   cpi->mb.pip = 0;
 
 #if CONFIG_MULTITHREAD
-  /* De-allocate mutex */
-  if (cpi->pmutex != NULL) {
-    VP8_COMMON *const pc = &cpi->common;
-    int i;
-
-    for (i = 0; i < pc->mb_rows; ++i) {
-      pthread_mutex_destroy(&cpi->pmutex[i]);
-    }
-    vpx_free(cpi->pmutex);
-    cpi->pmutex = NULL;
-  }
-
   vpx_free(cpi->mt_current_mb_col);
   cpi->mt_current_mb_col = NULL;
 #endif
@@ -1153,9 +1141,6 @@ void vp8_alloc_compressor_data(VP8_COMP *cpi) {
 
   int width = cm->Width;
   int height = cm->Height;
-#if CONFIG_MULTITHREAD
-  int prev_mb_rows = cm->mb_rows;
-#endif
 
   if (vp8_alloc_frame_buffers(cm, width, height)) {
     vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
@@ -1247,26 +1232,11 @@ void vp8_alloc_compressor_data(VP8_COMP *cpi) {
   if (cpi->oxcf.multi_threaded > 1) {
     int i;
 
-    /* De-allocate and re-allocate mutex */
-    if (cpi->pmutex != NULL) {
-      for (i = 0; i < prev_mb_rows; ++i) {
-        pthread_mutex_destroy(&cpi->pmutex[i]);
-      }
-      vpx_free(cpi->pmutex);
-      cpi->pmutex = NULL;
-    }
-
-    CHECK_MEM_ERROR(cpi->pmutex,
-                    vpx_malloc(sizeof(*cpi->pmutex) * cm->mb_rows));
-    if (cpi->pmutex) {
-      for (i = 0; i < cm->mb_rows; ++i) {
-        pthread_mutex_init(&cpi->pmutex[i], NULL);
-      }
-    }
-
     vpx_free(cpi->mt_current_mb_col);
     CHECK_MEM_ERROR(cpi->mt_current_mb_col,
                     vpx_malloc(sizeof(*cpi->mt_current_mb_col) * cm->mb_rows));
+    for (i = 0; i < cm->mb_rows; ++i)
+      vpx_atomic_init(&cpi->mt_current_mb_col[i], 0);
   }
 #endif
 
@@ -3274,7 +3244,7 @@ void vp8_loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm) {
   }
 
 #if CONFIG_MULTITHREAD
-  if (cpi->b_multi_threaded) {
+  if (vpx_atomic_load_acquire(&cpi->b_multi_threaded)) {
     sem_post(&cpi->h_event_end_lpf); /* signal that we have set filter_level */
   }
 #endif
@@ -4471,7 +4441,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size,
 #endif
 
 #if CONFIG_MULTITHREAD
-    if (cpi->b_multi_threaded) {
+    if (vpx_atomic_load_acquire(&cpi->b_multi_threaded)) {
      /* start loopfilter in separate thread */
      sem_post(&cpi->h_event_start_lpf);
      cpi->b_lpf_running = 1;
@@ -4497,7 +4467,8 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size,
 #if CONFIG_MULTITHREAD
   /* wait that filter_level is picked so that we can continue with stream
    * packing */
-  if (cpi->b_multi_threaded) sem_wait(&cpi->h_event_end_lpf);
+  if (vpx_atomic_load_acquire(&cpi->b_multi_threaded))
+    sem_wait(&cpi->h_event_end_lpf);
 #endif
 
   /* build the bitstream */
@@ -5341,7 +5312,7 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags,
 
 #if CONFIG_MULTITHREAD
   /* wait for the lpf thread done */
-  if (cpi->b_multi_threaded && cpi->b_lpf_running) {
+  if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) && cpi->b_lpf_running) {
     sem_wait(&cpi->h_event_end_lpf);
     cpi->b_lpf_running = 0;
   }
diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h
index 08f07851e..0ee2d3553 100644
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@@ -518,11 +518,9 @@ typedef struct VP8_COMP {
 
 #if CONFIG_MULTITHREAD
   /* multithread data */
-  pthread_mutex_t *pmutex;
-  pthread_mutex_t mt_mutex; /* mutex for b_multi_threaded */
-  int *mt_current_mb_col;
+  vpx_atomic_int *mt_current_mb_col;
   int mt_sync_range;
-  int b_multi_threaded;
+  vpx_atomic_int b_multi_threaded;
   int encoding_thread_count;
   int b_lpf_running;
 
diff --git a/vp8/vp8_dx_iface.c b/vp8/vp8_dx_iface.c
index 987a5b8a4..292877252 100644
--- a/vp8/vp8_dx_iface.c
+++ b/vp8/vp8_dx_iface.c
@@ -415,7 +415,7 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx,
 #endif
 
 #if CONFIG_MULTITHREAD
-      if (pbi->b_multithreaded_rd) {
+      if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd)) {
         vp8mt_alloc_temp_buffers(pbi, pc->Width, prev_mb_rows);
       }
 #else
diff --git a/vpx_util/vpx_atomics.h b/vpx_util/vpx_atomics.h
new file mode 100644
index 000000000..a471fd186
--- /dev/null
+++ b/vpx_util/vpx_atomics.h
@@ -0,0 +1,133 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_UTIL_VPX_ATOMICS_H_
+#define VPX_UTIL_VPX_ATOMICS_H_
+
+#include "./vpx_config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+#if CONFIG_OS_SUPPORT && CONFIG_MULTITHREAD
+
+#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) || \
+    (defined(__cplusplus) && __cplusplus >= 201112L)
+// Where available, use <stdatomic.h>.
+#include <stdatomic.h>
+#define VPX_USE_STD_ATOMIC
+#else
+// Look for built-ins.
+#if !defined(__has_builtin)
+#define __has_builtin(x) 0  // Compatibility with non-clang compilers.
+#endif  // !defined(__has_builtin)
+
+#if (__has_builtin(__atomic_load_n)) ||                        \
+    (defined(__GNUC__) &&                                      \
+     (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 7)))
+// For GCC >= 4.7 and Clang that support __atomic builtins, use those.
+#define VPX_USE_ATOMIC_BUILTINS
+#else
+// Use platform-specific asm barriers.
+#if defined(_MSC_VER)
+// TODO(pbos): This assumes that newer versions of MSVC are building with the
+// default /volatile:ms (or older, where this is always true). Consider adding
+// support for using <atomic> instead of stdatomic.h when building C++11 under
+// MSVC. It's unclear what to do for plain C under /volatile:iso (inline asm?);
+// there are no explicit Interlocked* functions for only storing or loading
+// (presumably because volatile has historically implied that on MSVC).
+//
+// For earlier versions of MSVC, or the default /volatile:ms, volatile int
+// accesses are acquire/release and require no barrier.
+#define vpx_atomic_memory_barrier() \
+  do {                              \
+  } while (0)
+#else
+#if ARCH_X86 || ARCH_X86_64
+// Use a compiler barrier on x86, no runtime penalty.
+#define vpx_atomic_memory_barrier() __asm__ __volatile__("" ::: "memory")
+#elif ARCH_ARM
+#define vpx_atomic_memory_barrier() __asm__ __volatile__("dmb ish" ::: "memory")
+#elif ARCH_MIPS
+#define vpx_atomic_memory_barrier() __asm__ __volatile__("sync" ::: "memory")
+#else
+#error Unsupported architecture!
+#endif  // ARCH_X86 || ARCH_X86_64
+#endif  // defined(_MSC_VER)
+#endif  // atomic builtin availability check
+#endif  // stdatomic availability check
+
+// These are wrapped in a struct so that they are not easily accessed directly
+// on any platform (to discourage programmer errors by setting values directly).
+// This primitive MUST be initialized using vpx_atomic_init or VPX_ATOMIC_INIT
+// (NOT memset) and accessed through vpx_atomic_ functions.
+typedef struct vpx_atomic_int {
+#if defined(VPX_USE_STD_ATOMIC)
+  atomic_int value;
+#else
+  volatile int value;
+#endif  // defined(VPX_USE_STD_ATOMIC)
+} vpx_atomic_int;
+
+#if defined(VPX_USE_STD_ATOMIC)
+#define VPX_ATOMIC_INIT(num) \
+  { ATOMIC_VAR_INIT(num) }
+#else
+#define VPX_ATOMIC_INIT(num) \
+  { num }
+#endif  // defined(VPX_USE_STD_ATOMIC)
+
+// Initialization of an atomic int, not thread safe.
+static INLINE void vpx_atomic_init(vpx_atomic_int *atomic, int value) {
+#if defined(VPX_USE_STD_ATOMIC)
+  atomic_init(&atomic->value, value);
+#else
+  atomic->value = value;
+#endif  // defined(VPX_USE_STD_ATOMIC)
+}
+
+static INLINE void vpx_atomic_store_release(vpx_atomic_int *atomic, int value) {
+#if defined(VPX_USE_STD_ATOMIC)
+  atomic_store_explicit(&atomic->value, value, memory_order_release);
+#elif defined(VPX_USE_ATOMIC_BUILTINS)
+  __atomic_store_n(&atomic->value, value, __ATOMIC_RELEASE);
+#else
+  vpx_atomic_memory_barrier();
+  atomic->value = value;
+#endif  // defined(VPX_USE_STD_ATOMIC)
+}
+
+static INLINE int vpx_atomic_load_acquire(const vpx_atomic_int *atomic) {
+#if defined(VPX_USE_STD_ATOMIC)
+  // const_cast (in C) that doesn't trigger -Wcast-qual.
+  return atomic_load_explicit(
+      (atomic_int *)(uintptr_t)(const void *)&atomic->value,
+      memory_order_acquire);
+#elif defined(VPX_USE_ATOMIC_BUILTINS)
+  return __atomic_load_n(&atomic->value, __ATOMIC_ACQUIRE);
+#else
+  int v = atomic->value;
+  vpx_atomic_memory_barrier();
+  return v;
+#endif  // defined(VPX_USE_STD_ATOMIC)
+}
+
+#undef VPX_USE_STD_ATOMIC
+#undef VPX_USE_ATOMIC_BUILTINS
+#undef vpx_atomic_memory_barrier
+
+#endif /* CONFIG_OS_SUPPORT && CONFIG_MULTITHREAD */
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // VPX_UTIL_VPX_ATOMICS_H_
diff --git a/vpx_util/vpx_util.mk b/vpx_util/vpx_util.mk
index d48e4cc2f..86d3ece3c 100644
--- a/vpx_util/vpx_util.mk
+++ b/vpx_util/vpx_util.mk
@@ -8,6 +8,7 @@
 ##  be found in the AUTHORS file in the root of the source tree.
 ##
 
+UTIL_SRCS-yes += vpx_atomics.h
 UTIL_SRCS-yes += vpx_util.mk
 UTIL_SRCS-yes += vpx_thread.c
 UTIL_SRCS-yes += vpx_thread.h