vpx/vp10/encoder/ethread.c

/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "vp10/encoder/encodeframe.h"
#include "vp10/encoder/encoder.h"
#include "vp10/encoder/ethread.h"
#include "vpx_dsp/vpx_dsp_common.h"

// Adds the rate-distortion counters collected in td_t onto the totals kept
// in td. td_t is only read; every field is summed element by element.
static void accumulate_rd_opt(ThreadData *td, ThreadData *td_t) {
  int ref_mode, filter_ctx;
  int tx_size, plane, ref, band, ctx, token;

  // Counts of all motion searches and exhaustive mesh searches.
  td->rd_counts.m_search_count += td_t->rd_counts.m_search_count;
  td->rd_counts.ex_search_count += td_t->rd_counts.ex_search_count;

  for (ref_mode = 0; ref_mode < REFERENCE_MODES; ++ref_mode) {
    td->rd_counts.comp_pred_diff[ref_mode] +=
        td_t->rd_counts.comp_pred_diff[ref_mode];
  }

  for (filter_ctx = 0; filter_ctx < SWITCHABLE_FILTER_CONTEXTS; ++filter_ctx) {
    td->rd_counts.filter_diff[filter_ctx] +=
        td_t->rd_counts.filter_diff[filter_ctx];
  }

  // Coefficient token counts, one entry per
  // [tx size][plane type][ref type][band][context][token].
  for (tx_size = 0; tx_size < TX_SIZES; ++tx_size) {
    for (plane = 0; plane < PLANE_TYPES; ++plane) {
      for (ref = 0; ref < REF_TYPES; ++ref) {
        for (band = 0; band < COEF_BANDS; ++band) {
          for (ctx = 0; ctx < COEFF_CONTEXTS; ++ctx) {
            for (token = 0; token < ENTROPY_TOKENS; ++token) {
              td->rd_counts.coef_counts[tx_size][plane][ref][band][ctx]
                                       [token] +=
                  td_t->rd_counts.coef_counts[tx_size][plane][ref][band][ctx]
                                             [token];
            }
          }
        }
      }
    }
  }
}

static int enc_worker_hook(EncWorkerData *const thread_data, void *unused) {
  VP10_COMP *const cpi = thread_data->cpi;
  const VP10_COMMON *const cm = &cpi->common;
  const int tile_cols = 1 << cm->log2_tile_cols;
  const int tile_rows = 1 << cm->log2_tile_rows;
  int t;

  (void) unused;

  for (t = thread_data->start; t < tile_rows * tile_cols;
      t += cpi->num_workers) {
    int tile_row = t / tile_cols;
    int tile_col = t % tile_cols;

    vp10_encode_tile(cpi, thread_data->td, tile_row, tile_col);
  }

  return 0;
}

// Encodes the tiles of the current frame with a pool of workers.
//
// The number of workers used is VPXMIN(cpi->oxcf.max_threads, tile_cols).
// On the first call (cpi->num_workers == 0) the worker array and the
// per-worker EncWorkerData array are allocated and stored in cpi; later calls
// reuse them. Every worker except the last gets its own ThreadData, the last
// one uses cpi->td.
//
// Each call then
//   1. copies cpi->td.mb, cpi->td.rd_counts and cpi->common.counts into every
//      private ThreadData,
//   2. starts the workers with enc_worker_hook, worker i beginning at tile i,
//   3. waits for every worker with winterface->sync(), and
//   4. passes each private counts buffer to vp10_accumulate_frame_counts()
//      and each private ThreadData to accumulate_rd_opt().
//
// Allocation failures are reported through CHECK_MEM_ERROR; a failing
// winterface->reset() is reported through vpx_internal_error().
//
// NOTE(review): the per-frame loops below index cpi->workers and
// cpi->tile_thr_data with the freshly computed num_workers, while those
// arrays are sized only on the first call and enc_worker_hook steps through
// the tiles by cpi->num_workers. This appears to rely on
// VPXMIN(max_threads, tile_cols) staying the same after the first call --
// confirm against the callers.
void vp10_encode_tiles_mt(VP10_COMP *cpi) {
  VP10_COMMON *const cm = &cpi->common;
  const int tile_cols = 1 << cm->log2_tile_cols;
  const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
  const int num_workers = VPXMIN(cpi->oxcf.max_threads, tile_cols);
  int i;

  vp10_init_tile_data(cpi);

  // Only run once to create threads and allocate thread data.
  if (cpi->num_workers == 0) {
    int allocated_workers = num_workers;

    CHECK_MEM_ERROR(cm, cpi->workers,
                    vpx_malloc(allocated_workers * sizeof(*cpi->workers)));

    // Zero-initialised, so every thread_data->td starts out NULL.
    CHECK_MEM_ERROR(cm, cpi->tile_thr_data,
                    vpx_calloc(allocated_workers,
                    sizeof(*cpi->tile_thr_data)));

    for (i = 0; i < allocated_workers; i++) {
      VPxWorker *const worker = &cpi->workers[i];
      EncWorkerData *thread_data = &cpi->tile_thr_data[i];

      // Counted before the allocations below; presumably so that teardown
      // still sees this worker if one of them fails -- TODO confirm.
      ++cpi->num_workers;
      winterface->init(worker);

      // All workers but the last one get private thread data.
      if (i < allocated_workers - 1) {
        thread_data->cpi = cpi;

        // Allocate thread data.
        CHECK_MEM_ERROR(cm, thread_data->td,
                        vpx_memalign(32, sizeof(*thread_data->td)));
        vp10_zero(*thread_data->td);

        // Set up pc_tree.
        thread_data->td->leaf_tree = NULL;
        thread_data->td->pc_tree = NULL;
        vp10_setup_pc_tree(cm, thread_data->td);

        // Allocate frame counters in thread data.
        CHECK_MEM_ERROR(cm, thread_data->td->counts,
                        vpx_calloc(1, sizeof(*thread_data->td->counts)));

        // Create threads
        if (!winterface->reset(worker))
          vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
                             "Tile encoder thread creation failed");
      } else {
        // Main thread acts as a worker and uses the thread data in cpi.
        thread_data->cpi = cpi;
        thread_data->td = &cpi->td;
      }

      winterface->sync(worker);
    }
  }

  // Per-frame setup: attach the hook and its argument to every worker and
  // refresh the private thread data from the encoder state.
  for (i = 0; i < num_workers; i++) {
    VPxWorker *const worker = &cpi->workers[i];
    EncWorkerData *thread_data;

    worker->hook = (VPxWorkerHook)enc_worker_hook;
    worker->data1 = &cpi->tile_thr_data[i];
    worker->data2 = NULL;
    thread_data = (EncWorkerData*)worker->data1;

    // Before encoding a frame, copy the thread data from cpi.
    if (thread_data->td != &cpi->td) {
      thread_data->td->mb = cpi->td.mb;
      thread_data->td->rd_counts = cpi->td.rd_counts;
    }
    // Skipped when the thread data already points at the shared counters.
    if (thread_data->td->counts != &cpi->common.counts) {
      memcpy(thread_data->td->counts, &cpi->common.counts,
             sizeof(cpi->common.counts));
    }
  }

  // Encode a frame
  for (i = 0; i < num_workers; i++) {
    VPxWorker *const worker = &cpi->workers[i];
    EncWorkerData *const thread_data = (EncWorkerData*)worker->data1;

    // Set the starting tile for each thread.
    thread_data->start = i;

    // The worker that uses cpi->td (index cpi->num_workers - 1) goes through
    // execute(); all others go through launch(). Presumably execute() runs
    // the hook on the calling thread and launch() hands it to the worker's
    // own thread -- confirm against VPxWorkerInterface.
    if (i == cpi->num_workers - 1)
      winterface->execute(worker);
    else
      winterface->launch(worker);
  }

  // Encoding ends.
  for (i = 0; i < num_workers; i++) {
    VPxWorker *const worker = &cpi->workers[i];
    winterface->sync(worker);
  }

  for (i = 0; i < num_workers; i++) {
    VPxWorker *const worker = &cpi->workers[i];
    EncWorkerData *const thread_data = (EncWorkerData*)worker->data1;

    // Accumulate counters.
    // The last worker is skipped: it encoded with cpi->td itself, so it has
    // no private data to merge.
    if (i < cpi->num_workers - 1) {
      vp10_accumulate_frame_counts(cm, thread_data->td->counts, 0);
      accumulate_rd_opt(&cpi->td, thread_data->td);
    }
  }
}
Fork VP9 and VP10 codebase This commit folks the VP9 and VP10 codebase and makes libvpx support VP8, VP9, and VP10. Change-Id: I81782e0b809acb3c9844bee8c8ec8f4d5e8fa356 2015-08-06 04:00:31 +02:00			`/*`
			`* Copyright (c) 2014 The WebM project authors. All Rights Reserved.`
			`*`
			`* Use of this source code is governed by a BSD-style license`
			`* that can be found in the LICENSE file in the root of the source`
			`* tree. An additional intellectual property rights grant can be found`
			`* in the file PATENTS. All contributing project authors may`
			`* be found in the AUTHORS file in the root of the source tree.`
			`*/`

Remove vp9_ prefix from vp10 files Remove the vp9_ prefix from vp10 file names. Change-Id: I513a211b286a57d6126fc1b0fbfd6405120014f1 2015-08-07 06:14:07 +02:00			`#include "vp10/encoder/encodeframe.h"`
			`#include "vp10/encoder/encoder.h"`
			`#include "vp10/encoder/ethread.h"`
Include vpx_dsp_common.h when using VPXMIN/MAX Change-Id: I2e387a06484a06301f3cd6600c4ba2f4335b61ee 2015-08-31 23:36:35 +02:00			`#include "vpx_dsp/vpx_dsp_common.h"`
Fork VP9 and VP10 codebase This commit folks the VP9 and VP10 codebase and makes libvpx support VP8, VP9, and VP10. Change-Id: I81782e0b809acb3c9844bee8c8ec8f4d5e8fa356 2015-08-06 04:00:31 +02:00
			`static void accumulate_rd_opt(ThreadData *td, ThreadData *td_t) {`
			`int i, j, k, l, m, n;`

			`for (i = 0; i < REFERENCE_MODES; i++)`
			`td->rd_counts.comp_pred_diff[i] += td_t->rd_counts.comp_pred_diff[i];`

			`for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)`
			`td->rd_counts.filter_diff[i] += td_t->rd_counts.filter_diff[i];`

			`for (i = 0; i < TX_SIZES; i++)`
			`for (j = 0; j < PLANE_TYPES; j++)`
			`for (k = 0; k < REF_TYPES; k++)`
			`for (l = 0; l < COEF_BANDS; l++)`
			`for (m = 0; m < COEFF_CONTEXTS; m++)`
			`for (n = 0; n < ENTROPY_TOKENS; n++)`
			`td->rd_counts.coef_counts[i][j][k][l][m][n] +=`
			`td_t->rd_counts.coef_counts[i][j][k][l][m][n];`
Changes to exhaustive motion search. This change has been imported from VP9 and alters the nature and use of exhaustive motion search. Firstly any exhaustive search is preceded by a normal step search. The exhaustive search is only carried out if the distortion resulting from the step search is above a threshold value. Secondly the simple +/- 64 exhaustive search is replaced by a multi stage mesh based search where each stage has a range and step/interval size. Subsequent stages use the best position from the previous stage as the center of the search but use a reduced range and interval size. For example: stage 1: Range +/- 64 interval 4 stage 2: Range +/- 32 interval 2 stage 3: Range +/- 15 interval 1 This process, especially when it follows on from a normal step search, has shown itself to be almost as effective as a full range exhaustive search with step 1 but greatly lowers the computational complexity such that it can be used in some cases for speeds 0-2. This patch also removes a double exhaustive search for sub 8x8 blocks which also contained a bug (the two searches used different distortion metrics). For best quality in my test animation sequence this patch has almost no impact on quality but improves encode speed by more than 5X. Restricted use in good quality speeds 0-2 yields significant quality gains on the animation test of 0.2 - 0.5 db with only a small impact on encode speed. On most natural video clips, however, where the step search is performing well, the quality gain and speed impact are small. Change-Id: Iac24152ae239f42a246f39ee5f00fe62d193cb98 2015-12-08 16:48:24 +01:00

			`// Counts of all motion searches and exhaustive mesh searches.`
			`td->rd_counts.m_search_count += td_t->rd_counts.m_search_count;`
			`td->rd_counts.ex_search_count += td_t->rd_counts.ex_search_count;`
Fork VP9 and VP10 codebase This commit folks the VP9 and VP10 codebase and makes libvpx support VP8, VP9, and VP10. Change-Id: I81782e0b809acb3c9844bee8c8ec8f4d5e8fa356 2015-08-06 04:00:31 +02:00			`}`

			`static int enc_worker_hook(EncWorkerData *const thread_data, void *unused) {`
VP9_COMP -> VP10_COMP Change-Id: I83b5c69621f9f28b742e5b13517d4e5b99c6cd26 2015-08-13 18:42:27 +02:00			`VP10_COMP *const cpi = thread_data->cpi;`
VP9_COMMON -> VP10->COMMON Change-Id: I651b7bee90f33581368853da81f9622805ccc0ea 2015-08-13 18:36:53 +02:00			`const VP10_COMMON *const cm = &cpi->common;`
Fork VP9 and VP10 codebase This commit folks the VP9 and VP10 codebase and makes libvpx support VP8, VP9, and VP10. Change-Id: I81782e0b809acb3c9844bee8c8ec8f4d5e8fa356 2015-08-06 04:00:31 +02:00			`const int tile_cols = 1 << cm->log2_tile_cols;`
			`const int tile_rows = 1 << cm->log2_tile_rows;`
			`int t;`

			`(void) unused;`

			`for (t = thread_data->start; t < tile_rows * tile_cols;`
			`t += cpi->num_workers) {`
			`int tile_row = t / tile_cols;`
			`int tile_col = t % tile_cols;`

			`vp10_encode_tile(cpi, thread_data->td, tile_row, tile_col);`
			`}`

			`return 0;`
			`}`

VP9_COMP -> VP10_COMP Change-Id: I83b5c69621f9f28b742e5b13517d4e5b99c6cd26 2015-08-13 18:42:27 +02:00			`void vp10_encode_tiles_mt(VP10_COMP *cpi) {`
VP9_COMMON -> VP10->COMMON Change-Id: I651b7bee90f33581368853da81f9622805ccc0ea 2015-08-13 18:36:53 +02:00			`VP10_COMMON *const cm = &cpi->common;`
Fork VP9 and VP10 codebase This commit folks the VP9 and VP10 codebase and makes libvpx support VP8, VP9, and VP10. Change-Id: I81782e0b809acb3c9844bee8c8ec8f4d5e8fa356 2015-08-06 04:00:31 +02:00			`const int tile_cols = 1 << cm->log2_tile_cols;`
			`const VPxWorkerInterface *const winterface = vpx_get_worker_interface();`
vpx_dsp_common: add VPX prefix to MIN/MAX prevents redeclaration warnings; vp8 has its own define which will be resolved in a future commit Change-Id: Ic941fef3dd4262fcdce48b73075fe6b375f11c9c 2015-08-18 03:19:22 +02:00			`const int num_workers = VPXMIN(cpi->oxcf.max_threads, tile_cols);`
Fork VP9 and VP10 codebase This commit folks the VP9 and VP10 codebase and makes libvpx support VP8, VP9, and VP10. Change-Id: I81782e0b809acb3c9844bee8c8ec8f4d5e8fa356 2015-08-06 04:00:31 +02:00			`int i;`

			`vp10_init_tile_data(cpi);`

			`// Only run once to create threads and allocate thread data.`
			`if (cpi->num_workers == 0) {`
			`int allocated_workers = num_workers;`

			`CHECK_MEM_ERROR(cm, cpi->workers,`
			`vpx_malloc(allocated_workers * sizeof(*cpi->workers)));`

			`CHECK_MEM_ERROR(cm, cpi->tile_thr_data,`
			`vpx_calloc(allocated_workers,`
			`sizeof(*cpi->tile_thr_data)));`

			`for (i = 0; i < allocated_workers; i++) {`
			`VPxWorker *const worker = &cpi->workers[i];`
			`EncWorkerData *thread_data = &cpi->tile_thr_data[i];`

			`++cpi->num_workers;`
			`winterface->init(worker);`

			`if (i < allocated_workers - 1) {`
			`thread_data->cpi = cpi;`

			`// Allocate thread data.`
			`CHECK_MEM_ERROR(cm, thread_data->td,`
			`vpx_memalign(32, sizeof(*thread_data->td)));`
			`vp10_zero(*thread_data->td);`

			`// Set up pc_tree.`
			`thread_data->td->leaf_tree = NULL;`
			`thread_data->td->pc_tree = NULL;`
			`vp10_setup_pc_tree(cm, thread_data->td);`

			`// Allocate frame counters in thread data.`
			`CHECK_MEM_ERROR(cm, thread_data->td->counts,`
			`vpx_calloc(1, sizeof(*thread_data->td->counts)));`

			`// Create threads`
			`if (!winterface->reset(worker))`
			`vpx_internal_error(&cm->error, VPX_CODEC_ERROR,`
			`"Tile encoder thread creation failed");`
			`} else {`
			`// Main thread acts as a worker and uses the thread data in cpi.`
			`thread_data->cpi = cpi;`
			`thread_data->td = &cpi->td;`
			`}`

			`winterface->sync(worker);`
			`}`
			`}`

			`for (i = 0; i < num_workers; i++) {`
			`VPxWorker *const worker = &cpi->workers[i];`
			`EncWorkerData *thread_data;`

			`worker->hook = (VPxWorkerHook)enc_worker_hook;`
			`worker->data1 = &cpi->tile_thr_data[i];`
			`worker->data2 = NULL;`
			`thread_data = (EncWorkerData*)worker->data1;`

			`// Before encoding a frame, copy the thread data from cpi.`
			`if (thread_data->td != &cpi->td) {`
			`thread_data->td->mb = cpi->td.mb;`
			`thread_data->td->rd_counts = cpi->td.rd_counts;`
			`}`
			`if (thread_data->td->counts != &cpi->common.counts) {`
			`memcpy(thread_data->td->counts, &cpi->common.counts,`
			`sizeof(cpi->common.counts));`
			`}`
			`}`

			`// Encode a frame`
			`for (i = 0; i < num_workers; i++) {`
			`VPxWorker *const worker = &cpi->workers[i];`
			`EncWorkerData *const thread_data = (EncWorkerData*)worker->data1;`

			`// Set the starting tile for each thread.`
			`thread_data->start = i;`

			`if (i == cpi->num_workers - 1)`
			`winterface->execute(worker);`
			`else`
			`winterface->launch(worker);`
			`}`

			`// Encoding ends.`
			`for (i = 0; i < num_workers; i++) {`
			`VPxWorker *const worker = &cpi->workers[i];`
			`winterface->sync(worker);`
			`}`

			`for (i = 0; i < num_workers; i++) {`
			`VPxWorker *const worker = &cpi->workers[i];`
			`EncWorkerData *const thread_data = (EncWorkerData*)worker->data1;`

			`// Accumulate counters.`
			`if (i < cpi->num_workers - 1) {`
			`vp10_accumulate_frame_counts(cm, thread_data->td->counts, 0);`
			`accumulate_rd_opt(&cpi->td, thread_data->td);`
			`}`
			`}`
			`}`