added better threads configuration estimator for the minMax, minMaxLoc, countNonZero functions in gpu module

This commit is contained in:
Alexey Spizhevoy 2010-11-30 12:27:21 +00:00
parent 3fd07809c9
commit 32a9b63d2f
2 changed files with 51 additions and 51 deletions

View File

@@ -496,7 +496,7 @@ Scalar cv::gpu::sum(const GpuMat& src)
namespace cv { namespace gpu { namespace mathfunc { namespace minmax { namespace cv { namespace gpu { namespace mathfunc { namespace minmax {
void get_buf_size_required(int elem_size, int& cols, int& rows); void get_buf_size_required(int cols, int rows, int elem_size, int& bufcols, int& bufrows);
template <typename T> template <typename T>
void min_max_caller(const DevMem2D src, double* minval, double* maxval, PtrStep buf); void min_max_caller(const DevMem2D src, double* minval, double* maxval, PtrStep buf);
@@ -551,7 +551,7 @@ void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const Gp
double maxVal_; if (!maxVal) maxVal = &maxVal_; double maxVal_; if (!maxVal) maxVal = &maxVal_;
Size bufSize; Size bufSize;
get_buf_size_required(src.elemSize(), bufSize.width, bufSize.height); get_buf_size_required(src.cols, src.rows, src.elemSize(), bufSize.width, bufSize.height);
buf.create(bufSize, CV_8U); buf.create(bufSize, CV_8U);
if (mask.empty()) if (mask.empty())
@@ -574,8 +574,8 @@ void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const Gp
namespace cv { namespace gpu { namespace mathfunc { namespace minmaxloc { namespace cv { namespace gpu { namespace mathfunc { namespace minmaxloc {
void get_buf_size_required(int elem_size, int& b1cols, int& b1rows, void get_buf_size_required(int cols, int rows, int elem_size, int& b1cols,
int& b2cols, int& b2rows); int& b1rows, int& b2cols, int& b2rows);
template <typename T> template <typename T>
void min_max_loc_caller(const DevMem2D src, double* minval, double* maxval, void min_max_loc_caller(const DevMem2D src, double* minval, double* maxval,
@@ -636,8 +636,8 @@ void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point
int maxLoc_[2]; int maxLoc_[2];
Size valbuf_size, locbuf_size; Size valbuf_size, locbuf_size;
get_buf_size_required(src.elemSize(), valbuf_size.width, valbuf_size.height, get_buf_size_required(src.cols, src.rows, src.elemSize(), valbuf_size.width,
locbuf_size.width, locbuf_size.height); valbuf_size.height, locbuf_size.width, locbuf_size.height);
valbuf.create(valbuf_size, CV_8U); valbuf.create(valbuf_size, CV_8U);
locbuf.create(locbuf_size, CV_8U); locbuf.create(locbuf_size, CV_8U);
@@ -663,7 +663,7 @@ void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point
namespace cv { namespace gpu { namespace mathfunc { namespace countnonzero { namespace cv { namespace gpu { namespace mathfunc { namespace countnonzero {
void get_buf_size_required(int& cols, int& rows); void get_buf_size_required(int cols, int rows, int& bufcols, int& bufrows);
template <typename T> template <typename T>
int count_non_zero_caller(const DevMem2D src, PtrStep buf); int count_non_zero_caller(const DevMem2D src, PtrStep buf);
@@ -697,7 +697,7 @@ int cv::gpu::countNonZero(const GpuMat& src, GpuMat& buf)
CV_Assert(src.type() != CV_64F || hasNativeDoubleSupport(getDevice())); CV_Assert(src.type() != CV_64F || hasNativeDoubleSupport(getDevice()));
Size buf_size; Size buf_size;
get_buf_size_required(buf_size.width, buf_size.height); get_buf_size_required(src.cols, src.rows, buf_size.width, buf_size.height);
buf.create(buf_size, CV_8U); buf.create(buf_size, CV_8U);
Caller caller = callers[hasAtomicsSupport(getDevice())][src.type()]; Caller caller = callers[hasAtomicsSupport(getDevice())][src.type()];

View File

@@ -425,25 +425,25 @@ namespace cv { namespace gpu { namespace mathfunc
// Estimates good thread configuration // Estimates good thread configuration
// - threads variable satisfies to threads.x * threads.y == 256 // - threads variable satisfies to threads.x * threads.y == 256
void estimate_thread_cfg(dim3& threads, dim3& grid) void estimate_thread_cfg(int cols, int rows, dim3& threads, dim3& grid)
{ {
threads = dim3(64, 4); threads = dim3(32, 8);
grid = dim3(6, 5); grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32));
} }
// Returns required buffer sizes // Returns required buffer sizes
void get_buf_size_required(int elem_size, int& cols, int& rows) void get_buf_size_required(int cols, int rows, int elem_size, int& bufcols, int& bufrows)
{ {
dim3 threads, grid; dim3 threads, grid;
estimate_thread_cfg(threads, grid); estimate_thread_cfg(cols, rows, threads, grid);
cols = grid.x * grid.y * elem_size; bufcols = grid.x * grid.y * elem_size;
rows = 2; bufrows = 2;
} }
// Estimates device constants which are used in the kernels using specified thread configuration // Estimates device constants which are used in the kernels using specified thread configuration
void estimate_kernel_consts(int cols, int rows, const dim3& threads, const dim3& grid) void set_kernel_consts(int cols, int rows, const dim3& threads, const dim3& grid)
{ {
int twidth = divUp(divUp(cols, grid.x), threads.x); int twidth = divUp(divUp(cols, grid.x), threads.x);
int theight = divUp(divUp(rows, grid.y), threads.y); int theight = divUp(divUp(rows, grid.y), threads.y);
@@ -567,8 +567,8 @@ namespace cv { namespace gpu { namespace mathfunc
void min_max_mask_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, PtrStep buf) void min_max_mask_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, PtrStep buf)
{ {
dim3 threads, grid; dim3 threads, grid;
estimate_thread_cfg(threads, grid); estimate_thread_cfg(src.cols, src.rows, threads, grid);
estimate_kernel_consts(src.cols, src.rows, threads, grid); set_kernel_consts(src.cols, src.rows, threads, grid);
T* minval_buf = (T*)buf.ptr(0); T* minval_buf = (T*)buf.ptr(0);
T* maxval_buf = (T*)buf.ptr(1); T* maxval_buf = (T*)buf.ptr(1);
@@ -596,8 +596,8 @@ namespace cv { namespace gpu { namespace mathfunc
void min_max_caller(const DevMem2D src, double* minval, double* maxval, PtrStep buf) void min_max_caller(const DevMem2D src, double* minval, double* maxval, PtrStep buf)
{ {
dim3 threads, grid; dim3 threads, grid;
estimate_thread_cfg(threads, grid); estimate_thread_cfg(src.cols, src.rows, threads, grid);
estimate_kernel_consts(src.cols, src.rows, threads, grid); set_kernel_consts(src.cols, src.rows, threads, grid);
T* minval_buf = (T*)buf.ptr(0); T* minval_buf = (T*)buf.ptr(0);
T* maxval_buf = (T*)buf.ptr(1); T* maxval_buf = (T*)buf.ptr(1);
@@ -650,8 +650,8 @@ namespace cv { namespace gpu { namespace mathfunc
void min_max_mask_multipass_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, PtrStep buf) void min_max_mask_multipass_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, PtrStep buf)
{ {
dim3 threads, grid; dim3 threads, grid;
estimate_thread_cfg(threads, grid); estimate_thread_cfg(src.cols, src.rows, threads, grid);
estimate_kernel_consts(src.cols, src.rows, threads, grid); set_kernel_consts(src.cols, src.rows, threads, grid);
T* minval_buf = (T*)buf.ptr(0); T* minval_buf = (T*)buf.ptr(0);
T* maxval_buf = (T*)buf.ptr(1); T* maxval_buf = (T*)buf.ptr(1);
@@ -679,8 +679,8 @@ namespace cv { namespace gpu { namespace mathfunc
void min_max_multipass_caller(const DevMem2D src, double* minval, double* maxval, PtrStep buf) void min_max_multipass_caller(const DevMem2D src, double* minval, double* maxval, PtrStep buf)
{ {
dim3 threads, grid; dim3 threads, grid;
estimate_thread_cfg(threads, grid); estimate_thread_cfg(src.cols, src.rows, threads, grid);
estimate_kernel_consts(src.cols, src.rows, threads, grid); set_kernel_consts(src.cols, src.rows, threads, grid);
T* minval_buf = (T*)buf.ptr(0); T* minval_buf = (T*)buf.ptr(0);
T* maxval_buf = (T*)buf.ptr(1); T* maxval_buf = (T*)buf.ptr(1);
@@ -719,19 +719,19 @@ namespace cv { namespace gpu { namespace mathfunc
// Estimates good thread configuration // Estimates good thread configuration
// - threads variable satisfies to threads.x * threads.y == 256 // - threads variable satisfies to threads.x * threads.y == 256
void estimate_thread_cfg(dim3& threads, dim3& grid) void estimate_thread_cfg(int cols, int rows, dim3& threads, dim3& grid)
{ {
threads = dim3(64, 4); threads = dim3(32, 8);
grid = dim3(6, 5); grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32));
} }
// Returns required buffer sizes // Returns required buffer sizes
void get_buf_size_required(int elem_size, int& b1cols, int& b1rows, void get_buf_size_required(int cols, int rows, int elem_size, int& b1cols,
int& b2cols, int& b2rows) int& b1rows, int& b2cols, int& b2rows)
{ {
dim3 threads, grid; dim3 threads, grid;
estimate_thread_cfg(threads, grid); estimate_thread_cfg(cols, rows, threads, grid);
b1cols = grid.x * grid.y * elem_size; // For values b1cols = grid.x * grid.y * elem_size; // For values
b1rows = 2; b1rows = 2;
b2cols = grid.x * grid.y * sizeof(int); // For locations b2cols = grid.x * grid.y * sizeof(int); // For locations
@@ -740,7 +740,7 @@ namespace cv { namespace gpu { namespace mathfunc
// Estimates device constants which are used in the kernels using specified thread configuration // Estimates device constants which are used in the kernels using specified thread configuration
void estimate_kernel_consts(int cols, int rows, const dim3& threads, const dim3& grid) void set_kernel_consts(int cols, int rows, const dim3& threads, const dim3& grid)
{ {
int twidth = divUp(divUp(cols, grid.x), threads.x); int twidth = divUp(divUp(cols, grid.x), threads.x);
int theight = divUp(divUp(rows, grid.y), threads.y); int theight = divUp(divUp(rows, grid.y), threads.y);
@@ -886,8 +886,8 @@ namespace cv { namespace gpu { namespace mathfunc
int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf) int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf)
{ {
dim3 threads, grid; dim3 threads, grid;
estimate_thread_cfg(threads, grid); estimate_thread_cfg(src.cols, src.rows, threads, grid);
estimate_kernel_consts(src.cols, src.rows, threads, grid); set_kernel_consts(src.cols, src.rows, threads, grid);
T* minval_buf = (T*)valbuf.ptr(0); T* minval_buf = (T*)valbuf.ptr(0);
T* maxval_buf = (T*)valbuf.ptr(1); T* maxval_buf = (T*)valbuf.ptr(1);
@@ -924,8 +924,8 @@ namespace cv { namespace gpu { namespace mathfunc
int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf) int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf)
{ {
dim3 threads, grid; dim3 threads, grid;
estimate_thread_cfg(threads, grid); estimate_thread_cfg(src.cols, src.rows, threads, grid);
estimate_kernel_consts(src.cols, src.rows, threads, grid); set_kernel_consts(src.cols, src.rows, threads, grid);
T* minval_buf = (T*)valbuf.ptr(0); T* minval_buf = (T*)valbuf.ptr(0);
T* maxval_buf = (T*)valbuf.ptr(1); T* maxval_buf = (T*)valbuf.ptr(1);
@@ -994,8 +994,8 @@ namespace cv { namespace gpu { namespace mathfunc
int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf) int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf)
{ {
dim3 threads, grid; dim3 threads, grid;
estimate_thread_cfg(threads, grid); estimate_thread_cfg(src.cols, src.rows, threads, grid);
estimate_kernel_consts(src.cols, src.rows, threads, grid); set_kernel_consts(src.cols, src.rows, threads, grid);
T* minval_buf = (T*)valbuf.ptr(0); T* minval_buf = (T*)valbuf.ptr(0);
T* maxval_buf = (T*)valbuf.ptr(1); T* maxval_buf = (T*)valbuf.ptr(1);
@@ -1032,8 +1032,8 @@ namespace cv { namespace gpu { namespace mathfunc
int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf) int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf)
{ {
dim3 threads, grid; dim3 threads, grid;
estimate_thread_cfg(threads, grid); estimate_thread_cfg(src.cols, src.rows, threads, grid);
estimate_kernel_consts(src.cols, src.rows, threads, grid); set_kernel_consts(src.cols, src.rows, threads, grid);
T* minval_buf = (T*)valbuf.ptr(0); T* minval_buf = (T*)valbuf.ptr(0);
T* maxval_buf = (T*)valbuf.ptr(1); T* maxval_buf = (T*)valbuf.ptr(1);
@@ -1077,23 +1077,23 @@ namespace cv { namespace gpu { namespace mathfunc
__device__ unsigned int blocks_finished = 0; __device__ unsigned int blocks_finished = 0;
void estimate_thread_cfg(dim3& threads, dim3& grid) void estimate_thread_cfg(int cols, int rows, dim3& threads, dim3& grid)
{ {
threads = dim3(64, 4); threads = dim3(32, 8);
grid = dim3(6, 5); grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32));
} }
void get_buf_size_required(int& cols, int& rows) void get_buf_size_required(int cols, int rows, int& bufcols, int& bufrows)
{ {
dim3 threads, grid; dim3 threads, grid;
estimate_thread_cfg(threads, grid); estimate_thread_cfg(cols, rows, threads, grid);
cols = grid.x * grid.y * sizeof(int); bufcols = grid.x * grid.y * sizeof(int);
rows = 1; bufrows = 1;
} }
void estimate_kernel_consts(int cols, int rows, const dim3& threads, const dim3& grid) void set_kernel_consts(int cols, int rows, const dim3& threads, const dim3& grid)
{ {
int twidth = divUp(divUp(cols, grid.x), threads.x); int twidth = divUp(divUp(cols, grid.x), threads.x);
int theight = divUp(divUp(rows, grid.y), threads.y); int theight = divUp(divUp(rows, grid.y), threads.y);
@@ -1182,8 +1182,8 @@ namespace cv { namespace gpu { namespace mathfunc
int count_non_zero_caller(const DevMem2D src, PtrStep buf) int count_non_zero_caller(const DevMem2D src, PtrStep buf)
{ {
dim3 threads, grid; dim3 threads, grid;
estimate_thread_cfg(threads, grid); estimate_thread_cfg(src.cols, src.rows, threads, grid);
estimate_kernel_consts(src.cols, src.rows, threads, grid); set_kernel_consts(src.cols, src.rows, threads, grid);
unsigned int* count_buf = (unsigned int*)buf.ptr(0); unsigned int* count_buf = (unsigned int*)buf.ptr(0);
@@ -1226,8 +1226,8 @@ namespace cv { namespace gpu { namespace mathfunc
int count_non_zero_multipass_caller(const DevMem2D src, PtrStep buf) int count_non_zero_multipass_caller(const DevMem2D src, PtrStep buf)
{ {
dim3 threads, grid; dim3 threads, grid;
estimate_thread_cfg(threads, grid); estimate_thread_cfg(src.cols, src.rows, threads, grid);
estimate_kernel_consts(src.cols, src.rows, threads, grid); set_kernel_consts(src.cols, src.rows, threads, grid);
unsigned int* count_buf = (unsigned int*)buf.ptr(0); unsigned int* count_buf = (unsigned int*)buf.ptr(0);