From 32a9b63d2fa474f23abcc5fbc62f12048e38413f Mon Sep 17 00:00:00 2001
From: Alexey Spizhevoy
Date: Tue, 30 Nov 2010 12:27:21 +0000
Subject: [PATCH] added better threads configuration estimator for the minMax,
 minMaxLoc, countNonZero functions in gpu module

---
 modules/gpu/src/arithm.cpp       | 16 +++---
 modules/gpu/src/cuda/mathfunc.cu | 86 ++++++++++++++++----------
 2 files changed, 51 insertions(+), 51 deletions(-)

diff --git a/modules/gpu/src/arithm.cpp b/modules/gpu/src/arithm.cpp
index 54b01d748..a55bc50ac 100644
--- a/modules/gpu/src/arithm.cpp
+++ b/modules/gpu/src/arithm.cpp
@@ -496,7 +496,7 @@ Scalar cv::gpu::sum(const GpuMat& src)
 namespace cv { namespace gpu { namespace mathfunc { namespace minmax {
 
-    void get_buf_size_required(int elem_size, int& cols, int& rows);
+    void get_buf_size_required(int cols, int rows, int elem_size, int& bufcols, int& bufrows);
 
     template <typename T>
     void min_max_caller(const DevMem2D src, double* minval, double* maxval, PtrStep buf);
 
@@ -551,7 +551,7 @@ void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const Gp
     double maxVal_; if (!maxVal) maxVal = &maxVal_;
 
     Size bufSize;
-    get_buf_size_required(src.elemSize(), bufSize.width, bufSize.height);
+    get_buf_size_required(src.cols, src.rows, src.elemSize(), bufSize.width, bufSize.height);
     buf.create(bufSize, CV_8U);
 
     if (mask.empty())
@@ -574,8 +574,8 @@ void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const Gp
 
 namespace cv { namespace gpu { namespace mathfunc { namespace minmaxloc {
 
-    void get_buf_size_required(int elem_size, int& b1cols, int& b1rows,
-                               int& b2cols, int& b2rows);
+    void get_buf_size_required(int cols, int rows, int elem_size, int& b1cols,
+                               int& b1rows, int& b2cols, int& b2rows);
 
     template <typename T>
     void min_max_loc_caller(const DevMem2D src, double* minval, double* maxval,
@@ -636,8 +636,8 @@ void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point
     int maxLoc_[2];
 
     Size valbuf_size, locbuf_size;
-    get_buf_size_required(src.elemSize(), valbuf_size.width, valbuf_size.height,
-                          locbuf_size.width, locbuf_size.height);
+    get_buf_size_required(src.cols, src.rows, src.elemSize(), valbuf_size.width,
+                          valbuf_size.height, locbuf_size.width, locbuf_size.height);
     valbuf.create(valbuf_size, CV_8U);
     locbuf.create(locbuf_size, CV_8U);
 
@@ -663,7 +663,7 @@ void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point
 
 namespace cv { namespace gpu { namespace mathfunc { namespace countnonzero {
 
-    void get_buf_size_required(int& cols, int& rows);
+    void get_buf_size_required(int cols, int rows, int& bufcols, int& bufrows);
 
     template <typename T>
     int count_non_zero_caller(const DevMem2D src, PtrStep buf);
@@ -697,7 +697,7 @@ int cv::gpu::countNonZero(const GpuMat& src, GpuMat& buf)
     CV_Assert(src.type() != CV_64F || hasNativeDoubleSupport(getDevice()));
 
     Size buf_size;
-    get_buf_size_required(buf_size.width, buf_size.height);
+    get_buf_size_required(src.cols, src.rows, buf_size.width, buf_size.height);
     buf.create(buf_size, CV_8U);
 
     Caller caller = callers[hasAtomicsSupport(getDevice())][src.type()];
diff --git a/modules/gpu/src/cuda/mathfunc.cu b/modules/gpu/src/cuda/mathfunc.cu
index fe520321f..72775dd19 100644
--- a/modules/gpu/src/cuda/mathfunc.cu
+++ b/modules/gpu/src/cuda/mathfunc.cu
@@ -425,25 +425,25 @@ namespace cv { namespace gpu { namespace mathfunc
 
     // Estimates good thread configuration
     //  - threads variable satisfies to threads.x * threads.y == 256
-    void estimate_thread_cfg(dim3& threads, dim3& grid)
+    void estimate_thread_cfg(int cols, int rows, dim3& threads, dim3& grid)
     {
-        threads = dim3(64, 4);
-        grid = dim3(6, 5);
+        threads = dim3(32, 8);
+        grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32));
     }
 
 
    // Returns required buffer sizes
-    void get_buf_size_required(int elem_size, int& cols, int& rows)
+    void get_buf_size_required(int cols, int rows, int elem_size, int& bufcols, int& bufrows)
     {
         dim3 threads, grid;
-        estimate_thread_cfg(threads, grid);
-        cols = grid.x * grid.y * elem_size;
-        rows = 2;
+        estimate_thread_cfg(cols, rows, threads, grid);
+        bufcols = grid.x * grid.y * elem_size;
+        bufrows = 2;
     }
 
 
     // Estimates device constants which are used in the kernels using specified thread configuration
-    void estimate_kernel_consts(int cols, int rows, const dim3& threads, const dim3& grid)
+    void set_kernel_consts(int cols, int rows, const dim3& threads, const dim3& grid)
     {
         int twidth = divUp(divUp(cols, grid.x), threads.x);
         int theight = divUp(divUp(rows, grid.y), threads.y);
@@ -567,8 +567,8 @@ namespace cv { namespace gpu { namespace mathfunc
     void min_max_mask_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, PtrStep buf)
     {
         dim3 threads, grid;
-        estimate_thread_cfg(threads, grid);
-        estimate_kernel_consts(src.cols, src.rows, threads, grid);
+        estimate_thread_cfg(src.cols, src.rows, threads, grid);
+        set_kernel_consts(src.cols, src.rows, threads, grid);
 
         T* minval_buf = (T*)buf.ptr(0);
         T* maxval_buf = (T*)buf.ptr(1);
@@ -596,8 +596,8 @@ namespace cv { namespace gpu { namespace mathfunc
     void min_max_caller(const DevMem2D src, double* minval, double* maxval, PtrStep buf)
     {
         dim3 threads, grid;
-        estimate_thread_cfg(threads, grid);
-        estimate_kernel_consts(src.cols, src.rows, threads, grid);
+        estimate_thread_cfg(src.cols, src.rows, threads, grid);
+        set_kernel_consts(src.cols, src.rows, threads, grid);
 
         T* minval_buf = (T*)buf.ptr(0);
         T* maxval_buf = (T*)buf.ptr(1);
@@ -650,8 +650,8 @@ namespace cv { namespace gpu { namespace mathfunc
     void min_max_mask_multipass_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, PtrStep buf)
     {
         dim3 threads, grid;
-        estimate_thread_cfg(threads, grid);
-        estimate_kernel_consts(src.cols, src.rows, threads, grid);
+        estimate_thread_cfg(src.cols, src.rows, threads, grid);
+        set_kernel_consts(src.cols, src.rows, threads, grid);
 
         T* minval_buf = (T*)buf.ptr(0);
         T* maxval_buf = (T*)buf.ptr(1);
@@ -679,8 +679,8 @@ namespace cv { namespace gpu { namespace mathfunc
     void min_max_multipass_caller(const DevMem2D src, double* minval, double* maxval, PtrStep buf)
     {
         dim3 threads, grid;
-        estimate_thread_cfg(threads, grid);
-        estimate_kernel_consts(src.cols, src.rows, threads, grid);
+        estimate_thread_cfg(src.cols, src.rows, threads, grid);
+        set_kernel_consts(src.cols, src.rows, threads, grid);
 
         T* minval_buf = (T*)buf.ptr(0);
         T* maxval_buf = (T*)buf.ptr(1);
@@ -719,19 +719,19 @@ namespace cv { namespace gpu { namespace mathfunc
 
     // Estimates good thread configuration
     //  - threads variable satisfies to threads.x * threads.y == 256
-    void estimate_thread_cfg(dim3& threads, dim3& grid)
+    void estimate_thread_cfg(int cols, int rows, dim3& threads, dim3& grid)
     {
-        threads = dim3(64, 4);
-        grid = dim3(6, 5);
+        threads = dim3(32, 8);
+        grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32));
     }
 
 
     // Returns required buffer sizes
-    void get_buf_size_required(int elem_size, int& b1cols, int& b1rows,
-                               int& b2cols, int& b2rows)
+    void get_buf_size_required(int cols, int rows, int elem_size, int& b1cols,
+                               int& b1rows, int& b2cols, int& b2rows)
     {
         dim3 threads, grid;
-        estimate_thread_cfg(threads, grid);
+        estimate_thread_cfg(cols, rows, threads, grid);
         b1cols = grid.x * grid.y * elem_size; // For values
         b1rows = 2;
         b2cols = grid.x * grid.y * sizeof(int); // For locations
@@ -740,7 +740,7 @@ namespace cv { namespace gpu { namespace mathfunc
 
 
     // Estimates device constants which are used in the kernels using specified thread configuration
-    void estimate_kernel_consts(int cols, int rows, const dim3& threads, const dim3& grid)
+    void set_kernel_consts(int cols, int rows, const dim3& threads, const dim3& grid)
     {
         int twidth = divUp(divUp(cols, grid.x), threads.x);
         int theight = divUp(divUp(rows, grid.y), threads.y);
@@ -886,8 +886,8 @@ namespace cv { namespace gpu { namespace mathfunc
                             int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf)
     {
         dim3 threads, grid;
-        estimate_thread_cfg(threads, grid);
-        estimate_kernel_consts(src.cols, src.rows, threads, grid);
+        estimate_thread_cfg(src.cols, src.rows, threads, grid);
+        set_kernel_consts(src.cols, src.rows, threads, grid);
 
         T* minval_buf = (T*)valbuf.ptr(0);
         T* maxval_buf = (T*)valbuf.ptr(1);
@@ -924,8 +924,8 @@ namespace cv { namespace gpu { namespace mathfunc
                             int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf)
     {
         dim3 threads, grid;
-        estimate_thread_cfg(threads, grid);
-        estimate_kernel_consts(src.cols, src.rows, threads, grid);
+        estimate_thread_cfg(src.cols, src.rows, threads, grid);
+        set_kernel_consts(src.cols, src.rows, threads, grid);
 
         T* minval_buf = (T*)valbuf.ptr(0);
         T* maxval_buf = (T*)valbuf.ptr(1);
@@ -994,8 +994,8 @@ namespace cv { namespace gpu { namespace mathfunc
                             int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf)
     {
         dim3 threads, grid;
-        estimate_thread_cfg(threads, grid);
-        estimate_kernel_consts(src.cols, src.rows, threads, grid);
+        estimate_thread_cfg(src.cols, src.rows, threads, grid);
+        set_kernel_consts(src.cols, src.rows, threads, grid);
 
         T* minval_buf = (T*)valbuf.ptr(0);
         T* maxval_buf = (T*)valbuf.ptr(1);
@@ -1032,8 +1032,8 @@ namespace cv { namespace gpu { namespace mathfunc
                             int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf)
     {
         dim3 threads, grid;
-        estimate_thread_cfg(threads, grid);
-        estimate_kernel_consts(src.cols, src.rows, threads, grid);
+        estimate_thread_cfg(src.cols, src.rows, threads, grid);
+        set_kernel_consts(src.cols, src.rows, threads, grid);
 
         T* minval_buf = (T*)valbuf.ptr(0);
         T* maxval_buf = (T*)valbuf.ptr(1);
@@ -1077,23 +1077,23 @@ namespace cv { namespace gpu { namespace mathfunc
 
     __device__ unsigned int blocks_finished = 0;
 
-    void estimate_thread_cfg(dim3& threads, dim3& grid)
+    void estimate_thread_cfg(int cols, int rows, dim3& threads, dim3& grid)
     {
-        threads = dim3(64, 4);
-        grid = dim3(6, 5);
+        threads = dim3(32, 8);
+        grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32));
     }
 
 
-    void get_buf_size_required(int& cols, int& rows)
+    void get_buf_size_required(int cols, int rows, int& bufcols, int& bufrows)
     {
         dim3 threads, grid;
-        estimate_thread_cfg(threads, grid);
-        cols = grid.x * grid.y * sizeof(int);
-        rows = 1;
+        estimate_thread_cfg(cols, rows, threads, grid);
+        bufcols = grid.x * grid.y * sizeof(int);
+        bufrows = 1;
     }
 
 
-    void estimate_kernel_consts(int cols, int rows, const dim3& threads, const dim3& grid)
+    void set_kernel_consts(int cols, int rows, const dim3& threads, const dim3& grid)
     {
         int twidth = divUp(divUp(cols, grid.x), threads.x);
         int theight = divUp(divUp(rows, grid.y), threads.y);
@@ -1182,8 +1182,8 @@ namespace cv { namespace gpu { namespace mathfunc
     int count_non_zero_caller(const DevMem2D src, PtrStep buf)
     {
         dim3 threads, grid;
-        estimate_thread_cfg(threads, grid);
-        estimate_kernel_consts(src.cols, src.rows, threads, grid);
+        estimate_thread_cfg(src.cols, src.rows, threads, grid);
+        set_kernel_consts(src.cols, src.rows, threads, grid);
 
         unsigned int* count_buf = (unsigned int*)buf.ptr(0);
 
@@ -1226,8 +1226,8 @@ namespace cv { namespace gpu { namespace mathfunc
     int count_non_zero_multipass_caller(const DevMem2D src, PtrStep buf)
    {
         dim3 threads, grid;
-        estimate_thread_cfg(threads, grid);
-        estimate_kernel_consts(src.cols, src.rows, threads, grid);
+        estimate_thread_cfg(src.cols, src.rows, threads, grid);
+        set_kernel_consts(src.cols, src.rows, threads, grid);
 
         unsigned int* count_buf = (unsigned int*)buf.ptr(0);
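
A quick worked example of the sizing arithmetic implied by the new estimator (a standalone sketch, not code from this patch: the 1920x1080 CV_8UC1 input, the local divUp helper and the main function below are illustrative assumptions). With threads = (32, 8), the grid is chosen so each block nominally covers a 256x256 tile (threads.x * 8 columns by threads.y * 32 rows), so the grid, and with it the partial-result buffers, scales with the image instead of staying fixed at the old 6 x 5 = 30 blocks.

// Standalone host-side sketch (illustrative only): reproduces the new
// thread/grid/buffer arithmetic for a hypothetical input size.
#include <cstdio>

// Round-up integer division, reimplemented here for the sketch.
static int divUp(int a, int b) { return (a + b - 1) / b; }

int main()
{
    // Hypothetical input: 1920x1080, CV_8UC1, so elem_size == 1 byte.
    const int cols = 1920, rows = 1080, elem_size = 1;

    // New estimator: fixed 32x8 block, grid derived from the image size.
    const int threads_x = 32, threads_y = 8;
    const int grid_x = divUp(cols, threads_x * 8);   // divUp(1920, 256) == 8
    const int grid_y = divUp(rows, threads_y * 32);  // divUp(1080, 256) == 5
    const int blocks = grid_x * grid_y;              // 40 partial results

    // Buffer shapes as get_buf_size_required now reports them:
    //  - minMax: one partial min and one partial max per block
    const int minmax_cols = blocks * elem_size;      // 40
    const int minmax_rows = 2;
    //  - countNonZero: one partial count per block
    const int cnz_cols = blocks * (int)sizeof(int);  // 160
    const int cnz_rows = 1;
    // (minMaxLoc additionally requests a blocks * sizeof(int) wide location
    //  buffer; its row count falls outside the hunks quoted above.)

    // Per-thread tile, as set_kernel_consts derives it from the grid.
    const int twidth  = divUp(divUp(cols, grid_x), threads_x); // divUp(240, 32) == 8
    const int theight = divUp(divUp(rows, grid_y), threads_y); // divUp(216, 8)  == 27

    printf("grid %dx%d, minMax buf %dx%d, countNonZero buf %dx%d, tile %dx%d\n",
           grid_x, grid_y, minmax_cols, minmax_rows, cnz_cols, cnz_rows,
           twidth, theight);
    return 0;
}

With the old hard-coded threads = (64, 4), grid = (6, 5), every image was reduced by exactly 30 blocks regardless of its size; the new estimator sizes the grid and the intermediate buffers to the image, which is what the widened get_buf_size_required signatures carry through to cv::gpu::minMax, minMaxLoc and countNonZero.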