From 32a9b63d2fa474f23abcc5fbc62f12048e38413f Mon Sep 17 00:00:00 2001
From: Alexey Spizhevoy
Date: Tue, 30 Nov 2010 12:27:21 +0000
Subject: [PATCH] added better threads configuration estimator for the minMax,
 minMaxLoc, countNonZero functions in gpu module

---
 modules/gpu/src/arithm.cpp       | 16 +++---
 modules/gpu/src/cuda/mathfunc.cu | 86 ++++++++++++++++----------
 2 files changed, 51 insertions(+), 51 deletions(-)

diff --git a/modules/gpu/src/arithm.cpp b/modules/gpu/src/arithm.cpp
index 54b01d748..a55bc50ac 100644
--- a/modules/gpu/src/arithm.cpp
+++ b/modules/gpu/src/arithm.cpp
@@ -496,7 +496,7 @@ Scalar cv::gpu::sum(const GpuMat& src)
 namespace cv { namespace gpu { namespace mathfunc { namespace minmax {
 
-    void get_buf_size_required(int elem_size, int& cols, int& rows);
+    void get_buf_size_required(int cols, int rows, int elem_size, int& bufcols, int& bufrows);
 
     template <typename T>
     void min_max_caller(const DevMem2D src, double* minval, double* maxval, PtrStep buf);
 
@@ -551,7 +551,7 @@ void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const Gp
     double maxVal_; if (!maxVal) maxVal = &maxVal_;
 
     Size bufSize;
-    get_buf_size_required(src.elemSize(), bufSize.width, bufSize.height);
+    get_buf_size_required(src.cols, src.rows, src.elemSize(), bufSize.width, bufSize.height);
     buf.create(bufSize, CV_8U);
 
     if (mask.empty())
@@ -574,8 +574,8 @@ void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const Gp
 
 namespace cv { namespace gpu { namespace mathfunc { namespace minmaxloc {
 
-    void get_buf_size_required(int elem_size, int& b1cols, int& b1rows,
-                               int& b2cols, int& b2rows);
+    void get_buf_size_required(int cols, int rows, int elem_size, int& b1cols,
+                               int& b1rows, int& b2cols, int& b2rows);
 
     template <typename T>
     void min_max_loc_caller(const DevMem2D src, double* minval, double* maxval,
@@ -636,8 +636,8 @@ void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point
     int maxLoc_[2];
 
     Size valbuf_size, locbuf_size;
-    get_buf_size_required(src.elemSize(), valbuf_size.width, valbuf_size.height,
-                          locbuf_size.width, locbuf_size.height);
+    get_buf_size_required(src.cols, src.rows, src.elemSize(), valbuf_size.width,
+                          valbuf_size.height, locbuf_size.width, locbuf_size.height);
     valbuf.create(valbuf_size, CV_8U);
     locbuf.create(locbuf_size, CV_8U);
 
@@ -663,7 +663,7 @@ void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point
 
 namespace cv { namespace gpu { namespace mathfunc { namespace countnonzero {
 
-    void get_buf_size_required(int& cols, int& rows);
+    void get_buf_size_required(int cols, int rows, int& bufcols, int& bufrows);
 
     template <typename T>
     int count_non_zero_caller(const DevMem2D src, PtrStep buf);
@@ -697,7 +697,7 @@ int cv::gpu::countNonZero(const GpuMat& src, GpuMat& buf)
     CV_Assert(src.type() != CV_64F || hasNativeDoubleSupport(getDevice()));
 
     Size buf_size;
-    get_buf_size_required(buf_size.width, buf_size.height);
+    get_buf_size_required(src.cols, src.rows, buf_size.width, buf_size.height);
     buf.create(buf_size, CV_8U);
 
     Caller caller = callers[hasAtomicsSupport(getDevice())][src.type()];
diff --git a/modules/gpu/src/cuda/mathfunc.cu b/modules/gpu/src/cuda/mathfunc.cu
index fe520321f..72775dd19 100644
--- a/modules/gpu/src/cuda/mathfunc.cu
+++ b/modules/gpu/src/cuda/mathfunc.cu
@@ -425,25 +425,25 @@ namespace cv { namespace gpu { namespace mathfunc
 
     // Estimates good thread configuration
     //  - threads variable satisfies to threads.x * threads.y == 256
-    void estimate_thread_cfg(dim3& threads, dim3& grid)
+    void estimate_thread_cfg(int cols, int rows, dim3& threads, dim3& grid)
     {
-        threads = dim3(64, 4);
-        grid = dim3(6, 5);
+        threads = dim3(32, 8);
+        grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32));
     }
 
 
    // Returns required buffer sizes
-    void get_buf_size_required(int elem_size, int& cols, int& rows)
+    void get_buf_size_required(int cols, int rows, int elem_size, int& bufcols, int& bufrows)
     {
         dim3 threads, grid;
-        estimate_thread_cfg(threads, grid);
-        cols = grid.x * grid.y * elem_size;
-        rows = 2;
+        estimate_thread_cfg(cols, rows, threads, grid);
+        bufcols = grid.x * grid.y * elem_size;
+        bufrows = 2;
     }
 
 
     // Estimates device constants which are used in the kernels using specified thread configuration
-    void estimate_kernel_consts(int cols, int rows, const dim3& threads, const dim3& grid)
+    void set_kernel_consts(int cols, int rows, const dim3& threads, const dim3& grid)
     {
         int twidth = divUp(divUp(cols, grid.x), threads.x);
         int theight = divUp(divUp(rows, grid.y), threads.y);
@@ -567,8 +567,8 @@ namespace cv { namespace gpu { namespace mathfunc
     void min_max_mask_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, PtrStep buf)
     {
         dim3 threads, grid;
-        estimate_thread_cfg(threads, grid);
-        estimate_kernel_consts(src.cols, src.rows, threads, grid);
+        estimate_thread_cfg(src.cols, src.rows, threads, grid);
+        set_kernel_consts(src.cols, src.rows, threads, grid);
 
         T* minval_buf = (T*)buf.ptr(0);
         T* maxval_buf = (T*)buf.ptr(1);
@@ -596,8 +596,8 @@ namespace cv { namespace gpu { namespace mathfunc
     void min_max_caller(const DevMem2D src, double* minval, double* maxval, PtrStep buf)
     {
         dim3 threads, grid;
-        estimate_thread_cfg(threads, grid);
-        estimate_kernel_consts(src.cols, src.rows, threads, grid);
+        estimate_thread_cfg(src.cols, src.rows, threads, grid);
+        set_kernel_consts(src.cols, src.rows, threads, grid);
 
         T* minval_buf = (T*)buf.ptr(0);
         T* maxval_buf = (T*)buf.ptr(1);
@@ -650,8 +650,8 @@ namespace cv { namespace gpu { namespace mathfunc
     void min_max_mask_multipass_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, PtrStep buf)
     {
         dim3 threads, grid;
-        estimate_thread_cfg(threads, grid);
-        estimate_kernel_consts(src.cols, src.rows, threads, grid);
+        estimate_thread_cfg(src.cols, src.rows, threads, grid);
+        set_kernel_consts(src.cols, src.rows, threads, grid);
 
         T* minval_buf = (T*)buf.ptr(0);
         T* maxval_buf = (T*)buf.ptr(1);
@@ -679,8 +679,8 @@ namespace cv { namespace gpu { namespace mathfunc
     void min_max_multipass_caller(const DevMem2D src, double* minval, double* maxval, PtrStep buf)
     {
         dim3 threads, grid;
-        estimate_thread_cfg(threads, grid);
-        estimate_kernel_consts(src.cols, src.rows, threads, grid);
+        estimate_thread_cfg(src.cols, src.rows, threads, grid);
+        set_kernel_consts(src.cols, src.rows, threads, grid);
 
         T* minval_buf = (T*)buf.ptr(0);
         T* maxval_buf = (T*)buf.ptr(1);
@@ -719,19 +719,19 @@ namespace cv { namespace gpu { namespace mathfunc
 
     // Estimates good thread configuration
     //  - threads variable satisfies to threads.x * threads.y == 256
-    void estimate_thread_cfg(dim3& threads, dim3& grid)
+    void estimate_thread_cfg(int cols, int rows, dim3& threads, dim3& grid)
     {
-        threads = dim3(64, 4);
-        grid = dim3(6, 5);
+        threads = dim3(32, 8);
+        grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32));
     }
 
 
     // Returns required buffer sizes
-    void get_buf_size_required(int elem_size, int& b1cols, int& b1rows,
-                               int& b2cols, int& b2rows)
+    void get_buf_size_required(int cols, int rows, int elem_size, int& b1cols,
+                               int& b1rows, int& b2cols, int& b2rows)
     {
         dim3 threads, grid;
-        estimate_thread_cfg(threads, grid);
+        estimate_thread_cfg(cols, rows, threads, grid);
         b1cols = grid.x * grid.y * elem_size; // For values
         b1rows = 2;
         b2cols = grid.x * grid.y * sizeof(int); // For locations
@@ -740,7 +740,7 @@ namespace cv { namespace gpu { namespace mathfunc
 
 
     // Estimates device constants which are used in the kernels using specified thread configuration
-    void estimate_kernel_consts(int cols, int rows, const dim3& threads, const dim3& grid)
+    void set_kernel_consts(int cols, int rows, const dim3& threads, const dim3& grid)
     {
         int twidth = divUp(divUp(cols, grid.x), threads.x);
         int theight = divUp(divUp(rows, grid.y), threads.y);
@@ -886,8 +886,8 @@ namespace cv { namespace gpu { namespace mathfunc
                             int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf)
     {
         dim3 threads, grid;
-        estimate_thread_cfg(threads, grid);
-        estimate_kernel_consts(src.cols, src.rows, threads, grid);
+        estimate_thread_cfg(src.cols, src.rows, threads, grid);
+        set_kernel_consts(src.cols, src.rows, threads, grid);
 
         T* minval_buf = (T*)valbuf.ptr(0);
         T* maxval_buf = (T*)valbuf.ptr(1);
@@ -924,8 +924,8 @@ namespace cv { namespace gpu { namespace mathfunc
                             int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf)
     {
         dim3 threads, grid;
-        estimate_thread_cfg(threads, grid);
-        estimate_kernel_consts(src.cols, src.rows, threads, grid);
+        estimate_thread_cfg(src.cols, src.rows, threads, grid);
+        set_kernel_consts(src.cols, src.rows, threads, grid);
 
         T* minval_buf = (T*)valbuf.ptr(0);
         T* maxval_buf = (T*)valbuf.ptr(1);
@@ -994,8 +994,8 @@ namespace cv { namespace gpu { namespace mathfunc
                             int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf)
     {
         dim3 threads, grid;
-        estimate_thread_cfg(threads, grid);
-        estimate_kernel_consts(src.cols, src.rows, threads, grid);
+        estimate_thread_cfg(src.cols, src.rows, threads, grid);
+        set_kernel_consts(src.cols, src.rows, threads, grid);
 
         T* minval_buf = (T*)valbuf.ptr(0);
         T* maxval_buf = (T*)valbuf.ptr(1);
@@ -1032,8 +1032,8 @@ namespace cv { namespace gpu { namespace mathfunc
                             int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf)
     {
         dim3 threads, grid;
-        estimate_thread_cfg(threads, grid);
-        estimate_kernel_consts(src.cols, src.rows, threads, grid);
+        estimate_thread_cfg(src.cols, src.rows, threads, grid);
+        set_kernel_consts(src.cols, src.rows, threads, grid);
 
         T* minval_buf = (T*)valbuf.ptr(0);
         T* maxval_buf = (T*)valbuf.ptr(1);
@@ -1077,23 +1077,23 @@ namespace cv { namespace gpu { namespace mathfunc
 
     __device__ unsigned int blocks_finished = 0;
 
-    void estimate_thread_cfg(dim3& threads, dim3& grid)
+    void estimate_thread_cfg(int cols, int rows, dim3& threads, dim3& grid)
     {
-        threads = dim3(64, 4);
-        grid = dim3(6, 5);
+        threads = dim3(32, 8);
+        grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32));
     }
 
 
-    void get_buf_size_required(int& cols, int& rows)
+    void get_buf_size_required(int cols, int rows, int& bufcols, int& bufrows)
     {
         dim3 threads, grid;
-        estimate_thread_cfg(threads, grid);
-        cols = grid.x * grid.y * sizeof(int);
-        rows = 1;
+        estimate_thread_cfg(cols, rows, threads, grid);
+        bufcols = grid.x * grid.y * sizeof(int);
+        bufrows = 1;
     }
 
 
-    void estimate_kernel_consts(int cols, int rows, const dim3& threads, const dim3& grid)
+    void set_kernel_consts(int cols, int rows, const dim3& threads, const dim3& grid)
     {
         int twidth = divUp(divUp(cols, grid.x), threads.x);
         int theight = divUp(divUp(rows, grid.y), threads.y);
@@ -1182,8 +1182,8 @@ namespace cv { namespace gpu { namespace mathfunc
     int count_non_zero_caller(const DevMem2D src, PtrStep buf)
     {
         dim3 threads, grid;
-        estimate_thread_cfg(threads, grid);
-        estimate_kernel_consts(src.cols, src.rows, threads, grid);
+        estimate_thread_cfg(src.cols, src.rows, threads, grid);
+        set_kernel_consts(src.cols, src.rows, threads, grid);
 
         unsigned int* count_buf = (unsigned int*)buf.ptr(0);
 
@@ -1226,8 +1226,8 @@ namespace cv { namespace gpu { namespace mathfunc
     int count_non_zero_multipass_caller(const DevMem2D src, PtrStep buf)
    {
         dim3 threads, grid;
-        estimate_thread_cfg(threads, grid);
-        estimate_kernel_consts(src.cols, src.rows, threads, grid);
+        estimate_thread_cfg(src.cols, src.rows, threads, grid);
+        set_kernel_consts(src.cols, src.rows, threads, grid);
 
         unsigned int* count_buf = (unsigned int*)buf.ptr(0);
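
A quick worked example of the sizing arithmetic implied by the new estimator (a standalone sketch, not code from this patch: the 1920x1080 CV_8UC1 input, the local divUp helper and the main function below are illustrative assumptions). With threads = (32, 8), the grid is chosen so each block nominally covers a 256x256 tile (threads.x * 8 columns by threads.y * 32 rows), so the grid, and with it the partial-result buffers, scales with the image instead of staying fixed at the old 6 x 5 = 30 blocks.

// Standalone host-side sketch (illustrative only): reproduces the new
// thread/grid/buffer arithmetic for a hypothetical input size.
#include <cstdio>

// Round-up integer division, reimplemented here for the sketch.
static int divUp(int a, int b) { return (a + b - 1) / b; }

int main()
{
    // Hypothetical input: 1920x1080, CV_8UC1, so elem_size == 1 byte.
    const int cols = 1920, rows = 1080, elem_size = 1;

    // New estimator: fixed 32x8 block, grid derived from the image size.
    const int threads_x = 32, threads_y = 8;
    const int grid_x = divUp(cols, threads_x * 8);   // divUp(1920, 256) == 8
    const int grid_y = divUp(rows, threads_y * 32);  // divUp(1080, 256) == 5
    const int blocks = grid_x * grid_y;              // 40 partial results

    // Buffer shapes as get_buf_size_required now reports them:
    //  - minMax: one partial min and one partial max per block
    const int minmax_cols = blocks * elem_size;      // 40
    const int minmax_rows = 2;
    //  - countNonZero: one partial count per block
    const int cnz_cols = blocks * (int)sizeof(int);  // 160
    const int cnz_rows = 1;
    // (minMaxLoc additionally requests a blocks * sizeof(int) wide location
    //  buffer; its row count falls outside the hunks quoted above.)

    // Per-thread tile, as set_kernel_consts derives it from the grid.
    const int twidth  = divUp(divUp(cols, grid_x), threads_x); // divUp(240, 32) == 8
    const int theight = divUp(divUp(rows, grid_y), threads_y); // divUp(216, 8)  == 27

    printf("grid %dx%d, minMax buf %dx%d, countNonZero buf %dx%d, tile %dx%d\n",
           grid_x, grid_y, minmax_cols, minmax_rows, cnz_cols, cnz_rows,
           twidth, theight);
    return 0;
}

With the old hard-coded threads = (64, 4), grid = (6, 5), every image was reduced by exactly 30 blocks regardless of its size; the new estimator sizes the grid and the intermediate buffers to the image, which is what the widened get_buf_size_required signatures carry through to cv::gpu::minMax, minMaxLoc and countNonZero.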