From 8c77e5faad037009746a5524f0ac24902d72b9bf Mon Sep 17 00:00:00 2001
From: Andrey Morozov <no@email>
Date: Mon, 2 Aug 2010 13:10:21 +0000
Subject: [PATCH] optimized gpumat::setTo(), get rid of division operation

---
 modules/gpu/src/cuda/matrix_operations.cu | 30 +++++++++++++++++++----
 1 file changed, 25 insertions(+), 5 deletions(-)
diff --git a/modules/gpu/src/cuda/matrix_operations.cu b/modules/gpu/src/cuda/matrix_operations.cu
index fe730cc7e..e65163e72 100644
--- a/modules/gpu/src/cuda/matrix_operations.cu
+++ b/modules/gpu/src/cuda/matrix_operations.cu
@@ -52,7 +52,7 @@ using namespace cv::gpu::impl;
 
 namespace mat_operators
 {
-    __constant__ double scalar_d[4];
+    __constant__ double scalar_d[256];
 
 
     template <typename T>
@@ -139,7 +139,7 @@ namespace mat_operators
         if ((x < cols * channels ) && (y < rows))
         {
             size_t idx = y * ( step >> shift_and_sizeof<T>::shift ) + x;
-            mat[idx] = scalar_d[ x % channels ];
+            mat[idx] = scalar_d[ threadIdx.x];
         }
     }
 
@@ -153,7 +153,7 @@ namespace mat_operators
             if (mask[y * step_mask + x / channels] != 0)
             {
                 size_t idx = y * ( step >> shift_and_sizeof<T>::shift ) + x;
-                mat[idx] = scalar_d[ x % channels ];
+                mat[idx] = scalar_d[ threadIdx.x ];
             }
     }
 
@@ -354,7 +354,17 @@ namespace cv
 
             extern "C" void set_to_without_mask(DevMem2D mat, int depth, const double *scalar, int channels, const cudaStream_t & stream)
             {
-                cudaSafeCall( cudaMemcpyToSymbol(mat_operators::scalar_d, scalar, sizeof(double) * 4));
+                double * scalar_vec = new double [256];
+                int index = 0;
+                for (int i = 0; i < 256; i++)
+                {
+                    scalar_vec[i] = scalar[index];
+                    index++;
+                    if (index == channels) index = 0;
+                }
+                cudaSafeCall( cudaMemcpyToSymbol(mat_operators::scalar_d, scalar_vec, sizeof(double) * 256));
+
+                delete [] scalar_vec;
 
                 static SetToFunc_without_mask tab[8] =
                 {
@@ -379,7 +389,17 @@ namespace cv
 
             extern "C" void set_to_with_mask(DevMem2D mat, int depth, const double * scalar, const DevMem2D& mask, int channels, const cudaStream_t & stream)
             {
-                cudaSafeCall( cudaMemcpyToSymbol(mat_operators::scalar_d, scalar, sizeof(double) * 4));
+                double * scalar_vec = new double [256];
+                int index = 0;
+                for (int i = 0; i < 256; i++)
+                {
+                    scalar_vec[i] = scalar[index];
+                    index++;
+                    if (index == channels) index = 0;
+                }
+                cudaSafeCall( cudaMemcpyToSymbol(mat_operators::scalar_d, scalar_vec, sizeof(double) * 256));
+
+                delete [] scalar_vec;
 
                 static SetToFunc_with_mask tab[8] =
                 {