From 8c77e5faad037009746a5524f0ac24902d72b9bf Mon Sep 17 00:00:00 2001 From: Andrey Morozov Date: Mon, 2 Aug 2010 13:10:21 +0000 Subject: [PATCH] optimized gpumat::setTo(), get rid of division operation --- modules/gpu/src/cuda/matrix_operations.cu | 30 +++++++++++++++++++---- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/modules/gpu/src/cuda/matrix_operations.cu b/modules/gpu/src/cuda/matrix_operations.cu index fe730cc7e..e65163e72 100644 --- a/modules/gpu/src/cuda/matrix_operations.cu +++ b/modules/gpu/src/cuda/matrix_operations.cu @@ -52,7 +52,7 @@ using namespace cv::gpu::impl; namespace mat_operators { - __constant__ double scalar_d[4]; + __constant__ double scalar_d[256]; template @@ -139,7 +139,7 @@ namespace mat_operators if ((x < cols * channels ) && (y < rows)) { size_t idx = y * ( step >> shift_and_sizeof::shift ) + x; - mat[idx] = scalar_d[ x % channels ]; + mat[idx] = scalar_d[ threadIdx.x]; } } @@ -153,7 +153,7 @@ namespace mat_operators if (mask[y * step_mask + x / channels] != 0) { size_t idx = y * ( step >> shift_and_sizeof::shift ) + x; - mat[idx] = scalar_d[ x % channels ]; + mat[idx] = scalar_d[ threadIdx.x ]; } } @@ -354,7 +354,17 @@ namespace cv extern "C" void set_to_without_mask(DevMem2D mat, int depth, const double *scalar, int channels, const cudaStream_t & stream) { - cudaSafeCall( cudaMemcpyToSymbol(mat_operators::scalar_d, scalar, sizeof(double) * 4)); + double * scalar_vec = new double [256]; + int index = 0; + for (int i = 0; i < 256; i++) + { + scalar_vec[i] = scalar[index]; + index++; + if (index == channels) index = 0; + } + cudaSafeCall( cudaMemcpyToSymbol(mat_operators::scalar_d, scalar_vec, sizeof(double) * 256)); + + delete [] scalar_vec; static SetToFunc_without_mask tab[8] = { @@ -379,7 +389,17 @@ namespace cv extern "C" void set_to_with_mask(DevMem2D mat, int depth, const double * scalar, const DevMem2D& mask, int channels, const cudaStream_t & stream) { - cudaSafeCall( cudaMemcpyToSymbol(mat_operators::scalar_d, scalar, sizeof(double) * 4)); + double * scalar_vec = new double [256]; + int index = 0; + for (int i = 0; i < 256; i++) + { + scalar_vec[i] = scalar[index]; + index++; + if (index == channels) index = 0; + } + cudaSafeCall( cudaMemcpyToSymbol(mat_operators::scalar_d, scalar_vec, sizeof(double) * 256)); + + delete [] scalar_vec; static SetToFunc_with_mask tab[8] = {