From 7928cec6704919fbc1280cec2d24e2ac3342b07e Mon Sep 17 00:00:00 2001 From: Vladislav Vinogradov Date: Wed, 15 Aug 2012 13:18:35 +0400 Subject: [PATCH] added linesAccumGlobal kernel --- modules/core/include/opencv2/core/gpumat.hpp | 3 + modules/core/src/gpumat.cpp | 2 +- modules/gpu/src/cuda/hough.cu | 116 ++++++++++++------ modules/gpu/src/hough.cpp | 18 ++- .../gpu/src/opencv2/gpu/device/emulation.hpp | 8 +- 5 files changed, 98 insertions(+), 49 deletions(-) diff --git a/modules/core/include/opencv2/core/gpumat.hpp b/modules/core/include/opencv2/core/gpumat.hpp index ef86c5a20..989335925 100644 --- a/modules/core/include/opencv2/core/gpumat.hpp +++ b/modules/core/include/opencv2/core/gpumat.hpp @@ -112,6 +112,8 @@ namespace cv { namespace gpu int multiProcessorCount() const { return multi_processor_count_; } + size_t sharedMemPerBlock() const { return sharedMemPerBlock_; } + size_t freeMemory() const; size_t totalMemory() const; @@ -133,6 +135,7 @@ namespace cv { namespace gpu int multi_processor_count_; int majorVersion_; int minorVersion_; + size_t sharedMemPerBlock_; }; CV_EXPORTS void printCudaDeviceInfo(int device); diff --git a/modules/core/src/gpumat.cpp b/modules/core/src/gpumat.cpp index 25a3e7699..c901bf492 100644 --- a/modules/core/src/gpumat.cpp +++ b/modules/core/src/gpumat.cpp @@ -42,7 +42,6 @@ #include "precomp.hpp" #include "opencv2/core/gpumat.hpp" - #include #ifdef HAVE_CUDA @@ -301,6 +300,7 @@ void cv::gpu::DeviceInfo::query() multi_processor_count_ = prop.multiProcessorCount; majorVersion_ = prop.major; minorVersion_ = prop.minor; + sharedMemPerBlock_ = prop.sharedMemPerBlock; } void cv::gpu::DeviceInfo::queryMemory(size_t& free_memory, size_t& total_memory) const diff --git a/modules/gpu/src/cuda/hough.cu b/modules/gpu/src/cuda/hough.cu index 34450cd87..d5f7d216c 100644 --- a/modules/gpu/src/cuda/hough.cu +++ b/modules/gpu/src/cuda/hough.cu @@ -48,15 +48,18 @@ namespace cv { namespace gpu { namespace device { namespace hough { - __device__ unsigned int g_counter; + __device__ int g_counter; + + //////////////////////////////////////////////////////////////////////// + // buildPointList const int PIXELS_PER_THREAD = 16; __global__ void buildPointList(const DevMem2Db src, unsigned int* list) { - __shared__ unsigned int s_queues[4][32 * PIXELS_PER_THREAD]; - __shared__ unsigned int s_qsize[4]; - __shared__ unsigned int s_start[4]; + __shared__ int s_queues[4][32 * PIXELS_PER_THREAD]; + __shared__ int s_qsize[4]; + __shared__ int s_start[4]; const int x = blockIdx.x * blockDim.x * PIXELS_PER_THREAD + threadIdx.x; const int y = blockIdx.y * blockDim.y + threadIdx.y; @@ -75,7 +78,7 @@ namespace cv { namespace gpu { namespace device if (src(y, xx)) { const unsigned int val = (y << 16) | xx; - int qidx = Emulation::smem::atomicInc(&s_qsize[threadIdx.y], (unsigned int)(-1)); + const int qidx = Emulation::smem::atomicAdd(&s_qsize[threadIdx.y], 1); s_queues[threadIdx.y][qidx] = val; } } @@ -86,15 +89,15 @@ namespace cv { namespace gpu { namespace device if (threadIdx.x == 0 && threadIdx.y == 0) { // find how many items are stored in each list - unsigned int total_size = 0; + int total_size = 0; for (int i = 0; i < blockDim.y; ++i) { s_start[i] = total_size; total_size += s_qsize[i]; } - //calculate the offset in the global list - const unsigned int global_offset = atomicAdd(&g_counter, total_size); + // calculate the offset in the global list + const int global_offset = atomicAdd(&g_counter, total_size); for (int i = 0; i < blockDim.y; ++i) s_start[i] += global_offset; } @@ -102,20 +105,20 @@ namespace cv { namespace gpu { namespace device __syncthreads(); // copy local queues to global queue - const unsigned int qsize = s_qsize[threadIdx.y]; + const int qsize = s_qsize[threadIdx.y]; for(int i = threadIdx.x; i < qsize; i += blockDim.x) { - unsigned int val = s_queues[threadIdx.y][i]; + const unsigned int val = s_queues[threadIdx.y][i]; list[s_start[threadIdx.y] + i] = val; } } - unsigned int buildPointList_gpu(DevMem2Db src, unsigned int* list) + int buildPointList_gpu(DevMem2Db src, unsigned int* list) { void* counter_ptr; cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, g_counter) ); - cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(unsigned int)) ); + cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) ); const dim3 block(32, 4); const dim3 grid(divUp(src.cols, block.x * PIXELS_PER_THREAD), divUp(src.rows, block.y)); @@ -127,19 +130,48 @@ namespace cv { namespace gpu { namespace device cudaSafeCall( cudaDeviceSynchronize() ); - unsigned int total_count; - cudaSafeCall( cudaMemcpy(&total_count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) ); + int total_count; + cudaSafeCall( cudaMemcpy(&total_count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) ); return total_count; } - __global__ void linesAccum(const unsigned int* list, const unsigned int count, PtrStep_ accum, - const float irho, const float theta, const int numrho) - { - extern __shared__ unsigned int smem[]; + //////////////////////////////////////////////////////////////////////// + // linesAccum - for (int i = threadIdx.x; i < numrho; i += blockDim.x) + __global__ void linesAccumGlobal(const unsigned int* list, const int count, PtrStepi accum, const float irho, const float theta, const int numrho) + { + const int n = blockIdx.x; + const float ang = n * theta; + + float sin_ang; + float cos_ang; + sincosf(ang, &sin_ang, &cos_ang); + + const float tabSin = sin_ang * irho; + const float tabCos = cos_ang * irho; + + for (int i = threadIdx.x; i < count; i += blockDim.x) + { + const unsigned int qvalue = list[i]; + + const int x = (qvalue & 0x0000FFFF); + const int y = (qvalue >> 16) & 0x0000FFFF; + + int r = __float2int_rn(x * tabCos + y * tabSin); + r += (numrho - 1) / 2; + + ::atomicAdd(accum.ptr(n + 1) + r + 1, 1); + } + } + + __global__ void linesAccumShared(const unsigned int* list, const int count, PtrStepi accum, const float irho, const float theta, const int numrho) + { + extern __shared__ int smem[]; + + for (int i = threadIdx.x; i < numrho + 1; i += blockDim.x) smem[i] = 0; + __syncthreads(); const int n = blockIdx.x; @@ -154,41 +186,48 @@ namespace cv { namespace gpu { namespace device for (int i = threadIdx.x; i < count; i += blockDim.x) { - // read one element from global memory const unsigned int qvalue = list[i]; - const unsigned int x = (qvalue & 0x0000FFFF); - const unsigned int y = (qvalue >> 16) & 0x0000FFFF; + + const int x = (qvalue & 0x0000FFFF); + const int y = (qvalue >> 16) & 0x0000FFFF; int r = __float2int_rn(x * tabCos + y * tabSin); r += (numrho - 1) / 2; - Emulation::smem::atomicInc(&smem[r], (unsigned int)(-1)); + Emulation::smem::atomicAdd(&smem[r + 1], 1); } + __syncthreads(); for (int i = threadIdx.x; i < numrho; i += blockDim.x) - accum(n + 1, i + 1) = smem[i]; + accum(n + 1, i) = smem[i]; } - void linesAccum_gpu(const unsigned int* list, unsigned int count, DevMem2D_ accum, float rho, float theta) + void linesAccum_gpu(const unsigned int* list, int count, DevMem2Di accum, float rho, float theta, size_t sharedMemPerBlock) { const dim3 block(1024); const dim3 grid(accum.rows - 2); - cudaSafeCall( cudaFuncSetCacheConfig(linesAccum, cudaFuncCachePreferShared) ); + cudaSafeCall( cudaFuncSetCacheConfig(linesAccumShared, cudaFuncCachePreferShared) ); - size_t smem_size = (accum.cols - 2) * sizeof(unsigned int); + size_t smemSize = (accum.cols - 2) * sizeof(int); + + if (smemSize < sharedMemPerBlock - 1000) + linesAccumShared<<>>(list, count, accum, 1.0f / rho, theta, accum.cols - 2); + else + linesAccumGlobal<<>>(list, count, accum, 1.0f / rho, theta, accum.cols - 2); - linesAccum<<>>(list, count, accum, 1.0f / rho, theta, accum.cols - 2); cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaDeviceSynchronize() ); } - __global__ void linesGetResult(const DevMem2D_ accum, float2* out, int* voices, const int maxSize, - const float threshold, const float theta, const float rho, const int numrho) + //////////////////////////////////////////////////////////////////////// + // linesGetResult + + __global__ void linesGetResult(const DevMem2Di accum, float2* out, int* voices, const int maxSize, const float threshold, const float theta, const float rho, const int numrho) { - __shared__ unsigned int smem[8][32]; + __shared__ int smem[8][32]; int r = blockIdx.x * (blockDim.x - 2) + threadIdx.x; int n = blockIdx.y * (blockDim.y - 2) + threadIdx.y; @@ -211,10 +250,10 @@ namespace cv { namespace gpu { namespace device smem[threadIdx.y][threadIdx.x] > smem[threadIdx.y][threadIdx.x - 1] && smem[threadIdx.y][threadIdx.x] >= smem[threadIdx.y][threadIdx.x + 1]) { - float radius = (r - (numrho - 1) * 0.5f) * rho; - float angle = n * theta; + const float radius = (r - (numrho - 1) * 0.5f) * rho; + const float angle = n * theta; - const unsigned int ind = atomicInc(&g_counter, (unsigned int)(-1)); + const int ind = ::atomicAdd(&g_counter, 1); if (ind < maxSize) { out[ind] = make_float2(radius, angle); @@ -223,13 +262,12 @@ namespace cv { namespace gpu { namespace device } } - unsigned int linesGetResult_gpu(DevMem2D_ accum, float2* out, int* voices, unsigned int maxSize, - float rho, float theta, float threshold, bool doSort) + int linesGetResult_gpu(DevMem2Di accum, float2* out, int* voices, int maxSize, float rho, float theta, float threshold, bool doSort) { void* counter_ptr; cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, g_counter) ); - cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(unsigned int)) ); + cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) ); const dim3 block(32, 8); const dim3 grid(divUp(accum.cols, block.x - 2), divUp(accum.rows, block.y - 2)); @@ -239,8 +277,8 @@ namespace cv { namespace gpu { namespace device cudaSafeCall( cudaDeviceSynchronize() ); - unsigned int total_count; - cudaSafeCall( cudaMemcpy(&total_count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) ); + int total_count; + cudaSafeCall( cudaMemcpy(&total_count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) ); total_count = ::min(total_count, maxSize); diff --git a/modules/gpu/src/hough.cpp b/modules/gpu/src/hough.cpp index 94dbe8be3..38e9c0166 100644 --- a/modules/gpu/src/hough.cpp +++ b/modules/gpu/src/hough.cpp @@ -56,9 +56,9 @@ namespace cv { namespace gpu { namespace device { namespace hough { - unsigned int buildPointList_gpu(DevMem2Db src, unsigned int* list); - void linesAccum_gpu(const unsigned int* list, unsigned int count, DevMem2D_ accum, float rho, float theta); - unsigned int linesGetResult_gpu(DevMem2D_ accum, float2* out, int* voices, unsigned int maxSize, float rho, float theta, float threshold, bool doSort); + int buildPointList_gpu(DevMem2Db src, unsigned int* list); + void linesAccum_gpu(const unsigned int* list, int count, DevMem2Di accum, float rho, float theta, size_t sharedMemPerBlock); + int linesGetResult_gpu(DevMem2Di accum, float2* out, int* voices, int maxSize, float rho, float theta, float threshold, bool doSort); } }}} @@ -71,16 +71,21 @@ void cv::gpu::HoughLinesTransform(const GpuMat& src, GpuMat& accum, GpuMat& buf, CV_Assert(src.rows < std::numeric_limits::max()); ensureSizeIsEnough(1, src.size().area(), CV_32SC1, buf); - unsigned int count = buildPointList_gpu(src, buf.ptr()); + + const int count = buildPointList_gpu(src, buf.ptr()); const int numangle = cvRound(CV_PI / theta); const int numrho = cvRound(((src.cols + src.rows) * 2 + 1) / rho); + CV_Assert(numangle > 0 && numrho > 0); + ensureSizeIsEnough(numangle + 2, numrho + 2, CV_32SC1, accum); accum.setTo(cv::Scalar::all(0)); + cv::gpu::DeviceInfo devInfo; + if (count > 0) - linesAccum_gpu(buf.ptr(), count, accum, rho, theta); + linesAccum_gpu(buf.ptr(), count, accum, rho, theta, devInfo.sharedMemPerBlock()); } void cv::gpu::HoughLinesGet(const GpuMat& accum, GpuMat& lines, float rho, float theta, int threshold, bool doSort, int maxLines) @@ -90,7 +95,8 @@ void cv::gpu::HoughLinesGet(const GpuMat& accum, GpuMat& lines, float rho, float CV_Assert(accum.type() == CV_32SC1); ensureSizeIsEnough(2, maxLines, CV_32FC2, lines); - unsigned int count = hough::linesGetResult_gpu(accum, lines.ptr(0), lines.ptr(1), maxLines, rho, theta, threshold, doSort); + + int count = hough::linesGetResult_gpu(accum, lines.ptr(0), lines.ptr(1), maxLines, rho, theta, threshold, doSort); if (count > 0) lines.cols = count; diff --git a/modules/gpu/src/opencv2/gpu/device/emulation.hpp b/modules/gpu/src/opencv2/gpu/device/emulation.hpp index e116c50ad..1a6f5794c 100644 --- a/modules/gpu/src/opencv2/gpu/device/emulation.hpp +++ b/modules/gpu/src/opencv2/gpu/device/emulation.hpp @@ -99,7 +99,7 @@ namespace cv { namespace gpu { namespace device } template - static __device__ __forceinline__ void atomicAdd(T* address, T val) + static __device__ __forceinline__ T atomicAdd(T* address, T val) { #if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120) T count; @@ -110,8 +110,10 @@ namespace cv { namespace gpu { namespace device count = tag | (count + val); *address = count; } while (*address != count); + + return (count & TAG_MASK) - val; #else - ::atomicAdd(address, val); + return ::atomicAdd(address, val); #endif } @@ -134,4 +136,4 @@ namespace cv { namespace gpu { namespace device }; }}} // namespace cv { namespace gpu { namespace device -#endif /* OPENCV_GPU_EMULATION_HPP_ */ \ No newline at end of file +#endif /* OPENCV_GPU_EMULATION_HPP_ */