added linesAccumGlobal kernel

This commit is contained in:
Vladislav Vinogradov 2012-08-15 13:18:35 +04:00
parent 7ae94c571c
commit 7928cec670
5 changed files with 98 additions and 49 deletions

View File

@ -112,6 +112,8 @@ namespace cv { namespace gpu
int multiProcessorCount() const { return multi_processor_count_; } int multiProcessorCount() const { return multi_processor_count_; }
size_t sharedMemPerBlock() const { return sharedMemPerBlock_; }
size_t freeMemory() const; size_t freeMemory() const;
size_t totalMemory() const; size_t totalMemory() const;
@ -133,6 +135,7 @@ namespace cv { namespace gpu
int multi_processor_count_; int multi_processor_count_;
int majorVersion_; int majorVersion_;
int minorVersion_; int minorVersion_;
size_t sharedMemPerBlock_;
}; };
CV_EXPORTS void printCudaDeviceInfo(int device); CV_EXPORTS void printCudaDeviceInfo(int device);

View File

@ -42,7 +42,6 @@
#include "precomp.hpp" #include "precomp.hpp"
#include "opencv2/core/gpumat.hpp" #include "opencv2/core/gpumat.hpp"
#include <iostream> #include <iostream>
#ifdef HAVE_CUDA #ifdef HAVE_CUDA
@ -301,6 +300,7 @@ void cv::gpu::DeviceInfo::query()
multi_processor_count_ = prop.multiProcessorCount; multi_processor_count_ = prop.multiProcessorCount;
majorVersion_ = prop.major; majorVersion_ = prop.major;
minorVersion_ = prop.minor; minorVersion_ = prop.minor;
sharedMemPerBlock_ = prop.sharedMemPerBlock;
} }
void cv::gpu::DeviceInfo::queryMemory(size_t& free_memory, size_t& total_memory) const void cv::gpu::DeviceInfo::queryMemory(size_t& free_memory, size_t& total_memory) const

View File

@ -48,15 +48,18 @@ namespace cv { namespace gpu { namespace device
{ {
namespace hough namespace hough
{ {
__device__ unsigned int g_counter; __device__ int g_counter;
////////////////////////////////////////////////////////////////////////
// buildPointList
const int PIXELS_PER_THREAD = 16; const int PIXELS_PER_THREAD = 16;
__global__ void buildPointList(const DevMem2Db src, unsigned int* list) __global__ void buildPointList(const DevMem2Db src, unsigned int* list)
{ {
__shared__ unsigned int s_queues[4][32 * PIXELS_PER_THREAD]; __shared__ int s_queues[4][32 * PIXELS_PER_THREAD];
__shared__ unsigned int s_qsize[4]; __shared__ int s_qsize[4];
__shared__ unsigned int s_start[4]; __shared__ int s_start[4];
const int x = blockIdx.x * blockDim.x * PIXELS_PER_THREAD + threadIdx.x; const int x = blockIdx.x * blockDim.x * PIXELS_PER_THREAD + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y; const int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -75,7 +78,7 @@ namespace cv { namespace gpu { namespace device
if (src(y, xx)) if (src(y, xx))
{ {
const unsigned int val = (y << 16) | xx; const unsigned int val = (y << 16) | xx;
int qidx = Emulation::smem::atomicInc(&s_qsize[threadIdx.y], (unsigned int)(-1)); const int qidx = Emulation::smem::atomicAdd(&s_qsize[threadIdx.y], 1);
s_queues[threadIdx.y][qidx] = val; s_queues[threadIdx.y][qidx] = val;
} }
} }
@ -86,15 +89,15 @@ namespace cv { namespace gpu { namespace device
if (threadIdx.x == 0 && threadIdx.y == 0) if (threadIdx.x == 0 && threadIdx.y == 0)
{ {
// find how many items are stored in each list // find how many items are stored in each list
unsigned int total_size = 0; int total_size = 0;
for (int i = 0; i < blockDim.y; ++i) for (int i = 0; i < blockDim.y; ++i)
{ {
s_start[i] = total_size; s_start[i] = total_size;
total_size += s_qsize[i]; total_size += s_qsize[i];
} }
//calculate the offset in the global list // calculate the offset in the global list
const unsigned int global_offset = atomicAdd(&g_counter, total_size); const int global_offset = atomicAdd(&g_counter, total_size);
for (int i = 0; i < blockDim.y; ++i) for (int i = 0; i < blockDim.y; ++i)
s_start[i] += global_offset; s_start[i] += global_offset;
} }
@ -102,20 +105,20 @@ namespace cv { namespace gpu { namespace device
__syncthreads(); __syncthreads();
// copy local queues to global queue // copy local queues to global queue
const unsigned int qsize = s_qsize[threadIdx.y]; const int qsize = s_qsize[threadIdx.y];
for(int i = threadIdx.x; i < qsize; i += blockDim.x) for(int i = threadIdx.x; i < qsize; i += blockDim.x)
{ {
unsigned int val = s_queues[threadIdx.y][i]; const unsigned int val = s_queues[threadIdx.y][i];
list[s_start[threadIdx.y] + i] = val; list[s_start[threadIdx.y] + i] = val;
} }
} }
unsigned int buildPointList_gpu(DevMem2Db src, unsigned int* list) int buildPointList_gpu(DevMem2Db src, unsigned int* list)
{ {
void* counter_ptr; void* counter_ptr;
cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, g_counter) ); cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, g_counter) );
cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(unsigned int)) ); cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) );
const dim3 block(32, 4); const dim3 block(32, 4);
const dim3 grid(divUp(src.cols, block.x * PIXELS_PER_THREAD), divUp(src.rows, block.y)); const dim3 grid(divUp(src.cols, block.x * PIXELS_PER_THREAD), divUp(src.rows, block.y));
@ -127,19 +130,48 @@ namespace cv { namespace gpu { namespace device
cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaDeviceSynchronize() );
unsigned int total_count; int total_count;
cudaSafeCall( cudaMemcpy(&total_count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) ); cudaSafeCall( cudaMemcpy(&total_count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) );
return total_count; return total_count;
} }
__global__ void linesAccum(const unsigned int* list, const unsigned int count, PtrStep_<unsigned int> accum, ////////////////////////////////////////////////////////////////////////
const float irho, const float theta, const int numrho) // linesAccum
{
extern __shared__ unsigned int smem[];
for (int i = threadIdx.x; i < numrho; i += blockDim.x) __global__ void linesAccumGlobal(const unsigned int* list, const int count, PtrStepi accum, const float irho, const float theta, const int numrho)
{
const int n = blockIdx.x;
const float ang = n * theta;
float sin_ang;
float cos_ang;
sincosf(ang, &sin_ang, &cos_ang);
const float tabSin = sin_ang * irho;
const float tabCos = cos_ang * irho;
for (int i = threadIdx.x; i < count; i += blockDim.x)
{
const unsigned int qvalue = list[i];
const int x = (qvalue & 0x0000FFFF);
const int y = (qvalue >> 16) & 0x0000FFFF;
int r = __float2int_rn(x * tabCos + y * tabSin);
r += (numrho - 1) / 2;
::atomicAdd(accum.ptr(n + 1) + r + 1, 1);
}
}
__global__ void linesAccumShared(const unsigned int* list, const int count, PtrStepi accum, const float irho, const float theta, const int numrho)
{
extern __shared__ int smem[];
for (int i = threadIdx.x; i < numrho + 1; i += blockDim.x)
smem[i] = 0; smem[i] = 0;
__syncthreads(); __syncthreads();
const int n = blockIdx.x; const int n = blockIdx.x;
@ -154,41 +186,48 @@ namespace cv { namespace gpu { namespace device
for (int i = threadIdx.x; i < count; i += blockDim.x) for (int i = threadIdx.x; i < count; i += blockDim.x)
{ {
// read one element from global memory
const unsigned int qvalue = list[i]; const unsigned int qvalue = list[i];
const unsigned int x = (qvalue & 0x0000FFFF);
const unsigned int y = (qvalue >> 16) & 0x0000FFFF; const int x = (qvalue & 0x0000FFFF);
const int y = (qvalue >> 16) & 0x0000FFFF;
int r = __float2int_rn(x * tabCos + y * tabSin); int r = __float2int_rn(x * tabCos + y * tabSin);
r += (numrho - 1) / 2; r += (numrho - 1) / 2;
Emulation::smem::atomicInc(&smem[r], (unsigned int)(-1)); Emulation::smem::atomicAdd(&smem[r + 1], 1);
} }
__syncthreads(); __syncthreads();
for (int i = threadIdx.x; i < numrho; i += blockDim.x) for (int i = threadIdx.x; i < numrho; i += blockDim.x)
accum(n + 1, i + 1) = smem[i]; accum(n + 1, i) = smem[i];
} }
void linesAccum_gpu(const unsigned int* list, unsigned int count, DevMem2D_<unsigned int> accum, float rho, float theta) void linesAccum_gpu(const unsigned int* list, int count, DevMem2Di accum, float rho, float theta, size_t sharedMemPerBlock)
{ {
const dim3 block(1024); const dim3 block(1024);
const dim3 grid(accum.rows - 2); const dim3 grid(accum.rows - 2);
cudaSafeCall( cudaFuncSetCacheConfig(linesAccum, cudaFuncCachePreferShared) ); cudaSafeCall( cudaFuncSetCacheConfig(linesAccumShared, cudaFuncCachePreferShared) );
size_t smem_size = (accum.cols - 2) * sizeof(unsigned int); size_t smemSize = (accum.cols - 2) * sizeof(int);
if (smemSize < sharedMemPerBlock - 1000)
linesAccumShared<<<grid, block, smemSize>>>(list, count, accum, 1.0f / rho, theta, accum.cols - 2);
else
linesAccumGlobal<<<grid, block>>>(list, count, accum, 1.0f / rho, theta, accum.cols - 2);
linesAccum<<<grid, block, smem_size>>>(list, count, accum, 1.0f / rho, theta, accum.cols - 2);
cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaDeviceSynchronize() );
} }
__global__ void linesGetResult(const DevMem2D_<unsigned int> accum, float2* out, int* voices, const int maxSize, ////////////////////////////////////////////////////////////////////////
const float threshold, const float theta, const float rho, const int numrho) // linesGetResult
__global__ void linesGetResult(const DevMem2Di accum, float2* out, int* voices, const int maxSize, const float threshold, const float theta, const float rho, const int numrho)
{ {
__shared__ unsigned int smem[8][32]; __shared__ int smem[8][32];
int r = blockIdx.x * (blockDim.x - 2) + threadIdx.x; int r = blockIdx.x * (blockDim.x - 2) + threadIdx.x;
int n = blockIdx.y * (blockDim.y - 2) + threadIdx.y; int n = blockIdx.y * (blockDim.y - 2) + threadIdx.y;
@ -211,10 +250,10 @@ namespace cv { namespace gpu { namespace device
smem[threadIdx.y][threadIdx.x] > smem[threadIdx.y][threadIdx.x - 1] && smem[threadIdx.y][threadIdx.x] > smem[threadIdx.y][threadIdx.x - 1] &&
smem[threadIdx.y][threadIdx.x] >= smem[threadIdx.y][threadIdx.x + 1]) smem[threadIdx.y][threadIdx.x] >= smem[threadIdx.y][threadIdx.x + 1])
{ {
float radius = (r - (numrho - 1) * 0.5f) * rho; const float radius = (r - (numrho - 1) * 0.5f) * rho;
float angle = n * theta; const float angle = n * theta;
const unsigned int ind = atomicInc(&g_counter, (unsigned int)(-1)); const int ind = ::atomicAdd(&g_counter, 1);
if (ind < maxSize) if (ind < maxSize)
{ {
out[ind] = make_float2(radius, angle); out[ind] = make_float2(radius, angle);
@ -223,13 +262,12 @@ namespace cv { namespace gpu { namespace device
} }
} }
unsigned int linesGetResult_gpu(DevMem2D_<unsigned int> accum, float2* out, int* voices, unsigned int maxSize, int linesGetResult_gpu(DevMem2Di accum, float2* out, int* voices, int maxSize, float rho, float theta, float threshold, bool doSort)
float rho, float theta, float threshold, bool doSort)
{ {
void* counter_ptr; void* counter_ptr;
cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, g_counter) ); cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, g_counter) );
cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(unsigned int)) ); cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) );
const dim3 block(32, 8); const dim3 block(32, 8);
const dim3 grid(divUp(accum.cols, block.x - 2), divUp(accum.rows, block.y - 2)); const dim3 grid(divUp(accum.cols, block.x - 2), divUp(accum.rows, block.y - 2));
@ -239,8 +277,8 @@ namespace cv { namespace gpu { namespace device
cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaDeviceSynchronize() );
unsigned int total_count; int total_count;
cudaSafeCall( cudaMemcpy(&total_count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) ); cudaSafeCall( cudaMemcpy(&total_count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) );
total_count = ::min(total_count, maxSize); total_count = ::min(total_count, maxSize);

View File

@ -56,9 +56,9 @@ namespace cv { namespace gpu { namespace device
{ {
namespace hough namespace hough
{ {
unsigned int buildPointList_gpu(DevMem2Db src, unsigned int* list); int buildPointList_gpu(DevMem2Db src, unsigned int* list);
void linesAccum_gpu(const unsigned int* list, unsigned int count, DevMem2D_<unsigned int> accum, float rho, float theta); void linesAccum_gpu(const unsigned int* list, int count, DevMem2Di accum, float rho, float theta, size_t sharedMemPerBlock);
unsigned int linesGetResult_gpu(DevMem2D_<uint> accum, float2* out, int* voices, unsigned int maxSize, float rho, float theta, float threshold, bool doSort); int linesGetResult_gpu(DevMem2Di accum, float2* out, int* voices, int maxSize, float rho, float theta, float threshold, bool doSort);
} }
}}} }}}
@ -71,16 +71,21 @@ void cv::gpu::HoughLinesTransform(const GpuMat& src, GpuMat& accum, GpuMat& buf,
CV_Assert(src.rows < std::numeric_limits<unsigned short>::max()); CV_Assert(src.rows < std::numeric_limits<unsigned short>::max());
ensureSizeIsEnough(1, src.size().area(), CV_32SC1, buf); ensureSizeIsEnough(1, src.size().area(), CV_32SC1, buf);
unsigned int count = buildPointList_gpu(src, buf.ptr<unsigned int>());
const int count = buildPointList_gpu(src, buf.ptr<unsigned int>());
const int numangle = cvRound(CV_PI / theta); const int numangle = cvRound(CV_PI / theta);
const int numrho = cvRound(((src.cols + src.rows) * 2 + 1) / rho); const int numrho = cvRound(((src.cols + src.rows) * 2 + 1) / rho);
CV_Assert(numangle > 0 && numrho > 0);
ensureSizeIsEnough(numangle + 2, numrho + 2, CV_32SC1, accum); ensureSizeIsEnough(numangle + 2, numrho + 2, CV_32SC1, accum);
accum.setTo(cv::Scalar::all(0)); accum.setTo(cv::Scalar::all(0));
cv::gpu::DeviceInfo devInfo;
if (count > 0) if (count > 0)
linesAccum_gpu(buf.ptr<unsigned int>(), count, accum, rho, theta); linesAccum_gpu(buf.ptr<unsigned int>(), count, accum, rho, theta, devInfo.sharedMemPerBlock());
} }
void cv::gpu::HoughLinesGet(const GpuMat& accum, GpuMat& lines, float rho, float theta, int threshold, bool doSort, int maxLines) void cv::gpu::HoughLinesGet(const GpuMat& accum, GpuMat& lines, float rho, float theta, int threshold, bool doSort, int maxLines)
@ -90,7 +95,8 @@ void cv::gpu::HoughLinesGet(const GpuMat& accum, GpuMat& lines, float rho, float
CV_Assert(accum.type() == CV_32SC1); CV_Assert(accum.type() == CV_32SC1);
ensureSizeIsEnough(2, maxLines, CV_32FC2, lines); ensureSizeIsEnough(2, maxLines, CV_32FC2, lines);
unsigned int count = hough::linesGetResult_gpu(accum, lines.ptr<float2>(0), lines.ptr<int>(1), maxLines, rho, theta, threshold, doSort);
int count = hough::linesGetResult_gpu(accum, lines.ptr<float2>(0), lines.ptr<int>(1), maxLines, rho, theta, threshold, doSort);
if (count > 0) if (count > 0)
lines.cols = count; lines.cols = count;

View File

@ -99,7 +99,7 @@ namespace cv { namespace gpu { namespace device
} }
template<typename T> template<typename T>
static __device__ __forceinline__ void atomicAdd(T* address, T val) static __device__ __forceinline__ T atomicAdd(T* address, T val)
{ {
#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120) #if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
T count; T count;
@ -110,8 +110,10 @@ namespace cv { namespace gpu { namespace device
count = tag | (count + val); count = tag | (count + val);
*address = count; *address = count;
} while (*address != count); } while (*address != count);
return (count & TAG_MASK) - val;
#else #else
::atomicAdd(address, val); return ::atomicAdd(address, val);
#endif #endif
} }
@ -134,4 +136,4 @@ namespace cv { namespace gpu { namespace device
}; };
}}} // namespace cv { namespace gpu { namespace device }}} // namespace cv { namespace gpu { namespace device
#endif /* OPENCV_GPU_EMULATION_HPP_ */ #endif /* OPENCV_GPU_EMULATION_HPP_ */