added first version of gpu::countNonZero for all data types; it doesn't support compute capability 1.0 yet; also fixed some small bugs

Alexey Spizhevoy
2010-11-26 17:12:48 +00:00
parent e470246ab5
commit 7e2cc1be1b
4 changed files with 237 additions and 9 deletions


@@ -615,6 +615,8 @@ namespace cv { namespace gpu { namespace mathfunc
} // namespace minmax
///////////////////////////////////////////////////////////////////////////////
// minMaxLoc
namespace minmaxloc {
@@ -868,4 +870,126 @@ namespace cv { namespace gpu { namespace mathfunc
} // namespace minmaxloc
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// countNonZero
namespace countnonzero
{

    __constant__ int ctwidth;
    __constant__ int ctheight;

    static const unsigned int czero = 0;

    __device__ unsigned int blocks_finished;


    void estimate_thread_cfg(dim3& threads, dim3& grid)
    {
        threads = dim3(64, 4);
        grid = dim3(6, 5);
    }
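
    // Size of the intermediate buffer: one 32-bit counter per launched block
    // (returned as a byte count in cols, with rows == 1).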
    void get_buf_size_required(int& cols, int& rows)
    {
        dim3 threads, grid;
        estimate_thread_cfg(threads, grid);
        cols = grid.x * grid.y * sizeof(int);
        rows = 1;
    }
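
    // Each thread processes up to ctwidth x ctheight elements, strided by the
    // block size; the per-thread tile extents are uploaded to constant memory
    // before the launch.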
    void estimate_kernel_consts(int cols, int rows, const dim3& threads, const dim3& grid)
    {
        int twidth = divUp(divUp(cols, grid.x), threads.x);
        int theight = divUp(divUp(rows, grid.y), threads.y);
        cudaSafeCall(cudaMemcpyToSymbol(ctwidth, &twidth, sizeof(twidth)));
        cudaSafeCall(cudaMemcpyToSymbol(ctheight, &theight, sizeof(theight)));
    }
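
    // Counts non-zero elements in two stages: each block reduces its private
    // counts in shared memory and writes a partial sum to global memory; the
    // last block to finish (tracked with an atomic ticket) reduces the
    // per-block partial sums into count[0]. atomicInc on global memory needs
    // compute capability 1.1, which is why 1.0 is not supported.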
    template <int nthreads, typename T>
    __global__ void count_non_zero_kernel(const DevMem2D src, volatile unsigned int* count)
    {
        __shared__ unsigned int scount[nthreads];

        unsigned int x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x;
        unsigned int y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;
        unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;

        unsigned int cnt = 0;
        for (unsigned int y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y)
        {
            const T* ptr = (const T*)src.ptr(y0 + y * blockDim.y);
            for (unsigned int x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x)
                cnt += ptr[x0 + x * blockDim.x] != 0;
        }

        scount[tid] = cnt;
        __syncthreads();

        for (unsigned int step = nthreads / 2; step > 0; step >>= 1)
        {
            if (tid < step) scount[tid] += scount[tid + step];
            __syncthreads();
        }
        __shared__ bool is_last;

        if (tid == 0)
        {
            // Publish this block's partial sum, then take a ticket; the block
            // that draws the last ticket performs the final reduction.
            count[blockIdx.y * gridDim.x + blockIdx.x] = scount[0];
            __threadfence();

            unsigned int ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);
            is_last = ticket == gridDim.x * gridDim.y - 1;
        }

        __syncthreads();

        if (is_last)
        {
            scount[tid] = tid < gridDim.x * gridDim.y ? count[tid] : 0;
            __syncthreads();

            for (unsigned int step = nthreads / 2; step > 0; step >>= 1)
            {
                if (tid < step) scount[tid] += scount[tid + step];
                __syncthreads();
            }

            if (tid == 0) count[0] = scount[0];
        }
    }
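
    // Host-side entry point: uploads the kernel constants, resets the block
    // completion counter, launches the kernel, and reads back the final count.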
    template <typename T>
    int count_non_zero_caller(const DevMem2D src, PtrStep buf)
    {
        dim3 threads, grid;
        estimate_thread_cfg(threads, grid);
        estimate_kernel_consts(src.cols, src.rows, threads, grid);

        unsigned int* count_buf = (unsigned int*)buf.ptr(0);

        cudaSafeCall(cudaMemcpyToSymbol(blocks_finished, &czero, sizeof(blocks_finished)));
        count_non_zero_kernel<256, T><<<grid, threads>>>(src, count_buf);
        cudaSafeCall(cudaThreadSynchronize());

        unsigned int count;
        cudaSafeCall(cudaMemcpy(&count, count_buf, sizeof(int), cudaMemcpyDeviceToHost));

        return count;
    }

    template int count_non_zero_caller<unsigned char>(const DevMem2D, PtrStep);
    template int count_non_zero_caller<signed char>(const DevMem2D, PtrStep);
    template int count_non_zero_caller<unsigned short>(const DevMem2D, PtrStep);
    template int count_non_zero_caller<signed short>(const DevMem2D, PtrStep);
    template int count_non_zero_caller<int>(const DevMem2D, PtrStep);
    template int count_non_zero_caller<float>(const DevMem2D, PtrStep);
    template int count_non_zero_caller<double>(const DevMem2D, PtrStep);
} // namespace countnonzero
}}}
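
For context, a minimal host-side sketch of how a gpu::countNonZero wrapper might dispatch to count_non_zero_caller by element depth. This wrapper is not part of this commit; the forward declarations, the CV_8U scratch buffer, and the implicit GpuMat-to-DevMem2D/PtrStep conversions are assumptions based on the OpenCV 2.x gpu API.

#include <opencv2/gpu/gpu.hpp>

namespace cv { namespace gpu { namespace mathfunc { namespace countnonzero
{
    // Declarations of the entry points defined in the .cu file above.
    void get_buf_size_required(int& cols, int& rows);
    template <typename T> int count_non_zero_caller(const DevMem2D src, PtrStep buf);
}}}}

// Hypothetical wrapper, sketched for illustration only.
int countNonZero(const cv::gpu::GpuMat& src)
{
    using namespace cv::gpu;
    using namespace cv::gpu::mathfunc::countnonzero;

    CV_Assert(src.channels() == 1);

    // Allocate the intermediate buffer the kernel expects (one counter per block).
    int bufcols, bufrows;
    get_buf_size_required(bufcols, bufrows);
    GpuMat buf(bufrows, bufcols, CV_8U);

    // Dispatch table indexed by depth (CV_8U .. CV_64F), in the same order as
    // the explicit instantiations at the end of the diff.
    typedef int (*Caller)(const DevMem2D, PtrStep);
    static const Caller callers[] =
    {
        count_non_zero_caller<unsigned char>,  count_non_zero_caller<signed char>,
        count_non_zero_caller<unsigned short>, count_non_zero_caller<signed short>,
        count_non_zero_caller<int>,
        count_non_zero_caller<float>,
        count_non_zero_caller<double>
    };
    return callers[src.depth()](src, buf);
}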