added first version of gpu::countNonZero for all data types, it doesn't support compute capability 1.0 yet, also fixed some little bugs
This commit is contained in:
parent
e470246ab5
commit
7e2cc1be1b
@ -434,6 +434,11 @@ namespace cv
|
||||
CV_EXPORTS void minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc,
|
||||
GpuMat& valbuf, GpuMat& locbuf);
|
||||
|
||||
//! counts non-zero array elements
|
||||
CV_EXPORTS int countNonZero(const GpuMat& src);
|
||||
|
||||
//! counts non-zero array elements
|
||||
CV_EXPORTS int countNonZero(const GpuMat& src, GpuMat& buf);
|
||||
|
||||
//! transforms 8-bit unsigned integers using lookup table: dst(i)=lut(src(i))
|
||||
//! destination array will have the depth type as lut and the same channels number as source
|
||||
|
@ -69,6 +69,8 @@ void cv::gpu::minMax(const GpuMat&, double*, double*) { throw_nogpu(); }
|
||||
void cv::gpu::minMax(const GpuMat&, double*, double*, GpuMat&) { throw_nogpu(); }
|
||||
void cv::gpu::minMaxLoc(const GpuMat&, double*, double*, Point*, Point*) { throw_nogpu(); }
|
||||
void cv::gpu::minMaxLoc(const GpuMat&, double*, double*, Point*, Point*, GpuMat&, GpuMat&) { throw_nogpu(); }
|
||||
int cv::gpu::countNonZero(const GpuMat&) { throw_nogpu(); return 0; }
|
||||
int cv::gpu::countNonZero(const GpuMat&, GpuMat&) { throw_nogpu(); return 0; }
|
||||
void cv::gpu::LUT(const GpuMat&, const Mat&, GpuMat&) { throw_nogpu(); }
|
||||
void cv::gpu::exp(const GpuMat&, GpuMat&) { throw_nogpu(); }
|
||||
void cv::gpu::log(const GpuMat&, GpuMat&) { throw_nogpu(); }
|
||||
@ -527,7 +529,7 @@ void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, GpuMat&
|
||||
int major, minor;
|
||||
getComputeCapability(getDevice(), major, minor);
|
||||
|
||||
if (major >= 1 && minor >= 1)
|
||||
if (major > 1 || (major == 1 && minor >= 1))
|
||||
{
|
||||
switch (src_.type())
|
||||
{
|
||||
@ -538,7 +540,7 @@ void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, GpuMat&
|
||||
case CV_32S: min_max_caller<int>(src_, minVal, maxVal, buf); break;
|
||||
case CV_32F: min_max_caller<float>(src_, minVal, maxVal, buf); break;
|
||||
case CV_64F: min_max_caller<double>(src_, minVal, maxVal, buf); break;
|
||||
default: CV_Error(CV_StsBadArg, "Unsupported type");
|
||||
default: CV_Error(CV_StsBadArg, "minMax: unsupported type");
|
||||
}
|
||||
}
|
||||
else
|
||||
@ -551,7 +553,7 @@ void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, GpuMat&
|
||||
case CV_16S: min_max_caller_2steps<signed short>(src_, minVal, maxVal, buf); break;
|
||||
case CV_32S: min_max_caller_2steps<int>(src_, minVal, maxVal, buf); break;
|
||||
case CV_32F: min_max_caller_2steps<float>(src_, minVal, maxVal, buf); break;
|
||||
default: CV_Error(CV_StsBadArg, "Unsupported type");
|
||||
default: CV_Error(CV_StsBadArg, "minMax: unsupported type");
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -601,7 +603,7 @@ void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point
|
||||
int major, minor;
|
||||
getComputeCapability(getDevice(), major, minor);
|
||||
|
||||
if (major >= 1 && minor >= 1)
|
||||
if (major > 1 || (major == 1 && minor >= 1))
|
||||
{
|
||||
switch (src.type())
|
||||
{
|
||||
@ -612,7 +614,7 @@ void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point
|
||||
case CV_32S: min_max_loc_caller<int>(src, minVal, maxVal, minLoc_, maxLoc_, valbuf, locbuf); break;
|
||||
case CV_32F: min_max_loc_caller<float>(src, minVal, maxVal, minLoc_, maxLoc_, valbuf, locbuf); break;
|
||||
case CV_64F: min_max_loc_caller<double>(src, minVal, maxVal, minLoc_, maxLoc_, valbuf, locbuf); break;
|
||||
default: CV_Error(CV_StsBadArg, "Unsupported type");
|
||||
default: CV_Error(CV_StsBadArg, "minMaxLoc: unsupported type");
|
||||
}
|
||||
}
|
||||
else
|
||||
@ -625,7 +627,7 @@ void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point
|
||||
case CV_16S: min_max_loc_caller_2steps<signed short>(src, minVal, maxVal, minLoc_, maxLoc_, valbuf, locbuf); break;
|
||||
case CV_32S: min_max_loc_caller_2steps<int>(src, minVal, maxVal, minLoc_, maxLoc_, valbuf, locbuf); break;
|
||||
case CV_32F: min_max_loc_caller_2steps<float>(src, minVal, maxVal, minLoc_, maxLoc_, valbuf, locbuf); break;
|
||||
default: CV_Error(CV_StsBadArg, "Unsupported type");
|
||||
default: CV_Error(CV_StsBadArg, "minMaxLoc: unsupported type");
|
||||
}
|
||||
}
|
||||
|
||||
@ -633,6 +635,51 @@ void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point
|
||||
if (maxLoc) { maxLoc->x = maxLoc_[0]; maxLoc->y = maxLoc_[1]; }
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// Count non zero
|
||||
|
||||
namespace cv { namespace gpu { namespace mathfunc { namespace countnonzero {
|
||||
|
||||
void get_buf_size_required(int& cols, int& rows);
|
||||
|
||||
template <typename T>
|
||||
int count_non_zero_caller(const DevMem2D src, PtrStep buf);
|
||||
|
||||
template <typename T>
|
||||
int count_non_zero_caller_2steps(const DevMem2D src, PtrStep buf);
|
||||
|
||||
}}}}
|
||||
|
||||
int cv::gpu::countNonZero(const GpuMat& src)
|
||||
{
|
||||
GpuMat buf;
|
||||
return countNonZero(src, buf);
|
||||
}
|
||||
|
||||
int cv::gpu::countNonZero(const GpuMat& src, GpuMat& buf)
|
||||
{
|
||||
using namespace mathfunc::countnonzero;
|
||||
CV_Assert(src.channels() == 1);
|
||||
|
||||
Size buf_size;
|
||||
get_buf_size_required(buf_size.width, buf_size.height);
|
||||
buf.create(buf_size, CV_8U);
|
||||
|
||||
switch (src.type())
|
||||
{
|
||||
case CV_8U: return count_non_zero_caller<unsigned char>(src, buf);
|
||||
case CV_8S: return count_non_zero_caller<signed char>(src, buf);
|
||||
case CV_16U: return count_non_zero_caller<unsigned short>(src, buf);
|
||||
case CV_16S: return count_non_zero_caller<signed short>(src, buf);
|
||||
case CV_32S: return count_non_zero_caller<int>(src, buf);
|
||||
case CV_32F: return count_non_zero_caller<float>(src, buf);
|
||||
case CV_64F: return count_non_zero_caller<double>(src, buf);
|
||||
}
|
||||
|
||||
CV_Error(CV_StsBadArg, "countNonZero: unsupported type");
|
||||
return 0;
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// LUT
|
||||
|
||||
|
@ -615,6 +615,8 @@ namespace cv { namespace gpu { namespace mathfunc
|
||||
|
||||
} // namespace minmax
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// minMaxLoc
|
||||
|
||||
namespace minmaxloc {
|
||||
|
||||
@ -868,4 +870,126 @@ namespace cv { namespace gpu { namespace mathfunc
|
||||
|
||||
} // namespace minmaxloc
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// countNonZero
|
||||
|
||||
namespace countnonzero
|
||||
{
|
||||
|
||||
__constant__ int ctwidth;
|
||||
__constant__ int ctheight;
|
||||
|
||||
static const unsigned int czero = 0;
|
||||
|
||||
__device__ unsigned int blocks_finished;
|
||||
|
||||
void estimate_thread_cfg(dim3& threads, dim3& grid)
|
||||
{
|
||||
threads = dim3(64, 4);
|
||||
grid = dim3(6, 5);
|
||||
}
|
||||
|
||||
|
||||
void get_buf_size_required(int& cols, int& rows)
|
||||
{
|
||||
dim3 threads, grid;
|
||||
estimate_thread_cfg(threads, grid);
|
||||
cols = grid.x * grid.y * sizeof(int);
|
||||
rows = 1;
|
||||
}
|
||||
|
||||
|
||||
void estimate_kernel_consts(int cols, int rows, const dim3& threads, const dim3& grid)
|
||||
{
|
||||
int twidth = divUp(divUp(cols, grid.x), threads.x);
|
||||
int theight = divUp(divUp(rows, grid.y), threads.y);
|
||||
cudaSafeCall(cudaMemcpyToSymbol(ctwidth, &twidth, sizeof(twidth)));
|
||||
cudaSafeCall(cudaMemcpyToSymbol(ctheight, &theight, sizeof(theight)));
|
||||
}
|
||||
|
||||
|
||||
template <int nthreads, typename T>
|
||||
__global__ void count_non_zero_kernel(const DevMem2D src, volatile unsigned int* count)
|
||||
{
|
||||
__shared__ unsigned int scount[nthreads];
|
||||
|
||||
unsigned int x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x;
|
||||
unsigned int y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;
|
||||
unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;
|
||||
|
||||
unsigned int cnt = 0;
|
||||
for (unsigned int y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y)
|
||||
{
|
||||
const T* ptr = (const T*)src.ptr(y0 + y * blockDim.y);
|
||||
for (unsigned int x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x)
|
||||
cnt += ptr[x0 + x * blockDim.x] != 0;
|
||||
}
|
||||
|
||||
scount[tid] = cnt;
|
||||
__syncthreads();
|
||||
|
||||
for (unsigned int step = nthreads / 2; step > 0; step >>= 1)
|
||||
{
|
||||
if (tid < step) scount[tid] += scount[tid + step];
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
__shared__ bool is_last;
|
||||
|
||||
if (tid == 0)
|
||||
{
|
||||
count[blockIdx.y * gridDim.x + blockIdx.x] = scount[0];
|
||||
__threadfence();
|
||||
|
||||
unsigned int ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);
|
||||
is_last = ticket == gridDim.x * gridDim.y - 1;
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if (is_last)
|
||||
{
|
||||
scount[tid] = tid < gridDim.x * gridDim.y ? count[tid] : 0;
|
||||
|
||||
for (unsigned int step = nthreads / 2; step > 0; step >>= 1)
|
||||
{
|
||||
if (tid < step) scount[tid] += scount[tid + step];
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
if (tid == 0) count[0] = scount[0];
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
template <typename T>
|
||||
int count_non_zero_caller(const DevMem2D src, PtrStep buf)
|
||||
{
|
||||
dim3 threads, grid;
|
||||
estimate_thread_cfg(threads, grid);
|
||||
estimate_kernel_consts(src.cols, src.rows, threads, grid);
|
||||
|
||||
unsigned int* count_buf = (unsigned int*)buf.ptr(0);
|
||||
|
||||
cudaSafeCall(cudaMemcpyToSymbol(blocks_finished, &czero, sizeof(blocks_finished)));
|
||||
count_non_zero_kernel<256, T><<<grid, threads>>>(src, count_buf);
|
||||
cudaSafeCall(cudaThreadSynchronize());
|
||||
|
||||
unsigned int count;
|
||||
cudaSafeCall(cudaMemcpy(&count, count_buf, sizeof(int), cudaMemcpyDeviceToHost));
|
||||
|
||||
return count;
|
||||
}
|
||||
|
||||
template int count_non_zero_caller<unsigned char>(const DevMem2D, PtrStep);
|
||||
template int count_non_zero_caller<signed char>(const DevMem2D, PtrStep);
|
||||
template int count_non_zero_caller<unsigned short>(const DevMem2D, PtrStep);
|
||||
template int count_non_zero_caller<signed short>(const DevMem2D, PtrStep);
|
||||
template int count_non_zero_caller<int>(const DevMem2D, PtrStep);
|
||||
template int count_non_zero_caller<float>(const DevMem2D, PtrStep);
|
||||
template int count_non_zero_caller<double>(const DevMem2D, PtrStep);
|
||||
|
||||
} // namespace countnonzero
|
||||
|
||||
}}}
|
||||
|
@ -689,9 +689,7 @@ struct CV_GpuMinMaxTest: public CvTest
|
||||
for (int cn = 1; cn <= 4; ++cn)
|
||||
for (int depth = CV_8U; depth <= depth_end; ++depth)
|
||||
{
|
||||
int rows = 1, cols = 3;
|
||||
test(rows, cols, cn, depth);
|
||||
for (int i = 0; i < 4; ++i)
|
||||
for (int i = 0; i < 1; ++i)
|
||||
{
|
||||
int rows = 1 + rand() % 1000;
|
||||
int cols = 1 + rand() % 1000;
|
||||
@ -821,6 +819,59 @@ struct CV_GpuMinMaxLocTest: public CvTest
|
||||
}
|
||||
};
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
// Count non zero
|
||||
struct CV_GpuCountNonZeroTest: CvTest
|
||||
{
|
||||
CV_GpuCountNonZeroTest(): CvTest("GPU-CountNonZeroTest", "countNonZero") {}
|
||||
|
||||
void run(int)
|
||||
{
|
||||
srand(0);
|
||||
int depth_end;
|
||||
int major, minor;
|
||||
cv::gpu::getComputeCapability(getDevice(), major, minor);
|
||||
|
||||
if (minor >= 1) depth_end = CV_64F; else depth_end = CV_32F;
|
||||
for (int depth = CV_8U; depth <= depth_end; ++depth)
|
||||
{
|
||||
for (int i = 0; i < 4; ++i)
|
||||
{
|
||||
int rows = 1 + rand() % 1000;
|
||||
int cols = 1 + rand() % 1000;
|
||||
test(rows, cols, depth);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void test(int rows, int cols, int depth)
|
||||
{
|
||||
cv::Mat src(rows, cols, depth);
|
||||
cv::RNG rng;
|
||||
if (depth == 5)
|
||||
rng.fill(src, RNG::UNIFORM, Scalar(-1000.f), Scalar(1000.f));
|
||||
else if (depth == 6)
|
||||
rng.fill(src, RNG::UNIFORM, Scalar(-1000.), Scalar(1000.));
|
||||
else
|
||||
for (int i = 0; i < src.rows; ++i)
|
||||
{
|
||||
Mat row(1, src.cols * src.elemSize(), CV_8U, src.ptr(i));
|
||||
rng.fill(row, RNG::UNIFORM, Scalar(0), Scalar(255));
|
||||
}
|
||||
|
||||
int n_gold = cv::countNonZero(src);
|
||||
int n = cv::gpu::countNonZero(cv::gpu::GpuMat(src));
|
||||
|
||||
if (n != n_gold)
|
||||
{
|
||||
ts->printf(CvTS::CONSOLE, "%d %d %d %d %d\n", n, n_gold, depth, cols, rows);
|
||||
n_gold = cv::countNonZero(src);
|
||||
}
|
||||
|
||||
CHECK(n == n_gold, CvTS::FAIL_INVALID_OUTPUT);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////
|
||||
/////////////////// tests registration /////////////////////////////////////
|
||||
@ -850,3 +901,4 @@ CV_GpuNppImageCartToPolarTest CV_GpuNppImageCartToPolar_test;
|
||||
CV_GpuNppImagePolarToCartTest CV_GpuNppImagePolarToCart_test;
|
||||
CV_GpuMinMaxTest CV_GpuMinMaxTest_test;
|
||||
CV_GpuMinMaxLocTest CV_GpuMinMaxLocTest_test;
|
||||
CV_GpuCountNonZeroTest CV_CountNonZero_test;
|
||||
|
Loading…
x
Reference in New Issue
Block a user