used new device layer for cv::gpu::minMaxLoc
This commit is contained in:
parent
e1aa2fd06c
commit
020624c481
@ -40,197 +40,88 @@
|
|||||||
//
|
//
|
||||||
//M*/
|
//M*/
|
||||||
|
|
||||||
#if !defined CUDA_DISABLER
|
#include "opencv2/opencv_modules.hpp"
|
||||||
|
|
||||||
#include "opencv2/core/cuda/common.hpp"
|
#ifndef HAVE_OPENCV_CUDEV
|
||||||
#include "opencv2/core/cuda/vec_traits.hpp"
|
|
||||||
#include "opencv2/core/cuda/vec_math.hpp"
|
|
||||||
#include "opencv2/core/cuda/functional.hpp"
|
|
||||||
#include "opencv2/core/cuda/reduce.hpp"
|
|
||||||
#include "opencv2/core/cuda/emulation.hpp"
|
|
||||||
#include "opencv2/core/cuda/limits.hpp"
|
|
||||||
#include "opencv2/core/cuda/utility.hpp"
|
|
||||||
|
|
||||||
using namespace cv::cuda;
|
#error "opencv_cudev is required"
|
||||||
using namespace cv::cuda::device;
|
|
||||||
|
|
||||||
namespace minMaxLoc
|
#else
|
||||||
|
|
||||||
|
#include "opencv2/cudaarithm.hpp"
|
||||||
|
#include "opencv2/cudev.hpp"
|
||||||
|
|
||||||
|
using namespace cv::cudev;
|
||||||
|
|
||||||
|
namespace
|
||||||
{
|
{
|
||||||
// To avoid shared bank conflicts we convert each value into value of
|
|
||||||
// appropriate type (32 bits minimum)
|
|
||||||
template <typename T> struct MinMaxTypeTraits;
|
|
||||||
template <> struct MinMaxTypeTraits<unsigned char> { typedef int best_type; };
|
|
||||||
template <> struct MinMaxTypeTraits<signed char> { typedef int best_type; };
|
|
||||||
template <> struct MinMaxTypeTraits<unsigned short> { typedef int best_type; };
|
|
||||||
template <> struct MinMaxTypeTraits<short> { typedef int best_type; };
|
|
||||||
template <> struct MinMaxTypeTraits<int> { typedef int best_type; };
|
|
||||||
template <> struct MinMaxTypeTraits<float> { typedef float best_type; };
|
|
||||||
template <> struct MinMaxTypeTraits<double> { typedef double best_type; };
|
|
||||||
|
|
||||||
template <int BLOCK_SIZE, typename T, class Mask>
|
|
||||||
__global__ void kernel_pass_1(const PtrStepSz<T> src, const Mask mask, T* minval, T* maxval, unsigned int* minloc, unsigned int* maxloc, const int twidth, const int theight)
|
|
||||||
{
|
|
||||||
typedef typename MinMaxTypeTraits<T>::best_type work_type;
|
|
||||||
|
|
||||||
__shared__ work_type sminval[BLOCK_SIZE];
|
|
||||||
__shared__ work_type smaxval[BLOCK_SIZE];
|
|
||||||
__shared__ unsigned int sminloc[BLOCK_SIZE];
|
|
||||||
__shared__ unsigned int smaxloc[BLOCK_SIZE];
|
|
||||||
|
|
||||||
const int x0 = blockIdx.x * blockDim.x * twidth + threadIdx.x;
|
|
||||||
const int y0 = blockIdx.y * blockDim.y * theight + threadIdx.y;
|
|
||||||
|
|
||||||
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
|
|
||||||
const int bid = blockIdx.y * gridDim.x + blockIdx.x;
|
|
||||||
|
|
||||||
work_type mymin = numeric_limits<work_type>::max();
|
|
||||||
work_type mymax = -numeric_limits<work_type>::max();
|
|
||||||
unsigned int myminloc = 0;
|
|
||||||
unsigned int mymaxloc = 0;
|
|
||||||
|
|
||||||
for (int i = 0, y = y0; i < theight && y < src.rows; ++i, y += blockDim.y)
|
|
||||||
{
|
|
||||||
const T* ptr = src.ptr(y);
|
|
||||||
|
|
||||||
for (int j = 0, x = x0; j < twidth && x < src.cols; ++j, x += blockDim.x)
|
|
||||||
{
|
|
||||||
if (mask(y, x))
|
|
||||||
{
|
|
||||||
const work_type srcVal = ptr[x];
|
|
||||||
|
|
||||||
if (srcVal < mymin)
|
|
||||||
{
|
|
||||||
mymin = srcVal;
|
|
||||||
myminloc = y * src.cols + x;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (srcVal > mymax)
|
|
||||||
{
|
|
||||||
mymax = srcVal;
|
|
||||||
mymaxloc = y * src.cols + x;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
reduceKeyVal<BLOCK_SIZE>(smem_tuple(sminval, smaxval), thrust::tie(mymin, mymax),
|
|
||||||
smem_tuple(sminloc, smaxloc), thrust::tie(myminloc, mymaxloc),
|
|
||||||
tid,
|
|
||||||
thrust::make_tuple(less<work_type>(), greater<work_type>()));
|
|
||||||
|
|
||||||
if (tid == 0)
|
|
||||||
{
|
|
||||||
minval[bid] = (T) mymin;
|
|
||||||
maxval[bid] = (T) mymax;
|
|
||||||
minloc[bid] = myminloc;
|
|
||||||
maxloc[bid] = mymaxloc;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
template <int BLOCK_SIZE, typename T>
|
|
||||||
__global__ void kernel_pass_2(T* minval, T* maxval, unsigned int* minloc, unsigned int* maxloc, int count)
|
|
||||||
{
|
|
||||||
typedef typename MinMaxTypeTraits<T>::best_type work_type;
|
|
||||||
|
|
||||||
__shared__ work_type sminval[BLOCK_SIZE];
|
|
||||||
__shared__ work_type smaxval[BLOCK_SIZE];
|
|
||||||
__shared__ unsigned int sminloc[BLOCK_SIZE];
|
|
||||||
__shared__ unsigned int smaxloc[BLOCK_SIZE];
|
|
||||||
|
|
||||||
unsigned int idx = ::min(threadIdx.x, count - 1);
|
|
||||||
|
|
||||||
work_type mymin = minval[idx];
|
|
||||||
work_type mymax = maxval[idx];
|
|
||||||
unsigned int myminloc = minloc[idx];
|
|
||||||
unsigned int mymaxloc = maxloc[idx];
|
|
||||||
|
|
||||||
reduceKeyVal<BLOCK_SIZE>(smem_tuple(sminval, smaxval), thrust::tie(mymin, mymax),
|
|
||||||
smem_tuple(sminloc, smaxloc), thrust::tie(myminloc, mymaxloc),
|
|
||||||
threadIdx.x,
|
|
||||||
thrust::make_tuple(less<work_type>(), greater<work_type>()));
|
|
||||||
|
|
||||||
if (threadIdx.x == 0)
|
|
||||||
{
|
|
||||||
minval[0] = (T) mymin;
|
|
||||||
maxval[0] = (T) mymax;
|
|
||||||
minloc[0] = myminloc;
|
|
||||||
maxloc[0] = mymaxloc;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
const int threads_x = 32;
|
|
||||||
const int threads_y = 8;
|
|
||||||
|
|
||||||
void getLaunchCfg(int cols, int rows, dim3& block, dim3& grid)
|
|
||||||
{
|
|
||||||
block = dim3(threads_x, threads_y);
|
|
||||||
|
|
||||||
grid = dim3(divUp(cols, block.x * block.y),
|
|
||||||
divUp(rows, block.y * block.x));
|
|
||||||
|
|
||||||
grid.x = ::min(grid.x, block.x);
|
|
||||||
grid.y = ::min(grid.y, block.y);
|
|
||||||
}
|
|
||||||
|
|
||||||
void getBufSize(int cols, int rows, size_t elem_size, int& b1cols, int& b1rows, int& b2cols, int& b2rows)
|
|
||||||
{
|
|
||||||
dim3 block, grid;
|
|
||||||
getLaunchCfg(cols, rows, block, grid);
|
|
||||||
|
|
||||||
// For values
|
|
||||||
b1cols = (int)(grid.x * grid.y * elem_size);
|
|
||||||
b1rows = 2;
|
|
||||||
|
|
||||||
// For locations
|
|
||||||
b2cols = grid.x * grid.y * sizeof(int);
|
|
||||||
b2rows = 2;
|
|
||||||
}
|
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
void run(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf)
|
void minMaxLocImpl(const GpuMat& _src, const GpuMat& mask, GpuMat& _valBuf, GpuMat& _locBuf, double* minVal, double* maxVal, cv::Point* minLoc, cv::Point* maxLoc)
|
||||||
{
|
{
|
||||||
dim3 block, grid;
|
typedef typename SelectIf<
|
||||||
getLaunchCfg(src.cols, src.rows, block, grid);
|
TypesEquals<T, double>::value,
|
||||||
|
double,
|
||||||
|
typename SelectIf<TypesEquals<T, float>::value, float, int>::type
|
||||||
|
>::type work_type;
|
||||||
|
|
||||||
const int twidth = divUp(divUp(src.cols, grid.x), block.x);
|
const GpuMat_<T>& src = (const GpuMat_<T>&) _src;
|
||||||
const int theight = divUp(divUp(src.rows, grid.y), block.y);
|
GpuMat_<work_type>& valBuf = (GpuMat_<work_type>&) _valBuf;
|
||||||
|
GpuMat_<int>& locBuf = (GpuMat_<int>&) _locBuf;
|
||||||
|
|
||||||
T* minval_buf = (T*) valbuf.ptr(0);
|
if (mask.empty())
|
||||||
T* maxval_buf = (T*) valbuf.ptr(1);
|
gridMinMaxLoc(src, valBuf, locBuf);
|
||||||
unsigned int* minloc_buf = locbuf.ptr(0);
|
|
||||||
unsigned int* maxloc_buf = locbuf.ptr(1);
|
|
||||||
|
|
||||||
if (mask.data)
|
|
||||||
kernel_pass_1<threads_x * threads_y><<<grid, block>>>((PtrStepSz<T>) src, SingleMask(mask), minval_buf, maxval_buf, minloc_buf, maxloc_buf, twidth, theight);
|
|
||||||
else
|
else
|
||||||
kernel_pass_1<threads_x * threads_y><<<grid, block>>>((PtrStepSz<T>) src, WithOutMask(), minval_buf, maxval_buf, minloc_buf, maxloc_buf, twidth, theight);
|
gridMinMaxLoc(src, valBuf, locBuf, globPtr<uchar>(mask));
|
||||||
|
|
||||||
cudaSafeCall( cudaGetLastError() );
|
cv::Mat_<work_type> h_valBuf;
|
||||||
|
cv::Mat_<int> h_locBuf;
|
||||||
|
|
||||||
kernel_pass_2<threads_x * threads_y><<<1, threads_x * threads_y>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y);
|
valBuf.download(h_valBuf);
|
||||||
cudaSafeCall( cudaGetLastError() );
|
locBuf.download(h_locBuf);
|
||||||
|
|
||||||
cudaSafeCall( cudaDeviceSynchronize() );
|
if (minVal)
|
||||||
|
*minVal = h_valBuf(0, 0);
|
||||||
|
|
||||||
T minval_, maxval_;
|
if (maxVal)
|
||||||
cudaSafeCall( cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost) );
|
*maxVal = h_valBuf(1, 0);
|
||||||
cudaSafeCall( cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost) );
|
|
||||||
*minval = minval_;
|
|
||||||
*maxval = maxval_;
|
|
||||||
|
|
||||||
unsigned int minloc_, maxloc_;
|
if (minLoc)
|
||||||
cudaSafeCall( cudaMemcpy(&minloc_, minloc_buf, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
|
{
|
||||||
cudaSafeCall( cudaMemcpy(&maxloc_, maxloc_buf, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
|
const int idx = h_locBuf(0, 0);
|
||||||
minloc[1] = minloc_ / src.cols; minloc[0] = minloc_ - minloc[1] * src.cols;
|
*minLoc = cv::Point(idx % src.cols, idx / src.cols);
|
||||||
maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols;
|
}
|
||||||
|
|
||||||
|
if (maxLoc)
|
||||||
|
{
|
||||||
|
const int idx = h_locBuf(1, 0);
|
||||||
|
*maxLoc = cv::Point(idx % src.cols, idx / src.cols);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
template void run<unsigned char >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
|
|
||||||
template void run<signed char >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
|
|
||||||
template void run<unsigned short>(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
|
|
||||||
template void run<short >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
|
|
||||||
template void run<int >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
|
|
||||||
template void run<float >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
|
|
||||||
template void run<double>(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif // CUDA_DISABLER
|
void cv::cuda::minMaxLoc(InputArray _src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc, InputArray _mask, GpuMat& valBuf, GpuMat& locBuf)
|
||||||
|
{
|
||||||
|
typedef void (*func_t)(const GpuMat& _src, const GpuMat& mask, GpuMat& _valBuf, GpuMat& _locBuf, double* minVal, double* maxVal, cv::Point* minLoc, cv::Point* maxLoc);
|
||||||
|
static const func_t funcs[] =
|
||||||
|
{
|
||||||
|
minMaxLocImpl<uchar>,
|
||||||
|
minMaxLocImpl<schar>,
|
||||||
|
minMaxLocImpl<ushort>,
|
||||||
|
minMaxLocImpl<short>,
|
||||||
|
minMaxLocImpl<int>,
|
||||||
|
minMaxLocImpl<float>,
|
||||||
|
minMaxLocImpl<double>
|
||||||
|
};
|
||||||
|
|
||||||
|
GpuMat src = _src.getGpuMat();
|
||||||
|
GpuMat mask = _mask.getGpuMat();
|
||||||
|
|
||||||
|
CV_Assert( src.channels() == 1 );
|
||||||
|
CV_DbgAssert( mask.empty() || (mask.size() == src.size() && mask.type() == CV_8U) );
|
||||||
|
|
||||||
|
const func_t func = funcs[src.depth()];
|
||||||
|
|
||||||
|
func(src, mask, valBuf, locBuf, minVal, maxVal, minLoc, maxLoc);
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
@ -186,56 +186,6 @@ double cv::cuda::norm(InputArray _src1, InputArray _src2, GpuMat& buf, int normT
|
|||||||
return retVal;
|
return retVal;
|
||||||
}
|
}
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////
|
|
||||||
// minMaxLoc
|
|
||||||
|
|
||||||
namespace minMaxLoc
|
|
||||||
{
|
|
||||||
void getBufSize(int cols, int rows, size_t elem_size, int& b1cols, int& b1rows, int& b2cols, int& b2rows);
|
|
||||||
|
|
||||||
template <typename T>
|
|
||||||
void run(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
|
|
||||||
}
|
|
||||||
|
|
||||||
void cv::cuda::minMaxLoc(InputArray _src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc,
|
|
||||||
InputArray _mask, GpuMat& valBuf, GpuMat& locBuf)
|
|
||||||
{
|
|
||||||
GpuMat src = _src.getGpuMat();
|
|
||||||
GpuMat mask = _mask.getGpuMat();
|
|
||||||
|
|
||||||
typedef void (*func_t)(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
|
|
||||||
static const func_t funcs[] =
|
|
||||||
{
|
|
||||||
::minMaxLoc::run<uchar>,
|
|
||||||
::minMaxLoc::run<schar>,
|
|
||||||
::minMaxLoc::run<ushort>,
|
|
||||||
::minMaxLoc::run<short>,
|
|
||||||
::minMaxLoc::run<int>,
|
|
||||||
::minMaxLoc::run<float>,
|
|
||||||
::minMaxLoc::run<double>
|
|
||||||
};
|
|
||||||
|
|
||||||
CV_Assert( src.channels() == 1 );
|
|
||||||
CV_Assert( mask.empty() || (mask.size() == src.size() && mask.type() == CV_8U) );
|
|
||||||
|
|
||||||
if (src.depth() == CV_64F)
|
|
||||||
{
|
|
||||||
if (!deviceSupports(NATIVE_DOUBLE))
|
|
||||||
CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
|
|
||||||
}
|
|
||||||
|
|
||||||
Size valbuf_size, locbuf_size;
|
|
||||||
::minMaxLoc::getBufSize(src.cols, src.rows, src.elemSize(), valbuf_size.width, valbuf_size.height, locbuf_size.width, locbuf_size.height);
|
|
||||||
ensureSizeIsEnough(valbuf_size, CV_8U, valBuf);
|
|
||||||
ensureSizeIsEnough(locbuf_size, CV_8U, locBuf);
|
|
||||||
|
|
||||||
const func_t func = funcs[src.depth()];
|
|
||||||
|
|
||||||
double temp1, temp2;
|
|
||||||
Point temp3, temp4;
|
|
||||||
func(src, mask, minVal ? minVal : &temp1, maxVal ? maxVal : &temp2, minLoc ? &minLoc->x : &temp3.x, maxLoc ? &maxLoc->x : &temp4.x, valBuf, locBuf);
|
|
||||||
}
|
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
// countNonZero
|
// countNonZero
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user