moved GpuMat and DevMem2D to core module, some code refactoring

Vladislav Vinogradov 2011-11-09 13:13:52 +00:00
parent 8a148e39f0
commit fcfa72081e
95 changed files with 18889 additions and 18485 deletions

View File

@ -90,6 +90,10 @@ class Mat;
class SparseMat;
typedef Mat MatND;
namespace gpu {
class GpuMat;
}
class CV_EXPORTS MatExpr;
class CV_EXPORTS MatOp_Base;
class CV_EXPORTS MatArg;
@ -1627,6 +1631,10 @@ public:
template<typename _Tp> explicit Mat(const Point3_<_Tp>& pt, bool copyData=true);
//! builds matrix from comma initializer
template<typename _Tp> explicit Mat(const MatCommaInitializer_<_Tp>& commaInitializer);
//! download data from GpuMat
explicit Mat(const gpu::GpuMat& m);
//! destructor - calls release()
~Mat();
//! assignment operators
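Net effect of the two hunks above: core's Mat can now be constructed directly from a gpu::GpuMat, which performs a blocking download. A minimal usage sketch (assumes OpenCV was built with the gpu module and a CUDA-capable device is present; sizes are illustrative, not part of the commit):

#include "opencv2/core/core.hpp"
#include "opencv2/gpu/gpu.hpp"

int main()
{
    cv::Mat host = cv::Mat::ones(480, 640, CV_8UC1); // host image
    cv::gpu::GpuMat device(host);                    // blocking upload to the device
    cv::Mat roundTrip(device);                       // new constructor: blocking download back to the host
    return 0;
}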

View File

@ -0,0 +1,157 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_CORE_DevMem2D_HPP__
#define __OPENCV_CORE_DevMem2D_HPP__
#ifdef __CUDACC__
#define __CV_GPU_HOST_DEVICE__ __host__ __device__ __forceinline__
#else
#define __CV_GPU_HOST_DEVICE__
#endif
namespace cv
{
namespace gpu
{
// Simple lightweight structures that encapsulate information about an image on the device.
// They are intended to be passed to nvcc-compiled code, since GpuMat itself depends on headers that nvcc cannot compile.
template <bool expr> struct StaticAssert;
template <> struct StaticAssert<true> {static __CV_GPU_HOST_DEVICE__ void check(){}};
template<typename T> struct DevPtr
{
typedef T elem_type;
typedef int index_type;
enum { elem_size = sizeof(elem_type) };
T* data;
__CV_GPU_HOST_DEVICE__ DevPtr() : data(0) {}
__CV_GPU_HOST_DEVICE__ DevPtr(T* data_) : data(data_) {}
__CV_GPU_HOST_DEVICE__ size_t elemSize() const { return elem_size; }
__CV_GPU_HOST_DEVICE__ operator T*() { return data; }
__CV_GPU_HOST_DEVICE__ operator const T*() const { return data; }
};
template<typename T> struct PtrSz : public DevPtr<T>
{
__CV_GPU_HOST_DEVICE__ PtrSz() : size(0) {}
__CV_GPU_HOST_DEVICE__ PtrSz(T* data_, size_t size_) : DevPtr<T>(data_), size(size_) {}
size_t size;
};
template<typename T> struct PtrStep : public DevPtr<T>
{
__CV_GPU_HOST_DEVICE__ PtrStep() : step(0) {}
__CV_GPU_HOST_DEVICE__ PtrStep(T* data_, size_t step_) : DevPtr<T>(data_), step(step_) {}
/** \brief stride between two consecutive rows in bytes. Step is stored always and everywhere in bytes!!! */
size_t step;
__CV_GPU_HOST_DEVICE__ T* ptr(int y = 0) { return ( T*)( ( char*)DevPtr<T>::data + y * step); }
__CV_GPU_HOST_DEVICE__ const T* ptr(int y = 0) const { return (const T*)( (const char*)DevPtr<T>::data + y * step); }
__CV_GPU_HOST_DEVICE__ T& operator ()(int y, int x) { return ptr(y)[x]; }
__CV_GPU_HOST_DEVICE__ const T& operator ()(int y, int x) const { return ptr(y)[x]; }
};
template <typename T> struct PtrStepSz : public PtrStep<T>
{
__CV_GPU_HOST_DEVICE__ PtrStepSz() : cols(0), rows(0) {}
__CV_GPU_HOST_DEVICE__ PtrStepSz(int rows_, int cols_, T* data_, size_t step_)
: PtrStep<T>(data_, step_), cols(cols_), rows(rows_) {}
int cols;
int rows;
};
template <typename T> struct DevMem2D_ : public PtrStepSz<T>
{
DevMem2D_() {}
DevMem2D_(int rows_, int cols_, T* data_, size_t step_) : PtrStepSz<T>(rows_, cols_, data_, step_) {}
template <typename U>
explicit DevMem2D_(const DevMem2D_<U>& d) : PtrStepSz<T>(d.rows, d.cols, (T*)d.data, d.step) {}
};
template<typename T> struct PtrElemStep_ : public PtrStep<T>
{
PtrElemStep_(const DevMem2D_<T>& mem) : PtrStep<T>(mem.data, mem.step)
{
StaticAssert<256 % sizeof(T) == 0>::check();
PtrStep<T>::step /= PtrStep<T>::elem_size;
}
__CV_GPU_HOST_DEVICE__ T* ptr(int y = 0) { return PtrStep<T>::data + y * PtrStep<T>::step; }
__CV_GPU_HOST_DEVICE__ const T* ptr(int y = 0) const { return PtrStep<T>::data + y * PtrStep<T>::step; }
__CV_GPU_HOST_DEVICE__ T& operator ()(int y, int x) { return ptr(y)[x]; }
__CV_GPU_HOST_DEVICE__ const T& operator ()(int y, int x) const { return ptr(y)[x]; }
};
template<typename T> struct PtrStep_ : public PtrStep<T>
{
PtrStep_() {}
PtrStep_(const DevMem2D_<T>& mem) : PtrStep<T>(mem.data, mem.step) {}
};
typedef DevMem2D_<unsigned char> DevMem2Db;
typedef DevMem2Db DevMem2D;
typedef DevMem2D_<float> DevMem2Df;
typedef DevMem2D_<int> DevMem2Di;
typedef PtrStep<unsigned char> PtrStepb;
typedef PtrStep<float> PtrStepf;
typedef PtrStep<int> PtrStepi;
typedef PtrElemStep_<unsigned char> PtrElemStep;
typedef PtrElemStep_<float> PtrElemStepf;
typedef PtrElemStep_<int> PtrElemStepi;
}
}
#endif /* __OPENCV_CORE_DevMem2D_HPP__ */
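For context, a hedged sketch (not part of this commit) of how these structures are consumed: DevMem2Db carries only the data pointer, the byte step and the size, so unlike GpuMat it can be passed by value into nvcc-compiled device code.

__global__ void binarize(cv::gpu::DevMem2Db img, unsigned char thresh)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x < img.cols && y < img.rows)
        img(y, x) = img(y, x) > thresh ? 255 : 0; // operator() applies the byte step inherited from PtrStep
}

// Host-side launch (sketch): a GpuMat converts implicitly through its operator DevMem2D_<T>().
// binarize<<<grid, block>>>(gpuImage, 128);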

View File

@ -0,0 +1,471 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_GPUMAT_HPP__
#define __OPENCV_GPUMAT_HPP__
#include "opencv2/core/core.hpp"
#include "opencv2/core/devmem2d.hpp"
namespace cv { namespace gpu
{
//! Smart pointer for GPU memory with reference counting. Its interface is mostly similar to cv::Mat.
class CV_EXPORTS GpuMat
{
public:
//! default constructor
GpuMat();
//! constructs GpuMatrix of the specified size and type (_type is CV_8UC1, CV_64FC3, CV_32SC(12) etc.)
GpuMat(int rows, int cols, int type);
GpuMat(Size size, int type);
//! constructs GpuMatrix and fills it with the specified value _s.
GpuMat(int rows, int cols, int type, Scalar s);
GpuMat(Size size, int type, Scalar s);
//! copy constructor
GpuMat(const GpuMat& m);
//! constructor for GpuMatrix headers pointing to user-allocated data
GpuMat(int rows, int cols, int type, void* data, size_t step = Mat::AUTO_STEP);
GpuMat(Size size, int type, void* data, size_t step = Mat::AUTO_STEP);
//! creates a matrix header for a part of the bigger matrix
GpuMat(const GpuMat& m, Range rowRange, Range colRange);
GpuMat(const GpuMat& m, Rect roi);
//! builds GpuMat from Mat. Performs a blocking upload to the device.
explicit GpuMat(const Mat& m);
//! destructor - calls release()
~GpuMat();
//! assignment operators
GpuMat& operator = (const GpuMat& m);
//! performs a blocking upload of data to GpuMat.
void upload(const Mat& m);
//! downloads data from device to host memory. Blocking call.
void download(Mat& m) const;
//! returns a new GpuMatrix header for the specified row
GpuMat row(int y) const;
//! returns a new GpuMatrix header for the specified column
GpuMat col(int x) const;
//! ... for the specified row span
GpuMat rowRange(int startrow, int endrow) const;
GpuMat rowRange(Range r) const;
//! ... for the specified column span
GpuMat colRange(int startcol, int endcol) const;
GpuMat colRange(Range r) const;
//! returns deep copy of the GpuMatrix, i.e. the data is copied
GpuMat clone() const;
//! copies the GpuMatrix content to "m".
// It calls m.create(this->size(), this->type()).
void copyTo(GpuMat& m) const;
//! copies those GpuMatrix elements to "m" that are marked with non-zero mask elements.
void copyTo(GpuMat& m, const GpuMat& mask) const;
//! converts GpuMatrix to another datatype with optional scaling. See cvConvertScale.
void convertTo(GpuMat& m, int rtype, double alpha = 1, double beta = 0) const;
void assignTo(GpuMat& m, int type=-1) const;
//! sets every GpuMatrix element to s
GpuMat& operator = (Scalar s);
//! sets some of the GpuMatrix elements to s, according to the mask
GpuMat& setTo(Scalar s, const GpuMat& mask = GpuMat());
//! creates alternative GpuMatrix header for the same data, with different
// number of channels and/or different number of rows. see cvReshape.
GpuMat reshape(int cn, int rows = 0) const;
//! allocates new GpuMatrix data unless the GpuMatrix already has specified size and type.
// previous data is unreferenced if needed.
void create(int rows, int cols, int type);
void create(Size size, int type);
//! decreases reference counter;
// deallocate the data when reference counter reaches 0.
void release();
//! swaps with other smart pointer
void swap(GpuMat& mat);
//! locates GpuMatrix header within a parent GpuMatrix. See below
void locateROI(Size& wholeSize, Point& ofs) const;
//! moves/resizes the current GpuMatrix ROI inside the parent GpuMatrix.
GpuMat& adjustROI(int dtop, int dbottom, int dleft, int dright);
//! extracts a rectangular sub-GpuMatrix
// (this is a generalized form of row, rowRange etc.)
GpuMat operator()(Range rowRange, Range colRange) const;
GpuMat operator()(Rect roi) const;
//! returns true iff the GpuMatrix data is continuous
// (i.e. when there are no gaps between successive rows).
// similar to CV_IS_GpuMat_CONT(cvGpuMat->type)
bool isContinuous() const;
//! returns element size in bytes,
// similar to CV_ELEM_SIZE(cvMat->type)
size_t elemSize() const;
//! returns the size of element channel in bytes.
size_t elemSize1() const;
//! returns element type, similar to CV_MAT_TYPE(cvMat->type)
int type() const;
//! returns element type, similar to CV_MAT_DEPTH(cvMat->type)
int depth() const;
//! returns element type, similar to CV_MAT_CN(cvMat->type)
int channels() const;
//! returns step/elemSize1()
size_t step1() const;
//! returns GpuMatrix size:
// width == number of columns, height == number of rows
Size size() const;
//! returns true if GpuMatrix data is NULL
bool empty() const;
//! returns pointer to y-th row
uchar* ptr(int y = 0);
const uchar* ptr(int y = 0) const;
//! template version of the above method
template<typename _Tp> _Tp* ptr(int y = 0);
template<typename _Tp> const _Tp* ptr(int y = 0) const;
template <typename _Tp> operator DevMem2D_<_Tp>() const;
template <typename _Tp> operator PtrStep_<_Tp>() const;
/*! includes several bit-fields:
- the magic signature
- continuity flag
- depth
- number of channels
*/
int flags;
//! the number of rows and columns
int rows, cols;
//! a distance between successive rows in bytes; includes the gap if any
size_t step;
//! pointer to the data
uchar* data;
//! pointer to the reference counter;
// when GpuMatrix points to user-allocated data, the pointer is NULL
int* refcount;
//! helper fields used in locateROI and adjustROI
uchar* datastart;
uchar* dataend;
};
//! Creates continuous GPU matrix
CV_EXPORTS void createContinuous(int rows, int cols, int type, GpuMat& m);
CV_EXPORTS GpuMat createContinuous(int rows, int cols, int type);
CV_EXPORTS void createContinuous(Size size, int type, GpuMat& m);
CV_EXPORTS GpuMat createContinuous(Size size, int type);
//! Ensures that the size of the given matrix is not less than (rows, cols)
//! and that its type matches the specified one
CV_EXPORTS void ensureSizeIsEnough(int rows, int cols, int type, GpuMat& m);
CV_EXPORTS void ensureSizeIsEnough(Size size, int type, GpuMat& m);
class CV_EXPORTS GpuFuncTable
{
public:
virtual ~GpuFuncTable() {}
virtual void copy(const Mat& src, GpuMat& dst) const = 0;
virtual void copy(const GpuMat& src, Mat& dst) const = 0;
virtual void copy(const GpuMat& src, GpuMat& dst) const = 0;
virtual void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask) const = 0;
virtual void convert(const GpuMat& src, GpuMat& dst) const = 0;
virtual void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta) const = 0;
virtual void setTo(GpuMat& m, Scalar s, const GpuMat& mask) const = 0;
virtual void mallocPitch(void** devPtr, size_t* step, size_t width, size_t height) const = 0;
virtual void free(void* devPtr) const = 0;
};
CV_EXPORTS void setGpuFuncTable(const GpuFuncTable* funcTbl);
////////////////////////////////////////////////////////////////////////
inline GpuMat::GpuMat()
: flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0)
{
}
inline GpuMat::GpuMat(int rows_, int cols_, int type_)
: flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0)
{
if (rows_ > 0 && cols_ > 0)
create(rows_, cols_, type_);
}
inline GpuMat::GpuMat(Size size_, int type_)
: flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0)
{
if (size_.height > 0 && size_.width > 0)
create(size_.height, size_.width, type_);
}
inline GpuMat::GpuMat(int rows_, int cols_, int type_, Scalar s_)
: flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0)
{
if (rows_ > 0 && cols_ > 0)
{
create(rows_, cols_, type_);
setTo(s_);
}
}
inline GpuMat::GpuMat(Size size_, int type_, Scalar s_)
: flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0)
{
if (size_.height > 0 && size_.width > 0)
{
create(size_.height, size_.width, type_);
setTo(s_);
}
}
inline GpuMat::~GpuMat()
{
release();
}
inline GpuMat GpuMat::clone() const
{
GpuMat m;
copyTo(m);
return m;
}
inline void GpuMat::assignTo(GpuMat& m, int type) const
{
if (type < 0)
m = *this;
else
convertTo(m, type);
}
inline size_t GpuMat::step1() const
{
return step / elemSize1();
}
inline bool GpuMat::empty() const
{
return data == 0;
}
template<typename _Tp> inline _Tp* GpuMat::ptr(int y)
{
return (_Tp*)ptr(y);
}
template<typename _Tp> inline const _Tp* GpuMat::ptr(int y) const
{
return (const _Tp*)ptr(y);
}
inline void swap(GpuMat& a, GpuMat& b)
{
a.swap(b);
}
inline GpuMat GpuMat::row(int y) const
{
return GpuMat(*this, Range(y, y+1), Range::all());
}
inline GpuMat GpuMat::col(int x) const
{
return GpuMat(*this, Range::all(), Range(x, x+1));
}
inline GpuMat GpuMat::rowRange(int startrow, int endrow) const
{
return GpuMat(*this, Range(startrow, endrow), Range::all());
}
inline GpuMat GpuMat::rowRange(Range r) const
{
return GpuMat(*this, r, Range::all());
}
inline GpuMat GpuMat::colRange(int startcol, int endcol) const
{
return GpuMat(*this, Range::all(), Range(startcol, endcol));
}
inline GpuMat GpuMat::colRange(Range r) const
{
return GpuMat(*this, Range::all(), r);
}
inline void GpuMat::create(Size size_, int type_)
{
create(size_.height, size_.width, type_);
}
inline GpuMat GpuMat::operator()(Range rowRange, Range colRange) const
{
return GpuMat(*this, rowRange, colRange);
}
inline GpuMat GpuMat::operator()(Rect roi) const
{
return GpuMat(*this, roi);
}
inline bool GpuMat::isContinuous() const
{
return (flags & Mat::CONTINUOUS_FLAG) != 0;
}
inline size_t GpuMat::elemSize() const
{
return CV_ELEM_SIZE(flags);
}
inline size_t GpuMat::elemSize1() const
{
return CV_ELEM_SIZE1(flags);
}
inline int GpuMat::type() const
{
return CV_MAT_TYPE(flags);
}
inline int GpuMat::depth() const
{
return CV_MAT_DEPTH(flags);
}
inline int GpuMat::channels() const
{
return CV_MAT_CN(flags);
}
inline Size GpuMat::size() const
{
return Size(cols, rows);
}
inline uchar* GpuMat::ptr(int y)
{
CV_DbgAssert((unsigned)y < (unsigned)rows);
return data + step * y;
}
inline const uchar* GpuMat::ptr(int y) const
{
CV_DbgAssert((unsigned)y < (unsigned)rows);
return data + step * y;
}
inline GpuMat& GpuMat::operator = (Scalar s)
{
setTo(s);
return *this;
}
template <class T> inline GpuMat::operator DevMem2D_<T>() const
{
return DevMem2D_<T>(rows, cols, (T*)data, step);
}
template <class T> inline GpuMat::operator PtrStep_<T>() const
{
return PtrStep_<T>(static_cast< DevMem2D_<T> >(*this));
}
inline GpuMat createContinuous(int rows, int cols, int type)
{
GpuMat m;
createContinuous(rows, cols, type, m);
return m;
}
inline void createContinuous(Size size, int type, GpuMat& m)
{
createContinuous(size.height, size.width, type, m);
}
inline GpuMat createContinuous(Size size, int type)
{
GpuMat m;
createContinuous(size, type, m);
return m;
}
inline void ensureSizeIsEnough(Size size, int type, GpuMat& m)
{
ensureSizeIsEnough(size.height, size.width, type, m);
}
inline void createContinuous(int rows, int cols, int type, GpuMat& m)
{
int area = rows * cols;
if (!m.isContinuous() || m.type() != type || m.size().area() != area)
m.create(1, area, type);
m = m.reshape(0, rows);
}
inline void ensureSizeIsEnough(int rows, int cols, int type, GpuMat& m)
{
if (m.type() == type && m.rows >= rows && m.cols >= cols)
m = m(Rect(0, 0, cols, rows));
else
m.create(rows, cols, type);
}
}}
#endif // __OPENCV_GPUMAT_HPP__
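A small usage sketch of the buffer helpers declared above (assumes the gpu module has registered its function table so allocations actually reach the device): ensureSizeIsEnough lets a loop reuse one device buffer instead of reallocating it on every iteration.

cv::gpu::GpuMat buf;
for (int i = 0; i < 10; ++i)
{
    cv::gpu::ensureSizeIsEnough(480, 640, CV_32FC1, buf); // allocates on the first pass, reuses the buffer afterwards
    buf.setTo(cv::Scalar::all(0));                        // dispatched through the registered GpuFuncTable
}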

modules/core/src/gpumat.cpp
View File

@ -0,0 +1,460 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
#include "opencv2/core/gpumat.hpp"
using namespace std;
using namespace cv;
using namespace cv::gpu;
cv::gpu::GpuMat::GpuMat(const GpuMat& m)
: flags(m.flags), rows(m.rows), cols(m.cols), step(m.step), data(m.data), refcount(m.refcount), datastart(m.datastart), dataend(m.dataend)
{
if (refcount)
CV_XADD(refcount, 1);
}
cv::gpu::GpuMat::GpuMat(int rows_, int cols_, int type_, void* data_, size_t step_) :
flags(Mat::MAGIC_VAL + (type_ & TYPE_MASK)), rows(rows_), cols(cols_),
step(step_), data((uchar*)data_), refcount(0),
datastart((uchar*)data_), dataend((uchar*)data_)
{
size_t minstep = cols * elemSize();
if (step == Mat::AUTO_STEP)
{
step = minstep;
flags |= Mat::CONTINUOUS_FLAG;
}
else
{
if (rows == 1)
step = minstep;
CV_DbgAssert(step >= minstep);
flags |= step == minstep ? Mat::CONTINUOUS_FLAG : 0;
}
dataend += step * (rows - 1) + minstep;
}
cv::gpu::GpuMat::GpuMat(Size size_, int type_, void* data_, size_t step_) :
flags(Mat::MAGIC_VAL + (type_ & TYPE_MASK)), rows(size_.height), cols(size_.width),
step(step_), data((uchar*)data_), refcount(0),
datastart((uchar*)data_), dataend((uchar*)data_)
{
size_t minstep = cols * elemSize();
if (step == Mat::AUTO_STEP)
{
step = minstep;
flags |= Mat::CONTINUOUS_FLAG;
}
else
{
if (rows == 1)
step = minstep;
CV_DbgAssert(step >= minstep);
flags |= step == minstep ? Mat::CONTINUOUS_FLAG : 0;
}
dataend += step * (rows - 1) + minstep;
}
cv::gpu::GpuMat::GpuMat(const GpuMat& m, Range rowRange, Range colRange)
{
flags = m.flags;
step = m.step; refcount = m.refcount;
data = m.data; datastart = m.datastart; dataend = m.dataend;
if (rowRange == Range::all())
rows = m.rows;
else
{
CV_Assert(0 <= rowRange.start && rowRange.start <= rowRange.end && rowRange.end <= m.rows);
rows = rowRange.size();
data += step*rowRange.start;
}
if (colRange == Range::all())
cols = m.cols;
else
{
CV_Assert(0 <= colRange.start && colRange.start <= colRange.end && colRange.end <= m.cols);
cols = colRange.size();
data += colRange.start*elemSize();
flags &= cols < m.cols ? ~Mat::CONTINUOUS_FLAG : -1;
}
if (rows == 1)
flags |= Mat::CONTINUOUS_FLAG;
if (refcount)
CV_XADD(refcount, 1);
if (rows <= 0 || cols <= 0)
rows = cols = 0;
}
cv::gpu::GpuMat::GpuMat(const GpuMat& m, Rect roi) :
flags(m.flags), rows(roi.height), cols(roi.width),
step(m.step), data(m.data + roi.y*step), refcount(m.refcount),
datastart(m.datastart), dataend(m.dataend)
{
flags &= roi.width < m.cols ? ~Mat::CONTINUOUS_FLAG : -1;
data += roi.x * elemSize();
CV_Assert(0 <= roi.x && 0 <= roi.width && roi.x + roi.width <= m.cols && 0 <= roi.y && 0 <= roi.height && roi.y + roi.height <= m.rows);
if (refcount)
CV_XADD(refcount, 1);
if (rows <= 0 || cols <= 0)
rows = cols = 0;
}
cv::gpu::GpuMat::GpuMat(const Mat& m) :
flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0)
{
upload(m);
}
GpuMat& cv::gpu::GpuMat::operator = (const GpuMat& m)
{
if (this != &m)
{
GpuMat temp(m);
swap(temp);
}
return *this;
}
void cv::gpu::GpuMat::swap(GpuMat& b)
{
std::swap(flags, b.flags);
std::swap(rows, b.rows);
std::swap(cols, b.cols);
std::swap(step, b.step);
std::swap(data, b.data);
std::swap(datastart, b.datastart);
std::swap(dataend, b.dataend);
std::swap(refcount, b.refcount);
}
void cv::gpu::GpuMat::locateROI(Size& wholeSize, Point& ofs) const
{
size_t esz = elemSize();
ptrdiff_t delta1 = data - datastart;
ptrdiff_t delta2 = dataend - datastart;
CV_DbgAssert(step > 0);
if (delta1 == 0)
ofs.x = ofs.y = 0;
else
{
ofs.y = static_cast<int>(delta1 / step);
ofs.x = static_cast<int>((delta1 - step * ofs.y) / esz);
CV_DbgAssert(data == datastart + ofs.y * step + ofs.x * esz);
}
size_t minstep = (ofs.x + cols) * esz;
wholeSize.height = std::max(static_cast<int>((delta2 - minstep) / step + 1), ofs.y + rows);
wholeSize.width = std::max(static_cast<int>((delta2 - step * (wholeSize.height - 1)) / esz), ofs.x + cols);
}
GpuMat& cv::gpu::GpuMat::adjustROI(int dtop, int dbottom, int dleft, int dright)
{
Size wholeSize;
Point ofs;
locateROI(wholeSize, ofs);
size_t esz = elemSize();
int row1 = std::max(ofs.y - dtop, 0);
int row2 = std::min(ofs.y + rows + dbottom, wholeSize.height);
int col1 = std::max(ofs.x - dleft, 0);
int col2 = std::min(ofs.x + cols + dright, wholeSize.width);
data += (row1 - ofs.y) * step + (col1 - ofs.x) * esz;
rows = row2 - row1;
cols = col2 - col1;
if (esz * cols == step || rows == 1)
flags |= Mat::CONTINUOUS_FLAG;
else
flags &= ~Mat::CONTINUOUS_FLAG;
return *this;
}
GpuMat cv::gpu::GpuMat::reshape(int new_cn, int new_rows) const
{
GpuMat hdr = *this;
int cn = channels();
if (new_cn == 0)
new_cn = cn;
int total_width = cols * cn;
if ((new_cn > total_width || total_width % new_cn != 0) && new_rows == 0)
new_rows = rows * total_width / new_cn;
if (new_rows != 0 && new_rows != rows)
{
int total_size = total_width * rows;
if (!isContinuous())
CV_Error(CV_BadStep, "The matrix is not continuous, thus its number of rows can not be changed");
if ((unsigned)new_rows > (unsigned)total_size)
CV_Error(CV_StsOutOfRange, "Bad new number of rows");
total_width = total_size / new_rows;
if (total_width * new_rows != total_size)
CV_Error(CV_StsBadArg, "The total number of matrix elements is not divisible by the new number of rows");
hdr.rows = new_rows;
hdr.step = total_width * elemSize1();
}
int new_width = total_width / new_cn;
if (new_width * new_cn != total_width)
CV_Error(CV_BadNumChannels, "The total width is not divisible by the new number of channels");
hdr.cols = new_width;
hdr.flags = (hdr.flags & ~CV_MAT_CN_MASK) | ((new_cn - 1) << CV_CN_SHIFT);
return hdr;
}
cv::Mat::Mat(const GpuMat& m) : flags(0), dims(0), rows(0), cols(0), data(0), refcount(0), datastart(0), dataend(0), datalimit(0), allocator(0), size(&rows)
{
m.download(*this);
}
namespace
{
void throw_nogpu()
{
CV_Error(CV_GpuNotSupported, "The library is compiled without GPU support");
}
class EmptyFuncTable : public GpuFuncTable
{
public:
void copy(const Mat&, GpuMat&) const { throw_nogpu(); }
void copy(const GpuMat&, Mat&) const { throw_nogpu(); }
void copy(const GpuMat&, GpuMat&) const { throw_nogpu(); }
void copyWithMask(const GpuMat&, GpuMat&, const GpuMat&) const { throw_nogpu(); }
void convert(const GpuMat&, GpuMat&) const { throw_nogpu(); }
void convert(const GpuMat&, GpuMat&, double, double) const { throw_nogpu(); }
void setTo(GpuMat&, Scalar, const GpuMat&) const { throw_nogpu(); }
void mallocPitch(void**, size_t*, size_t, size_t) const { throw_nogpu(); }
void free(void*) const {}
};
const GpuFuncTable* g_funcTbl = 0;
const GpuFuncTable* gpuFuncTable()
{
static EmptyFuncTable empty;
return g_funcTbl ? g_funcTbl : &empty;
}
}
void cv::gpu::setGpuFuncTable(const GpuFuncTable* funcTbl)
{
g_funcTbl = funcTbl;
}
void cv::gpu::GpuMat::upload(const Mat& m)
{
CV_DbgAssert(!m.empty());
create(m.size(), m.type());
gpuFuncTable()->copy(m, *this);
}
void cv::gpu::GpuMat::download(Mat& m) const
{
CV_DbgAssert(!empty());
m.create(size(), type());
gpuFuncTable()->copy(*this, m);
}
void cv::gpu::GpuMat::copyTo(GpuMat& m) const
{
CV_DbgAssert(!empty());
m.create(size(), type());
gpuFuncTable()->copy(*this, m);
}
void cv::gpu::GpuMat::copyTo(GpuMat& mat, const GpuMat& mask) const
{
if (mask.empty())
copyTo(mat);
else
{
mat.create(size(), type());
gpuFuncTable()->copyWithMask(*this, mat, mask);
}
}
void cv::gpu::GpuMat::convertTo(GpuMat& dst, int rtype, double alpha, double beta) const
{
bool noScale = fabs(alpha - 1) < numeric_limits<double>::epsilon() && fabs(beta) < numeric_limits<double>::epsilon();
if (rtype < 0)
rtype = type();
else
rtype = CV_MAKETYPE(CV_MAT_DEPTH(rtype), channels());
int sdepth = depth();
int ddepth = CV_MAT_DEPTH(rtype);
if (sdepth == ddepth && noScale)
{
copyTo(dst);
return;
}
GpuMat temp;
const GpuMat* psrc = this;
if (sdepth != ddepth && psrc == &dst)
{
temp = *this;
psrc = &temp;
}
dst.create(size(), rtype);
if (noScale)
gpuFuncTable()->convert(*psrc, dst);
else
gpuFuncTable()->convert(*psrc, dst, alpha, beta);
}
GpuMat& cv::gpu::GpuMat::setTo(Scalar s, const GpuMat& mask)
{
CV_Assert(mask.empty() || mask.type() == CV_8UC1);
CV_DbgAssert(!empty());
gpuFuncTable()->setTo(*this, s, mask);
return *this;
}
void cv::gpu::GpuMat::create(int _rows, int _cols, int _type)
{
_type &= TYPE_MASK;
if (rows == _rows && cols == _cols && type() == _type && data)
return;
if (data)
release();
CV_DbgAssert(_rows >= 0 && _cols >= 0);
if (_rows > 0 && _cols > 0)
{
flags = Mat::MAGIC_VAL + _type;
rows = _rows;
cols = _cols;
size_t esz = elemSize();
void* devPtr;
gpuFuncTable()->mallocPitch(&devPtr, &step, esz * cols, rows);
// Single row must be continuous
if (rows == 1)
step = esz * cols;
if (esz * cols == step)
flags |= Mat::CONTINUOUS_FLAG;
int64 _nettosize = static_cast<int64>(step) * rows;
size_t nettosize = static_cast<size_t>(_nettosize);
datastart = data = static_cast<uchar*>(devPtr);
dataend = data + nettosize;
refcount = static_cast<int*>(fastMalloc(sizeof(*refcount)));
*refcount = 1;
}
}
void cv::gpu::GpuMat::release()
{
if (refcount && CV_XADD(refcount, -1) == 1)
{
fastFree(refcount);
gpuFuncTable()->free(datastart);
}
data = datastart = dataend = 0;
step = rows = cols = 0;
refcount = 0;
}
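The file above only installs the EmptyFuncTable fallback; the actual device operations are provided by whichever module calls setGpuFuncTable(). Below is a hedged sketch of what a CUDA-runtime-backed table could look like; CudaFuncTable and initCudaBackend are hypothetical names, and the operations that need device kernels (masked copy, convert, setTo) are stubbed out rather than implemented.

#include <cuda_runtime.h>
#include "opencv2/core/gpumat.hpp"

class CudaFuncTable : public cv::gpu::GpuFuncTable
{
public:
    void mallocPitch(void** devPtr, size_t* step, size_t width, size_t height) const
    {
        cudaMallocPitch(devPtr, step, width, height);             // pitched device allocation
    }
    void free(void* devPtr) const { cudaFree(devPtr); }
    void copy(const cv::Mat& src, cv::gpu::GpuMat& dst) const     // host -> device
    {
        cudaMemcpy2D(dst.data, dst.step, src.data, src.step,
                     src.cols * src.elemSize(), src.rows, cudaMemcpyHostToDevice);
    }
    void copy(const cv::gpu::GpuMat& src, cv::Mat& dst) const     // device -> host
    {
        cudaMemcpy2D(dst.data, dst.step, src.data, src.step,
                     src.cols * src.elemSize(), src.rows, cudaMemcpyDeviceToHost);
    }
    void copy(const cv::gpu::GpuMat& src, cv::gpu::GpuMat& dst) const // device -> device
    {
        cudaMemcpy2D(dst.data, dst.step, src.data, src.step,
                     src.cols * src.elemSize(), src.rows, cudaMemcpyDeviceToDevice);
    }
    // The remaining operations require device kernels and are omitted from this sketch.
    void copyWithMask(const cv::gpu::GpuMat&, cv::gpu::GpuMat&, const cv::gpu::GpuMat&) const
    { CV_Error(CV_StsNotImplemented, "sketch only"); }
    void convert(const cv::gpu::GpuMat&, cv::gpu::GpuMat&) const
    { CV_Error(CV_StsNotImplemented, "sketch only"); }
    void convert(const cv::gpu::GpuMat&, cv::gpu::GpuMat&, double, double) const
    { CV_Error(CV_StsNotImplemented, "sketch only"); }
    void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&) const
    { CV_Error(CV_StsNotImplemented, "sketch only"); }
};

static const CudaFuncTable g_cudaFuncTable;

void initCudaBackend() // hypothetical registration hook, called once at module load
{
    cv::gpu::setGpuFuncTable(&g_cudaFuncTable);
}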

View File

@ -3,7 +3,8 @@ set(name "gpu")
set(the_target "opencv_${name}")
project(${the_target})
set(DEPS "opencv_core" "opencv_imgproc" "opencv_objdetect" "opencv_features2d" "opencv_flann" "opencv_calib3d") #"opencv_features2d" "opencv_flann" "opencv_objdetect" - only headers needed set(DEPS "opencv_core" "opencv_imgproc" "opencv_calib3d" "opencv_objdetect")
set(DEPS_HEADER ${DEPS} "opencv_features2d" "opencv_flann")
set(OPENCV_LINKER_LIBS ${OPENCV_LINKER_LIBS} opencv_gpu)
include_directories("${CMAKE_CURRENT_SOURCE_DIR}/include"
@ -27,6 +28,13 @@ file(GLOB lib_device_hdrs_detail "src/opencv2/gpu/device/detail/*.h*")
source_group("Device" FILES ${lib_device_hdrs}) source_group("Device" FILES ${lib_device_hdrs})
source_group("Device\\Detail" FILES ${lib_device_hdrs_detail}) source_group("Device\\Detail" FILES ${lib_device_hdrs_detail})
foreach(d ${DEPS_HEADER})
if(${d} MATCHES "opencv_")
string(REPLACE "opencv_" "${CMAKE_CURRENT_SOURCE_DIR}/../" d_dir ${d})
include_directories("${d_dir}/include")
endif()
endforeach()
if (HAVE_CUDA)
file(GLOB_RECURSE ncv_srcs "src/nvidia/*.cpp")
file(GLOB_RECURSE ncv_cuda "src/nvidia/*.cu")
@ -51,7 +59,6 @@ if (HAVE_CUDA)
set (CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler;-fno-finite-math-only;")
endif()
string(REPLACE "/W4" "/W3" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
string(REPLACE "/W4" "/W3" CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}")
string(REPLACE "/W4" "/W3" CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}")
@ -60,7 +67,7 @@ if (HAVE_CUDA)
#string(REPLACE "/W4" "/W3" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
#string(REPLACE "/W4" "/W3" CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}")
#string(REPLACE "/W4" "/W3" CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4211 /wd4201 /wd4100 /wd4505 /wd4408") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4211 /wd4201 /wd4100 /wd4505 /wd4408 /wd4251")
string(REPLACE "/EHsc-" "/EHs" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}") string(REPLACE "/EHsc-" "/EHs" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
string(REPLACE "/EHsc-" "/EHs" CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE}") string(REPLACE "/EHsc-" "/EHs" CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE}")
@ -74,17 +81,14 @@ if (HAVE_CUDA)
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler;-DCVAPI_EXPORTS")
endif()
if(MSVC)
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler;/wd4251")
endif()
CUDA_COMPILE(cuda_objs ${lib_cuda} ${ncv_cuda})
#CUDA_BUILD_CLEAN_TARGET()
endif()
foreach(d ${DEPS})
if(${d} MATCHES "opencv_")
string(REPLACE "opencv_" "${CMAKE_CURRENT_SOURCE_DIR}/../" d_dir ${d})
include_directories("${d_dir}/include")
endif()
endforeach()
add_library(${the_target} ${lib_srcs} ${lib_hdrs} ${lib_int_hdrs} ${lib_cuda} ${lib_cuda_hdrs} ${lib_device_hdrs} ${lib_device_hdrs_detail} ${ncv_srcs} ${ncv_hdrs} ${ncv_cuda} ${cuda_objs})
# For dynamic link numbering conventions

View File

@ -40,122 +40,4 @@
//
//M*/
#ifndef __OPENCV_GPU_DevMem2D_HPP__ #include "opencv2/core/devmem2d.hpp"
#define __OPENCV_GPU_DevMem2D_HPP__
namespace cv
{
namespace gpu
{
// Simple lightweight structures that encapsulates information about an image on device.
// It is intended to pass to nvcc-compiled code. GpuMat depends on headers that nvcc can't compile
#if defined(__CUDACC__)
#define __CV_GPU_HOST_DEVICE__ __host__ __device__ __forceinline__
#else
#define __CV_GPU_HOST_DEVICE__
#endif
template <bool expr> struct StaticAssert;
template <> struct StaticAssert<true> {static __CV_GPU_HOST_DEVICE__ void check(){}};
template<typename T> struct DevPtr
{
typedef T elem_type;
typedef int index_type;
enum { elem_size = sizeof(elem_type) };
T* data;
__CV_GPU_HOST_DEVICE__ DevPtr() : data(0) {}
__CV_GPU_HOST_DEVICE__ DevPtr(T* data_) : data(data_) {}
__CV_GPU_HOST_DEVICE__ size_t elemSize() const { return elem_size; }
__CV_GPU_HOST_DEVICE__ operator T*() { return data; }
__CV_GPU_HOST_DEVICE__ operator const T*() const { return data; }
};
template<typename T> struct PtrSz : public DevPtr<T>
{
__CV_GPU_HOST_DEVICE__ PtrSz() : size(0) {}
__CV_GPU_HOST_DEVICE__ PtrSz(T* data_, size_t size_) : DevPtr<T>(data_), size(size_) {}
size_t size;
};
template<typename T> struct PtrStep : public DevPtr<T>
{
__CV_GPU_HOST_DEVICE__ PtrStep() : step(0) {}
__CV_GPU_HOST_DEVICE__ PtrStep(T* data_, size_t step_) : DevPtr<T>(data_), step(step_) {}
/** \brief stride between two consecutive rows in bytes. Step is stored always and everywhere in bytes!!! */
size_t step;
__CV_GPU_HOST_DEVICE__ T* ptr(int y = 0) { return ( T*)( ( char*)DevPtr<T>::data + y * step); }
__CV_GPU_HOST_DEVICE__ const T* ptr(int y = 0) const { return (const T*)( (const char*)DevPtr<T>::data + y * step); }
__CV_GPU_HOST_DEVICE__ T& operator ()(int y, int x) { return ptr(y)[x]; }
__CV_GPU_HOST_DEVICE__ const T& operator ()(int y, int x) const { return ptr(y)[x]; }
};
template <typename T> struct PtrStepSz : public PtrStep<T>
{
__CV_GPU_HOST_DEVICE__ PtrStepSz() : cols(0), rows(0) {}
__CV_GPU_HOST_DEVICE__ PtrStepSz(int rows_, int cols_, T* data_, size_t step_)
: PtrStep<T>(data_, step_), cols(cols_), rows(rows_) {}
int cols;
int rows;
};
template <typename T> struct DevMem2D_ : public PtrStepSz<T>
{
DevMem2D_() {}
DevMem2D_(int rows_, int cols_, T *data_, size_t step_) : PtrStepSz<T>(rows_, cols_, data_, step_) {}
template <typename U>
explicit DevMem2D_(const DevMem2D_<U>& d) : PtrStepSz<T>(d.rows, d.cols, (T*)d.data, d.step) {}
};
template<typename T> struct PtrElemStep_ : public PtrStep<T>
{
PtrElemStep_(const DevMem2D_<T>& mem) : PtrStep<T>(mem.data, mem.step)
{
StaticAssert<256 % sizeof(T) == 0>::check();
PtrStep<T>::step /= PtrStep<T>::elem_size;
}
__CV_GPU_HOST_DEVICE__ T* ptr(int y = 0) { return PtrStep<T>::data + y * PtrStep<T>::step; }
__CV_GPU_HOST_DEVICE__ const T* ptr(int y = 0) const { return PtrStep<T>::data + y * PtrStep<T>::step; }
__CV_GPU_HOST_DEVICE__ T& operator ()(int y, int x) { return ptr(y)[x]; }
__CV_GPU_HOST_DEVICE__ const T& operator ()(int y, int x) const { return ptr(y)[x]; }
};
template<typename T> struct PtrStep_ : public PtrStep<T>
{
PtrStep_() {}
PtrStep_(const DevMem2D_<T>& mem) : PtrStep<T>(mem.data, mem.step) {}
};
#undef __CV_GPU_HOST_DEVICE__
typedef DevMem2D_<unsigned char> DevMem2Db;
typedef DevMem2Db DevMem2D;
typedef DevMem2D_<float> DevMem2Df;
typedef DevMem2D_<int> DevMem2Di;
typedef PtrStep<unsigned char> PtrStepb;
typedef PtrStep<float> PtrStepf;
typedef PtrStep<int> PtrStepi;
typedef PtrElemStep_<unsigned char> PtrElemStep;
typedef PtrElemStep_<float> PtrElemStepf;
typedef PtrElemStep_<int> PtrElemStepi;
}
}
#endif /* __OPENCV_GPU_DevMem2D_HPP__ */

File diff suppressed because it is too large

View File

@ -40,427 +40,4 @@
//
//M*/
#ifndef __OPENCV_GPUMAT_HPP__ #include "opencv2/core/gpumat.hpp"
#define __OPENCV_GPUMAT_HPP__
#include "opencv2/core/core.hpp"
#include "opencv2/gpu/devmem2d.hpp"
namespace cv { namespace gpu
{
//! Smart pointer for GPU memory with reference counting. Its interface is mostly similar with cv::Mat.
class CV_EXPORTS GpuMat
{
public:
//! returns lightweight DevMem2D_ structure for passing to nvcc-compiled code.
// Contains just image size, data ptr and step.
template <class T> operator DevMem2D_<T>() const;
template <class T> operator PtrStep_<T>() const;
template <class T> operator PtrStep<T>() const;
//! builds GpuMat from Mat. Perfom blocking upload to device.
explicit GpuMat(const Mat& m);
//! pefroms blocking upload data to GpuMat.
void upload(const Mat& m);
//! downloads data from device to host memory. Blocking calls.
void download(Mat& m) const;
operator Mat() const
{
Mat m;
download(m);
return m;
}
//! default constructor
GpuMat();
//! constructs GpuMatrix of the specified size and type (_type is CV_8UC1, CV_64FC3, CV_32SC(12) etc.)
GpuMat(int rows, int cols, int type);
GpuMat(Size size, int type);
//! constucts GpuMatrix and fills it with the specified value _s.
GpuMat(int rows, int cols, int type, const Scalar& s);
GpuMat(Size size, int type, const Scalar& s);
//! copy constructor
GpuMat(const GpuMat& m);
//! constructor for GpuMatrix headers pointing to user-allocated data
GpuMat(int rows, int cols, int type, void* data, size_t step = Mat::AUTO_STEP);
GpuMat(Size size, int type, void* data, size_t step = Mat::AUTO_STEP);
//! creates a matrix header for a part of the bigger matrix
GpuMat(const GpuMat& m, const Range& rowRange, const Range& colRange);
GpuMat(const GpuMat& m, const Rect& roi);
//! destructor - calls release()
~GpuMat();
//! assignment operators
GpuMat& operator = (const GpuMat& m);
//! returns a new GpuMatrix header for the specified row
GpuMat row(int y) const;
//! returns a new GpuMatrix header for the specified column
GpuMat col(int x) const;
//! ... for the specified row span
GpuMat rowRange(int startrow, int endrow) const;
GpuMat rowRange(const Range& r) const;
//! ... for the specified column span
GpuMat colRange(int startcol, int endcol) const;
GpuMat colRange(const Range& r) const;
//! returns deep copy of the GpuMatrix, i.e. the data is copied
GpuMat clone() const;
//! copies the GpuMatrix content to "m".
// It calls m.create(this->size(), this->type()).
void copyTo(GpuMat& m) const;
//! copies those GpuMatrix elements to "m" that are marked with non-zero mask elements.
void copyTo(GpuMat& m, const GpuMat& mask) const;
//! converts GpuMatrix to another datatype with optional scalng. See cvConvertScale.
void convertTo(GpuMat& m, int rtype, double alpha = 1, double beta = 0) const;
void assignTo(GpuMat& m, int type=-1) const;
//! sets every GpuMatrix element to s
GpuMat& operator = (const Scalar& s);
//! sets some of the GpuMatrix elements to s, according to the mask
GpuMat& setTo(const Scalar& s, const GpuMat& mask = GpuMat());
//! creates alternative GpuMatrix header for the same data, with different
// number of channels and/or different number of rows. see cvReshape.
GpuMat reshape(int cn, int rows = 0) const;
//! allocates new GpuMatrix data unless the GpuMatrix already has specified size and type.
// previous data is unreferenced if needed.
void create(int rows, int cols, int type);
void create(Size size, int type);
//! decreases reference counter;
// deallocate the data when reference counter reaches 0.
void release();
//! swaps with other smart pointer
void swap(GpuMat& mat);
//! locates GpuMatrix header within a parent GpuMatrix. See below
void locateROI(Size& wholeSize, Point& ofs) const;
//! moves/resizes the current GpuMatrix ROI inside the parent GpuMatrix.
GpuMat& adjustROI(int dtop, int dbottom, int dleft, int dright);
//! extracts a rectangular sub-GpuMatrix
// (this is a generalized form of row, rowRange etc.)
GpuMat operator()(Range rowRange, Range colRange) const;
GpuMat operator()(const Rect& roi) const;
//! returns true iff the GpuMatrix data is continuous
// (i.e. when there are no gaps between successive rows).
// similar to CV_IS_GpuMat_CONT(cvGpuMat->type)
bool isContinuous() const;
//! returns element size in bytes,
// similar to CV_ELEM_SIZE(cvMat->type)
size_t elemSize() const;
//! returns the size of element channel in bytes.
size_t elemSize1() const;
//! returns element type, similar to CV_MAT_TYPE(cvMat->type)
int type() const;
//! returns element type, similar to CV_MAT_DEPTH(cvMat->type)
int depth() const;
//! returns element type, similar to CV_MAT_CN(cvMat->type)
int channels() const;
//! returns step/elemSize1()
size_t step1() const;
//! returns GpuMatrix size:
// width == number of columns, height == number of rows
Size size() const;
//! returns true if GpuMatrix data is NULL
bool empty() const;
//! returns pointer to y-th row
uchar* ptr(int y = 0);
const uchar* ptr(int y = 0) const;
//! template version of the above method
template<typename _Tp> _Tp* ptr(int y = 0);
template<typename _Tp> const _Tp* ptr(int y = 0) const;
/*! includes several bit-fields:
- the magic signature
- continuity flag
- depth
- number of channels
*/
int flags;
//! the number of rows and columns
int rows, cols;
//! a distance between successive rows in bytes; includes the gap if any
size_t step;
//! pointer to the data
uchar* data;
//! pointer to the reference counter;
// when GpuMatrix points to user-allocated data, the pointer is NULL
int* refcount;
//! helper fields used in locateROI and adjustROI
uchar* datastart;
uchar* dataend;
};
//! Creates continuous GPU matrix
CV_EXPORTS void createContinuous(int rows, int cols, int type, GpuMat& m);
CV_EXPORTS GpuMat createContinuous(int rows, int cols, int type);
CV_EXPORTS void createContinuous(Size size, int type, GpuMat& m);
CV_EXPORTS GpuMat createContinuous(Size size, int type);
//! Ensures that size of the given matrix is not less than (rows, cols) size
//! and matrix type is match specified one too
CV_EXPORTS void ensureSizeIsEnough(int rows, int cols, int type, GpuMat& m);
CV_EXPORTS void ensureSizeIsEnough(Size size, int type, GpuMat& m);
////////////////////////////////////////////////////////////////////////
template <class T> inline GpuMat::operator DevMem2D_<T>() const { return DevMem2D_<T>(rows, cols, (T*)data, step); }
template <class T> inline GpuMat::operator PtrStep_<T>() const { return PtrStep_<T>(static_cast< DevMem2D_<T> >(*this)); }
template <class T> inline GpuMat::operator PtrStep<T>() const { return PtrStep<T>((T*)data, step); }
inline GpuMat::GpuMat()
: flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0)
{
}
inline GpuMat::GpuMat(int rows_, int cols_, int type_)
: flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0)
{
if (rows_ > 0 && cols_ > 0)
create(rows_, cols_, type_);
}
inline GpuMat::GpuMat(Size size_, int type_)
: flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0)
{
if (size_.height > 0 && size_.width > 0)
create(size_.height, size_.width, type_);
}
inline GpuMat::GpuMat(int rows_, int cols_, int type_, const Scalar& s_)
: flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0)
{
if (rows_ > 0 && cols_ > 0)
{
create(rows_, cols_, type_);
setTo(s_);
}
}
inline GpuMat::GpuMat(Size size_, int type_, const Scalar& s_)
: flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0)
{
if (size_.height > 0 && size_.width > 0)
{
create(size_.height, size_.width, type_);
setTo(s_);
}
}
inline GpuMat::~GpuMat()
{
release();
}
inline GpuMat GpuMat::clone() const
{
GpuMat m;
copyTo(m);
return m;
}
inline void GpuMat::assignTo(GpuMat& m, int type) const
{
if (type < 0)
m = *this;
else
convertTo(m, type);
}
inline size_t GpuMat::step1() const
{
return step / elemSize1();
}
inline bool GpuMat::empty() const
{
return data == 0;
}
template<typename _Tp> inline _Tp* GpuMat::ptr(int y)
{
return (_Tp*)ptr(y);
}
template<typename _Tp> inline const _Tp* GpuMat::ptr(int y) const
{
return (const _Tp*)ptr(y);
}
inline void swap(GpuMat& a, GpuMat& b)
{
a.swap(b);
}
inline GpuMat GpuMat::row(int y) const
{
return GpuMat(*this, Range(y, y+1), Range::all());
}
inline GpuMat GpuMat::col(int x) const
{
return GpuMat(*this, Range::all(), Range(x, x+1));
}
inline GpuMat GpuMat::rowRange(int startrow, int endrow) const
{
return GpuMat(*this, Range(startrow, endrow), Range::all());
}
inline GpuMat GpuMat::rowRange(const Range& r) const
{
return GpuMat(*this, r, Range::all());
}
inline GpuMat GpuMat::colRange(int startcol, int endcol) const
{
return GpuMat(*this, Range::all(), Range(startcol, endcol));
}
inline GpuMat GpuMat::colRange(const Range& r) const
{
return GpuMat(*this, Range::all(), r);
}
inline void GpuMat::create(Size size_, int type_)
{
create(size_.height, size_.width, type_);
}
inline GpuMat GpuMat::operator()(Range rowRange, Range colRange) const
{
return GpuMat(*this, rowRange, colRange);
}
inline GpuMat GpuMat::operator()(const Rect& roi) const
{
return GpuMat(*this, roi);
}
inline bool GpuMat::isContinuous() const
{
return (flags & Mat::CONTINUOUS_FLAG) != 0;
}
inline size_t GpuMat::elemSize() const
{
return CV_ELEM_SIZE(flags);
}
inline size_t GpuMat::elemSize1() const
{
return CV_ELEM_SIZE1(flags);
}
inline int GpuMat::type() const
{
return CV_MAT_TYPE(flags);
}
inline int GpuMat::depth() const
{
return CV_MAT_DEPTH(flags);
}
inline int GpuMat::channels() const
{
return CV_MAT_CN(flags);
}
inline Size GpuMat::size() const
{
return Size(cols, rows);
}
inline unsigned char* GpuMat::ptr(int y)
{
CV_DbgAssert((unsigned)y < (unsigned)rows);
return data + step * y;
}
inline const unsigned char* GpuMat::ptr(int y) const
{
CV_DbgAssert((unsigned)y < (unsigned)rows);
return data + step * y;
}
inline GpuMat& GpuMat::operator = (const Scalar& s)
{
setTo(s);
return *this;
}
inline GpuMat createContinuous(int rows, int cols, int type)
{
GpuMat m;
createContinuous(rows, cols, type, m);
return m;
}
inline void createContinuous(Size size, int type, GpuMat& m)
{
createContinuous(size.height, size.width, type, m);
}
inline GpuMat createContinuous(Size size, int type)
{
GpuMat m;
createContinuous(size, type, m);
return m;
}
inline void ensureSizeIsEnough(Size size, int type, GpuMat& m)
{
ensureSizeIsEnough(size.height, size.width, type, m);
}
inline void createContinuous(int rows, int cols, int type, GpuMat& m)
{
int area = rows * cols;
if (!m.isContinuous() || m.type() != type || m.size().area() != area)
m.create(1, area, type);
m = m.reshape(0, rows);
}
inline void ensureSizeIsEnough(int rows, int cols, int type, GpuMat& m)
{
if (m.type() == type && m.rows >= rows && m.cols >= cols)
m = m(Rect(0, 0, cols, rows));
else
m.create(rows, cols, type);
}
}}
#endif // __OPENCV_GPUMAT_HPP__

View File

@ -1,142 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other GpuMaterials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_GPU_MATRIX_OPERATIONS_HPP__
#define __OPENCV_GPU_MATRIX_OPERATIONS_HPP__
namespace cv
{
namespace gpu
{
///////////////////////////////////////////////////////////////////////
//////////////////////////////// CudaMem ////////////////////////////////
///////////////////////////////////////////////////////////////////////
inline CudaMem::CudaMem() : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), alloc_type(0) {}
inline CudaMem::CudaMem(int _rows, int _cols, int _type, int _alloc_type) : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), alloc_type(0)
{
if( _rows > 0 && _cols > 0 )
create( _rows, _cols, _type, _alloc_type);
}
inline CudaMem::CudaMem(Size _size, int _type, int _alloc_type) : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), alloc_type(0)
{
if( _size.height > 0 && _size.width > 0 )
create( _size.height, _size.width, _type, _alloc_type);
}
inline CudaMem::CudaMem(const CudaMem& m) : flags(m.flags), rows(m.rows), cols(m.cols), step(m.step), data(m.data), refcount(m.refcount), datastart(m.datastart), dataend(m.dataend), alloc_type(m.alloc_type)
{
if( refcount )
CV_XADD(refcount, 1);
}
inline CudaMem::CudaMem(const Mat& m, int _alloc_type) : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), alloc_type(0)
{
if( m.rows > 0 && m.cols > 0 )
create( m.size(), m.type(), _alloc_type);
Mat tmp = createMatHeader();
m.copyTo(tmp);
}
inline CudaMem::~CudaMem()
{
release();
}
inline CudaMem& CudaMem::operator = (const CudaMem& m)
{
if( this != &m )
{
if( m.refcount )
CV_XADD(m.refcount, 1);
release();
flags = m.flags;
rows = m.rows; cols = m.cols;
step = m.step; data = m.data;
datastart = m.datastart;
dataend = m.dataend;
refcount = m.refcount;
alloc_type = m.alloc_type;
}
return *this;
}
inline CudaMem CudaMem::clone() const
{
CudaMem m(size(), type(), alloc_type);
Mat to = m;
Mat from = *this;
from.copyTo(to);
return m;
}
inline void CudaMem::create(Size _size, int _type, int _alloc_type) { create(_size.height, _size.width, _type, _alloc_type); }
//CCP void CudaMem::create(int _rows, int _cols, int _type, int _alloc_type);
//CPP void CudaMem::release();
inline Mat CudaMem::createMatHeader() const { return Mat(size(), type(), data, step); }
inline CudaMem::operator Mat() const { return createMatHeader(); }
inline CudaMem::operator GpuMat() const { return createGpuMatHeader(); }
//CPP GpuMat CudaMem::createGpuMatHeader() const;
inline bool CudaMem::isContinuous() const { return (flags & Mat::CONTINUOUS_FLAG) != 0; }
inline size_t CudaMem::elemSize() const { return CV_ELEM_SIZE(flags); }
inline size_t CudaMem::elemSize1() const { return CV_ELEM_SIZE1(flags); }
inline int CudaMem::type() const { return CV_MAT_TYPE(flags); }
inline int CudaMem::depth() const { return CV_MAT_DEPTH(flags); }
inline int CudaMem::channels() const { return CV_MAT_CN(flags); }
inline size_t CudaMem::step1() const { return step/elemSize1(); }
inline Size CudaMem::size() const { return Size(cols, rows); }
inline bool CudaMem::empty() const { return data == 0; }
} /* end of namespace gpu */
} /* end of namespace cv */
#endif /* __OPENCV_GPU_MATRIX_OPERATIONS_HPP__ */

View File

@ -24,7 +24,7 @@ PERF_TEST_P(DevInfo_Size_MatType, transpose, testing::Combine(testing::ValuesIn(
transpose(src, dst);
}
Mat dst_host = dst; Mat dst_host(dst);
SANITY_CHECK(dst_host);
}
@ -55,7 +55,7 @@ PERF_TEST_P(DevInfo_Size_MatType_FlipCode, flip, testing::Combine(testing::Value
flip(src, dst, flipCode);
}
Mat dst_host = dst; Mat dst_host(dst);
SANITY_CHECK(dst_host);
}
@ -85,7 +85,7 @@ PERF_TEST_P(DevInfo_Size_MatType, LUT, testing::Combine(testing::ValuesIn(device
LUT(src, lut, dst);
}
Mat dst_host = dst; Mat dst_host(dst);
SANITY_CHECK(dst_host);
}
@ -115,8 +115,8 @@ PERF_TEST_P(DevInfo_Size, cartToPolar, testing::Combine(testing::ValuesIn(device
cartToPolar(x, y, magnitude, angle);
}
Mat magnitude_host = magnitude; Mat magnitude_host(magnitude);
Mat angle_host = angle; Mat angle_host(angle);
SANITY_CHECK(magnitude_host);
SANITY_CHECK(angle_host);
@ -147,8 +147,8 @@ PERF_TEST_P(DevInfo_Size, polarToCart, testing::Combine(testing::ValuesIn(device
polarToCart(magnitude, angle, x, y);
}
Mat x_host = x; Mat x_host(x);
Mat y_host = angle; Mat y_host(y);
SANITY_CHECK(x_host);
SANITY_CHECK(y_host);
@ -180,7 +180,7 @@ PERF_TEST_P(DevInfo_Size_MatType, addMat, testing::Combine(testing::ValuesIn(dev
add(a, b, c); add(a, b, c);
} }
Mat c_host = c; Mat c_host(c);
SANITY_CHECK(c_host); SANITY_CHECK(c_host);
} }
@ -210,7 +210,7 @@ PERF_TEST_P(DevInfo_Size_MatType, addScalar, testing::Combine(testing::ValuesIn(
add(a, b, c); add(a, b, c);
} }
Mat c_host = c; Mat c_host(c);
SANITY_CHECK(c_host); SANITY_CHECK(c_host);
} }
@ -241,7 +241,7 @@ PERF_TEST_P(DevInfo_Size_MatType, subtractMat, testing::Combine(testing::ValuesI
subtract(a, b, c); subtract(a, b, c);
} }
Mat c_host = c; Mat c_host(c);
SANITY_CHECK(c_host); SANITY_CHECK(c_host);
} }
@ -270,7 +270,7 @@ PERF_TEST_P(DevInfo_Size, multiplyMat, testing::Combine(testing::ValuesIn(device
multiply(a, b, c); multiply(a, b, c);
} }
Mat c_host = c; Mat c_host(c);
SANITY_CHECK(c_host); SANITY_CHECK(c_host);
} }
@ -300,7 +300,7 @@ PERF_TEST_P(DevInfo_Size_MatType, multiplyScalar, testing::Combine(testing::Valu
multiply(a, b, c); multiply(a, b, c);
} }
Mat c_host = c; Mat c_host(c);
SANITY_CHECK(c_host); SANITY_CHECK(c_host);
} }
@ -327,7 +327,7 @@ PERF_TEST_P(DevInfo_Size, exp, testing::Combine(testing::ValuesIn(devices()),
exp(a, b); exp(a, b);
} }
Mat b_host = b; Mat b_host(b);
SANITY_CHECK(b_host); SANITY_CHECK(b_host);
} }
@ -356,7 +356,7 @@ PERF_TEST_P(DevInfo_Size_MatType, pow, testing::Combine(testing::ValuesIn(device
pow(src, 2.0, dst); pow(src, 2.0, dst);
} }
Mat dst_host = dst; Mat dst_host(dst);
SANITY_CHECK(dst_host); SANITY_CHECK(dst_host);
} }
@ -389,7 +389,7 @@ PERF_TEST_P(DevInfo_Size_MatType_CmpOp, compare, testing::Combine(testing::Value
compare(src1, src2, dst, cmpop); compare(src1, src2, dst, cmpop);
} }
Mat dst_host = dst; Mat dst_host(dst);
SANITY_CHECK(dst_host); SANITY_CHECK(dst_host);
} }
@ -418,7 +418,7 @@ PERF_TEST_P(DevInfo_Size_MatType, bitwise_not, testing::Combine(testing::ValuesI
bitwise_not(src, dst); bitwise_not(src, dst);
} }
Mat dst_host = dst; Mat dst_host(dst);
SANITY_CHECK(dst_host); SANITY_CHECK(dst_host);
} }
@ -449,7 +449,7 @@ PERF_TEST_P(DevInfo_Size_MatType, bitwise_and, testing::Combine(testing::ValuesI
bitwise_and(src1, src2, dst); bitwise_and(src1, src2, dst);
} }
Mat dst_host = dst; Mat dst_host(dst);
SANITY_CHECK(dst_host); SANITY_CHECK(dst_host);
} }
@ -480,7 +480,7 @@ PERF_TEST_P(DevInfo_Size_MatType, min, testing::Combine(testing::ValuesIn(device
min(src1, src2, dst); min(src1, src2, dst);
} }
Mat dst_host = dst; Mat dst_host(dst);
SANITY_CHECK(dst_host); SANITY_CHECK(dst_host);
} }
@ -712,7 +712,7 @@ PERF_TEST_P(DevInfo_Size_MatType, addWeighted, testing::Combine(testing::ValuesI
addWeighted(src1, 0.5, src2, 0.5, 0.0, dst); addWeighted(src1, 0.5, src2, 0.5, 0.0, dst);
} }
Mat dst_host = dst; Mat dst_host(dst);
SANITY_CHECK(dst_host); SANITY_CHECK(dst_host);
} }
@ -743,7 +743,7 @@ PERF_TEST_P(DevInfo_Size_MatType_FlipCode, reduce, testing::Combine(testing::Val
reduce(src, dst, dim, CV_REDUCE_MIN); reduce(src, dst, dim, CV_REDUCE_MIN);
} }
Mat dst_host = dst; Mat dst_host(dst);
SANITY_CHECK(dst_host); SANITY_CHECK(dst_host);
} }
@ -774,7 +774,7 @@ PERF_TEST_P(DevInfo_Size, gemm, testing::Combine(testing::ValuesIn(devices()),
gemm(src1, src2, 1.0, src3, 1.0, dst); gemm(src1, src2, 1.0, src3, 1.0, dst);
} }
Mat dst_host = dst; Mat dst_host(dst);
SANITY_CHECK(dst_host); SANITY_CHECK(dst_host);
} }
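Every hunk in this perf-test file makes the same mechanical substitution: copy-initialization of a host Mat from a GpuMat (Mat dst_host = dst;) becomes direct construction (Mat dst_host(dst);), and where a header already exists the tests switch to an explicit download() call (as in the setToMasked hunk further down). A minimal sketch of the two explicit forms, assuming a Mat constructor that accepts a gpu::GpuMat and the existing GpuMat::download():

// Hedged sketch of the download patterns the updated perf tests rely on.
#include <opencv2/core/core.hpp>
#include <opencv2/gpu/gpu.hpp>

void fetch_result(const cv::gpu::GpuMat& dst)
{
    cv::Mat dst_host(dst);      // direct construction copies device -> host

    cv::Mat reused;
    dst.download(reused);       // explicit download into an already existing Mat
    // SANITY_CHECK(dst_host);  // perf-framework macro, shown only for context
}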


@ -20,7 +20,7 @@ PERF_TEST_P(DevInfo, transformPoints, testing::ValuesIn(devices()))
transformPoints(src, Mat::ones(1, 3, CV_32FC1), Mat::ones(1, 3, CV_32FC1), dst); transformPoints(src, Mat::ones(1, 3, CV_32FC1), Mat::ones(1, 3, CV_32FC1), dst);
} }
Mat dst_host = dst; Mat dst_host(dst);
SANITY_CHECK(dst_host); SANITY_CHECK(dst_host);
} }
@ -45,7 +45,7 @@ PERF_TEST_P(DevInfo, projectPoints, testing::ValuesIn(devices()))
projectPoints(src, Mat::ones(1, 3, CV_32FC1), Mat::ones(1, 3, CV_32FC1), Mat::ones(3, 3, CV_32FC1), Mat(), dst); projectPoints(src, Mat::ones(1, 3, CV_32FC1), Mat::ones(1, 3, CV_32FC1), Mat::ones(3, 3, CV_32FC1), Mat(), dst);
} }
Mat dst_host = dst; Mat dst_host(dst);
SANITY_CHECK(dst_host); SANITY_CHECK(dst_host);
} }


@ -28,7 +28,7 @@ PERF_TEST_P(DevInfo_Size_MatType_KernelSize, boxFilter, testing::Combine(testing
filter->apply(src, dst); filter->apply(src, dst);
} }
Mat dst_host = dst; Mat dst_host(dst);
SANITY_CHECK(dst_host); SANITY_CHECK(dst_host);
} }
@ -63,7 +63,7 @@ PERF_TEST_P(DevInfo_Size_MatType_MorphOp_KernelSize, morphologyFilter, testing::
filter->apply(src, dst); filter->apply(src, dst);
} }
Mat dst_host = dst; Mat dst_host(dst);
SANITY_CHECK(dst_host); SANITY_CHECK(dst_host);
} }
@ -96,7 +96,7 @@ PERF_TEST_P(DevInfo_Size_MatType_KernelSize, linearFilter, testing::Combine(test
filter->apply(src, dst); filter->apply(src, dst);
} }
Mat dst_host = dst; Mat dst_host(dst);
SANITY_CHECK(dst_host); SANITY_CHECK(dst_host);
} }
@ -130,7 +130,7 @@ PERF_TEST_P(DevInfo_Size_MatType_KernelSize, separableLinearFilter, testing::Com
filter->apply(src, dst, Rect(0, 0, src.cols, src.rows)); filter->apply(src, dst, Rect(0, 0, src.cols, src.rows));
} }
Mat dst_host = dst; Mat dst_host(dst);
SANITY_CHECK(dst_host); SANITY_CHECK(dst_host);
} }


@ -36,7 +36,7 @@ PERF_TEST_P(DevInfo_Size_MatType_Interpolation_BorderMode, remap, testing::Combi
remap(src, dst, xmap, ymap, interpolation, borderMode); remap(src, dst, xmap, ymap, interpolation, borderMode);
} }
Mat dst_host = dst; Mat dst_host(dst);
SANITY_CHECK(dst_host); SANITY_CHECK(dst_host);
} }
@ -63,7 +63,7 @@ PERF_TEST_P(DevInfo, meanShiftFiltering, testing::ValuesIn(devices()))
meanShiftFiltering(src, dst, 50, 50); meanShiftFiltering(src, dst, 50, 50);
} }
Mat dst_host = dst; Mat dst_host(dst);
SANITY_CHECK(dst_host); SANITY_CHECK(dst_host);
} }
@ -91,8 +91,8 @@ PERF_TEST_P(DevInfo, meanShiftProc, testing::ValuesIn(devices()))
meanShiftProc(src, dstr, dstsp, 50, 50); meanShiftProc(src, dstr, dstsp, 50, 50);
} }
Mat dstr_host = dstr; Mat dstr_host(dstr);
Mat dstsp_host = dstsp; Mat dstsp_host(dstsp);
SANITY_CHECK(dstr_host); SANITY_CHECK(dstr_host);
SANITY_CHECK(dstsp_host); SANITY_CHECK(dstsp_host);


@ -25,7 +25,7 @@ PERF_TEST_P(DevInfo_Size_MatType, merge, testing::Combine(testing::ValuesIn(devi
merge(src, dst); merge(src, dst);
} }
Mat dst_host = dst; Mat dst_host(dst);
SANITY_CHECK(dst_host); SANITY_CHECK(dst_host);
} }
@ -82,7 +82,7 @@ PERF_TEST_P(DevInfo_Size_MatType, setTo, testing::Combine(testing::ValuesIn(devi
src.setTo(val); src.setTo(val);
} }
Mat src_host = src; Mat src_host(src);
SANITY_CHECK(src_host); SANITY_CHECK(src_host);
} }
@ -115,7 +115,7 @@ PERF_TEST_P(DevInfo_Size_MatType, setToMasked, testing::Combine(testing::ValuesI
src.setTo(val, mask); src.setTo(val, mask);
} }
src_host = src; src.download(src_host);
SANITY_CHECK(src_host); SANITY_CHECK(src_host);
} }
@ -148,7 +148,7 @@ PERF_TEST_P(DevInfo_Size_MatType, copyToMasked, testing::Combine(testing::Values
src.copyTo(dst, mask); src.copyTo(dst, mask);
} }
Mat dst_host = dst; Mat dst_host(dst);
SANITY_CHECK(dst_host); SANITY_CHECK(dst_host);
} }
@ -182,7 +182,7 @@ PERF_TEST_P(DevInfo_Size_MatType_MatType, convertTo, testing::Combine(testing::V
src.convertTo(dst, type2, a, b); src.convertTo(dst, type2, a, b);
} }
Mat dst_host = dst; Mat dst_host(dst);
SANITY_CHECK(dst_host); SANITY_CHECK(dst_host);
} }


@ -425,16 +425,22 @@ void cv::gpu::magnitudeSqr(const GpuMat& src, GpuMat& dst, Stream& stream)
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
// Polar <-> Cart // Polar <-> Cart
namespace cv { namespace gpu { namespace mathfunc BEGIN_OPENCV_DEVICE_NAMESPACE
namespace mathfunc
{ {
void cartToPolar_gpu(const DevMem2Df& x, const DevMem2Df& y, const DevMem2Df& mag, bool magSqr, const DevMem2Df& angle, bool angleInDegrees, cudaStream_t stream); void cartToPolar_gpu(DevMem2Df x, DevMem2Df y, DevMem2Df mag, bool magSqr, DevMem2Df angle, bool angleInDegrees, cudaStream_t stream);
void polarToCart_gpu(const DevMem2Df& mag, const DevMem2Df& angle, const DevMem2Df& x, const DevMem2Df& y, bool angleInDegrees, cudaStream_t stream); void polarToCart_gpu(DevMem2Df mag, DevMem2Df angle, DevMem2Df x, DevMem2Df y, bool angleInDegrees, cudaStream_t stream);
}}} }
END_OPENCV_DEVICE_NAMESPACE
namespace namespace
{ {
inline void cartToPolar_caller(const GpuMat& x, const GpuMat& y, GpuMat* mag, bool magSqr, GpuMat* angle, bool angleInDegrees, cudaStream_t stream) inline void cartToPolar_caller(const GpuMat& x, const GpuMat& y, GpuMat* mag, bool magSqr, GpuMat* angle, bool angleInDegrees, cudaStream_t stream)
{ {
using namespace OPENCV_DEVICE_NAMESPACE_ mathfunc;
CV_DbgAssert(x.size() == y.size() && x.type() == y.type()); CV_DbgAssert(x.size() == y.size() && x.type() == y.type());
CV_Assert(x.depth() == CV_32F); CV_Assert(x.depth() == CV_32F);
@ -448,11 +454,13 @@ namespace
GpuMat mag1cn = mag ? mag->reshape(1) : GpuMat(); GpuMat mag1cn = mag ? mag->reshape(1) : GpuMat();
GpuMat angle1cn = angle ? angle->reshape(1) : GpuMat(); GpuMat angle1cn = angle ? angle->reshape(1) : GpuMat();
mathfunc::cartToPolar_gpu(x1cn, y1cn, mag1cn, magSqr, angle1cn, angleInDegrees, stream); cartToPolar_gpu(x1cn, y1cn, mag1cn, magSqr, angle1cn, angleInDegrees, stream);
} }
inline void polarToCart_caller(const GpuMat& mag, const GpuMat& angle, GpuMat& x, GpuMat& y, bool angleInDegrees, cudaStream_t stream) inline void polarToCart_caller(const GpuMat& mag, const GpuMat& angle, GpuMat& x, GpuMat& y, bool angleInDegrees, cudaStream_t stream)
{ {
using namespace OPENCV_DEVICE_NAMESPACE_ mathfunc;
CV_DbgAssert((mag.empty() || mag.size() == angle.size()) && mag.type() == angle.type()); CV_DbgAssert((mag.empty() || mag.size() == angle.size()) && mag.type() == angle.type());
CV_Assert(mag.depth() == CV_32F); CV_Assert(mag.depth() == CV_32F);
@ -464,34 +472,33 @@ namespace
GpuMat x1cn = x.reshape(1); GpuMat x1cn = x.reshape(1);
GpuMat y1cn = y.reshape(1); GpuMat y1cn = y.reshape(1);
mathfunc::polarToCart_gpu(mag1cn, angle1cn, x1cn, y1cn, angleInDegrees, stream); polarToCart_gpu(mag1cn, angle1cn, x1cn, y1cn, angleInDegrees, stream);
} }
} }
void cv::gpu::magnitude(const GpuMat& x, const GpuMat& y, GpuMat& dst, Stream& stream) void cv::gpu::magnitude(const GpuMat& x, const GpuMat& y, GpuMat& dst, Stream& stream)
{ {
::cartToPolar_caller(x, y, &dst, false, 0, false, StreamAccessor::getStream(stream)); cartToPolar_caller(x, y, &dst, false, 0, false, StreamAccessor::getStream(stream));
} }
void cv::gpu::magnitudeSqr(const GpuMat& x, const GpuMat& y, GpuMat& dst, Stream& stream) void cv::gpu::magnitudeSqr(const GpuMat& x, const GpuMat& y, GpuMat& dst, Stream& stream)
{ {
::cartToPolar_caller(x, y, &dst, true, 0, false, StreamAccessor::getStream(stream)); cartToPolar_caller(x, y, &dst, true, 0, false, StreamAccessor::getStream(stream));
} }
void cv::gpu::phase(const GpuMat& x, const GpuMat& y, GpuMat& angle, bool angleInDegrees, Stream& stream) void cv::gpu::phase(const GpuMat& x, const GpuMat& y, GpuMat& angle, bool angleInDegrees, Stream& stream)
{ {
::cartToPolar_caller(x, y, 0, false, &angle, angleInDegrees, StreamAccessor::getStream(stream)); cartToPolar_caller(x, y, 0, false, &angle, angleInDegrees, StreamAccessor::getStream(stream));
} }
void cv::gpu::cartToPolar(const GpuMat& x, const GpuMat& y, GpuMat& mag, GpuMat& angle, bool angleInDegrees, Stream& stream) void cv::gpu::cartToPolar(const GpuMat& x, const GpuMat& y, GpuMat& mag, GpuMat& angle, bool angleInDegrees, Stream& stream)
{ {
::cartToPolar_caller(x, y, &mag, false, &angle, angleInDegrees, StreamAccessor::getStream(stream)); cartToPolar_caller(x, y, &mag, false, &angle, angleInDegrees, StreamAccessor::getStream(stream));
} }
void cv::gpu::polarToCart(const GpuMat& magnitude, const GpuMat& angle, GpuMat& x, GpuMat& y, bool angleInDegrees, Stream& stream) void cv::gpu::polarToCart(const GpuMat& magnitude, const GpuMat& angle, GpuMat& x, GpuMat& y, bool angleInDegrees, Stream& stream)
{ {
::polarToCart_caller(magnitude, angle, x, y, angleInDegrees, StreamAccessor::getStream(stream)); polarToCart_caller(magnitude, angle, x, y, angleInDegrees, StreamAccessor::getStream(stream));
} }
#endif /* !defined (HAVE_CUDA) */ #endif /* !defined (HAVE_CUDA) */
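From this file on, the per-module device declarations stop spelling out namespace cv { namespace gpu { namespace <module> and are wrapped in BEGIN_OPENCV_DEVICE_NAMESPACE / END_OPENCV_DEVICE_NAMESPACE instead, with OPENCV_DEVICE_NAMESPACE_ used to qualify or using-import the wrapped names at the call sites. The macro definitions themselves are not part of this excerpt; the expansion below is only a plausible reading, included to make the rewritten hunks easier to follow:

// Hypothetical expansion, for illustration only; the real definitions live in
// the gpu module's internal headers, not in this diff.
#define BEGIN_OPENCV_DEVICE_NAMESPACE namespace cv { namespace gpu { namespace device {
#define END_OPENCV_DEVICE_NAMESPACE   }}}
#define OPENCV_DEVICE_NAMESPACE       ::cv::gpu::device
#define OPENCV_DEVICE_NAMESPACE_      ::cv::gpu::device::

// Under that reading,
//     using namespace OPENCV_DEVICE_NAMESPACE_ mathfunc;
// expands to
//     using namespace ::cv::gpu::device::mathfunc;
// which is why the mathfunc:: prefixes disappear from the callers above.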


@ -55,13 +55,19 @@ void cv::gpu::DisparityBilateralFilter::operator()(const GpuMat&, const GpuMat&,
#else /* !defined (HAVE_CUDA) */ #else /* !defined (HAVE_CUDA) */
namespace cv { namespace gpu { namespace bf BEGIN_OPENCV_DEVICE_NAMESPACE
{
void load_constants(float* table_color, const DevMem2Df& table_space, int ndisp, int radius, short edge_disc, short max_disc);
void bilateral_filter_gpu(const DevMem2Db& disp, const DevMem2Db& img, int channels, int iters, cudaStream_t stream); namespace bilateral_filter
void bilateral_filter_gpu(const DevMem2D_<short>& disp, const DevMem2Db& img, int channels, int iters, cudaStream_t stream); {
}}} void load_constants(float* table_color, DevMem2Df table_space, int ndisp, int radius, short edge_disc, short max_disc);
void bilateral_filter_gpu(DevMem2Db disp, DevMem2Db img, int channels, int iters, cudaStream_t stream);
void bilateral_filter_gpu(DevMem2D_<short> disp, DevMem2Db img, int channels, int iters, cudaStream_t stream);
}
END_OPENCV_DEVICE_NAMESPACE
using namespace OPENCV_DEVICE_NAMESPACE_ bilateral_filter;
namespace namespace
{ {
@ -105,7 +111,7 @@ namespace
short edge_disc = max<short>(short(1), short(ndisp * edge_threshold + 0.5)); short edge_disc = max<short>(short(1), short(ndisp * edge_threshold + 0.5));
short max_disc = short(ndisp * max_disc_threshold + 0.5); short max_disc = short(ndisp * max_disc_threshold + 0.5);
bf::load_constants(table_color.ptr<float>(), table_space, ndisp, radius, edge_disc, max_disc); load_constants(table_color.ptr<float>(), table_space, ndisp, radius, edge_disc, max_disc);
if (&dst != &disp) if (&dst != &disp)
{ {
@ -115,7 +121,7 @@ namespace
disp.copyTo(dst); disp.copyTo(dst);
} }
bf::bilateral_filter_gpu((DevMem2D_<T>)dst, img, img.channels(), iters, StreamAccessor::getStream(stream)); bilateral_filter_gpu((DevMem2D_<T>)dst, img, img.channels(), iters, StreamAccessor::getStream(stream));
} }
typedef void (*bilateral_filter_operator_t)(int ndisp, int radius, int iters, float edge_threshold, float max_disc_threshold, typedef void (*bilateral_filter_operator_t)(int ndisp, int radius, int iters, float edge_threshold, float max_disc_threshold,


@ -52,15 +52,19 @@ void cv::gpu::blendLinear(const GpuMat&, const GpuMat&, const GpuMat&, const Gpu
#else #else
namespace cv { namespace gpu BEGIN_OPENCV_DEVICE_NAMESPACE
namespace blend
{ {
template <typename T> template <typename T>
void blendLinearCaller(int rows, int cols, int cn, const PtrStep<T>& img1, const PtrStep<T>& img2, void blendLinearCaller(int rows, int cols, int cn, PtrStep<T> img1, PtrStep<T> img2, PtrStepf weights1, PtrStepf weights2, PtrStep<T> result, cudaStream_t stream);
const PtrStepf& weights1, const PtrStepf& weights2, PtrStep<T> result, cudaStream_t stream);
void blendLinearCaller8UC4(int rows, int cols, const PtrStepb& img1, const PtrStepb& img2, void blendLinearCaller8UC4(int rows, int cols, PtrStepb img1, PtrStepb img2, PtrStepf weights1, PtrStepf weights2, PtrStepb result, cudaStream_t stream);
const PtrStepf& weights1, const PtrStepf& weights2, PtrStepb result, cudaStream_t stream); }
}}
END_OPENCV_DEVICE_NAMESPACE
using namespace OPENCV_DEVICE_NAMESPACE_ blend;
void cv::gpu::blendLinear(const GpuMat& img1, const GpuMat& img2, const GpuMat& weights1, const GpuMat& weights2, void cv::gpu::blendLinear(const GpuMat& img1, const GpuMat& img2, const GpuMat& weights1, const GpuMat& weights2,
GpuMat& result, Stream& stream) GpuMat& result, Stream& stream)


@ -82,7 +82,9 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat&, vector< vec
#else /* !defined (HAVE_CUDA) */ #else /* !defined (HAVE_CUDA) */
namespace cv { namespace gpu { namespace bf_match BEGIN_OPENCV_DEVICE_NAMESPACE
namespace bf_match
{ {
template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask, template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask,
const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Di& trainIdx, const DevMem2Df& distance,
@ -103,9 +105,9 @@ namespace cv { namespace gpu { namespace bf_match
template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
int cc, cudaStream_t stream); int cc, cudaStream_t stream);
}}} }
namespace cv { namespace gpu { namespace bf_knnmatch namespace bf_knnmatch
{ {
template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask, template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask,
const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist,
@ -126,9 +128,9 @@ namespace cv { namespace gpu { namespace bf_knnmatch
template <typename T> void match2Hamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, template <typename T> void match2Hamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance,
int cc, cudaStream_t stream); int cc, cudaStream_t stream);
}}} }
namespace cv { namespace gpu { namespace bf_radius_match namespace bf_radius_match
{ {
template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask, template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask,
const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
@ -151,15 +153,17 @@ namespace cv { namespace gpu { namespace bf_radius_match
template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks,
const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
int cc, cudaStream_t stream); int cc, cudaStream_t stream);
}}} }
END_OPENCV_DEVICE_NAMESPACE
////////////////////////////////////////////////////////////////////
// Train collection
cv::gpu::BruteForceMatcher_GPU_base::BruteForceMatcher_GPU_base(DistType distType_) : distType(distType_) cv::gpu::BruteForceMatcher_GPU_base::BruteForceMatcher_GPU_base(DistType distType_) : distType(distType_)
{ {
} }
////////////////////////////////////////////////////////////////////
// Train collection
void cv::gpu::BruteForceMatcher_GPU_base::add(const vector<GpuMat>& descCollection) void cv::gpu::BruteForceMatcher_GPU_base::add(const vector<GpuMat>& descCollection)
{ {
trainDescCollection.insert(trainDescCollection.end(), descCollection.begin(), descCollection.end()); trainDescCollection.insert(trainDescCollection.end(), descCollection.begin(), descCollection.end());
@ -195,7 +199,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat& query, const
if (query.empty() || train.empty()) if (query.empty() || train.empty())
return; return;
using namespace cv::gpu::bf_match; using namespace OPENCV_DEVICE_NAMESPACE_ bf_match;
typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask, typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask,
const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Di& trainIdx, const DevMem2Df& distance,
@ -242,8 +246,8 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx,
if (trainIdx.empty() || distance.empty()) if (trainIdx.empty() || distance.empty())
return; return;
Mat trainIdxCPU = trainIdx; Mat trainIdxCPU(trainIdx);
Mat distanceCPU = distance; Mat distanceCPU(distance);
matchConvert(trainIdxCPU, distanceCPU, matches); matchConvert(trainIdxCPU, distanceCPU, matches);
} }
@ -337,7 +341,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat& query, c
if (query.empty() || trainCollection.empty()) if (query.empty() || trainCollection.empty())
return; return;
using namespace cv::gpu::bf_match; using namespace OPENCV_DEVICE_NAMESPACE_ bf_match;
typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
@ -384,9 +388,9 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx,
if (trainIdx.empty() || imgIdx.empty() || distance.empty()) if (trainIdx.empty() || imgIdx.empty() || distance.empty())
return; return;
Mat trainIdxCPU = trainIdx; Mat trainIdxCPU(trainIdx);
Mat imgIdxCPU = imgIdx; Mat imgIdxCPU(imgIdx);
Mat distanceCPU = distance; Mat distanceCPU(distance);
matchConvert(trainIdxCPU, imgIdxCPU, distanceCPU, matches); matchConvert(trainIdxCPU, imgIdxCPU, distanceCPU, matches);
} }
@ -448,7 +452,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatchSingle(const GpuMat& query, co
if (query.empty() || train.empty()) if (query.empty() || train.empty())
return; return;
using namespace cv::gpu::bf_knnmatch; using namespace OPENCV_DEVICE_NAMESPACE_ bf_knnmatch;
typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask, typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask,
const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist,
@ -511,8 +515,8 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatchDownload(const GpuMat& trainId
if (trainIdx.empty() || distance.empty()) if (trainIdx.empty() || distance.empty())
return; return;
Mat trainIdxCPU = trainIdx; Mat trainIdxCPU(trainIdx);
Mat distanceCPU = distance; Mat distanceCPU(distance);
knnMatchConvert(trainIdxCPU, distanceCPU, matches, compactResult); knnMatchConvert(trainIdxCPU, distanceCPU, matches, compactResult);
} }
@ -577,7 +581,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatch2Collection(const GpuMat& quer
if (query.empty() || trainCollection.empty()) if (query.empty() || trainCollection.empty())
return; return;
using namespace cv::gpu::bf_knnmatch; using namespace OPENCV_DEVICE_NAMESPACE_ bf_knnmatch;
typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance,
@ -630,9 +634,9 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatch2Download(const GpuMat& trainI
if (trainIdx.empty() || imgIdx.empty() || distance.empty()) if (trainIdx.empty() || imgIdx.empty() || distance.empty())
return; return;
Mat trainIdxCPU = trainIdx; Mat trainIdxCPU(trainIdx);
Mat imgIdxCPU = imgIdx; Mat imgIdxCPU(imgIdx);
Mat distanceCPU = distance; Mat distanceCPU(distance);
knnMatch2Convert(trainIdxCPU, imgIdxCPU, distanceCPU, matches, compactResult); knnMatch2Convert(trainIdxCPU, imgIdxCPU, distanceCPU, matches, compactResult);
} }
@ -758,7 +762,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchSingle(const GpuMat& query,
if (query.empty() || train.empty()) if (query.empty() || train.empty())
return; return;
using namespace cv::gpu::bf_radius_match; using namespace OPENCV_DEVICE_NAMESPACE_ bf_radius_match;
typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask, typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask,
const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
@ -819,9 +823,9 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchDownload(const GpuMat& trai
if (trainIdx.empty() || distance.empty() || nMatches.empty()) if (trainIdx.empty() || distance.empty() || nMatches.empty())
return; return;
Mat trainIdxCPU = trainIdx; Mat trainIdxCPU(trainIdx);
Mat distanceCPU = distance; Mat distanceCPU(distance);
Mat nMatchesCPU = nMatches; Mat nMatchesCPU(nMatches);
radiusMatchConvert(trainIdxCPU, distanceCPU, nMatchesCPU, matches, compactResult); radiusMatchConvert(trainIdxCPU, distanceCPU, nMatchesCPU, matches, compactResult);
} }
@ -889,7 +893,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchCollection(const GpuMat& qu
if (query.empty() || empty()) if (query.empty() || empty())
return; return;
using namespace cv::gpu::bf_radius_match; using namespace OPENCV_DEVICE_NAMESPACE_ bf_radius_match;
typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks,
const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
@ -953,10 +957,10 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchDownload(const GpuMat& trai
if (trainIdx.empty() || imgIdx.empty() || distance.empty() || nMatches.empty()) if (trainIdx.empty() || imgIdx.empty() || distance.empty() || nMatches.empty())
return; return;
Mat trainIdxCPU = trainIdx; Mat trainIdxCPU(trainIdx);
Mat imgIdxCPU = imgIdx; Mat imgIdxCPU(imgIdx);
Mat distanceCPU = distance; Mat distanceCPU(distance);
Mat nMatchesCPU = nMatches; Mat nMatchesCPU(nMatches);
radiusMatchConvert(trainIdxCPU, imgIdxCPU, distanceCPU, nMatchesCPU, matches, compactResult); radiusMatchConvert(trainIdxCPU, imgIdxCPU, distanceCPU, nMatchesCPU, matches, compactResult);
} }


@ -42,6 +42,10 @@
#include "precomp.hpp" #include "precomp.hpp"
using namespace cv;
using namespace cv::gpu;
using namespace std;
#if !defined(HAVE_CUDA) #if !defined(HAVE_CUDA)
void cv::gpu::transformPoints(const GpuMat&, const Mat&, const Mat&, GpuMat&, Stream&) { throw_nogpu(); } void cv::gpu::transformPoints(const GpuMat&, const Mat&, const Mat&, GpuMat&, Stream&) { throw_nogpu(); }
@ -52,13 +56,31 @@ void cv::gpu::solvePnPRansac(const Mat&, const Mat&, const Mat&, const Mat&, Mat
#else #else
using namespace cv; BEGIN_OPENCV_DEVICE_NAMESPACE
using namespace cv::gpu;
namespace cv { namespace gpu { namespace transform_points namespace transform_points
{ {
void call(const DevMem2D_<float3> src, const float* rot, const float* transl, DevMem2D_<float3> dst, cudaStream_t stream); void call(const DevMem2D_<float3> src, const float* rot, const float* transl, DevMem2D_<float3> dst, cudaStream_t stream);
}}} }
namespace project_points
{
void call(const DevMem2D_<float3> src, const float* rot, const float* transl, const float* proj, DevMem2D_<float2> dst, cudaStream_t stream);
}
namespace solve_pnp_ransac
{
int maxNumIters();
void computeHypothesisScores(
const int num_hypotheses, const int num_points, const float* rot_matrices,
const float3* transl_vectors, const float3* object, const float2* image,
const float dist_threshold, int* hypothesis_scores);
}
END_OPENCV_DEVICE_NAMESPACE
using namespace OPENCV_DEVICE_NAMESPACE;
namespace namespace
{ {
@ -79,15 +101,9 @@ namespace
void cv::gpu::transformPoints(const GpuMat& src, const Mat& rvec, const Mat& tvec, GpuMat& dst, Stream& stream) void cv::gpu::transformPoints(const GpuMat& src, const Mat& rvec, const Mat& tvec, GpuMat& dst, Stream& stream)
{ {
::transformPointsCaller(src, rvec, tvec, dst, StreamAccessor::getStream(stream)); transformPointsCaller(src, rvec, tvec, dst, StreamAccessor::getStream(stream));
} }
namespace cv { namespace gpu { namespace project_points
{
void call(const DevMem2D_<float3> src, const float* rot, const float* transl, const float* proj, DevMem2D_<float2> dst, cudaStream_t stream);
}}}
namespace namespace
{ {
void projectPointsCaller(const GpuMat& src, const Mat& rvec, const Mat& tvec, const Mat& camera_mat, const Mat& dist_coef, GpuMat& dst, cudaStream_t stream) void projectPointsCaller(const GpuMat& src, const Mat& rvec, const Mat& tvec, const Mat& camera_mat, const Mat& dist_coef, GpuMat& dst, cudaStream_t stream)
@ -109,20 +125,9 @@ namespace
void cv::gpu::projectPoints(const GpuMat& src, const Mat& rvec, const Mat& tvec, const Mat& camera_mat, const Mat& dist_coef, GpuMat& dst, Stream& stream) void cv::gpu::projectPoints(const GpuMat& src, const Mat& rvec, const Mat& tvec, const Mat& camera_mat, const Mat& dist_coef, GpuMat& dst, Stream& stream)
{ {
::projectPointsCaller(src, rvec, tvec, camera_mat, dist_coef, dst, StreamAccessor::getStream(stream)); projectPointsCaller(src, rvec, tvec, camera_mat, dist_coef, dst, StreamAccessor::getStream(stream));
} }
namespace cv { namespace gpu { namespace solve_pnp_ransac
{
int maxNumIters();
void computeHypothesisScores(
const int num_hypotheses, const int num_points, const float* rot_matrices,
const float3* transl_vectors, const float3* object, const float2* image,
const float dist_threshold, int* hypothesis_scores);
}}}
namespace namespace
{ {
// Selects subset_size random different points from [0, num_points - 1] range // Selects subset_size random different points from [0, num_points - 1] range


@ -46,7 +46,6 @@ using namespace cv;
using namespace cv::gpu; using namespace cv::gpu;
using namespace std; using namespace std;
#if !defined (HAVE_CUDA) #if !defined (HAVE_CUDA)
cv::gpu::CascadeClassifier_GPU::CascadeClassifier_GPU() { throw_nogpu(); } cv::gpu::CascadeClassifier_GPU::CascadeClassifier_GPU() { throw_nogpu(); }


@ -51,155 +51,158 @@ void cv::gpu::cvtColor(const GpuMat&, GpuMat&, int, int, Stream&) { throw_nogpu(
#else /* !defined (HAVE_CUDA) */ #else /* !defined (HAVE_CUDA) */
namespace cv { namespace gpu { namespace device BEGIN_OPENCV_DEVICE_NAMESPACE
{
#define OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name) \ #define OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name) \
void name(const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream); void name(const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream);
#define OPENCV_GPU_DECLARE_CVTCOLOR_ALL(name) \ #define OPENCV_GPU_DECLARE_CVTCOLOR_ALL(name) \
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _8u) \ OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _8u) \
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _16u) \ OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _16u) \
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _32f) OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _32f)
#define OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(name) \ #define OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(name) \
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _8u) \ OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _8u) \
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _32f) \ OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _32f) \
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _full_8u) \ OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _full_8u) \
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _full_32f) OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _full_32f)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_rgb) OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_bgra) OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_rgba) OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_bgr) OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_rgb) OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_rgba) OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr_to_bgr555) OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr_to_bgr555)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr_to_bgr565) OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr_to_bgr565)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgb_to_bgr555) OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgb_to_bgr555)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgb_to_bgr565) OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgb_to_bgr565)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgra_to_bgr555) OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgra_to_bgr555)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgra_to_bgr565) OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgra_to_bgr565)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgba_to_bgr555) OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgba_to_bgr555)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgba_to_bgr565) OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgba_to_bgr565)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_rgb) OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_rgb) OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_bgr) OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_bgr) OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_rgba) OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_rgba) OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_bgra) OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_bgra) OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(gray_to_bgr) OPENCV_GPU_DECLARE_CVTCOLOR_ALL(gray_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(gray_to_bgra) OPENCV_GPU_DECLARE_CVTCOLOR_ALL(gray_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(gray_to_bgr555) OPENCV_GPU_DECLARE_CVTCOLOR_ONE(gray_to_bgr555)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(gray_to_bgr565) OPENCV_GPU_DECLARE_CVTCOLOR_ONE(gray_to_bgr565)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_gray) OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_gray)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_gray) OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_gray)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_gray) OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_gray)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_gray) OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_gray)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_gray) OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_gray)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_gray) OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_gray)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_yuv) OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_yuv)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_yuv) OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_yuv)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_yuv4) OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_yuv4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_yuv4) OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_yuv4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_yuv) OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_yuv)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_yuv) OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_yuv)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_yuv4) OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_yuv4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_yuv4) OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_yuv4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_rgb) OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_rgba) OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_rgb) OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_rgba) OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_bgr) OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_bgra) OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_bgr) OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_bgra) OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_YCrCb) OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_YCrCb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_YCrCb) OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_YCrCb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_YCrCb4) OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_YCrCb4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_YCrCb4) OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_YCrCb4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_YCrCb) OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_YCrCb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_YCrCb) OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_YCrCb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_YCrCb4) OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_YCrCb4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_YCrCb4) OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_YCrCb4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_rgb) OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_rgba) OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_rgb) OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_rgba) OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_bgr) OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_bgra) OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_bgr) OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_bgra) OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_xyz) OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_xyz)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_xyz) OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_xyz)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_xyz4) OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_xyz4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_xyz4) OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_xyz4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_xyz) OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_xyz)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_xyz) OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_xyz)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_xyz4) OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_xyz4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_xyz4) OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_xyz4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_rgb) OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_rgb) OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_rgba) OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_rgba) OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_bgr) OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_bgr) OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_bgra) OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_bgra) OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_hsv) OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_hsv)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_hsv) OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_hsv)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_hsv4) OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_hsv4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_hsv4) OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_hsv4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_hsv) OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_hsv)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_hsv) OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_hsv)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_hsv4) OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_hsv4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_hsv4) OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_hsv4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv_to_rgb) OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv_to_rgba) OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv4_to_rgb) OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv4_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv4_to_rgba) OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv4_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv_to_bgr) OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv_to_bgra) OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv4_to_bgr) OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv4_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv4_to_bgra) OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv4_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_hls) OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_hls)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_hls) OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_hls)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_hls4) OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_hls4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_hls4) OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_hls4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_hls) OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_hls)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_hls) OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_hls)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_hls4) OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_hls4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_hls4) OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_hls4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls_to_rgb) OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls_to_rgba) OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls4_to_rgb) OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls4_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls4_to_rgba) OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls4_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls_to_bgr) OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls_to_bgra) OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls4_to_bgr) OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls4_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls4_to_bgra) OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls4_to_bgra)
#undef OPENCV_GPU_DECLARE_CVTCOLOR_ONE #undef OPENCV_GPU_DECLARE_CVTCOLOR_ONE
#undef OPENCV_GPU_DECLARE_CVTCOLOR_ALL #undef OPENCV_GPU_DECLARE_CVTCOLOR_ALL
#undef OPENCV_GPU_DECLARE_CVTCOLOR_8U32F #undef OPENCV_GPU_DECLARE_CVTCOLOR_8U32F
}}}
END_OPENCV_DEVICE_NAMESPACE
using namespace OPENCV_DEVICE_NAMESPACE;
namespace namespace
{ {


@ -45,19 +45,18 @@
#include "opencv2/gpu/device/vec_distance.hpp" #include "opencv2/gpu/device/vec_distance.hpp"
#include "opencv2/gpu/device/datamov_utils.hpp" #include "opencv2/gpu/device/datamov_utils.hpp"
using namespace cv::gpu; BEGIN_OPENCV_DEVICE_NAMESPACE
using namespace cv::gpu::device;
namespace cv { namespace gpu { namespace bf_knnmatch namespace bf_knnmatch {
{
///////////////////////////////////////////////////////////////////////////////
// Reduction
template <int BLOCK_SIZE> ///////////////////////////////////////////////////////////////////////////////
__device__ void findBestMatch(float& bestDistance1, float& bestDistance2, // Reduction
template <int BLOCK_SIZE>
__device__ void findBestMatch(float& bestDistance1, float& bestDistance2,
int& bestTrainIdx1, int& bestTrainIdx2, int& bestTrainIdx1, int& bestTrainIdx2,
float* s_distance, int* s_trainIdx) float* s_distance, int* s_trainIdx)
{ {
float myBestDistance1 = numeric_limits<float>::max(); float myBestDistance1 = numeric_limits<float>::max();
float myBestDistance2 = numeric_limits<float>::max(); float myBestDistance2 = numeric_limits<float>::max();
int myBestTrainIdx1 = -1; int myBestTrainIdx1 = -1;
@ -121,14 +120,14 @@ namespace cv { namespace gpu { namespace bf_knnmatch
bestTrainIdx1 = myBestTrainIdx1; bestTrainIdx1 = myBestTrainIdx1;
bestTrainIdx2 = myBestTrainIdx2; bestTrainIdx2 = myBestTrainIdx2;
} }
template <int BLOCK_SIZE> template <int BLOCK_SIZE>
__device__ void findBestMatch(float& bestDistance1, float& bestDistance2, __device__ void findBestMatch(float& bestDistance1, float& bestDistance2,
int& bestTrainIdx1, int& bestTrainIdx2, int& bestTrainIdx1, int& bestTrainIdx2,
int& bestImgIdx1, int& bestImgIdx2, int& bestImgIdx1, int& bestImgIdx2,
float* s_distance, int* s_trainIdx, int* s_imgIdx) float* s_distance, int* s_trainIdx, int* s_imgIdx)
{ {
float myBestDistance1 = numeric_limits<float>::max(); float myBestDistance1 = numeric_limits<float>::max();
float myBestDistance2 = numeric_limits<float>::max(); float myBestDistance2 = numeric_limits<float>::max();
int myBestTrainIdx1 = -1; int myBestTrainIdx1 = -1;
@ -204,29 +203,29 @@ namespace cv { namespace gpu { namespace bf_knnmatch
bestImgIdx1 = myBestImgIdx1; bestImgIdx1 = myBestImgIdx1;
bestImgIdx2 = myBestImgIdx2; bestImgIdx2 = myBestImgIdx2;
} }
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
// Match Unrolled Cached // Match Unrolled Cached
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename T, typename U> template <int BLOCK_SIZE, int MAX_DESC_LEN, typename T, typename U>
__device__ void loadQueryToSmem(int queryIdx, const DevMem2D_<T>& query, U* s_query) __device__ void loadQueryToSmem(int queryIdx, const DevMem2D_<T>& query, U* s_query)
{ {
#pragma unroll #pragma unroll
for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i) for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
{ {
const int loadX = threadIdx.x + i * BLOCK_SIZE; const int loadX = threadIdx.x + i * BLOCK_SIZE;
s_query[threadIdx.y * MAX_DESC_LEN + loadX] = loadX < query.cols ? query.ptr(min(queryIdx, query.rows - 1))[loadX] : 0; s_query[threadIdx.y * MAX_DESC_LEN + loadX] = loadX < query.cols ? query.ptr(::min(queryIdx, query.rows - 1))[loadX] : 0;
}
} }
}
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
__device__ void loopUnrolledCached(int queryIdx, const DevMem2D_<T>& query, int imgIdx, const DevMem2D_<T>& train, const Mask& mask, __device__ void loopUnrolledCached(int queryIdx, const DevMem2D_<T>& query, int imgIdx, const DevMem2D_<T>& train, const Mask& mask,
typename Dist::value_type* s_query, typename Dist::value_type* s_train, typename Dist::value_type* s_query, typename Dist::value_type* s_train,
float& bestDistance1, float& bestDistance2, float& bestDistance1, float& bestDistance2,
int& bestTrainIdx1, int& bestTrainIdx2, int& bestTrainIdx1, int& bestTrainIdx2,
int& bestImgIdx1, int& bestImgIdx2) int& bestImgIdx1, int& bestImgIdx2)
{ {
for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t) for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)
{ {
Dist dist; Dist dist;
@ -242,7 +241,7 @@ namespace cv { namespace gpu { namespace bf_knnmatch
{ {
T val; T val;
ForceGlob<T>::Load(train.ptr(min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val); ForceGlob<T>::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val; s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
} }
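Alongside the namespace rewrap, the kernel sources change bare min calls into ::min, presumably to keep unqualified lookup from stopping at a same-named declaration in an enclosing namespace (cv itself declares min overloads for Mat) now that the device code shares those scopes. A tiny, self-contained host-side illustration of that lookup behaviour, with hypothetical names and not the actual kernel code:

// Illustration only: why the wrapped device code spells the call as ::min.
inline int min(int a, int b) { return a < b ? a : b; }   // stands in for the global CUDA min

namespace cv { namespace gpu { namespace device
{
    // A same-named declaration in an enclosing scope (cv::min in the real library)
    // is found first by unqualified lookup and hides the global function.
    template <typename T> T min(T a, T b, T c);

    inline int clamp_row(int row, int rows)
    {
        // return min(row, rows - 1);   // unqualified: lookup stops at device::min above and fails
        return ::min(row, rows - 1);    // qualified: the global two-argument function is meant
    }
}}}

int main() { return cv::gpu::device::clamp_row(5, 4) == 3 ? 0 : 1; }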
@ -279,11 +278,11 @@ namespace cv { namespace gpu { namespace bf_knnmatch
} }
} }
} }
} }
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
__global__ void matchUnrolledCached(const DevMem2D_<T> query, const DevMem2D_<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance) __global__ void matchUnrolledCached(const DevMem2D_<T> query, const DevMem2D_<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance)
{ {
extern __shared__ int smem[]; extern __shared__ int smem[];
const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;
@ -312,13 +311,13 @@ namespace cv { namespace gpu { namespace bf_knnmatch
bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2); bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2); bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
} }
} }
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
void matchUnrolledCached(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, void matchUnrolledCached(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask,
const DevMem2D_<int2>& trainIdx, const DevMem2D_<float2>& distance, const DevMem2D_<int2>& trainIdx, const DevMem2D_<float2>& distance,
cudaStream_t stream) cudaStream_t stream)
{ {
const dim3 block(BLOCK_SIZE, BLOCK_SIZE); const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
const dim3 grid(divUp(query.rows, BLOCK_SIZE)); const dim3 grid(divUp(query.rows, BLOCK_SIZE));
@ -329,11 +328,11 @@ namespace cv { namespace gpu { namespace bf_knnmatch
if (stream == 0) if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaDeviceSynchronize() );
} }
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
__global__ void matchUnrolledCached(const DevMem2D_<T> query, const DevMem2D_<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance) __global__ void matchUnrolledCached(const DevMem2D_<T> query, const DevMem2D_<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance)
{ {
extern __shared__ int smem[]; extern __shared__ int smem[];
const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;
@ -373,13 +372,13 @@ namespace cv { namespace gpu { namespace bf_knnmatch
bestImgIdx[queryIdx] = make_int2(myBestImgIdx1, myBestImgIdx2); bestImgIdx[queryIdx] = make_int2(myBestImgIdx1, myBestImgIdx2);
bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2); bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
} }
} }
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
void matchUnrolledCached(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, const Mask& mask, void matchUnrolledCached(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, const Mask& mask,
const DevMem2D_<int2>& trainIdx, const DevMem2D_<int2>& imgIdx, const DevMem2D_<float2>& distance, const DevMem2D_<int2>& trainIdx, const DevMem2D_<int2>& imgIdx, const DevMem2D_<float2>& distance,
cudaStream_t stream) cudaStream_t stream)
{ {
const dim3 block(BLOCK_SIZE, BLOCK_SIZE); const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
const dim3 grid(divUp(query.rows, BLOCK_SIZE)); const dim3 grid(divUp(query.rows, BLOCK_SIZE));
@ -390,18 +389,18 @@ namespace cv { namespace gpu { namespace bf_knnmatch
if (stream == 0) if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaDeviceSynchronize() );
} }
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
// Match Unrolled // Match Unrolled
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
__device__ void loopUnrolled(int queryIdx, const DevMem2D_<T>& query, int imgIdx, const DevMem2D_<T>& train, const Mask& mask, __device__ void loopUnrolled(int queryIdx, const DevMem2D_<T>& query, int imgIdx, const DevMem2D_<T>& train, const Mask& mask,
typename Dist::value_type* s_query, typename Dist::value_type* s_train, typename Dist::value_type* s_query, typename Dist::value_type* s_train,
float& bestDistance1, float& bestDistance2, float& bestDistance1, float& bestDistance2,
int& bestTrainIdx1, int& bestTrainIdx2, int& bestTrainIdx1, int& bestTrainIdx2,
int& bestImgIdx1, int& bestImgIdx2) int& bestImgIdx1, int& bestImgIdx2)
{ {
for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t) for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)
{ {
Dist dist; Dist dist;
@ -418,10 +417,10 @@ namespace cv { namespace gpu { namespace bf_knnmatch
{ {
T val; T val;
ForceGlob<T>::Load(query.ptr(min(queryIdx, query.rows - 1)), loadX, val); ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val);
s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val; s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;
ForceGlob<T>::Load(train.ptr(min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val); ForceGlob<T>::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val; s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
} }
@ -458,11 +457,11 @@ namespace cv { namespace gpu { namespace bf_knnmatch
} }
} }
} }
} }
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
__global__ void matchUnrolled(const DevMem2D_<T> query, const DevMem2D_<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance) __global__ void matchUnrolled(const DevMem2D_<T> query, const DevMem2D_<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance)
{ {
extern __shared__ int smem[]; extern __shared__ int smem[];
const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;
@ -489,13 +488,13 @@ namespace cv { namespace gpu { namespace bf_knnmatch
bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2); bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2); bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
} }
} }
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
void matchUnrolled(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, void matchUnrolled(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask,
const DevMem2D_<int2>& trainIdx, const DevMem2D_<float2>& distance, const DevMem2D_<int2>& trainIdx, const DevMem2D_<float2>& distance,
cudaStream_t stream) cudaStream_t stream)
{ {
const dim3 block(BLOCK_SIZE, BLOCK_SIZE); const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
const dim3 grid(divUp(query.rows, BLOCK_SIZE)); const dim3 grid(divUp(query.rows, BLOCK_SIZE));
@ -506,11 +505,11 @@ namespace cv { namespace gpu { namespace bf_knnmatch
if (stream == 0) if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaDeviceSynchronize() );
} }
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
__global__ void matchUnrolled(const DevMem2D_<T> query, const DevMem2D_<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance) __global__ void matchUnrolled(const DevMem2D_<T> query, const DevMem2D_<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance)
{ {
extern __shared__ int smem[]; extern __shared__ int smem[];
const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;
@ -548,13 +547,13 @@ namespace cv { namespace gpu { namespace bf_knnmatch
bestImgIdx[queryIdx] = make_int2(myBestImgIdx1, myBestImgIdx2); bestImgIdx[queryIdx] = make_int2(myBestImgIdx1, myBestImgIdx2);
bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2); bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
} }
} }
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
void matchUnrolled(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, const Mask& mask, void matchUnrolled(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, const Mask& mask,
const DevMem2D_<int2>& trainIdx, const DevMem2D_<int2>& imgIdx, const DevMem2D_<float2>& distance, const DevMem2D_<int2>& trainIdx, const DevMem2D_<int2>& imgIdx, const DevMem2D_<float2>& distance,
cudaStream_t stream) cudaStream_t stream)
{ {
const dim3 block(BLOCK_SIZE, BLOCK_SIZE); const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
const dim3 grid(divUp(query.rows, BLOCK_SIZE)); const dim3 grid(divUp(query.rows, BLOCK_SIZE));
@ -565,18 +564,18 @@ namespace cv { namespace gpu { namespace bf_knnmatch
if (stream == 0) if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaDeviceSynchronize() );
} }
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
// Match // Match
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask> template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
__device__ void loop(int queryIdx, const DevMem2D_<T>& query, int imgIdx, const DevMem2D_<T>& train, const Mask& mask, __device__ void loop(int queryIdx, const DevMem2D_<T>& query, int imgIdx, const DevMem2D_<T>& train, const Mask& mask,
typename Dist::value_type* s_query, typename Dist::value_type* s_train, typename Dist::value_type* s_query, typename Dist::value_type* s_train,
float& bestDistance1, float& bestDistance2, float& bestDistance1, float& bestDistance2,
int& bestTrainIdx1, int& bestTrainIdx2, int& bestTrainIdx1, int& bestTrainIdx2,
int& bestImgIdx1, int& bestImgIdx2) int& bestImgIdx1, int& bestImgIdx2)
{ {
for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t) for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)
{ {
Dist dist; Dist dist;
@ -592,10 +591,10 @@ namespace cv { namespace gpu { namespace bf_knnmatch
{ {
T val; T val;
ForceGlob<T>::Load(query.ptr(min(queryIdx, query.rows - 1)), loadX, val); ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val);
s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val; s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;
ForceGlob<T>::Load(train.ptr(min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val); ForceGlob<T>::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val; s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
} }
@ -632,11 +631,11 @@ namespace cv { namespace gpu { namespace bf_knnmatch
} }
} }
} }
} }
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask> template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
__global__ void match(const DevMem2D_<T> query, const DevMem2D_<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance) __global__ void match(const DevMem2D_<T> query, const DevMem2D_<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance)
{ {
extern __shared__ int smem[]; extern __shared__ int smem[];
const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;
@ -663,13 +662,13 @@ namespace cv { namespace gpu { namespace bf_knnmatch
bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2); bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2); bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
} }
} }
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask> template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
void match(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, void match(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask,
const DevMem2D_<int2>& trainIdx, const DevMem2D_<float2>& distance, const DevMem2D_<int2>& trainIdx, const DevMem2D_<float2>& distance,
cudaStream_t stream) cudaStream_t stream)
{ {
const dim3 block(BLOCK_SIZE, BLOCK_SIZE); const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
const dim3 grid(divUp(query.rows, BLOCK_SIZE)); const dim3 grid(divUp(query.rows, BLOCK_SIZE));
@ -680,11 +679,11 @@ namespace cv { namespace gpu { namespace bf_knnmatch
if (stream == 0) if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaDeviceSynchronize() );
} }
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask> template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
__global__ void match(const DevMem2D_<T> query, const DevMem2D_<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance) __global__ void match(const DevMem2D_<T> query, const DevMem2D_<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance)
{ {
extern __shared__ int smem[]; extern __shared__ int smem[];
const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;
@ -722,13 +721,13 @@ namespace cv { namespace gpu { namespace bf_knnmatch
bestImgIdx[queryIdx] = make_int2(myBestImgIdx1, myBestImgIdx2); bestImgIdx[queryIdx] = make_int2(myBestImgIdx1, myBestImgIdx2);
bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2); bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
} }
} }
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask> template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
void match(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, const Mask& mask, void match(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, const Mask& mask,
const DevMem2D_<int2>& trainIdx, const DevMem2D_<int2>& imgIdx, const DevMem2D_<float2>& distance, const DevMem2D_<int2>& trainIdx, const DevMem2D_<int2>& imgIdx, const DevMem2D_<float2>& distance,
cudaStream_t stream) cudaStream_t stream)
{ {
const dim3 block(BLOCK_SIZE, BLOCK_SIZE); const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
const dim3 grid(divUp(query.rows, BLOCK_SIZE)); const dim3 grid(divUp(query.rows, BLOCK_SIZE));
@ -739,16 +738,16 @@ namespace cv { namespace gpu { namespace bf_knnmatch
if (stream == 0) if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaDeviceSynchronize() );
} }
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
// knnMatch 2 dispatcher // knnMatch 2 dispatcher
template <typename Dist, typename T, typename Mask> template <typename Dist, typename T, typename Mask>
void match2Dispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, void match2Dispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask,
const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Db& trainIdx, const DevMem2Db& distance,
int cc, cudaStream_t stream) int cc, cudaStream_t stream)
{ {
if (query.cols <= 64) if (query.cols <= 64)
{ {
matchUnrolledCached<16, 64, Dist>(query, train, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> > (distance), stream); matchUnrolledCached<16, 64, Dist>(query, train, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> > (distance), stream);
@ -773,13 +772,13 @@ namespace cv { namespace gpu { namespace bf_knnmatch
{ {
match<16, Dist>(query, train, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> > (distance), stream); match<16, Dist>(query, train, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> > (distance), stream);
} }
} }
template <typename Dist, typename T, typename Mask> template <typename Dist, typename T, typename Mask>
void match2Dispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, const Mask& mask, void match2Dispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, const Mask& mask,
const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance,
int cc, cudaStream_t stream) int cc, cudaStream_t stream)
{ {
if (query.cols <= 64) if (query.cols <= 64)
{ {
matchUnrolledCached<16, 64, Dist>(query, trains, n, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<int2> >(imgIdx), static_cast< DevMem2D_<float2> > (distance), stream); matchUnrolledCached<16, 64, Dist>(query, trains, n, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<int2> >(imgIdx), static_cast< DevMem2D_<float2> > (distance), stream);
@ -804,14 +803,14 @@ namespace cv { namespace gpu { namespace bf_knnmatch
{ {
match<16, Dist>(query, trains, n, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<int2> >(imgIdx), static_cast< DevMem2D_<float2> > (distance), stream); match<16, Dist>(query, trains, n, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<int2> >(imgIdx), static_cast< DevMem2D_<float2> > (distance), stream);
} }
} }
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
// Calc distance kernel // Calc distance kernel
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
__global__ void calcDistanceUnrolled(const DevMem2D_<T> query, const DevMem2D_<T> train, const Mask mask, PtrStepf allDist) __global__ void calcDistanceUnrolled(const DevMem2D_<T> query, const DevMem2D_<T> train, const Mask mask, PtrStepf allDist)
{ {
extern __shared__ int smem[]; extern __shared__ int smem[];
const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y; const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;
@ -829,8 +828,8 @@ namespace cv { namespace gpu { namespace bf_knnmatch
if (loadX < query.cols) if (loadX < query.cols)
{ {
s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = query.ptr(min(queryIdx, query.rows - 1))[loadX]; s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = query.ptr(::min(queryIdx, query.rows - 1))[loadX];
s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = train.ptr(min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1))[loadX]; s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = train.ptr(::min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1))[loadX];
} }
else else
{ {
@ -856,11 +855,11 @@ namespace cv { namespace gpu { namespace bf_knnmatch
allDist.ptr(queryIdx)[trainIdx] = distVal; allDist.ptr(queryIdx)[trainIdx] = distVal;
} }
} }
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
void calcDistanceUnrolled(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, const DevMem2Df& allDist, cudaStream_t stream) void calcDistanceUnrolled(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, const DevMem2Df& allDist, cudaStream_t stream)
{ {
const dim3 block(BLOCK_SIZE, BLOCK_SIZE); const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE)); const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));
@ -871,11 +870,11 @@ namespace cv { namespace gpu { namespace bf_knnmatch
if (stream == 0) if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaDeviceSynchronize() );
} }
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask> template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
__global__ void calcDistance(const DevMem2D_<T> query, const DevMem2D_<T> train, const Mask mask, PtrStepf allDist) __global__ void calcDistance(const DevMem2D_<T> query, const DevMem2D_<T> train, const Mask mask, PtrStepf allDist)
{ {
extern __shared__ int smem[]; extern __shared__ int smem[];
const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y; const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;
@ -892,8 +891,8 @@ namespace cv { namespace gpu { namespace bf_knnmatch
if (loadX < query.cols) if (loadX < query.cols)
{ {
s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = query.ptr(min(queryIdx, query.rows - 1))[loadX]; s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = query.ptr(::min(queryIdx, query.rows - 1))[loadX];
s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = train.ptr(min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1))[loadX]; s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = train.ptr(::min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1))[loadX];
} }
else else
{ {
@ -919,11 +918,11 @@ namespace cv { namespace gpu { namespace bf_knnmatch
allDist.ptr(queryIdx)[trainIdx] = distVal; allDist.ptr(queryIdx)[trainIdx] = distVal;
} }
} }
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask> template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
void calcDistance(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, const DevMem2Df& allDist, cudaStream_t stream) void calcDistance(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, const DevMem2Df& allDist, cudaStream_t stream)
{ {
const dim3 block(BLOCK_SIZE, BLOCK_SIZE); const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE)); const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));
@ -934,16 +933,16 @@ namespace cv { namespace gpu { namespace bf_knnmatch
if (stream == 0) if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaDeviceSynchronize() );
} }
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
// Calc Distance dispatcher // Calc Distance dispatcher
template <typename Dist, typename T, typename Mask> template <typename Dist, typename T, typename Mask>
void calcDistanceDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, void calcDistanceDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask,
const DevMem2Df& allDist, const DevMem2Df& allDist,
int cc, cudaStream_t stream) int cc, cudaStream_t stream)
{ {
if (query.cols <= 64) if (query.cols <= 64)
{ {
calcDistanceUnrolled<16, 64, Dist>(query, train, mask, allDist, stream); calcDistanceUnrolled<16, 64, Dist>(query, train, mask, allDist, stream);
@ -968,14 +967,14 @@ namespace cv { namespace gpu { namespace bf_knnmatch
{ {
calcDistance<16, Dist>(query, train, mask, allDist, stream); calcDistance<16, Dist>(query, train, mask, allDist, stream);
} }
} }
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
// find knn match kernel // find knn match kernel
template <int BLOCK_SIZE> template <int BLOCK_SIZE>
__global__ void findBestMatch(DevMem2Df allDist, int i, PtrStepi trainIdx, PtrStepf distance) __global__ void findBestMatch(DevMem2Df allDist, int i, PtrStepi trainIdx, PtrStepf distance)
{ {
const int SMEM_SIZE = BLOCK_SIZE > 64 ? BLOCK_SIZE : 64; const int SMEM_SIZE = BLOCK_SIZE > 64 ? BLOCK_SIZE : 64;
__shared__ float s_dist[SMEM_SIZE]; __shared__ float s_dist[SMEM_SIZE];
__shared__ int s_trainIdx[SMEM_SIZE]; __shared__ int s_trainIdx[SMEM_SIZE];
@ -1012,11 +1011,11 @@ namespace cv { namespace gpu { namespace bf_knnmatch
distance.ptr(queryIdx)[i] = dist; distance.ptr(queryIdx)[i] = dist;
} }
} }
} }
template <int BLOCK_SIZE> template <int BLOCK_SIZE>
void findKnnMatch(int k, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream) void findKnnMatch(int k, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream)
{ {
const dim3 block(BLOCK_SIZE, 1, 1); const dim3 block(BLOCK_SIZE, 1, 1);
const dim3 grid(trainIdx.rows, 1, 1); const dim3 grid(trainIdx.rows, 1, 1);
@ -1028,21 +1027,21 @@ namespace cv { namespace gpu { namespace bf_knnmatch
if (stream == 0) if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaDeviceSynchronize() );
} }
void findKnnMatchDispatcher(int k, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream) void findKnnMatchDispatcher(int k, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream)
{ {
findKnnMatch<256>(k, static_cast<DevMem2Di>(trainIdx), static_cast<DevMem2Df>(distance), allDist, stream); findKnnMatch<256>(k, static_cast<DevMem2Di>(trainIdx), static_cast<DevMem2Df>(distance), allDist, stream);
} }
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
// knn match Dispatcher // knn match Dispatcher
template <typename Dist, typename T, typename Mask> template <typename Dist, typename T, typename Mask>
void matchDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, int k, const Mask& mask, void matchDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, int k, const Mask& mask,
const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist,
int cc, cudaStream_t stream) int cc, cudaStream_t stream)
{ {
if (k == 2) if (k == 2)
{ {
match2Dispatcher<Dist>(query, train, mask, trainIdx, distance, cc, stream); match2Dispatcher<Dist>(query, train, mask, trainIdx, distance, cc, stream);
@ -1052,108 +1051,111 @@ namespace cv { namespace gpu { namespace bf_knnmatch
calcDistanceDispatcher<Dist>(query, train, mask, allDist, cc, stream); calcDistanceDispatcher<Dist>(query, train, mask, allDist, cc, stream);
findKnnMatchDispatcher(k, trainIdx, distance, allDist, cc, stream); findKnnMatchDispatcher(k, trainIdx, distance, allDist, cc, stream);
} }
} }
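The dispatcher above splits on k. A hedged, comment-only sketch of the two paths follows; the loop inside findKnnMatch is hidden by the diff context, so its shape is an assumption here.

// k == 2 : one fused kernel pass keeps the two best candidates per query row
//          in registers and writes them out as int2 / float2 (match2Dispatcher).
// k != 2 : calcDistanceDispatcher first fills allDist with the full
//          query x train distance matrix, then findKnnMatchDispatcher is
//          assumed to peel off one neighbour per pass, roughly:
//
//              for (int i = 0; i < k; ++i)   // assumed host-side loop
//                  findBestMatch<256><<<grid, block, 0, stream>>>(allDist, i, trainIdx, distance);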
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
// knn match caller // knn match caller
template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask, template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask,
const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist,
int cc, cudaStream_t stream) int cc, cudaStream_t stream)
{ {
if (mask.data) if (mask.data)
matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream); matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream);
else else
matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream); matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream);
} }
template void matchL1_gpu<uchar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); template void matchL1_gpu<uchar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
//template void matchL1_gpu<schar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); //template void matchL1_gpu<schar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
template void matchL1_gpu<ushort>(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); template void matchL1_gpu<ushort>(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
template void matchL1_gpu<short >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); template void matchL1_gpu<short >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
template void matchL1_gpu<int >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); template void matchL1_gpu<int >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
template void matchL1_gpu<float >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); template void matchL1_gpu<float >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask, template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask,
const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist,
int cc, cudaStream_t stream) int cc, cudaStream_t stream)
{ {
if (mask.data) if (mask.data)
matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream); matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream);
else else
matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream); matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream);
} }
//template void matchL2_gpu<uchar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); //template void matchL2_gpu<uchar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
//template void matchL2_gpu<schar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); //template void matchL2_gpu<schar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
//template void matchL2_gpu<ushort>(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); //template void matchL2_gpu<ushort>(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
//template void matchL2_gpu<short >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); //template void matchL2_gpu<short >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
//template void matchL2_gpu<int >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); //template void matchL2_gpu<int >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
template void matchL2_gpu<float >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); template void matchL2_gpu<float >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask, template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask,
const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist,
int cc, cudaStream_t stream) int cc, cudaStream_t stream)
{ {
if (mask.data) if (mask.data)
matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream); matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream);
else else
matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream); matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream);
} }
template void matchHamming_gpu<uchar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); template void matchHamming_gpu<uchar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
//template void matchHamming_gpu<schar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); //template void matchHamming_gpu<schar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
template void matchHamming_gpu<ushort>(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); template void matchHamming_gpu<ushort>(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
//template void matchHamming_gpu<short >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); //template void matchHamming_gpu<short >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
template void matchHamming_gpu<int >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); template void matchHamming_gpu<int >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
template <typename T> void match2L1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, template <typename T> void match2L1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance,
int cc, cudaStream_t stream) int cc, cudaStream_t stream)
{ {
if (masks.data) if (masks.data)
match2Dispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream); match2Dispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream);
else else
match2Dispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream); match2Dispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream);
} }
template void match2L1_gpu<uchar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); template void match2L1_gpu<uchar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);
//template void match2L1_gpu<schar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); //template void match2L1_gpu<schar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);
template void match2L1_gpu<ushort>(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); template void match2L1_gpu<ushort>(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);
template void match2L1_gpu<short >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); template void match2L1_gpu<short >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);
template void match2L1_gpu<int >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); template void match2L1_gpu<int >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);
template void match2L1_gpu<float >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); template void match2L1_gpu<float >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);
template <typename T> void match2L2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, template <typename T> void match2L2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance,
int cc, cudaStream_t stream) int cc, cudaStream_t stream)
{ {
if (masks.data) if (masks.data)
match2Dispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream); match2Dispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream);
else else
match2Dispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream); match2Dispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream);
} }
//template void match2L2_gpu<uchar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); //template void match2L2_gpu<uchar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);
//template void match2L2_gpu<schar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); //template void match2L2_gpu<schar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);
//template void match2L2_gpu<ushort>(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); //template void match2L2_gpu<ushort>(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);
//template void match2L2_gpu<short >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); //template void match2L2_gpu<short >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);
//template void match2L2_gpu<int >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Di& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); //template void match2L2_gpu<int >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Di& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);
template void match2L2_gpu<float >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); template void match2L2_gpu<float >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);
template <typename T> void match2Hamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, template <typename T> void match2Hamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance,
int cc, cudaStream_t stream) int cc, cudaStream_t stream)
{ {
if (masks.data) if (masks.data)
match2Dispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream); match2Dispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream);
else else
match2Dispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream); match2Dispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream);
} }
template void match2Hamming_gpu<uchar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); template void match2Hamming_gpu<uchar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);
//template void match2Hamming_gpu<schar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); //template void match2Hamming_gpu<schar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);
template void match2Hamming_gpu<ushort>(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); template void match2Hamming_gpu<ushort>(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);
//template void match2Hamming_gpu<short >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); //template void match2Hamming_gpu<short >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);
template void match2Hamming_gpu<int >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); template void match2Hamming_gpu<int >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);
}}}
} // namespace bf_knnmatch
END_OPENCV_DEVICE_NAMESPACE
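The structural change running through this file is the move from the explicit cv::gpu::bf_knnmatch nesting to the BEGIN/END_OPENCV_DEVICE_NAMESPACE macros. A minimal sketch of the assumed expansion follows; the real definitions live in the gpu module's internal headers and may differ in detail.

// Assumed expansion of the namespace macros (illustration only):
#define BEGIN_OPENCV_DEVICE_NAMESPACE namespace cv { namespace gpu { namespace device {
#define END_OPENCV_DEVICE_NAMESPACE   }}}

// If that expansion is right, the kernels and callers in this file move from
//     cv::gpu::bf_knnmatch           (old explicit nesting, closed by "}}}")
// to
//     cv::gpu::device::bf_knnmatch   (the macros plus the inner namespace bf_knnmatch)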

View File

@ -45,17 +45,16 @@
#include "opencv2/gpu/device/vec_distance.hpp" #include "opencv2/gpu/device/vec_distance.hpp"
#include "opencv2/gpu/device/datamov_utils.hpp" #include "opencv2/gpu/device/datamov_utils.hpp"
using namespace cv::gpu; BEGIN_OPENCV_DEVICE_NAMESPACE
using namespace cv::gpu::device;
namespace cv { namespace gpu { namespace bf_match namespace bf_match {
///////////////////////////////////////////////////////////////////////////////
// Reduction
template <int BLOCK_SIZE>
__device__ void findBestMatch(float& bestDistance, int& bestTrainIdx, float* s_distance, int* s_trainIdx)
{ {
///////////////////////////////////////////////////////////////////////////////
// Reduction
template <int BLOCK_SIZE>
__device__ void findBestMatch(float& bestDistance, int& bestTrainIdx, float* s_distance, int* s_trainIdx)
{
s_distance += threadIdx.y * BLOCK_SIZE; s_distance += threadIdx.y * BLOCK_SIZE;
s_trainIdx += threadIdx.y * BLOCK_SIZE; s_trainIdx += threadIdx.y * BLOCK_SIZE;
@ -65,11 +64,11 @@ namespace cv { namespace gpu { namespace bf_match
__syncthreads(); __syncthreads();
reducePredVal<BLOCK_SIZE>(s_distance, bestDistance, s_trainIdx, bestTrainIdx, threadIdx.x, less<volatile float>()); reducePredVal<BLOCK_SIZE>(s_distance, bestDistance, s_trainIdx, bestTrainIdx, threadIdx.x, less<volatile float>());
} }
template <int BLOCK_SIZE> template <int BLOCK_SIZE>
__device__ void findBestMatch(float& bestDistance, int& bestTrainIdx, int& bestImgIdx, float* s_distance, int* s_trainIdx, int* s_imgIdx) __device__ void findBestMatch(float& bestDistance, int& bestTrainIdx, int& bestImgIdx, float* s_distance, int* s_trainIdx, int* s_imgIdx)
{ {
s_distance += threadIdx.y * BLOCK_SIZE; s_distance += threadIdx.y * BLOCK_SIZE;
s_trainIdx += threadIdx.y * BLOCK_SIZE; s_trainIdx += threadIdx.y * BLOCK_SIZE;
s_imgIdx += threadIdx.y * BLOCK_SIZE; s_imgIdx += threadIdx.y * BLOCK_SIZE;
@ -81,27 +80,27 @@ namespace cv { namespace gpu { namespace bf_match
__syncthreads(); __syncthreads();
reducePredVal2<BLOCK_SIZE>(s_distance, bestDistance, s_trainIdx, bestTrainIdx, s_imgIdx, bestImgIdx, threadIdx.x, less<volatile float>()); reducePredVal2<BLOCK_SIZE>(s_distance, bestDistance, s_trainIdx, bestTrainIdx, s_imgIdx, bestImgIdx, threadIdx.x, less<volatile float>());
} }
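Both findBestMatch overloads above finish with reducePredVal / reducePredVal2, which are defined elsewhere in the device utility headers. As a rough stand-in, assuming they perform a predicated shared-memory tree reduction that keeps the winning value together with its index, something like the sketch below; reducePredValSketch is a hypothetical name and the less<> predicate is hard-coded.

// Hypothetical sketch of the reduction used above: the caller has already
// offset s_dist / s_idx by threadIdx.y * BLOCK_SIZE, so tid is threadIdx.x and
// after the loop the thread with tid == 0 holds the row's smallest distance
// and its train index.
template <int BLOCK_SIZE>
__device__ void reducePredValSketch(volatile float* s_dist, float& myDist,
                                    volatile int* s_idx, int& myIdx, int tid)
{
    s_dist[tid] = myDist;
    s_idx[tid]  = myIdx;
    __syncthreads();

    for (int offset = BLOCK_SIZE / 2; offset > 0; offset /= 2)
    {
        if (tid < offset && s_dist[tid + offset] < s_dist[tid])
        {
            const float d = s_dist[tid + offset];
            const int   i = s_idx [tid + offset];
            s_dist[tid] = d;  myDist = d;
            s_idx [tid] = i;  myIdx  = i;
        }
        __syncthreads();
    }
}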
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
// Match Unrolled Cached // Match Unrolled Cached
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename T, typename U> template <int BLOCK_SIZE, int MAX_DESC_LEN, typename T, typename U>
__device__ void loadQueryToSmem(int queryIdx, const DevMem2D_<T>& query, U* s_query) __device__ void loadQueryToSmem(int queryIdx, const DevMem2D_<T>& query, U* s_query)
{ {
#pragma unroll #pragma unroll
for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i) for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
{ {
const int loadX = threadIdx.x + i * BLOCK_SIZE; const int loadX = threadIdx.x + i * BLOCK_SIZE;
s_query[threadIdx.y * MAX_DESC_LEN + loadX] = loadX < query.cols ? query.ptr(min(queryIdx, query.rows - 1))[loadX] : 0; s_query[threadIdx.y * MAX_DESC_LEN + loadX] = loadX < query.cols ? query.ptr(::min(queryIdx, query.rows - 1))[loadX] : 0;
}
} }
}
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
__device__ void loopUnrolledCached(int queryIdx, const DevMem2D_<T>& query, int imgIdx, const DevMem2D_<T>& train, const Mask& mask, __device__ void loopUnrolledCached(int queryIdx, const DevMem2D_<T>& query, int imgIdx, const DevMem2D_<T>& train, const Mask& mask,
typename Dist::value_type* s_query, typename Dist::value_type* s_train, typename Dist::value_type* s_query, typename Dist::value_type* s_train,
float& bestDistance, int& bestTrainIdx, int& bestImgIdx) float& bestDistance, int& bestTrainIdx, int& bestImgIdx)
{ {
for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t) for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)
{ {
Dist dist; Dist dist;
@ -117,7 +116,7 @@ namespace cv { namespace gpu { namespace bf_match
{ {
T val; T val;
ForceGlob<T>::Load(train.ptr(min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val); ForceGlob<T>::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val; s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
} }
@ -141,11 +140,11 @@ namespace cv { namespace gpu { namespace bf_match
bestTrainIdx = trainIdx; bestTrainIdx = trainIdx;
} }
} }
} }
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
__global__ void matchUnrolledCached(const DevMem2D_<T> query, const DevMem2D_<T> train, const Mask mask, int* bestTrainIdx, float* bestDistance) __global__ void matchUnrolledCached(const DevMem2D_<T> query, const DevMem2D_<T> train, const Mask mask, int* bestTrainIdx, float* bestDistance)
{ {
extern __shared__ int smem[]; extern __shared__ int smem[];
const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;
@ -172,13 +171,13 @@ namespace cv { namespace gpu { namespace bf_match
bestTrainIdx[queryIdx] = myBestTrainIdx; bestTrainIdx[queryIdx] = myBestTrainIdx;
bestDistance[queryIdx] = myBestDistance; bestDistance[queryIdx] = myBestDistance;
} }
} }
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
void matchUnrolledCached(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, void matchUnrolledCached(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask,
const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Di& trainIdx, const DevMem2Df& distance,
cudaStream_t stream) cudaStream_t stream)
{ {
const dim3 block(BLOCK_SIZE, BLOCK_SIZE); const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
const dim3 grid(divUp(query.rows, BLOCK_SIZE)); const dim3 grid(divUp(query.rows, BLOCK_SIZE));
@ -189,12 +188,12 @@ namespace cv { namespace gpu { namespace bf_match
if (stream == 0) if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaDeviceSynchronize() );
} }
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
__global__ void matchUnrolledCached(const DevMem2D_<T> query, const DevMem2D_<T>* trains, int n, const Mask mask, __global__ void matchUnrolledCached(const DevMem2D_<T> query, const DevMem2D_<T>* trains, int n, const Mask mask,
int* bestTrainIdx, int* bestImgIdx, float* bestDistance) int* bestTrainIdx, int* bestImgIdx, float* bestDistance)
{ {
extern __shared__ int smem[]; extern __shared__ int smem[];
const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;
@ -231,13 +230,13 @@ namespace cv { namespace gpu { namespace bf_match
bestImgIdx[queryIdx] = myBestImgIdx; bestImgIdx[queryIdx] = myBestImgIdx;
bestDistance[queryIdx] = myBestDistance; bestDistance[queryIdx] = myBestDistance;
} }
} }
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
void matchUnrolledCached(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, const Mask& mask, void matchUnrolledCached(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, const Mask& mask,
const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
cudaStream_t stream) cudaStream_t stream)
{ {
const dim3 block(BLOCK_SIZE, BLOCK_SIZE); const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
const dim3 grid(divUp(query.rows, BLOCK_SIZE)); const dim3 grid(divUp(query.rows, BLOCK_SIZE));
@ -248,16 +247,16 @@ namespace cv { namespace gpu { namespace bf_match
if (stream == 0) if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaDeviceSynchronize() );
} }
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
// Match Unrolled // Match Unrolled
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
__device__ void loopUnrolled(int queryIdx, const DevMem2D_<T>& query, int imgIdx, const DevMem2D_<T>& train, const Mask& mask, __device__ void loopUnrolled(int queryIdx, const DevMem2D_<T>& query, int imgIdx, const DevMem2D_<T>& train, const Mask& mask,
typename Dist::value_type* s_query, typename Dist::value_type* s_train, typename Dist::value_type* s_query, typename Dist::value_type* s_train,
float& bestDistance, int& bestTrainIdx, int& bestImgIdx) float& bestDistance, int& bestTrainIdx, int& bestImgIdx)
{ {
for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t) for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)
{ {
Dist dist; Dist dist;
@ -274,10 +273,10 @@ namespace cv { namespace gpu { namespace bf_match
{ {
T val; T val;
ForceGlob<T>::Load(query.ptr(min(queryIdx, query.rows - 1)), loadX, val); ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val);
s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val; s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;
ForceGlob<T>::Load(train.ptr(min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val); ForceGlob<T>::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val; s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
} }
@ -301,11 +300,11 @@ namespace cv { namespace gpu { namespace bf_match
bestTrainIdx = trainIdx; bestTrainIdx = trainIdx;
} }
} }
} }
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
__global__ void matchUnrolled(const DevMem2D_<T> query, const DevMem2D_<T> train, const Mask mask, int* bestTrainIdx, float* bestDistance) __global__ void matchUnrolled(const DevMem2D_<T> query, const DevMem2D_<T> train, const Mask mask, int* bestTrainIdx, float* bestDistance)
{ {
extern __shared__ int smem[]; extern __shared__ int smem[];
const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;
@ -330,13 +329,13 @@ namespace cv { namespace gpu { namespace bf_match
bestTrainIdx[queryIdx] = myBestTrainIdx; bestTrainIdx[queryIdx] = myBestTrainIdx;
bestDistance[queryIdx] = myBestDistance; bestDistance[queryIdx] = myBestDistance;
} }
} }
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
void matchUnrolled(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, void matchUnrolled(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask,
const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Di& trainIdx, const DevMem2Df& distance,
cudaStream_t stream) cudaStream_t stream)
{ {
const dim3 block(BLOCK_SIZE, BLOCK_SIZE); const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
const dim3 grid(divUp(query.rows, BLOCK_SIZE)); const dim3 grid(divUp(query.rows, BLOCK_SIZE));
@ -347,12 +346,12 @@ namespace cv { namespace gpu { namespace bf_match
if (stream == 0) if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaDeviceSynchronize() );
} }
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
__global__ void matchUnrolled(const DevMem2D_<T> query, const DevMem2D_<T>* trains, int n, const Mask mask, __global__ void matchUnrolled(const DevMem2D_<T> query, const DevMem2D_<T>* trains, int n, const Mask mask,
int* bestTrainIdx, int* bestImgIdx, float* bestDistance) int* bestTrainIdx, int* bestImgIdx, float* bestDistance)
{ {
extern __shared__ int smem[]; extern __shared__ int smem[];
const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;
@ -387,13 +386,13 @@ namespace cv { namespace gpu { namespace bf_match
bestImgIdx[queryIdx] = myBestImgIdx; bestImgIdx[queryIdx] = myBestImgIdx;
bestDistance[queryIdx] = myBestDistance; bestDistance[queryIdx] = myBestDistance;
} }
} }
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
void matchUnrolled(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, const Mask& mask, void matchUnrolled(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, const Mask& mask,
const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
cudaStream_t stream) cudaStream_t stream)
{ {
const dim3 block(BLOCK_SIZE, BLOCK_SIZE); const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
const dim3 grid(divUp(query.rows, BLOCK_SIZE)); const dim3 grid(divUp(query.rows, BLOCK_SIZE));
@ -404,16 +403,16 @@ namespace cv { namespace gpu { namespace bf_match
if (stream == 0) if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaDeviceSynchronize() );
} }
///////////////////////////////////////////////////////////////////////////////
// Match

template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
__device__ void loop(int queryIdx, const DevMem2D_<T>& query, int imgIdx, const DevMem2D_<T>& train, const Mask& mask,
        typename Dist::value_type* s_query, typename Dist::value_type* s_train,
        float& bestDistance, int& bestTrainIdx, int& bestImgIdx)
{
    for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)
    {
        Dist dist;
@ -429,10 +428,10 @@ namespace cv { namespace gpu { namespace bf_match
            {
                T val;
                ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val);
                s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;
                ForceGlob<T>::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
                s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
            }
@ -456,11 +455,11 @@ namespace cv { namespace gpu { namespace bf_match
            bestTrainIdx = trainIdx;
        }
    }
}
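For orientation, the per-query work performed by the tiled loop above is an exhaustive nearest-neighbour scan over all train descriptors. A plain host-side sketch of the same computation (illustrative only; distanceTerm stands in for the per-element contribution of the L1/L2/Hamming Dist policy):

float bestDistance = FLT_MAX;   // <cfloat>
int   bestTrainIdx = -1;

for (int t = 0; t < train.rows; ++t)
{
    float acc = 0.f;
    for (int c = 0; c < query.cols; ++c)
        acc += distanceTerm(query.ptr(queryIdx)[c], train.ptr(t)[c]);   // hypothetical helper

    if (acc < bestDistance)
    {
        bestDistance = acc;
        bestTrainIdx = t;
    }
}

The device code produces the same answer but stages BLOCK_SIZE x BLOCK_SIZE tiles of both descriptor matrices through shared memory (s_query, s_train), so each global-memory element is loaded once per tile and shared across the block.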
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
__global__ void match(const DevMem2D_<T> query, const DevMem2D_<T> train, const Mask mask, int* bestTrainIdx, float* bestDistance)
{
    extern __shared__ int smem[];

    const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;
@ -485,13 +484,13 @@ namespace cv { namespace gpu { namespace bf_match
        bestTrainIdx[queryIdx] = myBestTrainIdx;
        bestDistance[queryIdx] = myBestDistance;
    }
}

template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
void match(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask,
        const DevMem2Di& trainIdx, const DevMem2Df& distance,
        cudaStream_t stream)
{
    const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
    const dim3 grid(divUp(query.rows, BLOCK_SIZE));
@ -502,12 +501,12 @@ namespace cv { namespace gpu { namespace bf_match
    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
__global__ void match(const DevMem2D_<T> query, const DevMem2D_<T>* trains, int n, const Mask mask,
        int* bestTrainIdx, int* bestImgIdx, float* bestDistance)
{
    extern __shared__ int smem[];

    const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;
@ -541,13 +540,13 @@ namespace cv { namespace gpu { namespace bf_match
        bestImgIdx[queryIdx] = myBestImgIdx;
        bestDistance[queryIdx] = myBestDistance;
    }
}

template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
void match(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, const Mask& mask,
        const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
        cudaStream_t stream)
{
    const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
    const dim3 grid(divUp(query.rows, BLOCK_SIZE));
@ -558,16 +557,16 @@ namespace cv { namespace gpu { namespace bf_match
    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
///////////////////////////////////////////////////////////////////////////////
// Match dispatcher

template <typename Dist, typename T, typename Mask>
void matchDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask,
        const DevMem2Di& trainIdx, const DevMem2Df& distance,
        int cc, cudaStream_t stream)
{
    if (query.cols <= 64)
    {
        matchUnrolledCached<16, 64, Dist>(query, train, mask, trainIdx, distance, stream);
@ -592,13 +591,13 @@ namespace cv { namespace gpu { namespace bf_match
    {
        match<16, Dist>(query, train, mask, trainIdx, distance, stream);
    }
}

template <typename Dist, typename T, typename Mask>
void matchDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, const Mask& mask,
        const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
        int cc, cudaStream_t stream)
{
    if (query.cols <= 64)
    {
        matchUnrolledCached<16, 64, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
@ -623,15 +622,15 @@ namespace cv { namespace gpu { namespace bf_match
    {
        match<16, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
    }
}
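The dispatcher's job, visible above, is to select a compile-time specialization: descriptors of up to 64 elements go to matchUnrolledCached<16, 64, Dist>, and queries that fit no unrolled bucket fall back to the generic match<16, Dist> kernel. Making the descriptor length a template parameter is what allows the per-descriptor loop to be fully unrolled; a minimal sketch of the idea, assuming a plain float accumulator rather than the real Dist policy:

template <int MAX_DESC_LEN, typename T>
__device__ float accumulateL1(const T* a, const T* b)
{
    float acc = 0.f;

    #pragma unroll            // legal because MAX_DESC_LEN is a compile-time constant
    for (int i = 0; i < MAX_DESC_LEN; ++i)
        acc += ::fabsf(static_cast<float>(a[i]) - static_cast<float>(b[i]));

    return acc;
}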
///////////////////////////////////////////////////////////////////////////////
// Match caller

template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask,
        const DevMem2Di& trainIdx, const DevMem2Df& distance,
        int cc, cudaStream_t stream)
{
    if (mask.data)
    {
        matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), SingleMask(mask),
@ -644,19 +643,19 @@ namespace cv { namespace gpu { namespace bf_match
            trainIdx, distance,
            cc, stream);
    }
}

template void matchL1_gpu<uchar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
//template void matchL1_gpu<schar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
template void matchL1_gpu<ushort>(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
template void matchL1_gpu<short >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
template void matchL1_gpu<int >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
template void matchL1_gpu<float >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
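The block of "template void matchL1_gpu<...>" lines above is the usual explicit-instantiation idiom for CUDA translation units: the definition can only be compiled by nvcc, so the .cu file emits the concrete element types that the host-side C++ code dispatches on, and the commented-out lines are types that are never requested (presumably left out to keep compile time and binary size down). The pattern in miniature, with hypothetical names:

// header visible to the host-side C++ code: declaration only
template <typename T> void matchExample_gpu(const T* query, const T* train, int len, float* outDistance);

// in a .cu file compiled by nvcc: definition plus the instantiations actually used
template <typename T> void matchExample_gpu(const T* query, const T* train, int len, float* outDistance)
{
    /* launch kernels here */
}

template void matchExample_gpu<float>(const float*, const float*, int, float*);
template void matchExample_gpu<unsigned char>(const unsigned char*, const unsigned char*, int, float*);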
template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask,
        const DevMem2Di& trainIdx, const DevMem2Df& distance,
        int cc, cudaStream_t stream)
{
    if (mask.data)
    {
        matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), SingleMask(mask),
@ -669,19 +668,19 @@ namespace cv { namespace gpu { namespace bf_match
            trainIdx, distance,
            cc, stream);
    }
}

//template void matchL2_gpu<uchar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
//template void matchL2_gpu<schar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
//template void matchL2_gpu<ushort>(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
//template void matchL2_gpu<short >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
//template void matchL2_gpu<int >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
template void matchL2_gpu<float >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);

template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask,
        const DevMem2Di& trainIdx, const DevMem2Df& distance,
        int cc, cudaStream_t stream)
{
    if (mask.data)
    {
        matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), SingleMask(mask),
@ -694,18 +693,18 @@ namespace cv { namespace gpu { namespace bf_match
            trainIdx, distance,
            cc, stream);
    }
}

template void matchHamming_gpu<uchar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
//template void matchHamming_gpu<schar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
template void matchHamming_gpu<ushort>(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
//template void matchHamming_gpu<short >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
template void matchHamming_gpu<int >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
        const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
        int cc, cudaStream_t stream)
{
    if (masks.data)
    {
        matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data),
@ -718,19 +717,19 @@ namespace cv { namespace gpu { namespace bf_match
            trainIdx, imgIdx, distance,
            cc, stream);
    }
}

template void matchL1_gpu<uchar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
//template void matchL1_gpu<schar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
template void matchL1_gpu<ushort>(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
template void matchL1_gpu<short >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
template void matchL1_gpu<int >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
template void matchL1_gpu<float >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);

template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
        const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
        int cc, cudaStream_t stream)
{
    if (masks.data)
    {
        matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data),
@ -743,19 +742,19 @@ namespace cv { namespace gpu { namespace bf_match
            trainIdx, imgIdx, distance,
            cc, stream);
    }
}

//template void matchL2_gpu<uchar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
//template void matchL2_gpu<schar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
//template void matchL2_gpu<ushort>(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
//template void matchL2_gpu<short >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
//template void matchL2_gpu<int >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
template void matchL2_gpu<float >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);

template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
        const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
        int cc, cudaStream_t stream)
{
    if (masks.data)
    {
        matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data),
@ -768,11 +767,14 @@ namespace cv { namespace gpu { namespace bf_match
            trainIdx, imgIdx, distance,
            cc, stream);
    }
}

template void matchHamming_gpu<uchar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
//template void matchHamming_gpu<schar >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
template void matchHamming_gpu<ushort>(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
//template void matchHamming_gpu<short >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
template void matchHamming_gpu<int >(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
} // namespace bf_match

END_OPENCV_DEVICE_NAMESPACE
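The closing lines above mirror the new file prologue: this commit swaps the hand-written nested cv::gpu namespaces for BEGIN_OPENCV_DEVICE_NAMESPACE / END_OPENCV_DEVICE_NAMESPACE macros. Their exact expansion is defined elsewhere in the refactoring; judging from the braces they replace, a plausible but unverified definition is:

// Assumed expansion, for orientation only - the real definitions live in the gpu module's
// internal headers touched by this commit, not in this file.
#define BEGIN_OPENCV_DEVICE_NAMESPACE namespace cv { namespace gpu { namespace device {
#define END_OPENCV_DEVICE_NAMESPACE   }}}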

View File

@ -45,18 +45,17 @@
#include "opencv2/gpu/device/vec_distance.hpp"
#include "opencv2/gpu/device/datamov_utils.hpp"

BEGIN_OPENCV_DEVICE_NAMESPACE

namespace bf_radius_match {

///////////////////////////////////////////////////////////////////////////////
// Match Unrolled

template <int BLOCK_SIZE, int MAX_DESC_LEN, bool SAVE_IMG_IDX, typename Dist, typename T, typename Mask>
__global__ void matchUnrolled(const DevMem2D_<T> query, int imgIdx, const DevMem2D_<T> train, float maxDistance, const Mask mask,
        PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount)
{
    #if __CUDA_ARCH__ >= 110

    extern __shared__ int smem[];
@ -81,10 +80,10 @@ namespace cv { namespace gpu { namespace bf_radius_match
        {
            T val;
            ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val);
            s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;
            ForceGlob<T>::Load(train.ptr(::min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
            s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
        }
@ -111,12 +110,10 @@ namespace cv { namespace gpu { namespace bf_radius_match
    }

    #endif
}
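The kernel body above is wrapped in "#if __CUDA_ARCH__ >= 110", so the radius-match kernels compile to an empty body on sm_10 targets while the same .cu file still builds for them. A likely reason (an assumption on my part; the guarded code is collapsed in this view) is that counting results per query through the global nMatches array needs global-memory atomics, which first appeared with compute capability 1.1:

// Generic illustration of the guard - not the actual kernel body.
__global__ void countHits(unsigned int* counter)
{
#if __CUDA_ARCH__ >= 110
    atomicAdd(counter, 1u);   // global 32-bit atomics require sm_11 or newer
#endif
}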
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
void matchUnrolled(const DevMem2D_<T>& query, const DevMem2D_<T>& train, float maxDistance, const Mask& mask,
        const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, cudaStream_t stream)
{
    const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
    const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));
@ -128,13 +127,13 @@ namespace cv { namespace gpu { namespace bf_radius_match
    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}

template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T>
void matchUnrolled(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, float maxDistance, const DevMem2Db* masks,
        const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
        cudaStream_t stream)
{
    const dim3 block(BLOCK_SIZE, BLOCK_SIZE);

    const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
@ -160,15 +159,15 @@ namespace cv { namespace gpu { namespace bf_radius_match
    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
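The smemSize computed in the wrapper above is the dynamic shared-memory budget handed to the launch: two BLOCK_SIZE x BLOCK_SIZE tiles (the s_query and s_train staging areas used by the kernels), each element sizeof(int) wide. A worked example, assuming the BLOCK_SIZE of 16 selected by the dispatcher further down:

// smemSize = 2 * 16 * 16 * sizeof(int) = 2048 bytes per block,
// requested through the third launch-configuration argument:
// kernel<<<grid, block, smemSize, stream>>>(...);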
///////////////////////////////////////////////////////////////////////////////
// Match

template <int BLOCK_SIZE, bool SAVE_IMG_IDX, typename Dist, typename T, typename Mask>
__global__ void match(const DevMem2D_<T> query, int imgIdx, const DevMem2D_<T> train, float maxDistance, const Mask mask,
        PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount)
{
    #if __CUDA_ARCH__ >= 110

    extern __shared__ int smem[];
@ -192,10 +191,10 @@ namespace cv { namespace gpu { namespace bf_radius_match
        {
            T val;
            ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val);
            s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;
            ForceGlob<T>::Load(train.ptr(::min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
            s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
        }
@ -222,13 +221,13 @@ namespace cv { namespace gpu { namespace bf_radius_match
    }

    #endif
}
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
void match(const DevMem2D_<T>& query, const DevMem2D_<T>& train, float maxDistance, const Mask& mask,
        const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
        cudaStream_t stream)
{
    const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
    const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));
@ -240,13 +239,13 @@ namespace cv { namespace gpu { namespace bf_radius_match
    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}

template <int BLOCK_SIZE, typename Dist, typename T>
void match(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, float maxDistance, const DevMem2Db* masks,
        const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
        cudaStream_t stream)
{
    const dim3 block(BLOCK_SIZE, BLOCK_SIZE);

    const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
@ -272,16 +271,16 @@ namespace cv { namespace gpu { namespace bf_radius_match
    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
///////////////////////////////////////////////////////////////////////////////
// Match dispatcher

template <typename Dist, typename T, typename Mask>
void matchDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, float maxDistance, const Mask& mask,
        const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
        int cc, cudaStream_t stream)
{
    if (query.cols <= 64)
    {
        matchUnrolled<16, 64, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
@ -306,13 +305,13 @@ namespace cv { namespace gpu { namespace bf_radius_match
    {
        match<16, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
    }
}

template <typename Dist, typename T>
void matchDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, float maxDistance, const DevMem2Db* masks,
        const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
        int cc, cudaStream_t stream)
{
    if (query.cols <= 64)
    {
        matchUnrolled<16, 64, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
@ -337,15 +336,15 @@ namespace cv { namespace gpu { namespace bf_radius_match
    {
        match<16, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
    }
}
///////////////////////////////////////////////////////////////////////////////
// Radius Match caller

template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask,
        const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
        int cc, cudaStream_t stream)
{
    if (mask.data)
    {
        matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, SingleMask(mask),
@ -358,19 +357,19 @@ namespace cv { namespace gpu { namespace bf_radius_match
            trainIdx, distance, nMatches,
            cc, stream);
    }
}

template void matchL1_gpu<uchar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL1_gpu<schar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL1_gpu<ushort>(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL1_gpu<short >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL1_gpu<int >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL1_gpu<float >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask,
        const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
        int cc, cudaStream_t stream)
{
    if (mask.data)
    {
        matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, SingleMask(mask),
@ -383,19 +382,19 @@ namespace cv { namespace gpu { namespace bf_radius_match
            trainIdx, distance, nMatches,
            cc, stream);
    }
}

//template void matchL2_gpu<uchar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL2_gpu<schar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL2_gpu<ushort>(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL2_gpu<short >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL2_gpu<int >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL2_gpu<float >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);

template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask,
        const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
        int cc, cudaStream_t stream)
{
    if (mask.data)
    {
        matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, SingleMask(mask),
@ -408,58 +407,61 @@ namespace cv { namespace gpu { namespace bf_radius_match
            trainIdx, distance, nMatches,
            cc, stream);
    }
}

template void matchHamming_gpu<uchar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchHamming_gpu<schar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchHamming_gpu<ushort>(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchHamming_gpu<short >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchHamming_gpu<int >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks,
        const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
        int cc, cudaStream_t stream)
{
    matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains, n, maxDistance, masks,
            trainIdx, imgIdx, distance, nMatches,
            cc, stream);
}

template void matchL1_gpu<uchar >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL1_gpu<schar >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL1_gpu<ushort>(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL1_gpu<short >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL1_gpu<int >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL1_gpu<float >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);

template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks,
        const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
        int cc, cudaStream_t stream)
{
    matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains, n, maxDistance, masks,
            trainIdx, imgIdx, distance, nMatches,
            cc, stream);
}

//template void matchL2_gpu<uchar >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL2_gpu<schar >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL2_gpu<ushort>(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL2_gpu<short >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL2_gpu<int >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL2_gpu<float >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);

template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks,
        const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
        int cc, cudaStream_t stream)
{
    matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains, n, maxDistance, masks,
            trainIdx, imgIdx, distance, nMatches,
            cc, stream);
}

template void matchHamming_gpu<uchar >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchHamming_gpu<schar >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchHamming_gpu<ushort>(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchHamming_gpu<short >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchHamming_gpu<int >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
}}}
} // namespace bf_radius_match
END_OPENCV_DEVICE_NAMESPACE
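The instantiation list above limits matchHamming_gpu to integer element types (uchar, ushort, int), because the Hamming distance is a bit count over packed descriptor words. As a rough, self-contained sketch of that distance only (plain CUDA, made-up names, not the OpenCV kernel):

// Minimal sketch of a per-descriptor Hamming distance: XOR the packed
// descriptor words and count differing bits with __popc.
#include <cstdio>
#include <cuda_runtime.h>

__global__ void hammingDistance(const unsigned int* query, const unsigned int* train,
                                int wordsPerDescriptor, int* distance)
{
    int acc = 0;
    for (int i = 0; i < wordsPerDescriptor; ++i)
        acc += __popc(query[i] ^ train[i]);   // bits that differ in this 32-bit word
    if (threadIdx.x == 0 && blockIdx.x == 0)
        *distance = acc;
}

int main()
{
    unsigned int hq[8] = {0xF0F0F0F0u}, ht[8] = {0x0F0F0F0Fu};
    unsigned int *dq, *dt; int *dd, h;
    cudaMalloc(&dq, sizeof(hq)); cudaMalloc(&dt, sizeof(ht)); cudaMalloc(&dd, sizeof(int));
    cudaMemcpy(dq, hq, sizeof(hq), cudaMemcpyHostToDevice);
    cudaMemcpy(dt, ht, sizeof(ht), cudaMemcpyHostToDevice);
    hammingDistance<<<1, 1>>>(dq, dt, 8, dd);
    cudaMemcpy(&h, dd, sizeof(int), cudaMemcpyDeviceToHost);
    printf("distance = %d\n", h);   // 32: only the first words differ, in every bit
    cudaFree(dq); cudaFree(dt); cudaFree(dd);
    return 0;
}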
View File
@ -43,65 +43,58 @@
#include "internal_shared.hpp" #include "internal_shared.hpp"
#include "opencv2/gpu/device/limits.hpp" #include "opencv2/gpu/device/limits.hpp"
using namespace cv::gpu; BEGIN_OPENCV_DEVICE_NAMESPACE
using namespace cv::gpu::device;
namespace bf_krnls namespace bilateral_filter {
__constant__ float* ctable_color;
__constant__ float* ctable_space;
__constant__ size_t ctable_space_step;
__constant__ int cndisp;
__constant__ int cradius;
__constant__ short cedge_disc;
__constant__ short cmax_disc;
void load_constants(float* table_color, DevMem2Df table_space, int ndisp, int radius, short edge_disc, short max_disc)
{ {
__constant__ float* ctable_color; cudaSafeCall( cudaMemcpyToSymbol(ctable_color, &table_color, sizeof(table_color)) );
__constant__ float* ctable_space; cudaSafeCall( cudaMemcpyToSymbol(ctable_space, &table_space.data, sizeof(table_space.data)) );
__constant__ size_t ctable_space_step; size_t table_space_step = table_space.step / sizeof(float);
cudaSafeCall( cudaMemcpyToSymbol(ctable_space_step, &table_space_step, sizeof(size_t)) );
__constant__ int cndisp; cudaSafeCall( cudaMemcpyToSymbol(cndisp, &ndisp, sizeof(int)) );
__constant__ int cradius; cudaSafeCall( cudaMemcpyToSymbol(cradius, &radius, sizeof(int)) );
__constant__ short cedge_disc; cudaSafeCall( cudaMemcpyToSymbol(cedge_disc, &edge_disc, sizeof(short)) );
__constant__ short cmax_disc; cudaSafeCall( cudaMemcpyToSymbol(cmax_disc, &max_disc, sizeof(short)) );
} }
namespace cv { namespace gpu { namespace bf template <int channels>
struct DistRgbMax
{ {
void load_constants(float* table_color, const DevMem2Df& table_space, int ndisp, int radius, short edge_disc, short max_disc)
{
cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::ctable_color, &table_color, sizeof(table_color)) );
cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::ctable_space, &table_space.data, sizeof(table_space.data)) );
size_t table_space_step = table_space.step / sizeof(float);
cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::ctable_space_step, &table_space_step, sizeof(size_t)) );
cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cndisp, &ndisp, sizeof(int)) );
cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cradius, &radius, sizeof(int)) );
cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cedge_disc, &edge_disc, sizeof(short)) );
cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cmax_disc, &max_disc, sizeof(short)) );
}
}}}
namespace bf_krnls
{
template <int channels>
struct DistRgbMax
{
static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b) static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b)
{ {
uchar x = abs(a[0] - b[0]); uchar x = ::abs(a[0] - b[0]);
uchar y = abs(a[1] - b[1]); uchar y = ::abs(a[1] - b[1]);
uchar z = abs(a[2] - b[2]); uchar z = ::abs(a[2] - b[2]);
return (max(max(x, y), z)); return (::max(::max(x, y), z));
} }
}; };
template <> template <>
struct DistRgbMax<1> struct DistRgbMax<1>
{ {
static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b) static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b)
{ {
return abs(a[0] - b[0]); return ::abs(a[0] - b[0]);
} }
}; };
template <int channels, typename T> template <int channels, typename T>
__global__ void bilateral_filter(int t, T* disp, size_t disp_step, const uchar* img, size_t img_step, int h, int w) __global__ void bilateral_filter(int t, T* disp, size_t disp_step, const uchar* img, size_t img_step, int h, int w)
{ {
const int y = blockIdx.y * blockDim.y + threadIdx.y; const int y = blockIdx.y * blockDim.y + threadIdx.y;
const int x = ((blockIdx.x * blockDim.x + threadIdx.x) << 1) + ((y + t) & 1); const int x = ((blockIdx.x * blockDim.x + threadIdx.x) << 1) + ((y + t) & 1);
@ -115,12 +108,12 @@ namespace bf_krnls
dp[3] = *(disp + (y+1) * disp_step + x + 0); dp[3] = *(disp + (y+1) * disp_step + x + 0);
dp[4] = *(disp + (y ) * disp_step + x + 1); dp[4] = *(disp + (y ) * disp_step + x + 1);
if(abs(dp[1] - dp[0]) >= cedge_disc || abs(dp[2] - dp[0]) >= cedge_disc || abs(dp[3] - dp[0]) >= cedge_disc || abs(dp[4] - dp[0]) >= cedge_disc) if(::abs(dp[1] - dp[0]) >= cedge_disc || ::abs(dp[2] - dp[0]) >= cedge_disc || ::abs(dp[3] - dp[0]) >= cedge_disc || ::abs(dp[4] - dp[0]) >= cedge_disc)
{ {
const int ymin = max(0, y - cradius); const int ymin = ::max(0, y - cradius);
const int xmin = max(0, x - cradius); const int xmin = ::max(0, x - cradius);
const int ymax = min(h - 1, y + cradius); const int ymax = ::min(h - 1, y + cradius);
const int xmax = min(w - 1, x + cradius); const int xmax = ::min(w - 1, x + cradius);
float cost[] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f}; float cost[] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f};
@ -136,15 +129,15 @@ namespace bf_krnls
uchar dist_rgb = DistRgbMax<channels>::calc(in, ic); uchar dist_rgb = DistRgbMax<channels>::calc(in, ic);
const float weight = ctable_color[dist_rgb] * (ctable_space + abs(y-yi)* ctable_space_step)[abs(x-xi)]; const float weight = ctable_color[dist_rgb] * (ctable_space + ::abs(y-yi)* ctable_space_step)[::abs(x-xi)];
const T disp_reg = disp_y[xi]; const T disp_reg = disp_y[xi];
cost[0] += min(cmax_disc, abs(disp_reg - dp[0])) * weight; cost[0] += ::min(cmax_disc, ::abs(disp_reg - dp[0])) * weight;
cost[1] += min(cmax_disc, abs(disp_reg - dp[1])) * weight; cost[1] += ::min(cmax_disc, ::abs(disp_reg - dp[1])) * weight;
cost[2] += min(cmax_disc, abs(disp_reg - dp[2])) * weight; cost[2] += ::min(cmax_disc, ::abs(disp_reg - dp[2])) * weight;
cost[3] += min(cmax_disc, abs(disp_reg - dp[3])) * weight; cost[3] += ::min(cmax_disc, ::abs(disp_reg - dp[3])) * weight;
cost[4] += min(cmax_disc, abs(disp_reg - dp[4])) * weight; cost[4] += ::min(cmax_disc, ::abs(disp_reg - dp[4])) * weight;
} }
} }
@ -180,14 +173,11 @@ namespace bf_krnls
*(disp + y * disp_step + x) = dp[id]; *(disp + y * disp_step + x) = dp[id];
} }
} }
}
} }
namespace cv { namespace gpu { namespace bf template <typename T>
void bilateral_filter_caller(DevMem2D_<T> disp, DevMem2Db img, int channels, int iters, cudaStream_t stream)
{ {
template <typename T>
void bilateral_filter_caller(const DevMem2D_<T>& disp, const DevMem2Db& img, int channels, int iters, cudaStream_t stream)
{
dim3 threads(32, 8, 1); dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1); dim3 grid(1, 1, 1);
grid.x = divUp(disp.cols, threads.x << 1); grid.x = divUp(disp.cols, threads.x << 1);
@ -198,18 +188,20 @@ namespace cv { namespace gpu { namespace bf
case 1: case 1:
for (int i = 0; i < iters; ++i) for (int i = 0; i < iters; ++i)
{ {
bf_krnls::bilateral_filter<1><<<grid, threads, 0, stream>>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols); bilateral_filter<1><<<grid, threads, 0, stream>>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaGetLastError() );
bf_krnls::bilateral_filter<1><<<grid, threads, 0, stream>>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
bilateral_filter<1><<<grid, threads, 0, stream>>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaGetLastError() );
} }
break; break;
case 3: case 3:
for (int i = 0; i < iters; ++i) for (int i = 0; i < iters; ++i)
{ {
bf_krnls::bilateral_filter<3><<<grid, threads, 0, stream>>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols); bilateral_filter<3><<<grid, threads, 0, stream>>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaGetLastError() );
bf_krnls::bilateral_filter<3><<<grid, threads, 0, stream>>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
bilateral_filter<3><<<grid, threads, 0, stream>>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaGetLastError() );
} }
break; break;
@ -219,15 +211,18 @@ namespace cv { namespace gpu { namespace bf
if (stream != 0) if (stream != 0)
cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaDeviceSynchronize() );
} }
void bilateral_filter_gpu(const DevMem2Db& disp, const DevMem2Db& img, int channels, int iters, cudaStream_t stream) void bilateral_filter_gpu(DevMem2Db disp, DevMem2Db img, int channels, int iters, cudaStream_t stream)
{ {
bilateral_filter_caller(disp, img, channels, iters, stream); bilateral_filter_caller(disp, img, channels, iters, stream);
} }
void bilateral_filter_gpu(const DevMem2D_<short>& disp, const DevMem2Db& img, int channels, int iters, cudaStream_t stream) void bilateral_filter_gpu(DevMem2D_<short> disp, DevMem2Db img, int channels, int iters, cudaStream_t stream)
{ {
bilateral_filter_caller(disp, img, channels, iters, stream); bilateral_filter_caller(disp, img, channels, iters, stream);
} }
}}}
} // namespace bilateral_filter
END_OPENCV_DEVICE_NAMESPACE
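A side effect of moving load_constants and the kernels into one device namespace, as in the bilateral_filter diff above, is that cudaMemcpyToSymbol can name the __constant__ symbols directly instead of going through bf_krnls::. A minimal stand-alone sketch of that pattern (illustrative names only, not OpenCV code):

#include <cstdio>
#include <cuda_runtime.h>

namespace demo {

__constant__ int c_radius;        // analogous to cradius above

__global__ void addRadius(int* out, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        out[i] = i + c_radius;    // kernels read constant memory directly
}

void loadConstantsAndRun()
{
    int radius = 3;
    // Host code in the same namespace names the __constant__ symbol directly,
    // as load_constants above now does without the bf_krnls:: prefix.
    cudaMemcpyToSymbol(c_radius, &radius, sizeof(int));

    int* d_out = 0;
    cudaMalloc(&d_out, 4 * sizeof(int));
    addRadius<<<1, 4>>>(d_out, 4);

    int h_out[4];
    cudaMemcpy(h_out, d_out, sizeof(h_out), cudaMemcpyDeviceToHost);
    for (int i = 0; i < 4; ++i)
        printf("%d\n", h_out[i]);  // 3 4 5 6
    cudaFree(d_out);
}

} // namespace demo

int main() { demo::loadConstantsAndRun(); return 0; }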
View File
@ -42,15 +42,14 @@
#include "internal_shared.hpp" #include "internal_shared.hpp"
using namespace cv::gpu; BEGIN_OPENCV_DEVICE_NAMESPACE
namespace cv { namespace gpu namespace blend {
{
template <typename T> template <typename T>
__global__ void blendLinearKernel(int rows, int cols, int cn, const PtrStep<T> img1, const PtrStep<T> img2, __global__ void blendLinearKernel(int rows, int cols, int cn, const PtrStep<T> img1, const PtrStep<T> img2,
const PtrStepf weights1, const PtrStepf weights2, PtrStep<T> result) const PtrStepf weights1, const PtrStepf weights2, PtrStep<T> result)
{ {
int x = blockIdx.x * blockDim.x + threadIdx.x; int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y; int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -63,12 +62,11 @@ namespace cv { namespace gpu
T p2 = img2.ptr(y)[x]; T p2 = img2.ptr(y)[x];
result.ptr(y)[x] = (p1 * w1 + p2 * w2) / (w1 + w2 + 1e-5f); result.ptr(y)[x] = (p1 * w1 + p2 * w2) / (w1 + w2 + 1e-5f);
} }
} }
template <typename T> template <typename T>
void blendLinearCaller(int rows, int cols, int cn, const PtrStep<T>& img1, const PtrStep<T>& img2, void blendLinearCaller(int rows, int cols, int cn, PtrStep<T> img1, PtrStep<T> img2, PtrStepf weights1, PtrStepf weights2, PtrStep<T> result, cudaStream_t stream)
const PtrStepf& weights1, const PtrStepf& weights2, PtrStep<T> result, cudaStream_t stream) {
{
dim3 threads(16, 16); dim3 threads(16, 16);
dim3 grid(divUp(cols * cn, threads.x), divUp(rows, threads.y)); dim3 grid(divUp(cols * cn, threads.x), divUp(rows, threads.y));
@ -77,17 +75,15 @@ namespace cv { namespace gpu
if (stream == 0) if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize()); cudaSafeCall(cudaDeviceSynchronize());
} }
template void blendLinearCaller<uchar>(int, int, int, const PtrStep<uchar>&, const PtrStep<uchar>&, template void blendLinearCaller<uchar>(int, int, int, PtrStep<uchar>, PtrStep<uchar>, PtrStepf, PtrStepf, PtrStep<uchar>, cudaStream_t stream);
const PtrStepf&, const PtrStepf&, PtrStep<uchar>, cudaStream_t stream); template void blendLinearCaller<float>(int, int, int, PtrStep<float>, PtrStep<float>, PtrStepf, PtrStepf, PtrStep<float>, cudaStream_t stream);
template void blendLinearCaller<float>(int, int, int, const PtrStep<float>&, const PtrStep<float>&,
const PtrStepf&, const PtrStepf&, PtrStep<float>, cudaStream_t stream);
__global__ void blendLinearKernel8UC4(int rows, int cols, const PtrStepb img1, const PtrStepb img2, __global__ void blendLinearKernel8UC4(int rows, int cols, const PtrStepb img1, const PtrStepb img2,
const PtrStepf weights1, const PtrStepf weights2, PtrStepb result) const PtrStepf weights1, const PtrStepf weights2, PtrStepb result)
{ {
int x = blockIdx.x * blockDim.x + threadIdx.x; int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y; int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -103,12 +99,10 @@ namespace cv { namespace gpu
((uchar4*)result.ptr(y))[x] = make_uchar4(p1.x * w1 + p2.x * w2, p1.y * w1 + p2.y * w2, ((uchar4*)result.ptr(y))[x] = make_uchar4(p1.x * w1 + p2.x * w2, p1.y * w1 + p2.y * w2,
p1.z * w1 + p2.z * w2, p1.w * w1 + p2.w * w2); p1.z * w1 + p2.z * w2, p1.w * w1 + p2.w * w2);
} }
} }
void blendLinearCaller8UC4(int rows, int cols, PtrStepb img1, PtrStepb img2, PtrStepf weights1, PtrStepf weights2, PtrStepb result, cudaStream_t stream)
void blendLinearCaller8UC4(int rows, int cols, const PtrStepb& img1, const PtrStepb& img2, {
const PtrStepf& weights1, const PtrStepf& weights2, PtrStepb result, cudaStream_t stream)
{
dim3 threads(16, 16); dim3 threads(16, 16);
dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y)); dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));
@ -117,6 +111,8 @@ namespace cv { namespace gpu
if (stream == 0) if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize()); cudaSafeCall(cudaDeviceSynchronize());
} }
}} } // namespace blend
END_OPENCV_DEVICE_NAMESPACE
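For reference, the per-pixel formula used by blendLinearKernel and blendLinearKernel8UC4 is result = (p1*w1 + p2*w2) / (w1 + w2 + 1e-5f), where the small epsilon avoids division by zero when both weights vanish. A hypothetical CPU check of that arithmetic (not part of the module):

#include <cstdio>

static float blendPixel(float p1, float p2, float w1, float w2)
{
    return (p1 * w1 + p2 * w2) / (w1 + w2 + 1e-5f);  // epsilon guards w1 + w2 == 0
}

int main()
{
    // Equal weights average the inputs; a zero weight selects the other image.
    printf("%f\n", blendPixel(100.0f, 200.0f, 0.5f, 0.5f)); // ~150
    printf("%f\n", blendPixel(100.0f, 200.0f, 1.0f, 0.0f)); // ~100
    return 0;
}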
View File
@ -44,14 +44,12 @@
#include "opencv2/gpu/device/transform.hpp" #include "opencv2/gpu/device/transform.hpp"
#include "opencv2/gpu/device/functional.hpp" #include "opencv2/gpu/device/functional.hpp"
BEGIN_OPENCV_DEVICE_NAMESPACE
#define SOLVE_PNP_RANSAC_MAX_NUM_ITERS 200 #define SOLVE_PNP_RANSAC_MAX_NUM_ITERS 200
using namespace cv::gpu::device; namespace transform_points
namespace cv { namespace gpu
{ {
namespace transform_points
{
__constant__ float3 crot0; __constant__ float3 crot0;
__constant__ float3 crot1; __constant__ float3 crot1;
__constant__ float3 crot2; __constant__ float3 crot2;
@ -76,13 +74,12 @@ namespace cv { namespace gpu
cudaSafeCall(cudaMemcpyToSymbol(crot1, rot + 3, sizeof(float) * 3)); cudaSafeCall(cudaMemcpyToSymbol(crot1, rot + 3, sizeof(float) * 3));
cudaSafeCall(cudaMemcpyToSymbol(crot2, rot + 6, sizeof(float) * 3)); cudaSafeCall(cudaMemcpyToSymbol(crot2, rot + 6, sizeof(float) * 3));
cudaSafeCall(cudaMemcpyToSymbol(ctransl, transl, sizeof(float) * 3)); cudaSafeCall(cudaMemcpyToSymbol(ctransl, transl, sizeof(float) * 3));
transform(src, dst, TransformOp(), stream); OPENCV_DEVICE_NAMESPACE_ transform(src, dst, TransformOp(), stream);
} }
} // namespace transform_points } // namespace transform_points
namespace project_points
namespace project_points {
{
__constant__ float3 crot0; __constant__ float3 crot0;
__constant__ float3 crot1; __constant__ float3 crot1;
__constant__ float3 crot2; __constant__ float3 crot2;
@ -116,13 +113,12 @@ namespace cv { namespace gpu
cudaSafeCall(cudaMemcpyToSymbol(ctransl, transl, sizeof(float) * 3)); cudaSafeCall(cudaMemcpyToSymbol(ctransl, transl, sizeof(float) * 3));
cudaSafeCall(cudaMemcpyToSymbol(cproj0, proj, sizeof(float) * 3)); cudaSafeCall(cudaMemcpyToSymbol(cproj0, proj, sizeof(float) * 3));
cudaSafeCall(cudaMemcpyToSymbol(cproj1, proj + 3, sizeof(float) * 3)); cudaSafeCall(cudaMemcpyToSymbol(cproj1, proj + 3, sizeof(float) * 3));
transform(src, dst, ProjectOp(), stream); OPENCV_DEVICE_NAMESPACE_ transform(src, dst, ProjectOp(), stream);
} }
} // namespace project_points } // namespace project_points
namespace solve_pnp_ransac
namespace solve_pnp_ransac {
{
__constant__ float3 crot_matrices[SOLVE_PNP_RANSAC_MAX_NUM_ITERS * 3]; __constant__ float3 crot_matrices[SOLVE_PNP_RANSAC_MAX_NUM_ITERS * 3];
__constant__ float3 ctransl_vectors[SOLVE_PNP_RANSAC_MAX_NUM_ITERS]; __constant__ float3 ctransl_vectors[SOLVE_PNP_RANSAC_MAX_NUM_ITERS];
@ -191,6 +187,6 @@ namespace cv { namespace gpu
cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaDeviceSynchronize() );
} }
} // namespace solvepnp_ransac } // namespace solvepnp_ransac
}} // namespace cv { namespace gpu END_OPENCV_DEVICE_NAMESPACE
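transform_points above keeps the rotation as three float3 rows (crot0..crot2) in constant memory and computes dst = R*src + t per point; project_points then applies the projection rows the same way. A small host-side sketch of the rotation-plus-translation step (made-up names, not OpenCV code):

#include <cstdio>

struct V3 { float x, y, z; };

static float dot(V3 a, V3 b) { return a.x * b.x + a.y * b.y + a.z * b.z; }

static V3 transformPoint(V3 p, V3 rot0, V3 rot1, V3 rot2, V3 t)
{
    V3 r;
    r.x = dot(rot0, p) + t.x;   // row 0 of R times p, plus translation
    r.y = dot(rot1, p) + t.y;
    r.z = dot(rot2, p) + t.z;
    return r;
}

int main()
{
    // 90-degree rotation about Z, plus a unit shift along X.
    V3 r0 = {0.f, -1.f, 0.f}, r1 = {1.f, 0.f, 0.f}, r2 = {0.f, 0.f, 1.f};
    V3 t  = {1.f, 0.f, 0.f};
    V3 p  = {1.f, 0.f, 0.f};
    V3 q  = transformPoint(p, r0, r1, r2, t);
    printf("%f %f %f\n", q.x, q.y, q.z);  // expected 1 1 0
    return 0;
}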
View File
@ -44,12 +44,12 @@
#include <algorithm> #include <algorithm>
#include "internal_shared.hpp" #include "internal_shared.hpp"
using namespace cv::gpu; BEGIN_OPENCV_DEVICE_NAMESPACE
namespace cv { namespace gpu { namespace canny namespace canny {
__global__ void calcSobelRowPass(const PtrStepb src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols)
{ {
__global__ void calcSobelRowPass(const PtrStepb src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols)
{
__shared__ int smem[16][18]; __shared__ int smem[16][18];
const int j = blockIdx.x * blockDim.x + threadIdx.x; const int j = blockIdx.x * blockDim.x + threadIdx.x;
@ -60,8 +60,8 @@ namespace cv { namespace gpu { namespace canny
smem[threadIdx.y][threadIdx.x + 1] = src.ptr(i)[j]; smem[threadIdx.y][threadIdx.x + 1] = src.ptr(i)[j];
if (threadIdx.x == 0) if (threadIdx.x == 0)
{ {
smem[threadIdx.y][0] = src.ptr(i)[max(j - 1, 0)]; smem[threadIdx.y][0] = src.ptr(i)[::max(j - 1, 0)];
smem[threadIdx.y][17] = src.ptr(i)[min(j + 16, cols - 1)]; smem[threadIdx.y][17] = src.ptr(i)[::min(j + 16, cols - 1)];
} }
__syncthreads(); __syncthreads();
@ -71,10 +71,10 @@ namespace cv { namespace gpu { namespace canny
dy_buf.ptr(i)[j] = smem[threadIdx.y][threadIdx.x] + 2 * smem[threadIdx.y][threadIdx.x + 1] + smem[threadIdx.y][threadIdx.x + 2]; dy_buf.ptr(i)[j] = smem[threadIdx.y][threadIdx.x] + 2 * smem[threadIdx.y][threadIdx.x + 1] + smem[threadIdx.y][threadIdx.x + 2];
} }
} }
} }
void calcSobelRowPass_gpu(PtrStepb src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols) void calcSobelRowPass_gpu(PtrStepb src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols)
{ {
dim3 block(16, 16, 1); dim3 block(16, 16, 1);
dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1); dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
@ -82,26 +82,26 @@ namespace cv { namespace gpu { namespace canny
cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize()); cudaSafeCall(cudaThreadSynchronize());
} }
struct L1 struct L1
{ {
static __device__ __forceinline__ float calc(int x, int y) static __device__ __forceinline__ float calc(int x, int y)
{ {
return abs(x) + abs(y); return ::abs(x) + ::abs(y);
} }
}; };
struct L2 struct L2
{ {
static __device__ __forceinline__ float calc(int x, int y) static __device__ __forceinline__ float calc(int x, int y)
{ {
return sqrtf(x * x + y * y); return ::sqrtf(x * x + y * y);
} }
}; };
template <typename Norm> __global__ void calcMagnitude(const PtrStepi dx_buf, const PtrStepi dy_buf, template <typename Norm> __global__ void calcMagnitude(const PtrStepi dx_buf, const PtrStepi dy_buf,
PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols) PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols)
{ {
__shared__ int sdx[18][16]; __shared__ int sdx[18][16];
__shared__ int sdy[18][16]; __shared__ int sdy[18][16];
@ -114,11 +114,11 @@ namespace cv { namespace gpu { namespace canny
sdy[threadIdx.y + 1][threadIdx.x] = dy_buf.ptr(i)[j]; sdy[threadIdx.y + 1][threadIdx.x] = dy_buf.ptr(i)[j];
if (threadIdx.y == 0) if (threadIdx.y == 0)
{ {
sdx[0][threadIdx.x] = dx_buf.ptr(max(i - 1, 0))[j]; sdx[0][threadIdx.x] = dx_buf.ptr(::max(i - 1, 0))[j];
sdx[17][threadIdx.x] = dx_buf.ptr(min(i + 16, rows - 1))[j]; sdx[17][threadIdx.x] = dx_buf.ptr(::min(i + 16, rows - 1))[j];
sdy[0][threadIdx.x] = dy_buf.ptr(max(i - 1, 0))[j]; sdy[0][threadIdx.x] = dy_buf.ptr(::max(i - 1, 0))[j];
sdy[17][threadIdx.x] = dy_buf.ptr(min(i + 16, rows - 1))[j]; sdy[17][threadIdx.x] = dy_buf.ptr(::min(i + 16, rows - 1))[j];
} }
__syncthreads(); __syncthreads();
@ -133,10 +133,10 @@ namespace cv { namespace gpu { namespace canny
mag.ptr(i + 1)[j + 1] = Norm::calc(x, y); mag.ptr(i + 1)[j + 1] = Norm::calc(x, y);
} }
} }
} }
void calcMagnitude_gpu(PtrStepi dx_buf, PtrStepi dy_buf, PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad) void calcMagnitude_gpu(PtrStepi dx_buf, PtrStepi dy_buf, PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad)
{ {
dim3 block(16, 16, 1); dim3 block(16, 16, 1);
dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1); dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
@ -148,19 +148,19 @@ namespace cv { namespace gpu { namespace canny
cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize()); cudaSafeCall(cudaThreadSynchronize());
} }
template <typename Norm> __global__ void calcMagnitude(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols) template <typename Norm> __global__ void calcMagnitude(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols)
{ {
const int j = blockIdx.x * blockDim.x + threadIdx.x; const int j = blockIdx.x * blockDim.x + threadIdx.x;
const int i = blockIdx.y * blockDim.y + threadIdx.y; const int i = blockIdx.y * blockDim.y + threadIdx.y;
if (i < rows && j < cols) if (i < rows && j < cols)
mag.ptr(i + 1)[j + 1] = Norm::calc(dx.ptr(i)[j], dy.ptr(i)[j]); mag.ptr(i + 1)[j + 1] = Norm::calc(dx.ptr(i)[j], dy.ptr(i)[j]);
} }
void calcMagnitude_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad) void calcMagnitude_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad)
{ {
dim3 block(16, 16, 1); dim3 block(16, 16, 1);
dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1); dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
@ -172,15 +172,15 @@ namespace cv { namespace gpu { namespace canny
cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize()); cudaSafeCall(cudaThreadSynchronize());
} }
////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////
#define CANNY_SHIFT 15 #define CANNY_SHIFT 15
#define TG22 (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5) #define TG22 (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5)
__global__ void calcMap(const PtrStepi dx, const PtrStepi dy, const PtrStepf mag, PtrStepi map, int rows, int cols, float low_thresh, float high_thresh) __global__ void calcMap(const PtrStepi dx, const PtrStepi dy, const PtrStepf mag, PtrStepi map, int rows, int cols, float low_thresh, float high_thresh)
{ {
__shared__ float smem[18][18]; __shared__ float smem[18][18];
const int j = blockIdx.x * 16 + threadIdx.x; const int j = blockIdx.x * 16 + threadIdx.x;
@ -205,8 +205,8 @@ namespace cv { namespace gpu { namespace canny
const int s = (x ^ y) < 0 ? -1 : 1; const int s = (x ^ y) < 0 ? -1 : 1;
const float m = smem[threadIdx.y + 1][threadIdx.x + 1]; const float m = smem[threadIdx.y + 1][threadIdx.x + 1];
x = abs(x); x = ::abs(x);
y = abs(y); y = ::abs(y);
// 0 - the pixel can not belong to an edge // 0 - the pixel can not belong to an edge
// 1 - the pixel might belong to an edge // 1 - the pixel might belong to an edge
@ -239,13 +239,13 @@ namespace cv { namespace gpu { namespace canny
map.ptr(i + 1)[j + 1] = edge_type; map.ptr(i + 1)[j + 1] = edge_type;
} }
} }
#undef CANNY_SHIFT #undef CANNY_SHIFT
#undef TG22 #undef TG22
void calcMap_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, PtrStepi map, int rows, int cols, float low_thresh, float high_thresh) void calcMap_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, PtrStepi map, int rows, int cols, float low_thresh, float high_thresh)
{ {
dim3 block(16, 16, 1); dim3 block(16, 16, 1);
dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1); dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
@ -253,14 +253,14 @@ namespace cv { namespace gpu { namespace canny
cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize()); cudaSafeCall(cudaThreadSynchronize());
} }
////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////
__device__ unsigned int counter = 0; __device__ unsigned int counter = 0;
__global__ void edgesHysteresisLocal(PtrStepi map, ushort2* st, int rows, int cols) __global__ void edgesHysteresisLocal(PtrStepi map, ushort2* st, int rows, int cols)
{ {
#if __CUDA_ARCH__ >= 120 #if __CUDA_ARCH__ >= 120
__shared__ int smem[18][18]; __shared__ int smem[18][18];
@ -335,10 +335,10 @@ namespace cv { namespace gpu { namespace canny
} }
#endif #endif
} }
void edgesHysteresisLocal_gpu(PtrStepi map, ushort2* st1, int rows, int cols) void edgesHysteresisLocal_gpu(PtrStepi map, ushort2* st1, int rows, int cols)
{ {
dim3 block(16, 16, 1); dim3 block(16, 16, 1);
dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1); dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
@ -346,13 +346,13 @@ namespace cv { namespace gpu { namespace canny
cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize()); cudaSafeCall(cudaThreadSynchronize());
} }
__constant__ int c_dx[8] = {-1, 0, 1, -1, 1, -1, 0, 1}; __constant__ int c_dx[8] = {-1, 0, 1, -1, 1, -1, 0, 1};
__constant__ int c_dy[8] = {-1, -1, -1, 0, 0, 1, 1, 1}; __constant__ int c_dy[8] = {-1, -1, -1, 0, 0, 1, 1, 1};
__global__ void edgesHysteresisGlobal(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols, int count) __global__ void edgesHysteresisGlobal(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols, int count)
{ {
#if __CUDA_ARCH__ >= 120 #if __CUDA_ARCH__ >= 120
const int stack_size = 512; const int stack_size = 512;
@ -392,7 +392,7 @@ namespace cv { namespace gpu { namespace canny
while (s_counter > 0 && s_counter <= stack_size - blockDim.x) while (s_counter > 0 && s_counter <= stack_size - blockDim.x)
{ {
const int subTaskIdx = threadIdx.x >> 3; const int subTaskIdx = threadIdx.x >> 3;
const int portion = min(s_counter, blockDim.x >> 3); const int portion = ::min(s_counter, blockDim.x >> 3);
pos.x = pos.y = 0; pos.x = pos.y = 0;
@ -441,12 +441,12 @@ namespace cv { namespace gpu { namespace canny
} }
#endif #endif
} }
void edgesHysteresisGlobal_gpu(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols) void edgesHysteresisGlobal_gpu(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols)
{ {
void* counter_ptr; void* counter_ptr;
cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, "cv::gpu::canny::counter") ); cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, counter) );
unsigned int count; unsigned int count;
cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) ); cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
@ -466,19 +466,19 @@ namespace cv { namespace gpu { namespace canny
std::swap(st1, st2); std::swap(st1, st2);
} }
} }
__global__ void getEdges(PtrStepi map, PtrStepb dst, int rows, int cols) __global__ void getEdges(PtrStepi map, PtrStepb dst, int rows, int cols)
{ {
const int j = blockIdx.x * 16 + threadIdx.x; const int j = blockIdx.x * 16 + threadIdx.x;
const int i = blockIdx.y * 16 + threadIdx.y; const int i = blockIdx.y * 16 + threadIdx.y;
if (i < rows && j < cols) if (i < rows && j < cols)
dst.ptr(i)[j] = (uchar)(-(map.ptr(i + 1)[j + 1] >> 1)); dst.ptr(i)[j] = (uchar)(-(map.ptr(i + 1)[j + 1] >> 1));
} }
void getEdges_gpu(PtrStepi map, PtrStepb dst, int rows, int cols) void getEdges_gpu(PtrStepi map, PtrStepb dst, int rows, int cols)
{ {
dim3 block(16, 16, 1); dim3 block(16, 16, 1);
dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1); dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
@ -486,5 +486,8 @@ namespace cv { namespace gpu { namespace canny
cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize()); cudaSafeCall(cudaThreadSynchronize());
} }
}}}
} // namespace canny
END_OPENCV_DEVICE_NAMESPACE
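The edgesHysteresisGlobal_gpu change above replaces the string form of cudaGetSymbolAddress ("cv::gpu::canny::counter") with the __device__ variable itself, which no longer depends on the symbol's qualified name after the namespace refactoring. A self-contained sketch of that counter pattern (illustrative names, not the OpenCV code):

#include <cstdio>
#include <cuda_runtime.h>

__device__ unsigned int d_counter = 0;

__global__ void countEven(const int* data, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n && (data[i] & 1) == 0)
        atomicAdd(&d_counter, 1u);            // device-side accumulation
}

int main()
{
    int h[8] = {0, 1, 2, 3, 4, 5, 6, 7};
    int* d = 0;
    cudaMalloc(&d, sizeof(h));
    cudaMemcpy(d, h, sizeof(h), cudaMemcpyHostToDevice);

    countEven<<<1, 8>>>(d, 8);

    void* counter_ptr = 0;
    cudaGetSymbolAddress(&counter_ptr, d_counter);    // pass the symbol, not "d_counter"
    unsigned int count = 0;
    cudaMemcpy(&count, counter_ptr, sizeof(count), cudaMemcpyDeviceToHost);
    printf("even values: %u\n", count);               // expected 4
    cudaFree(d);
    return 0;
}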
View File
@ -44,336 +44,337 @@
#include "opencv2/gpu/device/transform.hpp" #include "opencv2/gpu/device/transform.hpp"
#include "opencv2/gpu/device/color.hpp" #include "opencv2/gpu/device/color.hpp"
namespace cv { namespace gpu { namespace device BEGIN_OPENCV_DEVICE_NAMESPACE
DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgra_to_rgba_traits<uchar>::functor_type)
{ {
template <> struct TransformFunctorTraits<bgra_to_rgba_traits<uchar>::functor_type> : DefaultTransformFunctorTraits<bgra_to_rgba_traits<uchar>::functor_type>
{
enum { smart_block_dim_x = 8 }; enum { smart_block_dim_x = 8 };
enum { smart_block_dim_y = 8 }; enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 }; enum { smart_shift = 4 };
}; };
template <> struct TransformFunctorTraits<bgra_to_bgr555_traits::functor_type> : DefaultTransformFunctorTraits<bgra_to_bgr555_traits::functor_type> DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgra_to_bgr555_traits::functor_type)
{ {
enum { smart_block_dim_y = 8 }; enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 }; enum { smart_shift = 4 };
}; };
template <> struct TransformFunctorTraits<rgba_to_bgr555_traits::functor_type> : DefaultTransformFunctorTraits<rgba_to_bgr555_traits::functor_type> DEFINE_TRANSFORM_FUNCTOR_TRAITS(rgba_to_bgr555_traits::functor_type)
{ {
enum { smart_block_dim_y = 8 }; enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 }; enum { smart_shift = 4 };
}; };
template <> struct TransformFunctorTraits<bgra_to_bgr565_traits::functor_type> : DefaultTransformFunctorTraits<bgra_to_bgr565_traits::functor_type> DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgra_to_bgr565_traits::functor_type)
{ {
enum { smart_block_dim_y = 8 }; enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 }; enum { smart_shift = 4 };
}; };
template <> struct TransformFunctorTraits<rgba_to_bgr565_traits::functor_type> : DefaultTransformFunctorTraits<rgba_to_bgr565_traits::functor_type> DEFINE_TRANSFORM_FUNCTOR_TRAITS(rgba_to_bgr565_traits::functor_type)
{ {
enum { smart_block_dim_y = 8 }; enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 }; enum { smart_shift = 4 };
}; };
template <> struct TransformFunctorTraits<bgr555_to_bgra_traits::functor_type> : DefaultTransformFunctorTraits<bgr555_to_bgra_traits::functor_type> DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgr555_to_bgra_traits::functor_type)
{ {
enum { smart_block_dim_y = 8 }; enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 }; enum { smart_shift = 4 };
}; };
template <> struct TransformFunctorTraits<bgr555_to_rgba_traits::functor_type> : DefaultTransformFunctorTraits<bgr555_to_rgba_traits::functor_type> DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgr555_to_rgba_traits::functor_type)
{ {
enum { smart_block_dim_y = 8 }; enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 }; enum { smart_shift = 4 };
}; };
template <> struct TransformFunctorTraits<bgr565_to_bgra_traits::functor_type> : DefaultTransformFunctorTraits<bgr565_to_bgra_traits::functor_type> DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgr565_to_bgra_traits::functor_type)
{ {
enum { smart_block_dim_y = 8 }; enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 }; enum { smart_shift = 4 };
}; };
template <> struct TransformFunctorTraits<bgr565_to_rgba_traits::functor_type> : DefaultTransformFunctorTraits<bgr565_to_rgba_traits::functor_type> DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgr565_to_rgba_traits::functor_type)
{ {
enum { smart_block_dim_y = 8 }; enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 }; enum { smart_shift = 4 };
}; };
template <> struct TransformFunctorTraits<gray_to_bgra_traits<uchar>::functor_type> : DefaultTransformFunctorTraits<gray_to_bgra_traits<uchar>::functor_type> DEFINE_TRANSFORM_FUNCTOR_TRAITS(gray_to_bgra_traits<uchar>::functor_type)
{ {
enum { smart_block_dim_y = 8 }; enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 }; enum { smart_shift = 4 };
}; };
template <> struct TransformFunctorTraits<gray_to_bgr555_traits::functor_type> : DefaultTransformFunctorTraits<gray_to_bgr555_traits::functor_type> DEFINE_TRANSFORM_FUNCTOR_TRAITS(gray_to_bgr555_traits::functor_type)
{ {
enum { smart_shift = 4 }; enum { smart_shift = 4 };
}; };
template <> struct TransformFunctorTraits<gray_to_bgr565_traits::functor_type> : DefaultTransformFunctorTraits<gray_to_bgr565_traits::functor_type> DEFINE_TRANSFORM_FUNCTOR_TRAITS(gray_to_bgr565_traits::functor_type)
{ {
enum { smart_shift = 4 }; enum { smart_shift = 4 };
}; };
template <> struct TransformFunctorTraits<bgra_to_yuv4_traits<uchar>::functor_type> : DefaultTransformFunctorTraits<bgra_to_yuv4_traits<uchar>::functor_type> DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgra_to_yuv4_traits<uchar>::functor_type)
{ {
enum { smart_block_dim_y = 8 }; enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 }; enum { smart_shift = 4 };
}; };
template <> struct TransformFunctorTraits<rgba_to_yuv4_traits<uchar>::functor_type> : DefaultTransformFunctorTraits<rgba_to_yuv4_traits<uchar>::functor_type> DEFINE_TRANSFORM_FUNCTOR_TRAITS(rgba_to_yuv4_traits<uchar>::functor_type)
{ {
enum { smart_block_dim_y = 8 }; enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 }; enum { smart_shift = 4 };
}; };
template <> struct TransformFunctorTraits<yuv4_to_bgra_traits<uchar>::functor_type> : DefaultTransformFunctorTraits<yuv4_to_bgra_traits<uchar>::functor_type> DEFINE_TRANSFORM_FUNCTOR_TRAITS(yuv4_to_bgra_traits<uchar>::functor_type)
{ {
enum { smart_block_dim_y = 8 }; enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 }; enum { smart_shift = 4 };
}; };
template <> struct TransformFunctorTraits<yuv4_to_rgba_traits<uchar>::functor_type> : DefaultTransformFunctorTraits<yuv4_to_rgba_traits<uchar>::functor_type> DEFINE_TRANSFORM_FUNCTOR_TRAITS(yuv4_to_rgba_traits<uchar>::functor_type)
{ {
enum { smart_block_dim_y = 8 }; enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 }; enum { smart_shift = 4 };
}; };
template <> struct TransformFunctorTraits<bgra_to_YCrCb4_traits<uchar>::functor_type> : DefaultTransformFunctorTraits<bgra_to_YCrCb4_traits<uchar>::functor_type> DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgra_to_YCrCb4_traits<uchar>::functor_type)
{ {
enum { smart_block_dim_y = 8 }; enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 }; enum { smart_shift = 4 };
}; };
template <> struct TransformFunctorTraits<rgba_to_YCrCb4_traits<uchar>::functor_type> : DefaultTransformFunctorTraits<rgba_to_YCrCb4_traits<uchar>::functor_type> DEFINE_TRANSFORM_FUNCTOR_TRAITS(rgba_to_YCrCb4_traits<uchar>::functor_type)
{ {
enum { smart_block_dim_y = 8 }; enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 }; enum { smart_shift = 4 };
}; };
template <> struct TransformFunctorTraits<YCrCb4_to_bgra_traits<uchar>::functor_type> : DefaultTransformFunctorTraits<YCrCb4_to_bgra_traits<uchar>::functor_type> DEFINE_TRANSFORM_FUNCTOR_TRAITS(YCrCb4_to_bgra_traits<uchar>::functor_type)
{ {
enum { smart_block_dim_y = 8 }; enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 }; enum { smart_shift = 4 };
}; };
template <> struct TransformFunctorTraits<YCrCb4_to_rgba_traits<uchar>::functor_type> : DefaultTransformFunctorTraits<YCrCb4_to_rgba_traits<uchar>::functor_type> DEFINE_TRANSFORM_FUNCTOR_TRAITS(YCrCb4_to_rgba_traits<uchar>::functor_type)
{ {
enum { smart_block_dim_y = 8 }; enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 }; enum { smart_shift = 4 };
}; };
template <> struct TransformFunctorTraits<bgra_to_xyz4_traits<uchar>::functor_type> : DefaultTransformFunctorTraits<bgra_to_xyz4_traits<uchar>::functor_type> DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgra_to_xyz4_traits<uchar>::functor_type)
{ {
enum { smart_block_dim_y = 8 }; enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 }; enum { smart_shift = 4 };
}; };
template <> struct TransformFunctorTraits<rgba_to_xyz4_traits<uchar>::functor_type> : DefaultTransformFunctorTraits<rgba_to_xyz4_traits<uchar>::functor_type> DEFINE_TRANSFORM_FUNCTOR_TRAITS(rgba_to_xyz4_traits<uchar>::functor_type)
{ {
enum { smart_block_dim_y = 8 }; enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 }; enum { smart_shift = 4 };
}; };
template <> struct TransformFunctorTraits<xyz4_to_bgra_traits<uchar>::functor_type> : DefaultTransformFunctorTraits<xyz4_to_bgra_traits<uchar>::functor_type> DEFINE_TRANSFORM_FUNCTOR_TRAITS(xyz4_to_bgra_traits<uchar>::functor_type)
{ {
enum { smart_block_dim_y = 8 }; enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 }; enum { smart_shift = 4 };
}; };
template <> struct TransformFunctorTraits<xyz4_to_rgba_traits<uchar>::functor_type> : DefaultTransformFunctorTraits<xyz4_to_rgba_traits<uchar>::functor_type> DEFINE_TRANSFORM_FUNCTOR_TRAITS(xyz4_to_rgba_traits<uchar>::functor_type)
{ {
enum { smart_block_dim_y = 8 }; enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 }; enum { smart_shift = 4 };
}; };
template <> struct TransformFunctorTraits<bgra_to_hsv4_traits<uchar>::functor_type> : DefaultTransformFunctorTraits<bgra_to_hsv4_traits<uchar>::functor_type> DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgra_to_hsv4_traits<uchar>::functor_type)
{ {
enum { smart_block_dim_y = 8 }; enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 }; enum { smart_shift = 4 };
}; };
template <> struct TransformFunctorTraits<rgba_to_hsv4_traits<uchar>::functor_type> : DefaultTransformFunctorTraits<rgba_to_hsv4_traits<uchar>::functor_type> DEFINE_TRANSFORM_FUNCTOR_TRAITS(rgba_to_hsv4_traits<uchar>::functor_type)
{ {
enum { smart_block_dim_y = 8 }; enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 }; enum { smart_shift = 4 };
}; };
template <> struct TransformFunctorTraits<hsv4_to_bgra_traits<uchar>::functor_type> : DefaultTransformFunctorTraits<hsv4_to_bgra_traits<uchar>::functor_type> DEFINE_TRANSFORM_FUNCTOR_TRAITS(hsv4_to_bgra_traits<uchar>::functor_type)
{ {
enum { smart_block_dim_y = 8 }; enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 }; enum { smart_shift = 4 };
}; };
template <> struct TransformFunctorTraits<hsv4_to_rgba_traits<uchar>::functor_type> : DefaultTransformFunctorTraits<hsv4_to_rgba_traits<uchar>::functor_type> DEFINE_TRANSFORM_FUNCTOR_TRAITS(hsv4_to_rgba_traits<uchar>::functor_type)
{ {
enum { smart_block_dim_y = 8 }; enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 }; enum { smart_shift = 4 };
}; };
template <> struct TransformFunctorTraits<bgra_to_hls4_traits<uchar>::functor_type> : DefaultTransformFunctorTraits<bgra_to_hls4_traits<uchar>::functor_type> DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgra_to_hls4_traits<uchar>::functor_type)
{ {
enum { smart_block_dim_y = 8 }; enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 }; enum { smart_shift = 4 };
}; };
template <> struct TransformFunctorTraits<rgba_to_hls4_traits<uchar>::functor_type> : DefaultTransformFunctorTraits<rgba_to_hls4_traits<uchar>::functor_type> DEFINE_TRANSFORM_FUNCTOR_TRAITS(rgba_to_hls4_traits<uchar>::functor_type)
{ {
enum { smart_block_dim_y = 8 }; enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 }; enum { smart_shift = 4 };
}; };
template <> struct TransformFunctorTraits<hls4_to_bgra_traits<uchar>::functor_type> : DefaultTransformFunctorTraits<hls4_to_bgra_traits<uchar>::functor_type> DEFINE_TRANSFORM_FUNCTOR_TRAITS(hls4_to_bgra_traits<uchar>::functor_type)
{ {
enum { smart_block_dim_y = 8 }; enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 }; enum { smart_shift = 4 };
}; };
template <> struct TransformFunctorTraits<hls4_to_rgba_traits<uchar>::functor_type> : DefaultTransformFunctorTraits<hls4_to_rgba_traits<uchar>::functor_type> DEFINE_TRANSFORM_FUNCTOR_TRAITS(hls4_to_rgba_traits<uchar>::functor_type)
{ {
enum { smart_block_dim_y = 8 }; enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 }; enum { smart_shift = 4 };
}; };
#define OPENCV_GPU_IMPLEMENT_CVTCOLOR(name, traits) \ #define OPENCV_GPU_IMPLEMENT_CVTCOLOR(name, traits) \
void name(const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream) \ void name(const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream) \
{ \ { \
traits::functor_type functor = traits::create_functor(); \ traits::functor_type functor = traits::create_functor(); \
typedef typename traits::functor_type::argument_type src_t; \ typedef typename traits::functor_type::argument_type src_t; \
typedef typename traits::functor_type::result_type dst_t; \ typedef typename traits::functor_type::result_type dst_t; \
transform((DevMem2D_<src_t>)src, (DevMem2D_<dst_t>)dst, functor, stream); \ OPENCV_DEVICE_NAMESPACE_ transform((DevMem2D_<src_t>)src, (DevMem2D_<dst_t>)dst, functor, stream); \
} }
#define OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(name) \ #define OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(name) \
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name, name ## _traits) OPENCV_GPU_IMPLEMENT_CVTCOLOR(name, name ## _traits)
#define OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(name) \ #define OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(name) \
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _8u, name ## _traits<uchar>) \ OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _8u, name ## _traits<uchar>) \
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _16u, name ## _traits<ushort>) \ OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _16u, name ## _traits<ushort>) \
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _32f, name ## _traits<float>) OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _32f, name ## _traits<float>)
#define OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(name) \ #define OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(name) \
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _8u, name ## _traits<uchar>) \ OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _8u, name ## _traits<uchar>) \
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _32f, name ## _traits<float>) \ OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _32f, name ## _traits<float>) \
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _full_8u, name ## _full_traits<uchar>) \ OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _full_8u, name ## _full_traits<uchar>) \
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _full_32f, name ## _full_traits<float>) OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _full_32f, name ## _full_traits<float>)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_rgb) OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_bgra) OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_rgba) OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_bgr) OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_rgb) OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_rgba) OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr_to_bgr555) OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr_to_bgr555)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr_to_bgr565) OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr_to_bgr565)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgb_to_bgr555) OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgb_to_bgr555)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgb_to_bgr565) OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgb_to_bgr565)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgra_to_bgr555) OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgra_to_bgr555)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgra_to_bgr565) OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgra_to_bgr565)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgba_to_bgr555) OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgba_to_bgr555)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgba_to_bgr565) OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgba_to_bgr565)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_rgb) OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_rgb) OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_bgr) OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_bgr) OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_rgba) OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_rgba) OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_bgra) OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_bgra) OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(gray_to_bgr) OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(gray_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(gray_to_bgra) OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(gray_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(gray_to_bgr555) OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(gray_to_bgr555)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(gray_to_bgr565) OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(gray_to_bgr565)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_gray) OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_gray)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_gray) OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_gray)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_gray) OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_gray)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_gray) OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_gray)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_gray) OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_gray)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_gray) OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_gray)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_yuv) OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_yuv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_yuv) OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_yuv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_yuv4) OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_yuv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_yuv4) OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_yuv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_yuv) OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_yuv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_yuv) OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_yuv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_yuv4) OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_yuv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_yuv4) OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_yuv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_rgb) OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_rgba) OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_rgb) OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_rgba) OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_bgr) OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_bgra) OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_bgr) OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_bgra) OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_YCrCb) OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_YCrCb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_YCrCb) OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_YCrCb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_YCrCb4) OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_YCrCb4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_YCrCb4) OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_YCrCb4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_YCrCb) OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_YCrCb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_YCrCb) OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_YCrCb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_YCrCb4) OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_YCrCb4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_YCrCb4) OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_YCrCb4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_rgb) OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_rgba) OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_rgb) OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_rgba) OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_bgr) OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_bgra) OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_bgr) OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_bgra) OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_xyz) OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_xyz)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_xyz) OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_xyz)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_xyz4) OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_xyz4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_xyz4) OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_xyz4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_xyz) OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_xyz)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_xyz) OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_xyz)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_xyz4) OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_xyz4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_xyz4) OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_xyz4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_rgb) OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_rgb) OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_rgba) OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_rgba) OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_bgr) OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_bgr) OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_bgra) OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_bgra) OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hsv) OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hsv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hsv) OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hsv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hsv4) OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hsv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hsv4) OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hsv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hsv) OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hsv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hsv) OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hsv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hsv4) OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hsv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hsv4) OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hsv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_rgb) OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_rgba) OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_rgb) OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_rgba) OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_bgr) OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_bgra) OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_bgr) OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_bgra) OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hls) OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hls)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hls) OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hls)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hls4) OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hls4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hls4) OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hls4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hls) OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hls)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hls) OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hls)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hls4) OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hls4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hls4) OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hls4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_rgb) OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_rgba) OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_rgb) OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_rgba) OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_bgr) OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_bgra) OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_bgr) OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_bgra) OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_bgra)
#undef OPENCV_GPU_IMPLEMENT_CVTCOLOR #undef OPENCV_GPU_IMPLEMENT_CVTCOLOR
#undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE #undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE
#undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL #undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL
#undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F #undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F
}}}
END_OPENCV_DEVICE_NAMESPACE
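The DEFINE_TRANSFORM_FUNCTOR_TRAITS lines above collapse the repeated template<> struct TransformFunctorTraits<...> : DefaultTransformFunctorTraits<...> specializations into a macro, leaving only the tuned enums per functor. A simplified, made-up sketch of that macro-stamped trait specialization (not the actual OpenCV macro):

#include <cstdio>

template <typename F> struct KernelTraits
{
    enum { block_dim_x = 32 };   // defaults used when no specialization exists
    enum { block_dim_y = 8 };
};

// The macro only opens the specialization; the body supplies the overrides.
#define DEFINE_KERNEL_TRAITS(functor) \
    template <> struct KernelTraits<functor>

struct WideFunctor {};

DEFINE_KERNEL_TRAITS(WideFunctor)
{
    enum { block_dim_x = 8 };    // override only what differs from the default
    enum { block_dim_y = 8 };
};

int main()
{
    printf("%d %d\n", (int)KernelTraits<int>::block_dim_x,
                      (int)KernelTraits<WideFunctor>::block_dim_x);  // 32 8
    return 0;
}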
View File
@ -47,8 +47,7 @@
#include "opencv2/gpu/device/limits.hpp" #include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/border_interpolate.hpp" #include "opencv2/gpu/device/border_interpolate.hpp"
using namespace cv::gpu; BEGIN_OPENCV_DEVICE_NAMESPACE
using namespace cv::gpu::device;
#define MAX_KERNEL_SIZE 16 #define MAX_KERNEL_SIZE 16
#define BLOCK_DIM_X 16 #define BLOCK_DIM_X 16
@ -56,18 +55,18 @@ using namespace cv::gpu::device;
#define RESULT_STEPS 8 #define RESULT_STEPS 8
#define HALO_STEPS 1 #define HALO_STEPS 1
namespace filter_column namespace column_filter {
__constant__ float c_kernel[MAX_KERNEL_SIZE];
void loadKernel(const float kernel[], int ksize)
{ {
__constant__ float c_kernel[MAX_KERNEL_SIZE];
void loadKernel(const float kernel[], int ksize)
{
cudaSafeCall( cudaMemcpyToSymbol(c_kernel, kernel, ksize * sizeof(float)) ); cudaSafeCall( cudaMemcpyToSymbol(c_kernel, kernel, ksize * sizeof(float)) );
} }
template <int KERNEL_SIZE, typename T, typename D, typename B> template <int KERNEL_SIZE, typename T, typename D, typename B>
__global__ void linearColumnFilter(const DevMem2D_<T> src, PtrStep<D> dst, int anchor, const B b) __global__ void linearColumnFilter(const DevMem2D_<T> src, PtrStep<D> dst, int anchor, const B b)
{ {
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t; typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;
__shared__ T smem[BLOCK_DIM_X][(RESULT_STEPS + 2 * HALO_STEPS) * BLOCK_DIM_Y + 1]; __shared__ T smem[BLOCK_DIM_X][(RESULT_STEPS + 2 * HALO_STEPS) * BLOCK_DIM_Y + 1];
@ -112,29 +111,26 @@ namespace filter_column
dst.ptr(dstY)[x] = saturate_cast<D>(sum); dst.ptr(dstY)[x] = saturate_cast<D>(sum);
} }
} }
}
} }
namespace cv { namespace gpu { namespace filters template <int ksize, typename T, typename D, template<typename> class B>
void linearColumnFilter_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream)
{ {
template <int ksize, typename T, typename D, template<typename> class B>
void linearColumnFilter_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream)
{
const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y); const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y);
const dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, RESULT_STEPS * BLOCK_DIM_Y)); const dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, RESULT_STEPS * BLOCK_DIM_Y));
B<T> b(src.rows); B<T> b(src.rows);
filter_column::linearColumnFilter<ksize, T, D><<<grid, block, 0, stream>>>(src, dst, anchor, b); linearColumnFilter<ksize, T, D><<<grid, block, 0, stream>>>(src, dst, anchor, b);
cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaGetLastError() );
if (stream == 0) if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaDeviceSynchronize() );
} }
template <typename T, typename D> template <typename T, typename D>
void linearColumnFilter_gpu(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream) void linearColumnFilter_gpu(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream)
{ {
typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream); typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream);
static const caller_t callers[5][17] = static const caller_t callers[5][17] =
{ {
@ -235,16 +231,19 @@ namespace cv { namespace gpu { namespace filters
} }
}; };
filter_column::loadKernel(kernel, ksize); loadKernel(kernel, ksize);
callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor, stream); callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor, stream);
} }
template void linearColumnFilter_gpu<float , uchar >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
template void linearColumnFilter_gpu<float4, uchar4>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
//template void linearColumnFilter_gpu<float , short >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
//template void linearColumnFilter_gpu<float2, short2>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
template void linearColumnFilter_gpu<float3, short3>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
template void linearColumnFilter_gpu<float , int >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
template void linearColumnFilter_gpu<float , float >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
}}}
} // namespace column_filter
END_OPENCV_DEVICE_NAMESPACE
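
The column-filter rewrite keeps the existing run-time dispatch: linearColumnFilter_gpu looks a caller up in a static table indexed by border type and kernel size, and each entry is one template instantiation of linearColumnFilter_caller. The following host-only sketch shows the same table-of-function-pointers pattern in reduced form; the names, the 2x4 table (instead of 5x17), and the identity "filter" are stand-ins for illustration, not OpenCV code.

    #include <cstdio>

    template <int KSIZE>
    void filterCaller(const float* src, float* dst, int n, int anchor)
    {
        // stand-in for launching the real linearColumnFilter<KSIZE, ...> kernel
        for (int i = 0; i < n; ++i)
            dst[i] = src[i];                                  // identity "filter" for the sketch
        std::printf("ran ksize=%d anchor=%d\n", KSIZE, anchor);
    }

    typedef void (*caller_t)(const float*, float*, int, int);

    void linearFilterDispatch(const float* src, float* dst, int n,
                              int ksize, int anchor, int brdType)
    {
        // mirrors the callers[5][17] table above: row = border mode, column = kernel size
        static const caller_t callers[2][4] =
        {
            { 0, filterCaller<1>, filterCaller<2>, filterCaller<3> },
            { 0, filterCaller<1>, filterCaller<2>, filterCaller<3> }
        };
        callers[brdType][ksize](src, dst, n, anchor);
    }

    int main()
    {
        float src[4] = {1, 2, 3, 4}, dst[4];
        linearFilterDispatch(src, dst, 4, 3, 1, 0);           // picks filterCaller<3>
        return 0;
    }

In the real code the table covers five border modes and kernel sizes up to MAX_KERNEL_SIZE (16), with instantiations over the uchar/short/int/float source and destination types listed above.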

View File

@ -43,22 +43,21 @@
#include "internal_shared.hpp" #include "internal_shared.hpp"
#include "opencv2/gpu/device/border_interpolate.hpp" #include "opencv2/gpu/device/border_interpolate.hpp"
using namespace cv::gpu; BEGIN_OPENCV_DEVICE_NAMESPACE
using namespace cv::gpu::device;
namespace cv { namespace gpu { namespace imgproc namespace copy_make_border {
template <typename Ptr2D, typename T> __global__ void copyMakeBorder(const Ptr2D src, DevMem2D_<T> dst, int top, int left)
{ {
template <typename Ptr2D, typename T> __global__ void copyMakeBorder(const Ptr2D src, DevMem2D_<T> dst, int top, int left)
{
const int x = blockDim.x * blockIdx.x + threadIdx.x; const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y; const int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < dst.cols && y < dst.rows) if (x < dst.cols && y < dst.rows)
dst.ptr(y)[x] = src(y - top, x - left); dst.ptr(y)[x] = src(y - top, x - left);
} }
template <template <typename> class B, typename T> struct CopyMakeBorderDispatcher template <template <typename> class B, typename T> struct CopyMakeBorderDispatcher
{ {
static void call(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, int top, int left, static void call(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, int top, int left,
const typename VecTraits<T>::elem_type* borderValue, cudaStream_t stream) const typename VecTraits<T>::elem_type* borderValue, cudaStream_t stream)
{ {
@ -74,11 +73,11 @@ namespace cv { namespace gpu { namespace imgproc
if (stream == 0) if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaDeviceSynchronize() );
} }
}; };
template <typename T, int cn> void copyMakeBorder_gpu(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, template <typename T, int cn> void copyMakeBorder_gpu(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode,
const T* borderValue, cudaStream_t stream) const T* borderValue, cudaStream_t stream)
{ {
typedef typename TypeVec<T, cn>::vec_type vec_type; typedef typename TypeVec<T, cn>::vec_type vec_type;
typedef void (*caller_t)(const DevMem2D_<vec_type>& src, const DevMem2D_<vec_type>& dst, int top, int left, const T* borderValue, cudaStream_t stream); typedef void (*caller_t)(const DevMem2D_<vec_type>& src, const DevMem2D_<vec_type>& dst, int top, int left, const T* borderValue, cudaStream_t stream);
@ -93,35 +92,38 @@ namespace cv { namespace gpu { namespace imgproc
}; };
callers[borderMode](DevMem2D_<vec_type>(src), DevMem2D_<vec_type>(dst), top, left, borderValue, stream); callers[borderMode](DevMem2D_<vec_type>(src), DevMem2D_<vec_type>(dst), top, left, borderValue, stream);
} }
template void copyMakeBorder_gpu<uchar, 1>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<uchar, 2>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<uchar, 3>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<uchar, 4>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<schar, 1>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<schar, 2>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<schar, 3>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<schar, 4>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<ushort, 1>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<ushort, 2>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<ushort, 3>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<ushort, 4>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<short, 1>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<short, 2>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<short, 3>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<short, 4>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<int, 1>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<int, 2>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<int, 3>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<int, 4>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<float, 1>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<float, 2>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<float, 3>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<float, 4>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);
}}}
} // namespace copy_make_border
END_OPENCV_DEVICE_NAMESPACE
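
The kernel above writes dst(y, x) = src(y - top, x - left) and leaves all out-of-range reads to the border object passed as the Ptr2D template parameter. A small CPU-side sketch of the same indexing with a replicate border (clamp to the nearest valid pixel) may make the offset arithmetic easier to follow; the function and variable names are illustrative, not OpenCV API.

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    // BORDER_REPLICATE-style clamp: out-of-range coordinates map to the edge pixel.
    int clampIdx(int p, int len) { return std::max(0, std::min(p, len - 1)); }

    void copyMakeBorderReplicate(const std::vector<float>& src, int srcRows, int srcCols,
                                 std::vector<float>& dst, int top, int left,
                                 int dstRows, int dstCols)
    {
        dst.assign(dstRows * dstCols, 0.f);
        for (int y = 0; y < dstRows; ++y)
            for (int x = 0; x < dstCols; ++x)
            {
                int sy = clampIdx(y - top, srcRows);      // same shift as the CUDA kernel
                int sx = clampIdx(x - left, srcCols);
                dst[y * dstCols + x] = src[sy * srcCols + sx];
            }
    }

    int main()
    {
        std::vector<float> src = {1, 2, 3, 4};            // 2x2 source image
        std::vector<float> dst;
        copyMakeBorderReplicate(src, 2, 2, dst, 1, 1, 4, 4);
        std::printf("dst(0,0)=%g\n", dst[0]);             // replicated top-left corner -> 1
        return 0;
    }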

File diff suppressed because it is too large

View File

@ -45,9 +45,7 @@
#include "opencv2/gpu/device/utility.hpp" #include "opencv2/gpu/device/utility.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp" #include "opencv2/gpu/device/saturate_cast.hpp"
using namespace cv::gpu; BEGIN_OPENCV_DEVICE_NAMESPACE
using namespace cv::gpu::device;
#define UINT_BITS 32U #define UINT_BITS 32U
@ -67,9 +65,9 @@ using namespace cv::gpu::device;
#define USE_SMEM_ATOMICS (__CUDA_ARCH__ >= 120) #define USE_SMEM_ATOMICS (__CUDA_ARCH__ >= 120)
namespace cv { namespace gpu { namespace histograms namespace hist {
{
#if (!USE_SMEM_ATOMICS) #if (!USE_SMEM_ATOMICS)
#define TAG_MASK ( (1U << (UINT_BITS - OPENCV_GPU_LOG_WARP_SIZE)) - 1U ) #define TAG_MASK ( (1U << (UINT_BITS - OPENCV_GPU_LOG_WARP_SIZE)) - 1U )
@ -84,7 +82,7 @@ namespace cv { namespace gpu { namespace histograms
} while (s_WarpHist[data] != count); } while (s_WarpHist[data] != count);
} }
#else #else
#define TAG_MASK 0xFFFFFFFFU #define TAG_MASK 0xFFFFFFFFU
@ -93,20 +91,20 @@ namespace cv { namespace gpu { namespace histograms
atomicAdd(s_WarpHist + data, 1); atomicAdd(s_WarpHist + data, 1);
} }
#endif #endif
__forceinline__ __device__ void addWord(uint* s_WarpHist, uint data, uint tag, uint pos_x, uint cols) __forceinline__ __device__ void addWord(uint* s_WarpHist, uint data, uint tag, uint pos_x, uint cols)
{ {
uint x = pos_x << 2; uint x = pos_x << 2;
if (x + 0 < cols) addByte(s_WarpHist, (data >> 0) & 0xFFU, tag); if (x + 0 < cols) addByte(s_WarpHist, (data >> 0) & 0xFFU, tag);
if (x + 1 < cols) addByte(s_WarpHist, (data >> 8) & 0xFFU, tag); if (x + 1 < cols) addByte(s_WarpHist, (data >> 8) & 0xFFU, tag);
if (x + 2 < cols) addByte(s_WarpHist, (data >> 16) & 0xFFU, tag); if (x + 2 < cols) addByte(s_WarpHist, (data >> 16) & 0xFFU, tag);
if (x + 3 < cols) addByte(s_WarpHist, (data >> 24) & 0xFFU, tag); if (x + 3 < cols) addByte(s_WarpHist, (data >> 24) & 0xFFU, tag);
} }
__global__ void histogram256(const PtrStep<uint> d_Data, uint* d_PartialHistograms, uint dataCount, uint cols) __global__ void histogram256(const PtrStep<uint> d_Data, uint* d_PartialHistograms, uint dataCount, uint cols)
{ {
//Per-warp subhistogram storage //Per-warp subhistogram storage
__shared__ uint s_Hist[HISTOGRAM256_THREADBLOCK_MEMORY]; __shared__ uint s_Hist[HISTOGRAM256_THREADBLOCK_MEMORY];
uint* s_WarpHist= s_Hist + (threadIdx.x >> OPENCV_GPU_LOG_WARP_SIZE) * HISTOGRAM256_BIN_COUNT; uint* s_WarpHist= s_Hist + (threadIdx.x >> OPENCV_GPU_LOG_WARP_SIZE) * HISTOGRAM256_BIN_COUNT;
@ -140,17 +138,17 @@ namespace cv { namespace gpu { namespace histograms
d_PartialHistograms[blockIdx.x * HISTOGRAM256_BIN_COUNT + bin] = sum; d_PartialHistograms[blockIdx.x * HISTOGRAM256_BIN_COUNT + bin] = sum;
} }
} }
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Merge histogram256() output // Merge histogram256() output
// Run one threadblock per bin; each threadblock adds up the same bin counter // Run one threadblock per bin; each threadblock adds up the same bin counter
// from every partial histogram. Reads are uncoalesced, but mergeHistogram256 // from every partial histogram. Reads are uncoalesced, but mergeHistogram256
// takes only a fraction of total processing time // takes only a fraction of total processing time
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
__global__ void mergeHistogram256(const uint* d_PartialHistograms, int* d_Histogram) __global__ void mergeHistogram256(const uint* d_PartialHistograms, int* d_Histogram)
{ {
uint sum = 0; uint sum = 0;
#pragma unroll #pragma unroll
@ -169,10 +167,10 @@ namespace cv { namespace gpu { namespace histograms
if(threadIdx.x == 0) if(threadIdx.x == 0)
d_Histogram[blockIdx.x] = saturate_cast<int>(data[0]); d_Histogram[blockIdx.x] = saturate_cast<int>(data[0]);
} }
void histogram256_gpu(DevMem2Db src, int* hist, uint* buf, cudaStream_t stream) void histogram256_gpu(DevMem2Db src, int* hist, uint* buf, cudaStream_t stream)
{ {
histogram256<<<PARTIAL_HISTOGRAM256_COUNT, HISTOGRAM256_THREADBLOCK_SIZE, 0, stream>>>( histogram256<<<PARTIAL_HISTOGRAM256_COUNT, HISTOGRAM256_THREADBLOCK_SIZE, 0, stream>>>(
DevMem2D_<uint>(src), DevMem2D_<uint>(src),
buf, buf,
@ -187,12 +185,12 @@ namespace cv { namespace gpu { namespace histograms
if (stream == 0) if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaDeviceSynchronize() );
} }
__constant__ int c_lut[256]; __constant__ int c_lut[256];
__global__ void equalizeHist(const DevMem2Db src, PtrStepb dst) __global__ void equalizeHist(const DevMem2Db src, PtrStepb dst)
{ {
const int x = blockIdx.x * blockDim.x + threadIdx.x; const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y; const int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -202,19 +200,22 @@ namespace cv { namespace gpu { namespace histograms
const int lut = c_lut[val]; const int lut = c_lut[val];
dst.ptr(y)[x] = __float2int_rn(255.0f / (src.cols * src.rows) * lut); dst.ptr(y)[x] = __float2int_rn(255.0f / (src.cols * src.rows) * lut);
} }
} }
void equalizeHist_gpu(DevMem2Db src, DevMem2Db dst, const int* lut, cudaStream_t stream) void equalizeHist_gpu(DevMem2Db src, DevMem2Db dst, const int* lut, cudaStream_t stream)
{ {
dim3 block(16, 16); dim3 block(16, 16);
dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y)); dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
cudaSafeCall( cudaMemcpyToSymbol(cv::gpu::histograms::c_lut, lut, 256 * sizeof(int), 0, cudaMemcpyDeviceToDevice) ); cudaSafeCall( cudaMemcpyToSymbol(c_lut, lut, 256 * sizeof(int), 0, cudaMemcpyDeviceToDevice) );
equalizeHist<<<grid, block, 0, stream>>>(src, dst); equalizeHist<<<grid, block, 0, stream>>>(src, dst);
cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaGetLastError() );
if (stream == 0) if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaDeviceSynchronize() );
} }
}}}
} // namespace hist
END_OPENCV_DEVICE_NAMESPACE
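
The equalizeHist kernel above is a pure look-up: each output pixel is the cumulative-histogram entry for its input value, scaled by 255 / (cols * rows) and rounded to the nearest integer (__float2int_rn). Here is a host-side sketch of that mapping under the assumption that lut[v] holds the cumulative count for value v, exactly what the GPU path uploads into c_lut; names are hypothetical.

    #include <cmath>
    #include <cstddef>
    #include <cstdio>
    #include <vector>

    void equalizeHistCpu(const std::vector<unsigned char>& src,
                         std::vector<unsigned char>& dst,
                         const int lut[256], int rows, int cols)
    {
        dst.resize(src.size());
        const float scale = 255.0f / (cols * rows);
        for (std::size_t i = 0; i < src.size(); ++i)
            dst[i] = (unsigned char)std::lround(scale * lut[src[i]]);   // round-to-nearest, like __float2int_rn
    }

    int main()
    {
        int lut[256] = {0};
        lut[7] = 2;                                    // 2 of 4 pixels are <= 7 in this toy image
        std::vector<unsigned char> src(1, 7), dst;
        equalizeHistCpu(src, dst, lut, 2, 2);          // 2x2 "image"
        std::printf("%d\n", dst[0]);                   // prints 128 (= lround(255/4 * 2))
        return 0;
    }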

View File

@ -42,13 +42,15 @@
#include "internal_shared.hpp" #include "internal_shared.hpp"
BEGIN_OPENCV_DEVICE_NAMESPACE
// Other values are not supported // Other values are not supported
#define CELL_WIDTH 8 #define CELL_WIDTH 8
#define CELL_HEIGHT 8 #define CELL_HEIGHT 8
#define CELLS_PER_BLOCK_X 2 #define CELLS_PER_BLOCK_X 2
#define CELLS_PER_BLOCK_Y 2 #define CELLS_PER_BLOCK_Y 2
namespace cv { namespace gpu { namespace hog { namespace hog {
__constant__ int cnbins; __constant__ int cnbins;
__constant__ int cblock_stride_x; __constant__ int cblock_stride_x;
@ -83,23 +85,23 @@ int power_2up(unsigned int n)
void set_up_constants(int nbins, int block_stride_x, int block_stride_y, void set_up_constants(int nbins, int block_stride_x, int block_stride_y,
int nblocks_win_x, int nblocks_win_y) int nblocks_win_x, int nblocks_win_y)
{ {
uploadConstant("cv::gpu::hog::cnbins", nbins); cudaSafeCall( cudaMemcpyToSymbol(cnbins, &nbins, sizeof(nbins)) );
uploadConstant("cv::gpu::hog::cblock_stride_x", block_stride_x); cudaSafeCall( cudaMemcpyToSymbol(cblock_stride_x, &block_stride_x, sizeof(block_stride_x)) );
uploadConstant("cv::gpu::hog::cblock_stride_y", block_stride_y); cudaSafeCall( cudaMemcpyToSymbol(cblock_stride_y, &block_stride_y, sizeof(block_stride_y)) );
uploadConstant("cv::gpu::hog::cnblocks_win_x", nblocks_win_x); cudaSafeCall( cudaMemcpyToSymbol(cnblocks_win_x, &nblocks_win_x, sizeof(nblocks_win_x)) );
uploadConstant("cv::gpu::hog::cnblocks_win_y", nblocks_win_y); cudaSafeCall( cudaMemcpyToSymbol(cnblocks_win_y, &nblocks_win_y, sizeof(nblocks_win_y)) );
int block_hist_size = nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y; int block_hist_size = nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y;
uploadConstant("cv::gpu::hog::cblock_hist_size", block_hist_size); cudaSafeCall( cudaMemcpyToSymbol(cblock_hist_size, &block_hist_size, sizeof(block_hist_size)) );
int block_hist_size_2up = power_2up(block_hist_size); int block_hist_size_2up = power_2up(block_hist_size);
uploadConstant("cv::gpu::hog::cblock_hist_size_2up", block_hist_size_2up); cudaSafeCall( cudaMemcpyToSymbol(cblock_hist_size_2up, &block_hist_size_2up, sizeof(block_hist_size_2up)) );
int descr_width = nblocks_win_x * block_hist_size; int descr_width = nblocks_win_x * block_hist_size;
uploadConstant("cv::gpu::hog::cdescr_width", descr_width); cudaSafeCall( cudaMemcpyToSymbol(cdescr_width, &descr_width, sizeof(descr_width)) );
int descr_size = descr_width * nblocks_win_y; int descr_size = descr_width * nblocks_win_y;
uploadConstant("cv::gpu::hog::cdescr_size", descr_size); cudaSafeCall( cudaMemcpyToSymbol(cdescr_size, &descr_size, sizeof(descr_size)) );
} }
@ -153,10 +155,10 @@ __global__ void compute_hists_kernel_many_blocks(const int img_block_width, cons
int dist_center_y = dist_y - 4 * (1 - 2 * cell_y); int dist_center_y = dist_y - 4 * (1 - 2 * cell_y);
int dist_center_x = dist_x - 4 * (1 - 2 * cell_x); int dist_center_x = dist_x - 4 * (1 - 2 * cell_x);
float gaussian = expf(-(dist_center_y * dist_center_y + float gaussian = ::expf(-(dist_center_y * dist_center_y +
dist_center_x * dist_center_x) * scale); dist_center_x * dist_center_x) * scale);
float interp_weight = (8.f - fabs(dist_y + 0.5f)) * float interp_weight = (8.f - ::fabs(dist_y + 0.5f)) *
(8.f - fabs(dist_x + 0.5f)) / 64.f; (8.f - ::fabs(dist_x + 0.5f)) / 64.f;
hist[bin.x * 48 * nblocks] += gaussian * interp_weight * vote.x; hist[bin.x * 48 * nblocks] += gaussian * interp_weight * vote.x;
hist[bin.y * 48 * nblocks] += gaussian * interp_weight * vote.y; hist[bin.y * 48 * nblocks] += gaussian * interp_weight * vote.y;
@ -273,15 +275,15 @@ __global__ void normalize_hists_kernel_many_blocks(const int block_hist_size,
__syncthreads(); __syncthreads();
float sum = reduce_smem<nthreads>(squares); float sum = reduce_smem<nthreads>(squares);
float scale = 1.0f / (sqrtf(sum) + 0.1f * block_hist_size); float scale = 1.0f / (::sqrtf(sum) + 0.1f * block_hist_size);
elem = min(elem * scale, threshold); elem = ::min(elem * scale, threshold);
__syncthreads(); __syncthreads();
squares[threadIdx.x] = elem * elem; squares[threadIdx.x] = elem * elem;
__syncthreads(); __syncthreads();
sum = reduce_smem<nthreads>(squares); sum = reduce_smem<nthreads>(squares);
scale = 1.0f / (sqrtf(sum) + 1e-3f); scale = 1.0f / (::sqrtf(sum) + 1e-3f);
if (threadIdx.x < block_hist_size) if (threadIdx.x < block_hist_size)
hist[0] = elem * scale; hist[0] = elem * scale;
@ -533,7 +535,7 @@ __global__ void compute_gradients_8UC4_kernel(int height, int width, const PtrEl
if (threadIdx.x == 0) if (threadIdx.x == 0)
{ {
val = row[max(x - 1, 1)]; val = row[::max(x - 1, 1)];
sh_row[0] = val.x; sh_row[0] = val.x;
sh_row[(nthreads + 2)] = val.y; sh_row[(nthreads + 2)] = val.y;
sh_row[2 * (nthreads + 2)] = val.z; sh_row[2 * (nthreads + 2)] = val.z;
@ -541,7 +543,7 @@ __global__ void compute_gradients_8UC4_kernel(int height, int width, const PtrEl
if (threadIdx.x == blockDim.x - 1) if (threadIdx.x == blockDim.x - 1)
{ {
val = row[min(x + 1, width - 2)]; val = row[::min(x + 1, width - 2)];
sh_row[blockDim.x + 1] = val.x; sh_row[blockDim.x + 1] = val.x;
sh_row[blockDim.x + 1 + (nthreads + 2)] = val.y; sh_row[blockDim.x + 1 + (nthreads + 2)] = val.y;
sh_row[blockDim.x + 1 + 2 * (nthreads + 2)] = val.z; sh_row[blockDim.x + 1 + 2 * (nthreads + 2)] = val.z;
@ -561,7 +563,7 @@ __global__ void compute_gradients_8UC4_kernel(int height, int width, const PtrEl
float3 dx; float3 dx;
if (correct_gamma) if (correct_gamma)
dx = make_float3(sqrtf(b.x) - sqrtf(a.x), sqrtf(b.y) - sqrtf(a.y), sqrtf(b.z) - sqrtf(a.z)); dx = make_float3(::sqrtf(b.x) - ::sqrtf(a.x), ::sqrtf(b.y) - ::sqrtf(a.y), ::sqrtf(b.z) - ::sqrtf(a.z));
else else
dx = make_float3(b.x - a.x, b.y - a.y, b.z - a.z); dx = make_float3(b.x - a.x, b.y - a.y, b.z - a.z);
@ -576,7 +578,7 @@ __global__ void compute_gradients_8UC4_kernel(int height, int width, const PtrEl
b = make_float3(val.x, val.y, val.z); b = make_float3(val.x, val.y, val.z);
if (correct_gamma) if (correct_gamma)
dy = make_float3(sqrtf(b.x) - sqrtf(a.x), sqrtf(b.y) - sqrtf(a.y), sqrtf(b.z) - sqrtf(a.z)); dy = make_float3(::sqrtf(b.x) - ::sqrtf(a.x), ::sqrtf(b.y) - ::sqrtf(a.y), ::sqrtf(b.z) - ::sqrtf(a.z));
else else
dy = make_float3(b.x - a.x, b.y - a.y, b.z - a.z); dy = make_float3(b.x - a.x, b.y - a.y, b.z - a.z);
} }
@ -601,10 +603,10 @@ __global__ void compute_gradients_8UC4_kernel(int height, int width, const PtrEl
mag0 = mag1; mag0 = mag1;
} }
mag0 = sqrtf(mag0); mag0 = ::sqrtf(mag0);
float ang = (atan2f(best_dy, best_dx) + CV_PI_F) * angle_scale - 0.5f; float ang = (::atan2f(best_dy, best_dx) + CV_PI_F) * angle_scale - 0.5f;
int hidx = (int)floorf(ang); int hidx = (int)::floorf(ang);
ang -= hidx; ang -= hidx;
hidx = (hidx + cnbins) % cnbins; hidx = (hidx + cnbins) % cnbins;
@ -648,10 +650,10 @@ __global__ void compute_gradients_8UC1_kernel(int height, int width, const PtrEl
sh_row[threadIdx.x + 1] = row[width - 2]; sh_row[threadIdx.x + 1] = row[width - 2];
if (threadIdx.x == 0) if (threadIdx.x == 0)
sh_row[0] = row[max(x - 1, 1)]; sh_row[0] = row[::max(x - 1, 1)];
if (threadIdx.x == blockDim.x - 1) if (threadIdx.x == blockDim.x - 1)
sh_row[blockDim.x + 1] = row[min(x + 1, width - 2)]; sh_row[blockDim.x + 1] = row[::min(x + 1, width - 2)];
__syncthreads(); __syncthreads();
if (x < width) if (x < width)
@ -659,7 +661,7 @@ __global__ void compute_gradients_8UC1_kernel(int height, int width, const PtrEl
float dx; float dx;
if (correct_gamma) if (correct_gamma)
dx = sqrtf(sh_row[threadIdx.x + 2]) - sqrtf(sh_row[threadIdx.x]); dx = ::sqrtf(sh_row[threadIdx.x + 2]) - ::sqrtf(sh_row[threadIdx.x]);
else else
dx = sh_row[threadIdx.x + 2] - sh_row[threadIdx.x]; dx = sh_row[threadIdx.x + 2] - sh_row[threadIdx.x];
@ -669,14 +671,14 @@ __global__ void compute_gradients_8UC1_kernel(int height, int width, const PtrEl
float a = ((const unsigned char*)img.ptr(blockIdx.y + 1))[x]; float a = ((const unsigned char*)img.ptr(blockIdx.y + 1))[x];
float b = ((const unsigned char*)img.ptr(blockIdx.y - 1))[x]; float b = ((const unsigned char*)img.ptr(blockIdx.y - 1))[x];
if (correct_gamma) if (correct_gamma)
dy = sqrtf(a) - sqrtf(b); dy = ::sqrtf(a) - ::sqrtf(b);
else else
dy = a - b; dy = a - b;
} }
float mag = sqrtf(dx * dx + dy * dy); float mag = ::sqrtf(dx * dx + dy * dy);
float ang = (atan2f(dy, dx) + CV_PI_F) * angle_scale - 0.5f; float ang = (::atan2f(dy, dx) + CV_PI_F) * angle_scale - 0.5f;
int hidx = (int)floorf(ang); int hidx = (int)::floorf(ang);
ang -= hidx; ang -= hidx;
hidx = (hidx + cnbins) % cnbins; hidx = (hidx + cnbins) % cnbins;
@ -768,4 +770,6 @@ static void resize_for_hog(const DevMem2Db& src, DevMem2Db dst, TEX& tex)
void resize_8UC1(const DevMem2Db& src, DevMem2Db dst) { resize_for_hog<uchar> (src, dst, resize8UC1_tex); } void resize_8UC1(const DevMem2Db& src, DevMem2Db dst) { resize_for_hog<uchar> (src, dst, resize8UC1_tex); }
void resize_8UC4(const DevMem2Db& src, DevMem2Db dst) { resize_for_hog<uchar4>(src, dst, resize8UC4_tex); } void resize_8UC4(const DevMem2Db& src, DevMem2Db dst) { resize_for_hog<uchar4>(src, dst, resize8UC4_tex); }
}}} } // namespace hog
END_OPENCV_DEVICE_NAMESPACE
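
The HOG changes replace the old uploadConstant("cv::gpu::hog::cnbins", ...) helper, which located the __constant__ variable by its name string, with direct cudaMemcpyToSymbol calls on the variables themselves, which are type-checked and need no name lookup. A stand-alone sketch of that pattern follows; the symbol name cnbins_demo and the kernel are invented for the example.

    #include <cstdio>
    #include <cuda_runtime.h>

    __constant__ int cnbins_demo;                     // stand-in for the real cnbins symbol

    __global__ void readConstant(int* out) { *out = cnbins_demo; }

    int main()
    {
        int nbins = 9;
        // copy by symbol, not by mangled name string
        cudaMemcpyToSymbol(cnbins_demo, &nbins, sizeof(nbins));

        int* d_out = 0; int h_out = 0;
        cudaMalloc((void**)&d_out, sizeof(int));
        readConstant<<<1, 1>>>(d_out);
        cudaMemcpy(&h_out, d_out, sizeof(int), cudaMemcpyDeviceToHost);
        std::printf("cnbins on device = %d\n", h_out);
        cudaFree(d_out);
        return 0;
    }

The same commit also qualifies device math calls (::expf, ::sqrtf, ::min, ...) so they resolve to the CUDA intrinsics at global scope rather than to any host overloads pulled in by the headers.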

View File

@ -46,19 +46,18 @@
#include "opencv2/gpu/device/saturate_cast.hpp" #include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/border_interpolate.hpp" #include "opencv2/gpu/device/border_interpolate.hpp"
using namespace cv::gpu; BEGIN_OPENCV_DEVICE_NAMESPACE
using namespace cv::gpu::device;
namespace imgproc {
namespace cv { namespace gpu { namespace imgproc
{
/////////////////////////////////// MeanShiftfiltering /////////////////////////////////////////////// /////////////////////////////////// MeanShiftfiltering ///////////////////////////////////////////////
texture<uchar4, 2> tex_meanshift; texture<uchar4, 2> tex_meanshift;
__device__ short2 do_mean_shift(int x0, int y0, unsigned char* out, __device__ short2 do_mean_shift(int x0, int y0, unsigned char* out,
size_t out_step, int cols, int rows, size_t out_step, int cols, int rows,
int sp, int sr, int maxIter, float eps) int sp, int sr, int maxIter, float eps)
{ {
int isr2 = sr*sr; int isr2 = sr*sr;
uchar4 c = tex2D(tex_meanshift, x0, y0 ); uchar4 c = tex2D(tex_meanshift, x0, y0 );
@ -105,7 +104,7 @@ namespace cv { namespace gpu { namespace imgproc
int norm2 = (s0 - c.x) * (s0 - c.x) + (s1 - c.y) * (s1 - c.y) + (s2 - c.z) * (s2 - c.z); int norm2 = (s0 - c.x) * (s0 - c.x) + (s1 - c.y) * (s1 - c.y) + (s2 - c.z) * (s2 - c.z);
bool stopFlag = (x0 == x1 && y0 == y1) || (abs(x1-x0) + abs(y1-y0) + norm2 <= eps); bool stopFlag = (x0 == x1 && y0 == y1) || (::abs(x1-x0) + ::abs(y1-y0) + norm2 <= eps);
x0 = x1; y0 = y1; x0 = x1; y0 = y1;
c.x = s0; c.y = s1; c.z = s2; c.x = s0; c.y = s1; c.z = s2;
@ -118,22 +117,22 @@ namespace cv { namespace gpu { namespace imgproc
*(uchar4*)(out + base) = c; *(uchar4*)(out + base) = c;
return make_short2((short)x0, (short)y0); return make_short2((short)x0, (short)y0);
} }
__global__ void meanshift_kernel(unsigned char* out, size_t out_step, int cols, int rows, int sp, int sr, int maxIter, float eps ) __global__ void meanshift_kernel(unsigned char* out, size_t out_step, int cols, int rows, int sp, int sr, int maxIter, float eps )
{ {
int x0 = blockIdx.x * blockDim.x + threadIdx.x; int x0 = blockIdx.x * blockDim.x + threadIdx.x;
int y0 = blockIdx.y * blockDim.y + threadIdx.y; int y0 = blockIdx.y * blockDim.y + threadIdx.y;
if( x0 < cols && y0 < rows ) if( x0 < cols && y0 < rows )
do_mean_shift(x0, y0, out, out_step, cols, rows, sp, sr, maxIter, eps); do_mean_shift(x0, y0, out, out_step, cols, rows, sp, sr, maxIter, eps);
} }
__global__ void meanshiftproc_kernel(unsigned char* outr, size_t outrstep, __global__ void meanshiftproc_kernel(unsigned char* outr, size_t outrstep,
unsigned char* outsp, size_t outspstep, unsigned char* outsp, size_t outspstep,
int cols, int rows, int cols, int rows,
int sp, int sr, int maxIter, float eps) int sp, int sr, int maxIter, float eps)
{ {
int x0 = blockIdx.x * blockDim.x + threadIdx.x; int x0 = blockIdx.x * blockDim.x + threadIdx.x;
int y0 = blockIdx.y * blockDim.y + threadIdx.y; int y0 = blockIdx.y * blockDim.y + threadIdx.y;
@ -142,10 +141,10 @@ namespace cv { namespace gpu { namespace imgproc
int basesp = (blockIdx.y * blockDim.y + threadIdx.y) * outspstep + (blockIdx.x * blockDim.x + threadIdx.x) * 2 * sizeof(short); int basesp = (blockIdx.y * blockDim.y + threadIdx.y) * outspstep + (blockIdx.x * blockDim.x + threadIdx.x) * 2 * sizeof(short);
*(short2*)(outsp + basesp) = do_mean_shift(x0, y0, outr, outrstep, cols, rows, sp, sr, maxIter, eps); *(short2*)(outsp + basesp) = do_mean_shift(x0, y0, outr, outrstep, cols, rows, sp, sr, maxIter, eps);
} }
} }
void meanShiftFiltering_gpu(const DevMem2Db& src, DevMem2Db dst, int sp, int sr, int maxIter, float eps, cudaStream_t stream) void meanShiftFiltering_gpu(const DevMem2Db& src, DevMem2Db dst, int sp, int sr, int maxIter, float eps, cudaStream_t stream)
{ {
dim3 grid(1, 1, 1); dim3 grid(1, 1, 1);
dim3 threads(32, 8, 1); dim3 threads(32, 8, 1);
grid.x = divUp(src.cols, threads.x); grid.x = divUp(src.cols, threads.x);
@ -161,10 +160,10 @@ namespace cv { namespace gpu { namespace imgproc
cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaDeviceSynchronize() );
//cudaSafeCall( cudaUnbindTexture( tex_meanshift ) ); //cudaSafeCall( cudaUnbindTexture( tex_meanshift ) );
} }
void meanShiftProc_gpu(const DevMem2Db& src, DevMem2Db dstr, DevMem2Db dstsp, int sp, int sr, int maxIter, float eps, cudaStream_t stream) void meanShiftProc_gpu(const DevMem2Db& src, DevMem2Db dstr, DevMem2Db dstsp, int sp, int sr, int maxIter, float eps, cudaStream_t stream)
{ {
dim3 grid(1, 1, 1); dim3 grid(1, 1, 1);
dim3 threads(32, 8, 1); dim3 threads(32, 8, 1);
grid.x = divUp(src.cols, threads.x); grid.x = divUp(src.cols, threads.x);
@ -180,13 +179,13 @@ namespace cv { namespace gpu { namespace imgproc
cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaDeviceSynchronize() );
//cudaSafeCall( cudaUnbindTexture( tex_meanshift ) ); //cudaSafeCall( cudaUnbindTexture( tex_meanshift ) );
} }
/////////////////////////////////// drawColorDisp /////////////////////////////////////////////// /////////////////////////////////// drawColorDisp ///////////////////////////////////////////////
template <typename T> template <typename T>
__device__ unsigned int cvtPixel(T d, int ndisp, float S = 1, float V = 1) __device__ unsigned int cvtPixel(T d, int ndisp, float S = 1, float V = 1)
{ {
unsigned int H = ((ndisp-d) * 240)/ndisp; unsigned int H = ((ndisp-d) * 240)/ndisp;
unsigned int hi = (H/60) % 6; unsigned int hi = (H/60) % 6;
@ -238,16 +237,16 @@ namespace cv { namespace gpu { namespace imgproc
res.y = p; res.y = p;
res.z = V; res.z = V;
} }
const unsigned int b = (unsigned int)(max(0.f, min (res.x, 1.f)) * 255.f); const unsigned int b = (unsigned int)(::max(0.f, ::min(res.x, 1.f)) * 255.f);
const unsigned int g = (unsigned int)(max(0.f, min (res.y, 1.f)) * 255.f); const unsigned int g = (unsigned int)(::max(0.f, ::min(res.y, 1.f)) * 255.f);
const unsigned int r = (unsigned int)(max(0.f, min (res.z, 1.f)) * 255.f); const unsigned int r = (unsigned int)(::max(0.f, ::min(res.z, 1.f)) * 255.f);
const unsigned int a = 255U; const unsigned int a = 255U;
return (a << 24) + (r << 16) + (g << 8) + b; return (a << 24) + (r << 16) + (g << 8) + b;
} }
__global__ void drawColorDisp(uchar* disp, size_t disp_step, uchar* out_image, size_t out_step, int width, int height, int ndisp) __global__ void drawColorDisp(uchar* disp, size_t disp_step, uchar* out_image, size_t out_step, int width, int height, int ndisp)
{ {
const int x = (blockIdx.x * blockDim.x + threadIdx.x) << 2; const int x = (blockIdx.x * blockDim.x + threadIdx.x) << 2;
const int y = blockIdx.y * blockDim.y + threadIdx.y; const int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -264,10 +263,10 @@ namespace cv { namespace gpu { namespace imgproc
uint4* line = (uint4*)(out_image + y * out_step); uint4* line = (uint4*)(out_image + y * out_step);
line[x >> 2] = res; line[x >> 2] = res;
} }
} }
__global__ void drawColorDisp(short* disp, size_t disp_step, uchar* out_image, size_t out_step, int width, int height, int ndisp) __global__ void drawColorDisp(short* disp, size_t disp_step, uchar* out_image, size_t out_step, int width, int height, int ndisp)
{ {
const int x = (blockIdx.x * blockDim.x + threadIdx.x) << 1; const int x = (blockIdx.x * blockDim.x + threadIdx.x) << 1;
const int y = blockIdx.y * blockDim.y + threadIdx.y; const int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -282,11 +281,11 @@ namespace cv { namespace gpu { namespace imgproc
uint2* line = (uint2*)(out_image + y * out_step); uint2* line = (uint2*)(out_image + y * out_step);
line[x >> 1] = res; line[x >> 1] = res;
} }
} }
void drawColorDisp_gpu(const DevMem2Db& src, const DevMem2Db& dst, int ndisp, const cudaStream_t& stream) void drawColorDisp_gpu(const DevMem2Db& src, const DevMem2Db& dst, int ndisp, const cudaStream_t& stream)
{ {
dim3 threads(16, 16, 1); dim3 threads(16, 16, 1);
dim3 grid(1, 1, 1); dim3 grid(1, 1, 1);
grid.x = divUp(src.cols, threads.x << 2); grid.x = divUp(src.cols, threads.x << 2);
@ -297,10 +296,10 @@ namespace cv { namespace gpu { namespace imgproc
if (stream == 0) if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaDeviceSynchronize() );
} }
void drawColorDisp_gpu(const DevMem2D_<short>& src, const DevMem2Db& dst, int ndisp, const cudaStream_t& stream) void drawColorDisp_gpu(const DevMem2D_<short>& src, const DevMem2Db& dst, int ndisp, const cudaStream_t& stream)
{ {
dim3 threads(32, 8, 1); dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1); dim3 grid(1, 1, 1);
grid.x = divUp(src.cols, threads.x << 1); grid.x = divUp(src.cols, threads.x << 1);
@ -311,15 +310,15 @@ namespace cv { namespace gpu { namespace imgproc
if (stream == 0) if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaDeviceSynchronize() );
} }
/////////////////////////////////// reprojectImageTo3D /////////////////////////////////////////////// /////////////////////////////////// reprojectImageTo3D ///////////////////////////////////////////////
__constant__ float cq[16]; __constant__ float cq[16];
template <typename T> template <typename T>
__global__ void reprojectImageTo3D(const T* disp, size_t disp_step, float* xyzw, size_t xyzw_step, int rows, int cols) __global__ void reprojectImageTo3D(const T* disp, size_t disp_step, float* xyzw, size_t xyzw_step, int rows, int cols)
{ {
const int x = blockIdx.x * blockDim.x + threadIdx.x; const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y; const int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -345,11 +344,11 @@ namespace cv { namespace gpu { namespace imgproc
*(float4*)(xyzw + xyzw_step * y + (x * 4)) = v; *(float4*)(xyzw + xyzw_step * y + (x * 4)) = v;
} }
} }
template <typename T> template <typename T>
inline void reprojectImageTo3D_caller(const DevMem2D_<T>& disp, const DevMem2Df& xyzw, const float* q, const cudaStream_t& stream) inline void reprojectImageTo3D_caller(const DevMem2D_<T>& disp, const DevMem2Df& xyzw, const float* q, const cudaStream_t& stream)
{ {
dim3 threads(32, 8, 1); dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1); dim3 grid(1, 1, 1);
grid.x = divUp(disp.cols, threads.x); grid.x = divUp(disp.cols, threads.x);
@ -362,23 +361,23 @@ namespace cv { namespace gpu { namespace imgproc
if (stream == 0) if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaDeviceSynchronize() );
} }
void reprojectImageTo3D_gpu(const DevMem2Db& disp, const DevMem2Df& xyzw, const float* q, const cudaStream_t& stream) void reprojectImageTo3D_gpu(const DevMem2Db& disp, const DevMem2Df& xyzw, const float* q, const cudaStream_t& stream)
{ {
reprojectImageTo3D_caller(disp, xyzw, q, stream); reprojectImageTo3D_caller(disp, xyzw, q, stream);
} }
void reprojectImageTo3D_gpu(const DevMem2D_<short>& disp, const DevMem2Df& xyzw, const float* q, const cudaStream_t& stream) void reprojectImageTo3D_gpu(const DevMem2D_<short>& disp, const DevMem2Df& xyzw, const float* q, const cudaStream_t& stream)
{ {
reprojectImageTo3D_caller(disp, xyzw, q, stream); reprojectImageTo3D_caller(disp, xyzw, q, stream);
} }
//////////////////////////////////////// Extract Cov Data //////////////////////////////////////////////// //////////////////////////////////////// Extract Cov Data ////////////////////////////////////////////////
__global__ void extractCovData_kernel(const int cols, const int rows, const PtrStepf Dx, __global__ void extractCovData_kernel(const int cols, const int rows, const PtrStepf Dx,
const PtrStepf Dy, PtrStepf dst) const PtrStepf Dy, PtrStepf dst)
{ {
const int x = blockIdx.x * blockDim.x + threadIdx.x; const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y; const int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -391,10 +390,10 @@ namespace cv { namespace gpu { namespace imgproc
dst.ptr(y + rows)[x] = dx * dy; dst.ptr(y + rows)[x] = dx * dy;
dst.ptr(y + (rows << 1))[x] = dy * dy; dst.ptr(y + (rows << 1))[x] = dy * dy;
} }
} }
void extractCovData_caller(const DevMem2Df Dx, const DevMem2Df Dy, PtrStepf dst, cudaStream_t stream) void extractCovData_caller(const DevMem2Df Dx, const DevMem2Df Dy, PtrStepf dst, cudaStream_t stream)
{ {
dim3 threads(32, 8); dim3 threads(32, 8);
dim3 grid(divUp(Dx.cols, threads.x), divUp(Dx.rows, threads.y)); dim3 grid(divUp(Dx.cols, threads.x), divUp(Dx.rows, threads.y));
@ -403,16 +402,16 @@ namespace cv { namespace gpu { namespace imgproc
if (stream == 0) if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaDeviceSynchronize() );
} }
/////////////////////////////////////////// Corner Harris ///////////////////////////////////////////////// /////////////////////////////////////////// Corner Harris /////////////////////////////////////////////////
texture<float, 2> harrisDxTex; texture<float, 2> harrisDxTex;
texture<float, 2> harrisDyTex; texture<float, 2> harrisDyTex;
__global__ void cornerHarris_kernel(const int cols, const int rows, const int block_size, const float k, __global__ void cornerHarris_kernel(const int cols, const int rows, const int block_size, const float k,
PtrStepb dst) PtrStepb dst)
{ {
const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y; const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -441,12 +440,12 @@ namespace cv { namespace gpu { namespace imgproc
((float*)dst.ptr(y))[x] = a * c - b * b - k * (a + c) * (a + c); ((float*)dst.ptr(y))[x] = a * c - b * b - k * (a + c) * (a + c);
} }
} }
template <typename BR, typename BC> template <typename BR, typename BC>
__global__ void cornerHarris_kernel(const int cols, const int rows, const int block_size, const float k, __global__ void cornerHarris_kernel(const int cols, const int rows, const int block_size, const float k,
PtrStepb dst, BR border_row, BC border_col) PtrStepb dst, BR border_row, BC border_col)
{ {
const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y; const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -477,11 +476,11 @@ namespace cv { namespace gpu { namespace imgproc
((float*)dst.ptr(y))[x] = a * c - b * b - k * (a + c) * (a + c); ((float*)dst.ptr(y))[x] = a * c - b * b - k * (a + c) * (a + c);
} }
} }
void cornerHarris_caller(const int block_size, const float k, const DevMem2Db Dx, const DevMem2Db Dy, DevMem2Db dst, void cornerHarris_caller(const int block_size, const float k, const DevMem2Db Dx, const DevMem2Db Dy, DevMem2Db dst,
int border_type, cudaStream_t stream) int border_type, cudaStream_t stream)
{ {
const int rows = Dx.rows; const int rows = Dx.rows;
const int cols = Dx.cols; const int cols = Dx.cols;
@ -517,16 +516,16 @@ namespace cv { namespace gpu { namespace imgproc
//cudaSafeCall(cudaUnbindTexture(harrisDxTex)); //cudaSafeCall(cudaUnbindTexture(harrisDxTex));
//cudaSafeCall(cudaUnbindTexture(harrisDyTex)); //cudaSafeCall(cudaUnbindTexture(harrisDyTex));
} }
/////////////////////////////////////////// Corner Min Eigen Val ///////////////////////////////////////////////// /////////////////////////////////////////// Corner Min Eigen Val /////////////////////////////////////////////////
texture<float, 2> minEigenValDxTex; texture<float, 2> minEigenValDxTex;
texture<float, 2> minEigenValDyTex; texture<float, 2> minEigenValDyTex;
__global__ void cornerMinEigenVal_kernel(const int cols, const int rows, const int block_size, __global__ void cornerMinEigenVal_kernel(const int cols, const int rows, const int block_size,
PtrStepb dst) PtrStepb dst)
{ {
const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y; const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -557,13 +556,13 @@ namespace cv { namespace gpu { namespace imgproc
c *= 0.5f; c *= 0.5f;
((float*)dst.ptr(y))[x] = (a + c) - sqrtf((a - c) * (a - c) + b * b); ((float*)dst.ptr(y))[x] = (a + c) - sqrtf((a - c) * (a - c) + b * b);
} }
} }
template <typename BR, typename BC> template <typename BR, typename BC>
__global__ void cornerMinEigenVal_kernel(const int cols, const int rows, const int block_size, __global__ void cornerMinEigenVal_kernel(const int cols, const int rows, const int block_size,
PtrStepb dst, BR border_row, BC border_col) PtrStepb dst, BR border_row, BC border_col)
{ {
const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y; const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -596,11 +595,11 @@ namespace cv { namespace gpu { namespace imgproc
c *= 0.5f; c *= 0.5f;
((float*)dst.ptr(y))[x] = (a + c) - sqrtf((a - c) * (a - c) + b * b); ((float*)dst.ptr(y))[x] = (a + c) - sqrtf((a - c) * (a - c) + b * b);
} }
} }
void cornerMinEigenVal_caller(const int block_size, const DevMem2Db Dx, const DevMem2Db Dy, DevMem2Db dst, void cornerMinEigenVal_caller(const int block_size, const DevMem2Db Dx, const DevMem2Db Dy, DevMem2Db dst,
int border_type, cudaStream_t stream) int border_type, cudaStream_t stream)
{ {
const int rows = Dx.rows; const int rows = Dx.rows;
const int cols = Dx.cols; const int cols = Dx.cols;
@ -636,12 +635,12 @@ namespace cv { namespace gpu { namespace imgproc
//cudaSafeCall(cudaUnbindTexture(minEigenValDxTex)); //cudaSafeCall(cudaUnbindTexture(minEigenValDxTex));
//cudaSafeCall(cudaUnbindTexture(minEigenValDyTex)); //cudaSafeCall(cudaUnbindTexture(minEigenValDyTex));
} }
////////////////////////////// Column Sum ////////////////////////////////////// ////////////////////////////// Column Sum //////////////////////////////////////
__global__ void column_sumKernel_32F(int cols, int rows, const PtrStepb src, const PtrStepb dst) __global__ void column_sumKernel_32F(int cols, int rows, const PtrStepb src, const PtrStepb dst)
{ {
int x = blockIdx.x * blockDim.x + threadIdx.x; int x = blockIdx.x * blockDim.x + threadIdx.x;
if (x < cols) if (x < cols)
@ -658,11 +657,11 @@ namespace cv { namespace gpu { namespace imgproc
dst_data += dst.step; dst_data += dst.step;
} }
} }
} }
void columnSum_32F(const DevMem2Db src, const DevMem2Db dst) void columnSum_32F(const DevMem2Db src, const DevMem2Db dst)
{ {
dim3 threads(256); dim3 threads(256);
dim3 grid(divUp(src.cols, threads.x)); dim3 grid(divUp(src.cols, threads.x));
@ -670,14 +669,14 @@ namespace cv { namespace gpu { namespace imgproc
cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaDeviceSynchronize() );
} }
////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////
// mulSpectrums // mulSpectrums
__global__ void mulSpectrumsKernel(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, DevMem2D_<cufftComplex> c) __global__ void mulSpectrumsKernel(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, DevMem2D_<cufftComplex> c)
{ {
const int x = blockIdx.x * blockDim.x + threadIdx.x; const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y; const int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -685,11 +684,11 @@ namespace cv { namespace gpu { namespace imgproc
{ {
c.ptr(y)[x] = cuCmulf(a.ptr(y)[x], b.ptr(y)[x]); c.ptr(y)[x] = cuCmulf(a.ptr(y)[x], b.ptr(y)[x]);
} }
} }
void mulSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, DevMem2D_<cufftComplex> c, cudaStream_t stream) void mulSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, DevMem2D_<cufftComplex> c, cudaStream_t stream)
{ {
dim3 threads(256); dim3 threads(256);
dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y)); dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));
@ -698,14 +697,14 @@ namespace cv { namespace gpu { namespace imgproc
if (stream == 0) if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaDeviceSynchronize() );
} }
////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////
// mulSpectrums_CONJ // mulSpectrums_CONJ
__global__ void mulSpectrumsKernel_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, DevMem2D_<cufftComplex> c) __global__ void mulSpectrumsKernel_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, DevMem2D_<cufftComplex> c)
{ {
const int x = blockIdx.x * blockDim.x + threadIdx.x; const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y; const int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -713,11 +712,11 @@ namespace cv { namespace gpu { namespace imgproc
{ {
c.ptr(y)[x] = cuCmulf(a.ptr(y)[x], cuConjf(b.ptr(y)[x])); c.ptr(y)[x] = cuCmulf(a.ptr(y)[x], cuConjf(b.ptr(y)[x]));
} }
} }
void mulSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, DevMem2D_<cufftComplex> c, cudaStream_t stream) void mulSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, DevMem2D_<cufftComplex> c, cudaStream_t stream)
{ {
dim3 threads(256); dim3 threads(256);
dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y)); dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));
@ -726,14 +725,14 @@ namespace cv { namespace gpu { namespace imgproc
if (stream == 0) if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaDeviceSynchronize() );
} }
////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////
// mulAndScaleSpectrums // mulAndScaleSpectrums
__global__ void mulAndScaleSpectrumsKernel(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, DevMem2D_<cufftComplex> c) __global__ void mulAndScaleSpectrumsKernel(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, DevMem2D_<cufftComplex> c)
{ {
const int x = blockIdx.x * blockDim.x + threadIdx.x; const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y; const int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -742,11 +741,11 @@ namespace cv { namespace gpu { namespace imgproc
cufftComplex v = cuCmulf(a.ptr(y)[x], b.ptr(y)[x]); cufftComplex v = cuCmulf(a.ptr(y)[x], b.ptr(y)[x]);
c.ptr(y)[x] = make_cuFloatComplex(cuCrealf(v) * scale, cuCimagf(v) * scale); c.ptr(y)[x] = make_cuFloatComplex(cuCrealf(v) * scale, cuCimagf(v) * scale);
} }
} }
void mulAndScaleSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, DevMem2D_<cufftComplex> c, cudaStream_t stream) void mulAndScaleSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, DevMem2D_<cufftComplex> c, cudaStream_t stream)
{ {
dim3 threads(256); dim3 threads(256);
dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y)); dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));
@ -755,14 +754,14 @@ namespace cv { namespace gpu { namespace imgproc
if (stream) if (stream)
cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaDeviceSynchronize() );
} }
////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////
// mulAndScaleSpectrums_CONJ // mulAndScaleSpectrums_CONJ
__global__ void mulAndScaleSpectrumsKernel_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, DevMem2D_<cufftComplex> c) __global__ void mulAndScaleSpectrumsKernel_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, DevMem2D_<cufftComplex> c)
{ {
const int x = blockIdx.x * blockDim.x + threadIdx.x; const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y; const int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -771,11 +770,11 @@ namespace cv { namespace gpu { namespace imgproc
cufftComplex v = cuCmulf(a.ptr(y)[x], cuConjf(b.ptr(y)[x])); cufftComplex v = cuCmulf(a.ptr(y)[x], cuConjf(b.ptr(y)[x]));
c.ptr(y)[x] = make_cuFloatComplex(cuCrealf(v) * scale, cuCimagf(v) * scale); c.ptr(y)[x] = make_cuFloatComplex(cuCrealf(v) * scale, cuCimagf(v) * scale);
} }
} }
void mulAndScaleSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, DevMem2D_<cufftComplex> c, cudaStream_t stream) void mulAndScaleSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, DevMem2D_<cufftComplex> c, cudaStream_t stream)
{ {
dim3 threads(256); dim3 threads(256);
dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y)); dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));
@ -784,26 +783,26 @@ namespace cv { namespace gpu { namespace imgproc
if (stream == 0) if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaDeviceSynchronize() );
} }
////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////
// buildWarpMaps // buildWarpMaps
// TODO use intrinsics like __sinf and so on // TODO use intrinsics like __sinf and so on
namespace build_warp_maps namespace build_warp_maps
{ {
__constant__ float ck_rinv[9]; __constant__ float ck_rinv[9];
__constant__ float cr_kinv[9]; __constant__ float cr_kinv[9];
__constant__ float ct[3]; __constant__ float ct[3];
__constant__ float cscale; __constant__ float cscale;
} }
class PlaneMapper class PlaneMapper
{ {
public: public:
static __device__ __forceinline__ void mapBackward(float u, float v, float &x, float &y) static __device__ __forceinline__ void mapBackward(float u, float v, float &x, float &y)
{ {
using namespace build_warp_maps; using namespace build_warp_maps;
@ -819,20 +818,20 @@ namespace cv { namespace gpu { namespace imgproc
x /= z; x /= z;
y /= z; y /= z;
} }
}; };
class CylindricalMapper class CylindricalMapper
{ {
public: public:
static __device__ __forceinline__ void mapBackward(float u, float v, float &x, float &y) static __device__ __forceinline__ void mapBackward(float u, float v, float &x, float &y)
{ {
using namespace build_warp_maps; using namespace build_warp_maps;
u /= cscale; u /= cscale;
float x_ = sinf(u); float x_ = ::sinf(u);
float y_ = v / cscale; float y_ = v / cscale;
float z_ = cosf(u); float z_ = ::cosf(u);
float z; float z;
x = ck_rinv[0] * x_ + ck_rinv[1] * y_ + ck_rinv[2] * z_; x = ck_rinv[0] * x_ + ck_rinv[1] * y_ + ck_rinv[2] * z_;
@ -842,12 +841,12 @@ namespace cv { namespace gpu { namespace imgproc
if (z > 0) { x /= z; y /= z; } if (z > 0) { x /= z; y /= z; }
else x = y = -1; else x = y = -1;
} }
}; };
class SphericalMapper class SphericalMapper
{ {
public: public:
static __device__ __forceinline__ void mapBackward(float u, float v, float &x, float &y) static __device__ __forceinline__ void mapBackward(float u, float v, float &x, float &y)
{ {
using namespace build_warp_maps; using namespace build_warp_maps;
@ -855,10 +854,10 @@ namespace cv { namespace gpu { namespace imgproc
v /= cscale; v /= cscale;
u /= cscale; u /= cscale;
float sinv = sinf(v); float sinv = ::sinf(v);
float x_ = sinv * sinf(u); float x_ = sinv * ::sinf(u);
float y_ = -cosf(v); float y_ = -::cosf(v);
float z_ = sinv * cosf(u); float z_ = sinv * ::cosf(u);
float z; float z;
x = ck_rinv[0] * x_ + ck_rinv[1] * y_ + ck_rinv[2] * z_; x = ck_rinv[0] * x_ + ck_rinv[1] * y_ + ck_rinv[2] * z_;
@ -868,13 +867,13 @@ namespace cv { namespace gpu { namespace imgproc
if (z > 0) { x /= z; y /= z; } if (z > 0) { x /= z; y /= z; }
else x = y = -1; else x = y = -1;
} }
}; };
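// For reference, the backward mappings implemented by the cylindrical and spherical
// mappers above (k_rinv is presumably K*R^-1, stored row-major in ck_rinv; cscale is the
// warp scale):
//
//   Cylindrical:  u' = u/scale;                x_ = sin(u'),         y_ = v/scale,   z_ = cos(u')
//   Spherical:    u' = u/scale, v' = v/scale;  x_ = sin(v')*sin(u'), y_ = -cos(v'),  z_ = sin(v')*cos(u')
//
//   [x y z]^T = ck_rinv * [x_ y_ z_]^T, then x /= z, y /= z; pixels with z <= 0 are
//   marked invalid by setting x = y = -1.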
template <typename Mapper> template <typename Mapper>
__global__ void buildWarpMapsKernel(int tl_u, int tl_v, int cols, int rows, __global__ void buildWarpMapsKernel(int tl_u, int tl_v, int cols, int rows,
PtrStepf map_x, PtrStepf map_y) PtrStepf map_x, PtrStepf map_y)
{ {
int du = blockIdx.x * blockDim.x + threadIdx.x; int du = blockIdx.x * blockDim.x + threadIdx.x;
int dv = blockIdx.y * blockDim.y + threadIdx.y; int dv = blockIdx.y * blockDim.y + threadIdx.y;
if (du < cols && dv < rows) if (du < cols && dv < rows)
@ -886,13 +885,13 @@ namespace cv { namespace gpu { namespace imgproc
map_x.ptr(dv)[du] = x; map_x.ptr(dv)[du] = x;
map_y.ptr(dv)[du] = y; map_y.ptr(dv)[du] = y;
} }
} }
void buildWarpPlaneMaps(int tl_u, int tl_v, DevMem2Df map_x, DevMem2Df map_y, void buildWarpPlaneMaps(int tl_u, int tl_v, DevMem2Df map_x, DevMem2Df map_y,
const float k_rinv[9], const float r_kinv[9], const float t[3], const float k_rinv[9], const float r_kinv[9], const float t[3],
float scale, cudaStream_t stream) float scale, cudaStream_t stream)
{ {
cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::ck_rinv, k_rinv, 9*sizeof(float))); cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::ck_rinv, k_rinv, 9*sizeof(float)));
cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::cr_kinv, r_kinv, 9*sizeof(float))); cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::cr_kinv, r_kinv, 9*sizeof(float)));
cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::ct, t, 3*sizeof(float))); cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::ct, t, 3*sizeof(float)));
@ -908,13 +907,13 @@ namespace cv { namespace gpu { namespace imgproc
cudaSafeCall(cudaGetLastError()); cudaSafeCall(cudaGetLastError());
if (stream == 0) if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize()); cudaSafeCall(cudaDeviceSynchronize());
} }
void buildWarpCylindricalMaps(int tl_u, int tl_v, DevMem2Df map_x, DevMem2Df map_y, void buildWarpCylindricalMaps(int tl_u, int tl_v, DevMem2Df map_x, DevMem2Df map_y,
const float k_rinv[9], const float r_kinv[9], float scale, const float k_rinv[9], const float r_kinv[9], float scale,
cudaStream_t stream) cudaStream_t stream)
{ {
cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::ck_rinv, k_rinv, 9*sizeof(float))); cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::ck_rinv, k_rinv, 9*sizeof(float)));
cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::cr_kinv, r_kinv, 9*sizeof(float))); cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::cr_kinv, r_kinv, 9*sizeof(float)));
cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::cscale, &scale, sizeof(float))); cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::cscale, &scale, sizeof(float)));
@ -929,13 +928,13 @@ namespace cv { namespace gpu { namespace imgproc
cudaSafeCall(cudaGetLastError()); cudaSafeCall(cudaGetLastError());
if (stream == 0) if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize()); cudaSafeCall(cudaDeviceSynchronize());
} }
void buildWarpSphericalMaps(int tl_u, int tl_v, DevMem2Df map_x, DevMem2Df map_y, void buildWarpSphericalMaps(int tl_u, int tl_v, DevMem2Df map_x, DevMem2Df map_y,
const float k_rinv[9], const float r_kinv[9], float scale, const float k_rinv[9], const float r_kinv[9], float scale,
cudaStream_t stream) cudaStream_t stream)
{ {
cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::ck_rinv, k_rinv, 9*sizeof(float))); cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::ck_rinv, k_rinv, 9*sizeof(float)));
cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::cr_kinv, r_kinv, 9*sizeof(float))); cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::cr_kinv, r_kinv, 9*sizeof(float)));
cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::cscale, &scale, sizeof(float))); cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::cscale, &scale, sizeof(float)));
@ -950,18 +949,18 @@ namespace cv { namespace gpu { namespace imgproc
cudaSafeCall(cudaGetLastError()); cudaSafeCall(cudaGetLastError());
if (stream == 0) if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize()); cudaSafeCall(cudaDeviceSynchronize());
} }
////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////
// convolve // convolve
#define CONVOLVE_MAX_KERNEL_SIZE 17 #define CONVOLVE_MAX_KERNEL_SIZE 17
__constant__ float c_convolveKernel[CONVOLVE_MAX_KERNEL_SIZE * CONVOLVE_MAX_KERNEL_SIZE]; __constant__ float c_convolveKernel[CONVOLVE_MAX_KERNEL_SIZE * CONVOLVE_MAX_KERNEL_SIZE];
__global__ void convolve(const DevMem2Df src, PtrStepf dst, int kWidth, int kHeight) __global__ void convolve(const DevMem2Df src, PtrStepf dst, int kWidth, int kHeight)
{ {
__shared__ float smem[16 + 2 * 8][16 + 2 * 8]; __shared__ float smem[16 + 2 * 8][16 + 2 * 8];
const int x = blockIdx.x * blockDim.x + threadIdx.x; const int x = blockIdx.x * blockDim.x + threadIdx.x;
@ -973,7 +972,7 @@ namespace cv { namespace gpu { namespace imgproc
// 0 | 0 0 | 0 // 0 | 0 0 | 0
// ----------- // -----------
// 0 | 0 0 | 0 // 0 | 0 0 | 0
smem[threadIdx.y][threadIdx.x] = src.ptr(min(max(y - 8, 0), src.rows - 1))[min(max(x - 8, 0), src.cols - 1)]; smem[threadIdx.y][threadIdx.x] = src.ptr(::min(::max(y - 8, 0), src.rows - 1))[::min(::max(x - 8, 0), src.cols - 1)];
// 0 | 0 x | x // 0 | 0 x | x
// ----------- // -----------
@ -981,7 +980,7 @@ namespace cv { namespace gpu { namespace imgproc
// 0 | 0 0 | 0 // 0 | 0 0 | 0
// ----------- // -----------
// 0 | 0 0 | 0 // 0 | 0 0 | 0
smem[threadIdx.y][threadIdx.x + 16] = src.ptr(min(max(y - 8, 0), src.rows - 1))[min(x + 8, src.cols - 1)]; smem[threadIdx.y][threadIdx.x + 16] = src.ptr(::min(::max(y - 8, 0), src.rows - 1))[::min(x + 8, src.cols - 1)];
// 0 | 0 0 | 0 // 0 | 0 0 | 0
// ----------- // -----------
@ -989,7 +988,7 @@ namespace cv { namespace gpu { namespace imgproc
// x | x 0 | 0 // x | x 0 | 0
// ----------- // -----------
// x | x 0 | 0 // x | x 0 | 0
smem[threadIdx.y + 16][threadIdx.x] = src.ptr(min(y + 8, src.rows - 1))[min(max(x - 8, 0), src.cols - 1)]; smem[threadIdx.y + 16][threadIdx.x] = src.ptr(::min(y + 8, src.rows - 1))[::min(::max(x - 8, 0), src.cols - 1)];
// 0 | 0 0 | 0 // 0 | 0 0 | 0
// ----------- // -----------
@ -997,7 +996,7 @@ namespace cv { namespace gpu { namespace imgproc
// 0 | 0 x | x // 0 | 0 x | x
// ----------- // -----------
// 0 | 0 x | x // 0 | 0 x | x
smem[threadIdx.y + 16][threadIdx.x + 16] = src.ptr(min(y + 8, src.rows - 1))[min(x + 8, src.cols - 1)]; smem[threadIdx.y + 16][threadIdx.x + 16] = src.ptr(::min(y + 8, src.rows - 1))[::min(x + 8, src.cols - 1)];
__syncthreads(); __syncthreads();
@ -1015,10 +1014,10 @@ namespace cv { namespace gpu { namespace imgproc
dst.ptr(y)[x] = res; dst.ptr(y)[x] = res;
} }
} }
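// Tile layout used by the convolve kernel above (for reference): a 16x16 thread block
// stages a (16 + 2*8) x (16 + 2*8) = 32x32 float tile in shared memory; the 8-pixel
// apron on each side is what supports kernels up to CONVOLVE_MAX_KERNEL_SIZE = 17
// (radius 8). Each thread fetches four pixels, one per 16x16 quadrant of the tile, and
// the ::min/::max clamping replicates border pixels for reads that fall outside the image.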
void convolve_gpu(const DevMem2Df& src, const PtrStepf& dst, int kWidth, int kHeight, float* kernel, cudaStream_t stream) void convolve_gpu(const DevMem2Df& src, const PtrStepf& dst, int kWidth, int kHeight, float* kernel, cudaStream_t stream)
{ {
cudaSafeCall(cudaMemcpyToSymbol(c_convolveKernel, kernel, kWidth * kHeight * sizeof(float), 0, cudaMemcpyDeviceToDevice) ); cudaSafeCall(cudaMemcpyToSymbol(c_convolveKernel, kernel, kWidth * kHeight * sizeof(float), 0, cudaMemcpyDeviceToDevice) );
const dim3 block(16, 16); const dim3 block(16, 16);
@ -1029,9 +1028,8 @@ namespace cv { namespace gpu { namespace imgproc
if (stream == 0) if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize()); cudaSafeCall(cudaDeviceSynchronize());
} }
}}}
} // namespace imgproc
END_OPENCV_DEVICE_NAMESPACE
View File
@ -43,11 +43,15 @@
#ifndef __OPENCV_internal_shared_HPP__ #ifndef __OPENCV_internal_shared_HPP__
#define __OPENCV_internal_shared_HPP__ #define __OPENCV_internal_shared_HPP__
#include <cuda_runtime.h>
#include <npp.h>
#include "NPP_staging.hpp"
#include "opencv2/gpu/devmem2d.hpp" #include "opencv2/gpu/devmem2d.hpp"
#include "safe_call.hpp" #include "safe_call.hpp"
#include "cuda_runtime.h"
#include "npp.h" #ifndef CV_PI
#include "NPP_staging.hpp" #define CV_PI 3.1415926535897932384626433832795f
#endif
#ifndef CV_PI_F #ifndef CV_PI_F
#ifndef CV_PI #ifndef CV_PI
@ -57,15 +61,28 @@
#endif #endif
#endif #endif
namespace cv #define BEGIN_OPENCV_DEVICE_NAMESPACE namespace cv { namespace gpu { namespace device {
{ #define END_OPENCV_DEVICE_NAMESPACE }}}
namespace gpu #define OPENCV_DEVICE_NAMESPACE ::cv::gpu::device
{ #define OPENCV_DEVICE_NAMESPACE_ ::cv::gpu::device::
typedef unsigned char uchar;
typedef signed char schar;
typedef unsigned short ushort;
typedef unsigned int uint;
BEGIN_OPENCV_DEVICE_NAMESPACE
typedef unsigned char uchar;
typedef unsigned short ushort;
typedef signed char schar;
typedef unsigned int uint;
template<class T> static inline void bindTexture(const textureReference* tex, const DevMem2D_<T>& img)
{
cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
cudaSafeCall( cudaBindTexture2D(0, tex, img.ptr(), &desc, img.cols, img.rows, img.step) );
}
END_OPENCV_DEVICE_NAMESPACE
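// A minimal usage sketch for the bindTexture helper above (texture name and element
// type are hypothetical):
//
//     texture<uchar, 2, cudaReadModeElementType> tex_src;
//
//     void bindSrc(const DevMem2D_<uchar>& img)
//     {
//         bindTexture(&tex_src, img);  // builds the uchar channel descriptor and calls
//     }                                // cudaBindTexture2D with img.ptr(), cols, rows, step
//
// texture<> objects derive from textureReference, so &tex_src converts to the
// const textureReference* parameter directly.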
namespace cv { namespace gpu
{
enum enum
{ {
BORDER_REFLECT101_GPU = 0, BORDER_REFLECT101_GPU = 0,
@ -81,7 +98,7 @@ namespace cv
static inline int divUp(int total, int grain) { return (total + grain - 1) / grain; } static inline int divUp(int total, int grain) { return (total + grain - 1) / grain; }
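// Example (for reference): sizing a 2D launch over a 640x480 image with 16x16 blocks --
//     dim3 block(16, 16);
//     dim3 grid(divUp(640, block.x), divUp(480, block.y));   // 40 x 30 blocks
// divUp(641, 16) == 41, so a partial rightmost tile column is still covered.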
template<class T> static inline void uploadConstant(const char* name, const T& value) /*template<class T> static inline void uploadConstant(const char* name, const T& value)
{ {
cudaSafeCall( cudaMemcpyToSymbol(name, &value, sizeof(T)) ); cudaSafeCall( cudaMemcpyToSymbol(name, &value, sizeof(T)) );
} }
@ -89,77 +106,78 @@ namespace cv
template<class T> static inline void uploadConstant(const char* name, const T& value, cudaStream_t stream) template<class T> static inline void uploadConstant(const char* name, const T& value, cudaStream_t stream)
{ {
cudaSafeCall( cudaMemcpyToSymbolAsync(name, &value, sizeof(T), 0, cudaMemcpyHostToDevice, stream) ); cudaSafeCall( cudaMemcpyToSymbolAsync(name, &value, sizeof(T), 0, cudaMemcpyHostToDevice, stream) );
} } */
template<class T> static inline void bindTexture(const char* name, const DevMem2D_<T>& img/*, bool normalized = false, //template<class T> static inline void bindTexture(const char* name, const DevMem2D_<T>& img)
enum cudaTextureFilterMode filterMode = cudaFilterModePoint, enum cudaTextureAddressMode addrMode = cudaAddressModeClamp*/) //{
{ // //!!!! const_cast is disabled!
//!!!! const_cast is disabled! // //!!!! Please use constructor of 'class texture' instead.
//!!!! Please use constructor of 'class texture' instead. //
// //textureReference* tex;
// //cudaSafeCall( cudaGetTextureReference((const textureReference**)&tex, name) );
// //tex->normalized = normalized;
// //tex->filterMode = filterMode;
// //tex->addressMode[0] = addrMode;
// //tex->addressMode[1] = addrMode;
//
// const textureReference* tex;
// cudaSafeCall( cudaGetTextureReference(&tex, name) );
//
// cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
// cudaSafeCall( cudaBindTexture2D(0, tex, img.ptr(), &desc, img.cols, img.rows, img.step) );
//}
//textureReference* tex; //static inline void unbindTexture(const char *name)
//cudaSafeCall( cudaGetTextureReference((const textureReference**)&tex, name) ); //{
//tex->normalized = normalized; // const textureReference* tex;
//tex->filterMode = filterMode; // cudaSafeCall( cudaGetTextureReference(&tex, name) );
//tex->addressMode[0] = addrMode; // cudaSafeCall( cudaUnbindTexture(tex) );
//tex->addressMode[1] = addrMode; //}
const textureReference* tex;
cudaSafeCall( cudaGetTextureReference(&tex, name) );
cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
cudaSafeCall( cudaBindTexture2D(0, tex, img.ptr(), &desc, img.cols, img.rows, img.step) );
}
static inline void unbindTexture(const char *name) //class TextureBinder
{ //{
const textureReference* tex; //public:
cudaSafeCall( cudaGetTextureReference(&tex, name) ); // TextureBinder() : tex_(0) {}
cudaSafeCall( cudaUnbindTexture(tex) ); // template <typename T> TextureBinder(const textureReference* tex, const DevMem2D_<T>& img) : tex_(0)
} // {
// bind(tex, img);
class TextureBinder // }
{ // template <typename T> TextureBinder(const char* tex_name, const DevMem2D_<T>& img) : tex_(0)
public: // {
TextureBinder() : tex_(0) {} // bind(tex_name, img);
template <typename T> TextureBinder(const textureReference* tex, const DevMem2D_<T>& img) : tex_(0) // }
{ // ~TextureBinder() { unbind(); }
bind(tex, img); //
} // template <typename T> void bind(const textureReference* tex, const DevMem2D_<T>& img)
template <typename T> TextureBinder(const char* tex_name, const DevMem2D_<T>& img) : tex_(0) // {
{ // unbind();
bind(tex_name, img); //
} // cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
~TextureBinder() { unbind(); } // cudaSafeCall( cudaBindTexture2D(0, tex, img.ptr(), &desc, img.cols, img.rows, img.step) );
//
template <typename T> void bind(const textureReference* tex, const DevMem2D_<T>& img) // tex_ = tex;
{ // }
unbind(); // template <typename T> void bind(const char* tex_name, const DevMem2D_<T>& img)
// {
cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>(); // const textureReference* tex;
cudaSafeCall( cudaBindTexture2D(0, tex, img.ptr(), &desc, img.cols, img.rows, img.step) ); // cudaSafeCall( cudaGetTextureReference(&tex, tex_name) );
// bind(tex, img);
tex_ = tex; // }
} //
template <typename T> void bind(const char* tex_name, const DevMem2D_<T>& img) // void unbind()
{ // {
const textureReference* tex; // if (tex_)
cudaSafeCall( cudaGetTextureReference(&tex, tex_name) ); // {
bind(tex, img); // cudaUnbindTexture(tex_);
} // tex_ = 0;
// }
void unbind() // }
{ //
if (tex_) //private:
{ // const textureReference* tex_;
cudaUnbindTexture(tex_); //};
tex_ = 0;
}
}
private:
const textureReference* tex_;
};
class NppStreamHandler class NppStreamHandler
{ {
@ -195,8 +213,6 @@ namespace cv
private: private:
cudaStream_t oldStream; cudaStream_t oldStream;
}; };
} }}
}
#endif /* __OPENCV_internal_shared_HPP__ */ #endif /* __OPENCV_internal_shared_HPP__ */
View File
@ -43,10 +43,9 @@
#include "internal_shared.hpp" #include "internal_shared.hpp"
#include "opencv2/gpu/device/vec_math.hpp" #include "opencv2/gpu/device/vec_math.hpp"
using namespace cv::gpu; BEGIN_OPENCV_DEVICE_NAMESPACE
using namespace cv::gpu::device;
namespace cv { namespace gpu { namespace imgproc { namespace match_template {
__device__ __forceinline__ float sum(float v) { return v; } __device__ __forceinline__ float sum(float v) { return v; }
__device__ __forceinline__ float sum(float2 v) { return v.x + v.y; } __device__ __forceinline__ float sum(float2 v) { return v.x + v.y; }
@ -266,9 +265,9 @@ void matchTemplatePrepared_SQDIFF_8U(int w, int h, const DevMem2D_<unsigned long
__device__ float normAcc(float num, float denum) __device__ float normAcc(float num, float denum)
{ {
if (fabs(num) < denum) if (::fabs(num) < denum)
return num / denum; return num / denum;
if (fabs(num) < denum * 1.125f) if (::fabs(num) < denum * 1.125f)
return num > 0 ? 1 : -1; return num > 0 ? 1 : -1;
return 0; return 0;
} }
@ -276,9 +275,9 @@ __device__ float normAcc(float num, float denum)
__device__ float normAcc_SQDIFF(float num, float denum) __device__ float normAcc_SQDIFF(float num, float denum)
{ {
if (fabs(num) < denum) if (::fabs(num) < denum)
return num / denum; return num / denum;
if (fabs(num) < denum * 1.125f) if (::fabs(num) < denum * 1.125f)
return num > 0 ? 1 : -1; return num > 0 ? 1 : -1;
return 1; return 1;
} }
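// Clamping behaviour of the two helpers above (for reference): when |num| < denum the
// normalised ratio num/denum is returned; if rounding pushes |num| up to 12.5% past
// denum the result is clamped to +/-1; anything larger is treated as a degenerate
// window and mapped to the neutral score (0 for normAcc, 1 for normAcc_SQDIFF).
// E.g. normAcc(1.05f, 1.0f) == 1 and normAcc(2.0f, 1.0f) == 0.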
@ -906,4 +905,7 @@ void extractFirstChannel_32F(const DevMem2Db image, DevMem2Df result, int cn, cu
if (stream == 0) if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaDeviceSynchronize() );
} }
}}}
} //namespace match_template
END_OPENCV_DEVICE_NAMESPACE
View File
@ -42,50 +42,46 @@
#include "internal_shared.hpp" #include "internal_shared.hpp"
using namespace cv::gpu; BEGIN_OPENCV_DEVICE_NAMESPACE
#ifndef CV_PI namespace mathfunc {
#define CV_PI 3.1415926535897932384626433832795f
#endif
////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////
// Cart <-> Polar // Cart <-> Polar
namespace cv { namespace gpu { namespace mathfunc struct Nothing
{ {
struct Nothing
{
static __device__ __forceinline__ void calc(int, int, float, float, float*, size_t, float) static __device__ __forceinline__ void calc(int, int, float, float, float*, size_t, float)
{ {
} }
}; };
struct Magnitude struct Magnitude
{ {
static __device__ __forceinline__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float) static __device__ __forceinline__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float)
{ {
dst[y * dst_step + x] = sqrtf(x_data * x_data + y_data * y_data); dst[y * dst_step + x] = ::sqrtf(x_data * x_data + y_data * y_data);
} }
}; };
struct MagnitudeSqr struct MagnitudeSqr
{ {
static __device__ __forceinline__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float) static __device__ __forceinline__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float)
{ {
dst[y * dst_step + x] = x_data * x_data + y_data * y_data; dst[y * dst_step + x] = x_data * x_data + y_data * y_data;
} }
}; };
struct Atan2 struct Atan2
{ {
static __device__ __forceinline__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float scale) static __device__ __forceinline__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float scale)
{ {
float angle = atan2f(y_data, x_data); float angle = ::atan2f(y_data, x_data);
angle += (angle < 0) * 2.0 * CV_PI; angle += (angle < 0) * 2.0 * CV_PI;
dst[y * dst_step + x] = scale * angle; dst[y * dst_step + x] = scale * angle;
} }
}; };
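// Note: ::atan2f returns angles in [-pi, pi]; the (angle < 0) * 2.0 * CV_PI term above
// shifts negative results into [0, 2*pi) before the caller-supplied scale (e.g. a
// radians-to-degrees factor) is applied.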
template <typename Mag, typename Angle> template <typename Mag, typename Angle>
__global__ void cartToPolar(const float* xptr, size_t x_step, const float* yptr, size_t y_step, __global__ void cartToPolar(const float* xptr, size_t x_step, const float* yptr, size_t y_step,
float* mag, size_t mag_step, float* angle, size_t angle_step, float scale, int width, int height) float* mag, size_t mag_step, float* angle, size_t angle_step, float scale, int width, int height)
{ {
const int x = blockDim.x * blockIdx.x + threadIdx.x; const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y; const int y = blockDim.y * blockIdx.y + threadIdx.y;
@ -97,26 +93,26 @@ namespace cv { namespace gpu { namespace mathfunc
Mag::calc(x, y, x_data, y_data, mag, mag_step, scale); Mag::calc(x, y, x_data, y_data, mag, mag_step, scale);
Angle::calc(x, y, x_data, y_data, angle, angle_step, scale); Angle::calc(x, y, x_data, y_data, angle, angle_step, scale);
} }
} }
struct NonEmptyMag struct NonEmptyMag
{ {
static __device__ __forceinline__ float get(const float* mag, size_t mag_step, int x, int y) static __device__ __forceinline__ float get(const float* mag, size_t mag_step, int x, int y)
{ {
return mag[y * mag_step + x]; return mag[y * mag_step + x];
} }
}; };
struct EmptyMag struct EmptyMag
{ {
static __device__ __forceinline__ float get(const float*, size_t, int, int) static __device__ __forceinline__ float get(const float*, size_t, int, int)
{ {
return 1.0f; return 1.0f;
} }
}; };
template <typename Mag> template <typename Mag>
__global__ void polarToCart(const float* mag, size_t mag_step, const float* angle, size_t angle_step, float scale, __global__ void polarToCart(const float* mag, size_t mag_step, const float* angle, size_t angle_step, float scale,
float* xptr, size_t x_step, float* yptr, size_t y_step, int width, int height) float* xptr, size_t x_step, float* yptr, size_t y_step, int width, int height)
{ {
const int x = blockDim.x * blockIdx.x + threadIdx.x; const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y; const int y = blockDim.y * blockIdx.y + threadIdx.y;
@ -126,16 +122,16 @@ namespace cv { namespace gpu { namespace mathfunc
float angle_data = angle[y * angle_step + x]; float angle_data = angle[y * angle_step + x];
float sin_a, cos_a; float sin_a, cos_a;
sincosf(scale * angle_data, &sin_a, &cos_a); ::sincosf(scale * angle_data, &sin_a, &cos_a);
xptr[y * x_step + x] = mag_data * cos_a; xptr[y * x_step + x] = mag_data * cos_a;
yptr[y * y_step + x] = mag_data * sin_a; yptr[y * y_step + x] = mag_data * sin_a;
} }
} }
template <typename Mag, typename Angle> template <typename Mag, typename Angle>
void cartToPolar_caller(const DevMem2Df& x, const DevMem2Df& y, const DevMem2Df& mag, const DevMem2Df& angle, bool angleInDegrees, cudaStream_t stream) void cartToPolar_caller(DevMem2Df x, DevMem2Df y, DevMem2Df mag, DevMem2Df angle, bool angleInDegrees, cudaStream_t stream)
{ {
dim3 threads(32, 8, 1); dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1); dim3 grid(1, 1, 1);
@ -151,11 +147,11 @@ namespace cv { namespace gpu { namespace mathfunc
if (stream == 0) if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaDeviceSynchronize() );
} }
void cartToPolar_gpu(const DevMem2Df& x, const DevMem2Df& y, const DevMem2Df& mag, bool magSqr, const DevMem2Df& angle, bool angleInDegrees, cudaStream_t stream) void cartToPolar_gpu(DevMem2Df x, DevMem2Df y, DevMem2Df mag, bool magSqr, DevMem2Df angle, bool angleInDegrees, cudaStream_t stream)
{ {
typedef void (*caller_t)(const DevMem2Df& x, const DevMem2Df& y, const DevMem2Df& mag, const DevMem2Df& angle, bool angleInDegrees, cudaStream_t stream); typedef void (*caller_t)(DevMem2Df x, DevMem2Df y, DevMem2Df mag, DevMem2Df angle, bool angleInDegrees, cudaStream_t stream);
static const caller_t callers[2][2][2] = static const caller_t callers[2][2][2] =
{ {
{ {
@ -181,11 +177,11 @@ namespace cv { namespace gpu { namespace mathfunc
}; };
callers[mag.data == 0][magSqr][angle.data == 0](x, y, mag, angle, angleInDegrees, stream); callers[mag.data == 0][magSqr][angle.data == 0](x, y, mag, angle, angleInDegrees, stream);
} }
template <typename Mag> template <typename Mag>
void polarToCart_caller(const DevMem2Df& mag, const DevMem2Df& angle, const DevMem2Df& x, const DevMem2Df& y, bool angleInDegrees, cudaStream_t stream) void polarToCart_caller(DevMem2Df mag, DevMem2Df angle, DevMem2Df x, DevMem2Df y, bool angleInDegrees, cudaStream_t stream)
{ {
dim3 threads(32, 8, 1); dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1); dim3 grid(1, 1, 1);
@ -200,11 +196,11 @@ namespace cv { namespace gpu { namespace mathfunc
if (stream == 0) if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaDeviceSynchronize() );
} }
void polarToCart_gpu(const DevMem2Df& mag, const DevMem2Df& angle, const DevMem2Df& x, const DevMem2Df& y, bool angleInDegrees, cudaStream_t stream) void polarToCart_gpu(DevMem2Df mag, DevMem2Df angle, DevMem2Df x, DevMem2Df y, bool angleInDegrees, cudaStream_t stream)
{ {
typedef void (*caller_t)(const DevMem2Df& mag, const DevMem2Df& angle, const DevMem2Df& x, const DevMem2Df& y, bool angleInDegrees, cudaStream_t stream); typedef void (*caller_t)(DevMem2Df mag, DevMem2Df angle, DevMem2Df x, DevMem2Df y, bool angleInDegrees, cudaStream_t stream);
static const caller_t callers[2] = static const caller_t callers[2] =
{ {
polarToCart_caller<NonEmptyMag>, polarToCart_caller<NonEmptyMag>,
@ -212,8 +208,8 @@ namespace cv { namespace gpu { namespace mathfunc
}; };
callers[mag.data == 0](mag, angle, x, y, angleInDegrees, stream); callers[mag.data == 0](mag, angle, x, y, angleInDegrees, stream);
} }
}}}
} // namespace mathfunc
END_OPENCV_DEVICE_NAMESPACE
View File
@ -45,24 +45,24 @@
#include "opencv2/gpu/device/transform.hpp" #include "opencv2/gpu/device/transform.hpp"
#include "opencv2/gpu/device/functional.hpp" #include "opencv2/gpu/device/functional.hpp"
namespace cv { namespace gpu { namespace device { BEGIN_OPENCV_DEVICE_NAMESPACE
template <typename T> struct shift_and_sizeof; template <typename T> struct shift_and_sizeof;
template <> struct shift_and_sizeof<signed char> { enum { shift = 0 }; }; template <> struct shift_and_sizeof<signed char> { enum { shift = 0 }; };
template <> struct shift_and_sizeof<unsigned char> { enum { shift = 0 }; }; template <> struct shift_and_sizeof<unsigned char> { enum { shift = 0 }; };
template <> struct shift_and_sizeof<short> { enum { shift = 1 }; }; template <> struct shift_and_sizeof<short> { enum { shift = 1 }; };
template <> struct shift_and_sizeof<unsigned short> { enum { shift = 1 }; }; template <> struct shift_and_sizeof<unsigned short> { enum { shift = 1 }; };
template <> struct shift_and_sizeof<int> { enum { shift = 2 }; }; template <> struct shift_and_sizeof<int> { enum { shift = 2 }; };
template <> struct shift_and_sizeof<float> { enum { shift = 2 }; }; template <> struct shift_and_sizeof<float> { enum { shift = 2 }; };
template <> struct shift_and_sizeof<double> { enum { shift = 3 }; }; template <> struct shift_and_sizeof<double> { enum { shift = 3 }; };
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
////////////////////////////////// CopyTo ///////////////////////////////// ////////////////////////////////// CopyTo /////////////////////////////////
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
template<typename T> template<typename T>
__global__ void copy_to_with_mask(const T* mat_src, T* mat_dst, const uchar* mask, int cols, int rows, size_t step_mat, size_t step_mask, int channels) __global__ void copy_to_with_mask(const T* mat_src, T* mat_dst, const uchar* mask, int cols, int rows, size_t step_mat, size_t step_mask, int channels)
{ {
size_t x = blockIdx.x * blockDim.x + threadIdx.x; size_t x = blockIdx.x * blockDim.x + threadIdx.x;
size_t y = blockIdx.y * blockDim.y + threadIdx.y; size_t y = blockIdx.y * blockDim.y + threadIdx.y;
@ -72,12 +72,11 @@ namespace cv { namespace gpu { namespace device {
size_t idx = y * ( step_mat >> shift_and_sizeof<T>::shift ) + x; size_t idx = y * ( step_mat >> shift_and_sizeof<T>::shift ) + x;
mat_dst[idx] = mat_src[idx]; mat_dst[idx] = mat_src[idx];
} }
} }
typedef void (*CopyToFunc)(const DevMem2Db& mat_src, const DevMem2Db& mat_dst, const DevMem2Db& mask, int channels, const cudaStream_t & stream);
template<typename T> template<typename T>
void copy_to_with_mask_run(const DevMem2Db& mat_src, const DevMem2Db& mat_dst, const DevMem2Db& mask, int channels, const cudaStream_t & stream) void copy_to_with_mask_run(const DevMem2Db& mat_src, const DevMem2Db& mat_dst, const DevMem2Db& mask, int channels, const cudaStream_t & stream)
{ {
dim3 threadsPerBlock(16,16, 1); dim3 threadsPerBlock(16,16, 1);
dim3 numBlocks ( divUp(mat_src.cols * channels , threadsPerBlock.x) , divUp(mat_src.rows , threadsPerBlock.y), 1); dim3 numBlocks ( divUp(mat_src.cols * channels , threadsPerBlock.x) , divUp(mat_src.rows , threadsPerBlock.y), 1);
@ -87,10 +86,12 @@ namespace cv { namespace gpu { namespace device {
if (stream == 0) if (stream == 0)
cudaSafeCall ( cudaDeviceSynchronize() ); cudaSafeCall ( cudaDeviceSynchronize() );
} }
void copy_to_with_mask(const DevMem2Db& mat_src, DevMem2Db mat_dst, int depth, const DevMem2Db& mask, int channels, const cudaStream_t & stream)
{
typedef void (*CopyToFunc)(const DevMem2Db& mat_src, const DevMem2Db& mat_dst, const DevMem2Db& mask, int channels, const cudaStream_t & stream);
void copy_to_with_mask(const DevMem2Db& mat_src, DevMem2Db mat_dst, int depth, const DevMem2Db& mask, int channels, const cudaStream_t & stream)
{
static CopyToFunc tab[8] = static CopyToFunc tab[8] =
{ {
copy_to_with_mask_run<unsigned char>, copy_to_with_mask_run<unsigned char>,
@ -108,61 +109,61 @@ namespace cv { namespace gpu { namespace device {
if (func == 0) cv::gpu::error("Unsupported copyTo operation", __FILE__, __LINE__); if (func == 0) cv::gpu::error("Unsupported copyTo operation", __FILE__, __LINE__);
func(mat_src, mat_dst, mask, channels, stream); func(mat_src, mat_dst, mask, channels, stream);
} }
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
////////////////////////////////// SetTo ////////////////////////////////// ////////////////////////////////// SetTo //////////////////////////////////
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
__constant__ uchar scalar_8u[4]; __constant__ uchar scalar_8u[4];
__constant__ schar scalar_8s[4]; __constant__ schar scalar_8s[4];
__constant__ ushort scalar_16u[4]; __constant__ ushort scalar_16u[4];
__constant__ short scalar_16s[4]; __constant__ short scalar_16s[4];
__constant__ int scalar_32s[4]; __constant__ int scalar_32s[4];
__constant__ float scalar_32f[4]; __constant__ float scalar_32f[4];
__constant__ double scalar_64f[4]; __constant__ double scalar_64f[4];
template <typename T> __device__ __forceinline__ T readScalar(int i); template <typename T> __device__ __forceinline__ T readScalar(int i);
template <> __device__ __forceinline__ uchar readScalar<uchar>(int i) {return scalar_8u[i];} template <> __device__ __forceinline__ uchar readScalar<uchar>(int i) {return scalar_8u[i];}
template <> __device__ __forceinline__ schar readScalar<schar>(int i) {return scalar_8s[i];} template <> __device__ __forceinline__ schar readScalar<schar>(int i) {return scalar_8s[i];}
template <> __device__ __forceinline__ ushort readScalar<ushort>(int i) {return scalar_16u[i];} template <> __device__ __forceinline__ ushort readScalar<ushort>(int i) {return scalar_16u[i];}
template <> __device__ __forceinline__ short readScalar<short>(int i) {return scalar_16s[i];} template <> __device__ __forceinline__ short readScalar<short>(int i) {return scalar_16s[i];}
template <> __device__ __forceinline__ int readScalar<int>(int i) {return scalar_32s[i];} template <> __device__ __forceinline__ int readScalar<int>(int i) {return scalar_32s[i];}
template <> __device__ __forceinline__ float readScalar<float>(int i) {return scalar_32f[i];} template <> __device__ __forceinline__ float readScalar<float>(int i) {return scalar_32f[i];}
template <> __device__ __forceinline__ double readScalar<double>(int i) {return scalar_64f[i];} template <> __device__ __forceinline__ double readScalar<double>(int i) {return scalar_64f[i];}
void writeScalar(const uchar* vals) void writeScalar(const uchar* vals)
{ {
cudaSafeCall( cudaMemcpyToSymbol(scalar_8u, vals, sizeof(uchar) * 4) ); cudaSafeCall( cudaMemcpyToSymbol(scalar_8u, vals, sizeof(uchar) * 4) );
} }
void writeScalar(const schar* vals) void writeScalar(const schar* vals)
{ {
cudaSafeCall( cudaMemcpyToSymbol(scalar_8s, vals, sizeof(schar) * 4) ); cudaSafeCall( cudaMemcpyToSymbol(scalar_8s, vals, sizeof(schar) * 4) );
} }
void writeScalar(const ushort* vals) void writeScalar(const ushort* vals)
{ {
cudaSafeCall( cudaMemcpyToSymbol(scalar_16u, vals, sizeof(ushort) * 4) ); cudaSafeCall( cudaMemcpyToSymbol(scalar_16u, vals, sizeof(ushort) * 4) );
} }
void writeScalar(const short* vals) void writeScalar(const short* vals)
{ {
cudaSafeCall( cudaMemcpyToSymbol(scalar_16s, vals, sizeof(short) * 4) ); cudaSafeCall( cudaMemcpyToSymbol(scalar_16s, vals, sizeof(short) * 4) );
} }
void writeScalar(const int* vals) void writeScalar(const int* vals)
{ {
cudaSafeCall( cudaMemcpyToSymbol(scalar_32s, vals, sizeof(int) * 4) ); cudaSafeCall( cudaMemcpyToSymbol(scalar_32s, vals, sizeof(int) * 4) );
} }
void writeScalar(const float* vals) void writeScalar(const float* vals)
{ {
cudaSafeCall( cudaMemcpyToSymbol(scalar_32f, vals, sizeof(float) * 4) ); cudaSafeCall( cudaMemcpyToSymbol(scalar_32f, vals, sizeof(float) * 4) );
} }
void writeScalar(const double* vals) void writeScalar(const double* vals)
{ {
cudaSafeCall( cudaMemcpyToSymbol(scalar_64f, vals, sizeof(double) * 4) ); cudaSafeCall( cudaMemcpyToSymbol(scalar_64f, vals, sizeof(double) * 4) );
} }
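// The writeScalar overloads above stage the (up to four) channel values of a scalar in
// type-specific __constant__ arrays; the set_to kernels below read them back with
// readScalar<T>(x % channels). A minimal host-side sketch (values are hypothetical):
//
//     uchar vals[4] = {255, 128, 0, 0};                  // scalar for a 3-channel 8-bit image
//     set_to_gpu<uchar>(mat, vals, /*channels=*/3, 0);   // mat is a DevMem2Db, default stream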
template<typename T> template<typename T>
__global__ void set_to_without_mask(T* mat, int cols, int rows, size_t step, int channels) __global__ void set_to_without_mask(T* mat, int cols, int rows, size_t step, int channels)
{ {
size_t x = blockIdx.x * blockDim.x + threadIdx.x; size_t x = blockIdx.x * blockDim.x + threadIdx.x;
size_t y = blockIdx.y * blockDim.y + threadIdx.y; size_t y = blockIdx.y * blockDim.y + threadIdx.y;
@ -171,11 +172,11 @@ namespace cv { namespace gpu { namespace device {
size_t idx = y * ( step >> shift_and_sizeof<T>::shift ) + x; size_t idx = y * ( step >> shift_and_sizeof<T>::shift ) + x;
mat[idx] = readScalar<T>(x % channels); mat[idx] = readScalar<T>(x % channels);
} }
} }
template<typename T> template<typename T>
__global__ void set_to_with_mask(T* mat, const uchar* mask, int cols, int rows, size_t step, int channels, size_t step_mask) __global__ void set_to_with_mask(T* mat, const uchar* mask, int cols, int rows, size_t step, int channels, size_t step_mask)
{ {
size_t x = blockIdx.x * blockDim.x + threadIdx.x; size_t x = blockIdx.x * blockDim.x + threadIdx.x;
size_t y = blockIdx.y * blockDim.y + threadIdx.y; size_t y = blockIdx.y * blockDim.y + threadIdx.y;
@ -185,10 +186,10 @@ namespace cv { namespace gpu { namespace device {
size_t idx = y * ( step >> shift_and_sizeof<T>::shift ) + x; size_t idx = y * ( step >> shift_and_sizeof<T>::shift ) + x;
mat[idx] = readScalar<T>(x % channels); mat[idx] = readScalar<T>(x % channels);
} }
} }
template <typename T> template <typename T>
void set_to_gpu(const DevMem2Db& mat, const T* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream) void set_to_gpu(const DevMem2Db& mat, const T* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream)
{ {
writeScalar(scalar); writeScalar(scalar);
dim3 threadsPerBlock(32, 8, 1); dim3 threadsPerBlock(32, 8, 1);
@ -199,19 +200,19 @@ namespace cv { namespace gpu { namespace device {
if (stream == 0) if (stream == 0)
cudaSafeCall ( cudaDeviceSynchronize() ); cudaSafeCall ( cudaDeviceSynchronize() );
} }
template void set_to_gpu<uchar >(const DevMem2Db& mat, const uchar* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream); template void set_to_gpu<uchar >(const DevMem2Db& mat, const uchar* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);
template void set_to_gpu<schar >(const DevMem2Db& mat, const schar* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream); template void set_to_gpu<schar >(const DevMem2Db& mat, const schar* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);
template void set_to_gpu<ushort>(const DevMem2Db& mat, const ushort* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream); template void set_to_gpu<ushort>(const DevMem2Db& mat, const ushort* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);
template void set_to_gpu<short >(const DevMem2Db& mat, const short* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream); template void set_to_gpu<short >(const DevMem2Db& mat, const short* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);
template void set_to_gpu<int >(const DevMem2Db& mat, const int* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream); template void set_to_gpu<int >(const DevMem2Db& mat, const int* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);
template void set_to_gpu<float >(const DevMem2Db& mat, const float* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream); template void set_to_gpu<float >(const DevMem2Db& mat, const float* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);
template void set_to_gpu<double>(const DevMem2Db& mat, const double* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream); template void set_to_gpu<double>(const DevMem2Db& mat, const double* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);
template <typename T> template <typename T>
void set_to_gpu(const DevMem2Db& mat, const T* scalar, int channels, cudaStream_t stream) void set_to_gpu(const DevMem2Db& mat, const T* scalar, int channels, cudaStream_t stream)
{ {
writeScalar(scalar); writeScalar(scalar);
dim3 threadsPerBlock(32, 8, 1); dim3 threadsPerBlock(32, 8, 1);
@ -222,22 +223,22 @@ namespace cv { namespace gpu { namespace device {
if (stream == 0) if (stream == 0)
cudaSafeCall ( cudaDeviceSynchronize() ); cudaSafeCall ( cudaDeviceSynchronize() );
} }
template void set_to_gpu<uchar >(const DevMem2Db& mat, const uchar* scalar, int channels, cudaStream_t stream); template void set_to_gpu<uchar >(const DevMem2Db& mat, const uchar* scalar, int channels, cudaStream_t stream);
template void set_to_gpu<schar >(const DevMem2Db& mat, const schar* scalar, int channels, cudaStream_t stream); template void set_to_gpu<schar >(const DevMem2Db& mat, const schar* scalar, int channels, cudaStream_t stream);
template void set_to_gpu<ushort>(const DevMem2Db& mat, const ushort* scalar, int channels, cudaStream_t stream); template void set_to_gpu<ushort>(const DevMem2Db& mat, const ushort* scalar, int channels, cudaStream_t stream);
template void set_to_gpu<short >(const DevMem2Db& mat, const short* scalar, int channels, cudaStream_t stream); template void set_to_gpu<short >(const DevMem2Db& mat, const short* scalar, int channels, cudaStream_t stream);
template void set_to_gpu<int >(const DevMem2Db& mat, const int* scalar, int channels, cudaStream_t stream); template void set_to_gpu<int >(const DevMem2Db& mat, const int* scalar, int channels, cudaStream_t stream);
template void set_to_gpu<float >(const DevMem2Db& mat, const float* scalar, int channels, cudaStream_t stream); template void set_to_gpu<float >(const DevMem2Db& mat, const float* scalar, int channels, cudaStream_t stream);
template void set_to_gpu<double>(const DevMem2Db& mat, const double* scalar, int channels, cudaStream_t stream); template void set_to_gpu<double>(const DevMem2Db& mat, const double* scalar, int channels, cudaStream_t stream);
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
//////////////////////////////// ConvertTo //////////////////////////////// //////////////////////////////// ConvertTo ////////////////////////////////
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
template <typename T, typename D> struct Convertor : unary_function<T, D> template <typename T, typename D> struct Convertor : unary_function<T, D>
{ {
Convertor(double alpha_, double beta_) : alpha(alpha_), beta(beta_) {} Convertor(double alpha_, double beta_) : alpha(alpha_), beta(beta_) {}
__device__ __forceinline__ D operator()(const T& src) const __device__ __forceinline__ D operator()(const T& src) const
@ -246,10 +247,10 @@ namespace cv { namespace gpu { namespace device {
} }
const double alpha, beta; const double alpha, beta;
}; };
namespace detail namespace detail
{ {
template <size_t src_size, size_t dst_size, typename F> struct ConvertTraitsDispatcher : DefaultTransformFunctorTraits<F> template <size_t src_size, size_t dst_size, typename F> struct ConvertTraitsDispatcher : DefaultTransformFunctorTraits<F>
{ {
}; };
@ -290,24 +291,24 @@ namespace cv { namespace gpu { namespace device {
template <typename F> struct ConvertTraits : ConvertTraitsDispatcher<sizeof(typename F::argument_type), sizeof(typename F::result_type), F> template <typename F> struct ConvertTraits : ConvertTraitsDispatcher<sizeof(typename F::argument_type), sizeof(typename F::result_type), F>
{ {
}; };
} }
template <typename T, typename D> struct TransformFunctorTraits< Convertor<T, D> > : detail::ConvertTraits< Convertor<T, D> > template <typename T, typename D> struct TransformFunctorTraits< Convertor<T, D> > : detail::ConvertTraits< Convertor<T, D> >
{ {
}; };
template<typename T, typename D> template<typename T, typename D>
void cvt_(const DevMem2Db& src, const DevMem2Db& dst, double alpha, double beta, cudaStream_t stream) void cvt_(const DevMem2Db& src, const DevMem2Db& dst, double alpha, double beta, cudaStream_t stream)
{ {
cudaSafeCall( cudaSetDoubleForDevice(&alpha) ); cudaSafeCall( cudaSetDoubleForDevice(&alpha) );
cudaSafeCall( cudaSetDoubleForDevice(&beta) ); cudaSafeCall( cudaSetDoubleForDevice(&beta) );
Convertor<T, D> op(alpha, beta); Convertor<T, D> op(alpha, beta);
transform((DevMem2D_<T>)src, (DevMem2D_<D>)dst, op, stream); OPENCV_DEVICE_NAMESPACE_ transform((DevMem2D_<T>)src, (DevMem2D_<D>)dst, op, stream);
} }
void convert_gpu(const DevMem2Db& src, int sdepth, const DevMem2Db& dst, int ddepth, double alpha, double beta, void convert_gpu(const DevMem2Db& src, int sdepth, const DevMem2Db& dst, int ddepth, double alpha, double beta,
cudaStream_t stream = 0) cudaStream_t stream = 0)
{ {
typedef void (*caller_t)(const DevMem2Db& src, const DevMem2Db& dst, double alpha, double beta, typedef void (*caller_t)(const DevMem2Db& src, const DevMem2Db& dst, double alpha, double beta,
cudaStream_t stream); cudaStream_t stream);
@ -342,5 +343,6 @@ namespace cv { namespace gpu { namespace device {
cv::gpu::error("Unsupported convert operation", __FILE__, __LINE__); cv::gpu::error("Unsupported convert operation", __FILE__, __LINE__);
func(src, dst, alpha, beta, stream); func(src, dst, alpha, beta, stream);
} }
}}}
END_OPENCV_DEVICE_NAMESPACE
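// For reference, Convertor<T, D> above implements the usual convertTo semantics,
// presumably dst = saturate_cast<D>(alpha * src + beta), applied element-wise through
// the device transform framework; convert_gpu merely dispatches on the (sdepth, ddepth)
// pair and reports "Unsupported convert operation" via cv::gpu::error otherwise.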
View File
@ -40,22 +40,20 @@
// //
//M*/ //M*/
#include "internal_shared.hpp"
#include "opencv2/gpu/device/limits.hpp" #include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp" #include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/vec_math.hpp" #include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/transform.hpp" #include "opencv2/gpu/device/transform.hpp"
#include "internal_shared.hpp"
using namespace cv::gpu; BEGIN_OPENCV_DEVICE_NAMESPACE
using namespace cv::gpu::device;
namespace cv { namespace gpu { namespace mathfunc namespace matrix_reductions {
// Performs reduction in shared memory
template <int size, typename T>
__device__ void sumInSmem(volatile T* data, const uint tid)
{ {
// Performs reduction in shared memory
template <int size, typename T>
__device__ void sumInSmem(volatile T* data, const uint tid)
{
T sum = data[tid]; T sum = data[tid];
if (size >= 512) { if (tid < 256) { data[tid] = sum = sum + data[tid + 256]; } __syncthreads(); } if (size >= 512) { if (tid < 256) { data[tid] = sum = sum + data[tid + 256]; } __syncthreads(); }
@ -71,11 +69,10 @@ namespace cv { namespace gpu { namespace mathfunc
if (size >= 4) data[tid] = sum = sum + data[tid + 2]; if (size >= 4) data[tid] = sum = sum + data[tid + 2];
if (size >= 2) data[tid] = sum = sum + data[tid + 1]; if (size >= 2) data[tid] = sum = sum + data[tid + 1];
} }
} }
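// sumInSmem above is a standard shared-memory tree reduction: `size` threads co-operate
// on a `size`-element buffer, halving the number of active summands each step until
// data[0] holds the block-wide total; the final intra-warp steps rely on warp-synchronous
// execution instead of __syncthreads(), which is why the buffer is taken as volatile T*.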
struct Mask8U
struct Mask8U {
{
explicit Mask8U(PtrStepb mask): mask(mask) {} explicit Mask8U(PtrStepb mask): mask(mask) {}
__device__ __forceinline__ bool operator()(int y, int x) const __device__ __forceinline__ bool operator()(int y, int x) const
@ -84,35 +81,32 @@ namespace cv { namespace gpu { namespace mathfunc
} }
PtrStepb mask; PtrStepb mask;
}; };
struct MaskTrue
struct MaskTrue {
{
__device__ __forceinline__ bool operator()(int y, int x) const __device__ __forceinline__ bool operator()(int y, int x) const
{ {
return true; return true;
} }
}; };
////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////
// Min max // Min max
// To avoid shared bank conflicts we convert each value into value of // To avoid shared bank conflicts we convert each value into value of
// appropriate type (32 bits minimum) // appropriate type (32 bits minimum)
template <typename T> struct MinMaxTypeTraits {}; template <typename T> struct MinMaxTypeTraits {};
template <> struct MinMaxTypeTraits<uchar> { typedef int best_type; }; template <> struct MinMaxTypeTraits<uchar> { typedef int best_type; };
template <> struct MinMaxTypeTraits<char> { typedef int best_type; }; template <> struct MinMaxTypeTraits<char> { typedef int best_type; };
template <> struct MinMaxTypeTraits<ushort> { typedef int best_type; }; template <> struct MinMaxTypeTraits<ushort> { typedef int best_type; };
template <> struct MinMaxTypeTraits<short> { typedef int best_type; }; template <> struct MinMaxTypeTraits<short> { typedef int best_type; };
template <> struct MinMaxTypeTraits<int> { typedef int best_type; }; template <> struct MinMaxTypeTraits<int> { typedef int best_type; };
template <> struct MinMaxTypeTraits<float> { typedef float best_type; }; template <> struct MinMaxTypeTraits<float> { typedef float best_type; };
template <> struct MinMaxTypeTraits<double> { typedef double best_type; }; template <> struct MinMaxTypeTraits<double> { typedef double best_type; };
namespace minmax
{
namespace minmax
{
__constant__ int ctwidth; __constant__ int ctwidth;
__constant__ int ctheight; __constant__ int ctheight;
@ -126,8 +120,8 @@ namespace cv { namespace gpu { namespace mathfunc
{ {
threads = dim3(32, 8); threads = dim3(32, 8);
grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32)); grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32));
grid.x = min(grid.x, threads.x); grid.x = std::min(grid.x, threads.x);
grid.y = min(grid.y, threads.y); grid.y = std::min(grid.y, threads.y);
} }
@ -155,8 +149,8 @@ namespace cv { namespace gpu { namespace mathfunc
template <typename T> template <typename T>
__device__ __forceinline__ void merge(uint tid, uint offset, volatile T* minval, volatile T* maxval) __device__ __forceinline__ void merge(uint tid, uint offset, volatile T* minval, volatile T* maxval)
{ {
minval[tid] = min(minval[tid], minval[tid + offset]); minval[tid] = ::min(minval[tid], minval[tid + offset]);
maxval[tid] = max(maxval[tid], maxval[tid + offset]); maxval[tid] = ::max(maxval[tid], maxval[tid + offset]);
} }
@ -192,8 +186,8 @@ namespace cv { namespace gpu { namespace mathfunc
T mymin = numeric_limits<T>::max(); T mymin = numeric_limits<T>::max();
T mymax = numeric_limits<T>::is_signed ? -numeric_limits<T>::max() : numeric_limits<T>::min(); T mymax = numeric_limits<T>::is_signed ? -numeric_limits<T>::max() : numeric_limits<T>::min();
uint y_end = min(y0 + (ctheight - 1) * blockDim.y + 1, src.rows); uint y_end = ::min(y0 + (ctheight - 1) * blockDim.y + 1, src.rows);
uint x_end = min(x0 + (ctwidth - 1) * blockDim.x + 1, src.cols); uint x_end = ::min(x0 + (ctwidth - 1) * blockDim.x + 1, src.cols);
for (uint y = y0; y < y_end; y += blockDim.y) for (uint y = y0; y < y_end; y += blockDim.y)
{ {
const T* src_row = (const T*)src.ptr(y); const T* src_row = (const T*)src.ptr(y);
@ -202,8 +196,8 @@ namespace cv { namespace gpu { namespace mathfunc
T val = src_row[x]; T val = src_row[x];
if (mask(y, x)) if (mask(y, x))
{ {
mymin = min(mymin, val); mymin = ::min(mymin, val);
mymax = max(mymax, val); mymax = ::max(mymax, val);
} }
} }
} }
@ -220,7 +214,7 @@ namespace cv { namespace gpu { namespace mathfunc
maxval[blockIdx.y * gridDim.x + blockIdx.x] = (T)smaxval[0]; maxval[blockIdx.y * gridDim.x + blockIdx.x] = (T)smaxval[0];
} }
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110 #if __CUDA_ARCH__ >= 110
__shared__ bool is_last; __shared__ bool is_last;
if (tid == 0) if (tid == 0)
@ -237,7 +231,7 @@ namespace cv { namespace gpu { namespace mathfunc
if (is_last) if (is_last)
{ {
uint idx = min(tid, gridDim.x * gridDim.y - 1); uint idx = ::min(tid, gridDim.x * gridDim.y - 1);
sminval[tid] = minval[idx]; sminval[tid] = minval[idx];
smaxval[tid] = maxval[idx]; smaxval[tid] = maxval[idx];
@ -332,7 +326,7 @@ namespace cv { namespace gpu { namespace mathfunc
__shared__ best_type smaxval[nthreads]; __shared__ best_type smaxval[nthreads];
uint tid = threadIdx.y * blockDim.x + threadIdx.x; uint tid = threadIdx.y * blockDim.x + threadIdx.x;
uint idx = min(tid, size - 1); uint idx = ::min(tid, size - 1);
sminval[tid] = minval[idx]; sminval[tid] = minval[idx];
smaxval[tid] = maxval[idx]; smaxval[tid] = maxval[idx];
@ -410,14 +404,13 @@ namespace cv { namespace gpu { namespace mathfunc
template void minMaxMultipassCaller<short>(const DevMem2Db, double*, double*, PtrStepb); template void minMaxMultipassCaller<short>(const DevMem2Db, double*, double*, PtrStepb);
template void minMaxMultipassCaller<int>(const DevMem2Db, double*, double*, PtrStepb); template void minMaxMultipassCaller<int>(const DevMem2Db, double*, double*, PtrStepb);
template void minMaxMultipassCaller<float>(const DevMem2Db, double*, double*, PtrStepb); template void minMaxMultipassCaller<float>(const DevMem2Db, double*, double*, PtrStepb);
} // namespace minmax
} // namespace minmax
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
// minMaxLoc // minMaxLoc
namespace minmaxloc { namespace minmaxloc
{
__constant__ int ctwidth; __constant__ int ctwidth;
__constant__ int ctheight; __constant__ int ctheight;
@ -431,8 +424,8 @@ namespace cv { namespace gpu { namespace mathfunc
{ {
threads = dim3(32, 8); threads = dim3(32, 8);
grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32)); grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32));
grid.x = min(grid.x, threads.x); grid.x = std::min(grid.x, threads.x);
grid.y = min(grid.y, threads.y); grid.y = std::min(grid.y, threads.y);
} }
@ -513,12 +506,11 @@ namespace cv { namespace gpu { namespace mathfunc
uint tid = threadIdx.y * blockDim.x + threadIdx.x; uint tid = threadIdx.y * blockDim.x + threadIdx.x;
T mymin = numeric_limits<T>::max(); T mymin = numeric_limits<T>::max();
T mymax = numeric_limits<T>::is_signed ? -numeric_limits<T>::max() : T mymax = numeric_limits<T>::is_signed ? -numeric_limits<T>::max() : numeric_limits<T>::min();
numeric_limits<T>::min();
uint myminloc = 0; uint myminloc = 0;
uint mymaxloc = 0; uint mymaxloc = 0;
uint y_end = min(y0 + (ctheight - 1) * blockDim.y + 1, src.rows); uint y_end = ::min(y0 + (ctheight - 1) * blockDim.y + 1, src.rows);
uint x_end = min(x0 + (ctwidth - 1) * blockDim.x + 1, src.cols); uint x_end = ::min(x0 + (ctwidth - 1) * blockDim.x + 1, src.cols);
for (uint y = y0; y < y_end; y += blockDim.y) for (uint y = y0; y < y_end; y += blockDim.y)
{ {
@ -542,7 +534,7 @@ namespace cv { namespace gpu { namespace mathfunc
findMinMaxLocInSmem<nthreads, best_type>(sminval, smaxval, sminloc, smaxloc, tid); findMinMaxLocInSmem<nthreads, best_type>(sminval, smaxval, sminloc, smaxloc, tid);
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110 #if __CUDA_ARCH__ >= 110
__shared__ bool is_last; __shared__ bool is_last;
if (tid == 0) if (tid == 0)
@ -561,7 +553,7 @@ namespace cv { namespace gpu { namespace mathfunc
if (is_last) if (is_last)
{ {
uint idx = min(tid, gridDim.x * gridDim.y - 1); uint idx = ::min(tid, gridDim.x * gridDim.y - 1);
sminval[tid] = minval[idx]; sminval[tid] = minval[idx];
smaxval[tid] = maxval[idx]; smaxval[tid] = maxval[idx];
@ -685,7 +677,7 @@ namespace cv { namespace gpu { namespace mathfunc
__shared__ uint smaxloc[nthreads]; __shared__ uint smaxloc[nthreads];
uint tid = threadIdx.y * blockDim.x + threadIdx.x; uint tid = threadIdx.y * blockDim.x + threadIdx.x;
uint idx = min(tid, size - 1); uint idx = ::min(tid, size - 1);
sminval[tid] = minval[idx]; sminval[tid] = minval[idx];
smaxval[tid] = maxval[idx]; smaxval[tid] = maxval[idx];
@ -787,15 +779,13 @@ namespace cv { namespace gpu { namespace mathfunc
template void minMaxLocMultipassCaller<short>(const DevMem2Db, double*, double*, int[2], int[2], PtrStepb, PtrStepb); template void minMaxLocMultipassCaller<short>(const DevMem2Db, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
template void minMaxLocMultipassCaller<int>(const DevMem2Db, double*, double*, int[2], int[2], PtrStepb, PtrStepb); template void minMaxLocMultipassCaller<int>(const DevMem2Db, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
template void minMaxLocMultipassCaller<float>(const DevMem2Db, double*, double*, int[2], int[2], PtrStepb, PtrStepb); template void minMaxLocMultipassCaller<float>(const DevMem2Db, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
} // namespace minmaxloc
} // namespace minmaxloc
////////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////////
// countNonZero // countNonZero
namespace countnonzero namespace countnonzero
{ {
__constant__ int ctwidth; __constant__ int ctwidth;
__constant__ int ctheight; __constant__ int ctheight;
@ -805,8 +795,8 @@ namespace cv { namespace gpu { namespace mathfunc
{ {
threads = dim3(32, 8); threads = dim3(32, 8);
grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32)); grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32));
grid.x = min(grid.x, threads.x); grid.x = std::min(grid.x, threads.x);
grid.y = min(grid.y, threads.y); grid.y = std::min(grid.y, threads.y);
} }
@ -850,7 +840,7 @@ namespace cv { namespace gpu { namespace mathfunc
sumInSmem<nthreads, uint>(scount, tid); sumInSmem<nthreads, uint>(scount, tid);
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110 #if __CUDA_ARCH__ >= 110
__shared__ bool is_last; __shared__ bool is_last;
if (tid == 0) if (tid == 0)
@ -957,15 +947,14 @@ namespace cv { namespace gpu { namespace mathfunc
template int countNonZeroMultipassCaller<int>(const DevMem2Db, PtrStepb); template int countNonZeroMultipassCaller<int>(const DevMem2Db, PtrStepb);
template int countNonZeroMultipassCaller<float>(const DevMem2Db, PtrStepb); template int countNonZeroMultipassCaller<float>(const DevMem2Db, PtrStepb);
} // namespace countnonzero } // namespace countnonzero
////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////
// Sum // Sum
namespace sums
{
namespace sum
{
template <typename T> struct SumType {}; template <typename T> struct SumType {};
template <> struct SumType<uchar> { typedef uint R; }; template <> struct SumType<uchar> { typedef uint R; };
template <> struct SumType<char> { typedef int R; }; template <> struct SumType<char> { typedef int R; };
@ -979,7 +968,7 @@ namespace cv { namespace gpu { namespace mathfunc
struct IdentityOp { static __device__ __forceinline__ R call(R x) { return x; } }; struct IdentityOp { static __device__ __forceinline__ R call(R x) { return x; } };
template <typename R> template <typename R>
struct AbsOp { static __device__ __forceinline__ R call(R x) { return abs(x); } }; struct AbsOp { static __device__ __forceinline__ R call(R x) { return ::abs(x); } };
template <> template <>
struct AbsOp<uint> { static __device__ __forceinline__ uint call(uint x) { return x; } }; struct AbsOp<uint> { static __device__ __forceinline__ uint call(uint x) { return x; } };
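The SumType trait above widens the accumulator (uchar sums into uint, char into int, and so on) while the Op functors pick the per-element transform; AbsOp<uint> is specialized to the identity because absolute value is a no-op for unsigned values. A plain C++ sketch of the same trait-plus-functor pattern, outside CUDA and using standard types only:

    #include <cstdlib>   // std::abs
    #include <iostream>

    template <typename T> struct SumType {};
    template <> struct SumType<unsigned char> { typedef unsigned int R; };
    template <> struct SumType<int>           { typedef int R; };

    template <typename R> struct IdentityOp { static R call(R x) { return x; } };
    template <typename R> struct AbsOp      { static R call(R x) { return std::abs(x); } };
    template <>           struct AbsOp<unsigned int> { static unsigned int call(unsigned int x) { return x; } };

    template <template <typename> class Op, typename T>
    typename SumType<T>::R accumulate(const T* data, int n)
    {
        typedef typename SumType<T>::R R;
        R r = 0;
        for (int i = 0; i < n; ++i)
            r += Op<R>::call(static_cast<R>(data[i]));   // widen, transform, then add
        return r;
    }

    int main()
    {
        int v[] = { -3, 4, -5 };
        std::cout << accumulate<AbsOp>(v, 3) << '\n';        // 12
        std::cout << accumulate<IdentityOp>(v, 3) << '\n';   // -4
        return 0;
    }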
@ -999,8 +988,8 @@ namespace cv { namespace gpu { namespace mathfunc
threads = dim3(threads_x, threads_y); threads = dim3(threads_x, threads_y);
grid = dim3(divUp(cols, threads.x * threads.y), grid = dim3(divUp(cols, threads.x * threads.y),
divUp(rows, threads.y * threads.x)); divUp(rows, threads.y * threads.x));
grid.x = min(grid.x, threads.x); grid.x = std::min(grid.x, threads.x);
grid.y = min(grid.y, threads.y); grid.y = std::min(grid.y, threads.y);
} }
@ -1044,7 +1033,7 @@ namespace cv { namespace gpu { namespace mathfunc
sumInSmem<nthreads, R>(smem, tid); sumInSmem<nthreads, R>(smem, tid);
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110 #if __CUDA_ARCH__ >= 110
__shared__ bool is_last; __shared__ bool is_last;
if (tid == 0) if (tid == 0)
@ -1125,7 +1114,7 @@ namespace cv { namespace gpu { namespace mathfunc
sumInSmem<nthreads, R>(smem, tid); sumInSmem<nthreads, R>(smem, tid);
sumInSmem<nthreads, R>(smem + nthreads, tid); sumInSmem<nthreads, R>(smem + nthreads, tid);
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110 #if __CUDA_ARCH__ >= 110
__shared__ bool is_last; __shared__ bool is_last;
if (tid == 0) if (tid == 0)
@ -1232,7 +1221,7 @@ namespace cv { namespace gpu { namespace mathfunc
sumInSmem<nthreads, R>(smem + nthreads, tid); sumInSmem<nthreads, R>(smem + nthreads, tid);
sumInSmem<nthreads, R>(smem + 2 * nthreads, tid); sumInSmem<nthreads, R>(smem + 2 * nthreads, tid);
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110 #if __CUDA_ARCH__ >= 110
__shared__ bool is_last; __shared__ bool is_last;
if (tid == 0) if (tid == 0)
@ -1349,7 +1338,7 @@ namespace cv { namespace gpu { namespace mathfunc
sumInSmem<nthreads, R>(smem + 2 * nthreads, tid); sumInSmem<nthreads, R>(smem + 2 * nthreads, tid);
sumInSmem<nthreads, R>(smem + 3 * nthreads, tid); sumInSmem<nthreads, R>(smem + 3 * nthreads, tid);
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110 #if __CUDA_ARCH__ >= 110
__shared__ bool is_last; __shared__ bool is_last;
if (tid == 0) if (tid == 0)
@ -1437,13 +1426,9 @@ namespace cv { namespace gpu { namespace mathfunc
} }
} }
} // namespace sum
template <typename T> template <typename T>
void sumMultipassCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn) void sumMultipassCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn)
{ {
using namespace sums;
typedef typename SumType<T>::R R; typedef typename SumType<T>::R R;
dim3 threads, grid; dim3 threads, grid;
@ -1515,7 +1500,6 @@ namespace cv { namespace gpu { namespace mathfunc
template <typename T> template <typename T>
void sumCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn) void sumCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn)
{ {
using namespace sums;
typedef typename SumType<T>::R R; typedef typename SumType<T>::R R;
dim3 threads, grid; dim3 threads, grid;
@ -1565,7 +1549,6 @@ namespace cv { namespace gpu { namespace mathfunc
template <typename T> template <typename T>
void absSumMultipassCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn) void absSumMultipassCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn)
{ {
using namespace sums;
typedef typename SumType<T>::R R; typedef typename SumType<T>::R R;
dim3 threads, grid; dim3 threads, grid;
@ -1637,7 +1620,6 @@ namespace cv { namespace gpu { namespace mathfunc
template <typename T> template <typename T>
void absSumCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn) void absSumCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn)
{ {
using namespace sums;
typedef typename SumType<T>::R R; typedef typename SumType<T>::R R;
dim3 threads, grid; dim3 threads, grid;
@ -1687,7 +1669,6 @@ namespace cv { namespace gpu { namespace mathfunc
template <typename T> template <typename T>
void sqrSumMultipassCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn) void sqrSumMultipassCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn)
{ {
using namespace sums;
typedef typename SumType<T>::R R; typedef typename SumType<T>::R R;
dim3 threads, grid; dim3 threads, grid;
@ -1759,7 +1740,6 @@ namespace cv { namespace gpu { namespace mathfunc
template <typename T> template <typename T>
void sqrSumCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn) void sqrSumCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn)
{ {
using namespace sums;
typedef typename SumType<T>::R R; typedef typename SumType<T>::R R;
dim3 threads, grid; dim3 threads, grid;
@ -1804,12 +1784,13 @@ namespace cv { namespace gpu { namespace mathfunc
template void sqrSumCaller<short>(const DevMem2Db, PtrStepb, double*, int); template void sqrSumCaller<short>(const DevMem2Db, PtrStepb, double*, int);
template void sqrSumCaller<int>(const DevMem2Db, PtrStepb, double*, int); template void sqrSumCaller<int>(const DevMem2Db, PtrStepb, double*, int);
template void sqrSumCaller<float>(const DevMem2Db, PtrStepb, double*, int); template void sqrSumCaller<float>(const DevMem2Db, PtrStepb, double*, int);
} // namespace sum
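The paired *Caller / *MultipassCaller entry points exist because the single-pass kernels finish the reduction with an inter-block handshake that is only compiled in under __CUDA_ARCH__ >= 110, while older devices need a second launch to fold the per-block partial sums. A hypothetical host-side selector, with made-up names, purely to illustrate the split:

    // Illustrative glue only -- computeSum, hasGlobalAtomics, srcGpu and bufGpu are
    // hypothetical names, not the gpu module's actual host code.
    void computeSum(const DevMem2Db& srcGpu, PtrStepb bufGpu, double* sumOut, int cn, bool hasGlobalAtomics)
    {
        if (hasGlobalAtomics)
            sumCaller<uchar>(srcGpu, bufGpu, sumOut, cn);          // one launch; the last block folds the partials
        else
            sumMultipassCaller<uchar>(srcGpu, bufGpu, sumOut, cn); // launch 1 writes partials, launch 2 folds them
    }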
////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////
// reduce // reduce
template <typename S> struct SumReductor template <typename S> struct SumReductor
{ {
__device__ __forceinline__ S startValue() const __device__ __forceinline__ S startValue() const
{ {
return 0; return 0;
@ -1824,10 +1805,10 @@ namespace cv { namespace gpu { namespace mathfunc
{ {
return r; return r;
} }
}; };
template <typename S> struct AvgReductor template <typename S> struct AvgReductor
{ {
__device__ __forceinline__ S startValue() const __device__ __forceinline__ S startValue() const
{ {
return 0; return 0;
@ -1842,10 +1823,10 @@ namespace cv { namespace gpu { namespace mathfunc
{ {
return r / sz; return r / sz;
} }
}; };
template <typename S> struct MinReductor template <typename S> struct MinReductor
{ {
__device__ __forceinline__ S startValue() const __device__ __forceinline__ S startValue() const
{ {
return numeric_limits<S>::max(); return numeric_limits<S>::max();
@ -1864,10 +1845,10 @@ namespace cv { namespace gpu { namespace mathfunc
{ {
return r; return r;
} }
}; };
template <typename S> struct MaxReductor template <typename S> struct MaxReductor
{ {
__device__ __forceinline__ S startValue() const __device__ __forceinline__ S startValue() const
{ {
return numeric_limits<S>::min(); return numeric_limits<S>::min();
@ -1886,10 +1867,10 @@ namespace cv { namespace gpu { namespace mathfunc
{ {
return r; return r;
} }
}; };
template <class Op, typename T, typename S, typename D> __global__ void reduceRows(const DevMem2D_<T> src, D* dst, const Op op) template <class Op, typename T, typename S, typename D> __global__ void reduceRows(const DevMem2D_<T> src, D* dst, const Op op)
{ {
__shared__ S smem[16 * 16]; __shared__ S smem[16 * 16];
const int x = blockIdx.x * 16 + threadIdx.x; const int x = blockIdx.x * 16 + threadIdx.x;
@ -1917,10 +1898,10 @@ namespace cv { namespace gpu { namespace mathfunc
if (threadIdx.y == 0 && x < src.cols) if (threadIdx.y == 0 && x < src.cols)
dst[x] = saturate_cast<D>(op.result(smem[threadIdx.x * 16], src.rows)); dst[x] = saturate_cast<D>(op.result(smem[threadIdx.x * 16], src.rows));
} }
template <template <typename> class Op, typename T, typename S, typename D> void reduceRows_caller(const DevMem2D_<T>& src, DevMem2D_<D> dst, cudaStream_t stream) template <template <typename> class Op, typename T, typename S, typename D> void reduceRows_caller(const DevMem2D_<T>& src, DevMem2D_<D> dst, cudaStream_t stream)
{ {
const dim3 block(16, 16); const dim3 block(16, 16);
const dim3 grid(divUp(src.cols, block.x)); const dim3 grid(divUp(src.cols, block.x));
@ -1931,10 +1912,10 @@ namespace cv { namespace gpu { namespace mathfunc
if (stream == 0) if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaDeviceSynchronize() );
} }
template <typename T, typename S, typename D> void reduceRows_gpu(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream) template <typename T, typename S, typename D> void reduceRows_gpu(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream)
{ {
typedef void (*caller_t)(const DevMem2D_<T>& src, DevMem2D_<D> dst, cudaStream_t stream); typedef void (*caller_t)(const DevMem2D_<T>& src, DevMem2D_<D> dst, cudaStream_t stream);
static const caller_t callers[] = static const caller_t callers[] =
@ -1946,29 +1927,29 @@ namespace cv { namespace gpu { namespace mathfunc
}; };
callers[reduceOp](static_cast< DevMem2D_<T> >(src), static_cast< DevMem2D_<D> >(dst), stream); callers[reduceOp](static_cast< DevMem2D_<T> >(src), static_cast< DevMem2D_<D> >(dst), stream);
} }
template void reduceRows_gpu<uchar, int, uchar>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream); template void reduceRows_gpu<uchar, int, uchar>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
template void reduceRows_gpu<uchar, int, int>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream); template void reduceRows_gpu<uchar, int, int>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
template void reduceRows_gpu<uchar, int, float>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream); template void reduceRows_gpu<uchar, int, float>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
template void reduceRows_gpu<ushort, int, ushort>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream); template void reduceRows_gpu<ushort, int, ushort>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
template void reduceRows_gpu<ushort, int, int>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream); template void reduceRows_gpu<ushort, int, int>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
template void reduceRows_gpu<ushort, int, float>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream); template void reduceRows_gpu<ushort, int, float>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
template void reduceRows_gpu<short, int, short>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream); template void reduceRows_gpu<short, int, short>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
template void reduceRows_gpu<short, int, int>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream); template void reduceRows_gpu<short, int, int>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
template void reduceRows_gpu<short, int, float>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream); template void reduceRows_gpu<short, int, float>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
template void reduceRows_gpu<int, int, int>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream); template void reduceRows_gpu<int, int, int>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
template void reduceRows_gpu<int, int, float>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream); template void reduceRows_gpu<int, int, float>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
template void reduceRows_gpu<float, float, float>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream); template void reduceRows_gpu<float, float, float>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
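reduceRows_gpu selects one of four instantiations through the callers[] table indexed by reduceOp. The table body is elided in the hunk above; assuming it follows OpenCV's usual CV_REDUCE_SUM / CV_REDUCE_AVG / CV_REDUCE_MAX / CV_REDUCE_MIN ordering, the dispatch amounts to:

    // Sketch of the dispatch idea only; the real table sits in lines the diff does not show.
    typedef void (*caller_t)(const DevMem2D_<float>& src, DevMem2D_<float> dst, cudaStream_t stream);
    static const caller_t callers[] =
    {
        reduceRows_caller<SumReductor, float, float, float>,   // reduceOp == 0 (CV_REDUCE_SUM, assumed)
        reduceRows_caller<AvgReductor, float, float, float>,   // reduceOp == 1 (CV_REDUCE_AVG, assumed)
        reduceRows_caller<MaxReductor, float, float, float>,   // reduceOp == 2 (CV_REDUCE_MAX, assumed)
        reduceRows_caller<MinReductor, float, float, float>    // reduceOp == 3 (CV_REDUCE_MIN, assumed)
    };
    // callers[reduceOp](src, dst, stream);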
template <int cn, class Op, typename T, typename S, typename D> __global__ void reduceCols(const DevMem2D_<T> src, D* dst, const Op op) template <int cn, class Op, typename T, typename S, typename D> __global__ void reduceCols(const DevMem2D_<T> src, D* dst, const Op op)
{ {
__shared__ S smem[256 * cn]; __shared__ S smem[256 * cn];
const int y = blockIdx.x; const int y = blockIdx.x;
@ -2054,10 +2035,10 @@ namespace cv { namespace gpu { namespace mathfunc
if (threadIdx.x < cn) if (threadIdx.x < cn)
dst[y * cn + threadIdx.x] = saturate_cast<D>(op.result(smem[threadIdx.x * 256], src.cols)); dst[y * cn + threadIdx.x] = saturate_cast<D>(op.result(smem[threadIdx.x * 256], src.cols));
} }
template <int cn, template <typename> class Op, typename T, typename S, typename D> void reduceCols_caller(const DevMem2D_<T>& src, DevMem2D_<D> dst, cudaStream_t stream) template <int cn, template <typename> class Op, typename T, typename S, typename D> void reduceCols_caller(const DevMem2D_<T>& src, DevMem2D_<D> dst, cudaStream_t stream)
{ {
const dim3 block(256); const dim3 block(256);
const dim3 grid(src.rows); const dim3 grid(src.rows);
@ -2068,10 +2049,10 @@ namespace cv { namespace gpu { namespace mathfunc
if (stream == 0) if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaDeviceSynchronize() );
} }
template <typename T, typename S, typename D> void reduceCols_gpu(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream) template <typename T, typename S, typename D> void reduceCols_gpu(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream)
{ {
typedef void (*caller_t)(const DevMem2D_<T>& src, DevMem2D_<D> dst, cudaStream_t stream); typedef void (*caller_t)(const DevMem2D_<T>& src, DevMem2D_<D> dst, cudaStream_t stream);
static const caller_t callers[4][4] = static const caller_t callers[4][4] =
@ -2083,22 +2064,25 @@ namespace cv { namespace gpu { namespace mathfunc
}; };
callers[cn - 1][reduceOp](static_cast< DevMem2D_<T> >(src), static_cast< DevMem2D_<D> >(dst), stream); callers[cn - 1][reduceOp](static_cast< DevMem2D_<T> >(src), static_cast< DevMem2D_<D> >(dst), stream);
} }
template void reduceCols_gpu<uchar, int, uchar>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream); template void reduceCols_gpu<uchar, int, uchar>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
template void reduceCols_gpu<uchar, int, int>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream); template void reduceCols_gpu<uchar, int, int>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
template void reduceCols_gpu<uchar, int, float>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream); template void reduceCols_gpu<uchar, int, float>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
template void reduceCols_gpu<ushort, int, ushort>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream); template void reduceCols_gpu<ushort, int, ushort>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
template void reduceCols_gpu<ushort, int, int>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream); template void reduceCols_gpu<ushort, int, int>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
template void reduceCols_gpu<ushort, int, float>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream); template void reduceCols_gpu<ushort, int, float>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
template void reduceCols_gpu<short, int, short>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream); template void reduceCols_gpu<short, int, short>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
template void reduceCols_gpu<short, int, int>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream); template void reduceCols_gpu<short, int, int>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
template void reduceCols_gpu<short, int, float>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream); template void reduceCols_gpu<short, int, float>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
template void reduceCols_gpu<int, int, int>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream); template void reduceCols_gpu<int, int, int>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
template void reduceCols_gpu<int, int, float>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream); template void reduceCols_gpu<int, int, float>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
template void reduceCols_gpu<float, float, float>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream); template void reduceCols_gpu<float, float, float>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
}}}
} // namespace matrix_reductions
END_OPENCV_DEVICE_NAMESPACE
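This file, and every file below, now closes with END_OPENCV_DEVICE_NAMESPACE where the previous revision closed the cv::gpu wrapper namespaces by hand. The macro pair is defined in a device header that is not part of this diff; presumably it expands to the device namespace roughly like this:

    // Assumed expansion -- the real definitions live in an internal gpu device header.
    #define BEGIN_OPENCV_DEVICE_NAMESPACE namespace cv { namespace gpu { namespace device {
    #define END_OPENCV_DEVICE_NAMESPACE   }}}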

View File

@ -46,13 +46,12 @@
#include "opencv2/gpu/device/vec_math.hpp" #include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp" #include "opencv2/gpu/device/saturate_cast.hpp"
using namespace cv::gpu; BEGIN_OPENCV_DEVICE_NAMESPACE
using namespace cv::gpu::device;
namespace cv { namespace gpu { namespace imgproc namespace pyr_down {
{
template <typename T, typename B> __global__ void pyrDown(const PtrStep<T> src, PtrStep<T> dst, const B b, int dst_cols) template <typename T, typename B> __global__ void pyrDown(const PtrStep<T> src, PtrStep<T> dst, const B b, int dst_cols)
{ {
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type value_type; typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type value_type;
const int x = blockIdx.x * blockDim.x + threadIdx.x; const int x = blockIdx.x * blockDim.x + threadIdx.x;
@ -123,10 +122,10 @@ namespace cv { namespace gpu { namespace imgproc
if (dst_x < dst_cols) if (dst_x < dst_cols)
dst.ptr(y)[dst_x] = saturate_cast<T>(sum); dst.ptr(y)[dst_x] = saturate_cast<T>(sum);
} }
} }
template <typename T, template <typename> class B> void pyrDown_caller(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, cudaStream_t stream) template <typename T, template <typename> class B> void pyrDown_caller(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, cudaStream_t stream)
{ {
const dim3 block(256); const dim3 block(256);
const dim3 grid(divUp(src.cols, block.x), dst.rows); const dim3 grid(divUp(src.cols, block.x), dst.rows);
@ -137,10 +136,10 @@ namespace cv { namespace gpu { namespace imgproc
if (stream == 0) if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaDeviceSynchronize() );
} }
template <typename T, int cn> void pyrDown_gpu(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream) template <typename T, int cn> void pyrDown_gpu(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream)
{ {
typedef typename TypeVec<T, cn>::vec_type type; typedef typename TypeVec<T, cn>::vec_type type;
typedef void (*caller_t)(const DevMem2D_<type>& src, const DevMem2D_<type>& dst, cudaStream_t stream); typedef void (*caller_t)(const DevMem2D_<type>& src, const DevMem2D_<type>& dst, cudaStream_t stream);
@ -151,35 +150,38 @@ namespace cv { namespace gpu { namespace imgproc
}; };
callers[borderType](static_cast< DevMem2D_<type> >(src), static_cast< DevMem2D_<type> >(dst), stream); callers[borderType](static_cast< DevMem2D_<type> >(src), static_cast< DevMem2D_<type> >(dst), stream);
} }
template void pyrDown_gpu<uchar, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream); template void pyrDown_gpu<uchar, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<uchar, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream); template void pyrDown_gpu<uchar, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<uchar, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream); template void pyrDown_gpu<uchar, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<uchar, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream); template void pyrDown_gpu<uchar, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<schar, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream); template void pyrDown_gpu<schar, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<schar, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream); template void pyrDown_gpu<schar, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<schar, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream); template void pyrDown_gpu<schar, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<schar, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream); template void pyrDown_gpu<schar, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<ushort, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream); template void pyrDown_gpu<ushort, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<ushort, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream); template void pyrDown_gpu<ushort, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<ushort, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream); template void pyrDown_gpu<ushort, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<ushort, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream); template void pyrDown_gpu<ushort, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<short, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream); template void pyrDown_gpu<short, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<short, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream); template void pyrDown_gpu<short, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<short, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream); template void pyrDown_gpu<short, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<short, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream); template void pyrDown_gpu<short, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<int, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream); template void pyrDown_gpu<int, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<int, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream); template void pyrDown_gpu<int, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<int, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream); template void pyrDown_gpu<int, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<int, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream); template void pyrDown_gpu<int, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<float, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream); template void pyrDown_gpu<float, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<float, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream); template void pyrDown_gpu<float, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<float, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream); template void pyrDown_gpu<float, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrDown_gpu<float, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream); template void pyrDown_gpu<float, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
}}}
} // namespace pyr_down
END_OPENCV_DEVICE_NAMESPACE
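For orientation, pyrDown_caller above launches dim3 block(256) with grid(divUp(src.cols, block.x), dst.rows): grid.x tiles the source columns and grid.y runs one block row per destination row. A quick worked example of the resulting launch shape (a sketch, not library code):

    #include <cstdio>

    static unsigned divUp(unsigned total, unsigned grain) { return (total + grain - 1) / grain; }

    int main()
    {
        const unsigned srcCols = 640, dstRows = 240;   // 640x480 source -> 320x240 pyramid level
        const unsigned blockX  = 256;                  // dim3 block(256)
        std::printf("grid = %u x %u blocks of %u threads\n",
                    divUp(srcCols, blockX), dstRows, blockX);   // "grid = 3 x 240 blocks of 256 threads"
        return 0;
    }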

View File

@ -46,13 +46,12 @@
#include "opencv2/gpu/device/vec_math.hpp" #include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp" #include "opencv2/gpu/device/saturate_cast.hpp"
using namespace cv::gpu; BEGIN_OPENCV_DEVICE_NAMESPACE
using namespace cv::gpu::device;
namespace cv { namespace gpu { namespace imgproc namespace pyr_up {
{
template <typename T, typename B> __global__ void pyrUp(const PtrStep<T> src, DevMem2D_<T> dst, const B b) template <typename T, typename B> __global__ void pyrUp(const PtrStep<T> src, DevMem2D_<T> dst, const B b)
{ {
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type value_type; typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type value_type;
const int x = blockIdx.x * blockDim.x + threadIdx.x; const int x = blockIdx.x * blockDim.x + threadIdx.x;
@ -118,10 +117,10 @@ namespace cv { namespace gpu { namespace imgproc
if (x < dst.cols && y < dst.rows) if (x < dst.cols && y < dst.rows)
dst.ptr(y)[x] = saturate_cast<T>(4.0f * sum); dst.ptr(y)[x] = saturate_cast<T>(4.0f * sum);
} }
template <typename T, template <typename> class B> void pyrUp_caller(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, cudaStream_t stream) template <typename T, template <typename> class B> void pyrUp_caller(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, cudaStream_t stream)
{ {
const dim3 block(16, 16); const dim3 block(16, 16);
const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
@ -132,10 +131,10 @@ namespace cv { namespace gpu { namespace imgproc
if (stream == 0) if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaDeviceSynchronize() );
} }
template <typename T, int cn> void pyrUp_gpu(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream) template <typename T, int cn> void pyrUp_gpu(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream)
{ {
typedef typename TypeVec<T, cn>::vec_type type; typedef typename TypeVec<T, cn>::vec_type type;
typedef void (*caller_t)(const DevMem2D_<type>& src, const DevMem2D_<type>& dst, cudaStream_t stream); typedef void (*caller_t)(const DevMem2D_<type>& src, const DevMem2D_<type>& dst, cudaStream_t stream);
@ -146,35 +145,38 @@ namespace cv { namespace gpu { namespace imgproc
}; };
callers[borderType](static_cast< DevMem2D_<type> >(src), static_cast< DevMem2D_<type> >(dst), stream); callers[borderType](static_cast< DevMem2D_<type> >(src), static_cast< DevMem2D_<type> >(dst), stream);
} }
template void pyrUp_gpu<uchar, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream); template void pyrUp_gpu<uchar, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<uchar, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream); template void pyrUp_gpu<uchar, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<uchar, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream); template void pyrUp_gpu<uchar, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<uchar, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream); template void pyrUp_gpu<uchar, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<schar, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream); template void pyrUp_gpu<schar, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<schar, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream); template void pyrUp_gpu<schar, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<schar, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream); template void pyrUp_gpu<schar, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<schar, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream); template void pyrUp_gpu<schar, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<ushort, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream); template void pyrUp_gpu<ushort, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<ushort, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream); template void pyrUp_gpu<ushort, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<ushort, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream); template void pyrUp_gpu<ushort, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<ushort, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream); template void pyrUp_gpu<ushort, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<short, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream); template void pyrUp_gpu<short, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<short, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream); template void pyrUp_gpu<short, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<short, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream); template void pyrUp_gpu<short, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<short, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream); template void pyrUp_gpu<short, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<int, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream); template void pyrUp_gpu<int, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<int, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream); template void pyrUp_gpu<int, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<int, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream); template void pyrUp_gpu<int, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<int, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream); template void pyrUp_gpu<int, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<float, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream); template void pyrUp_gpu<float, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<float, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream); template void pyrUp_gpu<float, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<float, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream); template void pyrUp_gpu<float, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
template void pyrUp_gpu<float, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream); template void pyrUp_gpu<float, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
}}}
} // namespace pyr_up
END_OPENCV_DEVICE_NAMESPACE
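The saturate_cast<T>(4.0f * sum) at the end of the pyrUp kernel is the usual upsampling gain: with zero-insertion upsampling only a quarter of the Gaussian weight mass lands on real source samples, so the result is scaled back up by 4. A quick numeric check of that factor for the classical 1-4-6-4-1 / 16 kernel, assuming this kernel follows the textbook pyrUp scheme:

    #include <cstdio>

    int main()
    {
        const float w[5] = { 1/16.f, 4/16.f, 6/16.f, 4/16.f, 1/16.f };
        // After zero insertion, even upsampled positions hold real samples; an even-centred
        // output tap therefore hits w[0], w[2], w[4], while an odd-centred tap hits w[1], w[3].
        float evenMass = w[0] + w[2] + w[4];   // 0.5
        float oddMass  = w[1] + w[3];          // 0.5
        // In 2D the mass is the product of the two 1D masses, i.e. 0.25 either way.
        std::printf("1D mass: %.2f / %.2f, 2D mass: %.2f\n",
                    evenMass, oddMass, evenMass * oddMass);
        return 0;
    }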

View File

@ -47,14 +47,12 @@
#include "opencv2/gpu/device/saturate_cast.hpp" #include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/filters.hpp" #include "opencv2/gpu/device/filters.hpp"
using namespace cv::gpu; BEGIN_OPENCV_DEVICE_NAMESPACE
using namespace cv::gpu::device;
namespace cv { namespace gpu { namespace imgproc namespace remap {
{
template <typename Ptr2D, typename T> __global__ void remap(const Ptr2D src, const PtrStepf mapx, const PtrStepf mapy, DevMem2D_<T> dst) template <typename Ptr2D, typename T> __global__ void remap(const Ptr2D src, const PtrStepf mapx, const PtrStepf mapy, DevMem2D_<T> dst)
{ {
const int x = blockDim.x * blockIdx.x + threadIdx.x; const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y; const int y = blockDim.y * blockIdx.y + threadIdx.y;
@ -65,10 +63,10 @@ namespace cv { namespace gpu { namespace imgproc
dst.ptr(y)[x] = saturate_cast<T>(src(ycoo, xcoo)); dst.ptr(y)[x] = saturate_cast<T>(src(ycoo, xcoo));
} }
} }
template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcherStream template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcherStream
{ {
static void call(const DevMem2D_<T>& src, const DevMem2Df& mapx, const DevMem2Df& mapy, const DevMem2D_<T>& dst, static void call(const DevMem2D_<T>& src, const DevMem2Df& mapx, const DevMem2Df& mapy, const DevMem2D_<T>& dst,
const float* borderValue, cudaStream_t stream, int) const float* borderValue, cudaStream_t stream, int)
{ {
@ -84,10 +82,10 @@ namespace cv { namespace gpu { namespace imgproc
remap<<<grid, block, 0, stream>>>(filter_src, mapx, mapy, dst); remap<<<grid, block, 0, stream>>>(filter_src, mapx, mapy, dst);
cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaGetLastError() );
} }
}; };
template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcherNonStream template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcherNonStream
{ {
static void call(const DevMem2D_<T>& src, const DevMem2Df& mapx, const DevMem2Df& mapy, const DevMem2D_<T>& dst, const float* borderValue, int) static void call(const DevMem2D_<T>& src, const DevMem2Df& mapx, const DevMem2Df& mapy, const DevMem2D_<T>& dst, const float* borderValue, int)
{ {
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type; typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type;
@ -104,7 +102,7 @@ namespace cv { namespace gpu { namespace imgproc
cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaDeviceSynchronize() );
} }
}; };
#define OPENCV_GPU_IMPLEMENT_REMAP_TEX(type) \ #define OPENCV_GPU_IMPLEMENT_REMAP_TEX(type) \
texture< type , cudaTextureType2D> tex_remap_ ## type (0, cudaFilterModePoint, cudaAddressModeClamp); \ texture< type , cudaTextureType2D> tex_remap_ ## type (0, cudaFilterModePoint, cudaAddressModeClamp); \
@ -124,7 +122,7 @@ namespace cv { namespace gpu { namespace imgproc
typedef typename TypeVec<float, VecTraits< type >::cn>::vec_type work_type; \ typedef typename TypeVec<float, VecTraits< type >::cn>::vec_type work_type; \
dim3 block(32, cc >= 20 ? 8 : 4); \ dim3 block(32, cc >= 20 ? 8 : 4); \
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \ dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
TextureBinder texHandler(&tex_remap_ ## type , src); \ bindTexture(&tex_remap_ ## type , src); \
tex_remap_ ## type ##_reader texSrc; \ tex_remap_ ## type ##_reader texSrc; \
B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue)); \ B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue)); \
BorderReader< tex_remap_ ## type ##_reader, B<work_type> > brdSrc(texSrc, brd); \ BorderReader< tex_remap_ ## type ##_reader, B<work_type> > brdSrc(texSrc, brd); \
@ -140,7 +138,7 @@ namespace cv { namespace gpu { namespace imgproc
{ \ { \
dim3 block(32, 8); \ dim3 block(32, 8); \
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \ dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
TextureBinder texHandler(&tex_remap_ ## type , src); \ bindTexture(&tex_remap_ ## type , src); \
tex_remap_ ## type ##_reader texSrc; \ tex_remap_ ## type ##_reader texSrc; \
Filter< tex_remap_ ## type ##_reader > filter_src(texSrc); \ Filter< tex_remap_ ## type ##_reader > filter_src(texSrc); \
remap<<<grid, block>>>(filter_src, mapx, mapy, dst); \ remap<<<grid, block>>>(filter_src, mapx, mapy, dst); \
@ -149,34 +147,34 @@ namespace cv { namespace gpu { namespace imgproc
} \ } \
}; };
OPENCV_GPU_IMPLEMENT_REMAP_TEX(uchar) OPENCV_GPU_IMPLEMENT_REMAP_TEX(uchar)
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(uchar2) //OPENCV_GPU_IMPLEMENT_REMAP_TEX(uchar2)
OPENCV_GPU_IMPLEMENT_REMAP_TEX(uchar4) OPENCV_GPU_IMPLEMENT_REMAP_TEX(uchar4)
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(schar) //OPENCV_GPU_IMPLEMENT_REMAP_TEX(schar)
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(char2) //OPENCV_GPU_IMPLEMENT_REMAP_TEX(char2)
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(char4) //OPENCV_GPU_IMPLEMENT_REMAP_TEX(char4)
OPENCV_GPU_IMPLEMENT_REMAP_TEX(ushort) OPENCV_GPU_IMPLEMENT_REMAP_TEX(ushort)
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(ushort2) //OPENCV_GPU_IMPLEMENT_REMAP_TEX(ushort2)
OPENCV_GPU_IMPLEMENT_REMAP_TEX(ushort4) OPENCV_GPU_IMPLEMENT_REMAP_TEX(ushort4)
OPENCV_GPU_IMPLEMENT_REMAP_TEX(short) OPENCV_GPU_IMPLEMENT_REMAP_TEX(short)
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(short2) //OPENCV_GPU_IMPLEMENT_REMAP_TEX(short2)
OPENCV_GPU_IMPLEMENT_REMAP_TEX(short4) OPENCV_GPU_IMPLEMENT_REMAP_TEX(short4)
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(int) //OPENCV_GPU_IMPLEMENT_REMAP_TEX(int)
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(int2) //OPENCV_GPU_IMPLEMENT_REMAP_TEX(int2)
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(int4) //OPENCV_GPU_IMPLEMENT_REMAP_TEX(int4)
OPENCV_GPU_IMPLEMENT_REMAP_TEX(float) OPENCV_GPU_IMPLEMENT_REMAP_TEX(float)
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(float2) //OPENCV_GPU_IMPLEMENT_REMAP_TEX(float2)
OPENCV_GPU_IMPLEMENT_REMAP_TEX(float4) OPENCV_GPU_IMPLEMENT_REMAP_TEX(float4)
#undef OPENCV_GPU_IMPLEMENT_REMAP_TEX #undef OPENCV_GPU_IMPLEMENT_REMAP_TEX
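Both texture code paths in the macro above now call bindTexture(&tex_remap_<type>, src) where the previous revision constructed a scoped TextureBinder object. The helper itself is not shown in this diff; a plausible minimal shape for it, stated here purely as an assumption, is a thin wrapper over cudaBindTexture2D:

    // Assumed helper shape -- relies on the surrounding OpenCV device headers for
    // DevMem2D_ and cudaSafeCall; the real bindTexture may differ.
    template <typename T>
    void bindTexture(const texture<T, cudaTextureType2D>* tex, const DevMem2D_<T>& img)
    {
        cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
        size_t offset = 0;
        // width/height are given in elements, step (pitch) in bytes.
        cudaSafeCall( cudaBindTexture2D(&offset, tex, img.data, &desc, img.cols, img.rows, img.step) );
    }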
template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcher template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcher
{ {
static void call(const DevMem2D_<T>& src, const DevMem2Df& mapx, const DevMem2Df& mapy, const DevMem2D_<T>& dst, static void call(const DevMem2D_<T>& src, const DevMem2Df& mapx, const DevMem2Df& mapy, const DevMem2D_<T>& dst,
const float* borderValue, cudaStream_t stream, int cc) const float* borderValue, cudaStream_t stream, int cc)
{ {
@ -185,11 +183,11 @@ namespace cv { namespace gpu { namespace imgproc
else else
RemapDispatcherStream<Filter, B, T>::call(src, mapx, mapy, dst, borderValue, stream, cc); RemapDispatcherStream<Filter, B, T>::call(src, mapx, mapy, dst, borderValue, stream, cc);
} }
}; };
template <typename T> void remap_gpu(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, template <typename T> void remap_gpu(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation,
int borderMode, const float* borderValue, cudaStream_t stream, int cc) int borderMode, const float* borderValue, cudaStream_t stream, int cc)
{ {
typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D_<T>& dst, typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D_<T>& dst,
const float* borderValue, cudaStream_t stream, int cc); const float* borderValue, cudaStream_t stream, int cc);
@ -219,35 +217,38 @@ namespace cv { namespace gpu { namespace imgproc
}; };
callers[interpolation][borderMode](static_cast< DevMem2D_<T> >(src), xmap, ymap, static_cast< DevMem2D_<T> >(dst), borderValue, stream, cc); callers[interpolation][borderMode](static_cast< DevMem2D_<T> >(src), xmap, ymap, static_cast< DevMem2D_<T> >(dst), borderValue, stream, cc);
} }
template void remap_gpu<uchar >(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc); template void remap_gpu<uchar >(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<uchar2>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc); //template void remap_gpu<uchar2>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void remap_gpu<uchar3>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc); template void remap_gpu<uchar3>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void remap_gpu<uchar4>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc); template void remap_gpu<uchar4>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<schar>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc); //template void remap_gpu<schar>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<char2>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc); //template void remap_gpu<char2>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<char3>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc); //template void remap_gpu<char3>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<char4>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc); //template void remap_gpu<char4>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void remap_gpu<ushort >(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc); template void remap_gpu<ushort >(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<ushort2>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc); //template void remap_gpu<ushort2>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void remap_gpu<ushort3>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc); template void remap_gpu<ushort3>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void remap_gpu<ushort4>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc); template void remap_gpu<ushort4>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void remap_gpu<short >(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc); template void remap_gpu<short >(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<short2>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc); //template void remap_gpu<short2>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void remap_gpu<short3>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc); template void remap_gpu<short3>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void remap_gpu<short4>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc); template void remap_gpu<short4>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<int >(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc); //template void remap_gpu<int >(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<int2>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc); //template void remap_gpu<int2>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<int3>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc); //template void remap_gpu<int3>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<int4>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc); //template void remap_gpu<int4>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void remap_gpu<float >(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc); template void remap_gpu<float >(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<float2>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc); //template void remap_gpu<float2>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void remap_gpu<float3>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc); template void remap_gpu<float3>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void remap_gpu<float4>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc); template void remap_gpu<float4>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
}}}
} // namespace remap
END_OPENCV_DEVICE_NAMESPACE
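remap_gpu ends by indexing a two-dimensional table, callers[interpolation][borderMode], so supporting another interpolation or border mode only means adding a row or column of RemapDispatcher instantiations. A hypothetical host-side call, with made-up variable names and the index values given only as assumptions about the table ordering:

    // Illustrative only: srcWhole, xmap, ymap and dstWhole are hypothetical DevMem2D views.
    const float borderValue[4] = { 0.f, 0.f, 0.f, 0.f };
    remap_gpu<uchar3>(srcWhole, xmap, ymap, dstWhole,
                      /*interpolation*/ 1,   // e.g. bilinear, assuming the usual INTER_* ordering
                      /*borderMode*/    0,   // e.g. constant border, assuming the usual BORDER_* ordering
                      borderValue, /*stream*/ 0, /*cc*/ 20);   // cc picks the 32x8 vs 32x4 block shape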

View File

@ -47,14 +47,12 @@
#include "opencv2/gpu/device/saturate_cast.hpp" #include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/filters.hpp" #include "opencv2/gpu/device/filters.hpp"
using namespace cv::gpu; BEGIN_OPENCV_DEVICE_NAMESPACE
using namespace cv::gpu::device;
namespace cv { namespace gpu { namespace imgproc namespace resize {
{
template <typename Ptr2D, typename T> __global__ void resize(const Ptr2D src, float fx, float fy, DevMem2D_<T> dst) template <typename Ptr2D, typename T> __global__ void resize(const Ptr2D src, float fx, float fy, DevMem2D_<T> dst)
{ {
const int x = blockDim.x * blockIdx.x + threadIdx.x; const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y; const int y = blockDim.y * blockIdx.y + threadIdx.y;
@ -65,9 +63,9 @@ namespace cv { namespace gpu { namespace imgproc
dst.ptr(y)[x] = saturate_cast<T>(src(ycoo, xcoo)); dst.ptr(y)[x] = saturate_cast<T>(src(ycoo, xcoo));
} }
} }
template <typename Ptr2D, typename T> __global__ void resizeNN(const Ptr2D src, float fx, float fy, DevMem2D_<T> dst) template <typename Ptr2D, typename T> __global__ void resizeNN(const Ptr2D src, float fx, float fy, DevMem2D_<T> dst)
{ {
const int x = blockDim.x * blockIdx.x + threadIdx.x; const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y; const int y = blockDim.y * blockIdx.y + threadIdx.y;
@ -78,10 +76,10 @@ namespace cv { namespace gpu { namespace imgproc
dst.ptr(y)[x] = src(__float2int_rd(ycoo), __float2int_rd(xcoo)); dst.ptr(y)[x] = src(__float2int_rd(ycoo), __float2int_rd(xcoo));
} }
} }
template <template <typename> class Filter, typename T> struct ResizeDispatcherStream template <template <typename> class Filter, typename T> struct ResizeDispatcherStream
{ {
static void call(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst, cudaStream_t stream) static void call(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst, cudaStream_t stream)
{ {
dim3 block(32, 8); dim3 block(32, 8);
@ -94,9 +92,9 @@ namespace cv { namespace gpu { namespace imgproc
resize<<<grid, block, 0, stream>>>(filter_src, fx, fy, dst); resize<<<grid, block, 0, stream>>>(filter_src, fx, fy, dst);
cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaGetLastError() );
} }
}; };
template <typename T> struct ResizeDispatcherStream<PointFilter, T> template <typename T> struct ResizeDispatcherStream<PointFilter, T>
{ {
static void call(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst, cudaStream_t stream) static void call(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst, cudaStream_t stream)
{ {
dim3 block(32, 8); dim3 block(32, 8);
@ -108,10 +106,10 @@ namespace cv { namespace gpu { namespace imgproc
resizeNN<<<grid, block, 0, stream>>>(brdSrc, fx, fy, dst); resizeNN<<<grid, block, 0, stream>>>(brdSrc, fx, fy, dst);
cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaGetLastError() );
} }
}; };
template <template <typename> class Filter, typename T> struct ResizeDispatcherNonStream template <template <typename> class Filter, typename T> struct ResizeDispatcherNonStream
{ {
static void call(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst) static void call(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst)
{ {
dim3 block(32, 8); dim3 block(32, 8);
@ -126,9 +124,9 @@ namespace cv { namespace gpu { namespace imgproc
cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaDeviceSynchronize() );
} }
}; };
template <typename T> struct ResizeDispatcherNonStream<PointFilter, T> template <typename T> struct ResizeDispatcherNonStream<PointFilter, T>
{ {
static void call(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst) static void call(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst)
{ {
dim3 block(32, 8); dim3 block(32, 8);
@ -142,7 +140,7 @@ namespace cv { namespace gpu { namespace imgproc
cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaDeviceSynchronize() );
} }
}; };
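The Stream dispatchers above launch asynchronously on the caller's stream, while the NonStream variants launch on the default stream and block with cudaDeviceSynchronize(); ResizeDispatcher further below picks between them depending on whether a stream was supplied. A self-contained sketch of that launch convention with a stand-in kernel (the names here are illustrative, not the OpenCV types):

#include <cuda_runtime.h>
#include <cstdio>

__global__ void scaleKernel(const float* src, float* dst, int n, float s)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < n)
        dst[i] = s * src[i];
}

// Same launch for both paths; only the final synchronization differs.
static void dispatchScale(const float* src, float* dst, int n, float s, cudaStream_t stream)
{
    dim3 block(256);
    dim3 grid((n + block.x - 1) / block.x);              // divUp(n, block.x)

    scaleKernel<<<grid, block, 0, stream>>>(src, dst, n, s);
    if (cudaGetLastError() != cudaSuccess)               // mirrors cudaSafeCall(cudaGetLastError())
        std::printf("kernel launch failed\n");

    if (stream == 0)                                     // blocking path (NonStream dispatchers)
        cudaDeviceSynchronize();                         // with a user stream the caller syncs later
}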
#define OPENCV_GPU_IMPLEMENT_RESIZE_TEX(type) \ #define OPENCV_GPU_IMPLEMENT_RESIZE_TEX(type) \
texture< type , cudaTextureType2D> tex_resize_ ## type (0, cudaFilterModePoint, cudaAddressModeClamp); \ texture< type , cudaTextureType2D> tex_resize_ ## type (0, cudaFilterModePoint, cudaAddressModeClamp); \
@ -161,7 +159,7 @@ namespace cv { namespace gpu { namespace imgproc
{ \ { \
dim3 block(32, 8); \ dim3 block(32, 8); \
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \ dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
TextureBinder texHandler(&tex_resize_ ## type , src); \ bindTexture(&tex_resize_ ## type , src); \
tex_resize_ ## type ##_reader texSrc; \ tex_resize_ ## type ##_reader texSrc; \
Filter< tex_resize_ ## type ##_reader > filter_src(texSrc); \ Filter< tex_resize_ ## type ##_reader > filter_src(texSrc); \
resize<<<grid, block>>>(filter_src, fx, fy, dst); \ resize<<<grid, block>>>(filter_src, fx, fy, dst); \
@ -175,7 +173,7 @@ namespace cv { namespace gpu { namespace imgproc
{ \ { \
dim3 block(32, 8); \ dim3 block(32, 8); \
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \ dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
TextureBinder texHandler(&tex_resize_ ## type , src); \ bindTexture(&tex_resize_ ## type , src); \
tex_resize_ ## type ##_reader texSrc; \ tex_resize_ ## type ##_reader texSrc; \
resizeNN<<<grid, block>>>(texSrc, fx, fy, dst); \ resizeNN<<<grid, block>>>(texSrc, fx, fy, dst); \
cudaSafeCall( cudaGetLastError() ); \ cudaSafeCall( cudaGetLastError() ); \
@ -183,34 +181,34 @@ namespace cv { namespace gpu { namespace imgproc
} \ } \
}; };
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(uchar) OPENCV_GPU_IMPLEMENT_RESIZE_TEX(uchar)
//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(uchar2) //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(uchar2)
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(uchar4) OPENCV_GPU_IMPLEMENT_RESIZE_TEX(uchar4)
//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(schar) //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(schar)
//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(char2) //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(char2)
//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(char4) //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(char4)
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(ushort) OPENCV_GPU_IMPLEMENT_RESIZE_TEX(ushort)
//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(ushort2) //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(ushort2)
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(ushort4) OPENCV_GPU_IMPLEMENT_RESIZE_TEX(ushort4)
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(short) OPENCV_GPU_IMPLEMENT_RESIZE_TEX(short)
//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(short2) //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(short2)
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(short4) OPENCV_GPU_IMPLEMENT_RESIZE_TEX(short4)
//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(int) //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(int)
//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(int2) //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(int2)
//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(int4) //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(int4)
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(float) OPENCV_GPU_IMPLEMENT_RESIZE_TEX(float)
//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(float2) //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(float2)
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(float4) OPENCV_GPU_IMPLEMENT_RESIZE_TEX(float4)
#undef OPENCV_GPU_IMPLEMENT_RESIZE_TEX #undef OPENCV_GPU_IMPLEMENT_RESIZE_TEX
template <template <typename> class Filter, typename T> struct ResizeDispatcher template <template <typename> class Filter, typename T> struct ResizeDispatcher
{ {
static void call(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst, cudaStream_t stream) static void call(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst, cudaStream_t stream)
{ {
if (stream == 0) if (stream == 0)
@ -218,10 +216,10 @@ namespace cv { namespace gpu { namespace imgproc
else else
ResizeDispatcherStream<Filter, T>::call(src, fx, fy, dst, stream); ResizeDispatcherStream<Filter, T>::call(src, fx, fy, dst, stream);
} }
}; };
template <typename T> void resize_gpu(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream) template <typename T> void resize_gpu(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream)
{ {
typedef void (*caller_t)(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst, cudaStream_t stream); typedef void (*caller_t)(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst, cudaStream_t stream);
static const caller_t callers[3] = static const caller_t callers[3] =
@ -230,35 +228,38 @@ namespace cv { namespace gpu { namespace imgproc
}; };
callers[interpolation](static_cast< DevMem2D_<T> >(src), fx, fy, static_cast< DevMem2D_<T> >(dst), stream); callers[interpolation](static_cast< DevMem2D_<T> >(src), fx, fy, static_cast< DevMem2D_<T> >(dst), stream);
} }
template void resize_gpu<uchar >(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream); template void resize_gpu<uchar >(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<uchar2>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream); //template void resize_gpu<uchar2>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
template void resize_gpu<uchar3>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream); template void resize_gpu<uchar3>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
template void resize_gpu<uchar4>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream); template void resize_gpu<uchar4>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<schar>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream); //template void resize_gpu<schar>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<char2>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream); //template void resize_gpu<char2>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<char3>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream); //template void resize_gpu<char3>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<char4>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream); //template void resize_gpu<char4>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
template void resize_gpu<ushort >(const DevMem2Db& src,float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream); template void resize_gpu<ushort >(const DevMem2Db& src,float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<ushort2>(const DevMem2Db& src,float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream); //template void resize_gpu<ushort2>(const DevMem2Db& src,float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
template void resize_gpu<ushort3>(const DevMem2Db& src,float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream); template void resize_gpu<ushort3>(const DevMem2Db& src,float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
template void resize_gpu<ushort4>(const DevMem2Db& src,float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream); template void resize_gpu<ushort4>(const DevMem2Db& src,float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
template void resize_gpu<short >(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream); template void resize_gpu<short >(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<short2>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream); //template void resize_gpu<short2>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
template void resize_gpu<short3>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream); template void resize_gpu<short3>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
template void resize_gpu<short4>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream); template void resize_gpu<short4>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<int >(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream); //template void resize_gpu<int >(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<int2>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream); //template void resize_gpu<int2>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<int3>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream); //template void resize_gpu<int3>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<int4>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream); //template void resize_gpu<int4>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
template void resize_gpu<float >(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream); template void resize_gpu<float >(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<float2>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream); //template void resize_gpu<float2>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
template void resize_gpu<float3>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream); template void resize_gpu<float3>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
template void resize_gpu<float4>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream); template void resize_gpu<float4>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
}}}
} // namespace resize
END_OPENCV_DEVICE_NAMESPACE
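For orientation, the mapping the resize kernels apply per output pixel is the usual back-projection dst(x, y) <- src(x*fx, y*fy); resizeNN rounds the source coordinate down with __float2int_rd, the filtered path samples through the Filter functor. Assuming fx and fy are the src/dst size ratios, a minimal CPU sketch of the nearest-neighbour case looks like this:

#include <vector>

// Nearest-neighbour back-projection (illustrative host version of what resizeNN does).
static std::vector<unsigned char> resizeNearest(const std::vector<unsigned char>& src,
                                                int srcW, int srcH, int dstW, int dstH)
{
    std::vector<unsigned char> dst(dstW * dstH);
    const float fx = static_cast<float>(srcW) / dstW;   // assumed meaning of fx
    const float fy = static_cast<float>(srcH) / dstH;   // assumed meaning of fy

    for (int y = 0; y < dstH; ++y)
        for (int x = 0; x < dstW; ++x)
        {
            const int xs = static_cast<int>(x * fx);    // round down, like __float2int_rd
            const int ys = static_cast<int>(y * fy);
            dst[y * dstW + x] = src[ys * srcW + xs];
        }
    return dst;
}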
@ -47,8 +47,7 @@
#include "opencv2/gpu/device/limits.hpp" #include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/border_interpolate.hpp" #include "opencv2/gpu/device/border_interpolate.hpp"
using namespace cv::gpu; BEGIN_OPENCV_DEVICE_NAMESPACE
using namespace cv::gpu::device;
#define MAX_KERNEL_SIZE 16 #define MAX_KERNEL_SIZE 16
#define BLOCK_DIM_X 16 #define BLOCK_DIM_X 16
@ -56,17 +55,17 @@ using namespace cv::gpu::device;
#define RESULT_STEPS 8 #define RESULT_STEPS 8
#define HALO_STEPS 1 #define HALO_STEPS 1
namespace filter_row namespace row_filter {
__constant__ float c_kernel[MAX_KERNEL_SIZE];
void loadKernel(const float kernel[], int ksize)
{ {
__constant__ float c_kernel[MAX_KERNEL_SIZE];
void loadKernel(const float kernel[], int ksize)
{
cudaSafeCall( cudaMemcpyToSymbol(c_kernel, kernel, ksize * sizeof(float)) ); cudaSafeCall( cudaMemcpyToSymbol(c_kernel, kernel, ksize * sizeof(float)) );
} }
namespace detail namespace detail
{ {
template <typename T, size_t size> struct SmemType template <typename T, size_t size> struct SmemType
{ {
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type smem_t; typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type smem_t;
@ -76,16 +75,16 @@ namespace filter_row
{ {
typedef T smem_t; typedef T smem_t;
}; };
} }
template <typename T> struct SmemType template <typename T> struct SmemType
{ {
typedef typename detail::SmemType<T, sizeof(T)>::smem_t smem_t; typedef typename detail::SmemType<T, sizeof(T)>::smem_t smem_t;
}; };
template <int KERNEL_SIZE, typename T, typename D, typename B> template <int KERNEL_SIZE, typename T, typename D, typename B>
__global__ void linearRowFilter(const DevMem2D_<T> src, PtrStep<D> dst, int anchor, const B b) __global__ void linearRowFilter(const DevMem2D_<T> src, PtrStep<D> dst, int anchor, const B b)
{ {
typedef typename SmemType<T>::smem_t smem_t; typedef typename SmemType<T>::smem_t smem_t;
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t; typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;
@ -133,31 +132,28 @@ namespace filter_row
dst_row[dstX] = saturate_cast<D>(sum); dst_row[dstX] = saturate_cast<D>(sum);
} }
} }
}
} }
namespace cv { namespace gpu { namespace filters template <int ksize, typename T, typename D, template<typename> class B>
void linearRowFilter_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream)
{ {
template <int ksize, typename T, typename D, template<typename> class B> typedef typename SmemType<T>::smem_t smem_t;
void linearRowFilter_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream)
{
typedef typename filter_row::SmemType<T>::smem_t smem_t;
const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y); const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y);
const dim3 grid(divUp(src.cols, RESULT_STEPS * BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y)); const dim3 grid(divUp(src.cols, RESULT_STEPS * BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y));
B<smem_t> b(src.cols); B<smem_t> b(src.cols);
filter_row::linearRowFilter<ksize, T, D><<<grid, block, 0, stream>>>(src, dst, anchor, b); linearRowFilter<ksize, T, D><<<grid, block, 0, stream>>>(src, dst, anchor, b);
cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaGetLastError() );
if (stream == 0) if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaDeviceSynchronize() );
} }
template <typename T, typename D> template <typename T, typename D>
void linearRowFilter_gpu(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream) void linearRowFilter_gpu(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream)
{ {
typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream); typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream);
static const caller_t callers[5][17] = static const caller_t callers[5][17] =
{ {
@ -258,16 +254,19 @@ namespace cv { namespace gpu { namespace filters
} }
}; };
filter_row::loadKernel(kernel, ksize); loadKernel(kernel, ksize);
callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor, stream); callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor, stream);
} }
template void linearRowFilter_gpu<uchar , float >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); template void linearRowFilter_gpu<uchar , float >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
template void linearRowFilter_gpu<uchar4, float4>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); template void linearRowFilter_gpu<uchar4, float4>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
//template void linearRowFilter_gpu<short , float >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); //template void linearRowFilter_gpu<short , float >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
//template void linearRowFilter_gpu<short2, float2>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); //template void linearRowFilter_gpu<short2, float2>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
template void linearRowFilter_gpu<short3, float3>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); template void linearRowFilter_gpu<short3, float3>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
template void linearRowFilter_gpu<int , float >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); template void linearRowFilter_gpu<int , float >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
template void linearRowFilter_gpu<float , float >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); template void linearRowFilter_gpu<float , float >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
}}}
} // namespace row_filter
END_OPENCV_DEVICE_NAMESPACE
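linearRowFilter_gpu first copies the 1-D kernel into __constant__ memory via loadKernel() and then dispatches through the callers[border type][kernel size] table, so ksize is bounded by MAX_KERNEL_SIZE (16). A host-side sketch of preparing such a kernel, here a normalized Gaussian chosen purely for illustration:

#include <cmath>
#include <vector>

// Build a normalized 1-D Gaussian of odd size <= MAX_KERNEL_SIZE; an array like this is
// what loadKernel() copies into c_kernel[] before the row-filter launch.
static std::vector<float> gaussianKernel1D(int ksize, float sigma)
{
    std::vector<float> k(ksize);
    const int anchor = ksize / 2;
    float sum = 0.f;
    for (int i = 0; i < ksize; ++i)
    {
        const float x = static_cast<float>(i - anchor);
        k[i] = std::exp(-x * x / (2.f * sigma * sigma));
        sum += k[i];
    }
    for (int i = 0; i < ksize; ++i)
        k[i] /= sum;                    // normalize so the filter preserves brightness
    return k;
}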
@ -43,9 +43,9 @@
#ifndef __OPENCV_CUDA_SAFE_CALL_HPP__ #ifndef __OPENCV_CUDA_SAFE_CALL_HPP__
#define __OPENCV_CUDA_SAFE_CALL_HPP__ #define __OPENCV_CUDA_SAFE_CALL_HPP__
#include "cuda_runtime_api.h" #include <cuda_runtime_api.h>
#include "cufft.h" #include <cufft.h>
#include "cublas.h" #include <cublas.h>
#include "NCV.hpp" #include "NCV.hpp"
#if defined(__GNUC__) #if defined(__GNUC__)
@ -62,46 +62,44 @@
#define cublasSafeCall(expr) ___cublasSafeCall(expr, __FILE__, __LINE__) #define cublasSafeCall(expr) ___cublasSafeCall(expr, __FILE__, __LINE__)
#endif #endif
namespace cv namespace cv { namespace gpu {
{
namespace gpu
{
void error(const char *error_string, const char *file, const int line, const char *func = "");
void nppError(int err, const char *file, const int line, const char *func = "");
void ncvError(int err, const char *file, const int line, const char *func = "");
void cufftError(int err, const char *file, const int line, const char *func = "");
void cublasError(int err, const char *file, const int line, const char *func = "");
static inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "") void error(const char *error_string, const char *file, const int line, const char *func = "");
{ void nppError(int err, const char *file, const int line, const char *func = "");
void ncvError(int err, const char *file, const int line, const char *func = "");
void cufftError(int err, const char *file, const int line, const char *func = "");
void cublasError(int err, const char *file, const int line, const char *func = "");
static inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "")
{
if (cudaSuccess != err) if (cudaSuccess != err)
cv::gpu::error(cudaGetErrorString(err), file, line, func); cv::gpu::error(cudaGetErrorString(err), file, line, func);
}
static inline void ___nppSafeCall(int err, const char *file, const int line, const char *func = "")
{
if (err < 0)
cv::gpu::nppError(err, file, line, func);
}
static inline void ___ncvSafeCall(int err, const char *file, const int line, const char *func = "")
{
if (NCV_SUCCESS != err)
cv::gpu::ncvError(err, file, line, func);
}
static inline void ___cufftSafeCall(cufftResult_t err, const char *file, const int line, const char *func = "")
{
if (CUFFT_SUCCESS != err)
cv::gpu::cufftError(err, file, line, func);
}
static inline void ___cublasSafeCall(cublasStatus_t err, const char *file, const int line, const char *func = "")
{
if (CUBLAS_STATUS_SUCCESS != err)
cv::gpu::cublasError(err, file, line, func);
}
}
} }
static inline void ___nppSafeCall(int err, const char *file, const int line, const char *func = "")
{
if (err < 0)
cv::gpu::nppError(err, file, line, func);
}
static inline void ___ncvSafeCall(int err, const char *file, const int line, const char *func = "")
{
if (NCV_SUCCESS != err)
cv::gpu::ncvError(err, file, line, func);
}
static inline void ___cufftSafeCall(cufftResult_t err, const char *file, const int line, const char *func = "")
{
if (CUFFT_SUCCESS != err)
cv::gpu::cufftError(err, file, line, func);
}
static inline void ___cublasSafeCall(cublasStatus_t err, const char *file, const int line, const char *func = "")
{
if (CUBLAS_STATUS_SUCCESS != err)
cv::gpu::cublasError(err, file, line, func);
}
}}
#endif /* __OPENCV_CUDA_SAFE_CALL_HPP__ */ #endif /* __OPENCV_CUDA_SAFE_CALL_HPP__ */
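The *SafeCall macros exist so that every CUDA, NPP, NCV, CUFFT and CUBLAS status code is funnelled through a single error routine together with __FILE__ and __LINE__. A stripped-down, self-contained version of the same idiom:

#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>

// Minimal stand-in for cv::gpu::error(): report and abort.
static void reportCudaError(cudaError_t err, const char* file, int line)
{
    std::fprintf(stderr, "CUDA error '%s' at %s:%d\n", cudaGetErrorString(err), file, line);
    std::exit(EXIT_FAILURE);
}

static inline void checkCuda(cudaError_t err, const char* file, int line)
{
    if (err != cudaSuccess)
        reportCudaError(err, file, line);
}

#define CUDA_SAFE_CALL(expr) checkCuda((expr), __FILE__, __LINE__)

int main()
{
    void* p = 0;
    CUDA_SAFE_CALL( cudaMalloc(&p, 1024) );   // a failing call is reported with file and line
    CUDA_SAFE_CALL( cudaFree(p) );
    return 0;
}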
@ -42,64 +42,66 @@
#include "internal_shared.hpp" #include "internal_shared.hpp"
namespace cv { namespace gpu { namespace split_merge { BEGIN_OPENCV_DEVICE_NAMESPACE
template <typename T, size_t elem_size = sizeof(T)> namespace split_merge {
struct TypeTraits
{ template <typename T, size_t elem_size = sizeof(T)>
struct TypeTraits
{
typedef T type; typedef T type;
typedef T type2; typedef T type2;
typedef T type3; typedef T type3;
typedef T type4; typedef T type4;
}; };
template <typename T> template <typename T>
struct TypeTraits<T, 1> struct TypeTraits<T, 1>
{ {
typedef char type; typedef char type;
typedef char2 type2; typedef char2 type2;
typedef char3 type3; typedef char3 type3;
typedef char4 type4; typedef char4 type4;
}; };
template <typename T> template <typename T>
struct TypeTraits<T, 2> struct TypeTraits<T, 2>
{ {
typedef short type; typedef short type;
typedef short2 type2; typedef short2 type2;
typedef short3 type3; typedef short3 type3;
typedef short4 type4; typedef short4 type4;
}; };
template <typename T> template <typename T>
struct TypeTraits<T, 4> struct TypeTraits<T, 4>
{ {
typedef int type; typedef int type;
typedef int2 type2; typedef int2 type2;
typedef int3 type3; typedef int3 type3;
typedef int4 type4; typedef int4 type4;
}; };
template <typename T> template <typename T>
struct TypeTraits<T, 8> struct TypeTraits<T, 8>
{ {
typedef double type; typedef double type;
typedef double2 type2; typedef double2 type2;
//typedef double3 type3; //typedef double3 type3;
//typedef double4 type3; //typedef double4 type3;
}; };
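TypeTraits reduces every element type to a canonical type of the same size (1 -> char, 2 -> short, 4 -> int, 8 -> double) plus its 2/3/4-channel CUDA vector forms, so one kernel instantiation per element size serves all types of that size. For instance, relying only on the definitions above:

// sizeof(float) == 4, so the merge/split kernels treat float data as int-sized:
typedef TypeTraits<float>::type4 quad_t;            // resolves to int4
typedef TypeTraits<unsigned short>::type2 pair_t;   // sizeof == 2, resolves to short2
// mergeC2_<int> therefore also covers float input, and mergeC2_<short> covers ushort.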
typedef void (*MergeFunction)(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream); typedef void (*MergeFunction)(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream);
typedef void (*SplitFunction)(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream); typedef void (*SplitFunction)(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream);
//------------------------------------------------------------ //------------------------------------------------------------
// Merge // Merge
template <typename T> template <typename T>
__global__ void mergeC2_(const uchar* src0, size_t src0_step, __global__ void mergeC2_(const uchar* src0, size_t src0_step,
const uchar* src1, size_t src1_step, const uchar* src1, size_t src1_step,
int rows, int cols, uchar* dst, size_t dst_step) int rows, int cols, uchar* dst, size_t dst_step)
{ {
typedef typename TypeTraits<T>::type2 dst_type; typedef typename TypeTraits<T>::type2 dst_type;
const int x = blockIdx.x * blockDim.x + threadIdx.x; const int x = blockIdx.x * blockDim.x + threadIdx.x;
@ -116,15 +118,15 @@ namespace cv { namespace gpu { namespace split_merge {
dst_elem.y = src1_y[x]; dst_elem.y = src1_y[x];
dst_y[x] = dst_elem; dst_y[x] = dst_elem;
} }
} }
template <typename T> template <typename T>
__global__ void mergeC3_(const uchar* src0, size_t src0_step, __global__ void mergeC3_(const uchar* src0, size_t src0_step,
const uchar* src1, size_t src1_step, const uchar* src1, size_t src1_step,
const uchar* src2, size_t src2_step, const uchar* src2, size_t src2_step,
int rows, int cols, uchar* dst, size_t dst_step) int rows, int cols, uchar* dst, size_t dst_step)
{ {
typedef typename TypeTraits<T>::type3 dst_type; typedef typename TypeTraits<T>::type3 dst_type;
const int x = blockIdx.x * blockDim.x + threadIdx.x; const int x = blockIdx.x * blockDim.x + threadIdx.x;
@ -143,15 +145,15 @@ namespace cv { namespace gpu { namespace split_merge {
dst_elem.z = src2_y[x]; dst_elem.z = src2_y[x];
dst_y[x] = dst_elem; dst_y[x] = dst_elem;
} }
} }
template <> template <>
__global__ void mergeC3_<double>(const uchar* src0, size_t src0_step, __global__ void mergeC3_<double>(const uchar* src0, size_t src0_step,
const uchar* src1, size_t src1_step, const uchar* src1, size_t src1_step,
const uchar* src2, size_t src2_step, const uchar* src2, size_t src2_step,
int rows, int cols, uchar* dst, size_t dst_step) int rows, int cols, uchar* dst, size_t dst_step)
{ {
const int x = blockIdx.x * blockDim.x + threadIdx.x; const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y; const int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -166,16 +168,16 @@ namespace cv { namespace gpu { namespace split_merge {
dst_y[3 * x + 1] = src1_y[x]; dst_y[3 * x + 1] = src1_y[x];
dst_y[3 * x + 2] = src2_y[x]; dst_y[3 * x + 2] = src2_y[x];
} }
} }
template <typename T> template <typename T>
__global__ void mergeC4_(const uchar* src0, size_t src0_step, __global__ void mergeC4_(const uchar* src0, size_t src0_step,
const uchar* src1, size_t src1_step, const uchar* src1, size_t src1_step,
const uchar* src2, size_t src2_step, const uchar* src2, size_t src2_step,
const uchar* src3, size_t src3_step, const uchar* src3, size_t src3_step,
int rows, int cols, uchar* dst, size_t dst_step) int rows, int cols, uchar* dst, size_t dst_step)
{ {
typedef typename TypeTraits<T>::type4 dst_type; typedef typename TypeTraits<T>::type4 dst_type;
const int x = blockIdx.x * blockDim.x + threadIdx.x; const int x = blockIdx.x * blockDim.x + threadIdx.x;
@ -196,16 +198,16 @@ namespace cv { namespace gpu { namespace split_merge {
dst_elem.w = src3_y[x]; dst_elem.w = src3_y[x];
dst_y[x] = dst_elem; dst_y[x] = dst_elem;
} }
} }
template <> template <>
__global__ void mergeC4_<double>(const uchar* src0, size_t src0_step, __global__ void mergeC4_<double>(const uchar* src0, size_t src0_step,
const uchar* src1, size_t src1_step, const uchar* src1, size_t src1_step,
const uchar* src2, size_t src2_step, const uchar* src2, size_t src2_step,
const uchar* src3, size_t src3_step, const uchar* src3, size_t src3_step,
int rows, int cols, uchar* dst, size_t dst_step) int rows, int cols, uchar* dst, size_t dst_step)
{ {
const int x = blockIdx.x * blockDim.x + threadIdx.x; const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y; const int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -220,12 +222,12 @@ namespace cv { namespace gpu { namespace split_merge {
dst_y[2 * x] = make_double2(src0_y[x], src1_y[x]); dst_y[2 * x] = make_double2(src0_y[x], src1_y[x]);
dst_y[2 * x + 1] = make_double2(src2_y[x], src3_y[x]); dst_y[2 * x + 1] = make_double2(src2_y[x], src3_y[x]);
} }
} }
template <typename T> template <typename T>
static void mergeC2_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream) static void mergeC2_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream)
{ {
dim3 blockDim(32, 8); dim3 blockDim(32, 8);
dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y)); dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));
mergeC2_<T><<<gridDim, blockDim, 0, stream>>>( mergeC2_<T><<<gridDim, blockDim, 0, stream>>>(
@ -236,12 +238,12 @@ namespace cv { namespace gpu { namespace split_merge {
if (stream == 0) if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize()); cudaSafeCall(cudaDeviceSynchronize());
} }
template <typename T> template <typename T>
static void mergeC3_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream) static void mergeC3_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream)
{ {
dim3 blockDim(32, 8); dim3 blockDim(32, 8);
dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y)); dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));
mergeC3_<T><<<gridDim, blockDim, 0, stream>>>( mergeC3_<T><<<gridDim, blockDim, 0, stream>>>(
@ -253,12 +255,12 @@ namespace cv { namespace gpu { namespace split_merge {
if (stream == 0) if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize()); cudaSafeCall(cudaDeviceSynchronize());
} }
template <typename T> template <typename T>
static void mergeC4_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream) static void mergeC4_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream)
{ {
dim3 blockDim(32, 8); dim3 blockDim(32, 8);
dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y)); dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));
mergeC4_<T><<<gridDim, blockDim, 0, stream>>>( mergeC4_<T><<<gridDim, blockDim, 0, stream>>>(
@ -271,13 +273,13 @@ namespace cv { namespace gpu { namespace split_merge {
if (stream == 0) if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize()); cudaSafeCall(cudaDeviceSynchronize());
} }
extern "C" void merge_caller(const DevMem2Db* src, DevMem2Db& dst, void merge_caller(const DevMem2Db* src, DevMem2Db& dst,
int total_channels, size_t elem_size, int total_channels, size_t elem_size,
const cudaStream_t& stream) const cudaStream_t& stream)
{ {
static MergeFunction merge_func_tbl[] = static MergeFunction merge_func_tbl[] =
{ {
mergeC2_<char>, mergeC2_<short>, mergeC2_<int>, 0, mergeC2_<double>, mergeC2_<char>, mergeC2_<short>, mergeC2_<int>, 0, mergeC2_<double>,
@ -292,20 +294,20 @@ namespace cv { namespace gpu { namespace split_merge {
cv::gpu::error("Unsupported channel count or data type", __FILE__, __LINE__); cv::gpu::error("Unsupported channel count or data type", __FILE__, __LINE__);
merge_func(src, dst, stream); merge_func(src, dst, stream);
} }
//------------------------------------------------------------ //------------------------------------------------------------
// Split // Split
template <typename T> template <typename T>
__global__ void splitC2_(const uchar* src, size_t src_step, __global__ void splitC2_(const uchar* src, size_t src_step,
int rows, int cols, int rows, int cols,
uchar* dst0, size_t dst0_step, uchar* dst0, size_t dst0_step,
uchar* dst1, size_t dst1_step) uchar* dst1, size_t dst1_step)
{ {
typedef typename TypeTraits<T>::type2 src_type; typedef typename TypeTraits<T>::type2 src_type;
const int x = blockIdx.x * blockDim.x + threadIdx.x; const int x = blockIdx.x * blockDim.x + threadIdx.x;
@ -321,16 +323,16 @@ namespace cv { namespace gpu { namespace split_merge {
dst0_y[x] = src_elem.x; dst0_y[x] = src_elem.x;
dst1_y[x] = src_elem.y; dst1_y[x] = src_elem.y;
} }
} }
template <typename T> template <typename T>
__global__ void splitC3_(const uchar* src, size_t src_step, __global__ void splitC3_(const uchar* src, size_t src_step,
int rows, int cols, int rows, int cols,
uchar* dst0, size_t dst0_step, uchar* dst0, size_t dst0_step,
uchar* dst1, size_t dst1_step, uchar* dst1, size_t dst1_step,
uchar* dst2, size_t dst2_step) uchar* dst2, size_t dst2_step)
{ {
typedef typename TypeTraits<T>::type3 src_type; typedef typename TypeTraits<T>::type3 src_type;
const int x = blockIdx.x * blockDim.x + threadIdx.x; const int x = blockIdx.x * blockDim.x + threadIdx.x;
@ -348,16 +350,16 @@ namespace cv { namespace gpu { namespace split_merge {
dst1_y[x] = src_elem.y; dst1_y[x] = src_elem.y;
dst2_y[x] = src_elem.z; dst2_y[x] = src_elem.z;
} }
} }
template <> template <>
__global__ void splitC3_<double>( __global__ void splitC3_<double>(
const uchar* src, size_t src_step, int rows, int cols, const uchar* src, size_t src_step, int rows, int cols,
uchar* dst0, size_t dst0_step, uchar* dst0, size_t dst0_step,
uchar* dst1, size_t dst1_step, uchar* dst1, size_t dst1_step,
uchar* dst2, size_t dst2_step) uchar* dst2, size_t dst2_step)
{ {
const int x = blockIdx.x * blockDim.x + threadIdx.x; const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y; const int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -372,16 +374,16 @@ namespace cv { namespace gpu { namespace split_merge {
dst1_y[x] = src_y[3 * x + 1]; dst1_y[x] = src_y[3 * x + 1];
dst2_y[x] = src_y[3 * x + 2]; dst2_y[x] = src_y[3 * x + 2];
} }
} }
template <typename T> template <typename T>
__global__ void splitC4_(const uchar* src, size_t src_step, int rows, int cols, __global__ void splitC4_(const uchar* src, size_t src_step, int rows, int cols,
uchar* dst0, size_t dst0_step, uchar* dst0, size_t dst0_step,
uchar* dst1, size_t dst1_step, uchar* dst1, size_t dst1_step,
uchar* dst2, size_t dst2_step, uchar* dst2, size_t dst2_step,
uchar* dst3, size_t dst3_step) uchar* dst3, size_t dst3_step)
{ {
typedef typename TypeTraits<T>::type4 src_type; typedef typename TypeTraits<T>::type4 src_type;
const int x = blockIdx.x * blockDim.x + threadIdx.x; const int x = blockIdx.x * blockDim.x + threadIdx.x;
@ -401,17 +403,17 @@ namespace cv { namespace gpu { namespace split_merge {
dst2_y[x] = src_elem.z; dst2_y[x] = src_elem.z;
dst3_y[x] = src_elem.w; dst3_y[x] = src_elem.w;
} }
} }
template <> template <>
__global__ void splitC4_<double>( __global__ void splitC4_<double>(
const uchar* src, size_t src_step, int rows, int cols, const uchar* src, size_t src_step, int rows, int cols,
uchar* dst0, size_t dst0_step, uchar* dst0, size_t dst0_step,
uchar* dst1, size_t dst1_step, uchar* dst1, size_t dst1_step,
uchar* dst2, size_t dst2_step, uchar* dst2, size_t dst2_step,
uchar* dst3, size_t dst3_step) uchar* dst3, size_t dst3_step)
{ {
const int x = blockIdx.x * blockDim.x + threadIdx.x; const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y; const int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -430,11 +432,11 @@ namespace cv { namespace gpu { namespace split_merge {
dst2_y[x] = src_elem2.x; dst2_y[x] = src_elem2.x;
dst3_y[x] = src_elem2.y; dst3_y[x] = src_elem2.y;
} }
} }
template <typename T> template <typename T>
static void splitC2_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream) static void splitC2_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream)
{ {
dim3 blockDim(32, 8); dim3 blockDim(32, 8);
dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y)); dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));
splitC2_<T><<<gridDim, blockDim, 0, stream>>>( splitC2_<T><<<gridDim, blockDim, 0, stream>>>(
@ -445,12 +447,12 @@ namespace cv { namespace gpu { namespace split_merge {
if (stream == 0) if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize()); cudaSafeCall(cudaDeviceSynchronize());
} }
template <typename T> template <typename T>
static void splitC3_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream) static void splitC3_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream)
{ {
dim3 blockDim(32, 8); dim3 blockDim(32, 8);
dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y)); dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));
splitC3_<T><<<gridDim, blockDim, 0, stream>>>( splitC3_<T><<<gridDim, blockDim, 0, stream>>>(
@ -462,12 +464,12 @@ namespace cv { namespace gpu { namespace split_merge {
if (stream == 0) if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize()); cudaSafeCall(cudaDeviceSynchronize());
} }
template <typename T> template <typename T>
static void splitC4_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream) static void splitC4_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream)
{ {
dim3 blockDim(32, 8); dim3 blockDim(32, 8);
dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y)); dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));
splitC4_<T><<<gridDim, blockDim, 0, stream>>>( splitC4_<T><<<gridDim, blockDim, 0, stream>>>(
@ -480,13 +482,11 @@ namespace cv { namespace gpu { namespace split_merge {
if (stream == 0) if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize()); cudaSafeCall(cudaDeviceSynchronize());
} }
extern "C" void split_caller(const DevMem2Db& src, DevMem2Db* dst, void split_caller(const DevMem2Db& src, DevMem2Db* dst, int num_channels, size_t elem_size1, const cudaStream_t& stream)
int num_channels, size_t elem_size1, {
const cudaStream_t& stream)
{
static SplitFunction split_func_tbl[] = static SplitFunction split_func_tbl[] =
{ {
splitC2_<char>, splitC2_<short>, splitC2_<int>, 0, splitC2_<double>, splitC2_<char>, splitC2_<short>, splitC2_<int>, 0, splitC2_<double>,
@ -501,6 +501,8 @@ namespace cv { namespace gpu { namespace split_merge {
cv::gpu::error("Unsupported channel count or data type", __FILE__, __LINE__); cv::gpu::error("Unsupported channel count or data type", __FILE__, __LINE__);
split_func(src, dst, stream); split_func(src, dst, stream);
} }
}}} // namespace cv::gpu::split_merge } // namespace split_merge
END_OPENCV_DEVICE_NAMESPACE
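Per row, mergeCn_ and splitCn_ are plain interleaving and de-interleaving of channel planes; the GPU versions add the pitched-pointer arithmetic (base + y * step) and one thread per pixel. A self-contained CPU equivalent of the 2-channel case, for reference only:

#include <vector>

// Interleave two single-channel planes into one 2-channel image (CPU analogue of mergeC2_).
template <typename T>
void mergeTwoPlanes(const std::vector<T>& p0, const std::vector<T>& p1,
                    std::vector<T>& dst, int rows, int cols)
{
    dst.resize(static_cast<size_t>(rows) * cols * 2);
    for (int y = 0; y < rows; ++y)
        for (int x = 0; x < cols; ++x)
        {
            dst[(y * cols + x) * 2 + 0] = p0[y * cols + x];
            dst[(y * cols + x) * 2 + 1] = p1[y * cols + x];
        }
}
// splitC2_ is the inverse: read each interleaved pair and scatter it into two planes.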
@ -40,23 +40,18 @@
// //
//M*/ //M*/
//#include "internal_shared.hpp" #include "internal_shared.hpp"
#include "opencv2/gpu/devmem2d.hpp"
#include "safe_call.hpp"
static inline int divUp(int total, int grain) { return (total + grain - 1) / grain; }
BEGIN_OPENCV_DEVICE_NAMESPACE
using namespace cv::gpu; namespace stereobm {
////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////// Streeo BM //////////////////////////////////////////////// /////////////////////////////////////// Stereo BM ////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////
#define ROWSperTHREAD 21 // the number of rows a thread will process #define ROWSperTHREAD 21 // the number of rows a thread will process
namespace cv { namespace gpu { namespace bm
{
#define BLOCK_W 128 // the thread block width (464) #define BLOCK_W 128 // the thread block width (464)
#define N_DISPARITIES 8 #define N_DISPARITIES 8
@ -117,7 +112,7 @@ __device__ uint2 MinSSD(volatile unsigned int *col_ssd_cache, volatile unsigned
__syncthreads(); __syncthreads();
ssd[7] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 7 * (BLOCK_W + 2 * RADIUS)); ssd[7] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 7 * (BLOCK_W + 2 * RADIUS));
int mssd = min(min(min(ssd[0], ssd[1]), min(ssd[4], ssd[5])), min(min(ssd[2], ssd[3]), min(ssd[6], ssd[7]))); int mssd = ::min(::min(::min(ssd[0], ssd[1]), ::min(ssd[4], ssd[5])), ::min(::min(ssd[2], ssd[3]), ::min(ssd[6], ssd[7])));
int bestIdx = 0; int bestIdx = 0;
for (int i = 0; i < N_DISPARITIES; i++) for (int i = 0; i < N_DISPARITIES; i++)
@ -252,7 +247,7 @@ __global__ void stereoKernel(unsigned char *left, unsigned char *right, size_t i
for(uint *ptr = minSSDImage; ptr != minSSDImage_end; ptr += minssd_step ) for(uint *ptr = minSSDImage; ptr != minSSDImage_end; ptr += minssd_step )
*ptr = 0xFFFFFFFF; *ptr = 0xFFFFFFFF;
}*/ }*/
int end_row = min(ROWSperTHREAD, cheight - Y - RADIUS); int end_row = ::min(ROWSperTHREAD, cheight - Y - RADIUS);
int y_tex; int y_tex;
int x_tex = X - RADIUS; int x_tex = X - RADIUS;
@ -346,7 +341,7 @@ const static kernel_caller_t callers[] =
}; };
const int calles_num = sizeof(callers)/sizeof(callers[0]); const int calles_num = sizeof(callers)/sizeof(callers[0]);
extern "C" void stereoBM_GPU(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& disp, int maxdisp, int winsz, const DevMem2D_<unsigned int>& minSSD_buf, cudaStream_t& stream) void stereoBM_GPU(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& disp, int maxdisp, int winsz, const DevMem2D_<unsigned int>& minSSD_buf, cudaStream_t& stream)
{ {
int winsz2 = winsz >> 1; int winsz2 = winsz >> 1;
@ -375,7 +370,7 @@ extern "C" void stereoBM_GPU(const DevMem2Db& left, const DevMem2Db& right, cons
texture<unsigned char, 2, cudaReadModeElementType> texForSobel; texture<unsigned char, 2, cudaReadModeElementType> texForSobel;
extern "C" __global__ void prefilter_kernel(DevMem2Db output, int prefilterCap) __global__ void prefilter_kernel(DevMem2Db output, int prefilterCap)
{ {
int x = blockDim.x * blockIdx.x + threadIdx.x; int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y; int y = blockDim.y * blockIdx.y + threadIdx.y;
@ -387,12 +382,12 @@ extern "C" __global__ void prefilter_kernel(DevMem2Db output, int prefilterCap)
(int)tex2D(texForSobel, x - 1, y + 1) * (-1) + (int)tex2D(texForSobel, x + 1, y + 1) * (1); (int)tex2D(texForSobel, x - 1, y + 1) * (-1) + (int)tex2D(texForSobel, x + 1, y + 1) * (1);
conv = min(min(max(-prefilterCap, conv), prefilterCap) + prefilterCap, 255); conv = ::min(::min(::max(-prefilterCap, conv), prefilterCap) + prefilterCap, 255);
output.ptr(y)[x] = conv & 0xFF; output.ptr(y)[x] = conv & 0xFF;
} }
} }
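The prefilter maps the signed x-Sobel response into a small non-negative range: the response is clamped to [-prefilterCap, prefilterCap], shifted by +prefilterCap into [0, 2*prefilterCap], and bounded by 255. A quick host-side check of that mapping (prefilterCap = 31 is only an example value):

#include <algorithm>
#include <cstdio>

static int prefilterValue(int sobelResponse, int prefilterCap)
{
    // clamp to [-cap, cap], shift to [0, 2*cap], never exceed 255
    int v = std::min(std::max(-prefilterCap, sobelResponse), prefilterCap) + prefilterCap;
    return std::min(v, 255);
}

int main()
{
    std::printf("%d %d %d\n",
                prefilterValue(-100, 31),   // -> 0
                prefilterValue(  10, 31),   // -> 41
                prefilterValue( 200, 31));  // -> 62
    return 0;
}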
extern "C" void prefilter_xsobel(const DevMem2Db& input, const DevMem2Db& output, int prefilterCap, cudaStream_t & stream) void prefilter_xsobel(const DevMem2Db& input, const DevMem2Db& output, int prefilterCap, cudaStream_t & stream)
{ {
cudaChannelFormatDesc desc = cudaCreateChannelDesc<unsigned char>(); cudaChannelFormatDesc desc = cudaCreateChannelDesc<unsigned char>();
cudaSafeCall( cudaBindTexture2D( 0, texForSobel, input.data, desc, input.cols, input.rows, input.step ) ); cudaSafeCall( cudaBindTexture2D( 0, texForSobel, input.data, desc, input.cols, input.rows, input.step ) );
@ -451,7 +446,7 @@ __device__ float CalcSums(float *cols, float *cols_cache, int winsz)
#define RpT (2 * ROWSperTHREAD) // got experimentally #define RpT (2 * ROWSperTHREAD) // got experimentally
extern "C" __global__ void textureness_kernel(DevMem2Db disp, int winsz, float threshold) __global__ void textureness_kernel(DevMem2Db disp, int winsz, float threshold)
{ {
int winsz2 = winsz/2; int winsz2 = winsz/2;
int n_dirty_pixels = (winsz2) * 2; int n_dirty_pixels = (winsz2) * 2;
@ -462,7 +457,7 @@ extern "C" __global__ void textureness_kernel(DevMem2Db disp, int winsz, float t
int x = blockIdx.x * blockDim.x + threadIdx.x; int x = blockIdx.x * blockDim.x + threadIdx.x;
int beg_row = blockIdx.y * RpT; int beg_row = blockIdx.y * RpT;
int end_row = min(beg_row + RpT, disp.rows); int end_row = ::min(beg_row + RpT, disp.rows);
if (x < disp.cols) if (x < disp.cols)
{ {
@ -510,7 +505,7 @@ extern "C" __global__ void textureness_kernel(DevMem2Db disp, int winsz, float t
} }
} }
extern "C" void postfilter_textureness(const DevMem2Db& input, int winsz, float avgTexturenessThreshold, const DevMem2Db& disp, cudaStream_t & stream) void postfilter_textureness(const DevMem2Db& input, int winsz, float avgTexturenessThreshold, const DevMem2Db& disp, cudaStream_t & stream)
{ {
avgTexturenessThreshold *= winsz * winsz; avgTexturenessThreshold *= winsz * winsz;
@ -537,4 +532,6 @@ extern "C" void postfilter_textureness(const DevMem2Db& input, int winsz, float
cudaSafeCall( cudaUnbindTexture (texForTF) ); cudaSafeCall( cudaUnbindTexture (texForTF) );
} }
}}} } // namespace stereobm
END_OPENCV_DEVICE_NAMESPACE
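At its core the block-matching kernel picks, for every left-image pixel, the disparity whose window SSD against the right image is smallest; the column-SSD cache, MinSSD() and the 8-disparity unrolling above are optimizations of exactly that search. A naive, self-contained CPU version of the per-pixel search, for orientation only:

#include <climits>
#include <vector>

// Brute-force SSD block matching for one pixel; the CUDA kernel computes the same minimum
// but shares column sums across a thread block and reuses them over ROWSperTHREAD rows.
static int bestDisparity(const std::vector<unsigned char>& left,
                         const std::vector<unsigned char>& right,
                         int cols, int x, int y, int maxdisp, int radius)
{
    int bestD = 0;
    long long bestSSD = LLONG_MAX;
    for (int d = 0; d < maxdisp; ++d)
    {
        long long ssd = 0;
        for (int dy = -radius; dy <= radius; ++dy)
            for (int dx = -radius; dx <= radius; ++dx)
            {
                const int l = left [(y + dy) * cols + (x + dx)];
                const int r = right[(y + dy) * cols + (x + dx - d)];
                ssd += (l - r) * (l - r);
            }
        if (ssd < bestSSD) { bestSSD = ssd; bestD = d; }
    }
    return bestD;   // caller must keep x - d - radius inside the image
}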
@ -44,49 +44,48 @@
#include "opencv2/gpu/device/saturate_cast.hpp" #include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/limits.hpp" #include "opencv2/gpu/device/limits.hpp"
using namespace cv::gpu; BEGIN_OPENCV_DEVICE_NAMESPACE
using namespace cv::gpu::device;
namespace stereobp {
namespace cv { namespace gpu { namespace bp
{
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
/////////////////////// load constants //////////////////////// /////////////////////// load constants ////////////////////////
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
__constant__ int cndisp; __constant__ int cndisp;
__constant__ float cmax_data_term; __constant__ float cmax_data_term;
__constant__ float cdata_weight; __constant__ float cdata_weight;
__constant__ float cmax_disc_term; __constant__ float cmax_disc_term;
__constant__ float cdisc_single_jump; __constant__ float cdisc_single_jump;
void load_constants(int ndisp, float max_data_term, float data_weight, float max_disc_term, float disc_single_jump) void load_constants(int ndisp, float max_data_term, float data_weight, float max_disc_term, float disc_single_jump)
{ {
cudaSafeCall( cudaMemcpyToSymbol(cndisp, &ndisp, sizeof(int )) ); cudaSafeCall( cudaMemcpyToSymbol(cndisp, &ndisp, sizeof(int )) );
cudaSafeCall( cudaMemcpyToSymbol(cmax_data_term, &max_data_term, sizeof(float)) ); cudaSafeCall( cudaMemcpyToSymbol(cmax_data_term, &max_data_term, sizeof(float)) );
cudaSafeCall( cudaMemcpyToSymbol(cdata_weight, &data_weight, sizeof(float)) ); cudaSafeCall( cudaMemcpyToSymbol(cdata_weight, &data_weight, sizeof(float)) );
cudaSafeCall( cudaMemcpyToSymbol(cmax_disc_term, &max_disc_term, sizeof(float)) ); cudaSafeCall( cudaMemcpyToSymbol(cmax_disc_term, &max_disc_term, sizeof(float)) );
cudaSafeCall( cudaMemcpyToSymbol(cdisc_single_jump, &disc_single_jump, sizeof(float)) ); cudaSafeCall( cudaMemcpyToSymbol(cdisc_single_jump, &disc_single_jump, sizeof(float)) );
} }
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
////////////////////////// comp data ////////////////////////// ////////////////////////// comp data //////////////////////////
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
template <int cn> struct PixDiff; template <int cn> struct PixDiff;
template <> struct PixDiff<1> template <> struct PixDiff<1>
{ {
__device__ __forceinline__ PixDiff(const uchar* ls) __device__ __forceinline__ PixDiff(const uchar* ls)
{ {
l = *ls; l = *ls;
} }
__device__ __forceinline__ float operator()(const uchar* rs) const __device__ __forceinline__ float operator()(const uchar* rs) const
{ {
return abs((int)l - *rs); return ::abs((int)l - *rs);
} }
uchar l; uchar l;
}; };
template <> struct PixDiff<3> template <> struct PixDiff<3>
{ {
__device__ __forceinline__ PixDiff(const uchar* ls) __device__ __forceinline__ PixDiff(const uchar* ls)
{ {
l = *((uchar3*)ls); l = *((uchar3*)ls);
@ -97,16 +96,16 @@ namespace cv { namespace gpu { namespace bp
const float tg = 0.587f; const float tg = 0.587f;
const float tb = 0.114f; const float tb = 0.114f;
float val = tb * abs((int)l.x - rs[0]); float val = tb * ::abs((int)l.x - rs[0]);
val += tg * abs((int)l.y - rs[1]); val += tg * ::abs((int)l.y - rs[1]);
val += tr * abs((int)l.z - rs[2]); val += tr * ::abs((int)l.z - rs[2]);
return val; return val;
} }
uchar3 l; uchar3 l;
}; };
template <> struct PixDiff<4> template <> struct PixDiff<4>
{ {
__device__ __forceinline__ PixDiff(const uchar* ls) __device__ __forceinline__ PixDiff(const uchar* ls)
{ {
l = *((uchar4*)ls); l = *((uchar4*)ls);
@ -119,18 +118,18 @@ namespace cv { namespace gpu { namespace bp
uchar4 r = *((uchar4*)rs); uchar4 r = *((uchar4*)rs);
float val = tb * abs((int)l.x - r.x); float val = tb * ::abs((int)l.x - r.x);
val += tg * abs((int)l.y - r.y); val += tg * ::abs((int)l.y - r.y);
val += tr * abs((int)l.z - r.z); val += tr * ::abs((int)l.z - r.z);
return val; return val;
} }
uchar4 l; uchar4 l;
}; };
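All three PixDiff specializations compute the per-pixel matching cost as an absolute difference; the 3- and 4-channel versions weight the channels with the usual luma coefficients (0.299, 0.587, 0.114), so a colour difference costs about as much as the equivalent grey-level difference. A host-side equivalent of the 3-channel case:

#include <cmath>

// Luma-weighted absolute colour difference, matching PixDiff<3>
// (BGR channel order: p[0] = blue, p[1] = green, p[2] = red).
static float pixDiff3(const unsigned char l[3], const unsigned char r[3])
{
    const float tr = 0.299f, tg = 0.587f, tb = 0.114f;
    return tb * std::abs(int(l[0]) - int(r[0]))
         + tg * std::abs(int(l[1]) - int(r[1]))
         + tr * std::abs(int(l[2]) - int(r[2]));
}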
template <int cn, typename D> template <int cn, typename D>
__global__ void comp_data(const DevMem2Db left, const PtrStepb right, PtrElemStep_<D> data) __global__ void comp_data(const DevMem2Db left, const PtrStepb right, PtrElemStep_<D> data)
{ {
const int x = blockIdx.x * blockDim.x + threadIdx.x; const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y; const int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -157,13 +156,13 @@ namespace cv { namespace gpu { namespace bp
} }
} }
} }
} }
template<typename T, typename D> template<typename T, typename D>
void comp_data_gpu(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream); void comp_data_gpu(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream);
template <> void comp_data_gpu<uchar, short>(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream) template <> void comp_data_gpu<uchar, short>(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream)
{ {
dim3 threads(32, 8, 1); dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1); dim3 grid(1, 1, 1);
@ -175,9 +174,9 @@ namespace cv { namespace gpu { namespace bp
if (stream == 0) if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaDeviceSynchronize() );
} }
template <> void comp_data_gpu<uchar, float>(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream) template <> void comp_data_gpu<uchar, float>(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream)
{ {
dim3 threads(32, 8, 1); dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1); dim3 grid(1, 1, 1);
@ -189,10 +188,10 @@ namespace cv { namespace gpu { namespace bp
if (stream == 0) if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaDeviceSynchronize() );
} }
template <> void comp_data_gpu<uchar3, short>(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream) template <> void comp_data_gpu<uchar3, short>(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream)
{ {
dim3 threads(32, 8, 1); dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1); dim3 grid(1, 1, 1);
@ -204,9 +203,9 @@ namespace cv { namespace gpu { namespace bp
if (stream == 0) if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaDeviceSynchronize() );
} }
template <> void comp_data_gpu<uchar3, float>(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream) template <> void comp_data_gpu<uchar3, float>(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream)
{ {
dim3 threads(32, 8, 1); dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1); dim3 grid(1, 1, 1);
@ -218,10 +217,10 @@ namespace cv { namespace gpu { namespace bp
if (stream == 0) if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaDeviceSynchronize() );
} }
template <> void comp_data_gpu<uchar4, short>(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream) template <> void comp_data_gpu<uchar4, short>(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream)
{ {
dim3 threads(32, 8, 1); dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1); dim3 grid(1, 1, 1);
@ -233,9 +232,9 @@ namespace cv { namespace gpu { namespace bp
if (stream == 0) if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaDeviceSynchronize() );
} }
template <> void comp_data_gpu<uchar4, float>(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream) template <> void comp_data_gpu<uchar4, float>(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream)
{ {
dim3 threads(32, 8, 1); dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1); dim3 grid(1, 1, 1);
@ -247,15 +246,15 @@ namespace cv { namespace gpu { namespace bp
if (stream == 0) if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaDeviceSynchronize() );
} }
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
//////////////////////// data step down /////////////////////// //////////////////////// data step down ///////////////////////
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
template <typename T> template <typename T>
__global__ void data_step_down(int dst_cols, int dst_rows, int src_rows, const PtrStep<T> src, PtrStep<T> dst) __global__ void data_step_down(int dst_cols, int dst_rows, int src_rows, const PtrStep<T> src, PtrStep<T> dst)
{ {
const int x = blockIdx.x * blockDim.x + threadIdx.x; const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y; const int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -271,11 +270,11 @@ namespace cv { namespace gpu { namespace bp
dst.ptr(d * dst_rows + y)[x] = saturate_cast<T>(dst_reg); dst.ptr(d * dst_rows + y)[x] = saturate_cast<T>(dst_reg);
} }
} }
} }
template<typename T> template<typename T>
void data_step_down_gpu(int dst_cols, int dst_rows, int src_rows, const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream) void data_step_down_gpu(int dst_cols, int dst_rows, int src_rows, const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream)
{ {
dim3 threads(32, 8, 1); dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1); dim3 grid(1, 1, 1);
@ -287,18 +286,18 @@ namespace cv { namespace gpu { namespace bp
if (stream == 0) if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaDeviceSynchronize() );
} }
template void data_step_down_gpu<short>(int dst_cols, int dst_rows, int src_rows, const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream); template void data_step_down_gpu<short>(int dst_cols, int dst_rows, int src_rows, const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream);
template void data_step_down_gpu<float>(int dst_cols, int dst_rows, int src_rows, const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream); template void data_step_down_gpu<float>(int dst_cols, int dst_rows, int src_rows, const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream);
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
/////////////////// level up messages //////////////////////// /////////////////// level up messages ////////////////////////
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
template <typename T> template <typename T>
__global__ void level_up_message(int dst_cols, int dst_rows, int src_rows, const PtrElemStep_<T> src, PtrElemStep_<T> dst) __global__ void level_up_message(int dst_cols, int dst_rows, int src_rows, const PtrElemStep_<T> src, PtrElemStep_<T> dst)
{ {
const int x = blockIdx.x * blockDim.x + threadIdx.x; const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y; const int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -313,11 +312,11 @@ namespace cv { namespace gpu { namespace bp
for (int d = 0; d < cndisp; ++d) for (int d = 0; d < cndisp; ++d)
dstr[d * dst_disp_step] = srcr[d * src_disp_step]; dstr[d * dst_disp_step] = srcr[d * src_disp_step];
} }
} }
template <typename T> template <typename T>
void level_up_messages_gpu(int dst_idx, int dst_cols, int dst_rows, int src_rows, DevMem2Db* mus, DevMem2Db* mds, DevMem2Db* mls, DevMem2Db* mrs, cudaStream_t stream) void level_up_messages_gpu(int dst_idx, int dst_cols, int dst_rows, int src_rows, DevMem2Db* mus, DevMem2Db* mds, DevMem2Db* mls, DevMem2Db* mrs, cudaStream_t stream)
{ {
dim3 threads(32, 8, 1); dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1); dim3 grid(1, 1, 1);
@ -328,27 +327,30 @@ namespace cv { namespace gpu { namespace bp
level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (DevMem2D_<T>)mus[src_idx], (DevMem2D_<T>)mus[dst_idx]); level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (DevMem2D_<T>)mus[src_idx], (DevMem2D_<T>)mus[dst_idx]);
cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaGetLastError() );
level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (DevMem2D_<T>)mds[src_idx], (DevMem2D_<T>)mds[dst_idx]); level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (DevMem2D_<T>)mds[src_idx], (DevMem2D_<T>)mds[dst_idx]);
cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaGetLastError() );
level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (DevMem2D_<T>)mls[src_idx], (DevMem2D_<T>)mls[dst_idx]); level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (DevMem2D_<T>)mls[src_idx], (DevMem2D_<T>)mls[dst_idx]);
cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaGetLastError() );
level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (DevMem2D_<T>)mrs[src_idx], (DevMem2D_<T>)mrs[dst_idx]); level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (DevMem2D_<T>)mrs[src_idx], (DevMem2D_<T>)mrs[dst_idx]);
cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaGetLastError() );
if (stream == 0) if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaDeviceSynchronize() );
} }
template void level_up_messages_gpu<short>(int dst_idx, int dst_cols, int dst_rows, int src_rows, DevMem2Db* mus, DevMem2Db* mds, DevMem2Db* mls, DevMem2Db* mrs, cudaStream_t stream); template void level_up_messages_gpu<short>(int dst_idx, int dst_cols, int dst_rows, int src_rows, DevMem2Db* mus, DevMem2Db* mds, DevMem2Db* mls, DevMem2Db* mrs, cudaStream_t stream);
template void level_up_messages_gpu<float>(int dst_idx, int dst_cols, int dst_rows, int src_rows, DevMem2Db* mus, DevMem2Db* mds, DevMem2Db* mls, DevMem2Db* mrs, cudaStream_t stream); template void level_up_messages_gpu<float>(int dst_idx, int dst_cols, int dst_rows, int src_rows, DevMem2Db* mus, DevMem2Db* mds, DevMem2Db* mls, DevMem2Db* mrs, cudaStream_t stream);
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
//////////////////// calc all iterations ///////////////////// //////////////////// calc all iterations /////////////////////
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
template <typename T> template <typename T>
__device__ void calc_min_linear_penalty(T* dst, size_t step) __device__ void calc_min_linear_penalty(T* dst, size_t step)
{ {
float prev = dst[0]; float prev = dst[0];
float cur; float cur;
for (int disp = 1; disp < cndisp; ++disp) for (int disp = 1; disp < cndisp; ++disp)
@ -375,12 +377,12 @@ namespace cv { namespace gpu { namespace bp
} }
prev = cur; prev = cur;
} }
} }
template <typename T> template <typename T>
__device__ void message(const T* msg1, const T* msg2, const T* msg3, const T* data, T* dst, size_t msg_disp_step, size_t data_disp_step) __device__ void message(const T* msg1, const T* msg2, const T* msg3, const T* data, T* dst, size_t msg_disp_step, size_t data_disp_step)
{ {
float minimum = numeric_limits<float>::max(); float minimum = device::numeric_limits<float>::max();
for(int i = 0; i < cndisp; ++i) for(int i = 0; i < cndisp; ++i)
{ {
@ -414,11 +416,11 @@ namespace cv { namespace gpu { namespace bp
for(int i = 0; i < cndisp; ++i) for(int i = 0; i < cndisp; ++i)
dst[msg_disp_step * i] -= sum; dst[msg_disp_step * i] -= sum;
} }
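Taken together, calc_min_linear_penalty and message implement the standard min-sum belief-propagation update with a truncated linear smoothness term; in our notation (not taken from the source), for each disparity d

$$ m_{p\to q}(d) \;=\; \min_{d'}\Big( D_p(d') \;+\; \sum_{s\in\mathcal{N}(p)\setminus\{q\}} m_{s\to p}(d') \;+\; \min\big(\mathrm{cdisc\_single\_jump}\cdot|d-d'|,\; \mathrm{cmax\_disc\_term}\big) \Big), $$

followed by the zero-mean normalization $m_{p\to q}(d) \leftarrow m_{p\to q}(d) - \tfrac{1}{\mathrm{cndisp}}\sum_{d''} m_{p\to q}(d'')$, which is what the final subtraction of sum above performs.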
template <typename T> template <typename T>
__global__ void one_iteration(int t, PtrElemStep_<T> u, T* d, T* l, T* r, const PtrElemStep_<T> data, int cols, int rows) __global__ void one_iteration(int t, PtrElemStep_<T> u, T* d, T* l, T* r, const PtrElemStep_<T> data, int cols, int rows)
{ {
const int y = blockIdx.y * blockDim.y + threadIdx.y; const int y = blockIdx.y * blockDim.y + threadIdx.y;
const int x = ((blockIdx.x * blockDim.x + threadIdx.x) << 1) + ((y + t) & 1); const int x = ((blockIdx.x * blockDim.x + threadIdx.x) << 1) + ((y + t) & 1);
@ -438,12 +440,12 @@ namespace cv { namespace gpu { namespace bp
message(us + u.step, ds - u.step, rs - 1, dt, rs, msg_disp_step, data_disp_step); message(us + u.step, ds - u.step, rs - 1, dt, rs, msg_disp_step, data_disp_step);
message(us + u.step, ds - u.step, ls + 1, dt, ls, msg_disp_step, data_disp_step); message(us + u.step, ds - u.step, ls + 1, dt, ls, msg_disp_step, data_disp_step);
} }
} }
template <typename T> template <typename T>
void calc_all_iterations_gpu(int cols, int rows, int iters, const DevMem2Db& u, const DevMem2Db& d, void calc_all_iterations_gpu(int cols, int rows, int iters, const DevMem2Db& u, const DevMem2Db& d,
const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data, cudaStream_t stream) const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data, cudaStream_t stream)
{ {
dim3 threads(32, 8, 1); dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1); dim3 grid(1, 1, 1);
@ -458,19 +460,19 @@ namespace cv { namespace gpu { namespace bp
if (stream == 0) if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaDeviceSynchronize() );
} }
} }
template void calc_all_iterations_gpu<short>(int cols, int rows, int iters, const DevMem2Db& u, const DevMem2Db& d, const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data, cudaStream_t stream); template void calc_all_iterations_gpu<short>(int cols, int rows, int iters, const DevMem2Db& u, const DevMem2Db& d, const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data, cudaStream_t stream);
template void calc_all_iterations_gpu<float>(int cols, int rows, int iters, const DevMem2Db& u, const DevMem2Db& d, const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data, cudaStream_t stream); template void calc_all_iterations_gpu<float>(int cols, int rows, int iters, const DevMem2Db& u, const DevMem2Db& d, const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data, cudaStream_t stream);
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
/////////////////////////// output //////////////////////////// /////////////////////////// output ////////////////////////////
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
template <typename T> template <typename T>
__global__ void output(const PtrElemStep_<T> u, const T* d, const T* l, const T* r, const T* data, __global__ void output(const PtrElemStep_<T> u, const T* d, const T* l, const T* r, const T* data,
DevMem2D_<short> disp) DevMem2D_<short> disp)
{ {
const int x = blockIdx.x * blockDim.x + threadIdx.x; const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y; const int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -503,12 +505,12 @@ namespace cv { namespace gpu { namespace bp
disp.ptr(y)[x] = saturate_cast<short>(best); disp.ptr(y)[x] = saturate_cast<short>(best);
} }
} }
template <typename T> template <typename T>
void output_gpu(const DevMem2Db& u, const DevMem2Db& d, const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data, void output_gpu(const DevMem2Db& u, const DevMem2Db& d, const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data,
const DevMem2D_<short>& disp, cudaStream_t stream) const DevMem2D_<short>& disp, cudaStream_t stream)
{ {
dim3 threads(32, 8, 1); dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1); dim3 grid(1, 1, 1);
@ -520,8 +522,11 @@ namespace cv { namespace gpu { namespace bp
if (stream == 0) if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaDeviceSynchronize() );
} }
template void output_gpu<short>(const DevMem2Db& u, const DevMem2Db& d, const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data, const DevMem2D_<short>& disp, cudaStream_t stream); template void output_gpu<short>(const DevMem2Db& u, const DevMem2Db& d, const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data, const DevMem2D_<short>& disp, cudaStream_t stream);
template void output_gpu<float>(const DevMem2Db& u, const DevMem2Db& d, const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data, const DevMem2D_<short>& disp, cudaStream_t stream); template void output_gpu<float>(const DevMem2Db& u, const DevMem2Db& d, const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data, const DevMem2D_<short>& disp, cudaStream_t stream);
}}}
} // namespace stereobp
END_OPENCV_DEVICE_NAMESPACE
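A rough host-side sketch of how these entry points are usually chained per pyramid level (the buffer names below are placeholders; only the function signatures come from this file):

    data_step_down_gpu<float>(cols_coarse, rows_coarse, rows_fine, data_fine, data_coarse, stream);   // build the coarser-level data cost
    level_up_messages_gpu<float>(dst_idx, cols_fine, rows_fine, rows_coarse, us, ds, ls, rs, stream);  // seed the finer level from the coarser messages
    calc_all_iterations_gpu<float>(cols_fine, rows_fine, iters, u, d, l, r, data_fine, stream);        // run the BP sweeps at this level
    output_gpu<float>(u, d, l, r, data_fine, disp, stream);                                            // winner-take-all readout at the finest level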
View File
@ -44,40 +44,37 @@
#include "opencv2/gpu/device/saturate_cast.hpp" #include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/limits.hpp" #include "opencv2/gpu/device/limits.hpp"
using namespace cv::gpu;
using namespace cv::gpu::device;
namespace cv { namespace gpu { namespace csbp
{
BEGIN_OPENCV_DEVICE_NAMESPACE
namespace stereocsbp {
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
/////////////////////// load constants //////////////////////// /////////////////////// load constants ////////////////////////
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
__constant__ int cndisp; __constant__ int cndisp;
__constant__ float cmax_data_term; __constant__ float cmax_data_term;
__constant__ float cdata_weight; __constant__ float cdata_weight;
__constant__ float cmax_disc_term; __constant__ float cmax_disc_term;
__constant__ float cdisc_single_jump; __constant__ float cdisc_single_jump;
__constant__ int cth; __constant__ int cth;
__constant__ size_t cimg_step; __constant__ size_t cimg_step;
__constant__ size_t cmsg_step1; __constant__ size_t cmsg_step1;
__constant__ size_t cmsg_step2; __constant__ size_t cmsg_step2;
__constant__ size_t cdisp_step1; __constant__ size_t cdisp_step1;
__constant__ size_t cdisp_step2; __constant__ size_t cdisp_step2;
__constant__ uchar* cleft; __constant__ uchar* cleft;
__constant__ uchar* cright; __constant__ uchar* cright;
__constant__ uchar* ctemp; __constant__ uchar* ctemp;
void load_constants(int ndisp, float max_data_term, float data_weight, float max_disc_term, float disc_single_jump, int min_disp_th, void load_constants(int ndisp, float max_data_term, float data_weight, float max_disc_term, float disc_single_jump, int min_disp_th,
const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& temp) const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& temp)
{ {
cudaSafeCall( cudaMemcpyToSymbol(cndisp, &ndisp, sizeof(int)) ); cudaSafeCall( cudaMemcpyToSymbol(cndisp, &ndisp, sizeof(int)) );
cudaSafeCall( cudaMemcpyToSymbol(cmax_data_term, &max_data_term, sizeof(float)) ); cudaSafeCall( cudaMemcpyToSymbol(cmax_data_term, &max_data_term, sizeof(float)) );
@ -92,49 +89,49 @@ namespace cv { namespace gpu { namespace csbp
cudaSafeCall( cudaMemcpyToSymbol(cleft, &left.data, sizeof(left.data)) ); cudaSafeCall( cudaMemcpyToSymbol(cleft, &left.data, sizeof(left.data)) );
cudaSafeCall( cudaMemcpyToSymbol(cright, &right.data, sizeof(right.data)) ); cudaSafeCall( cudaMemcpyToSymbol(cright, &right.data, sizeof(right.data)) );
cudaSafeCall( cudaMemcpyToSymbol(ctemp, &temp.data, sizeof(temp.data)) ); cudaSafeCall( cudaMemcpyToSymbol(ctemp, &temp.data, sizeof(temp.data)) );
} }
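Note that the last three symbols hold device pointers rather than pixel data: cudaMemcpyToSymbol copies the pointer value kept on the host into __constant__ memory, and the kernels then read the images through cleft/cright/ctemp plus cimg_step instead of taking them as arguments. A minimal illustration (the variable name is ours):

    uchar* left_dev = left.data;  // DevMem2Db::data is already a device address
    cudaSafeCall( cudaMemcpyToSymbol(cleft, &left_dev, sizeof(left_dev)) );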
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
/////////////////////// init data cost //////////////////////// /////////////////////// init data cost ////////////////////////
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
template <int channels> struct DataCostPerPixel; template <int channels> struct DataCostPerPixel;
template <> struct DataCostPerPixel<1> template <> struct DataCostPerPixel<1>
{ {
static __device__ __forceinline__ float compute(const uchar* left, const uchar* right) static __device__ __forceinline__ float compute(const uchar* left, const uchar* right)
{ {
return fmin(cdata_weight * abs((int)*left - *right), cdata_weight * cmax_data_term); return fmin(cdata_weight * ::abs((int)*left - *right), cdata_weight * cmax_data_term);
} }
}; };
template <> struct DataCostPerPixel<3> template <> struct DataCostPerPixel<3>
{ {
static __device__ __forceinline__ float compute(const uchar* left, const uchar* right) static __device__ __forceinline__ float compute(const uchar* left, const uchar* right)
{ {
float tb = 0.114f * abs((int)left[0] - right[0]); float tb = 0.114f * ::abs((int)left[0] - right[0]);
float tg = 0.587f * abs((int)left[1] - right[1]); float tg = 0.587f * ::abs((int)left[1] - right[1]);
float tr = 0.299f * abs((int)left[2] - right[2]); float tr = 0.299f * ::abs((int)left[2] - right[2]);
return fmin(cdata_weight * (tr + tg + tb), cdata_weight * cmax_data_term); return fmin(cdata_weight * (tr + tg + tb), cdata_weight * cmax_data_term);
} }
}; };
template <> struct DataCostPerPixel<4> template <> struct DataCostPerPixel<4>
{ {
static __device__ __forceinline__ float compute(const uchar* left, const uchar* right) static __device__ __forceinline__ float compute(const uchar* left, const uchar* right)
{ {
uchar4 l = *((const uchar4*)left); uchar4 l = *((const uchar4*)left);
uchar4 r = *((const uchar4*)right); uchar4 r = *((const uchar4*)right);
float tb = 0.114f * abs((int)l.x - r.x); float tb = 0.114f * ::abs((int)l.x - r.x);
float tg = 0.587f * abs((int)l.y - r.y); float tg = 0.587f * ::abs((int)l.y - r.y);
float tr = 0.299f * abs((int)l.z - r.z); float tr = 0.299f * ::abs((int)l.z - r.z);
return fmin(cdata_weight * (tr + tg + tb), cdata_weight * cmax_data_term); return fmin(cdata_weight * (tr + tg + tb), cdata_weight * cmax_data_term);
} }
}; };
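The three specializations compute the same truncated, luminance-weighted absolute difference; for the 3- and 4-channel cases (notation ours) the per-pixel cost is

$$ \mathrm{cost}(d) \;=\; \mathrm{cdata\_weight}\cdot\min\big(0.299\,|\Delta R| + 0.587\,|\Delta G| + 0.114\,|\Delta B|,\; \mathrm{cmax\_data\_term}\big), $$

where the weights are the BT.601 luma coefficients; the 1-channel case degenerates to the plain absolute intensity difference.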
template <typename T> template <typename T>
__global__ void get_first_k_initial_global(T* data_cost_selected_, T *selected_disp_pyr, int h, int w, int nr_plane) __global__ void get_first_k_initial_global(T* data_cost_selected_, T *selected_disp_pyr, int h, int w, int nr_plane)
{ {
int x = blockIdx.x * blockDim.x + threadIdx.x; int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y; int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -146,7 +143,7 @@ namespace cv { namespace gpu { namespace csbp
for(int i = 0; i < nr_plane; i++) for(int i = 0; i < nr_plane; i++)
{ {
T minimum = numeric_limits<T>::max(); T minimum = device::numeric_limits<T>::max();
int id = 0; int id = 0;
for(int d = 0; d < cndisp; d++) for(int d = 0; d < cndisp; d++)
{ {
@ -163,12 +160,12 @@ namespace cv { namespace gpu { namespace csbp
data_cost [id * cdisp_step1] = numeric_limits<T>::max(); data_cost [id * cdisp_step1] = numeric_limits<T>::max();
} }
} }
} }
template <typename T> template <typename T>
__global__ void get_first_k_initial_local(T* data_cost_selected_, T* selected_disp_pyr, int h, int w, int nr_plane) __global__ void get_first_k_initial_local(T* data_cost_selected_, T* selected_disp_pyr, int h, int w, int nr_plane)
{ {
int x = blockIdx.x * blockDim.x + threadIdx.x; int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y; int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -220,11 +217,11 @@ namespace cv { namespace gpu { namespace csbp
data_cost[id * cdisp_step1] = numeric_limits<T>::max(); data_cost[id * cdisp_step1] = numeric_limits<T>::max();
} }
} }
} }
template <typename T, int channels> template <typename T, int channels>
__global__ void init_data_cost(int h, int w, int level) __global__ void init_data_cost(int h, int w, int level)
{ {
int x = blockIdx.x * blockDim.x + threadIdx.x; int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y; int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -260,11 +257,11 @@ namespace cv { namespace gpu { namespace csbp
data_cost[cdisp_step1 * d] = saturate_cast<T>(val); data_cost[cdisp_step1 * d] = saturate_cast<T>(val);
} }
} }
} }
template <typename T, int winsz, int channels> template <typename T, int winsz, int channels>
__global__ void init_data_cost_reduce(int level, int rows, int cols, int h) __global__ void init_data_cost_reduce(int level, int rows, int cols, int h)
{ {
int x_out = blockIdx.x; int x_out = blockIdx.x;
int y_out = blockIdx.y % h; int y_out = blockIdx.y % h;
int d = (blockIdx.y / h) * blockDim.z + threadIdx.z; int d = (blockIdx.y / h) * blockDim.z + threadIdx.z;
@ -276,7 +273,7 @@ namespace cv { namespace gpu { namespace csbp
int x0 = x_out << level; int x0 = x_out << level;
int y0 = y_out << level; int y0 = y_out << level;
int len = min(y0 + winsz, rows) - y0; int len = ::min(y0 + winsz, rows) - y0;
float val = 0.0f; float val = 0.0f;
if (x0 + tid < cols) if (x0 + tid < cols)
@ -322,12 +319,12 @@ namespace cv { namespace gpu { namespace csbp
if (tid == 0) if (tid == 0)
data_cost[cdisp_step1 * d] = saturate_cast<T>(dline[0]); data_cost[cdisp_step1 * d] = saturate_cast<T>(dline[0]);
} }
} }
template <typename T> template <typename T>
void init_data_cost_caller_(int /*rows*/, int /*cols*/, int h, int w, int level, int /*ndisp*/, int channels, cudaStream_t stream) void init_data_cost_caller_(int /*rows*/, int /*cols*/, int h, int w, int level, int /*ndisp*/, int channels, cudaStream_t stream)
{ {
dim3 threads(32, 8, 1); dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1); dim3 grid(1, 1, 1);
@ -341,11 +338,11 @@ namespace cv { namespace gpu { namespace csbp
case 4: init_data_cost<T, 4><<<grid, threads, 0, stream>>>(h, w, level); break; case 4: init_data_cost<T, 4><<<grid, threads, 0, stream>>>(h, w, level); break;
default: cv::gpu::error("Unsupported channels count", __FILE__, __LINE__); default: cv::gpu::error("Unsupported channels count", __FILE__, __LINE__);
} }
} }
template <typename T, int winsz> template <typename T, int winsz>
void init_data_cost_reduce_caller_(int rows, int cols, int h, int w, int level, int ndisp, int channels, cudaStream_t stream) void init_data_cost_reduce_caller_(int rows, int cols, int h, int w, int level, int ndisp, int channels, cudaStream_t stream)
{ {
const int threadsNum = 256; const int threadsNum = 256;
const size_t smem_size = threadsNum * sizeof(float); const size_t smem_size = threadsNum * sizeof(float);
@ -360,12 +357,12 @@ namespace cv { namespace gpu { namespace csbp
case 4: init_data_cost_reduce<T, winsz, 4><<<grid, threads, smem_size, stream>>>(level, rows, cols, h); break; case 4: init_data_cost_reduce<T, winsz, 4><<<grid, threads, smem_size, stream>>>(level, rows, cols, h); break;
default: cv::gpu::error("Unsupported channels count", __FILE__, __LINE__); default: cv::gpu::error("Unsupported channels count", __FILE__, __LINE__);
} }
} }
template<class T> template<class T>
void init_data_cost(int rows, int cols, T* disp_selected_pyr, T* data_cost_selected, size_t msg_step, void init_data_cost(int rows, int cols, T* disp_selected_pyr, T* data_cost_selected, size_t msg_step,
int h, int w, int level, int nr_plane, int ndisp, int channels, bool use_local_init_data_cost, cudaStream_t stream) int h, int w, int level, int nr_plane, int ndisp, int channels, bool use_local_init_data_cost, cudaStream_t stream)
{ {
typedef void (*InitDataCostCaller)(int cols, int rows, int w, int h, int level, int ndisp, int channels, cudaStream_t stream); typedef void (*InitDataCostCaller)(int cols, int rows, int w, int h, int level, int ndisp, int channels, cudaStream_t stream);
@ -401,21 +398,21 @@ namespace cv { namespace gpu { namespace csbp
if (stream == 0) if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaDeviceSynchronize() );
} }
template void init_data_cost(int rows, int cols, short* disp_selected_pyr, short* data_cost_selected, size_t msg_step, template void init_data_cost(int rows, int cols, short* disp_selected_pyr, short* data_cost_selected, size_t msg_step,
int h, int w, int level, int nr_plane, int ndisp, int channels, bool use_local_init_data_cost, cudaStream_t stream); int h, int w, int level, int nr_plane, int ndisp, int channels, bool use_local_init_data_cost, cudaStream_t stream);
template void init_data_cost(int rows, int cols, float* disp_selected_pyr, float* data_cost_selected, size_t msg_step, template void init_data_cost(int rows, int cols, float* disp_selected_pyr, float* data_cost_selected, size_t msg_step,
int h, int w, int level, int nr_plane, int ndisp, int channels, bool use_local_init_data_cost, cudaStream_t stream); int h, int w, int level, int nr_plane, int ndisp, int channels, bool use_local_init_data_cost, cudaStream_t stream);
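The InitDataCostCaller typedef suggests the collapsed hunk dispatches through a per-level table, switching to the shared-memory reduction kernel once the (1 << level) aggregation window becomes large. A hypothetical sketch of such a table (the exact thresholds are an assumption, not visible here):

    static const InitDataCostCaller callers[] =
    {
        init_data_cost_caller_<T>, init_data_cost_caller_<T>, init_data_cost_reduce_caller_<T, 4>,
        init_data_cost_reduce_caller_<T, 8>, init_data_cost_reduce_caller_<T, 16>, init_data_cost_reduce_caller_<T, 32>,
        init_data_cost_reduce_caller_<T, 64>, init_data_cost_reduce_caller_<T, 128>, init_data_cost_reduce_caller_<T, 256>
    };
    callers[level](rows, cols, h, w, level, ndisp, channels, stream);  // hypothetical dispatch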
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
////////////////////// compute data cost ////////////////////// ////////////////////// compute data cost //////////////////////
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
template <typename T, int channels> template <typename T, int channels>
__global__ void compute_data_cost(const T* selected_disp_pyr, T* data_cost_, int h, int w, int level, int nr_plane) __global__ void compute_data_cost(const T* selected_disp_pyr, T* data_cost_, int h, int w, int level, int nr_plane)
{ {
int x = blockIdx.x * blockDim.x + threadIdx.x; int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y; int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -454,11 +451,11 @@ namespace cv { namespace gpu { namespace csbp
data_cost[cdisp_step1 * d] = saturate_cast<T>(val); data_cost[cdisp_step1 * d] = saturate_cast<T>(val);
} }
} }
} }
template <typename T, int winsz, int channels> template <typename T, int winsz, int channels>
__global__ void compute_data_cost_reduce(const T* selected_disp_pyr, T* data_cost_, int level, int rows, int cols, int h, int nr_plane) __global__ void compute_data_cost_reduce(const T* selected_disp_pyr, T* data_cost_, int level, int rows, int cols, int h, int nr_plane)
{ {
int x_out = blockIdx.x; int x_out = blockIdx.x;
int y_out = blockIdx.y % h; int y_out = blockIdx.y % h;
int d = (blockIdx.y / h) * blockDim.z + threadIdx.z; int d = (blockIdx.y / h) * blockDim.z + threadIdx.z;
@ -475,7 +472,7 @@ namespace cv { namespace gpu { namespace csbp
int x0 = x_out << level; int x0 = x_out << level;
int y0 = y_out << level; int y0 = y_out << level;
int len = min(y0 + winsz, rows) - y0; int len = ::min(y0 + winsz, rows) - y0;
float val = 0.0f; float val = 0.0f;
if (x0 + tid < cols) if (x0 + tid < cols)
@ -519,12 +516,12 @@ namespace cv { namespace gpu { namespace csbp
if (tid == 0) if (tid == 0)
data_cost[cdisp_step1 * d] = saturate_cast<T>(dline[0]); data_cost[cdisp_step1 * d] = saturate_cast<T>(dline[0]);
} }
} }
template <typename T> template <typename T>
void compute_data_cost_caller_(const T* disp_selected_pyr, T* data_cost, int /*rows*/, int /*cols*/, void compute_data_cost_caller_(const T* disp_selected_pyr, T* data_cost, int /*rows*/, int /*cols*/,
int h, int w, int level, int nr_plane, int channels, cudaStream_t stream) int h, int w, int level, int nr_plane, int channels, cudaStream_t stream)
{ {
dim3 threads(32, 8, 1); dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1); dim3 grid(1, 1, 1);
@ -538,12 +535,12 @@ namespace cv { namespace gpu { namespace csbp
case 4: compute_data_cost<T, 4><<<grid, threads, 0, stream>>>(disp_selected_pyr, data_cost, h, w, level, nr_plane); break; case 4: compute_data_cost<T, 4><<<grid, threads, 0, stream>>>(disp_selected_pyr, data_cost, h, w, level, nr_plane); break;
default: cv::gpu::error("Unsupported channels count", __FILE__, __LINE__); default: cv::gpu::error("Unsupported channels count", __FILE__, __LINE__);
} }
} }
template <typename T, int winsz> template <typename T, int winsz>
void compute_data_cost_reduce_caller_(const T* disp_selected_pyr, T* data_cost, int rows, int cols, void compute_data_cost_reduce_caller_(const T* disp_selected_pyr, T* data_cost, int rows, int cols,
int h, int w, int level, int nr_plane, int channels, cudaStream_t stream) int h, int w, int level, int nr_plane, int channels, cudaStream_t stream)
{ {
const int threadsNum = 256; const int threadsNum = 256;
const size_t smem_size = threadsNum * sizeof(float); const size_t smem_size = threadsNum * sizeof(float);
@ -558,12 +555,12 @@ namespace cv { namespace gpu { namespace csbp
case 4: compute_data_cost_reduce<T, winsz, 4><<<grid, threads, smem_size, stream>>>(disp_selected_pyr, data_cost, level, rows, cols, h, nr_plane); break; case 4: compute_data_cost_reduce<T, winsz, 4><<<grid, threads, smem_size, stream>>>(disp_selected_pyr, data_cost, level, rows, cols, h, nr_plane); break;
default: cv::gpu::error("Unsupported channels count", __FILE__, __LINE__); default: cv::gpu::error("Unsupported channels count", __FILE__, __LINE__);
} }
} }
template<class T> template<class T>
void compute_data_cost(const T* disp_selected_pyr, T* data_cost, size_t msg_step1, size_t msg_step2, void compute_data_cost(const T* disp_selected_pyr, T* data_cost, size_t msg_step1, size_t msg_step2,
int rows, int cols, int h, int w, int h2, int level, int nr_plane, int channels, cudaStream_t stream) int rows, int cols, int h, int w, int h2, int level, int nr_plane, int channels, cudaStream_t stream)
{ {
typedef void (*ComputeDataCostCaller)(const T* disp_selected_pyr, T* data_cost, int rows, int cols, typedef void (*ComputeDataCostCaller)(const T* disp_selected_pyr, T* data_cost, int rows, int cols,
int h, int w, int level, int nr_plane, int channels, cudaStream_t stream); int h, int w, int level, int nr_plane, int channels, cudaStream_t stream);
@ -586,12 +583,12 @@ namespace cv { namespace gpu { namespace csbp
if (stream == 0) if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaDeviceSynchronize() );
} }
template void compute_data_cost(const short* disp_selected_pyr, short* data_cost, size_t msg_step1, size_t msg_step2, template void compute_data_cost(const short* disp_selected_pyr, short* data_cost, size_t msg_step1, size_t msg_step2,
int rows, int cols, int h, int w, int h2, int level, int nr_plane, int channels, cudaStream_t stream); int rows, int cols, int h, int w, int h2, int level, int nr_plane, int channels, cudaStream_t stream);
template void compute_data_cost(const float* disp_selected_pyr, float* data_cost, size_t msg_step1, size_t msg_step2, template void compute_data_cost(const float* disp_selected_pyr, float* data_cost, size_t msg_step1, size_t msg_step2,
int rows, int cols, int h, int w, int h2, int level, int nr_plane, int channels, cudaStream_t stream); int rows, int cols, int h, int w, int h2, int level, int nr_plane, int channels, cudaStream_t stream);
@ -601,12 +598,12 @@ namespace cv { namespace gpu { namespace csbp
template <typename T> template <typename T>
__device__ void get_first_k_element_increase(T* u_new, T* d_new, T* l_new, T* r_new, __device__ void get_first_k_element_increase(T* u_new, T* d_new, T* l_new, T* r_new,
const T* u_cur, const T* d_cur, const T* l_cur, const T* r_cur, const T* u_cur, const T* d_cur, const T* l_cur, const T* r_cur,
T* data_cost_selected, T* disparity_selected_new, T* data_cost_new, T* data_cost_selected, T* disparity_selected_new, T* data_cost_new,
const T* data_cost_cur, const T* disparity_selected_cur, const T* data_cost_cur, const T* disparity_selected_cur,
int nr_plane, int nr_plane2) int nr_plane, int nr_plane2)
{ {
for(int i = 0; i < nr_plane; i++) for(int i = 0; i < nr_plane; i++)
{ {
T minimum = numeric_limits<T>::max(); T minimum = numeric_limits<T>::max();
@ -631,24 +628,24 @@ namespace cv { namespace gpu { namespace csbp
data_cost_new[id * cdisp_step1] = numeric_limits<T>::max(); data_cost_new[id * cdisp_step1] = numeric_limits<T>::max();
} }
} }
template <typename T> template <typename T>
__global__ void init_message(T* u_new_, T* d_new_, T* l_new_, T* r_new_, __global__ void init_message(T* u_new_, T* d_new_, T* l_new_, T* r_new_,
const T* u_cur_, const T* d_cur_, const T* l_cur_, const T* r_cur_, const T* u_cur_, const T* d_cur_, const T* l_cur_, const T* r_cur_,
T* selected_disp_pyr_new, const T* selected_disp_pyr_cur, T* selected_disp_pyr_new, const T* selected_disp_pyr_cur,
T* data_cost_selected_, const T* data_cost_, T* data_cost_selected_, const T* data_cost_,
int h, int w, int nr_plane, int h2, int w2, int nr_plane2) int h, int w, int nr_plane, int h2, int w2, int nr_plane2)
{ {
int x = blockIdx.x * blockDim.x + threadIdx.x; int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y; int y = blockIdx.y * blockDim.y + threadIdx.y;
if (y < h && x < w) if (y < h && x < w)
{ {
const T* u_cur = u_cur_ + min(h2-1, y/2 + 1) * cmsg_step2 + x/2; const T* u_cur = u_cur_ + ::min(h2-1, y/2 + 1) * cmsg_step2 + x/2;
const T* d_cur = d_cur_ + max(0, y/2 - 1) * cmsg_step2 + x/2; const T* d_cur = d_cur_ + ::max(0, y/2 - 1) * cmsg_step2 + x/2;
const T* l_cur = l_cur_ + y/2 * cmsg_step2 + min(w2-1, x/2 + 1); const T* l_cur = l_cur_ + y/2 * cmsg_step2 + ::min(w2-1, x/2 + 1);
const T* r_cur = r_cur_ + y/2 * cmsg_step2 + max(0, x/2 - 1); const T* r_cur = r_cur_ + y/2 * cmsg_step2 + ::max(0, x/2 - 1);
T* data_cost_new = (T*)ctemp + y * cmsg_step1 + x; T* data_cost_new = (T*)ctemp + y * cmsg_step1 + x;
@ -680,16 +677,16 @@ namespace cv { namespace gpu { namespace csbp
data_cost_selected, disparity_selected_new, data_cost_new, data_cost_selected, disparity_selected_new, data_cost_new,
data_cost, disparity_selected_cur, nr_plane, nr_plane2); data_cost, disparity_selected_cur, nr_plane, nr_plane2);
} }
} }
template<class T> template<class T>
void init_message(T* u_new, T* d_new, T* l_new, T* r_new, void init_message(T* u_new, T* d_new, T* l_new, T* r_new,
const T* u_cur, const T* d_cur, const T* l_cur, const T* r_cur, const T* u_cur, const T* d_cur, const T* l_cur, const T* r_cur,
T* selected_disp_pyr_new, const T* selected_disp_pyr_cur, T* selected_disp_pyr_new, const T* selected_disp_pyr_cur,
T* data_cost_selected, const T* data_cost, size_t msg_step1, size_t msg_step2, T* data_cost_selected, const T* data_cost, size_t msg_step1, size_t msg_step2,
int h, int w, int nr_plane, int h2, int w2, int nr_plane2, cudaStream_t stream) int h, int w, int nr_plane, int h2, int w2, int nr_plane2, cudaStream_t stream)
{ {
size_t disp_step1 = msg_step1 * h; size_t disp_step1 = msg_step1 * h;
size_t disp_step2 = msg_step2 * h2; size_t disp_step2 = msg_step2 * h2;
@ -713,16 +710,16 @@ namespace cv { namespace gpu { namespace csbp
if (stream == 0) if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaDeviceSynchronize() );
} }
template void init_message(short* u_new, short* d_new, short* l_new, short* r_new, template void init_message(short* u_new, short* d_new, short* l_new, short* r_new,
const short* u_cur, const short* d_cur, const short* l_cur, const short* r_cur, const short* u_cur, const short* d_cur, const short* l_cur, const short* r_cur,
short* selected_disp_pyr_new, const short* selected_disp_pyr_cur, short* selected_disp_pyr_new, const short* selected_disp_pyr_cur,
short* data_cost_selected, const short* data_cost, size_t msg_step1, size_t msg_step2, short* data_cost_selected, const short* data_cost, size_t msg_step1, size_t msg_step2,
int h, int w, int nr_plane, int h2, int w2, int nr_plane2, cudaStream_t stream); int h, int w, int nr_plane, int h2, int w2, int nr_plane2, cudaStream_t stream);
template void init_message(float* u_new, float* d_new, float* l_new, float* r_new, template void init_message(float* u_new, float* d_new, float* l_new, float* r_new,
const float* u_cur, const float* d_cur, const float* l_cur, const float* r_cur, const float* u_cur, const float* d_cur, const float* l_cur, const float* r_cur,
float* selected_disp_pyr_new, const float* selected_disp_pyr_cur, float* selected_disp_pyr_new, const float* selected_disp_pyr_cur,
float* data_cost_selected, const float* data_cost, size_t msg_step1, size_t msg_step2, float* data_cost_selected, const float* data_cost, size_t msg_step1, size_t msg_step2,
@ -732,10 +729,10 @@ namespace cv { namespace gpu { namespace csbp
//////////////////// calc all iterations ///////////////////// //////////////////// calc all iterations /////////////////////
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
template <typename T> template <typename T>
__device__ void message_per_pixel(const T* data, T* msg_dst, const T* msg1, const T* msg2, const T* msg3, __device__ void message_per_pixel(const T* data, T* msg_dst, const T* msg1, const T* msg2, const T* msg3,
const T* dst_disp, const T* src_disp, int nr_plane, T* temp) const T* dst_disp, const T* src_disp, int nr_plane, T* temp)
{ {
T minimum = numeric_limits<T>::max(); T minimum = numeric_limits<T>::max();
for(int d = 0; d < nr_plane; d++) for(int d = 0; d < nr_plane; d++)
@ -756,7 +753,7 @@ namespace cv { namespace gpu { namespace csbp
T src_disp_reg = src_disp[d * cdisp_step1]; T src_disp_reg = src_disp[d * cdisp_step1];
for(int d2 = 0; d2 < nr_plane; d2++) for(int d2 = 0; d2 < nr_plane; d2++)
cost_min = fmin(cost_min, msg_dst[d2 * cdisp_step1] + cdisc_single_jump * abs(dst_disp[d2 * cdisp_step1] - src_disp_reg)); cost_min = fmin(cost_min, msg_dst[d2 * cdisp_step1] + cdisc_single_jump * ::abs(dst_disp[d2 * cdisp_step1] - src_disp_reg));
temp[d * cdisp_step1] = saturate_cast<T>(cost_min); temp[d * cdisp_step1] = saturate_cast<T>(cost_min);
sum += cost_min; sum += cost_min;
@ -765,11 +762,11 @@ namespace cv { namespace gpu { namespace csbp
for(int d = 0; d < nr_plane; d++) for(int d = 0; d < nr_plane; d++)
msg_dst[d * cdisp_step1] = saturate_cast<T>(temp[d * cdisp_step1] - sum); msg_dst[d * cdisp_step1] = saturate_cast<T>(temp[d * cdisp_step1] - sum);
} }
template <typename T> template <typename T>
__global__ void compute_message(T* u_, T* d_, T* l_, T* r_, const T* data_cost_selected, const T* selected_disp_pyr_cur, int h, int w, int nr_plane, int i) __global__ void compute_message(T* u_, T* d_, T* l_, T* r_, const T* data_cost_selected, const T* selected_disp_pyr_cur, int h, int w, int nr_plane, int i)
{ {
int y = blockIdx.y * blockDim.y + threadIdx.y; int y = blockIdx.y * blockDim.y + threadIdx.y;
int x = ((blockIdx.x * blockDim.x + threadIdx.x) << 1) + ((y + i) & 1); int x = ((blockIdx.x * blockDim.x + threadIdx.x) << 1) + ((y + i) & 1);
@ -791,13 +788,13 @@ namespace cv { namespace gpu { namespace csbp
message_per_pixel(data, l, u + cmsg_step1, d - cmsg_step1, l + 1, disp, disp - 1, nr_plane, temp); message_per_pixel(data, l, u + cmsg_step1, d - cmsg_step1, l + 1, disp, disp - 1, nr_plane, temp);
message_per_pixel(data, r, u + cmsg_step1, d - cmsg_step1, r - 1, disp, disp + 1, nr_plane, temp); message_per_pixel(data, r, u + cmsg_step1, d - cmsg_step1, r - 1, disp, disp + 1, nr_plane, temp);
} }
} }
template<class T> template<class T>
void calc_all_iterations(T* u, T* d, T* l, T* r, const T* data_cost_selected, void calc_all_iterations(T* u, T* d, T* l, T* r, const T* data_cost_selected,
const T* selected_disp_pyr_cur, size_t msg_step, int h, int w, int nr_plane, int iters, cudaStream_t stream) const T* selected_disp_pyr_cur, size_t msg_step, int h, int w, int nr_plane, int iters, cudaStream_t stream)
{ {
size_t disp_step = msg_step * h; size_t disp_step = msg_step * h;
cudaSafeCall( cudaMemcpyToSymbol(cdisp_step1, &disp_step, sizeof(size_t)) ); cudaSafeCall( cudaMemcpyToSymbol(cdisp_step1, &disp_step, sizeof(size_t)) );
cudaSafeCall( cudaMemcpyToSymbol(cmsg_step1, &msg_step, sizeof(size_t)) ); cudaSafeCall( cudaMemcpyToSymbol(cmsg_step1, &msg_step, sizeof(size_t)) );
@ -816,12 +813,12 @@ namespace cv { namespace gpu { namespace csbp
if (stream == 0) if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaDeviceSynchronize() );
} }
}; };
template void calc_all_iterations(short* u, short* d, short* l, short* r, const short* data_cost_selected, const short* selected_disp_pyr_cur, size_t msg_step, template void calc_all_iterations(short* u, short* d, short* l, short* r, const short* data_cost_selected, const short* selected_disp_pyr_cur, size_t msg_step,
int h, int w, int nr_plane, int iters, cudaStream_t stream); int h, int w, int nr_plane, int iters, cudaStream_t stream);
template void calc_all_iterations(float* u, float* d, float* l, float* r, const float* data_cost_selected, const float* selected_disp_pyr_cur, size_t msg_step, template void calc_all_iterations(float* u, float* d, float* l, float* r, const float* data_cost_selected, const float* selected_disp_pyr_cur, size_t msg_step,
int h, int w, int nr_plane, int iters, cudaStream_t stream); int h, int w, int nr_plane, int iters, cudaStream_t stream);
@ -830,11 +827,11 @@ namespace cv { namespace gpu { namespace csbp
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
template <typename T> template <typename T>
__global__ void compute_disp(const T* u_, const T* d_, const T* l_, const T* r_, __global__ void compute_disp(const T* u_, const T* d_, const T* l_, const T* r_,
const T* data_cost_selected, const T* disp_selected_pyr, const T* data_cost_selected, const T* disp_selected_pyr,
short* disp, size_t res_step, int cols, int rows, int nr_plane) short* disp, size_t res_step, int cols, int rows, int nr_plane)
{ {
int x = blockIdx.x * blockDim.x + threadIdx.x; int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y; int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -863,12 +860,12 @@ namespace cv { namespace gpu { namespace csbp
} }
disp[res_step * y + x] = best; disp[res_step * y + x] = best;
} }
} }
template<class T> template<class T>
void compute_disp(const T* u, const T* d, const T* l, const T* r, const T* data_cost_selected, const T* disp_selected, size_t msg_step, void compute_disp(const T* u, const T* d, const T* l, const T* r, const T* data_cost_selected, const T* disp_selected, size_t msg_step,
const DevMem2D_<short>& disp, int nr_plane, cudaStream_t stream) const DevMem2D_<short>& disp, int nr_plane, cudaStream_t stream)
{ {
size_t disp_step = disp.rows * msg_step; size_t disp_step = disp.rows * msg_step;
cudaSafeCall( cudaMemcpyToSymbol(cdisp_step1, &disp_step, sizeof(size_t)) ); cudaSafeCall( cudaMemcpyToSymbol(cdisp_step1, &disp_step, sizeof(size_t)) );
cudaSafeCall( cudaMemcpyToSymbol(cmsg_step1, &msg_step, sizeof(size_t)) ); cudaSafeCall( cudaMemcpyToSymbol(cmsg_step1, &msg_step, sizeof(size_t)) );
@ -885,11 +882,14 @@ namespace cv { namespace gpu { namespace csbp
if (stream == 0) if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaDeviceSynchronize() );
} }
template void compute_disp(const short* u, const short* d, const short* l, const short* r, const short* data_cost_selected, const short* disp_selected, size_t msg_step, template void compute_disp(const short* u, const short* d, const short* l, const short* r, const short* data_cost_selected, const short* disp_selected, size_t msg_step,
const DevMem2D_<short>& disp, int nr_plane, cudaStream_t stream); const DevMem2D_<short>& disp, int nr_plane, cudaStream_t stream);
template void compute_disp(const float* u, const float* d, const float* l, const float* r, const float* data_cost_selected, const float* disp_selected, size_t msg_step, template void compute_disp(const float* u, const float* d, const float* l, const float* r, const float* data_cost_selected, const float* disp_selected, size_t msg_step,
const DevMem2D_<short>& disp, int nr_plane, cudaStream_t stream); const DevMem2D_<short>& disp, int nr_plane, cudaStream_t stream);
}}}
} // namespace stereocsbp
END_OPENCV_DEVICE_NAMESPACE
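As with the stereobp file above, these entry points are driven coarse-to-fine from the host; a rough sketch of the per-level call order (all variable names below are placeholders, only the signatures come from this file):

    init_data_cost(rows, cols, disp_selected_pyr_cur, data_cost_selected, msg_step1, h, w, level, nr_plane, ndisp, channels, use_local_init_data_cost, stream); // coarsest level: pick the best nr_plane disparities per pixel
    compute_data_cost(disp_selected_pyr_cur, data_cost, msg_step1, msg_step2, rows, cols, h, w, h2, level, nr_plane, channels, stream);                         // finer levels: costs only for the selected planes
    init_message(u_new, d_new, l_new, r_new, u_cur, d_cur, l_cur, r_cur, disp_selected_pyr_new, disp_selected_pyr_cur, data_cost_selected, data_cost, msg_step1, msg_step2, h, w, nr_plane, h2, w2, nr_plane2, stream); // re-select planes and upsample the four messages
    calc_all_iterations(u_cur, d_cur, l_cur, r_cur, data_cost_selected, disp_selected_pyr_cur, msg_step1, h, w, nr_plane, iters, stream);
    compute_disp(u_cur, d_cur, l_cur, r_cur, data_cost_selected, disp_selected_pyr_cur, msg_step1, disp, nr_plane, stream);                                     // finest level: winner-take-all over the selected planes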
View File
@ -52,47 +52,75 @@
#include "opencv2/gpu/device/functional.hpp" #include "opencv2/gpu/device/functional.hpp"
#include "opencv2/gpu/device/filters.hpp" #include "opencv2/gpu/device/filters.hpp"
using namespace cv::gpu;
using namespace cv::gpu::device;
#define CV_PI 3.1415926535897932384626433832795f
namespace cv { namespace gpu { namespace surf
{
////////////////////////////////////////////////////////////////////////
// Global parameters
// The maximum number of features (before subpixel interpolation) that memory is reserved for.
__constant__ int c_max_candidates;
// The maximum number of features that memory is reserved for.
__constant__ int c_max_features;
// The image size.
__constant__ int c_img_rows;
__constant__ int c_img_cols;
// The number of layers.
__constant__ int c_nOctaveLayers;
// The hessian threshold.
__constant__ float c_hessianThreshold;
// The current octave.
__constant__ int c_octave;
// The current layer size.
__constant__ int c_layer_rows;
__constant__ int c_layer_cols;
////////////////////////////////////////////////////////////////////////
// Integral image texture
texture<unsigned int, 2, cudaReadModeElementType> sumTex(0, cudaFilterModePoint, cudaAddressModeClamp);
texture<unsigned int, 2, cudaReadModeElementType> maskSumTex(0, cudaFilterModePoint, cudaAddressModeClamp);
BEGIN_OPENCV_DEVICE_NAMESPACE
namespace surf {
////////////////////////////////////////////////////////////////////////
// Global parameters
// The maximum number of features (before subpixel interpolation) that memory is reserved for.
__constant__ int c_max_candidates;
// The maximum number of features that memory is reserved for.
__constant__ int c_max_features;
// The image size.
__constant__ int c_img_rows;
__constant__ int c_img_cols;
// The number of layers.
__constant__ int c_nOctaveLayers;
// The hessian threshold.
__constant__ float c_hessianThreshold;
// The current octave.
__constant__ int c_octave;
// The current layer size.
__constant__ int c_layer_rows;
__constant__ int c_layer_cols;
void loadGlobalConstants(int maxCandidates, int maxFeatures, int img_rows, int img_cols, int nOctaveLayers, float hessianThreshold)
{
cudaSafeCall( cudaMemcpyToSymbol(c_max_candidates, &maxCandidates, sizeof(maxCandidates)) );
cudaSafeCall( cudaMemcpyToSymbol(c_max_features, &maxFeatures, sizeof(maxFeatures)) );
cudaSafeCall( cudaMemcpyToSymbol(c_img_rows, &img_rows, sizeof(img_rows)) );
cudaSafeCall( cudaMemcpyToSymbol(c_img_cols, &img_cols, sizeof(img_cols)) );
cudaSafeCall( cudaMemcpyToSymbol(c_nOctaveLayers, &nOctaveLayers, sizeof(nOctaveLayers)) );
cudaSafeCall( cudaMemcpyToSymbol(c_hessianThreshold, &hessianThreshold, sizeof(hessianThreshold)) );
}
void loadOctaveConstants(int octave, int layer_rows, int layer_cols)
{
cudaSafeCall( cudaMemcpyToSymbol(c_octave, &octave, sizeof(octave)) );
cudaSafeCall( cudaMemcpyToSymbol(c_layer_rows, &layer_rows, sizeof(layer_rows)) );
cudaSafeCall( cudaMemcpyToSymbol(c_layer_cols, &layer_cols, sizeof(layer_cols)) );
}
////////////////////////////////////////////////////////////////////////
// Integral image texture
texture<unsigned char, 2, cudaReadModeElementType> imgTex(0, cudaFilterModePoint, cudaAddressModeClamp);
texture<unsigned int, 2, cudaReadModeElementType> sumTex(0, cudaFilterModePoint, cudaAddressModeClamp);
texture<unsigned int, 2, cudaReadModeElementType> maskSumTex(0, cudaFilterModePoint, cudaAddressModeClamp);
void bindImgTex(DevMem2Db img)
{
bindTexture(&imgTex, img);
}
void bindSumTex(DevMem2D_<uint> sum)
{
bindTexture(&sumTex, sum);
}
void bindMaskSumTex(DevMem2D_<uint> maskSum)
{
bindTexture(&maskSumTex, maskSum);
}
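Typical host-side use, before launching any kernel that samples the integral image (the surrounding variables are assumed; only the bind helpers above come from this file):

    bindSumTex(sum);  // sum: the DevMem2D_<uint> integral image
    icvCalcLayerDetAndTrace_gpu(det, trace, img_rows, img_cols, octave, nOctaveLayers);
    // device code then samples it via tex2D(sumTex, x, y) rather than through a pointer argument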
template <int N> __device__ float icvCalcHaarPatternSum(const float src[][5], int oldSize, int newSize, int y, int x) template <int N> __device__ float icvCalcHaarPatternSum(const float src[][5], int oldSize, int newSize, int y, int x)
{ {
#if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 200 #if __CUDA_ARCH__ >= 200
typedef double real_t; typedef double real_t;
#else #else
typedef float real_t; typedef float real_t;
#endif #endif
float ratio = (float)newSize / oldSize; float ratio = (float)newSize / oldSize;
@ -116,17 +144,17 @@ namespace cv { namespace gpu { namespace surf
} }
return (float)d; return (float)d;
} }
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
// Hessian // Hessian
__constant__ float c_DX [3][5] = { {0, 2, 3, 7, 1}, {3, 2, 6, 7, -2}, {6, 2, 9, 7, 1} }; __constant__ float c_DX [3][5] = { {0, 2, 3, 7, 1}, {3, 2, 6, 7, -2}, {6, 2, 9, 7, 1} };
__constant__ float c_DY [3][5] = { {2, 0, 7, 3, 1}, {2, 3, 7, 6, -2}, {2, 6, 7, 9, 1} }; __constant__ float c_DY [3][5] = { {2, 0, 7, 3, 1}, {2, 3, 7, 6, -2}, {2, 6, 7, 9, 1} };
__constant__ float c_DXY[4][5] = { {1, 1, 4, 4, 1}, {5, 1, 8, 4, -1}, {1, 5, 4, 8, -1}, {5, 5, 8, 8, 1} }; __constant__ float c_DXY[4][5] = { {1, 1, 4, 4, 1}, {5, 1, 8, 4, -1}, {1, 5, 4, 8, -1}, {5, 5, 8, 8, 1} };
__host__ __device__ __forceinline__ int calcSize(int octave, int layer) __host__ __device__ __forceinline__ int calcSize(int octave, int layer)
{ {
/* Wavelet size at first layer of first octave. */ /* Wavelet size at first layer of first octave. */
const int HAAR_SIZE0 = 9; const int HAAR_SIZE0 = 9;
@ -137,10 +165,10 @@ namespace cv { namespace gpu { namespace surf
const int HAAR_SIZE_INC = 6; const int HAAR_SIZE_INC = 6;
return (HAAR_SIZE0 + HAAR_SIZE_INC * layer) << octave; return (HAAR_SIZE0 + HAAR_SIZE_INC * layer) << octave;
} }
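For example, calcSize(1, 2) = (9 + 6 * 2) << 1 = 42: the 9x9 base Haar kernel grows by 6 pixels per layer and doubles in size with each octave.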
__global__ void icvCalcLayerDetAndTrace(PtrStepf det, PtrStepf trace) __global__ void icvCalcLayerDetAndTrace(PtrStepf det, PtrStepf trace)
{ {
// Determine the indices // Determine the indices
const int gridDim_y = gridDim.y / (c_nOctaveLayers + 2); const int gridDim_y = gridDim.y / (c_nOctaveLayers + 2);
const int blockIdx_y = blockIdx.y % gridDim_y; const int blockIdx_y = blockIdx.y % gridDim_y;
@ -167,10 +195,10 @@ namespace cv { namespace gpu { namespace surf
det.ptr(layer * c_layer_rows + i + margin)[j + margin] = dx * dy - 0.81f * dxy * dxy; det.ptr(layer * c_layer_rows + i + margin)[j + margin] = dx * dy - 0.81f * dxy * dxy;
trace.ptr(layer * c_layer_rows + i + margin)[j + margin] = dx + dy; trace.ptr(layer * c_layer_rows + i + margin)[j + margin] = dx + dy;
} }
} }
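The two writes above are the box-filter approximation of the Hessian used by SURF: the determinant $\det H_{\text{approx}} = D_{xx}D_{yy} - (0.9\,D_{xy})^2 = dx\cdot dy - 0.81\,dxy^2$ (hence the 0.81 factor) and the trace $D_{xx} + D_{yy}$, whose sign later serves as the Laplacian flag.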
void icvCalcLayerDetAndTrace_gpu(const PtrStepf& det, const PtrStepf& trace, int img_rows, int img_cols, int octave, int nOctaveLayers) void icvCalcLayerDetAndTrace_gpu(const PtrStepf& det, const PtrStepf& trace, int img_rows, int img_cols, int octave, int nOctaveLayers)
{ {
const int min_size = calcSize(octave, 0); const int min_size = calcSize(octave, 0);
const int max_samples_i = 1 + ((img_rows - min_size) >> octave); const int max_samples_i = 1 + ((img_rows - min_size) >> octave);
const int max_samples_j = 1 + ((img_cols - min_size) >> octave); const int max_samples_j = 1 + ((img_cols - min_size) >> octave);
@ -185,23 +213,15 @@ namespace cv { namespace gpu { namespace surf
cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaDeviceSynchronize() );
} }
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
// NONMAX // NONMAX
struct WithOutMask
{
static __device__ __forceinline__ bool check(int, int, int)
{
return true;
}
};
__constant__ float c_DM[5] = {0, 0, 9, 9, 1}; __constant__ float c_DM[5] = {0, 0, 9, 9, 1};
struct WithMask struct WithMask
{ {
static __device__ bool check(int sum_i, int sum_j, int size) static __device__ bool check(int sum_i, int sum_j, int size)
{ {
float ratio = (float)size / 9.0f; float ratio = (float)size / 9.0f;
@ -223,11 +243,11 @@ namespace cv { namespace gpu { namespace surf
return (d >= 0.5f); return (d >= 0.5f);
} }
}; };
template <typename Mask> template <typename Mask>
__global__ void icvFindMaximaInLayer(const PtrStepf det, const PtrStepf trace, int4* maxPosBuffer, unsigned int* maxCounter) __global__ void icvFindMaximaInLayer(const PtrStepf det, const PtrStepf trace, int4* maxPosBuffer, unsigned int* maxCounter)
{ {
#if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 110 #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
extern __shared__ float N9[]; extern __shared__ float N9[];
@ -250,9 +270,9 @@ namespace cv { namespace gpu { namespace surf
// Is this thread within the hessian buffer? // Is this thread within the hessian buffer?
const int zoff = blockDim.x * blockDim.y; const int zoff = blockDim.x * blockDim.y;
const int localLin = threadIdx.x + threadIdx.y * blockDim.x + zoff; const int localLin = threadIdx.x + threadIdx.y * blockDim.x + zoff;
N9[localLin - zoff] = det.ptr(c_layer_rows * (layer - 1) + min(max(i, 0), c_img_rows - 1))[min(max(j, 0), c_img_cols - 1)]; N9[localLin - zoff] = det.ptr(c_layer_rows * (layer - 1) + ::min(::max(i, 0), c_img_rows - 1))[::min(::max(j, 0), c_img_cols - 1)];
N9[localLin ] = det.ptr(c_layer_rows * (layer ) + min(max(i, 0), c_img_rows - 1))[min(max(j, 0), c_img_cols - 1)]; N9[localLin ] = det.ptr(c_layer_rows * (layer ) + ::min(::max(i, 0), c_img_rows - 1))[::min(::max(j, 0), c_img_cols - 1)];
N9[localLin + zoff] = det.ptr(c_layer_rows * (layer + 1) + min(max(i, 0), c_img_rows - 1))[min(max(j, 0), c_img_cols - 1)]; N9[localLin + zoff] = det.ptr(c_layer_rows * (layer + 1) + ::min(::max(i, 0), c_img_rows - 1))[::min(::max(j, 0), c_img_cols - 1)];
__syncthreads(); __syncthreads();
if (i < c_layer_rows - margin && j < c_layer_cols - margin && threadIdx.x > 0 && threadIdx.x < blockDim.x - 1 && threadIdx.y > 0 && threadIdx.y < blockDim.y - 1) if (i < c_layer_rows - margin && j < c_layer_cols - margin && threadIdx.x > 0 && threadIdx.x < blockDim.x - 1 && threadIdx.y > 0 && threadIdx.y < blockDim.y - 1)
@ -316,11 +336,11 @@ namespace cv { namespace gpu { namespace surf
} }
#endif #endif
} }
void icvFindMaximaInLayer_gpu(const PtrStepf& det, const PtrStepf& trace, int4* maxPosBuffer, unsigned int* maxCounter, void icvFindMaximaInLayer_gpu(const PtrStepf& det, const PtrStepf& trace, int4* maxPosBuffer, unsigned int* maxCounter,
int img_rows, int img_cols, int octave, bool use_mask, int nOctaveLayers) int img_rows, int img_cols, int octave, bool use_mask, int nOctaveLayers)
{ {
const int layer_rows = img_rows >> octave; const int layer_rows = img_rows >> octave;
const int layer_cols = img_cols >> octave; const int layer_cols = img_cols >> octave;
@ -342,15 +362,15 @@ namespace cv { namespace gpu { namespace surf
cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaDeviceSynchronize() );
} }
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
// INTERPOLATION // INTERPOLATION
__global__ void icvInterpolateKeypoint(const PtrStepf det, const int4* maxPosBuffer, __global__ void icvInterpolateKeypoint(const PtrStepf det, const int4* maxPosBuffer,
float* featureX, float* featureY, int* featureLaplacian, float* featureSize, float* featureHessian, float* featureX, float* featureY, int* featureLaplacian, float* featureSize, float* featureHessian,
unsigned int* featureCounter) unsigned int* featureCounter)
{ {
#if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 110 #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
const int4 maxPos = maxPosBuffer[blockIdx.x]; const int4 maxPos = maxPosBuffer[blockIdx.x];
@ -400,7 +420,7 @@ namespace cv { namespace gpu { namespace surf
if (solve3x3(H, dD, x)) if (solve3x3(H, dD, x))
{ {
if (fabs(x[0]) <= 1.f && fabs(x[1]) <= 1.f && fabs(x[2]) <= 1.f) if (::fabs(x[0]) <= 1.f && ::fabs(x[1]) <= 1.f && ::fabs(x[2]) <= 1.f)
{ {
// if the step is within the interpolation region, perform it // if the step is within the interpolation region, perform it
@ -448,12 +468,12 @@ namespace cv { namespace gpu { namespace surf
} // If this is thread 0. } // If this is thread 0.
#endif #endif
} }
void icvInterpolateKeypoint_gpu(const PtrStepf& det, const int4* maxPosBuffer, unsigned int maxCounter, void icvInterpolateKeypoint_gpu(const PtrStepf& det, const int4* maxPosBuffer, unsigned int maxCounter,
float* featureX, float* featureY, int* featureLaplacian, float* featureSize, float* featureHessian, float* featureX, float* featureY, int* featureLaplacian, float* featureSize, float* featureHessian,
unsigned int* featureCounter) unsigned int* featureCounter)
{ {
dim3 threads; dim3 threads;
threads.x = 3; threads.x = 3;
threads.y = 3; threads.y = 3;
@ -466,24 +486,24 @@ namespace cv { namespace gpu { namespace surf
cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaDeviceSynchronize() );
} }
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
// Orientation // Orientation
#define ORI_SEARCH_INC 5 #define ORI_SEARCH_INC 5
#define ORI_WIN 60 #define ORI_WIN 60
#define ORI_SAMPLES 113 #define ORI_SAMPLES 113
__constant__ float c_aptX[ORI_SAMPLES] = {-6, -5, -5, -5, -5, -5, -5, -5, -4, -4, -4, -4, -4, -4, -4, -4, -4, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 6}; __constant__ float c_aptX[ORI_SAMPLES] = {-6, -5, -5, -5, -5, -5, -5, -5, -4, -4, -4, -4, -4, -4, -4, -4, -4, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 6};
__constant__ float c_aptY[ORI_SAMPLES] = {0, -3, -2, -1, 0, 1, 2, 3, -4, -3, -2, -1, 0, 1, 2, 3, 4, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -4, -3, -2, -1, 0, 1, 2, 3, 4, -3, -2, -1, 0, 1, 2, 3, 0}; __constant__ float c_aptY[ORI_SAMPLES] = {0, -3, -2, -1, 0, 1, 2, 3, -4, -3, -2, -1, 0, 1, 2, 3, 4, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -4, -3, -2, -1, 0, 1, 2, 3, 4, -3, -2, -1, 0, 1, 2, 3, 0};
__constant__ float c_aptW[ORI_SAMPLES] = {0.001455130288377404f, 0.001707611023448408f, 0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f, 0.003238451667129993f, 0.002547456417232752f, 0.001707611023448408f, 0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f, 0.00665318313986063f, 0.00720730796456337f, 0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f, 0.002003900473937392f, 0.001707611023448408f, 0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f, 0.01164754293859005f, 0.01261763460934162f, 0.01164754293859005f, 0.009162282571196556f, 0.006141661666333675f, 0.0035081731621176f, 0.001707611023448408f, 0.002547456417232752f, 0.005233579315245152f, 0.009162282571196556f, 0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f, 0.01366852037608624f, 0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.003238451667129993f, 0.00665318313986063f, 0.01164754293859005f, 0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f, 0.01737609319388866f, 0.01164754293859005f, 0.00665318313986063f, 0.003238451667129993f, 0.001455130288377404f, 0.0035081731621176f, 0.00720730796456337f, 0.01261763460934162f, 0.0188232995569706f, 0.02392910048365593f, 0.02592208795249462f, 0.02392910048365593f, 0.0188232995569706f, 0.01261763460934162f, 0.00720730796456337f, 0.0035081731621176f, 0.001455130288377404f, 0.003238451667129993f, 0.00665318313986063f, 0.01164754293859005f, 0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f, 0.01737609319388866f, 0.01164754293859005f, 0.00665318313986063f, 0.003238451667129993f, 0.002547456417232752f, 0.005233579315245152f, 0.009162282571196556f, 0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f, 0.01366852037608624f, 0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.001707611023448408f, 0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f, 0.01164754293859005f, 0.01261763460934162f, 0.01164754293859005f, 0.009162282571196556f, 0.006141661666333675f, 0.0035081731621176f, 0.001707611023448408f, 0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f, 0.00665318313986063f, 0.00720730796456337f, 0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f, 0.002003900473937392f, 0.001707611023448408f, 0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f, 0.003238451667129993f, 0.002547456417232752f, 0.001707611023448408f, 0.001455130288377404f}; __constant__ float c_aptW[ORI_SAMPLES] = {0.001455130288377404f, 0.001707611023448408f, 0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f, 0.003238451667129993f, 0.002547456417232752f, 0.001707611023448408f, 0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f, 0.00665318313986063f, 0.00720730796456337f, 0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f, 0.002003900473937392f, 0.001707611023448408f, 0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f, 0.01164754293859005f, 0.01261763460934162f, 0.01164754293859005f, 0.009162282571196556f, 0.006141661666333675f, 0.0035081731621176f, 0.001707611023448408f, 0.002547456417232752f, 0.005233579315245152f, 0.009162282571196556f, 0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f, 0.01366852037608624f, 0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.003238451667129993f, 0.00665318313986063f, 0.01164754293859005f, 
0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f, 0.01737609319388866f, 0.01164754293859005f, 0.00665318313986063f, 0.003238451667129993f, 0.001455130288377404f, 0.0035081731621176f, 0.00720730796456337f, 0.01261763460934162f, 0.0188232995569706f, 0.02392910048365593f, 0.02592208795249462f, 0.02392910048365593f, 0.0188232995569706f, 0.01261763460934162f, 0.00720730796456337f, 0.0035081731621176f, 0.001455130288377404f, 0.003238451667129993f, 0.00665318313986063f, 0.01164754293859005f, 0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f, 0.01737609319388866f, 0.01164754293859005f, 0.00665318313986063f, 0.003238451667129993f, 0.002547456417232752f, 0.005233579315245152f, 0.009162282571196556f, 0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f, 0.01366852037608624f, 0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.001707611023448408f, 0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f, 0.01164754293859005f, 0.01261763460934162f, 0.01164754293859005f, 0.009162282571196556f, 0.006141661666333675f, 0.0035081731621176f, 0.001707611023448408f, 0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f, 0.00665318313986063f, 0.00720730796456337f, 0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f, 0.002003900473937392f, 0.001707611023448408f, 0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f, 0.003238451667129993f, 0.002547456417232752f, 0.001707611023448408f, 0.001455130288377404f};
__constant__ float c_NX[2][5] = {{0, 0, 2, 4, -1}, {2, 0, 4, 4, 1}}; __constant__ float c_NX[2][5] = {{0, 0, 2, 4, -1}, {2, 0, 4, 4, 1}};
__constant__ float c_NY[2][5] = {{0, 0, 4, 2, 1}, {0, 2, 4, 4, -1}}; __constant__ float c_NY[2][5] = {{0, 0, 4, 2, 1}, {0, 2, 4, 4, -1}};
__global__ void icvCalcOrientation(const float* featureX, const float* featureY, const float* featureSize, float* featureDir) __global__ void icvCalcOrientation(const float* featureX, const float* featureY, const float* featureSize, float* featureDir)
{ {
#if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 110 #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
__shared__ float s_X[128]; __shared__ float s_X[128];
@ -540,25 +560,25 @@ namespace cv { namespace gpu { namespace surf
const int dir = (i * 4 + threadIdx.y) * ORI_SEARCH_INC; const int dir = (i * 4 + threadIdx.y) * ORI_SEARCH_INC;
float sumx = 0.0f, sumy = 0.0f; float sumx = 0.0f, sumy = 0.0f;
int d = abs(__float2int_rn(s_angle[threadIdx.x]) - dir); int d = ::abs(__float2int_rn(s_angle[threadIdx.x]) - dir);
if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2) if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
{ {
sumx = s_X[threadIdx.x]; sumx = s_X[threadIdx.x];
sumy = s_Y[threadIdx.x]; sumy = s_Y[threadIdx.x];
} }
d = abs(__float2int_rn(s_angle[threadIdx.x + 32]) - dir); d = ::abs(__float2int_rn(s_angle[threadIdx.x + 32]) - dir);
if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2) if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
{ {
sumx += s_X[threadIdx.x + 32]; sumx += s_X[threadIdx.x + 32];
sumy += s_Y[threadIdx.x + 32]; sumy += s_Y[threadIdx.x + 32];
} }
d = abs(__float2int_rn(s_angle[threadIdx.x + 64]) - dir); d = ::abs(__float2int_rn(s_angle[threadIdx.x + 64]) - dir);
if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2) if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
{ {
sumx += s_X[threadIdx.x + 64]; sumx += s_X[threadIdx.x + 64];
sumy += s_Y[threadIdx.x + 64]; sumy += s_Y[threadIdx.x + 64];
} }
d = abs(__float2int_rn(s_angle[threadIdx.x + 96]) - dir); d = ::abs(__float2int_rn(s_angle[threadIdx.x + 96]) - dir);
if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2) if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
{ {
sumx += s_X[threadIdx.x + 96]; sumx += s_X[threadIdx.x + 96];
@ -567,8 +587,8 @@ namespace cv { namespace gpu { namespace surf
float* s_sum_row = s_sum + threadIdx.y * 32; float* s_sum_row = s_sum + threadIdx.y * 32;
reduce<32>(s_sum_row, sumx, threadIdx.x, plus<volatile float>()); device::reduce<32>(s_sum_row, sumx, threadIdx.x, plus<volatile float>());
reduce<32>(s_sum_row, sumy, threadIdx.x, plus<volatile float>()); device::reduce<32>(s_sum_row, sumy, threadIdx.x, plus<volatile float>());
const float temp_mod = sumx * sumx + sumy * sumy; const float temp_mod = sumx * sumx + sumy * sumy;
if (temp_mod > best_mod) if (temp_mod > best_mod)
@ -624,14 +644,14 @@ namespace cv { namespace gpu { namespace surf
} }
#endif #endif
} }
#undef ORI_SEARCH_INC #undef ORI_SEARCH_INC
#undef ORI_WIN #undef ORI_WIN
#undef ORI_SAMPLES #undef ORI_SAMPLES
void icvCalcOrientation_gpu(const float* featureX, const float* featureY, const float* featureSize, float* featureDir, int nFeatures) void icvCalcOrientation_gpu(const float* featureX, const float* featureY, const float* featureSize, float* featureDir, int nFeatures)
{ {
dim3 threads; dim3 threads;
threads.x = 32; threads.x = 32;
threads.y = 4; threads.y = 4;
@ -643,17 +663,15 @@ namespace cv { namespace gpu { namespace surf
cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaDeviceSynchronize() );
} }
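The orientation kernel above now reaches abs and fabs through the global namespace (::abs, ::fabs). A minimal CUDA illustration of that pattern, assuming the usual motivation for such a change (keeping the device overloads unambiguous once host headers pull std:: overloads into scope); the helper name is illustrative, not from this file:

    __device__ int angular_distance(float angle, int dir)
    {
        // ::abs resolves to CUDA's global-scope abs(int), never a host-only overload
        return ::abs(__float2int_rn(angle) - dir);
    }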
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
// Descriptors // Descriptors
#define PATCH_SZ 20 #define PATCH_SZ 20
texture<unsigned char, 2, cudaReadModeElementType> imgTex(0, cudaFilterModePoint, cudaAddressModeClamp); __constant__ float c_DW[PATCH_SZ * PATCH_SZ] =
{
__constant__ float c_DW[PATCH_SZ * PATCH_SZ] =
{
3.695352233989979e-006f, 8.444558261544444e-006f, 1.760426494001877e-005f, 3.34794785885606e-005f, 5.808438800158911e-005f, 9.193058212986216e-005f, 0.0001327334757661447f, 0.0001748319627949968f, 0.0002100782439811155f, 0.0002302826324012131f, 0.0002302826324012131f, 0.0002100782439811155f, 0.0001748319627949968f, 0.0001327334757661447f, 9.193058212986216e-005f, 5.808438800158911e-005f, 3.34794785885606e-005f, 1.760426494001877e-005f, 8.444558261544444e-006f, 3.695352233989979e-006f, 3.695352233989979e-006f, 8.444558261544444e-006f, 1.760426494001877e-005f, 3.34794785885606e-005f, 5.808438800158911e-005f, 9.193058212986216e-005f, 0.0001327334757661447f, 0.0001748319627949968f, 0.0002100782439811155f, 0.0002302826324012131f, 0.0002302826324012131f, 0.0002100782439811155f, 0.0001748319627949968f, 0.0001327334757661447f, 9.193058212986216e-005f, 5.808438800158911e-005f, 3.34794785885606e-005f, 1.760426494001877e-005f, 8.444558261544444e-006f, 3.695352233989979e-006f,
8.444558261544444e-006f, 1.929736572492402e-005f, 4.022897701361217e-005f, 7.650675252079964e-005f, 0.0001327334903180599f, 0.0002100782585330308f, 0.0003033203829545528f, 0.0003995231236331165f, 0.0004800673632416874f, 0.0005262381164357066f, 0.0005262381164357066f, 0.0004800673632416874f, 0.0003995231236331165f, 0.0003033203829545528f, 0.0002100782585330308f, 0.0001327334903180599f, 7.650675252079964e-005f, 4.022897701361217e-005f, 1.929736572492402e-005f, 8.444558261544444e-006f, 8.444558261544444e-006f, 1.929736572492402e-005f, 4.022897701361217e-005f, 7.650675252079964e-005f, 0.0001327334903180599f, 0.0002100782585330308f, 0.0003033203829545528f, 0.0003995231236331165f, 0.0004800673632416874f, 0.0005262381164357066f, 0.0005262381164357066f, 0.0004800673632416874f, 0.0003995231236331165f, 0.0003033203829545528f, 0.0002100782585330308f, 0.0001327334903180599f, 7.650675252079964e-005f, 4.022897701361217e-005f, 1.929736572492402e-005f, 8.444558261544444e-006f,
1.760426494001877e-005f, 4.022897701361217e-005f, 8.386484114453197e-005f, 0.0001594926579855382f, 0.0002767078403849155f, 0.0004379475140012801f, 0.0006323281559161842f, 0.0008328808471560478f, 0.001000790391117334f, 0.001097041997127235f, 0.001097041997127235f, 0.001000790391117334f, 0.0008328808471560478f, 0.0006323281559161842f, 0.0004379475140012801f, 0.0002767078403849155f, 0.0001594926579855382f, 8.386484114453197e-005f, 4.022897701361217e-005f, 1.760426494001877e-005f, 1.760426494001877e-005f, 4.022897701361217e-005f, 8.386484114453197e-005f, 0.0001594926579855382f, 0.0002767078403849155f, 0.0004379475140012801f, 0.0006323281559161842f, 0.0008328808471560478f, 0.001000790391117334f, 0.001097041997127235f, 0.001097041997127235f, 0.001000790391117334f, 0.0008328808471560478f, 0.0006323281559161842f, 0.0004379475140012801f, 0.0002767078403849155f, 0.0001594926579855382f, 8.386484114453197e-005f, 4.022897701361217e-005f, 1.760426494001877e-005f,
@ -674,10 +692,10 @@ namespace cv { namespace gpu { namespace surf
1.760426494001877e-005f, 4.022897701361217e-005f, 8.386484114453197e-005f, 0.0001594926579855382f, 0.0002767078403849155f, 0.0004379475140012801f, 0.0006323281559161842f, 0.0008328808471560478f, 0.001000790391117334f, 0.001097041997127235f, 0.001097041997127235f, 0.001000790391117334f, 0.0008328808471560478f, 0.0006323281559161842f, 0.0004379475140012801f, 0.0002767078403849155f, 0.0001594926579855382f, 8.386484114453197e-005f, 4.022897701361217e-005f, 1.760426494001877e-005f, 1.760426494001877e-005f, 4.022897701361217e-005f, 8.386484114453197e-005f, 0.0001594926579855382f, 0.0002767078403849155f, 0.0004379475140012801f, 0.0006323281559161842f, 0.0008328808471560478f, 0.001000790391117334f, 0.001097041997127235f, 0.001097041997127235f, 0.001000790391117334f, 0.0008328808471560478f, 0.0006323281559161842f, 0.0004379475140012801f, 0.0002767078403849155f, 0.0001594926579855382f, 8.386484114453197e-005f, 4.022897701361217e-005f, 1.760426494001877e-005f,
8.444558261544444e-006f, 1.929736572492402e-005f, 4.022897701361217e-005f, 7.650675252079964e-005f, 0.0001327334903180599f, 0.0002100782585330308f, 0.0003033203829545528f, 0.0003995231236331165f, 0.0004800673632416874f, 0.0005262381164357066f, 0.0005262381164357066f, 0.0004800673632416874f, 0.0003995231236331165f, 0.0003033203829545528f, 0.0002100782585330308f, 0.0001327334903180599f, 7.650675252079964e-005f, 4.022897701361217e-005f, 1.929736572492402e-005f, 8.444558261544444e-006f, 8.444558261544444e-006f, 1.929736572492402e-005f, 4.022897701361217e-005f, 7.650675252079964e-005f, 0.0001327334903180599f, 0.0002100782585330308f, 0.0003033203829545528f, 0.0003995231236331165f, 0.0004800673632416874f, 0.0005262381164357066f, 0.0005262381164357066f, 0.0004800673632416874f, 0.0003995231236331165f, 0.0003033203829545528f, 0.0002100782585330308f, 0.0001327334903180599f, 7.650675252079964e-005f, 4.022897701361217e-005f, 1.929736572492402e-005f, 8.444558261544444e-006f,
3.695352233989979e-006f, 8.444558261544444e-006f, 1.760426494001877e-005f, 3.34794785885606e-005f, 5.808438800158911e-005f, 9.193058212986216e-005f, 0.0001327334757661447f, 0.0001748319627949968f, 0.0002100782439811155f, 0.0002302826324012131f, 0.0002302826324012131f, 0.0002100782439811155f, 0.0001748319627949968f, 0.0001327334757661447f, 9.193058212986216e-005f, 5.808438800158911e-005f, 3.34794785885606e-005f, 1.760426494001877e-005f, 8.444558261544444e-006f, 3.695352233989979e-006f 3.695352233989979e-006f, 8.444558261544444e-006f, 1.760426494001877e-005f, 3.34794785885606e-005f, 5.808438800158911e-005f, 9.193058212986216e-005f, 0.0001327334757661447f, 0.0001748319627949968f, 0.0002100782439811155f, 0.0002302826324012131f, 0.0002302826324012131f, 0.0002100782439811155f, 0.0001748319627949968f, 0.0001327334757661447f, 9.193058212986216e-005f, 5.808438800158911e-005f, 3.34794785885606e-005f, 1.760426494001877e-005f, 8.444558261544444e-006f, 3.695352233989979e-006f
}; };
struct WinReader struct WinReader
{ {
typedef uchar elem_type; typedef uchar elem_type;
__device__ __forceinline__ WinReader(float centerX_, float centerY_, float win_offset_, float cos_dir_, float sin_dir_) : __device__ __forceinline__ WinReader(float centerX_, float centerY_, float win_offset_, float cos_dir_, float sin_dir_) :
@ -698,11 +716,11 @@ namespace cv { namespace gpu { namespace surf
float win_offset; float win_offset;
float cos_dir; float cos_dir;
float sin_dir; float sin_dir;
}; };
__device__ void calc_dx_dy(float s_dx_bin[25], float s_dy_bin[25], __device__ void calc_dx_dy(float s_dx_bin[25], float s_dy_bin[25],
const float* featureX, const float* featureY, const float* featureSize, const float* featureDir) const float* featureX, const float* featureY, const float* featureSize, const float* featureDir)
{ {
__shared__ float s_PATCH[6][6]; __shared__ float s_PATCH[6][6];
const float centerX = featureX[blockIdx.x]; const float centerX = featureX[blockIdx.x];
@ -752,10 +770,10 @@ namespace cv { namespace gpu { namespace surf
s_dx_bin[tid] = vx; s_dx_bin[tid] = vx;
s_dy_bin[tid] = vy; s_dy_bin[tid] = vy;
} }
} }
__device__ void reduce_sum25(volatile float* sdata1, volatile float* sdata2, volatile float* sdata3, volatile float* sdata4, int tid) __device__ void reduce_sum25(volatile float* sdata1, volatile float* sdata2, volatile float* sdata3, volatile float* sdata4, int tid)
{ {
// first step is to reduce from 25 to 16 // first step is to reduce from 25 to 16
if (tid < 9) // use 9 threads if (tid < 9) // use 9 threads
{ {
@ -788,10 +806,10 @@ namespace cv { namespace gpu { namespace surf
sdata4[tid] += sdata4[tid + 2]; sdata4[tid] += sdata4[tid + 2];
sdata4[tid] += sdata4[tid + 1]; sdata4[tid] += sdata4[tid + 1];
} }
} }
__global__ void compute_descriptors64(PtrStepf descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir) __global__ void compute_descriptors64(PtrStepf descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir)
{ {
// 2 floats (dx,dy) for each thread (5x5 sample points in each sub-region) // 2 floats (dx,dy) for each thread (5x5 sample points in each sub-region)
__shared__ float sdx[25]; __shared__ float sdx[25];
__shared__ float sdy[25]; __shared__ float sdy[25];
@ -805,8 +823,8 @@ namespace cv { namespace gpu { namespace surf
if (tid < 25) if (tid < 25)
{ {
sdxabs[tid] = fabs(sdx[tid]); // |dx| array sdxabs[tid] = ::fabs(sdx[tid]); // |dx| array
sdyabs[tid] = fabs(sdy[tid]); // |dy| array sdyabs[tid] = ::fabs(sdy[tid]); // |dy| array
__syncthreads(); __syncthreads();
reduce_sum25(sdx, sdy, sdxabs, sdyabs, tid); reduce_sum25(sdx, sdy, sdxabs, sdyabs, tid);
@ -823,10 +841,10 @@ namespace cv { namespace gpu { namespace surf
descriptors_block[3] = sdyabs[0]; descriptors_block[3] = sdyabs[0];
} }
} }
} }
__global__ void compute_descriptors128(PtrStepf descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir) __global__ void compute_descriptors128(PtrStepf descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir)
{ {
// 2 floats (dx,dy) for each thread (5x5 sample points in each sub-region) // 2 floats (dx,dy) for each thread (5x5 sample points in each sub-region)
__shared__ float sdx[25]; __shared__ float sdx[25];
__shared__ float sdy[25]; __shared__ float sdy[25];
@ -847,7 +865,7 @@ namespace cv { namespace gpu { namespace surf
if (sdy[tid] >= 0) if (sdy[tid] >= 0)
{ {
sd1[tid] = sdx[tid]; sd1[tid] = sdx[tid];
sdabs1[tid] = fabs(sdx[tid]); sdabs1[tid] = ::fabs(sdx[tid]);
sd2[tid] = 0; sd2[tid] = 0;
sdabs2[tid] = 0; sdabs2[tid] = 0;
} }
@ -856,7 +874,7 @@ namespace cv { namespace gpu { namespace surf
sd1[tid] = 0; sd1[tid] = 0;
sdabs1[tid] = 0; sdabs1[tid] = 0;
sd2[tid] = sdx[tid]; sd2[tid] = sdx[tid];
sdabs2[tid] = fabs(sdx[tid]); sdabs2[tid] = ::fabs(sdx[tid]);
} }
__syncthreads(); __syncthreads();
@ -878,7 +896,7 @@ namespace cv { namespace gpu { namespace surf
if (sdx[tid] >= 0) if (sdx[tid] >= 0)
{ {
sd1[tid] = sdy[tid]; sd1[tid] = sdy[tid];
sdabs1[tid] = fabs(sdy[tid]); sdabs1[tid] = ::fabs(sdy[tid]);
sd2[tid] = 0; sd2[tid] = 0;
sdabs2[tid] = 0; sdabs2[tid] = 0;
} }
@ -887,7 +905,7 @@ namespace cv { namespace gpu { namespace surf
sd1[tid] = 0; sd1[tid] = 0;
sdabs1[tid] = 0; sdabs1[tid] = 0;
sd2[tid] = sdy[tid]; sd2[tid] = sdy[tid];
sdabs2[tid] = fabs(sdy[tid]); sdabs2[tid] = ::fabs(sdy[tid]);
} }
__syncthreads(); __syncthreads();
@ -903,10 +921,10 @@ namespace cv { namespace gpu { namespace surf
descriptors_block[7] = sdabs2[0]; descriptors_block[7] = sdabs2[0];
} }
} }
} }
template <int BLOCK_DIM_X> __global__ void normalize_descriptors(PtrStepf descriptors) template <int BLOCK_DIM_X> __global__ void normalize_descriptors(PtrStepf descriptors)
{ {
// no need for thread ID // no need for thread ID
float* descriptor_base = descriptors.ptr(blockIdx.x); float* descriptor_base = descriptors.ptr(blockIdx.x);
@ -946,11 +964,11 @@ namespace cv { namespace gpu { namespace surf
// normalize and store in output // normalize and store in output
descriptor_base[threadIdx.x] = lookup / len; descriptor_base[threadIdx.x] = lookup / len;
} }
void compute_descriptors_gpu(const DevMem2Df& descriptors, void compute_descriptors_gpu(const DevMem2Df& descriptors,
const float* featureX, const float* featureY, const float* featureSize, const float* featureDir, int nFeatures) const float* featureX, const float* featureY, const float* featureSize, const float* featureDir, int nFeatures)
{ {
// compute unnormalized descriptors, then normalize them - odd indexing since grid must be 2D // compute unnormalized descriptors, then normalize them - odd indexing since grid must be 2D
if (descriptors.cols == 64) if (descriptors.cols == 64)
@ -977,5 +995,8 @@ namespace cv { namespace gpu { namespace surf
cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaDeviceSynchronize() );
} }
} }
}}}
} // namespace surf
END_OPENCV_DEVICE_NAMESPACE
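The SURF kernels above, and every file that follows, replace the hand-written namespace cv { namespace gpu { namespace device { ... }}} blocks and the device:: / cv::gpu::device:: qualifiers with the BEGIN_OPENCV_DEVICE_NAMESPACE, END_OPENCV_DEVICE_NAMESPACE, OPENCV_DEVICE_NAMESPACE and OPENCV_DEVICE_NAMESPACE_ macros. Their definitions are not part of this excerpt; judging from how they are substituted throughout the diff, they presumably expand roughly as follows (a sketch, not the verbatim header):

    // Presumed expansions, inferred from the substitutions made in this commit
    #define OPENCV_DEVICE_NAMESPACE        ::cv::gpu::device
    #define OPENCV_DEVICE_NAMESPACE_       ::cv::gpu::device::
    #define BEGIN_OPENCV_DEVICE_NAMESPACE  namespace cv { namespace gpu { namespace device {
    #define END_OPENCV_DEVICE_NAMESPACE    }}}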


@ -71,16 +71,20 @@ cv::gpu::Stream::operator bool() const { throw_nogpu(); return false; }
#include "opencv2/gpu/stream_accessor.hpp" #include "opencv2/gpu/stream_accessor.hpp"
namespace cv { namespace gpu { namespace device { BEGIN_OPENCV_DEVICE_NAMESPACE
void copy_to_with_mask(const DevMem2Db& src, DevMem2Db dst, int depth, const DevMem2Db& mask, int channels, const cudaStream_t & stream = 0);
template <typename T> void copy_to_with_mask(const DevMem2Db& src, DevMem2Db dst, int depth, const DevMem2Db& mask, int channels, const cudaStream_t & stream = 0);
void set_to_gpu(const DevMem2Db& mat, const T* scalar, int channels, cudaStream_t stream);
template <typename T>
void set_to_gpu(const DevMem2Db& mat, const T* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);
void convert_gpu(const DevMem2Db& src, int sdepth, const DevMem2Db& dst, int ddepth, double alpha, double beta, cudaStream_t stream = 0); template <typename T>
}}} void set_to_gpu(const DevMem2Db& mat, const T* scalar, int channels, cudaStream_t stream);
template <typename T>
void set_to_gpu(const DevMem2Db& mat, const T* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);
void convert_gpu(const DevMem2Db& src, int sdepth, const DevMem2Db& dst, int ddepth, double alpha, double beta, cudaStream_t stream = 0);
END_OPENCV_DEVICE_NAMESPACE
using namespace OPENCV_DEVICE_NAMESPACE;
struct Stream::Impl struct Stream::Impl
{ {
@ -101,14 +105,14 @@ namespace
void kernelSet(GpuMat& src, const Scalar& s, cudaStream_t stream) void kernelSet(GpuMat& src, const Scalar& s, cudaStream_t stream)
{ {
Scalar_<T> sf = s; Scalar_<T> sf = s;
device::set_to_gpu(src, sf.val, src.channels(), stream); set_to_gpu(src, sf.val, src.channels(), stream);
} }
template <typename T> template <typename T>
void kernelSetMask(GpuMat& src, const Scalar& s, const GpuMat& mask, cudaStream_t stream) void kernelSetMask(GpuMat& src, const Scalar& s, const GpuMat& mask, cudaStream_t stream)
{ {
Scalar_<T> sf = s; Scalar_<T> sf = s;
device::set_to_gpu(src, sf.val, mask, src.channels(), stream); set_to_gpu(src, sf.val, mask, src.channels(), stream);
} }
} }
@ -255,7 +259,7 @@ void cv::gpu::Stream::enqueueConvert(const GpuMat& src, GpuMat& dst, int rtype,
psrc = &(temp = src); psrc = &(temp = src);
dst.create( src.size(), rtype ); dst.create( src.size(), rtype );
device::convert_gpu(psrc->reshape(1), sdepth, dst.reshape(1), ddepth, alpha, beta, impl->stream); convert_gpu(psrc->reshape(1), sdepth, dst.reshape(1), ddepth, alpha, beta, impl->stream);
} }
cv::gpu::Stream::operator bool() const cv::gpu::Stream::operator bool() const
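With the declarations folded into the device-namespace macros, Stream::enqueueConvert simply forwards to convert_gpu on the wrapped CUDA stream. A hypothetical call sketch, assuming a CUDA device is available (the matrices and sizes are illustrative, not from the commit):

    cv::gpu::GpuMat d_src8u(cv::Mat::ones(480, 640, CV_8UC1)), d_dst32f;
    cv::gpu::Stream stream;
    stream.enqueueConvert(d_src8u, d_dst32f, CV_32F, 1.0 / 255.0, 0.0);  // asynchronous convert_gpu on this stream
    stream.waitForCompletion();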


@ -123,18 +123,19 @@ namespace
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
// add // add
namespace cv { namespace gpu { namespace device BEGIN_OPENCV_DEVICE_NAMESPACE
{
template <typename T, typename D>
void add_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);
template <typename T, typename D> template <typename T, typename D>
void add_gpu(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream); void add_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);
}}}
template <typename T, typename D>
void add_gpu(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);
END_OPENCV_DEVICE_NAMESPACE
void cv::gpu::add(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, int dtype, Stream& s) void cv::gpu::add(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, int dtype, Stream& s)
{ {
using namespace cv::gpu::device; using namespace OPENCV_DEVICE_NAMESPACE;
typedef void (*func_t)(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream); typedef void (*func_t)(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);
@ -173,7 +174,7 @@ void cv::gpu::add(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Gpu
void cv::gpu::add(const GpuMat& src, const Scalar& sc, GpuMat& dst, const GpuMat& mask, int dtype, Stream& s) void cv::gpu::add(const GpuMat& src, const Scalar& sc, GpuMat& dst, const GpuMat& mask, int dtype, Stream& s)
{ {
using namespace cv::gpu::device; using namespace OPENCV_DEVICE_NAMESPACE;
typedef void (*func_t)(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream); typedef void (*func_t)(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);
@ -235,18 +236,19 @@ void cv::gpu::add(const GpuMat& src, const Scalar& sc, GpuMat& dst, const GpuMat
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
// subtract // subtract
namespace cv { namespace gpu { namespace device BEGIN_OPENCV_DEVICE_NAMESPACE
{
template <typename T, typename D>
void subtract_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);
template <typename T, typename D> template <typename T, typename D>
void subtract_gpu(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream); void subtract_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);
}}}
template <typename T, typename D>
void subtract_gpu(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);
END_OPENCV_DEVICE_NAMESPACE
void cv::gpu::subtract(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, int dtype, Stream& s) void cv::gpu::subtract(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, int dtype, Stream& s)
{ {
using namespace cv::gpu::device; using namespace OPENCV_DEVICE_NAMESPACE;
typedef void (*func_t)(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream); typedef void (*func_t)(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);
@ -285,7 +287,7 @@ void cv::gpu::subtract(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cons
void cv::gpu::subtract(const GpuMat& src, const Scalar& sc, GpuMat& dst, const GpuMat& mask, int dtype, Stream& s) void cv::gpu::subtract(const GpuMat& src, const Scalar& sc, GpuMat& dst, const GpuMat& mask, int dtype, Stream& s)
{ {
using namespace cv::gpu::device; using namespace OPENCV_DEVICE_NAMESPACE;
typedef void (*func_t)(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream); typedef void (*func_t)(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);
@ -347,21 +349,22 @@ void cv::gpu::subtract(const GpuMat& src, const Scalar& sc, GpuMat& dst, const G
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
// multiply // multiply
namespace cv { namespace gpu { namespace device BEGIN_OPENCV_DEVICE_NAMESPACE
{
void multiply_gpu(const DevMem2D_<uchar4>& src1, const DevMem2Df& src2, const DevMem2D_<uchar4>& dst, cudaStream_t stream);
void multiply_gpu(const DevMem2D_<short4>& src1, const DevMem2Df& src2, const DevMem2D_<short4>& dst, cudaStream_t stream);
template <typename T, typename D> void multiply_gpu(const DevMem2D_<uchar4>& src1, const DevMem2Df& src2, const DevMem2D_<uchar4>& dst, cudaStream_t stream);
void multiply_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream); void multiply_gpu(const DevMem2D_<short4>& src1, const DevMem2Df& src2, const DevMem2D_<short4>& dst, cudaStream_t stream);
template <typename T, typename D> template <typename T, typename D>
void multiply_gpu(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream); void multiply_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);
}}}
template <typename T, typename D>
void multiply_gpu(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);
END_OPENCV_DEVICE_NAMESPACE
void cv::gpu::multiply(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, double scale, int dtype, Stream& s) void cv::gpu::multiply(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, double scale, int dtype, Stream& s)
{ {
using namespace cv::gpu::device; using namespace OPENCV_DEVICE_NAMESPACE;
typedef void (*func_t)(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream); typedef void (*func_t)(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);
@ -419,7 +422,7 @@ void cv::gpu::multiply(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, doub
void cv::gpu::multiply(const GpuMat& src, const Scalar& sc, GpuMat& dst, double scale, int dtype, Stream& s) void cv::gpu::multiply(const GpuMat& src, const Scalar& sc, GpuMat& dst, double scale, int dtype, Stream& s)
{ {
using namespace cv::gpu::device; using namespace OPENCV_DEVICE_NAMESPACE;
typedef void (*func_t)(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream); typedef void (*func_t)(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);
@ -469,24 +472,25 @@ void cv::gpu::multiply(const GpuMat& src, const Scalar& sc, GpuMat& dst, double
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
// divide // divide
namespace cv { namespace gpu { namespace device BEGIN_OPENCV_DEVICE_NAMESPACE
{
void divide_gpu(const DevMem2D_<uchar4>& src1, const DevMem2Df& src2, const DevMem2D_<uchar4>& dst, cudaStream_t stream);
void divide_gpu(const DevMem2D_<short4>& src1, const DevMem2Df& src2, const DevMem2D_<short4>& dst, cudaStream_t stream);
template <typename T, typename D> void divide_gpu(const DevMem2D_<uchar4>& src1, const DevMem2Df& src2, const DevMem2D_<uchar4>& dst, cudaStream_t stream);
void divide_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream); void divide_gpu(const DevMem2D_<short4>& src1, const DevMem2Df& src2, const DevMem2D_<short4>& dst, cudaStream_t stream);
template <typename T, typename D> template <typename T, typename D>
void divide_gpu(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream); void divide_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);
template <typename T, typename D> template <typename T, typename D>
void divide_gpu(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream); void divide_gpu(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);
}}}
template <typename T, typename D>
void divide_gpu(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
END_OPENCV_DEVICE_NAMESPACE
void cv::gpu::divide(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, double scale, int dtype, Stream& s) void cv::gpu::divide(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, double scale, int dtype, Stream& s)
{ {
using namespace cv::gpu::device; using namespace OPENCV_DEVICE_NAMESPACE;
typedef void (*func_t)(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream); typedef void (*func_t)(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);
@ -544,7 +548,7 @@ void cv::gpu::divide(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, double
void cv::gpu::divide(const GpuMat& src, const Scalar& sc, GpuMat& dst, double scale, int dtype, Stream& s) void cv::gpu::divide(const GpuMat& src, const Scalar& sc, GpuMat& dst, double scale, int dtype, Stream& s)
{ {
using namespace cv::gpu::device; using namespace OPENCV_DEVICE_NAMESPACE;
typedef void (*func_t)(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream); typedef void (*func_t)(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);
@ -593,7 +597,7 @@ void cv::gpu::divide(const GpuMat& src, const Scalar& sc, GpuMat& dst, double sc
void cv::gpu::divide(double scale, const GpuMat& src, GpuMat& dst, int dtype, Stream& s) void cv::gpu::divide(double scale, const GpuMat& src, GpuMat& dst, int dtype, Stream& s)
{ {
using namespace cv::gpu::device; using namespace OPENCV_DEVICE_NAMESPACE;
typedef void (*func_t)(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream); typedef void (*func_t)(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
@ -626,18 +630,19 @@ void cv::gpu::divide(double scale, const GpuMat& src, GpuMat& dst, int dtype, St
////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////
// absdiff // absdiff
namespace cv { namespace gpu { namespace device BEGIN_OPENCV_DEVICE_NAMESPACE
{
template <typename T>
void absdiff_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
template <typename T> template <typename T>
void absdiff_gpu(const DevMem2Db& src1, double val, const DevMem2Db& dst, cudaStream_t stream); void absdiff_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
}}}
template <typename T>
void absdiff_gpu(const DevMem2Db& src1, double val, const DevMem2Db& dst, cudaStream_t stream);
END_OPENCV_DEVICE_NAMESPACE
void cv::gpu::absdiff(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& s) void cv::gpu::absdiff(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& s)
{ {
using namespace cv::gpu::device; using namespace OPENCV_DEVICE_NAMESPACE;
typedef void (*func_t)(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream); typedef void (*func_t)(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
@ -709,7 +714,7 @@ void cv::gpu::absdiff(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Strea
void cv::gpu::absdiff(const GpuMat& src1, const Scalar& src2, GpuMat& dst, Stream& s) void cv::gpu::absdiff(const GpuMat& src1, const Scalar& src2, GpuMat& dst, Stream& s)
{ {
using namespace cv::gpu::device; using namespace OPENCV_DEVICE_NAMESPACE;
typedef void (*func_t)(const DevMem2Db& src1, double val, const DevMem2Db& dst, cudaStream_t stream); typedef void (*func_t)(const DevMem2Db& src1, double val, const DevMem2Db& dst, cudaStream_t stream);
@ -753,17 +758,18 @@ void cv::gpu::absdiff(const GpuMat& src1, const Scalar& src2, GpuMat& dst, Strea
////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////
// Comparison of two matrixes // Comparison of two matrixes
namespace cv { namespace gpu { namespace device BEGIN_OPENCV_DEVICE_NAMESPACE
{
template <typename T> void compare_eq(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream); template <typename T> void compare_eq(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
template <typename T> void compare_ne(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream); template <typename T> void compare_ne(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
template <typename T> void compare_lt(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream); template <typename T> void compare_lt(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
template <typename T> void compare_le(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream); template <typename T> void compare_le(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
}}}
END_OPENCV_DEVICE_NAMESPACE
void cv::gpu::compare(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, int cmpop, Stream& stream) void cv::gpu::compare(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, int cmpop, Stream& stream)
{ {
using namespace cv::gpu::device; using namespace OPENCV_DEVICE_NAMESPACE;
typedef void (*func_t)(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream); typedef void (*func_t)(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
@ -829,13 +835,14 @@ void cv::gpu::compare(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, int c
////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////
// Unary bitwise logical operations // Unary bitwise logical operations
namespace cv { namespace gpu { namespace device BEGIN_OPENCV_DEVICE_NAMESPACE
{
void bitwiseNotCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStepb src, PtrStepb dst, cudaStream_t stream);
template <typename T> void bitwiseNotCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStepb src, PtrStepb dst, cudaStream_t stream);
void bitwiseMaskNotCaller(int rows, int cols, int cn, const PtrStepb src, const PtrStepb mask, PtrStepb dst, cudaStream_t stream);
}}} template <typename T>
void bitwiseMaskNotCaller(int rows, int cols, int cn, const PtrStepb src, const PtrStepb mask, PtrStepb dst, cudaStream_t stream);
END_OPENCV_DEVICE_NAMESPACE
namespace namespace
{ {
@ -843,20 +850,23 @@ namespace
{ {
dst.create(src.size(), src.type()); dst.create(src.size(), src.type());
cv::gpu::device::bitwiseNotCaller(src.rows, src.cols, src.elemSize1(), OPENCV_DEVICE_NAMESPACE_ bitwiseNotCaller(src.rows, src.cols, src.elemSize1(), dst.channels(), src, dst, stream);
dst.channels(), src, dst, stream);
} }
void bitwiseNotCaller(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream) void bitwiseNotCaller(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream)
{ {
using namespace cv::gpu; using namespace OPENCV_DEVICE_NAMESPACE;
typedef void (*Caller)(int, int, int, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t); typedef void (*Caller)(int, int, int, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);
static Caller callers[] = {device::bitwiseMaskNotCaller<unsigned char>, device::bitwiseMaskNotCaller<unsigned char>,
device::bitwiseMaskNotCaller<unsigned short>, device::bitwiseMaskNotCaller<unsigned short>, static Caller callers[] =
device::bitwiseMaskNotCaller<unsigned int>, device::bitwiseMaskNotCaller<unsigned int>, {
device::bitwiseMaskNotCaller<unsigned int>}; bitwiseMaskNotCaller<unsigned char>, bitwiseMaskNotCaller<unsigned char>,
bitwiseMaskNotCaller<unsigned short>, bitwiseMaskNotCaller<unsigned short>,
bitwiseMaskNotCaller<unsigned int>, bitwiseMaskNotCaller<unsigned int>,
bitwiseMaskNotCaller<unsigned int>
};
CV_Assert(mask.type() == CV_8U && mask.size() == src.size()); CV_Assert(mask.type() == CV_8U && mask.size() == src.size());
dst.create(src.size(), src.type()); dst.create(src.size(), src.type());
@ -874,33 +884,33 @@ namespace
void cv::gpu::bitwise_not(const GpuMat& src, GpuMat& dst, const GpuMat& mask, Stream& stream) void cv::gpu::bitwise_not(const GpuMat& src, GpuMat& dst, const GpuMat& mask, Stream& stream)
{ {
if (mask.empty()) if (mask.empty())
::bitwiseNotCaller(src, dst, StreamAccessor::getStream(stream)); bitwiseNotCaller(src, dst, StreamAccessor::getStream(stream));
else else
::bitwiseNotCaller(src, dst, mask, StreamAccessor::getStream(stream)); bitwiseNotCaller(src, dst, mask, StreamAccessor::getStream(stream));
} }
////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////
// Binary bitwise logical operations // Binary bitwise logical operations
namespace cv { namespace gpu { namespace device BEGIN_OPENCV_DEVICE_NAMESPACE
{
void bitwiseOrCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStepb src1, const PtrStepb src2, PtrStepb dst, cudaStream_t stream);
template <typename T> void bitwiseOrCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStepb src1, const PtrStepb src2, PtrStepb dst, cudaStream_t stream);
void bitwiseMaskOrCaller(int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2, const PtrStepb mask, PtrStepb dst, cudaStream_t stream);
void bitwiseAndCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStepb src1, const PtrStepb src2, PtrStepb dst, cudaStream_t stream); template <typename T>
void bitwiseMaskOrCaller(int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2, const PtrStepb mask, PtrStepb dst, cudaStream_t stream);
template <typename T> void bitwiseAndCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStepb src1, const PtrStepb src2, PtrStepb dst, cudaStream_t stream);
void bitwiseMaskAndCaller(int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2, const PtrStepb mask, PtrStepb dst, cudaStream_t stream);
void bitwiseXorCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStepb src1, const PtrStepb src2, PtrStepb dst, cudaStream_t stream); template <typename T>
void bitwiseMaskAndCaller(int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2, const PtrStepb mask, PtrStepb dst, cudaStream_t stream);
template <typename T> void bitwiseXorCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStepb src1, const PtrStepb src2, PtrStepb dst, cudaStream_t stream);
void bitwiseMaskXorCaller(int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2, const PtrStepb mask, PtrStepb dst, cudaStream_t stream);
}}}
template <typename T>
void bitwiseMaskXorCaller(int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2, const PtrStepb mask, PtrStepb dst, cudaStream_t stream);
END_OPENCV_DEVICE_NAMESPACE
namespace namespace
{ {
@ -909,20 +919,22 @@ namespace
CV_Assert(src1.size() == src2.size() && src1.type() == src2.type()); CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
dst.create(src1.size(), src1.type()); dst.create(src1.size(), src1.type());
cv::gpu::device::bitwiseOrCaller(dst.rows, dst.cols, dst.elemSize1(), OPENCV_DEVICE_NAMESPACE_ bitwiseOrCaller(dst.rows, dst.cols, dst.elemSize1(), dst.channels(), src1, src2, dst, stream);
dst.channels(), src1, src2, dst, stream);
} }
void bitwiseOrCaller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, cudaStream_t stream) void bitwiseOrCaller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, cudaStream_t stream)
{ {
using namespace cv::gpu; using namespace OPENCV_DEVICE_NAMESPACE;
typedef void (*Caller)(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t); typedef void (*Caller)(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);
static Caller callers[] = {device::bitwiseMaskOrCaller<unsigned char>, device::bitwiseMaskOrCaller<unsigned char>,
device::bitwiseMaskOrCaller<unsigned short>, device::bitwiseMaskOrCaller<unsigned short>, static Caller callers[] =
device::bitwiseMaskOrCaller<unsigned int>, device::bitwiseMaskOrCaller<unsigned int>, {
device::bitwiseMaskOrCaller<unsigned int>}; bitwiseMaskOrCaller<unsigned char>, bitwiseMaskOrCaller<unsigned char>,
bitwiseMaskOrCaller<unsigned short>, bitwiseMaskOrCaller<unsigned short>,
bitwiseMaskOrCaller<unsigned int>, bitwiseMaskOrCaller<unsigned int>,
bitwiseMaskOrCaller<unsigned int>
};
CV_Assert(src1.size() == src2.size() && src1.type() == src2.type()); CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
dst.create(src1.size(), src1.type()); dst.create(src1.size(), src1.type());
@ -940,20 +952,23 @@ namespace
CV_Assert(src1.size() == src2.size() && src1.type() == src2.type()); CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
dst.create(src1.size(), src1.type()); dst.create(src1.size(), src1.type());
cv::gpu::device::bitwiseAndCaller(dst.rows, dst.cols, dst.elemSize1(), OPENCV_DEVICE_NAMESPACE_ bitwiseAndCaller(dst.rows, dst.cols, dst.elemSize1(), dst.channels(), src1, src2, dst, stream);
dst.channels(), src1, src2, dst, stream);
} }
void bitwiseAndCaller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, cudaStream_t stream) void bitwiseAndCaller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, cudaStream_t stream)
{ {
using namespace cv::gpu; using namespace OPENCV_DEVICE_NAMESPACE;
typedef void (*Caller)(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t); typedef void (*Caller)(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);
static Caller callers[] = {device::bitwiseMaskAndCaller<unsigned char>, device::bitwiseMaskAndCaller<unsigned char>,
device::bitwiseMaskAndCaller<unsigned short>, device::bitwiseMaskAndCaller<unsigned short>, static Caller callers[] =
device::bitwiseMaskAndCaller<unsigned int>, device::bitwiseMaskAndCaller<unsigned int>, {
device::bitwiseMaskAndCaller<unsigned int>}; bitwiseMaskAndCaller<unsigned char>, bitwiseMaskAndCaller<unsigned char>,
bitwiseMaskAndCaller<unsigned short>, bitwiseMaskAndCaller<unsigned short>,
bitwiseMaskAndCaller<unsigned int>, bitwiseMaskAndCaller<unsigned int>,
bitwiseMaskAndCaller<unsigned int>
};
CV_Assert(src1.size() == src2.size() && src1.type() == src2.type()); CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
dst.create(src1.size(), src1.type()); dst.create(src1.size(), src1.type());
@ -971,20 +986,23 @@ namespace
CV_Assert(src1.size() == src2.size() && src1.type() == src2.type()); CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
dst.create(src1.size(), src1.type()); dst.create(src1.size(), src1.type());
cv::gpu::device::bitwiseXorCaller(dst.rows, dst.cols, dst.elemSize1(), OPENCV_DEVICE_NAMESPACE_ bitwiseXorCaller(dst.rows, dst.cols, dst.elemSize1(), dst.channels(), src1, src2, dst, stream);
dst.channels(), src1, src2, dst, stream);
} }
void bitwiseXorCaller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, cudaStream_t stream) void bitwiseXorCaller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, cudaStream_t stream)
{ {
using namespace cv::gpu; using namespace OPENCV_DEVICE_NAMESPACE;
typedef void (*Caller)(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t); typedef void (*Caller)(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);
static Caller callers[] = {device::bitwiseMaskXorCaller<unsigned char>, device::bitwiseMaskXorCaller<unsigned char>,
device::bitwiseMaskXorCaller<unsigned short>, device::bitwiseMaskXorCaller<unsigned short>, static Caller callers[] =
device::bitwiseMaskXorCaller<unsigned int>, device::bitwiseMaskXorCaller<unsigned int>, {
device::bitwiseMaskXorCaller<unsigned int>}; bitwiseMaskXorCaller<unsigned char>, bitwiseMaskXorCaller<unsigned char>,
bitwiseMaskXorCaller<unsigned short>, bitwiseMaskXorCaller<unsigned short>,
bitwiseMaskXorCaller<unsigned int>, bitwiseMaskXorCaller<unsigned int>,
bitwiseMaskXorCaller<unsigned int>
};
CV_Assert(src1.size() == src2.size() && src1.type() == src2.type()); CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
dst.create(src1.size(), src1.type()); dst.create(src1.size(), src1.type());
@ -1001,47 +1019,48 @@ namespace
void cv::gpu::bitwise_or(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, Stream& stream) void cv::gpu::bitwise_or(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, Stream& stream)
{ {
if (mask.empty()) if (mask.empty())
::bitwiseOrCaller(src1, src2, dst, StreamAccessor::getStream(stream)); bitwiseOrCaller(src1, src2, dst, StreamAccessor::getStream(stream));
else else
::bitwiseOrCaller(src1, src2, dst, mask, StreamAccessor::getStream(stream)); bitwiseOrCaller(src1, src2, dst, mask, StreamAccessor::getStream(stream));
} }
void cv::gpu::bitwise_and(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, Stream& stream) void cv::gpu::bitwise_and(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, Stream& stream)
{ {
if (mask.empty()) if (mask.empty())
::bitwiseAndCaller(src1, src2, dst, StreamAccessor::getStream(stream)); bitwiseAndCaller(src1, src2, dst, StreamAccessor::getStream(stream));
else else
::bitwiseAndCaller(src1, src2, dst, mask, StreamAccessor::getStream(stream)); bitwiseAndCaller(src1, src2, dst, mask, StreamAccessor::getStream(stream));
} }
void cv::gpu::bitwise_xor(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, Stream& stream) void cv::gpu::bitwise_xor(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, Stream& stream)
{ {
if (mask.empty()) if (mask.empty())
::bitwiseXorCaller(src1, src2, dst, StreamAccessor::getStream(stream)); bitwiseXorCaller(src1, src2, dst, StreamAccessor::getStream(stream));
else else
::bitwiseXorCaller(src1, src2, dst, mask, StreamAccessor::getStream(stream)); bitwiseXorCaller(src1, src2, dst, mask, StreamAccessor::getStream(stream));
} }
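Each public wrapper above picks the masked or unmasked caller depending on mask.empty(). A hypothetical usage sketch, assuming the gpu.hpp declarations with their usual default mask argument (the input matrices are illustrative, not from the commit):

    cv::Mat a(480, 640, CV_8UC1, cv::Scalar(0xF0)), b(480, 640, CV_8UC1, cv::Scalar(0x0F));
    cv::gpu::GpuMat d_a(a), d_b(b), d_mask(a > 0), d_dst;
    cv::gpu::bitwise_and(d_a, d_b, d_dst);          // empty default mask -> unmasked bitwiseAndCaller
    cv::gpu::bitwise_or(d_a, d_b, d_dst, d_mask);   // non-empty CV_8U mask -> bitwiseMaskOrCaller<uchar> path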
////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////
// Minimum and maximum operations // Minimum and maximum operations
namespace cv { namespace gpu { namespace device BEGIN_OPENCV_DEVICE_NAMESPACE
{
template <typename T>
void min_gpu(const DevMem2D_<T>& src1, const DevMem2D_<T>& src2, const DevMem2D_<T>& dst, cudaStream_t stream);
template <typename T> template <typename T>
void max_gpu(const DevMem2D_<T>& src1, const DevMem2D_<T>& src2, const DevMem2D_<T>& dst, cudaStream_t stream); void min_gpu(const DevMem2D_<T>& src1, const DevMem2D_<T>& src2, const DevMem2D_<T>& dst, cudaStream_t stream);
template <typename T> template <typename T>
void min_gpu(const DevMem2D_<T>& src1, T src2, const DevMem2D_<T>& dst, cudaStream_t stream); void max_gpu(const DevMem2D_<T>& src1, const DevMem2D_<T>& src2, const DevMem2D_<T>& dst, cudaStream_t stream);
template <typename T> template <typename T>
void max_gpu(const DevMem2D_<T>& src1, T src2, const DevMem2D_<T>& dst, cudaStream_t stream); void min_gpu(const DevMem2D_<T>& src1, T src2, const DevMem2D_<T>& dst, cudaStream_t stream);
}}}
template <typename T>
void max_gpu(const DevMem2D_<T>& src1, T src2, const DevMem2D_<T>& dst, cudaStream_t stream);
END_OPENCV_DEVICE_NAMESPACE
namespace namespace
{ {
@ -1050,14 +1069,14 @@ namespace
{ {
CV_Assert(src1.size() == src2.size() && src1.type() == src2.type()); CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
dst.create(src1.size(), src1.type()); dst.create(src1.size(), src1.type());
device::min_gpu<T>(src1.reshape(1), src2.reshape(1), dst.reshape(1), stream); OPENCV_DEVICE_NAMESPACE_ min_gpu<T>(src1.reshape(1), src2.reshape(1), dst.reshape(1), stream);
} }
template <typename T> template <typename T>
void min_caller(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream) void min_caller(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream)
{ {
dst.create(src1.size(), src1.type()); dst.create(src1.size(), src1.type());
device::min_gpu<T>(src1.reshape(1), saturate_cast<T>(src2), dst.reshape(1), stream); OPENCV_DEVICE_NAMESPACE_ min_gpu<T>(src1.reshape(1), saturate_cast<T>(src2), dst.reshape(1), stream);
} }
template <typename T> template <typename T>
@ -1065,14 +1084,14 @@ namespace
{ {
CV_Assert(src1.size() == src2.size() && src1.type() == src2.type()); CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
dst.create(src1.size(), src1.type()); dst.create(src1.size(), src1.type());
device::max_gpu<T>(src1.reshape(1), src2.reshape(1), dst.reshape(1), stream); OPENCV_DEVICE_NAMESPACE_ max_gpu<T>(src1.reshape(1), src2.reshape(1), dst.reshape(1), stream);
} }
template <typename T> template <typename T>
void max_caller(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream) void max_caller(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream)
{ {
dst.create(src1.size(), src1.type()); dst.create(src1.size(), src1.type());
device::max_gpu<T>(src1.reshape(1), saturate_cast<T>(src2), dst.reshape(1), stream); OPENCV_DEVICE_NAMESPACE_ max_gpu<T>(src1.reshape(1), saturate_cast<T>(src2), dst.reshape(1), stream);
} }
} }
@ -1136,18 +1155,18 @@ void cv::gpu::max(const GpuMat& src1, double src2, GpuMat& dst, Stream& stream)
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
// threshold // threshold
namespace cv { namespace gpu { namespace device BEGIN_OPENCV_DEVICE_NAMESPACE
{
template <typename T> template <typename T>
void threshold_gpu(const DevMem2Db& src, const DevMem2Db& dst, T thresh, T maxVal, int type, void threshold_gpu(const DevMem2Db& src, const DevMem2Db& dst, T thresh, T maxVal, int type, cudaStream_t stream);
cudaStream_t stream);
}}} END_OPENCV_DEVICE_NAMESPACE
namespace namespace
{ {
template <typename T> void threshold_caller(const GpuMat& src, GpuMat& dst, double thresh, double maxVal, int type, cudaStream_t stream) template <typename T> void threshold_caller(const GpuMat& src, GpuMat& dst, double thresh, double maxVal, int type, cudaStream_t stream)
{ {
device::threshold_gpu<T>(src, dst, saturate_cast<T>(thresh), saturate_cast<T>(maxVal), type, stream); OPENCV_DEVICE_NAMESPACE_ threshold_gpu<T>(src, dst, saturate_cast<T>(thresh), saturate_cast<T>(maxVal), type, stream);
} }
} }
@ -1204,24 +1223,27 @@ double cv::gpu::threshold(const GpuMat& src, GpuMat& dst, double thresh, double
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
// pow // pow
namespace cv { namespace gpu { namespace device BEGIN_OPENCV_DEVICE_NAMESPACE
{
template<typename T> template<typename T>
void pow_caller(const DevMem2Db& src, float power, DevMem2Db dst, cudaStream_t stream); void pow_caller(const DevMem2Db& src, float power, DevMem2Db dst, cudaStream_t stream);
}}}
END_OPENCV_DEVICE_NAMESPACE
void cv::gpu::pow(const GpuMat& src, double power, GpuMat& dst, Stream& stream) void cv::gpu::pow(const GpuMat& src, double power, GpuMat& dst, Stream& stream)
{ {
CV_Assert( src.depth() != CV_64F ); using namespace OPENCV_DEVICE_NAMESPACE;
CV_Assert(src.depth() != CV_64F);
dst.create(src.size(), src.type()); dst.create(src.size(), src.type());
typedef void (*caller_t)(const DevMem2Db& src, float power, DevMem2Db dst, cudaStream_t stream); typedef void (*caller_t)(const DevMem2Db& src, float power, DevMem2Db dst, cudaStream_t stream);
static const caller_t callers[] = static const caller_t callers[] =
{ {
device::pow_caller<unsigned char>, device::pow_caller<signed char>, pow_caller<unsigned char>, pow_caller<signed char>,
device::pow_caller<unsigned short>, device::pow_caller<short>, pow_caller<unsigned short>, pow_caller<short>,
device::pow_caller<int>, device::pow_caller<float> pow_caller<int>, pow_caller<float>
}; };
callers[src.depth()](src.reshape(1), (float)power, dst.reshape(1), StreamAccessor::getStream(stream)); callers[src.depth()](src.reshape(1), (float)power, dst.reshape(1), StreamAccessor::getStream(stream));
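The depth-indexed callers table dispatches pow_caller<T> by src.depth(). A hypothetical call sketch, assuming the default stream argument from gpu.hpp (the input matrix is illustrative):

    cv::gpu::GpuMat d_img(cv::Mat::eye(256, 256, CV_32FC1)), d_sq;
    cv::gpu::pow(d_img, 2.0, d_sq);   // goes through pow_caller<float> above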
@ -1230,14 +1252,17 @@ void cv::gpu::pow(const GpuMat& src, double power, GpuMat& dst, Stream& stream)
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
// addWeighted // addWeighted
namespace cv { namespace gpu { namespace device BEGIN_OPENCV_DEVICE_NAMESPACE
{
template <typename T1, typename T2, typename D> template <typename T1, typename T2, typename D>
void addWeighted_gpu(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream); void addWeighted_gpu(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);
}}}
END_OPENCV_DEVICE_NAMESPACE
void cv::gpu::addWeighted(const GpuMat& src1, double alpha, const GpuMat& src2, double beta, double gamma, GpuMat& dst, int dtype, Stream& stream) void cv::gpu::addWeighted(const GpuMat& src1, double alpha, const GpuMat& src2, double beta, double gamma, GpuMat& dst, int dtype, Stream& stream)
{ {
using namespace OPENCV_DEVICE_NAMESPACE;
CV_Assert(src1.size() == src2.size()); CV_Assert(src1.size() == src2.size());
CV_Assert(src1.type() == src2.type() || (dtype >= 0 && src1.channels() == src2.channels())); CV_Assert(src1.type() == src2.type() || (dtype >= 0 && src1.channels() == src2.channels()));
@ -1256,8 +1281,6 @@ void cv::gpu::addWeighted(const GpuMat& src1, double alpha, const GpuMat& src2,
typedef void (*caller_t)(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream); typedef void (*caller_t)(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);
using namespace cv::gpu::device;
static const caller_t callers[7][7][7] = static const caller_t callers[7][7][7] =
{ {
{ {


@ -735,14 +735,21 @@ void cv::gpu::filter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& ke
//////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////
// Separable Linear Filter // Separable Linear Filter
namespace cv { namespace gpu { namespace filters BEGIN_OPENCV_DEVICE_NAMESPACE
namespace row_filter
{ {
template <typename T, typename D> template <typename T, typename D>
void linearRowFilter_gpu(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); void linearRowFilter_gpu(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
}
namespace column_filter
{
template <typename T, typename D> template <typename T, typename D>
void linearColumnFilter_gpu(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); void linearColumnFilter_gpu(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
}}} }
END_OPENCV_DEVICE_NAMESPACE
namespace namespace
{ {
@ -796,6 +803,8 @@ namespace
Ptr<BaseRowFilter_GPU> cv::gpu::getLinearRowFilter_GPU(int srcType, int bufType, const Mat& rowKernel, int anchor, int borderType) Ptr<BaseRowFilter_GPU> cv::gpu::getLinearRowFilter_GPU(int srcType, int bufType, const Mat& rowKernel, int anchor, int borderType)
{ {
using namespace OPENCV_DEVICE_NAMESPACE_ row_filter;
static const nppFilter1D_t nppFilter1D_callers[] = {0, nppiFilterRow_8u_C1R, 0, 0, nppiFilterRow_8u_C4R}; static const nppFilter1D_t nppFilter1D_callers[] = {0, nppiFilterRow_8u_C1R, 0, 0, nppiFilterRow_8u_C4R};
if ((bufType == srcType) && (srcType == CV_8UC1 || srcType == CV_8UC4)) if ((bufType == srcType) && (srcType == CV_8UC1 || srcType == CV_8UC4))
@ -837,25 +846,25 @@ Ptr<BaseRowFilter_GPU> cv::gpu::getLinearRowFilter_GPU(int srcType, int bufType,
switch (srcType) switch (srcType)
{ {
case CV_8UC1: case CV_8UC1:
func = filters::linearRowFilter_gpu<uchar, float>; func = linearRowFilter_gpu<uchar, float>;
break; break;
case CV_8UC4: case CV_8UC4:
func = filters::linearRowFilter_gpu<uchar4, float4>; func = linearRowFilter_gpu<uchar4, float4>;
break; break;
/*case CV_16SC1: /*case CV_16SC1:
func = filters::linearRowFilter_gpu<short, float>; func = linearRowFilter_gpu<short, float>;
break;*/ break;*/
/*case CV_16SC2: /*case CV_16SC2:
func = filters::linearRowFilter_gpu<short2, float2>; func = linearRowFilter_gpu<short2, float2>;
break;*/ break;*/
case CV_16SC3: case CV_16SC3:
func = filters::linearRowFilter_gpu<short3, float3>; func = linearRowFilter_gpu<short3, float3>;
break; break;
case CV_32SC1: case CV_32SC1:
func = filters::linearRowFilter_gpu<int, float>; func = linearRowFilter_gpu<int, float>;
break; break;
case CV_32FC1: case CV_32FC1:
func = filters::linearRowFilter_gpu<float, float>; func = linearRowFilter_gpu<float, float>;
break; break;
} }
@ -909,6 +918,8 @@ namespace
Ptr<BaseColumnFilter_GPU> cv::gpu::getLinearColumnFilter_GPU(int bufType, int dstType, const Mat& columnKernel, int anchor, int borderType) Ptr<BaseColumnFilter_GPU> cv::gpu::getLinearColumnFilter_GPU(int bufType, int dstType, const Mat& columnKernel, int anchor, int borderType)
{ {
using namespace OPENCV_DEVICE_NAMESPACE_ column_filter;
static const nppFilter1D_t nppFilter1D_callers[] = {0, nppiFilterColumn_8u_C1R, 0, 0, nppiFilterColumn_8u_C4R}; static const nppFilter1D_t nppFilter1D_callers[] = {0, nppiFilterColumn_8u_C1R, 0, 0, nppiFilterColumn_8u_C4R};
if ((bufType == dstType) && (bufType == CV_8UC1 || bufType == CV_8UC4)) if ((bufType == dstType) && (bufType == CV_8UC1 || bufType == CV_8UC4))
@ -950,25 +961,25 @@ Ptr<BaseColumnFilter_GPU> cv::gpu::getLinearColumnFilter_GPU(int bufType, int ds
switch (dstType) switch (dstType)
{ {
case CV_8UC1: case CV_8UC1:
func = filters::linearColumnFilter_gpu<float, uchar>; func = linearColumnFilter_gpu<float, uchar>;
break; break;
case CV_8UC4: case CV_8UC4:
func = filters::linearColumnFilter_gpu<float4, uchar4>; func = linearColumnFilter_gpu<float4, uchar4>;
break; break;
/*case CV_16SC1: /*case CV_16SC1:
func = filters::linearColumnFilter_gpu<float, short>; func = linearColumnFilter_gpu<float, short>;
break;*/ break;*/
/*case CV_16SC2: /*case CV_16SC2:
func = filters::linearColumnFilter_gpu<float2, short2>; func = linearColumnFilter_gpu<float2, short2>;
break;*/ break;*/
case CV_16SC3: case CV_16SC3:
func = filters::linearColumnFilter_gpu<float3, short3>; func = linearColumnFilter_gpu<float3, short3>;
break; break;
case CV_32SC1: case CV_32SC1:
func = filters::linearColumnFilter_gpu<float, int>; func = linearColumnFilter_gpu<float, int>;
break; break;
case CV_32FC1: case CV_32FC1:
func = filters::linearColumnFilter_gpu<float, float>; func = linearColumnFilter_gpu<float, float>;
break; break;
} }
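The two switches above pick linearRowFilter_gpu<srcT, workT> and linearColumnFilter_gpu<workT, dstT> instantiations by matrix type. Below is a usage sketch of the separable-filter path that sits on top of this dispatch; cv::gpu::sepFilter2D and the headers are assumed from the gpu module of this period rather than taken from these hunks.

#include <opencv2/imgproc/imgproc.hpp>
#include <opencv2/gpu/gpu.hpp>

void gaussianOnGpu(const cv::Mat& image8u)            // CV_8UC1 input assumed
{
    cv::gpu::GpuMat src(image8u), dst;

    cv::Mat kx = cv::getGaussianKernel(7, -1, CV_32F);
    cv::Mat ky = cv::getGaussianKernel(7, -1, CV_32F);

    // For CV_8UC1 the row pass resolves to linearRowFilter_gpu<uchar, float>
    // and the column pass to linearColumnFilter_gpu<float, uchar>, per the
    // switches above.
    cv::gpu::sepFilter2D(src, dst, CV_8U, kx, ky);

    cv::Mat result;
    dst.download(result);
}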

@ -1,863 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
using namespace cv;
using namespace cv::gpu;
using namespace std;
cv::gpu::GpuMat::GpuMat(const GpuMat& m) :
flags(m.flags), rows(m.rows), cols(m.cols), step(m.step), data(m.data), refcount(m.refcount), datastart(m.datastart), dataend(m.dataend)
{
if (refcount)
CV_XADD(refcount, 1);
}
cv::gpu::GpuMat::GpuMat(int rows_, int cols_, int type_, void* data_, size_t step_) :
flags(Mat::MAGIC_VAL + (type_ & TYPE_MASK)), rows(rows_), cols(cols_),
step(step_), data((uchar*)data_), refcount(0),
datastart((uchar*)data_), dataend((uchar*)data_)
{
size_t minstep = cols * elemSize();
if (step == Mat::AUTO_STEP)
{
step = minstep;
flags |= Mat::CONTINUOUS_FLAG;
}
else
{
if (rows == 1)
step = minstep;
CV_DbgAssert(step >= minstep);
flags |= step == minstep ? Mat::CONTINUOUS_FLAG : 0;
}
dataend += step * (rows - 1) + minstep;
}
cv::gpu::GpuMat::GpuMat(Size size_, int type_, void* data_, size_t step_) :
flags(Mat::MAGIC_VAL + (type_ & TYPE_MASK)), rows(size_.height), cols(size_.width),
step(step_), data((uchar*)data_), refcount(0),
datastart((uchar*)data_), dataend((uchar*)data_)
{
size_t minstep = cols * elemSize();
if (step == Mat::AUTO_STEP)
{
step = minstep;
flags |= Mat::CONTINUOUS_FLAG;
}
else
{
if (rows == 1)
step = minstep;
CV_DbgAssert(step >= minstep);
flags |= step == minstep ? Mat::CONTINUOUS_FLAG : 0;
}
dataend += step * (rows - 1) + minstep;
}
cv::gpu::GpuMat::GpuMat(const GpuMat& m, const Range& rowRange, const Range& colRange)
{
flags = m.flags;
step = m.step; refcount = m.refcount;
data = m.data; datastart = m.datastart; dataend = m.dataend;
if (rowRange == Range::all())
rows = m.rows;
else
{
CV_Assert(0 <= rowRange.start && rowRange.start <= rowRange.end && rowRange.end <= m.rows);
rows = rowRange.size();
data += step*rowRange.start;
}
if (colRange == Range::all())
cols = m.cols;
else
{
CV_Assert(0 <= colRange.start && colRange.start <= colRange.end && colRange.end <= m.cols);
cols = colRange.size();
data += colRange.start*elemSize();
flags &= cols < m.cols ? ~Mat::CONTINUOUS_FLAG : -1;
}
if (rows == 1)
flags |= Mat::CONTINUOUS_FLAG;
if (refcount)
CV_XADD(refcount, 1);
if (rows <= 0 || cols <= 0)
rows = cols = 0;
}
cv::gpu::GpuMat::GpuMat(const GpuMat& m, const Rect& roi) :
flags(m.flags), rows(roi.height), cols(roi.width),
step(m.step), data(m.data + roi.y*step), refcount(m.refcount),
datastart(m.datastart), dataend(m.dataend)
{
flags &= roi.width < m.cols ? ~Mat::CONTINUOUS_FLAG : -1;
data += roi.x * elemSize();
CV_Assert(0 <= roi.x && 0 <= roi.width && roi.x + roi.width <= m.cols && 0 <= roi.y && 0 <= roi.height && roi.y + roi.height <= m.rows);
if (refcount)
CV_XADD(refcount, 1);
if (rows <= 0 || cols <= 0)
rows = cols = 0;
}
cv::gpu::GpuMat::GpuMat(const Mat& m) :
flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0)
{
upload(m);
}
GpuMat& cv::gpu::GpuMat::operator = (const GpuMat& m)
{
if (this != &m)
{
GpuMat temp(m);
swap(temp);
}
return *this;
}
void cv::gpu::GpuMat::swap(GpuMat& b)
{
std::swap(flags, b.flags);
std::swap(rows, b.rows);
std::swap(cols, b.cols);
std::swap(step, b.step);
std::swap(data, b.data);
std::swap(datastart, b.datastart);
std::swap(dataend, b.dataend);
std::swap(refcount, b.refcount);
}
void cv::gpu::GpuMat::locateROI(Size& wholeSize, Point& ofs) const
{
size_t esz = elemSize();
ptrdiff_t delta1 = data - datastart;
ptrdiff_t delta2 = dataend - datastart;
CV_DbgAssert(step > 0);
if (delta1 == 0)
ofs.x = ofs.y = 0;
else
{
ofs.y = static_cast<int>(delta1 / step);
ofs.x = static_cast<int>((delta1 - step * ofs.y) / esz);
CV_DbgAssert(data == datastart + ofs.y * step + ofs.x * esz);
}
size_t minstep = (ofs.x + cols) * esz;
wholeSize.height = std::max(static_cast<int>((delta2 - minstep) / step + 1), ofs.y + rows);
wholeSize.width = std::max(static_cast<int>((delta2 - step * (wholeSize.height - 1)) / esz), ofs.x + cols);
}
GpuMat& cv::gpu::GpuMat::adjustROI(int dtop, int dbottom, int dleft, int dright)
{
Size wholeSize;
Point ofs;
locateROI(wholeSize, ofs);
size_t esz = elemSize();
int row1 = std::max(ofs.y - dtop, 0);
int row2 = std::min(ofs.y + rows + dbottom, wholeSize.height);
int col1 = std::max(ofs.x - dleft, 0);
int col2 = std::min(ofs.x + cols + dright, wholeSize.width);
data += (row1 - ofs.y) * step + (col1 - ofs.x) * esz;
rows = row2 - row1;
cols = col2 - col1;
if (esz * cols == step || rows == 1)
flags |= Mat::CONTINUOUS_FLAG;
else
flags &= ~Mat::CONTINUOUS_FLAG;
return *this;
}
GpuMat cv::gpu::GpuMat::reshape(int new_cn, int new_rows) const
{
GpuMat hdr = *this;
int cn = channels();
if (new_cn == 0)
new_cn = cn;
int total_width = cols * cn;
if ((new_cn > total_width || total_width % new_cn != 0) && new_rows == 0)
new_rows = rows * total_width / new_cn;
if (new_rows != 0 && new_rows != rows)
{
int total_size = total_width * rows;
if (!isContinuous())
CV_Error(CV_BadStep, "The matrix is not continuous, thus its number of rows can not be changed");
if ((unsigned)new_rows > (unsigned)total_size)
CV_Error(CV_StsOutOfRange, "Bad new number of rows");
total_width = total_size / new_rows;
if (total_width * new_rows != total_size)
CV_Error(CV_StsBadArg, "The total number of matrix elements is not divisible by the new number of rows");
hdr.rows = new_rows;
hdr.step = total_width * elemSize1();
}
int new_width = total_width / new_cn;
if (new_width * new_cn != total_width)
CV_Error(CV_BadNumChannels, "The total width is not divisible by the new number of channels");
hdr.cols = new_width;
hdr.flags = (hdr.flags & ~CV_MAT_CN_MASK) | ((new_cn - 1) << CV_CN_SHIFT);
return hdr;
}
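// Editorial note (not part of the original file): reshape() only rewrites the
// header -- flags, rows, cols and step -- and returns a GpuMat that aliases the
// same device memory; no data is copied. For example:
//
//     GpuMat rgba(480, 640, CV_8UC4);
//     GpuMat flat = rgba.reshape(1);      // 480x2560 CV_8UC1 view of the same data
//     GpuMat row  = rgba.reshape(0, 1);   // 1x307200 CV_8UC4 view; requires a
//                                         // continuous (unpadded) allocation,
//                                         // otherwise the CV_Error(CV_BadStep) above fires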
class GpuFuncTable
{
public:
virtual ~GpuFuncTable() {}
virtual void copy(const Mat& src, GpuMat& dst) const = 0;
virtual void copy(const GpuMat& src, Mat& dst) const = 0;
virtual void copy(const GpuMat& src, GpuMat& dst) const = 0;
virtual void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask) const = 0;
virtual void convert(const GpuMat& src, GpuMat& dst) const = 0;
virtual void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta) const = 0;
virtual void setTo(GpuMat& m, const Scalar& s, const GpuMat& mask) const = 0;
virtual void mallocPitch(void** devPtr, size_t* step, size_t width, size_t height) const = 0;
virtual void free(void* devPtr) const = 0;
};
#if !defined (HAVE_CUDA)
class EmptyFuncTable : public GpuFuncTable
{
public:
void copy(const Mat&, GpuMat&) const { throw_nogpu(); }
void copy(const GpuMat&, Mat&) const { throw_nogpu(); }
void copy(const GpuMat&, GpuMat&) const { throw_nogpu(); }
void copyWithMask(const GpuMat&, GpuMat&, const GpuMat&) const { throw_nogpu(); }
void convert(const GpuMat&, GpuMat&) const { throw_nogpu(); }
void convert(const GpuMat&, GpuMat&, double, double) const { throw_nogpu(); }
void setTo(GpuMat&, const Scalar&, const GpuMat&) const { throw_nogpu(); }
void mallocPitch(void**, size_t*, size_t, size_t) const { throw_nogpu(); }
void free(void*) const {}
};
const GpuFuncTable* gpuFuncTable()
{
static EmptyFuncTable empty;
return &empty;
}
#else
namespace cv { namespace gpu { namespace device
{
void copy_to_with_mask(const DevMem2Db& src, DevMem2Db dst, int depth, const DevMem2Db& mask, int channels, const cudaStream_t& stream = 0);
template <typename T>
void set_to_gpu(const DevMem2Db& mat, const T* scalar, int channels, cudaStream_t stream);
template <typename T>
void set_to_gpu(const DevMem2Db& mat, const T* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);
void convert_gpu(const DevMem2Db& src, int sdepth, const DevMem2Db& dst, int ddepth, double alpha, double beta, cudaStream_t stream = 0);
}}}
namespace
{
//////////////////////////////////////////////////////////////////////////
// Convert
template<int n> struct NPPTypeTraits;
template<> struct NPPTypeTraits<CV_8U> { typedef Npp8u npp_type; };
template<> struct NPPTypeTraits<CV_16U> { typedef Npp16u npp_type; };
template<> struct NPPTypeTraits<CV_16S> { typedef Npp16s npp_type; };
template<> struct NPPTypeTraits<CV_32S> { typedef Npp32s npp_type; };
template<> struct NPPTypeTraits<CV_32F> { typedef Npp32f npp_type; };
template<int SDEPTH, int DDEPTH> struct NppConvertFunc
{
typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
typedef typename NPPTypeTraits<DDEPTH>::npp_type dst_t;
typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, dst_t* pDst, int nDstStep, NppiSize oSizeROI);
};
template<int DDEPTH> struct NppConvertFunc<CV_32F, DDEPTH>
{
typedef typename NPPTypeTraits<DDEPTH>::npp_type dst_t;
typedef NppStatus (*func_ptr)(const Npp32f* pSrc, int nSrcStep, dst_t* pDst, int nDstStep, NppiSize oSizeROI, NppRoundMode eRoundMode);
};
template<int SDEPTH, int DDEPTH, typename NppConvertFunc<SDEPTH, DDEPTH>::func_ptr func> struct NppCvt
{
typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
typedef typename NPPTypeTraits<DDEPTH>::npp_type dst_t;
static void cvt(const GpuMat& src, GpuMat& dst)
{
NppiSize sz;
sz.width = src.cols;
sz.height = src.rows;
nppSafeCall( func(src.ptr<src_t>(), static_cast<int>(src.step), dst.ptr<dst_t>(), static_cast<int>(dst.step), sz) );
cudaSafeCall( cudaDeviceSynchronize() );
}
};
template<int DDEPTH, typename NppConvertFunc<CV_32F, DDEPTH>::func_ptr func> struct NppCvt<CV_32F, DDEPTH, func>
{
typedef typename NPPTypeTraits<DDEPTH>::npp_type dst_t;
static void cvt(const GpuMat& src, GpuMat& dst)
{
NppiSize sz;
sz.width = src.cols;
sz.height = src.rows;
nppSafeCall( func(src.ptr<Npp32f>(), static_cast<int>(src.step), dst.ptr<dst_t>(), static_cast<int>(dst.step), sz, NPP_RND_NEAR) );
cudaSafeCall( cudaDeviceSynchronize() );
}
};
void convertToKernelCaller(const GpuMat& src, GpuMat& dst)
{
device::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), 1.0, 0.0);
}
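// Editorial note: NppCvt binds a source depth, a destination depth and a
// concrete NPP routine at compile time, e.g.
//
//     NppCvt<CV_8U, CV_16U, nppiConvert_8u16u_C1R>::cvt(src, dst);
//
// The CV_32F specialization adds the rounding-mode argument that NPP requires
// for float sources, while convertToKernelCaller is the generic CUDA fallback
// for combinations NPP does not provide. The table in convert() below indexes
// these entry points as callers[src.depth()][dst.depth()][channels - 1].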
//////////////////////////////////////////////////////////////////////////
// Set
template<int SDEPTH, int SCN> struct NppSetFunc
{
typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
typedef NppStatus (*func_ptr)(const src_t values[], src_t* pSrc, int nSrcStep, NppiSize oSizeROI);
};
template<int SDEPTH> struct NppSetFunc<SDEPTH, 1>
{
typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
typedef NppStatus (*func_ptr)(src_t val, src_t* pSrc, int nSrcStep, NppiSize oSizeROI);
};
template<int SDEPTH, int SCN, typename NppSetFunc<SDEPTH, SCN>::func_ptr func> struct NppSet
{
typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
static void set(GpuMat& src, const Scalar& s)
{
NppiSize sz;
sz.width = src.cols;
sz.height = src.rows;
Scalar_<src_t> nppS = s;
nppSafeCall( func(nppS.val, src.ptr<src_t>(), static_cast<int>(src.step), sz) );
cudaSafeCall( cudaDeviceSynchronize() );
}
};
template<int SDEPTH, typename NppSetFunc<SDEPTH, 1>::func_ptr func> struct NppSet<SDEPTH, 1, func>
{
typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
static void set(GpuMat& src, const Scalar& s)
{
NppiSize sz;
sz.width = src.cols;
sz.height = src.rows;
Scalar_<src_t> nppS = s;
nppSafeCall( func(nppS[0], src.ptr<src_t>(), static_cast<int>(src.step), sz) );
cudaSafeCall( cudaDeviceSynchronize() );
}
};
template <typename T>
void kernelSet(GpuMat& src, const Scalar& s)
{
Scalar_<T> sf = s;
device::set_to_gpu(src, sf.val, src.channels(), 0);
}
template<int SDEPTH, int SCN> struct NppSetMaskFunc
{
typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
typedef NppStatus (*func_ptr)(const src_t values[], src_t* pSrc, int nSrcStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep);
};
template<int SDEPTH> struct NppSetMaskFunc<SDEPTH, 1>
{
typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
typedef NppStatus (*func_ptr)(src_t val, src_t* pSrc, int nSrcStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep);
};
template<int SDEPTH, int SCN, typename NppSetMaskFunc<SDEPTH, SCN>::func_ptr func> struct NppSetMask
{
typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
static void set(GpuMat& src, const Scalar& s, const GpuMat& mask)
{
NppiSize sz;
sz.width = src.cols;
sz.height = src.rows;
Scalar_<src_t> nppS = s;
nppSafeCall( func(nppS.val, src.ptr<src_t>(), static_cast<int>(src.step), sz, mask.ptr<Npp8u>(), static_cast<int>(mask.step)) );
cudaSafeCall( cudaDeviceSynchronize() );
}
};
template<int SDEPTH, typename NppSetMaskFunc<SDEPTH, 1>::func_ptr func> struct NppSetMask<SDEPTH, 1, func>
{
typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
static void set(GpuMat& src, const Scalar& s, const GpuMat& mask)
{
NppiSize sz;
sz.width = src.cols;
sz.height = src.rows;
Scalar_<src_t> nppS = s;
nppSafeCall( func(nppS[0], src.ptr<src_t>(), static_cast<int>(src.step), sz, mask.ptr<Npp8u>(), static_cast<int>(mask.step)) );
cudaSafeCall( cudaDeviceSynchronize() );
}
};
template <typename T>
void kernelSetMask(GpuMat& src, const Scalar& s, const GpuMat& mask)
{
Scalar_<T> sf = s;
device::set_to_gpu(src, sf.val, mask, src.channels(), 0);
}
}
class CudaFuncTable : public GpuFuncTable
{
public:
virtual void copy(const Mat& src, GpuMat& dst) const
{
cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyHostToDevice) );
}
virtual void copy(const GpuMat& src, Mat& dst) const
{
cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyDeviceToHost) );
}
virtual void copy(const GpuMat& src, GpuMat& dst) const
{
cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyDeviceToDevice) );
}
virtual void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask) const
{
device::copy_to_with_mask(src, dst, src.depth(), mask, src.channels());
}
void convert(const GpuMat& src, GpuMat& dst) const
{
typedef void (*caller_t)(const GpuMat& src, GpuMat& dst);
static const caller_t callers[7][7][7] =
{
{
/* 8U -> 8U */ {0, 0, 0, 0},
/* 8U -> 8S */ {convertToKernelCaller, convertToKernelCaller, convertToKernelCaller, convertToKernelCaller},
/* 8U -> 16U */ {NppCvt<CV_8U, CV_16U, nppiConvert_8u16u_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,NppCvt<CV_8U, CV_16U, nppiConvert_8u16u_C4R>::cvt},
/* 8U -> 16S */ {NppCvt<CV_8U, CV_16S, nppiConvert_8u16s_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,NppCvt<CV_8U, CV_16S, nppiConvert_8u16s_C4R>::cvt},
/* 8U -> 32S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 8U -> 32F */ {NppCvt<CV_8U, CV_32F, nppiConvert_8u32f_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 8U -> 64F */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller}
},
{
/* 8S -> 8U */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 8S -> 8S */ {0,0,0,0},
/* 8S -> 16U */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 8S -> 16S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 8S -> 32S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 8S -> 32F */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 8S -> 64F */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller}
},
{
/* 16U -> 8U */ {NppCvt<CV_16U, CV_8U, nppiConvert_16u8u_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,NppCvt<CV_16U, CV_8U, nppiConvert_16u8u_C4R>::cvt},
/* 16U -> 8S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 16U -> 16U */ {0,0,0,0},
/* 16U -> 16S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 16U -> 32S */ {NppCvt<CV_16U, CV_32S, nppiConvert_16u32s_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 16U -> 32F */ {NppCvt<CV_16U, CV_32F, nppiConvert_16u32f_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 16U -> 64F */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller}
},
{
/* 16S -> 8U */ {NppCvt<CV_16S, CV_8U, nppiConvert_16s8u_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,NppCvt<CV_16S, CV_8U, nppiConvert_16s8u_C4R>::cvt},
/* 16S -> 8S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 16S -> 16U */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 16S -> 16S */ {0,0,0,0},
/* 16S -> 32S */ {NppCvt<CV_16S, CV_32S, nppiConvert_16s32s_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 16S -> 32F */ {NppCvt<CV_16S, CV_32F, nppiConvert_16s32f_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 16S -> 64F */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller}
},
{
/* 32S -> 8U */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 32S -> 8S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 32S -> 16U */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 32S -> 16S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 32S -> 32S */ {0,0,0,0},
/* 32S -> 32F */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 32S -> 64F */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller}
},
{
/* 32F -> 8U */ {NppCvt<CV_32F, CV_8U, nppiConvert_32f8u_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 32F -> 8S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 32F -> 16U */ {NppCvt<CV_32F, CV_16U, nppiConvert_32f16u_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 32F -> 16S */ {NppCvt<CV_32F, CV_16S, nppiConvert_32f16s_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 32F -> 32S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 32F -> 32F */ {0,0,0,0},
/* 32F -> 64F */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller}
},
{
/* 64F -> 8U */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 64F -> 8S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 64F -> 16U */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 64F -> 16S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 64F -> 32S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 64F -> 32F */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 64F -> 64F */ {0,0,0,0}
}
};
caller_t func = callers[src.depth()][dst.depth()][src.channels() - 1];
CV_DbgAssert(func != 0);
func(src, dst);
}
void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta) const
{
device::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), alpha, beta);
}
void setTo(GpuMat& m, const Scalar& s, const GpuMat& mask) const
{
NppiSize sz;
sz.width = m.cols;
sz.height = m.rows;
if (mask.empty())
{
if (s[0] == 0.0 && s[1] == 0.0 && s[2] == 0.0 && s[3] == 0.0)
{
cudaSafeCall( cudaMemset2D(m.data, m.step, 0, m.cols * m.elemSize(), m.rows) );
return;
}
if (m.depth() == CV_8U)
{
int cn = m.channels();
if (cn == 1 || (cn == 2 && s[0] == s[1]) || (cn == 3 && s[0] == s[1] && s[0] == s[2]) || (cn == 4 && s[0] == s[1] && s[0] == s[2] && s[0] == s[3]))
{
int val = saturate_cast<gpu::uchar>(s[0]);
cudaSafeCall( cudaMemset2D(m.data, m.step, val, m.cols * m.elemSize(), m.rows) );
return;
}
}
typedef void (*caller_t)(GpuMat& src, const Scalar& s);
static const caller_t callers[7][4] =
{
{NppSet<CV_8U, 1, nppiSet_8u_C1R>::set,kernelSet<gpu::uchar>,kernelSet<gpu::uchar>,NppSet<CV_8U, 4, nppiSet_8u_C4R>::set},
{kernelSet<gpu::schar>,kernelSet<gpu::schar>,kernelSet<gpu::schar>,kernelSet<gpu::schar>},
{NppSet<CV_16U, 1, nppiSet_16u_C1R>::set,NppSet<CV_16U, 2, nppiSet_16u_C2R>::set,kernelSet<gpu::ushort>,NppSet<CV_16U, 4, nppiSet_16u_C4R>::set},
{NppSet<CV_16S, 1, nppiSet_16s_C1R>::set,NppSet<CV_16S, 2, nppiSet_16s_C2R>::set,kernelSet<short>,NppSet<CV_16S, 4, nppiSet_16s_C4R>::set},
{NppSet<CV_32S, 1, nppiSet_32s_C1R>::set,kernelSet<int>,kernelSet<int>,NppSet<CV_32S, 4, nppiSet_32s_C4R>::set},
{NppSet<CV_32F, 1, nppiSet_32f_C1R>::set,kernelSet<float>,kernelSet<float>,NppSet<CV_32F, 4, nppiSet_32f_C4R>::set},
{kernelSet<double>,kernelSet<double>,kernelSet<double>,kernelSet<double>}
};
callers[m.depth()][m.channels() - 1](m, s);
}
else
{
typedef void (*caller_t)(GpuMat& src, const Scalar& s, const GpuMat& mask);
static const caller_t callers[7][4] =
{
{NppSetMask<CV_8U, 1, nppiSet_8u_C1MR>::set,kernelSetMask<gpu::uchar>,kernelSetMask<gpu::uchar>,NppSetMask<CV_8U, 4, nppiSet_8u_C4MR>::set},
{kernelSetMask<gpu::schar>,kernelSetMask<gpu::schar>,kernelSetMask<gpu::schar>,kernelSetMask<gpu::schar>},
{NppSetMask<CV_16U, 1, nppiSet_16u_C1MR>::set,kernelSetMask<gpu::ushort>,kernelSetMask<gpu::ushort>,NppSetMask<CV_16U, 4, nppiSet_16u_C4MR>::set},
{NppSetMask<CV_16S, 1, nppiSet_16s_C1MR>::set,kernelSetMask<short>,kernelSetMask<short>,NppSetMask<CV_16S, 4, nppiSet_16s_C4MR>::set},
{NppSetMask<CV_32S, 1, nppiSet_32s_C1MR>::set,kernelSetMask<int>,kernelSetMask<int>,NppSetMask<CV_32S, 4, nppiSet_32s_C4MR>::set},
{NppSetMask<CV_32F, 1, nppiSet_32f_C1MR>::set,kernelSetMask<float>,kernelSetMask<float>,NppSetMask<CV_32F, 4, nppiSet_32f_C4MR>::set},
{kernelSetMask<double>,kernelSetMask<double>,kernelSetMask<double>,kernelSetMask<double>}
};
callers[m.depth()][m.channels() - 1](m, s, mask);
}
}
void mallocPitch(void** devPtr, size_t* step, size_t width, size_t height) const
{
cudaSafeCall( cudaMallocPitch(devPtr, step, width, height) );
}
void free(void* devPtr) const
{
cudaFree(devPtr);
}
};
const GpuFuncTable* gpuFuncTable()
{
static CudaFuncTable cuda;
return &cuda;
}
#endif
void cv::gpu::GpuMat::upload(const Mat& m)
{
CV_DbgAssert(!m.empty());
create(m.size(), m.type());
gpuFuncTable()->copy(m, *this);
}
void cv::gpu::GpuMat::download(cv::Mat& m) const
{
CV_DbgAssert(!empty());
m.create(size(), type());
gpuFuncTable()->copy(*this, m);
}
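// Editorial note: upload() and download() are synchronous host<->device copies
// routed through the cudaMemcpy2D wrappers of CudaFuncTable above. A typical
// round trip looks like this (illustrative only):
//
//     cv::Mat host = ...;            // e.g. an image loaded on the CPU
//     cv::gpu::GpuMat dev;
//     dev.upload(host);              // create() + copy to device memory
//     // ... run cv::gpu:: routines on dev ...
//     cv::Mat back;
//     dev.download(back);            // copy the result back to host memory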
void cv::gpu::GpuMat::copyTo(GpuMat& m) const
{
CV_DbgAssert(!empty());
m.create(size(), type());
gpuFuncTable()->copy(*this, m);
}
void cv::gpu::GpuMat::copyTo(GpuMat& mat, const GpuMat& mask) const
{
if (mask.empty())
copyTo(mat);
else
{
mat.create(size(), type());
gpuFuncTable()->copyWithMask(*this, mat, mask);
}
}
void cv::gpu::GpuMat::convertTo(GpuMat& dst, int rtype, double alpha, double beta) const
{
bool noScale = fabs(alpha - 1) < numeric_limits<double>::epsilon() && fabs(beta) < numeric_limits<double>::epsilon();
if (rtype < 0)
rtype = type();
else
rtype = CV_MAKETYPE(CV_MAT_DEPTH(rtype), channels());
int scn = channels();
int sdepth = depth();
int ddepth = CV_MAT_DEPTH(rtype);
if (sdepth == ddepth && noScale)
{
copyTo(dst);
return;
}
GpuMat temp;
const GpuMat* psrc = this;
if (sdepth != ddepth && psrc == &dst)
{
temp = *this;
psrc = &temp;
}
dst.create(size(), rtype);
if (noScale)
gpuFuncTable()->convert(*psrc, dst);
else
gpuFuncTable()->convert(*psrc, dst, alpha, beta);
}
GpuMat& cv::gpu::GpuMat::setTo(const Scalar& s, const GpuMat& mask)
{
CV_Assert(mask.empty() || mask.type() == CV_8UC1);
CV_DbgAssert(!empty());
gpuFuncTable()->setTo(*this, s, mask);
return *this;
}
void cv::gpu::GpuMat::create(int _rows, int _cols, int _type)
{
_type &= TYPE_MASK;
if (rows == _rows && cols == _cols && type() == _type && data)
return;
if (data)
release();
CV_DbgAssert(_rows >= 0 && _cols >= 0);
if (_rows > 0 && _cols > 0)
{
flags = Mat::MAGIC_VAL + _type;
rows = _rows;
cols = _cols;
size_t esz = elemSize();
void* devPtr;
gpuFuncTable()->mallocPitch(&devPtr, &step, esz * cols, rows);
// Single row must be continuous
if (rows == 1)
step = esz * cols;
if (esz * cols == step)
flags |= Mat::CONTINUOUS_FLAG;
int64 _nettosize = static_cast<int64>(step) * rows;
size_t nettosize = static_cast<size_t>(_nettosize);
datastart = data = static_cast<uchar*>(devPtr);
dataend = data + nettosize;
refcount = static_cast<int*>(fastMalloc(sizeof(*refcount)));
*refcount = 1;
}
}
void cv::gpu::GpuMat::release()
{
if (refcount && CV_XADD(refcount, -1) == 1)
{
fastFree(refcount);
gpuFuncTable()->free(datastart);
}
data = datastart = dataend = 0;
step = rows = cols = 0;
refcount = 0;
}
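create() and release() above give GpuMat the same shallow-copy, reference-counted semantics as Mat, with the counter kept in host memory while the pixel data lives on the device. A minimal sketch of the resulting behaviour (inferred from the code above, not an excerpt from it):

cv::gpu::GpuMat a(100, 100, CV_32FC1);   // mallocPitch on the device, *refcount = 1
cv::gpu::GpuMat b = a;                   // header copy only, refcount -> 2, same data pointer
b.release();                             // refcount -> 1, device memory still alive
a.release();                             // refcount -> 0, gpuFuncTable()->free() runs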

@ -60,40 +60,44 @@ std::vector<float> cv::gpu::HOGDescriptor::getPeopleDetector64x128() { throw_nog
#else #else
namespace cv { namespace gpu { namespace hog { BEGIN_OPENCV_DEVICE_NAMESPACE
void set_up_constants(int nbins, int block_stride_x, int block_stride_y, namespace hog
{
void set_up_constants(int nbins, int block_stride_x, int block_stride_y,
int nblocks_win_x, int nblocks_win_y); int nblocks_win_x, int nblocks_win_y);
void compute_hists(int nbins, int block_stride_x, int blovck_stride_y, void compute_hists(int nbins, int block_stride_x, int blovck_stride_y,
int height, int width, const cv::gpu::DevMem2Df& grad, int height, int width, const cv::gpu::DevMem2Df& grad,
const cv::gpu::DevMem2Db& qangle, float sigma, float* block_hists); const cv::gpu::DevMem2Db& qangle, float sigma, float* block_hists);
void normalize_hists(int nbins, int block_stride_x, int block_stride_y, void normalize_hists(int nbins, int block_stride_x, int block_stride_y,
int height, int width, float* block_hists, float threshold); int height, int width, float* block_hists, float threshold);
void classify_hists(int win_height, int win_width, int block_stride_y, void classify_hists(int win_height, int win_width, int block_stride_y,
int block_stride_x, int win_stride_y, int win_stride_x, int height, int block_stride_x, int win_stride_y, int win_stride_x, int height,
int width, float* block_hists, float* coefs, float free_coef, int width, float* block_hists, float* coefs, float free_coef,
float threshold, unsigned char* labels); float threshold, unsigned char* labels);
void extract_descrs_by_rows(int win_height, int win_width, int block_stride_y, int block_stride_x, void extract_descrs_by_rows(int win_height, int win_width, int block_stride_y, int block_stride_x,
int win_stride_y, int win_stride_x, int height, int width, float* block_hists, int win_stride_y, int win_stride_x, int height, int width, float* block_hists,
cv::gpu::DevMem2Df descriptors); cv::gpu::DevMem2Df descriptors);
void extract_descrs_by_cols(int win_height, int win_width, int block_stride_y, int block_stride_x, void extract_descrs_by_cols(int win_height, int win_width, int block_stride_y, int block_stride_x,
int win_stride_y, int win_stride_x, int height, int width, float* block_hists, int win_stride_y, int win_stride_x, int height, int width, float* block_hists,
cv::gpu::DevMem2Df descriptors); cv::gpu::DevMem2Df descriptors);
void compute_gradients_8UC1(int nbins, int height, int width, const cv::gpu::DevMem2Db& img, void compute_gradients_8UC1(int nbins, int height, int width, const cv::gpu::DevMem2Db& img,
float angle_scale, cv::gpu::DevMem2Df grad, cv::gpu::DevMem2Db qangle, bool correct_gamma); float angle_scale, cv::gpu::DevMem2Df grad, cv::gpu::DevMem2Db qangle, bool correct_gamma);
void compute_gradients_8UC4(int nbins, int height, int width, const cv::gpu::DevMem2Db& img, void compute_gradients_8UC4(int nbins, int height, int width, const cv::gpu::DevMem2Db& img,
float angle_scale, cv::gpu::DevMem2Df grad, cv::gpu::DevMem2Db qangle, bool correct_gamma); float angle_scale, cv::gpu::DevMem2Df grad, cv::gpu::DevMem2Db qangle, bool correct_gamma);
void resize_8UC1(const cv::gpu::DevMem2Db& src, cv::gpu::DevMem2Db dst); void resize_8UC1(const cv::gpu::DevMem2Db& src, cv::gpu::DevMem2Db dst);
void resize_8UC4(const cv::gpu::DevMem2Db& src, cv::gpu::DevMem2Db dst); void resize_8UC4(const cv::gpu::DevMem2Db& src, cv::gpu::DevMem2Db dst);
}
}}} END_OPENCV_DEVICE_NAMESPACE
using namespace OPENCV_DEVICE_NAMESPACE;
cv::gpu::HOGDescriptor::HOGDescriptor(Size win_size, Size block_size, Size block_stride, Size cell_size, cv::gpu::HOGDescriptor::HOGDescriptor(Size win_size, Size block_size, Size block_stride, Size cell_size,
int nbins, double win_sigma, double threshold_L2hys, bool gamma_correction, int nlevels) int nbins, double win_sigma, double threshold_L2hys, bool gamma_correction, int nlevels)
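The declarations above are the device entry points behind cv::gpu::HOGDescriptor. Below is a usage sketch of the host-side class; the API calls are assumed typical for the gpu module of this period and are not taken from this hunk.

#include <opencv2/imgproc/imgproc.hpp>
#include <opencv2/gpu/gpu.hpp>
#include <vector>

void detectPeople(const cv::Mat& frameBgr)
{
    cv::Mat gray;
    cv::cvtColor(frameBgr, gray, CV_BGR2GRAY);        // HOG path takes 8UC1 or 8UC4 input

    cv::gpu::GpuMat d_img(gray);
    cv::gpu::HOGDescriptor hog;
    hog.setSVMDetector(cv::gpu::HOGDescriptor::getDefaultPeopleDetector());

    std::vector<cv::Rect> found;
    // Internally this drives compute_gradients_*, compute_hists,
    // normalize_hists and classify_hists declared above.
    hog.detectMultiScale(d_img, found);
}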

@ -107,15 +107,20 @@ void cv::gpu::CannyBuf::release() { throw_nogpu(); }
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
// remap // remap
namespace cv { namespace gpu { namespace imgproc BEGIN_OPENCV_DEVICE_NAMESPACE
namespace remap
{ {
template <typename T> void remap_gpu(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, template <typename T>
void remap_gpu(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst,
int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc); int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
}}} }
END_OPENCV_DEVICE_NAMESPACE
void cv::gpu::remap(const GpuMat& src, GpuMat& dst, const GpuMat& xmap, const GpuMat& ymap, int interpolation, int borderMode, const Scalar& borderValue, Stream& stream) void cv::gpu::remap(const GpuMat& src, GpuMat& dst, const GpuMat& xmap, const GpuMat& ymap, int interpolation, int borderMode, const Scalar& borderValue, Stream& stream)
{ {
using namespace cv::gpu::imgproc; using namespace OPENCV_DEVICE_NAMESPACE_ remap;
typedef void (*caller_t)(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, typedef void (*caller_t)(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation,
int borderMode, const float* borderValue, cudaStream_t stream, int cc); int borderMode, const float* borderValue, cudaStream_t stream, int cc);
@ -155,13 +160,19 @@ void cv::gpu::remap(const GpuMat& src, GpuMat& dst, const GpuMat& xmap, const Gp
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
// meanShiftFiltering_GPU // meanShiftFiltering_GPU
namespace cv { namespace gpu { namespace imgproc BEGIN_OPENCV_DEVICE_NAMESPACE
namespace imgproc
{ {
void meanShiftFiltering_gpu(const DevMem2Db& src, DevMem2Db dst, int sp, int sr, int maxIter, float eps, cudaStream_t stream); void meanShiftFiltering_gpu(const DevMem2Db& src, DevMem2Db dst, int sp, int sr, int maxIter, float eps, cudaStream_t stream);
}}} }
END_OPENCV_DEVICE_NAMESPACE
void cv::gpu::meanShiftFiltering(const GpuMat& src, GpuMat& dst, int sp, int sr, TermCriteria criteria, Stream& stream) void cv::gpu::meanShiftFiltering(const GpuMat& src, GpuMat& dst, int sp, int sr, TermCriteria criteria, Stream& stream)
{ {
using namespace OPENCV_DEVICE_NAMESPACE_ imgproc;
if( src.empty() ) if( src.empty() )
CV_Error( CV_StsBadArg, "The input image is empty" ); CV_Error( CV_StsBadArg, "The input image is empty" );
@ -180,19 +191,25 @@ void cv::gpu::meanShiftFiltering(const GpuMat& src, GpuMat& dst, int sp, int sr,
eps = 1.f; eps = 1.f;
eps = (float)std::max(criteria.epsilon, 0.0); eps = (float)std::max(criteria.epsilon, 0.0);
imgproc::meanShiftFiltering_gpu(src, dst, sp, sr, maxIter, eps, StreamAccessor::getStream(stream)); meanShiftFiltering_gpu(src, dst, sp, sr, maxIter, eps, StreamAccessor::getStream(stream));
} }
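meanShiftFiltering_gpu is exposed through cv::gpu::meanShiftFiltering; the GPU implementation works on 8-bit, 4-channel images (an assumption about its constraints, the corresponding assertion is not shown in this hunk). A usage sketch:

#include <opencv2/imgproc/imgproc.hpp>
#include <opencv2/gpu/gpu.hpp>

void meanShiftDemo(const cv::Mat& bgr)                // CV_8UC3 host image
{
    cv::Mat bgra;
    cv::cvtColor(bgr, bgra, CV_BGR2BGRA);             // GPU path expects CV_8UC4 (assumed)

    cv::gpu::GpuMat src(bgra), dst;
    cv::gpu::meanShiftFiltering(src, dst, 20, 20);    // spatial / color window radii

    cv::Mat result;
    dst.download(result);
}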
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
// meanShiftProc_GPU // meanShiftProc_GPU
namespace cv { namespace gpu { namespace imgproc BEGIN_OPENCV_DEVICE_NAMESPACE
namespace imgproc
{ {
void meanShiftProc_gpu(const DevMem2Db& src, DevMem2Db dstr, DevMem2Db dstsp, int sp, int sr, int maxIter, float eps, cudaStream_t stream); void meanShiftProc_gpu(const DevMem2Db& src, DevMem2Db dstr, DevMem2Db dstsp, int sp, int sr, int maxIter, float eps, cudaStream_t stream);
}}} }
END_OPENCV_DEVICE_NAMESPACE
void cv::gpu::meanShiftProc(const GpuMat& src, GpuMat& dstr, GpuMat& dstsp, int sp, int sr, TermCriteria criteria, Stream& stream) void cv::gpu::meanShiftProc(const GpuMat& src, GpuMat& dstr, GpuMat& dstsp, int sp, int sr, TermCriteria criteria, Stream& stream)
{ {
using namespace OPENCV_DEVICE_NAMESPACE_ imgproc;
if( src.empty() ) if( src.empty() )
CV_Error( CV_StsBadArg, "The input image is empty" ); CV_Error( CV_StsBadArg, "The input image is empty" );
@ -212,26 +229,32 @@ void cv::gpu::meanShiftProc(const GpuMat& src, GpuMat& dstr, GpuMat& dstsp, int
eps = 1.f; eps = 1.f;
eps = (float)std::max(criteria.epsilon, 0.0); eps = (float)std::max(criteria.epsilon, 0.0);
imgproc::meanShiftProc_gpu(src, dstr, dstsp, sp, sr, maxIter, eps, StreamAccessor::getStream(stream)); meanShiftProc_gpu(src, dstr, dstsp, sp, sr, maxIter, eps, StreamAccessor::getStream(stream));
} }
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
// drawColorDisp // drawColorDisp
namespace cv { namespace gpu { namespace imgproc BEGIN_OPENCV_DEVICE_NAMESPACE
namespace imgproc
{ {
void drawColorDisp_gpu(const DevMem2Db& src, const DevMem2Db& dst, int ndisp, const cudaStream_t& stream); void drawColorDisp_gpu(const DevMem2Db& src, const DevMem2Db& dst, int ndisp, const cudaStream_t& stream);
void drawColorDisp_gpu(const DevMem2D_<short>& src, const DevMem2Db& dst, int ndisp, const cudaStream_t& stream); void drawColorDisp_gpu(const DevMem2D_<short>& src, const DevMem2Db& dst, int ndisp, const cudaStream_t& stream);
}}} }
END_OPENCV_DEVICE_NAMESPACE
namespace namespace
{ {
template <typename T> template <typename T>
void drawColorDisp_caller(const GpuMat& src, GpuMat& dst, int ndisp, const cudaStream_t& stream) void drawColorDisp_caller(const GpuMat& src, GpuMat& dst, int ndisp, const cudaStream_t& stream)
{ {
using namespace OPENCV_DEVICE_NAMESPACE_ imgproc;
dst.create(src.size(), CV_8UC4); dst.create(src.size(), CV_8UC4);
imgproc::drawColorDisp_gpu((DevMem2D_<T>)src, dst, ndisp, stream); drawColorDisp_gpu((DevMem2D_<T>)src, dst, ndisp, stream);
} }
typedef void (*drawColorDisp_caller_t)(const GpuMat& src, GpuMat& dst, int ndisp, const cudaStream_t& stream); typedef void (*drawColorDisp_caller_t)(const GpuMat& src, GpuMat& dst, int ndisp, const cudaStream_t& stream);
@ -249,19 +272,26 @@ void cv::gpu::drawColorDisp(const GpuMat& src, GpuMat& dst, int ndisp, Stream& s
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
// reprojectImageTo3D // reprojectImageTo3D
namespace cv { namespace gpu { namespace imgproc BEGIN_OPENCV_DEVICE_NAMESPACE
namespace imgproc
{ {
void reprojectImageTo3D_gpu(const DevMem2Db& disp, const DevMem2Df& xyzw, const float* q, const cudaStream_t& stream); void reprojectImageTo3D_gpu(const DevMem2Db& disp, const DevMem2Df& xyzw, const float* q, const cudaStream_t& stream);
void reprojectImageTo3D_gpu(const DevMem2D_<short>& disp, const DevMem2Df& xyzw, const float* q, const cudaStream_t& stream); void reprojectImageTo3D_gpu(const DevMem2D_<short>& disp, const DevMem2Df& xyzw, const float* q, const cudaStream_t& stream);
}}} }
END_OPENCV_DEVICE_NAMESPACE
namespace namespace
{ {
template <typename T> template <typename T>
void reprojectImageTo3D_caller(const GpuMat& disp, GpuMat& xyzw, const Mat& Q, const cudaStream_t& stream) void reprojectImageTo3D_caller(const GpuMat& disp, GpuMat& xyzw, const Mat& Q, const cudaStream_t& stream)
{ {
using namespace OPENCV_DEVICE_NAMESPACE_ imgproc;
xyzw.create(disp.rows, disp.cols, CV_32FC4); xyzw.create(disp.rows, disp.cols, CV_32FC4);
imgproc::reprojectImageTo3D_gpu((DevMem2D_<T>)disp, xyzw, Q.ptr<float>(), stream);
reprojectImageTo3D_gpu((DevMem2D_<T>)disp, xyzw, Q.ptr<float>(), stream);
} }
typedef void (*reprojectImageTo3D_caller_t)(const GpuMat& disp, GpuMat& xyzw, const Mat& Q, const cudaStream_t& stream); typedef void (*reprojectImageTo3D_caller_t)(const GpuMat& disp, GpuMat& xyzw, const Mat& Q, const cudaStream_t& stream);
@ -279,10 +309,14 @@ void cv::gpu::reprojectImageTo3D(const GpuMat& disp, GpuMat& xyzw, const Mat& Q,
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
// resize // resize
namespace cv { namespace gpu { namespace imgproc BEGIN_OPENCV_DEVICE_NAMESPACE
namespace resize
{ {
template <typename T> void resize_gpu(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream); template <typename T> void resize_gpu(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
}}} }
END_OPENCV_DEVICE_NAMESPACE
void cv::gpu::resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx, double fy, int interpolation, Stream& s) void cv::gpu::resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx, double fy, int interpolation, Stream& s)
{ {
@ -346,7 +380,7 @@ void cv::gpu::resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx, doub
} }
else else
{ {
using namespace cv::gpu::imgproc; using namespace OPENCV_DEVICE_NAMESPACE_ resize;
typedef void (*caller_t)(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream); typedef void (*caller_t)(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
static const caller_t callers[6][4] = static const caller_t callers[6][4] =
@ -366,18 +400,24 @@ void cv::gpu::resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx, doub
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
// copyMakeBorder // copyMakeBorder
namespace cv { namespace gpu { namespace imgproc BEGIN_OPENCV_DEVICE_NAMESPACE
namespace copy_make_border
{ {
template <typename T, int cn> void copyMakeBorder_gpu(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const T* borderValue, cudaStream_t stream); template <typename T, int cn> void copyMakeBorder_gpu(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const T* borderValue, cudaStream_t stream);
}}} }
END_OPENCV_DEVICE_NAMESPACE
namespace namespace
{ {
template <typename T, int cn> void copyMakeBorder_caller(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderType, const Scalar& value, cudaStream_t stream) template <typename T, int cn> void copyMakeBorder_caller(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderType, const Scalar& value, cudaStream_t stream)
{ {
using namespace OPENCV_DEVICE_NAMESPACE_ copy_make_border;
Scalar_<T> val(saturate_cast<T>(value[0]), saturate_cast<T>(value[1]), saturate_cast<T>(value[2]), saturate_cast<T>(value[3])); Scalar_<T> val(saturate_cast<T>(value[0]), saturate_cast<T>(value[1]), saturate_cast<T>(value[2]), saturate_cast<T>(value[3]));
imgproc::copyMakeBorder_gpu<T, cn>(src, dst, top, left, borderType, val.val, stream); copyMakeBorder_gpu<T, cn>(src, dst, top, left, borderType, val.val, stream);
} }
} }
@ -626,16 +666,22 @@ void cv::gpu::warpPerspective(const GpuMat& src, GpuMat& dst, const Mat& M, Size
////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////
// buildWarpPlaneMaps // buildWarpPlaneMaps
namespace cv { namespace gpu { namespace imgproc BEGIN_OPENCV_DEVICE_NAMESPACE
namespace imgproc
{ {
void buildWarpPlaneMaps(int tl_u, int tl_v, DevMem2Df map_x, DevMem2Df map_y, void buildWarpPlaneMaps(int tl_u, int tl_v, DevMem2Df map_x, DevMem2Df map_y,
const float k_rinv[9], const float r_kinv[9], const float t[3], float scale, const float k_rinv[9], const float r_kinv[9], const float t[3], float scale,
cudaStream_t stream); cudaStream_t stream);
}}} }
END_OPENCV_DEVICE_NAMESPACE
void cv::gpu::buildWarpPlaneMaps(Size src_size, Rect dst_roi, const Mat &K, const Mat& R, const Mat &T, void cv::gpu::buildWarpPlaneMaps(Size src_size, Rect dst_roi, const Mat &K, const Mat& R, const Mat &T,
float scale, GpuMat& map_x, GpuMat& map_y, Stream& stream) float scale, GpuMat& map_x, GpuMat& map_y, Stream& stream)
{ {
using namespace OPENCV_DEVICE_NAMESPACE_ imgproc;
CV_Assert(K.size() == Size(3,3) && K.type() == CV_32F); CV_Assert(K.size() == Size(3,3) && K.type() == CV_32F);
CV_Assert(R.size() == Size(3,3) && R.type() == CV_32F); CV_Assert(R.size() == Size(3,3) && R.type() == CV_32F);
CV_Assert((T.size() == Size(3,1) || T.size() == Size(1,3)) && T.type() == CV_32F && T.isContinuous()); CV_Assert((T.size() == Size(3,1) || T.size() == Size(1,3)) && T.type() == CV_32F && T.isContinuous());
@ -647,23 +693,29 @@ void cv::gpu::buildWarpPlaneMaps(Size src_size, Rect dst_roi, const Mat &K, cons
map_x.create(dst_roi.size(), CV_32F); map_x.create(dst_roi.size(), CV_32F);
map_y.create(dst_roi.size(), CV_32F); map_y.create(dst_roi.size(), CV_32F);
imgproc::buildWarpPlaneMaps(dst_roi.tl().x, dst_roi.tl().y, map_x, map_y, K_Rinv.ptr<float>(), R_Kinv.ptr<float>(), buildWarpPlaneMaps(dst_roi.tl().x, dst_roi.tl().y, map_x, map_y, K_Rinv.ptr<float>(), R_Kinv.ptr<float>(),
T.ptr<float>(), scale, StreamAccessor::getStream(stream)); T.ptr<float>(), scale, StreamAccessor::getStream(stream));
} }
////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////
// buildWarpCylyndricalMaps // buildWarpCylyndricalMaps
namespace cv { namespace gpu { namespace imgproc BEGIN_OPENCV_DEVICE_NAMESPACE
namespace imgproc
{ {
void buildWarpCylindricalMaps(int tl_u, int tl_v, DevMem2Df map_x, DevMem2Df map_y, void buildWarpCylindricalMaps(int tl_u, int tl_v, DevMem2Df map_x, DevMem2Df map_y,
const float k_rinv[9], const float r_kinv[9], float scale, const float k_rinv[9], const float r_kinv[9], float scale,
cudaStream_t stream); cudaStream_t stream);
}}} }
END_OPENCV_DEVICE_NAMESPACE
void cv::gpu::buildWarpCylindricalMaps(Size src_size, Rect dst_roi, const Mat &K, const Mat& R, float scale, void cv::gpu::buildWarpCylindricalMaps(Size src_size, Rect dst_roi, const Mat &K, const Mat& R, float scale,
GpuMat& map_x, GpuMat& map_y, Stream& stream) GpuMat& map_x, GpuMat& map_y, Stream& stream)
{ {
using namespace OPENCV_DEVICE_NAMESPACE_ imgproc;
CV_Assert(K.size() == Size(3,3) && K.type() == CV_32F); CV_Assert(K.size() == Size(3,3) && K.type() == CV_32F);
CV_Assert(R.size() == Size(3,3) && R.type() == CV_32F); CV_Assert(R.size() == Size(3,3) && R.type() == CV_32F);
@ -674,24 +726,29 @@ void cv::gpu::buildWarpCylindricalMaps(Size src_size, Rect dst_roi, const Mat &K
map_x.create(dst_roi.size(), CV_32F); map_x.create(dst_roi.size(), CV_32F);
map_y.create(dst_roi.size(), CV_32F); map_y.create(dst_roi.size(), CV_32F);
imgproc::buildWarpCylindricalMaps(dst_roi.tl().x, dst_roi.tl().y, map_x, map_y, K_Rinv.ptr<float>(), R_Kinv.ptr<float>(), buildWarpCylindricalMaps(dst_roi.tl().x, dst_roi.tl().y, map_x, map_y, K_Rinv.ptr<float>(), R_Kinv.ptr<float>(), scale, StreamAccessor::getStream(stream));
scale, StreamAccessor::getStream(stream));
} }
////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////
// buildWarpSphericalMaps // buildWarpSphericalMaps
namespace cv { namespace gpu { namespace imgproc BEGIN_OPENCV_DEVICE_NAMESPACE
namespace imgproc
{ {
void buildWarpSphericalMaps(int tl_u, int tl_v, DevMem2Df map_x, DevMem2Df map_y, void buildWarpSphericalMaps(int tl_u, int tl_v, DevMem2Df map_x, DevMem2Df map_y,
const float k_rinv[9], const float r_kinv[9], float scale, const float k_rinv[9], const float r_kinv[9], float scale,
cudaStream_t stream); cudaStream_t stream);
}}} }
END_OPENCV_DEVICE_NAMESPACE
void cv::gpu::buildWarpSphericalMaps(Size src_size, Rect dst_roi, const Mat &K, const Mat& R, float scale, void cv::gpu::buildWarpSphericalMaps(Size src_size, Rect dst_roi, const Mat &K, const Mat& R, float scale,
GpuMat& map_x, GpuMat& map_y, Stream& stream) GpuMat& map_x, GpuMat& map_y, Stream& stream)
{ {
using namespace OPENCV_DEVICE_NAMESPACE_ imgproc;
CV_Assert(K.size() == Size(3,3) && K.type() == CV_32F); CV_Assert(K.size() == Size(3,3) && K.type() == CV_32F);
CV_Assert(R.size() == Size(3,3) && R.type() == CV_32F); CV_Assert(R.size() == Size(3,3) && R.type() == CV_32F);
@ -702,8 +759,7 @@ void cv::gpu::buildWarpSphericalMaps(Size src_size, Rect dst_roi, const Mat &K,
map_x.create(dst_roi.size(), CV_32F); map_x.create(dst_roi.size(), CV_32F);
map_y.create(dst_roi.size(), CV_32F); map_y.create(dst_roi.size(), CV_32F);
imgproc::buildWarpSphericalMaps(dst_roi.tl().x, dst_roi.tl().y, map_x, map_y, K_Rinv.ptr<float>(), R_Kinv.ptr<float>(), buildWarpSphericalMaps(dst_roi.tl().x, dst_roi.tl().y, map_x, map_y, K_Rinv.ptr<float>(), R_Kinv.ptr<float>(), scale, StreamAccessor::getStream(stream));
scale, StreamAccessor::getStream(stream));
} }
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
@ -843,17 +899,24 @@ void cv::gpu::sqrIntegral(const GpuMat& src, GpuMat& sqsum, Stream& s)
////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////
// columnSum // columnSum
namespace cv { namespace gpu { namespace imgproc BEGIN_OPENCV_DEVICE_NAMESPACE
namespace imgproc
{ {
void columnSum_32F(const DevMem2Db src, const DevMem2Db dst); void columnSum_32F(const DevMem2Db src, const DevMem2Db dst);
}}} }
END_OPENCV_DEVICE_NAMESPACE
void cv::gpu::columnSum(const GpuMat& src, GpuMat& dst) void cv::gpu::columnSum(const GpuMat& src, GpuMat& dst)
{ {
using namespace OPENCV_DEVICE_NAMESPACE_ imgproc;
CV_Assert(src.type() == CV_32F); CV_Assert(src.type() == CV_32F);
dst.create(src.size(), CV_32F); dst.create(src.size(), CV_32F);
imgproc::columnSum_32F(src, dst);
columnSum_32F(src, dst);
} }
void cv::gpu::rectStdDev(const GpuMat& src, const GpuMat& sqr, GpuMat& dst, const Rect& rect, Stream& s) void cv::gpu::rectStdDev(const GpuMat& src, const GpuMat& sqr, GpuMat& dst, const Rect& rect, Stream& s)
@ -1140,7 +1203,6 @@ void cv::gpu::histRange(const GpuMat& src, GpuMat& hist, const GpuMat& levels, S
histRange(src, hist, levels, buf, stream); histRange(src, hist, levels, buf, stream);
} }
void cv::gpu::histRange(const GpuMat& src, GpuMat& hist, const GpuMat& levels, GpuMat& buf, Stream& stream) void cv::gpu::histRange(const GpuMat& src, GpuMat& hist, const GpuMat& levels, GpuMat& buf, Stream& stream)
{ {
CV_Assert(src.type() == CV_8UC1 || src.type() == CV_16UC1 || src.type() == CV_16SC1 || src.type() == CV_32FC1); CV_Assert(src.type() == CV_8UC1 || src.type() == CV_16UC1 || src.type() == CV_16SC1 || src.type() == CV_32FC1);
@ -1183,13 +1245,19 @@ void cv::gpu::histRange(const GpuMat& src, GpuMat hist[4], const GpuMat levels[4
hist_callers[src.depth()](src, hist, levels, buf, StreamAccessor::getStream(stream)); hist_callers[src.depth()](src, hist, levels, buf, StreamAccessor::getStream(stream));
} }
namespace cv { namespace gpu { namespace histograms BEGIN_OPENCV_DEVICE_NAMESPACE
namespace hist
{ {
void histogram256_gpu(DevMem2Db src, int* hist, unsigned int* buf, cudaStream_t stream); void histogram256_gpu(DevMem2Db src, int* hist, unsigned int* buf, cudaStream_t stream);
const int PARTIAL_HISTOGRAM256_COUNT = 240; const int PARTIAL_HISTOGRAM256_COUNT = 240;
const int HISTOGRAM256_BIN_COUNT = 256; const int HISTOGRAM256_BIN_COUNT = 256;
}}}
void equalizeHist_gpu(DevMem2Db src, DevMem2Db dst, const int* lut, cudaStream_t stream);
}
END_OPENCV_DEVICE_NAMESPACE
void cv::gpu::calcHist(const GpuMat& src, GpuMat& hist, Stream& stream) void cv::gpu::calcHist(const GpuMat& src, GpuMat& hist, Stream& stream)
{ {
@ -1199,7 +1267,7 @@ void cv::gpu::calcHist(const GpuMat& src, GpuMat& hist, Stream& stream)
void cv::gpu::calcHist(const GpuMat& src, GpuMat& hist, GpuMat& buf, Stream& stream) void cv::gpu::calcHist(const GpuMat& src, GpuMat& hist, GpuMat& buf, Stream& stream)
{ {
using namespace cv::gpu::histograms; using namespace OPENCV_DEVICE_NAMESPACE_ hist;
CV_Assert(src.type() == CV_8UC1); CV_Assert(src.type() == CV_8UC1);
@ -1223,14 +1291,9 @@ void cv::gpu::equalizeHist(const GpuMat& src, GpuMat& dst, GpuMat& hist, Stream&
equalizeHist(src, dst, hist, buf, stream); equalizeHist(src, dst, hist, buf, stream);
} }
namespace cv { namespace gpu { namespace histograms
{
void equalizeHist_gpu(DevMem2Db src, DevMem2Db dst, const int* lut, cudaStream_t stream);
}}}
void cv::gpu::equalizeHist(const GpuMat& src, GpuMat& dst, GpuMat& hist, GpuMat& buf, Stream& s) void cv::gpu::equalizeHist(const GpuMat& src, GpuMat& dst, GpuMat& hist, GpuMat& buf, Stream& s)
{ {
using namespace cv::gpu::histograms; using namespace OPENCV_DEVICE_NAMESPACE_ hist;
CV_Assert(src.type() == CV_8UC1); CV_Assert(src.type() == CV_8UC1);
@ -1264,13 +1327,16 @@ void cv::gpu::equalizeHist(const GpuMat& src, GpuMat& dst, GpuMat& hist, GpuMat&
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
// cornerHarris & minEgenVal // cornerHarris & minEgenVal
namespace cv { namespace gpu { namespace imgproc { BEGIN_OPENCV_DEVICE_NAMESPACE
namespace imgproc
{
void extractCovData_caller(const DevMem2Df Dx, const DevMem2Df Dy, PtrStepf dst, cudaStream_t stream); void extractCovData_caller(const DevMem2Df Dx, const DevMem2Df Dy, PtrStepf dst, cudaStream_t stream);
void cornerHarris_caller(const int block_size, const float k, const DevMem2Db Dx, const DevMem2Db Dy, DevMem2Db dst, int border_type, cudaStream_t stream); void cornerHarris_caller(const int block_size, const float k, const DevMem2Db Dx, const DevMem2Db Dy, DevMem2Db dst, int border_type, cudaStream_t stream);
void cornerMinEigenVal_caller(const int block_size, const DevMem2Db Dx, const DevMem2Db Dy, DevMem2Db dst, int border_type, cudaStream_t stream); void cornerMinEigenVal_caller(const int block_size, const DevMem2Db Dx, const DevMem2Db Dy, DevMem2Db dst, int border_type, cudaStream_t stream);
}
}}} END_OPENCV_DEVICE_NAMESPACE
namespace namespace
{ {
@ -1316,7 +1382,6 @@ namespace
} // Anonymous namespace } // Anonymous namespace
bool cv::gpu::tryConvertToGpuBorderType(int cpuBorderType, int& gpuBorderType) bool cv::gpu::tryConvertToGpuBorderType(int cpuBorderType, int& gpuBorderType)
{ {
switch (cpuBorderType) switch (cpuBorderType)
@ -1356,6 +1421,8 @@ void cv::gpu::cornerHarris(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& D
void cv::gpu::cornerHarris(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, GpuMat& buf, int blockSize, int ksize, double k, int borderType, Stream& stream) void cv::gpu::cornerHarris(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, GpuMat& buf, int blockSize, int ksize, double k, int borderType, Stream& stream)
{ {
using namespace OPENCV_DEVICE_NAMESPACE_ imgproc;
CV_Assert(borderType == cv::BORDER_REFLECT101 || CV_Assert(borderType == cv::BORDER_REFLECT101 ||
borderType == cv::BORDER_REPLICATE); borderType == cv::BORDER_REPLICATE);
@ -1364,7 +1431,7 @@ void cv::gpu::cornerHarris(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& D
extractCovData(src, Dx, Dy, buf, blockSize, ksize, borderType, stream); extractCovData(src, Dx, Dy, buf, blockSize, ksize, borderType, stream);
dst.create(src.size(), CV_32F); dst.create(src.size(), CV_32F);
imgproc::cornerHarris_caller(blockSize, (float)k, Dx, Dy, dst, gpuBorderType, StreamAccessor::getStream(stream)); cornerHarris_caller(blockSize, (float)k, Dx, Dy, dst, gpuBorderType, StreamAccessor::getStream(stream));
} }
void cv::gpu::cornerMinEigenVal(const GpuMat& src, GpuMat& dst, int blockSize, int ksize, int borderType) void cv::gpu::cornerMinEigenVal(const GpuMat& src, GpuMat& dst, int blockSize, int ksize, int borderType)
@ -1381,6 +1448,8 @@ void cv::gpu::cornerMinEigenVal(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuM
void cv::gpu::cornerMinEigenVal(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, GpuMat& buf, int blockSize, int ksize, int borderType, Stream& stream) void cv::gpu::cornerMinEigenVal(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, GpuMat& buf, int blockSize, int ksize, int borderType, Stream& stream)
{ {
using namespace OPENCV_DEVICE_NAMESPACE_ imgproc;
CV_Assert(borderType == cv::BORDER_REFLECT101 || CV_Assert(borderType == cv::BORDER_REFLECT101 ||
borderType == cv::BORDER_REPLICATE); borderType == cv::BORDER_REPLICATE);
@ -1389,24 +1458,30 @@ void cv::gpu::cornerMinEigenVal(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuM
extractCovData(src, Dx, Dy, buf, blockSize, ksize, borderType, stream); extractCovData(src, Dx, Dy, buf, blockSize, ksize, borderType, stream);
dst.create(src.size(), CV_32F); dst.create(src.size(), CV_32F);
imgproc::cornerMinEigenVal_caller(blockSize, Dx, Dy, dst, gpuBorderType, StreamAccessor::getStream(stream)); cornerMinEigenVal_caller(blockSize, Dx, Dy, dst, gpuBorderType, StreamAccessor::getStream(stream));
} }
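A usage sketch for the Harris path, assuming the convenience overload without explicit Dx/Dy/buf arguments is kept and assuming a hypothetical input file "chessboard.png"; only BORDER_REFLECT101 and BORDER_REPLICATE pass the asserts above:

#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/gpu/gpu.hpp>

int main()
{
    cv::Mat host = cv::imread("chessboard.png", 0);       // 8-bit grayscale
    CV_Assert(!host.empty());

    cv::gpu::GpuMat d_src(host), d_response;
    // blockSize = 5, Sobel aperture = 3, Harris k = 0.04
    cv::gpu::cornerHarris(d_src, d_response, 5, 3, 0.04, cv::BORDER_REFLECT101);

    cv::Mat response;
    d_response.download(response);                         // CV_32F response map, per dst.create(..., CV_32F)
    return 0;
}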
////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////
// mulSpectrums // mulSpectrums
namespace cv { namespace gpu { namespace imgproc BEGIN_OPENCV_DEVICE_NAMESPACE
namespace imgproc
{ {
void mulSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, DevMem2D_<cufftComplex> c, cudaStream_t stream); void mulSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, DevMem2D_<cufftComplex> c, cudaStream_t stream);
void mulSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, DevMem2D_<cufftComplex> c, cudaStream_t stream); void mulSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, DevMem2D_<cufftComplex> c, cudaStream_t stream);
}}} }
END_OPENCV_DEVICE_NAMESPACE
void cv::gpu::mulSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, int flags, bool conjB, Stream& stream) void cv::gpu::mulSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, int flags, bool conjB, Stream& stream)
{ {
using namespace OPENCV_DEVICE_NAMESPACE_ imgproc;
typedef void (*Caller)(const PtrStep<cufftComplex>, const PtrStep<cufftComplex>, DevMem2D_<cufftComplex>, cudaStream_t stream); typedef void (*Caller)(const PtrStep<cufftComplex>, const PtrStep<cufftComplex>, DevMem2D_<cufftComplex>, cudaStream_t stream);
static Caller callers[] = { imgproc::mulSpectrums, imgproc::mulSpectrums_CONJ };
static Caller callers[] = { mulSpectrums, mulSpectrums_CONJ };
CV_Assert(a.type() == b.type() && a.type() == CV_32FC2); CV_Assert(a.type() == b.type() && a.type() == CV_32FC2);
CV_Assert(a.size() == b.size()); CV_Assert(a.size() == b.size());
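A short sketch of the spectrum multiplication, assuming the stream argument defaults to Stream::Null(); both inputs must be packed-complex CV_32FC2 per the asserts above, and random data stands in for real DFT output here:

#include <opencv2/core/core.hpp>
#include <opencv2/gpu/gpu.hpp>

int main()
{
    cv::Mat a(128, 128, CV_32FC2), b(128, 128, CV_32FC2);
    cv::randu(a, cv::Scalar::all(-1), cv::Scalar::all(1)); // stand-in spectra, same size and type
    cv::randu(b, cv::Scalar::all(-1), cv::Scalar::all(1));

    cv::gpu::GpuMat d_a(a), d_b(b), d_c;
    cv::gpu::mulSpectrums(d_a, d_b, d_c, 0, true);          // conjB = true selects the _CONJ caller

    cv::Mat c;
    d_c.download(c);                                        // per-element complex products
    return 0;
}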
@ -1420,18 +1495,23 @@ void cv::gpu::mulSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, int flag
////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////
// mulAndScaleSpectrums // mulAndScaleSpectrums
namespace cv { namespace gpu { namespace imgproc BEGIN_OPENCV_DEVICE_NAMESPACE
namespace imgproc
{ {
void mulAndScaleSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, DevMem2D_<cufftComplex> c, cudaStream_t stream); void mulAndScaleSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, DevMem2D_<cufftComplex> c, cudaStream_t stream);
void mulAndScaleSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, DevMem2D_<cufftComplex> c, cudaStream_t stream); void mulAndScaleSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, DevMem2D_<cufftComplex> c, cudaStream_t stream);
}}} }
END_OPENCV_DEVICE_NAMESPACE
void cv::gpu::mulAndScaleSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, int flags, float scale, bool conjB, Stream& stream) void cv::gpu::mulAndScaleSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, int flags, float scale, bool conjB, Stream& stream)
{ {
using namespace OPENCV_DEVICE_NAMESPACE_ imgproc;
typedef void (*Caller)(const PtrStep<cufftComplex>, const PtrStep<cufftComplex>, float scale, DevMem2D_<cufftComplex>, cudaStream_t stream); typedef void (*Caller)(const PtrStep<cufftComplex>, const PtrStep<cufftComplex>, float scale, DevMem2D_<cufftComplex>, cudaStream_t stream);
static Caller callers[] = { imgproc::mulAndScaleSpectrums, imgproc::mulAndScaleSpectrums_CONJ }; static Caller callers[] = { mulAndScaleSpectrums, mulAndScaleSpectrums_CONJ };
CV_Assert(a.type() == b.type() && a.type() == CV_32FC2); CV_Assert(a.type() == b.type() && a.type() == CV_32FC2);
CV_Assert(a.size() == b.size()); CV_Assert(a.size() == b.size());
@ -1593,13 +1673,19 @@ void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result,
convolve(image, templ, result, ccorr, buf); convolve(image, templ, result, ccorr, buf);
} }
namespace cv { namespace gpu { namespace imgproc BEGIN_OPENCV_DEVICE_NAMESPACE
namespace imgproc
{ {
void convolve_gpu(const DevMem2Df& src, const PtrStepf& dst, int kWidth, int kHeight, float* kernel, cudaStream_t stream); void convolve_gpu(const DevMem2Df& src, const PtrStepf& dst, int kWidth, int kHeight, float* kernel, cudaStream_t stream);
}}} }
END_OPENCV_DEVICE_NAMESPACE
void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, bool ccorr, ConvolveBuf& buf, Stream& stream) void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, bool ccorr, ConvolveBuf& buf, Stream& stream)
{ {
using namespace OPENCV_DEVICE_NAMESPACE_ imgproc;
#ifndef HAVE_CUFFT #ifndef HAVE_CUFFT
CV_Assert(image.type() == CV_32F); CV_Assert(image.type() == CV_32F);
@ -1622,7 +1708,7 @@ void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result,
templ.copyTo(contKernel); templ.copyTo(contKernel);
} }
imgproc::convolve_gpu(image, result, templ.cols, templ.rows, contKernel.ptr<float>(), StreamAccessor::getStream(stream)); convolve_gpu(image, result, templ.cols, templ.rows, contKernel.ptr<float>(), StreamAccessor::getStream(stream));
#else #else
@ -1650,7 +1736,7 @@ void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result,
templ.copyTo(contKernel); templ.copyTo(contKernel);
} }
imgproc::convolve_gpu(image, result, templ.cols, templ.rows, contKernel.ptr<float>(), StreamAccessor::getStream(stream)); convolve_gpu(image, result, templ.cols, templ.rows, contKernel.ptr<float>(), StreamAccessor::getStream(stream));
} }
else else
{ {
@ -1725,14 +1811,18 @@ void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result,
////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////
// pyrDown // pyrDown
namespace cv { namespace gpu { namespace imgproc BEGIN_OPENCV_DEVICE_NAMESPACE
namespace pyr_down
{ {
template <typename T, int cn> void pyrDown_gpu(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream); template <typename T, int cn> void pyrDown_gpu(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
}}} }
END_OPENCV_DEVICE_NAMESPACE
void cv::gpu::pyrDown(const GpuMat& src, GpuMat& dst, int borderType, Stream& stream) void cv::gpu::pyrDown(const GpuMat& src, GpuMat& dst, int borderType, Stream& stream)
{ {
using namespace cv::gpu::imgproc; using namespace OPENCV_DEVICE_NAMESPACE_ pyr_down;
typedef void (*func_t)(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream); typedef void (*func_t)(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
@ -1761,14 +1851,18 @@ void cv::gpu::pyrDown(const GpuMat& src, GpuMat& dst, int borderType, Stream& st
////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////
// pyrUp // pyrUp
namespace cv { namespace gpu { namespace imgproc BEGIN_OPENCV_DEVICE_NAMESPACE
namespace pyr_up
{ {
template <typename T, int cn> void pyrUp_gpu(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream); template <typename T, int cn> void pyrUp_gpu(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
}}} }
END_OPENCV_DEVICE_NAMESPACE
void cv::gpu::pyrUp(const GpuMat& src, GpuMat& dst, int borderType, Stream& stream) void cv::gpu::pyrUp(const GpuMat& src, GpuMat& dst, int borderType, Stream& stream)
{ {
using namespace cv::gpu::imgproc; using namespace OPENCV_DEVICE_NAMESPACE_ pyr_up;
typedef void (*func_t)(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream); typedef void (*func_t)(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
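A pyramid round-trip sketch, assuming the borderType and stream parameters have defaults and assuming a hypothetical input file "frame.png":

#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/gpu/gpu.hpp>

int main()
{
    cv::Mat host = cv::imread("frame.png");      // CV_8UC3 is covered by the template dispatch tables
    CV_Assert(!host.empty());

    cv::gpu::GpuMat d_src(host), d_half, d_up;
    cv::gpu::pyrDown(d_src, d_half);              // blur + drop every other row/column
    cv::gpu::pyrUp(d_half, d_up);                 // upsample back to roughly the original size

    cv::Mat out;
    d_up.download(out);
    return 0;
}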
@ -1839,7 +1933,9 @@ void cv::gpu::CannyBuf::release()
trackBuf2.release(); trackBuf2.release();
} }
namespace cv { namespace gpu { namespace canny BEGIN_OPENCV_DEVICE_NAMESPACE
namespace canny
{ {
void calcSobelRowPass_gpu(PtrStepb src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols); void calcSobelRowPass_gpu(PtrStepb src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols);
@ -1853,13 +1949,15 @@ namespace cv { namespace gpu { namespace canny
void edgesHysteresisGlobal_gpu(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols); void edgesHysteresisGlobal_gpu(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols);
void getEdges_gpu(PtrStepi map, PtrStepb dst, int rows, int cols); void getEdges_gpu(PtrStepi map, PtrStepb dst, int rows, int cols);
}}} }
END_OPENCV_DEVICE_NAMESPACE
namespace namespace
{ {
void CannyCaller(CannyBuf& buf, GpuMat& dst, float low_thresh, float high_thresh) void CannyCaller(CannyBuf& buf, GpuMat& dst, float low_thresh, float high_thresh)
{ {
using namespace cv::gpu::canny; using namespace OPENCV_DEVICE_NAMESPACE_ canny;
calcMap_gpu(buf.dx, buf.dy, buf.edgeBuf, buf.edgeBuf, dst.rows, dst.cols, low_thresh, high_thresh); calcMap_gpu(buf.dx, buf.dy, buf.edgeBuf, buf.edgeBuf, dst.rows, dst.cols, low_thresh, high_thresh);
@ -1879,7 +1977,7 @@ void cv::gpu::Canny(const GpuMat& src, GpuMat& dst, double low_thresh, double hi
void cv::gpu::Canny(const GpuMat& src, CannyBuf& buf, GpuMat& dst, double low_thresh, double high_thresh, int apperture_size, bool L2gradient) void cv::gpu::Canny(const GpuMat& src, CannyBuf& buf, GpuMat& dst, double low_thresh, double high_thresh, int apperture_size, bool L2gradient)
{ {
using namespace cv::gpu::canny; using namespace OPENCV_DEVICE_NAMESPACE_ canny;
CV_Assert(TargetArchs::builtWith(SHARED_ATOMICS) && DeviceInfo().supports(SHARED_ATOMICS)); CV_Assert(TargetArchs::builtWith(SHARED_ATOMICS) && DeviceInfo().supports(SHARED_ATOMICS));
CV_Assert(src.type() == CV_8UC1); CV_Assert(src.type() == CV_8UC1);
@ -1918,7 +2016,7 @@ void cv::gpu::Canny(const GpuMat& dx, const GpuMat& dy, GpuMat& dst, double low_
void cv::gpu::Canny(const GpuMat& dx, const GpuMat& dy, CannyBuf& buf, GpuMat& dst, double low_thresh, double high_thresh, bool L2gradient) void cv::gpu::Canny(const GpuMat& dx, const GpuMat& dy, CannyBuf& buf, GpuMat& dst, double low_thresh, double high_thresh, bool L2gradient)
{ {
using namespace cv::gpu::canny; using namespace OPENCV_DEVICE_NAMESPACE_ canny;
CV_Assert(TargetArchs::builtWith(SHARED_ATOMICS) && DeviceInfo().supports(SHARED_ATOMICS)); CV_Assert(TargetArchs::builtWith(SHARED_ATOMICS) && DeviceInfo().supports(SHARED_ATOMICS));
CV_Assert(dx.type() == CV_32SC1 && dy.type() == CV_32SC1 && dx.size() == dy.size()); CV_Assert(dx.type() == CV_32SC1 && dy.type() == CV_32SC1 && dx.size() == dy.size());
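A Canny sketch, assuming the apperture_size and L2gradient parameters keep their usual defaults (3 and false) and assuming a hypothetical input file "building.png"; the asserts above additionally require a device built with shared-memory atomics:

#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/gpu/gpu.hpp>

int main()
{
    cv::Mat host = cv::imread("building.png", 0); // CV_8UC1 input is mandatory here
    CV_Assert(!host.empty());

    cv::gpu::GpuMat d_gray(host), d_edges;
    cv::gpu::Canny(d_gray, d_edges, 50.0, 100.0); // low/high hysteresis thresholds

    cv::Mat edges;
    d_edges.download(edges);
    return 0;
}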
@ -271,5 +271,380 @@ void cv::gpu::DeviceInfo::queryMemory(size_t& free_memory, size_t& total_memory)
setDevice(prev_device_id); setDevice(prev_device_id);
} }
////////////////////////////////////////////////////////////////////
// GpuFuncTable
BEGIN_OPENCV_DEVICE_NAMESPACE
void copy_to_with_mask(const DevMem2Db& src, DevMem2Db dst, int depth, const DevMem2Db& mask, int channels, const cudaStream_t& stream = 0);
template <typename T>
void set_to_gpu(const DevMem2Db& mat, const T* scalar, int channels, cudaStream_t stream);
template <typename T>
void set_to_gpu(const DevMem2Db& mat, const T* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);
void convert_gpu(const DevMem2Db& src, int sdepth, const DevMem2Db& dst, int ddepth, double alpha, double beta, cudaStream_t stream = 0);
END_OPENCV_DEVICE_NAMESPACE
namespace
{
//////////////////////////////////////////////////////////////////////////
// Convert
template<int n> struct NPPTypeTraits;
template<> struct NPPTypeTraits<CV_8U> { typedef Npp8u npp_type; };
template<> struct NPPTypeTraits<CV_16U> { typedef Npp16u npp_type; };
template<> struct NPPTypeTraits<CV_16S> { typedef Npp16s npp_type; };
template<> struct NPPTypeTraits<CV_32S> { typedef Npp32s npp_type; };
template<> struct NPPTypeTraits<CV_32F> { typedef Npp32f npp_type; };
template<int SDEPTH, int DDEPTH> struct NppConvertFunc
{
typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
typedef typename NPPTypeTraits<DDEPTH>::npp_type dst_t;
typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, dst_t* pDst, int nDstStep, NppiSize oSizeROI);
};
template<int DDEPTH> struct NppConvertFunc<CV_32F, DDEPTH>
{
typedef typename NPPTypeTraits<DDEPTH>::npp_type dst_t;
typedef NppStatus (*func_ptr)(const Npp32f* pSrc, int nSrcStep, dst_t* pDst, int nDstStep, NppiSize oSizeROI, NppRoundMode eRoundMode);
};
template<int SDEPTH, int DDEPTH, typename NppConvertFunc<SDEPTH, DDEPTH>::func_ptr func> struct NppCvt
{
typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
typedef typename NPPTypeTraits<DDEPTH>::npp_type dst_t;
static void cvt(const GpuMat& src, GpuMat& dst)
{
NppiSize sz;
sz.width = src.cols;
sz.height = src.rows;
nppSafeCall( func(src.ptr<src_t>(), static_cast<int>(src.step), dst.ptr<dst_t>(), static_cast<int>(dst.step), sz) );
cudaSafeCall( cudaDeviceSynchronize() );
}
};
template<int DDEPTH, typename NppConvertFunc<CV_32F, DDEPTH>::func_ptr func> struct NppCvt<CV_32F, DDEPTH, func>
{
typedef typename NPPTypeTraits<DDEPTH>::npp_type dst_t;
static void cvt(const GpuMat& src, GpuMat& dst)
{
NppiSize sz;
sz.width = src.cols;
sz.height = src.rows;
nppSafeCall( func(src.ptr<Npp32f>(), static_cast<int>(src.step), dst.ptr<dst_t>(), static_cast<int>(dst.step), sz, NPP_RND_NEAR) );
cudaSafeCall( cudaDeviceSynchronize() );
}
};
void convertToKernelCaller(const GpuMat& src, GpuMat& dst)
{
OPENCV_DEVICE_NAMESPACE_ convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), 1.0, 0.0);
}
//////////////////////////////////////////////////////////////////////////
// Set
template<int SDEPTH, int SCN> struct NppSetFunc
{
typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
typedef NppStatus (*func_ptr)(const src_t values[], src_t* pSrc, int nSrcStep, NppiSize oSizeROI);
};
template<int SDEPTH> struct NppSetFunc<SDEPTH, 1>
{
typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
typedef NppStatus (*func_ptr)(src_t val, src_t* pSrc, int nSrcStep, NppiSize oSizeROI);
};
template<int SDEPTH, int SCN, typename NppSetFunc<SDEPTH, SCN>::func_ptr func> struct NppSet
{
typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
static void set(GpuMat& src, Scalar s)
{
NppiSize sz;
sz.width = src.cols;
sz.height = src.rows;
Scalar_<src_t> nppS = s;
nppSafeCall( func(nppS.val, src.ptr<src_t>(), static_cast<int>(src.step), sz) );
cudaSafeCall( cudaDeviceSynchronize() );
}
};
template<int SDEPTH, typename NppSetFunc<SDEPTH, 1>::func_ptr func> struct NppSet<SDEPTH, 1, func>
{
typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
static void set(GpuMat& src, Scalar s)
{
NppiSize sz;
sz.width = src.cols;
sz.height = src.rows;
Scalar_<src_t> nppS = s;
nppSafeCall( func(nppS[0], src.ptr<src_t>(), static_cast<int>(src.step), sz) );
cudaSafeCall( cudaDeviceSynchronize() );
}
};
template <typename T>
void kernelSet(GpuMat& src, Scalar s)
{
Scalar_<T> sf = s;
OPENCV_DEVICE_NAMESPACE_ set_to_gpu(src, sf.val, src.channels(), 0);
}
template<int SDEPTH, int SCN> struct NppSetMaskFunc
{
typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
typedef NppStatus (*func_ptr)(const src_t values[], src_t* pSrc, int nSrcStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep);
};
template<int SDEPTH> struct NppSetMaskFunc<SDEPTH, 1>
{
typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
typedef NppStatus (*func_ptr)(src_t val, src_t* pSrc, int nSrcStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep);
};
template<int SDEPTH, int SCN, typename NppSetMaskFunc<SDEPTH, SCN>::func_ptr func> struct NppSetMask
{
typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
static void set(GpuMat& src, Scalar s, const GpuMat& mask)
{
NppiSize sz;
sz.width = src.cols;
sz.height = src.rows;
Scalar_<src_t> nppS = s;
nppSafeCall( func(nppS.val, src.ptr<src_t>(), static_cast<int>(src.step), sz, mask.ptr<Npp8u>(), static_cast<int>(mask.step)) );
cudaSafeCall( cudaDeviceSynchronize() );
}
};
template<int SDEPTH, typename NppSetMaskFunc<SDEPTH, 1>::func_ptr func> struct NppSetMask<SDEPTH, 1, func>
{
typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
static void set(GpuMat& src, Scalar s, const GpuMat& mask)
{
NppiSize sz;
sz.width = src.cols;
sz.height = src.rows;
Scalar_<src_t> nppS = s;
nppSafeCall( func(nppS[0], src.ptr<src_t>(), static_cast<int>(src.step), sz, mask.ptr<Npp8u>(), static_cast<int>(mask.step)) );
cudaSafeCall( cudaDeviceSynchronize() );
}
};
template <typename T>
void kernelSetMask(GpuMat& src, Scalar s, const GpuMat& mask)
{
Scalar_<T> sf = s;
OPENCV_DEVICE_NAMESPACE_ set_to_gpu(src, sf.val, mask, src.channels(), 0);
}
class CudaFuncTable : public GpuFuncTable
{
public:
void copy(const Mat& src, GpuMat& dst) const
{
cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyHostToDevice) );
}
void copy(const GpuMat& src, Mat& dst) const
{
cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyDeviceToHost) );
}
void copy(const GpuMat& src, GpuMat& dst) const
{
cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyDeviceToDevice) );
}
void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask) const
{
OPENCV_DEVICE_NAMESPACE_ copy_to_with_mask(src, dst, src.depth(), mask, src.channels());
}
void convert(const GpuMat& src, GpuMat& dst) const
{
typedef void (*caller_t)(const GpuMat& src, GpuMat& dst);
static const caller_t callers[7][7][7] =
{
{
/* 8U -> 8U */ {0, 0, 0, 0},
/* 8U -> 8S */ {convertToKernelCaller, convertToKernelCaller, convertToKernelCaller, convertToKernelCaller},
/* 8U -> 16U */ {NppCvt<CV_8U, CV_16U, nppiConvert_8u16u_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,NppCvt<CV_8U, CV_16U, nppiConvert_8u16u_C4R>::cvt},
/* 8U -> 16S */ {NppCvt<CV_8U, CV_16S, nppiConvert_8u16s_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,NppCvt<CV_8U, CV_16S, nppiConvert_8u16s_C4R>::cvt},
/* 8U -> 32S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 8U -> 32F */ {NppCvt<CV_8U, CV_32F, nppiConvert_8u32f_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 8U -> 64F */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller}
},
{
/* 8S -> 8U */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 8S -> 8S */ {0,0,0,0},
/* 8S -> 16U */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 8S -> 16S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 8S -> 32S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 8S -> 32F */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 8S -> 64F */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller}
},
{
/* 16U -> 8U */ {NppCvt<CV_16U, CV_8U, nppiConvert_16u8u_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,NppCvt<CV_16U, CV_8U, nppiConvert_16u8u_C4R>::cvt},
/* 16U -> 8S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 16U -> 16U */ {0,0,0,0},
/* 16U -> 16S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 16U -> 32S */ {NppCvt<CV_16U, CV_32S, nppiConvert_16u32s_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 16U -> 32F */ {NppCvt<CV_16U, CV_32F, nppiConvert_16u32f_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 16U -> 64F */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller}
},
{
/* 16S -> 8U */ {NppCvt<CV_16S, CV_8U, nppiConvert_16s8u_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,NppCvt<CV_16S, CV_8U, nppiConvert_16s8u_C4R>::cvt},
/* 16S -> 8S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 16S -> 16U */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 16S -> 16S */ {0,0,0,0},
/* 16S -> 32S */ {NppCvt<CV_16S, CV_32S, nppiConvert_16s32s_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 16S -> 32F */ {NppCvt<CV_16S, CV_32F, nppiConvert_16s32f_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 16S -> 64F */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller}
},
{
/* 32S -> 8U */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 32S -> 8S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 32S -> 16U */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 32S -> 16S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 32S -> 32S */ {0,0,0,0},
/* 32S -> 32F */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 32S -> 64F */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller}
},
{
/* 32F -> 8U */ {NppCvt<CV_32F, CV_8U, nppiConvert_32f8u_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 32F -> 8S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 32F -> 16U */ {NppCvt<CV_32F, CV_16U, nppiConvert_32f16u_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 32F -> 16S */ {NppCvt<CV_32F, CV_16S, nppiConvert_32f16s_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 32F -> 32S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 32F -> 32F */ {0,0,0,0},
/* 32F -> 64F */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller}
},
{
/* 64F -> 8U */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 64F -> 8S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 64F -> 16U */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 64F -> 16S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 64F -> 32S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 64F -> 32F */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 64F -> 64F */ {0,0,0,0}
}
};
caller_t func = callers[src.depth()][dst.depth()][src.channels() - 1];
CV_DbgAssert(func != 0);
func(src, dst);
}
void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta) const
{
device::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), alpha, beta);
}
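A sketch of how this conversion table is reached from user code, assuming GpuMat::convertTo forwards to the function table installed below; depth/channel pairs covered by NPP take the NppCvt entries, everything else falls back to convertToKernelCaller, and a non-trivial alpha/beta goes through convert_gpu:

#include <opencv2/core/core.hpp>
#include <opencv2/gpu/gpu.hpp>

int main()
{
    cv::Mat host(480, 640, CV_8UC1);
    cv::randu(host, cv::Scalar::all(0), cv::Scalar::all(256));

    cv::gpu::GpuMat d_8u(host), d_16u, d_32f;
    d_8u.convertTo(d_16u, CV_16U);                // unscaled 8U -> 16U, C1: NppCvt<CV_8U, CV_16U, ...>
    d_8u.convertTo(d_32f, CV_32F, 1.0 / 255.0);   // scaled conversion: convert_gpu(alpha = 1/255, beta = 0)

    cv::Mat back;
    d_32f.download(back);
    return 0;
}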
void setTo(GpuMat& m, Scalar s, const GpuMat& mask) const
{
NppiSize sz;
sz.width = m.cols;
sz.height = m.rows;
if (mask.empty())
{
if (s[0] == 0.0 && s[1] == 0.0 && s[2] == 0.0 && s[3] == 0.0)
{
cudaSafeCall( cudaMemset2D(m.data, m.step, 0, m.cols * m.elemSize(), m.rows) );
return;
}
if (m.depth() == CV_8U)
{
int cn = m.channels();
if (cn == 1 || (cn == 2 && s[0] == s[1]) || (cn == 3 && s[0] == s[1] && s[0] == s[2]) || (cn == 4 && s[0] == s[1] && s[0] == s[2] && s[0] == s[3]))
{
int val = saturate_cast<uchar>(s[0]);
cudaSafeCall( cudaMemset2D(m.data, m.step, val, m.cols * m.elemSize(), m.rows) );
return;
}
}
typedef void (*caller_t)(GpuMat& src, Scalar s);
static const caller_t callers[7][4] =
{
{NppSet<CV_8U, 1, nppiSet_8u_C1R>::set,kernelSet<uchar>,kernelSet<uchar>,NppSet<CV_8U, 4, nppiSet_8u_C4R>::set},
{kernelSet<schar>,kernelSet<schar>,kernelSet<schar>,kernelSet<schar>},
{NppSet<CV_16U, 1, nppiSet_16u_C1R>::set,NppSet<CV_16U, 2, nppiSet_16u_C2R>::set,kernelSet<ushort>,NppSet<CV_16U, 4, nppiSet_16u_C4R>::set},
{NppSet<CV_16S, 1, nppiSet_16s_C1R>::set,NppSet<CV_16S, 2, nppiSet_16s_C2R>::set,kernelSet<short>,NppSet<CV_16S, 4, nppiSet_16s_C4R>::set},
{NppSet<CV_32S, 1, nppiSet_32s_C1R>::set,kernelSet<int>,kernelSet<int>,NppSet<CV_32S, 4, nppiSet_32s_C4R>::set},
{NppSet<CV_32F, 1, nppiSet_32f_C1R>::set,kernelSet<float>,kernelSet<float>,NppSet<CV_32F, 4, nppiSet_32f_C4R>::set},
{kernelSet<double>,kernelSet<double>,kernelSet<double>,kernelSet<double>}
};
callers[m.depth()][m.channels() - 1](m, s);
}
else
{
typedef void (*caller_t)(GpuMat& src, Scalar s, const GpuMat& mask);
static const caller_t callers[7][4] =
{
{NppSetMask<CV_8U, 1, nppiSet_8u_C1MR>::set,kernelSetMask<uchar>,kernelSetMask<uchar>,NppSetMask<CV_8U, 4, nppiSet_8u_C4MR>::set},
{kernelSetMask<schar>,kernelSetMask<schar>,kernelSetMask<schar>,kernelSetMask<schar>},
{NppSetMask<CV_16U, 1, nppiSet_16u_C1MR>::set,kernelSetMask<ushort>,kernelSetMask<ushort>,NppSetMask<CV_16U, 4, nppiSet_16u_C4MR>::set},
{NppSetMask<CV_16S, 1, nppiSet_16s_C1MR>::set,kernelSetMask<short>,kernelSetMask<short>,NppSetMask<CV_16S, 4, nppiSet_16s_C4MR>::set},
{NppSetMask<CV_32S, 1, nppiSet_32s_C1MR>::set,kernelSetMask<int>,kernelSetMask<int>,NppSetMask<CV_32S, 4, nppiSet_32s_C4MR>::set},
{NppSetMask<CV_32F, 1, nppiSet_32f_C1MR>::set,kernelSetMask<float>,kernelSetMask<float>,NppSetMask<CV_32F, 4, nppiSet_32f_C4MR>::set},
{kernelSetMask<double>,kernelSetMask<double>,kernelSetMask<double>,kernelSetMask<double>}
};
callers[m.depth()][m.channels() - 1](m, s, mask);
}
}
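A sketch of the setTo dispatch from user code, assuming GpuMat::setTo routes here: an all-zero scalar hits the cudaMemset2D shortcut, a uniform CV_8U scalar hits the second shortcut, and the masked call goes through the NppSetMask/kernelSetMask table:

#include <opencv2/core/core.hpp>
#include <opencv2/gpu/gpu.hpp>

int main()
{
    cv::gpu::GpuMat img(480, 640, CV_8UC3);
    cv::gpu::GpuMat mask(480, 640, CV_8UC1);

    img.setTo(cv::Scalar::all(0));                  // all-zero scalar: cudaMemset2D path
    mask.setTo(cv::Scalar::all(0));
    mask(cv::Rect(100, 100, 200, 200)).setTo(cv::Scalar::all(255));

    img.setTo(cv::Scalar(0, 255, 0), mask);         // masked fill: kernelSetMask<uchar> for 3 channels
    return 0;
}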
void mallocPitch(void** devPtr, size_t* step, size_t width, size_t height) const
{
cudaSafeCall( cudaMallocPitch(devPtr, step, width, height) );
}
void free(void* devPtr) const
{
cudaFree(devPtr);
}
};
class Initializer
{
public:
Initializer()
{
static CudaFuncTable funcTable;
setGpuFuncTable(&funcTable);
}
};
Initializer init;
}
#endif #endif
@ -44,6 +44,7 @@
using namespace cv; using namespace cv;
using namespace cv::gpu; using namespace cv::gpu;
using namespace std;
#if !defined (HAVE_CUDA) #if !defined (HAVE_CUDA)
@ -51,7 +52,9 @@ void cv::gpu::matchTemplate(const GpuMat&, const GpuMat&, GpuMat&, int, Stream&)
#else #else
namespace cv { namespace gpu { namespace imgproc BEGIN_OPENCV_DEVICE_NAMESPACE
namespace match_template
{ {
void matchTemplateNaive_CCORR_8U(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn, cudaStream_t stream); void matchTemplateNaive_CCORR_8U(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn, cudaStream_t stream);
void matchTemplateNaive_CCORR_32F(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn, cudaStream_t stream); void matchTemplateNaive_CCORR_32F(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn, cudaStream_t stream);
@ -132,8 +135,11 @@ namespace cv { namespace gpu { namespace imgproc
unsigned int templ_sqsum, DevMem2Df result, int cn, cudaStream_t stream); unsigned int templ_sqsum, DevMem2Df result, int cn, cudaStream_t stream);
void extractFirstChannel_32F(const DevMem2Db image, DevMem2Df result, int cn, cudaStream_t stream); void extractFirstChannel_32F(const DevMem2Db image, DevMem2Df result, int cn, cudaStream_t stream);
}}} }
END_OPENCV_DEVICE_NAMESPACE
using namespace OPENCV_DEVICE_NAMESPACE_ match_template;
namespace namespace
{ {
@ -177,14 +183,14 @@ namespace
result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F); result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
if (templ.size().area() < getTemplateThreshold(CV_TM_CCORR, CV_32F)) if (templ.size().area() < getTemplateThreshold(CV_TM_CCORR, CV_32F))
{ {
imgproc::matchTemplateNaive_CCORR_32F(image, templ, result, image.channels(), StreamAccessor::getStream(stream)); matchTemplateNaive_CCORR_32F(image, templ, result, image.channels(), StreamAccessor::getStream(stream));
return; return;
} }
GpuMat result_; GpuMat result_;
ConvolveBuf buf; ConvolveBuf buf;
convolve(image.reshape(1), templ.reshape(1), result_, true, buf, stream); convolve(image.reshape(1), templ.reshape(1), result_, true, buf, stream);
imgproc::extractFirstChannel_32F(result_, result, image.channels(), StreamAccessor::getStream(stream)); extractFirstChannel_32F(result_, result, image.channels(), StreamAccessor::getStream(stream));
} }
@ -193,7 +199,7 @@ namespace
if (templ.size().area() < getTemplateThreshold(CV_TM_CCORR, CV_8U)) if (templ.size().area() < getTemplateThreshold(CV_TM_CCORR, CV_8U))
{ {
result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F); result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
imgproc::matchTemplateNaive_CCORR_8U(image, templ, result, image.channels(), StreamAccessor::getStream(stream)); matchTemplateNaive_CCORR_8U(image, templ, result, image.channels(), StreamAccessor::getStream(stream));
return; return;
} }
@ -220,15 +226,14 @@ namespace
sqrIntegral(image.reshape(1), img_sqsum, stream); sqrIntegral(image.reshape(1), img_sqsum, stream);
unsigned int templ_sqsum = (unsigned int)sqrSum(templ.reshape(1))[0]; unsigned int templ_sqsum = (unsigned int)sqrSum(templ.reshape(1))[0];
imgproc::normalize_8U(templ.cols, templ.rows, img_sqsum, templ_sqsum, normalize_8U(templ.cols, templ.rows, img_sqsum, templ_sqsum, result, image.channels(), StreamAccessor::getStream(stream));
result, image.channels(), StreamAccessor::getStream(stream));
} }
void matchTemplate_SQDIFF_32F(const GpuMat& image, const GpuMat& templ, GpuMat& result, Stream& stream) void matchTemplate_SQDIFF_32F(const GpuMat& image, const GpuMat& templ, GpuMat& result, Stream& stream)
{ {
result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F); result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
imgproc::matchTemplateNaive_SQDIFF_32F(image, templ, result, image.channels(), StreamAccessor::getStream(stream)); matchTemplateNaive_SQDIFF_32F(image, templ, result, image.channels(), StreamAccessor::getStream(stream));
} }
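A template-matching sketch, assuming the public entry point keeps the (image, templ, result, method) order declared in the CUDA-less stub above with the new stream argument defaulted, and assuming hypothetical input files "scene.png" and "patch.png":

#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#include <opencv2/gpu/gpu.hpp>

int main()
{
    cv::gpu::GpuMat d_img(cv::imread("scene.png", 0));
    cv::gpu::GpuMat d_templ(cv::imread("patch.png", 0));
    cv::gpu::GpuMat d_result;

    cv::gpu::matchTemplate(d_img, d_templ, d_result, CV_TM_SQDIFF); // 8U SQDIFF takes the prepared path above

    double best;
    cv::Point bestLoc;
    cv::gpu::minMaxLoc(d_result, &best, 0, &bestLoc, 0);            // smallest SQDIFF score = best match
    return 0;
}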
@ -237,7 +242,7 @@ namespace
if (templ.size().area() < getTemplateThreshold(CV_TM_SQDIFF, CV_8U)) if (templ.size().area() < getTemplateThreshold(CV_TM_SQDIFF, CV_8U))
{ {
result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F); result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
imgproc::matchTemplateNaive_SQDIFF_8U(image, templ, result, image.channels(), StreamAccessor::getStream(stream)); matchTemplateNaive_SQDIFF_8U(image, templ, result, image.channels(), StreamAccessor::getStream(stream));
return; return;
} }
@ -247,8 +252,7 @@ namespace
unsigned int templ_sqsum = (unsigned int)sqrSum(templ.reshape(1))[0]; unsigned int templ_sqsum = (unsigned int)sqrSum(templ.reshape(1))[0];
matchTemplate_CCORR_8U(image, templ, result, stream); matchTemplate_CCORR_8U(image, templ, result, stream);
imgproc::matchTemplatePrepared_SQDIFF_8U( matchTemplatePrepared_SQDIFF_8U(templ.cols, templ.rows, img_sqsum, templ_sqsum, result, image.channels(), StreamAccessor::getStream(stream));
templ.cols, templ.rows, img_sqsum, templ_sqsum, result, image.channels(), StreamAccessor::getStream(stream));
} }
@ -260,8 +264,7 @@ namespace
unsigned int templ_sqsum = (unsigned int)sqrSum(templ.reshape(1))[0]; unsigned int templ_sqsum = (unsigned int)sqrSum(templ.reshape(1))[0];
matchTemplate_CCORR_8U(image, templ, result, stream); matchTemplate_CCORR_8U(image, templ, result, stream);
imgproc::matchTemplatePrepared_SQDIFF_NORMED_8U( matchTemplatePrepared_SQDIFF_NORMED_8U(templ.cols, templ.rows, img_sqsum, templ_sqsum, result, image.channels(), StreamAccessor::getStream(stream));
templ.cols, templ.rows, img_sqsum, templ_sqsum, result, image.channels(), StreamAccessor::getStream(stream));
} }
@ -275,13 +278,12 @@ namespace
integral(image, image_sum, stream); integral(image, image_sum, stream);
unsigned int templ_sum = (unsigned int)sum(templ)[0]; unsigned int templ_sum = (unsigned int)sum(templ)[0];
imgproc::matchTemplatePrepared_CCOFF_8U(templ.cols, templ.rows, matchTemplatePrepared_CCOFF_8U(templ.cols, templ.rows, image_sum, templ_sum, result, StreamAccessor::getStream(stream));
image_sum, templ_sum, result, StreamAccessor::getStream(stream));
} }
else else
{ {
std::vector<GpuMat> images; vector<GpuMat> images;
std::vector<GpuMat> image_sums(image.channels()); vector<GpuMat> image_sums(image.channels());
split(image, images); split(image, images);
for (int i = 0; i < image.channels(); ++i) for (int i = 0; i < image.channels(); ++i)
@ -292,19 +294,19 @@ namespace
switch (image.channels()) switch (image.channels())
{ {
case 2: case 2:
imgproc::matchTemplatePrepared_CCOFF_8UC2( matchTemplatePrepared_CCOFF_8UC2(
templ.cols, templ.rows, image_sums[0], image_sums[1], templ.cols, templ.rows, image_sums[0], image_sums[1],
(unsigned int)templ_sum[0], (unsigned int)templ_sum[1], (unsigned int)templ_sum[0], (unsigned int)templ_sum[1],
result, StreamAccessor::getStream(stream)); result, StreamAccessor::getStream(stream));
break; break;
case 3: case 3:
imgproc::matchTemplatePrepared_CCOFF_8UC3( matchTemplatePrepared_CCOFF_8UC3(
templ.cols, templ.rows, image_sums[0], image_sums[1], image_sums[2], templ.cols, templ.rows, image_sums[0], image_sums[1], image_sums[2],
(unsigned int)templ_sum[0], (unsigned int)templ_sum[1], (unsigned int)templ_sum[2], (unsigned int)templ_sum[0], (unsigned int)templ_sum[1], (unsigned int)templ_sum[2],
result, StreamAccessor::getStream(stream)); result, StreamAccessor::getStream(stream));
break; break;
case 4: case 4:
imgproc::matchTemplatePrepared_CCOFF_8UC4( matchTemplatePrepared_CCOFF_8UC4(
templ.cols, templ.rows, image_sums[0], image_sums[1], image_sums[2], image_sums[3], templ.cols, templ.rows, image_sums[0], image_sums[1], image_sums[2], image_sums[3],
(unsigned int)templ_sum[0], (unsigned int)templ_sum[1], (unsigned int)templ_sum[2], (unsigned int)templ_sum[0], (unsigned int)templ_sum[1], (unsigned int)templ_sum[2],
(unsigned int)templ_sum[3], result, StreamAccessor::getStream(stream)); (unsigned int)templ_sum[3], result, StreamAccessor::getStream(stream));
@ -341,15 +343,15 @@ namespace
unsigned int templ_sum = (unsigned int)sum(templ)[0]; unsigned int templ_sum = (unsigned int)sum(templ)[0];
unsigned int templ_sqsum = (unsigned int)sqrSum(templ)[0]; unsigned int templ_sqsum = (unsigned int)sqrSum(templ)[0];
imgproc::matchTemplatePrepared_CCOFF_NORMED_8U( matchTemplatePrepared_CCOFF_NORMED_8U(
templ.cols, templ.rows, image_sum, image_sqsum, templ.cols, templ.rows, image_sum, image_sqsum,
templ_sum, templ_sqsum, result, StreamAccessor::getStream(stream)); templ_sum, templ_sqsum, result, StreamAccessor::getStream(stream));
} }
else else
{ {
std::vector<GpuMat> images; vector<GpuMat> images;
std::vector<GpuMat> image_sums(image.channels()); vector<GpuMat> image_sums(image.channels());
std::vector<GpuMat> image_sqsums(image.channels()); vector<GpuMat> image_sqsums(image.channels());
split(image, images); split(image, images);
for (int i = 0; i < image.channels(); ++i) for (int i = 0; i < image.channels(); ++i)
@ -364,7 +366,7 @@ namespace
switch (image.channels()) switch (image.channels())
{ {
case 2: case 2:
imgproc::matchTemplatePrepared_CCOFF_NORMED_8UC2( matchTemplatePrepared_CCOFF_NORMED_8UC2(
templ.cols, templ.rows, templ.cols, templ.rows,
image_sums[0], image_sqsums[0], image_sums[0], image_sqsums[0],
image_sums[1], image_sqsums[1], image_sums[1], image_sqsums[1],
@ -373,7 +375,7 @@ namespace
result, StreamAccessor::getStream(stream)); result, StreamAccessor::getStream(stream));
break; break;
case 3: case 3:
imgproc::matchTemplatePrepared_CCOFF_NORMED_8UC3( matchTemplatePrepared_CCOFF_NORMED_8UC3(
templ.cols, templ.rows, templ.cols, templ.rows,
image_sums[0], image_sqsums[0], image_sums[0], image_sqsums[0],
image_sums[1], image_sqsums[1], image_sums[1], image_sqsums[1],
@ -384,7 +386,7 @@ namespace
result, StreamAccessor::getStream(stream)); result, StreamAccessor::getStream(stream));
break; break;
case 4: case 4:
imgproc::matchTemplatePrepared_CCOFF_NORMED_8UC4( matchTemplatePrepared_CCOFF_NORMED_8UC4(
templ.cols, templ.rows, templ.cols, templ.rows,
image_sums[0], image_sqsums[0], image_sums[0], image_sqsums[0],
image_sums[1], image_sqsums[1], image_sums[1], image_sqsums[1],
@ -45,6 +45,139 @@
using namespace cv; using namespace cv;
using namespace cv::gpu; using namespace cv::gpu;
cv::gpu::CudaMem::CudaMem()
: flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), alloc_type(0)
{
}
cv::gpu::CudaMem::CudaMem(int _rows, int _cols, int _type, int _alloc_type)
: flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), alloc_type(0)
{
if( _rows > 0 && _cols > 0 )
create( _rows, _cols, _type, _alloc_type);
}
cv::gpu::CudaMem::CudaMem(Size _size, int _type, int _alloc_type)
: flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), alloc_type(0)
{
if( _size.height > 0 && _size.width > 0 )
create( _size.height, _size.width, _type, _alloc_type);
}
cv::gpu::CudaMem::CudaMem(const CudaMem& m)
: flags(m.flags), rows(m.rows), cols(m.cols), step(m.step), data(m.data), refcount(m.refcount), datastart(m.datastart), dataend(m.dataend), alloc_type(m.alloc_type)
{
if( refcount )
CV_XADD(refcount, 1);
}
cv::gpu::CudaMem::CudaMem(const Mat& m, int _alloc_type)
: flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), alloc_type(0)
{
if( m.rows > 0 && m.cols > 0 )
create( m.size(), m.type(), _alloc_type);
Mat tmp = createMatHeader();
m.copyTo(tmp);
}
cv::gpu::CudaMem::~CudaMem()
{
release();
}
CudaMem& cv::gpu::CudaMem::operator = (const CudaMem& m)
{
if( this != &m )
{
if( m.refcount )
CV_XADD(m.refcount, 1);
release();
flags = m.flags;
rows = m.rows; cols = m.cols;
step = m.step; data = m.data;
datastart = m.datastart;
dataend = m.dataend;
refcount = m.refcount;
alloc_type = m.alloc_type;
}
return *this;
}
CudaMem cv::gpu::CudaMem::clone() const
{
CudaMem m(size(), type(), alloc_type);
Mat to = m;
Mat from = *this;
from.copyTo(to);
return m;
}
void cv::gpu::CudaMem::create(Size _size, int _type, int _alloc_type)
{
create(_size.height, _size.width, _type, _alloc_type);
}
Mat cv::gpu::CudaMem::createMatHeader() const
{
return Mat(size(), type(), data, step);
}
cv::gpu::CudaMem::operator Mat() const
{
return createMatHeader();
}
cv::gpu::CudaMem::operator GpuMat() const
{
return createGpuMatHeader();
}
bool cv::gpu::CudaMem::isContinuous() const
{
return (flags & Mat::CONTINUOUS_FLAG) != 0;
}
size_t cv::gpu::CudaMem::elemSize() const
{
return CV_ELEM_SIZE(flags);
}
size_t cv::gpu::CudaMem::elemSize1() const
{
return CV_ELEM_SIZE1(flags);
}
int cv::gpu::CudaMem::type() const
{
return CV_MAT_TYPE(flags);
}
int cv::gpu::CudaMem::depth() const
{
return CV_MAT_DEPTH(flags);
}
int cv::gpu::CudaMem::channels() const
{
return CV_MAT_CN(flags);
}
size_t cv::gpu::CudaMem::step1() const
{
return step/elemSize1();
}
Size cv::gpu::CudaMem::size() const
{
return Size(cols, rows);
}
bool cv::gpu::CudaMem::empty() const
{
return data == 0;
}
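A page-locked memory sketch built on the CudaMem methods defined above; the ALLOC_PAGE_LOCKED constant name is assumed from the existing CudaMem interface rather than shown in this hunk:

#include <opencv2/core/core.hpp>
#include <opencv2/gpu/gpu.hpp>

int main()
{
    // pinned host buffer: enables faster and asynchronous host <-> device copies
    cv::gpu::CudaMem pinned(480, 640, CV_8UC1, cv::gpu::CudaMem::ALLOC_PAGE_LOCKED);

    cv::Mat host = pinned.createMatHeader();   // CPU view over the same allocation, no copy
    host.setTo(cv::Scalar::all(128));

    cv::gpu::GpuMat d_img;
    d_img.upload(host);                        // explicit copy to the device
    return 0;
}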
#if !defined (HAVE_CUDA) #if !defined (HAVE_CUDA)
void cv::gpu::registerPageLocked(Mat&) { throw_nogpu(); } void cv::gpu::registerPageLocked(Mat&) { throw_nogpu(); }
@ -190,8 +190,12 @@ double cv::gpu::norm(const GpuMat& src1, const GpuMat& src2, int normType)
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
// Sum // Sum
namespace cv { namespace gpu { namespace mathfunc BEGIN_OPENCV_DEVICE_NAMESPACE
namespace matrix_reductions
{ {
namespace sum
{
template <typename T> template <typename T>
void sumCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn); void sumCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn);
@ -210,12 +214,11 @@ namespace cv { namespace gpu { namespace mathfunc
template <typename T> template <typename T>
void sqrSumMultipassCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn); void sqrSumMultipassCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn);
namespace sums
{
void getBufSizeRequired(int cols, int rows, int cn, int& bufcols, int& bufrows); void getBufSizeRequired(int cols, int rows, int cn, int& bufcols, int& bufrows);
} }
}}} }
END_OPENCV_DEVICE_NAMESPACE
Scalar cv::gpu::sum(const GpuMat& src) Scalar cv::gpu::sum(const GpuMat& src)
{ {
@ -226,23 +229,25 @@ Scalar cv::gpu::sum(const GpuMat& src)
Scalar cv::gpu::sum(const GpuMat& src, GpuMat& buf) Scalar cv::gpu::sum(const GpuMat& src, GpuMat& buf)
{ {
using namespace mathfunc; using namespace OPENCV_DEVICE_NAMESPACE_ matrix_reductions::sum;
typedef void (*Caller)(const DevMem2Db, PtrStepb, double*, int); typedef void (*Caller)(const DevMem2Db, PtrStepb, double*, int);
static Caller multipass_callers[7] = { static Caller multipass_callers[7] =
{
sumMultipassCaller<unsigned char>, sumMultipassCaller<char>, sumMultipassCaller<unsigned char>, sumMultipassCaller<char>,
sumMultipassCaller<unsigned short>, sumMultipassCaller<short>, sumMultipassCaller<unsigned short>, sumMultipassCaller<short>,
sumMultipassCaller<int>, sumMultipassCaller<float>, 0 }; sumMultipassCaller<int>, sumMultipassCaller<float>, 0
};
static Caller singlepass_callers[7] = { static Caller singlepass_callers[7] = {
sumCaller<unsigned char>, sumCaller<char>, sumCaller<unsigned char>, sumCaller<char>,
sumCaller<unsigned short>, sumCaller<short>, sumCaller<unsigned short>, sumCaller<short>,
sumCaller<int>, sumCaller<float>, 0 }; sumCaller<int>, sumCaller<float>, 0
};
Size buf_size; Size buf_size;
sums::getBufSizeRequired(src.cols, src.rows, src.channels(), getBufSizeRequired(src.cols, src.rows, src.channels(), buf_size.width, buf_size.height);
buf_size.width, buf_size.height);
ensureSizeIsEnough(buf_size, CV_8U, buf); ensureSizeIsEnough(buf_size, CV_8U, buf);
Caller* callers = multipass_callers; Caller* callers = multipass_callers;
@ -267,23 +272,26 @@ Scalar cv::gpu::absSum(const GpuMat& src)
Scalar cv::gpu::absSum(const GpuMat& src, GpuMat& buf) Scalar cv::gpu::absSum(const GpuMat& src, GpuMat& buf)
{ {
using namespace mathfunc; using namespace OPENCV_DEVICE_NAMESPACE_ matrix_reductions::sum;
typedef void (*Caller)(const DevMem2Db, PtrStepb, double*, int); typedef void (*Caller)(const DevMem2Db, PtrStepb, double*, int);
static Caller multipass_callers[7] = { static Caller multipass_callers[7] =
{
absSumMultipassCaller<unsigned char>, absSumMultipassCaller<char>, absSumMultipassCaller<unsigned char>, absSumMultipassCaller<char>,
absSumMultipassCaller<unsigned short>, absSumMultipassCaller<short>, absSumMultipassCaller<unsigned short>, absSumMultipassCaller<short>,
absSumMultipassCaller<int>, absSumMultipassCaller<float>, 0 }; absSumMultipassCaller<int>, absSumMultipassCaller<float>, 0
};
static Caller singlepass_callers[7] = { static Caller singlepass_callers[7] =
{
absSumCaller<unsigned char>, absSumCaller<char>, absSumCaller<unsigned char>, absSumCaller<char>,
absSumCaller<unsigned short>, absSumCaller<short>, absSumCaller<unsigned short>, absSumCaller<short>,
absSumCaller<int>, absSumCaller<float>, 0 }; absSumCaller<int>, absSumCaller<float>, 0
};
Size buf_size; Size buf_size;
sums::getBufSizeRequired(src.cols, src.rows, src.channels(), getBufSizeRequired(src.cols, src.rows, src.channels(), buf_size.width, buf_size.height);
buf_size.width, buf_size.height);
ensureSizeIsEnough(buf_size, CV_8U, buf); ensureSizeIsEnough(buf_size, CV_8U, buf);
Caller* callers = multipass_callers; Caller* callers = multipass_callers;
@ -308,27 +316,30 @@ Scalar cv::gpu::sqrSum(const GpuMat& src)
Scalar cv::gpu::sqrSum(const GpuMat& src, GpuMat& buf) Scalar cv::gpu::sqrSum(const GpuMat& src, GpuMat& buf)
{ {
using namespace mathfunc; using namespace OPENCV_DEVICE_NAMESPACE_ matrix_reductions::sum;
typedef void (*Caller)(const DevMem2Db, PtrStepb, double*, int); typedef void (*Caller)(const DevMem2Db, PtrStepb, double*, int);
static Caller multipass_callers[7] = { static Caller multipass_callers[7] =
{
sqrSumMultipassCaller<unsigned char>, sqrSumMultipassCaller<char>, sqrSumMultipassCaller<unsigned char>, sqrSumMultipassCaller<char>,
sqrSumMultipassCaller<unsigned short>, sqrSumMultipassCaller<short>, sqrSumMultipassCaller<unsigned short>, sqrSumMultipassCaller<short>,
sqrSumMultipassCaller<int>, sqrSumMultipassCaller<float>, 0 }; sqrSumMultipassCaller<int>, sqrSumMultipassCaller<float>, 0
};
static Caller singlepass_callers[7] = { static Caller singlepass_callers[7] =
{
sqrSumCaller<unsigned char>, sqrSumCaller<char>, sqrSumCaller<unsigned char>, sqrSumCaller<char>,
sqrSumCaller<unsigned short>, sqrSumCaller<short>, sqrSumCaller<unsigned short>, sqrSumCaller<short>,
sqrSumCaller<int>, sqrSumCaller<float>, 0 }; sqrSumCaller<int>, sqrSumCaller<float>, 0
};
Caller* callers = multipass_callers; Caller* callers = multipass_callers;
if (TargetArchs::builtWith(GLOBAL_ATOMICS) && DeviceInfo().supports(GLOBAL_ATOMICS)) if (TargetArchs::builtWith(GLOBAL_ATOMICS) && DeviceInfo().supports(GLOBAL_ATOMICS))
callers = singlepass_callers; callers = singlepass_callers;
Size buf_size; Size buf_size;
sums::getBufSizeRequired(src.cols, src.rows, src.channels(), getBufSizeRequired(src.cols, src.rows, src.channels(), buf_size.width, buf_size.height);
buf_size.width, buf_size.height);
ensureSizeIsEnough(buf_size, CV_8U, buf); ensureSizeIsEnough(buf_size, CV_8U, buf);
Caller caller = callers[src.depth()]; Caller caller = callers[src.depth()];
@ -339,14 +350,15 @@ Scalar cv::gpu::sqrSum(const GpuMat& src, GpuMat& buf)
return Scalar(result[0], result[1], result[2], result[3]); return Scalar(result[0], result[1], result[2], result[3]);
} }
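A reduction sketch using the buffered overloads declared above; the scratch GpuMat can be reused across calls, and single-pass callers are picked automatically on devices with global atomics:

#include <opencv2/core/core.hpp>
#include <opencv2/gpu/gpu.hpp>

int main()
{
    cv::Mat host(480, 640, CV_32FC1);
    cv::randu(host, cv::Scalar::all(0), cv::Scalar::all(1));

    cv::gpu::GpuMat d_src(host);
    cv::gpu::GpuMat d_buf;                           // sized internally via getBufSizeRequired

    cv::Scalar s  = cv::gpu::sum(d_src, d_buf);
    cv::Scalar s2 = cv::gpu::sqrSum(d_src, d_buf);

    double n    = (double)d_src.size().area();
    double mean = s[0] / n;
    double var  = s2[0] / n - mean * mean;           // biased variance from the two reductions
    (void)var;
    return 0;
}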
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
// Find min or max // Find min or max
namespace cv { namespace gpu { namespace mathfunc { namespace minmax { BEGIN_OPENCV_DEVICE_NAMESPACE
namespace matrix_reductions
{
namespace minmax
{
void getBufSizeRequired(int cols, int rows, int elem_size, int& bufcols, int& bufrows); void getBufSizeRequired(int cols, int rows, int elem_size, int& bufcols, int& bufrows);
template <typename T> template <typename T>
@ -360,8 +372,10 @@ namespace cv { namespace gpu { namespace mathfunc { namespace minmax {
template <typename T> template <typename T>
void minMaxMaskMultipassCaller(const DevMem2Db src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf); void minMaxMaskMultipassCaller(const DevMem2Db src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
}
}
}}}} END_OPENCV_DEVICE_NAMESPACE
void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const GpuMat& mask) void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const GpuMat& mask)
@ -373,39 +387,43 @@ void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const Gp
void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const GpuMat& mask, GpuMat& buf) void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const GpuMat& mask, GpuMat& buf)
{ {
using namespace mathfunc::minmax; using namespace OPENCV_DEVICE_NAMESPACE_ matrix_reductions::minmax;
typedef void (*Caller)(const DevMem2Db, double*, double*, PtrStepb); typedef void (*Caller)(const DevMem2Db, double*, double*, PtrStepb);
typedef void (*MaskedCaller)(const DevMem2Db, const PtrStepb, double*, double*, PtrStepb); typedef void (*MaskedCaller)(const DevMem2Db, const PtrStepb, double*, double*, PtrStepb);
static Caller multipass_callers[7] = { static Caller multipass_callers[7] =
{
minMaxMultipassCaller<unsigned char>, minMaxMultipassCaller<char>, minMaxMultipassCaller<unsigned char>, minMaxMultipassCaller<char>,
minMaxMultipassCaller<unsigned short>, minMaxMultipassCaller<short>, minMaxMultipassCaller<unsigned short>, minMaxMultipassCaller<short>,
minMaxMultipassCaller<int>, minMaxMultipassCaller<float>, 0 }; minMaxMultipassCaller<int>, minMaxMultipassCaller<float>, 0
};
static Caller singlepass_callers[7] = { static Caller singlepass_callers[7] =
{
minMaxCaller<unsigned char>, minMaxCaller<char>, minMaxCaller<unsigned char>, minMaxCaller<char>,
minMaxCaller<unsigned short>, minMaxCaller<short>, minMaxCaller<unsigned short>, minMaxCaller<short>,
minMaxCaller<int>, minMaxCaller<float>, minMaxCaller<double> }; minMaxCaller<int>, minMaxCaller<float>, minMaxCaller<double>
};
static MaskedCaller masked_multipass_callers[7] = { static MaskedCaller masked_multipass_callers[7] =
{
minMaxMaskMultipassCaller<unsigned char>, minMaxMaskMultipassCaller<char>, minMaxMaskMultipassCaller<unsigned char>, minMaxMaskMultipassCaller<char>,
minMaxMaskMultipassCaller<unsigned short>, minMaxMaskMultipassCaller<short>, minMaxMaskMultipassCaller<unsigned short>, minMaxMaskMultipassCaller<short>,
minMaxMaskMultipassCaller<int>, minMaxMaskMultipassCaller<float>, 0 }; minMaxMaskMultipassCaller<int>, minMaxMaskMultipassCaller<float>, 0
};
static MaskedCaller masked_singlepass_callers[7] = { static MaskedCaller masked_singlepass_callers[7] =
{
minMaxMaskCaller<unsigned char>, minMaxMaskCaller<char>, minMaxMaskCaller<unsigned char>, minMaxMaskCaller<char>,
minMaxMaskCaller<unsigned short>, minMaxMaskCaller<short>, minMaxMaskCaller<unsigned short>, minMaxMaskCaller<short>,
minMaxMaskCaller<int>, minMaxMaskCaller<float>, minMaxMaskCaller<int>, minMaxMaskCaller<float>, minMaxMaskCaller<double>
minMaxMaskCaller<double> }; };
CV_Assert(src.channels() == 1); CV_Assert(src.channels() == 1);
CV_Assert(mask.empty() || (mask.type() == CV_8U && src.size() == mask.size())); CV_Assert(mask.empty() || (mask.type() == CV_8U && src.size() == mask.size()));
CV_Assert(src.type() != CV_64F || (TargetArchs::builtWith(NATIVE_DOUBLE) &&
DeviceInfo().supports(NATIVE_DOUBLE)));
double minVal_; if (!minVal) minVal = &minVal_; double minVal_; if (!minVal) minVal = &minVal_;
double maxVal_; if (!maxVal) maxVal = &maxVal_; double maxVal_; if (!maxVal) maxVal = &maxVal_;
@ -439,8 +457,12 @@ void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const Gp
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
// Locate min and max // Locate min and max
namespace cv { namespace gpu { namespace mathfunc { namespace minmaxloc { BEGIN_OPENCV_DEVICE_NAMESPACE
namespace matrix_reductions
{
namespace minmaxloc
{
void getBufSizeRequired(int cols, int rows, int elem_size, int& b1cols, void getBufSizeRequired(int cols, int rows, int elem_size, int& b1cols,
int& b1rows, int& b2cols, int& b2rows); int& b1rows, int& b2cols, int& b2rows);
@ -459,8 +481,10 @@ namespace cv { namespace gpu { namespace mathfunc { namespace minmaxloc {
template <typename T> template <typename T>
void minMaxLocMaskMultipassCaller(const DevMem2Db src, const PtrStepb mask, double* minval, double* maxval, void minMaxLocMaskMultipassCaller(const DevMem2Db src, const PtrStepb mask, double* minval, double* maxval,
int minloc[2], int maxloc[2], PtrStepb valBuf, PtrStepb locBuf); int minloc[2], int maxloc[2], PtrStepb valBuf, PtrStepb locBuf);
}}}} }
}
END_OPENCV_DEVICE_NAMESPACE
void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc, const GpuMat& mask) void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc, const GpuMat& mask)
{ {
@ -468,43 +492,46 @@ void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point
minMaxLoc(src, minVal, maxVal, minLoc, maxLoc, mask, valBuf, locBuf); minMaxLoc(src, minVal, maxVal, minLoc, maxLoc, mask, valBuf, locBuf);
} }
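A sketch of the fully buffered minMaxLoc overload whose signature appears just below, using a mask and caller-owned scratch buffers:

#include <opencv2/core/core.hpp>
#include <opencv2/gpu/gpu.hpp>

int main()
{
    cv::Mat host(240, 320, CV_32FC1);
    cv::randu(host, cv::Scalar::all(-10), cv::Scalar::all(10));

    cv::gpu::GpuMat d_src(host);
    cv::gpu::GpuMat d_mask(240, 320, CV_8UC1);
    d_mask.setTo(cv::Scalar::all(255));              // any CV_8U mask of the same size is accepted

    double minVal = 0, maxVal = 0;
    cv::Point minLoc, maxLoc;
    cv::gpu::GpuMat d_valBuf, d_locBuf;              // reusable value/location buffers
    cv::gpu::minMaxLoc(d_src, &minVal, &maxVal, &minLoc, &maxLoc, d_mask, d_valBuf, d_locBuf);
    return 0;
}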
void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc, void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc,
const GpuMat& mask, GpuMat& valBuf, GpuMat& locBuf) const GpuMat& mask, GpuMat& valBuf, GpuMat& locBuf)
{ {
using namespace mathfunc::minmaxloc; using namespace OPENCV_DEVICE_NAMESPACE_ matrix_reductions::minmaxloc;
typedef void (*Caller)(const DevMem2Db, double*, double*, int[2], int[2], PtrStepb, PtrStepb); typedef void (*Caller)(const DevMem2Db, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
typedef void (*MaskedCaller)(const DevMem2Db, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb); typedef void (*MaskedCaller)(const DevMem2Db, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
static Caller multipass_callers[7] = { static Caller multipass_callers[7] =
{
minMaxLocMultipassCaller<unsigned char>, minMaxLocMultipassCaller<char>, minMaxLocMultipassCaller<unsigned char>, minMaxLocMultipassCaller<char>,
minMaxLocMultipassCaller<unsigned short>, minMaxLocMultipassCaller<short>, minMaxLocMultipassCaller<unsigned short>, minMaxLocMultipassCaller<short>,
minMaxLocMultipassCaller<int>, minMaxLocMultipassCaller<float>, 0 }; minMaxLocMultipassCaller<int>, minMaxLocMultipassCaller<float>, 0
};
static Caller singlepass_callers[7] = { static Caller singlepass_callers[7] =
{
minMaxLocCaller<unsigned char>, minMaxLocCaller<char>, minMaxLocCaller<unsigned char>, minMaxLocCaller<char>,
minMaxLocCaller<unsigned short>, minMaxLocCaller<short>, minMaxLocCaller<unsigned short>, minMaxLocCaller<short>,
minMaxLocCaller<int>, minMaxLocCaller<float>, minMaxLocCaller<double> }; minMaxLocCaller<int>, minMaxLocCaller<float>, minMaxLocCaller<double>
};
static MaskedCaller masked_multipass_callers[7] = { static MaskedCaller masked_multipass_callers[7] =
{
minMaxLocMaskMultipassCaller<unsigned char>, minMaxLocMaskMultipassCaller<char>, minMaxLocMaskMultipassCaller<unsigned char>, minMaxLocMaskMultipassCaller<char>,
minMaxLocMaskMultipassCaller<unsigned short>, minMaxLocMaskMultipassCaller<short>, minMaxLocMaskMultipassCaller<unsigned short>, minMaxLocMaskMultipassCaller<short>,
minMaxLocMaskMultipassCaller<int>, minMaxLocMaskMultipassCaller<float>, 0 }; minMaxLocMaskMultipassCaller<int>, minMaxLocMaskMultipassCaller<float>, 0
};
static MaskedCaller masked_singlepass_callers[7] = { static MaskedCaller masked_singlepass_callers[7] =
{
minMaxLocMaskCaller<unsigned char>, minMaxLocMaskCaller<char>, minMaxLocMaskCaller<unsigned char>, minMaxLocMaskCaller<char>,
minMaxLocMaskCaller<unsigned short>, minMaxLocMaskCaller<short>, minMaxLocMaskCaller<unsigned short>, minMaxLocMaskCaller<short>,
minMaxLocMaskCaller<int>, minMaxLocMaskCaller<float>, minMaxLocMaskCaller<int>, minMaxLocMaskCaller<float>, minMaxLocMaskCaller<double>
minMaxLocMaskCaller<double> }; };
CV_Assert(src.channels() == 1); CV_Assert(src.channels() == 1);
CV_Assert(mask.empty() || (mask.type() == CV_8U && src.size() == mask.size())); CV_Assert(mask.empty() || (mask.type() == CV_8U && src.size() == mask.size()));
CV_Assert(src.type() != CV_64F || (TargetArchs::builtWith(NATIVE_DOUBLE) &&
DeviceInfo().supports(NATIVE_DOUBLE)));
double minVal_; if (!minVal) minVal = &minVal_; double minVal_; if (!minVal) minVal = &minVal_;
double maxVal_; if (!maxVal) maxVal = &maxVal_; double maxVal_; if (!maxVal) maxVal = &maxVal_;
int minLoc_[2]; int minLoc_[2];
@ -544,8 +571,12 @@ void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point
////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////
// Count non-zero elements // Count non-zero elements
namespace cv { namespace gpu { namespace mathfunc { namespace countnonzero { BEGIN_OPENCV_DEVICE_NAMESPACE
namespace matrix_reductions
{
namespace countnonzero
{
void getBufSizeRequired(int cols, int rows, int& bufcols, int& bufrows); void getBufSizeRequired(int cols, int rows, int& bufcols, int& bufrows);
template <typename T> template <typename T>
@ -553,9 +584,10 @@ namespace cv { namespace gpu { namespace mathfunc { namespace countnonzero {
template <typename T> template <typename T>
int countNonZeroMultipassCaller(const DevMem2Db src, PtrStepb buf); int countNonZeroMultipassCaller(const DevMem2Db src, PtrStepb buf);
}
}
}}}} END_OPENCV_DEVICE_NAMESPACE
int cv::gpu::countNonZero(const GpuMat& src) int cv::gpu::countNonZero(const GpuMat& src)
{ {
@ -566,26 +598,25 @@ int cv::gpu::countNonZero(const GpuMat& src)
int cv::gpu::countNonZero(const GpuMat& src, GpuMat& buf) int cv::gpu::countNonZero(const GpuMat& src, GpuMat& buf)
{ {
using namespace mathfunc::countnonzero; using namespace OPENCV_DEVICE_NAMESPACE_ matrix_reductions::countnonzero;
typedef int (*Caller)(const DevMem2Db src, PtrStepb buf); typedef int (*Caller)(const DevMem2Db src, PtrStepb buf);
static Caller multipass_callers[7] = { static Caller multipass_callers[7] =
{
countNonZeroMultipassCaller<unsigned char>, countNonZeroMultipassCaller<char>, countNonZeroMultipassCaller<unsigned char>, countNonZeroMultipassCaller<char>,
countNonZeroMultipassCaller<unsigned short>, countNonZeroMultipassCaller<short>, countNonZeroMultipassCaller<unsigned short>, countNonZeroMultipassCaller<short>,
countNonZeroMultipassCaller<int>, countNonZeroMultipassCaller<float>, 0 }; countNonZeroMultipassCaller<int>, countNonZeroMultipassCaller<float>, 0
};
static Caller singlepass_callers[7] = { static Caller singlepass_callers[7] =
{
countNonZeroCaller<unsigned char>, countNonZeroCaller<char>, countNonZeroCaller<unsigned char>, countNonZeroCaller<char>,
countNonZeroCaller<unsigned short>, countNonZeroCaller<short>, countNonZeroCaller<unsigned short>, countNonZeroCaller<short>,
countNonZeroCaller<int>, countNonZeroCaller<float>, countNonZeroCaller<int>, countNonZeroCaller<float>, countNonZeroCaller<double> };
countNonZeroCaller<double> };
CV_Assert(src.channels() == 1); CV_Assert(src.channels() == 1);
CV_Assert(src.type() != CV_64F || (TargetArchs::builtWith(NATIVE_DOUBLE) &&
DeviceInfo().supports(NATIVE_DOUBLE)));
Size buf_size; Size buf_size;
getBufSizeRequired(src.cols, src.rows, buf_size.width, buf_size.height); getBufSizeRequired(src.cols, src.rows, buf_size.width, buf_size.height);
ensureSizeIsEnough(buf_size, CV_8U, buf); ensureSizeIsEnough(buf_size, CV_8U, buf);
@ -601,15 +632,20 @@ int cv::gpu::countNonZero(const GpuMat& src, GpuMat& buf)
////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////
// reduce // reduce
BEGIN_OPENCV_DEVICE_NAMESPACE
namespace cv { namespace gpu { namespace mathfunc { namespace matrix_reductions
{
template <typename T, typename S, typename D> void reduceRows_gpu(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream); template <typename T, typename S, typename D> void reduceRows_gpu(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
template <typename T, typename S, typename D> void reduceCols_gpu(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream); template <typename T, typename S, typename D> void reduceCols_gpu(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
}}} }
END_OPENCV_DEVICE_NAMESPACE
void cv::gpu::reduce(const GpuMat& src, GpuMat& dst, int dim, int reduceOp, int dtype, Stream& stream) void cv::gpu::reduce(const GpuMat& src, GpuMat& dst, int dim, int reduceOp, int dtype, Stream& stream)
{ {
using namespace cv::gpu::mathfunc; using namespace OPENCV_DEVICE_NAMESPACE_ matrix_reductions;
CV_Assert(src.depth() <= CV_32F && src.channels() <= 4 && dtype <= CV_32F); CV_Assert(src.depth() <= CV_32F && src.channels() <= 4 && dtype <= CV_32F);
CV_Assert(dim == 0 || dim == 1); CV_Assert(dim == 0 || dim == 1);
CV_Assert(reduceOp == CV_REDUCE_SUM || reduceOp == CV_REDUCE_AVG || reduceOp == CV_REDUCE_MAX || reduceOp == CV_REDUCE_MIN); CV_Assert(reduceOp == CV_REDUCE_SUM || reduceOp == CV_REDUCE_AVG || reduceOp == CV_REDUCE_MAX || reduceOp == CV_REDUCE_MIN);
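Note: the reworked reductions keep their buffered overloads, so callers can still reuse scratch GpuMats across calls. A minimal usage sketch (not part of this patch; d_img and the buffer names are illustrative):

    cv::gpu::GpuMat d_img(480, 640, CV_8UC1);      // source image, contents uploaded elsewhere
    cv::gpu::GpuMat valBuf, locBuf, cntBuf;        // scratch buffers, reused between calls

    double minV = 0, maxV = 0;
    cv::Point minP, maxP;
    cv::gpu::minMaxLoc(d_img, &minV, &maxV, &minP, &maxP,
                       cv::gpu::GpuMat(), valBuf, locBuf);     // empty GpuMat == no mask

    int nonZero = cv::gpu::countNonZero(d_img, cntBuf);

    // CV_64F inputs additionally require the NATIVE_DOUBLE support asserted above.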
@ -234,10 +234,10 @@ void cv::gpu::meanShiftSegmentation(const GpuMat& src, Mat& dst, int sp, int sr,
const int hsp = sp; const int hsp = sp;
// Perform mean shift procedure and obtain region and spatial maps // Perform mean shift procedure and obtain region and spatial maps
GpuMat h_rmap, h_spmap; GpuMat d_rmap, d_spmap;
meanShiftProc(src, h_rmap, h_spmap, sp, sr, criteria); meanShiftProc(src, d_rmap, d_spmap, sp, sr, criteria);
Mat rmap = h_rmap; Mat rmap(d_rmap);
Mat spmap = h_spmap; Mat spmap(d_spmap);
Graph<SegmLinkVal> g(nrows * ncols, 4 * (nrows - 1) * (ncols - 1) Graph<SegmLinkVal> g(nrows * ncols, 4 * (nrows - 1) * (ncols - 1)
+ (nrows - 1) + (ncols - 1)); + (nrows - 1) + (ncols - 1));
@ -352,7 +352,7 @@ void cv::gpu::meanShiftSegmentation(const GpuMat& src, Mat& dst, int sp, int sr,
} }
// Compute sum of the pixel's colors which are in the same segment // Compute sum of the pixel's colors which are in the same segment
Mat h_src = src; Mat h_src(src);
vector<Vec4i> sumcols(nrows * ncols, Vec4i(0, 0, 0, 0)); vector<Vec4i> sumcols(nrows * ncols, Vec4i(0, 0, 0, 0));
for (int y = 0; y < nrows; ++y) for (int y = 0; y < nrows; ++y)
{ {
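Note: the rmap/spmap variables above are renamed from h_* to d_* and the host copies are now spelled as direct construction, apparently because the GpuMat-to-Mat conversion becomes an explicit download in this commit. A hedged sketch of the pattern (names and size are illustrative):

    cv::gpu::GpuMat d_result(480, 640, CV_8UC1);   // device-side result
    cv::Mat h_result(d_result);                    // explicit download to host memory
    // cv::Mat h_result = d_result;                // copy-initialization would no longer be allowed
                                                   // if the converting constructor is explicit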
@ -43,17 +43,18 @@
#ifndef __OPENCV_GPU_BORDER_INTERPOLATE_HPP__ #ifndef __OPENCV_GPU_BORDER_INTERPOLATE_HPP__
#define __OPENCV_GPU_BORDER_INTERPOLATE_HPP__ #define __OPENCV_GPU_BORDER_INTERPOLATE_HPP__
#include "internal_shared.hpp"
#include "saturate_cast.hpp" #include "saturate_cast.hpp"
#include "vec_traits.hpp" #include "vec_traits.hpp"
#include "vec_math.hpp" #include "vec_math.hpp"
namespace cv { namespace gpu { namespace device BEGIN_OPENCV_DEVICE_NAMESPACE
{
//////////////////////////////////////////////////////////////
// BrdConstant
template <typename D> struct BrdRowConstant //////////////////////////////////////////////////////////////
{ // BrdConstant
template <typename D> struct BrdRowConstant
{
typedef D result_type; typedef D result_type;
explicit __host__ __device__ __forceinline__ BrdRowConstant(int width_, const D& val_ = VecTraits<D>::all(0)) : width(width_), val(val_) {} explicit __host__ __device__ __forceinline__ BrdRowConstant(int width_, const D& val_ = VecTraits<D>::all(0)) : width(width_), val(val_) {}
@ -75,10 +76,10 @@ namespace cv { namespace gpu { namespace device
const int width; const int width;
const D val; const D val;
}; };
template <typename D> struct BrdColConstant template <typename D> struct BrdColConstant
{ {
typedef D result_type; typedef D result_type;
explicit __host__ __device__ __forceinline__ BrdColConstant(int height_, const D& val_ = VecTraits<D>::all(0)) : height(height_), val(val_) {} explicit __host__ __device__ __forceinline__ BrdColConstant(int height_, const D& val_ = VecTraits<D>::all(0)) : height(height_), val(val_) {}
@ -100,10 +101,10 @@ namespace cv { namespace gpu { namespace device
const int height; const int height;
const D val; const D val;
}; };
template <typename D> struct BrdConstant template <typename D> struct BrdConstant
{ {
typedef D result_type; typedef D result_type;
__host__ __device__ __forceinline__ BrdConstant(int height_, int width_, const D& val_ = VecTraits<D>::all(0)) : height(height_), width(width_), val(val_) __host__ __device__ __forceinline__ BrdConstant(int height_, int width_, const D& val_ = VecTraits<D>::all(0)) : height(height_), width(width_), val(val_)
@ -123,13 +124,13 @@ namespace cv { namespace gpu { namespace device
const int height; const int height;
const int width; const int width;
const D val; const D val;
}; };
////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////
// BrdReplicate // BrdReplicate
template <typename D> struct BrdRowReplicate template <typename D> struct BrdRowReplicate
{ {
typedef D result_type; typedef D result_type;
explicit __host__ __device__ __forceinline__ BrdRowReplicate(int width) : last_col(width - 1) {} explicit __host__ __device__ __forceinline__ BrdRowReplicate(int width) : last_col(width - 1) {}
@ -166,10 +167,10 @@ namespace cv { namespace gpu { namespace device
} }
const int last_col; const int last_col;
}; };
template <typename D> struct BrdColReplicate template <typename D> struct BrdColReplicate
{ {
typedef D result_type; typedef D result_type;
explicit __host__ __device__ __forceinline__ BrdColReplicate(int height) : last_row(height - 1) {} explicit __host__ __device__ __forceinline__ BrdColReplicate(int height) : last_row(height - 1) {}
@ -206,10 +207,10 @@ namespace cv { namespace gpu { namespace device
} }
const int last_row; const int last_row;
}; };
template <typename D> struct BrdReplicate template <typename D> struct BrdReplicate
{ {
typedef D result_type; typedef D result_type;
__host__ __device__ __forceinline__ BrdReplicate(int height, int width) : last_row(height - 1), last_col(width - 1) {} __host__ __device__ __forceinline__ BrdReplicate(int height, int width) : last_row(height - 1), last_col(width - 1) {}
@ -257,13 +258,13 @@ namespace cv { namespace gpu { namespace device
const int last_row; const int last_row;
const int last_col; const int last_col;
}; };
////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////
// BrdReflect101 // BrdReflect101
template <typename D> struct BrdRowReflect101 template <typename D> struct BrdRowReflect101
{ {
typedef D result_type; typedef D result_type;
explicit __host__ __device__ __forceinline__ BrdRowReflect101(int width) : last_col(width - 1) {} explicit __host__ __device__ __forceinline__ BrdRowReflect101(int width) : last_col(width - 1) {}
@ -300,10 +301,10 @@ namespace cv { namespace gpu { namespace device
} }
const int last_col; const int last_col;
}; };
template <typename D> struct BrdColReflect101 template <typename D> struct BrdColReflect101
{ {
typedef D result_type; typedef D result_type;
explicit __host__ __device__ __forceinline__ BrdColReflect101(int height) : last_row(height - 1) {} explicit __host__ __device__ __forceinline__ BrdColReflect101(int height) : last_row(height - 1) {}
@ -340,10 +341,10 @@ namespace cv { namespace gpu { namespace device
} }
const int last_row; const int last_row;
}; };
template <typename D> struct BrdReflect101 template <typename D> struct BrdReflect101
{ {
typedef D result_type; typedef D result_type;
__host__ __device__ __forceinline__ BrdReflect101(int height, int width) : last_row(height - 1), last_col(width - 1) {} __host__ __device__ __forceinline__ BrdReflect101(int height, int width) : last_row(height - 1), last_col(width - 1) {}
@ -391,13 +392,13 @@ namespace cv { namespace gpu { namespace device
const int last_row; const int last_row;
const int last_col; const int last_col;
}; };
////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////
// BrdReflect // BrdReflect
template <typename D> struct BrdRowReflect template <typename D> struct BrdRowReflect
{ {
typedef D result_type; typedef D result_type;
explicit __host__ __device__ __forceinline__ BrdRowReflect(int width) : last_col(width - 1) {} explicit __host__ __device__ __forceinline__ BrdRowReflect(int width) : last_col(width - 1) {}
@ -434,10 +435,10 @@ namespace cv { namespace gpu { namespace device
} }
const int last_col; const int last_col;
}; };
template <typename D> struct BrdColReflect template <typename D> struct BrdColReflect
{ {
typedef D result_type; typedef D result_type;
explicit __host__ __device__ __forceinline__ BrdColReflect(int height) : last_row(height - 1) {} explicit __host__ __device__ __forceinline__ BrdColReflect(int height) : last_row(height - 1) {}
@ -474,10 +475,10 @@ namespace cv { namespace gpu { namespace device
} }
const int last_row; const int last_row;
}; };
template <typename D> struct BrdReflect template <typename D> struct BrdReflect
{ {
typedef D result_type; typedef D result_type;
__host__ __device__ __forceinline__ BrdReflect(int height, int width) : last_row(height - 1), last_col(width - 1) {} __host__ __device__ __forceinline__ BrdReflect(int height, int width) : last_row(height - 1), last_col(width - 1) {}
@ -525,13 +526,13 @@ namespace cv { namespace gpu { namespace device
const int last_row; const int last_row;
const int last_col; const int last_col;
}; };
////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////
// BrdWrap // BrdWrap
template <typename D> struct BrdRowWrap template <typename D> struct BrdRowWrap
{ {
typedef D result_type; typedef D result_type;
explicit __host__ __device__ __forceinline__ BrdRowWrap(int width_) : width(width_) {} explicit __host__ __device__ __forceinline__ BrdRowWrap(int width_) : width(width_) {}
@ -568,10 +569,10 @@ namespace cv { namespace gpu { namespace device
} }
const int width; const int width;
}; };
template <typename D> struct BrdColWrap template <typename D> struct BrdColWrap
{ {
typedef D result_type; typedef D result_type;
explicit __host__ __device__ __forceinline__ BrdColWrap(int height_) : height(height_) {} explicit __host__ __device__ __forceinline__ BrdColWrap(int height_) : height(height_) {}
@ -608,10 +609,10 @@ namespace cv { namespace gpu { namespace device
} }
const int height; const int height;
}; };
template <typename D> struct BrdWrap template <typename D> struct BrdWrap
{ {
typedef D result_type; typedef D result_type;
__host__ __device__ __forceinline__ BrdWrap(int height_, int width_) : __host__ __device__ __forceinline__ BrdWrap(int height_, int width_) :
@ -666,13 +667,13 @@ namespace cv { namespace gpu { namespace device
const int height; const int height;
const int width; const int width;
}; };
////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////
// BorderReader // BorderReader
template <typename Ptr2D, typename B> struct BorderReader template <typename Ptr2D, typename B> struct BorderReader
{ {
typedef typename B::result_type elem_type; typedef typename B::result_type elem_type;
typedef typename Ptr2D::index_type index_type; typedef typename Ptr2D::index_type index_type;
@ -685,12 +686,12 @@ namespace cv { namespace gpu { namespace device
const Ptr2D ptr; const Ptr2D ptr;
const B b; const B b;
}; };
// under win32 there is some bug with templated types that passed as kernel parameters // under win32 there is some bug with templated types that passed as kernel parameters
// with this specialization all works fine // with this specialization all works fine
template <typename Ptr2D, typename D> struct BorderReader< Ptr2D, BrdConstant<D> > template <typename Ptr2D, typename D> struct BorderReader< Ptr2D, BrdConstant<D> >
{ {
typedef typename BrdConstant<D>::result_type elem_type; typedef typename BrdConstant<D>::result_type elem_type;
typedef typename Ptr2D::index_type index_type; typedef typename Ptr2D::index_type index_type;
@ -708,7 +709,8 @@ namespace cv { namespace gpu { namespace device
const int height; const int height;
const int width; const int width;
const D val; const D val;
}; };
}}}
END_OPENCV_DEVICE_NAMESPACE
#endif // __OPENCV_GPU_BORDER_INTERPOLATE_HPP__ #endif // __OPENCV_GPU_BORDER_INTERPOLATE_HPP__
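Note: the border functors and BorderReader above are meant to be composed in device code. A minimal sketch, assuming the reader is constructed from (ptr, border) and indexed with operator()(y, x), the way the filter kernels in this module use it (the function name is illustrative):

    template <typename Ptr2D>
    __device__ float sampleReplicated(const Ptr2D& src, int rows, int cols, int y, int x)
    {
        BrdReplicate<float> brd(rows, cols);                  // clamps out-of-range coordinates
        BorderReader<Ptr2D, BrdReplicate<float> > reader(src, brd);
        return reader(y, x);                                  // safe even for y/x outside the image
    }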
@ -43,179 +43,181 @@
#ifndef __OPENCV_GPU_COLOR_HPP__ #ifndef __OPENCV_GPU_COLOR_HPP__
#define __OPENCV_GPU_COLOR_HPP__ #define __OPENCV_GPU_COLOR_HPP__
#include "internal_shared.hpp"
#include "detail/color_detail.hpp" #include "detail/color_detail.hpp"
namespace cv { namespace gpu { namespace device BEGIN_OPENCV_DEVICE_NAMESPACE
{
// All OPENCV_GPU_IMPLEMENT_*_TRAITS(ColorSpace1_to_ColorSpace2, ...) macros implements
// template <typename T> class ColorSpace1_to_ColorSpace2_traits
// {
// typedef ... functor_type;
// static __host__ __device__ functor_type create_functor();
// };
OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS(bgr_to_rgb, 3, 3, 2) // All OPENCV_GPU_IMPLEMENT_*_TRAITS(ColorSpace1_to_ColorSpace2, ...) macros implements
OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS(bgr_to_bgra, 3, 4, 0) // template <typename T> class ColorSpace1_to_ColorSpace2_traits
OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS(bgr_to_rgba, 3, 4, 2) // {
OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS(bgra_to_bgr, 4, 3, 0) // typedef ... functor_type;
OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS(bgra_to_rgb, 4, 3, 2) // static __host__ __device__ functor_type create_functor();
OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS(bgra_to_rgba, 4, 4, 2) // };
OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS(bgr_to_rgb, 3, 3, 2)
OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS(bgr_to_bgra, 3, 4, 0)
OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS(bgr_to_rgba, 3, 4, 2)
OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS(bgra_to_bgr, 4, 3, 0)
OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS(bgra_to_rgb, 4, 3, 2)
OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS(bgra_to_rgba, 4, 4, 2)
#undef OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS #undef OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS
OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(bgr_to_bgr555, 3, 0, 5) OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(bgr_to_bgr555, 3, 0, 5)
OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(bgr_to_bgr565, 3, 0, 6) OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(bgr_to_bgr565, 3, 0, 6)
OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(rgb_to_bgr555, 3, 2, 5) OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(rgb_to_bgr555, 3, 2, 5)
OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(rgb_to_bgr565, 3, 2, 6) OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(rgb_to_bgr565, 3, 2, 6)
OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(bgra_to_bgr555, 4, 0, 5) OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(bgra_to_bgr555, 4, 0, 5)
OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(bgra_to_bgr565, 4, 0, 6) OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(bgra_to_bgr565, 4, 0, 6)
OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(rgba_to_bgr555, 4, 2, 5) OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(rgba_to_bgr555, 4, 2, 5)
OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(rgba_to_bgr565, 4, 2, 6) OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(rgba_to_bgr565, 4, 2, 6)
#undef OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS #undef OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS
OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr555_to_rgb, 3, 2, 5) OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr555_to_rgb, 3, 2, 5)
OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr565_to_rgb, 3, 2, 6) OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr565_to_rgb, 3, 2, 6)
OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr555_to_bgr, 3, 0, 5) OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr555_to_bgr, 3, 0, 5)
OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr565_to_bgr, 3, 0, 6) OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr565_to_bgr, 3, 0, 6)
OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr555_to_rgba, 4, 2, 5) OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr555_to_rgba, 4, 2, 5)
OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr565_to_rgba, 4, 2, 6) OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr565_to_rgba, 4, 2, 6)
OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr555_to_bgra, 4, 0, 5) OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr555_to_bgra, 4, 0, 5)
OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr565_to_bgra, 4, 0, 6) OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr565_to_bgra, 4, 0, 6)
#undef OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS #undef OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS
OPENCV_GPU_IMPLEMENT_GRAY2RGB_TRAITS(gray_to_bgr, 3) OPENCV_GPU_IMPLEMENT_GRAY2RGB_TRAITS(gray_to_bgr, 3)
OPENCV_GPU_IMPLEMENT_GRAY2RGB_TRAITS(gray_to_bgra, 4) OPENCV_GPU_IMPLEMENT_GRAY2RGB_TRAITS(gray_to_bgra, 4)
#undef OPENCV_GPU_IMPLEMENT_GRAY2RGB_TRAITS #undef OPENCV_GPU_IMPLEMENT_GRAY2RGB_TRAITS
OPENCV_GPU_IMPLEMENT_GRAY2RGB5x5_TRAITS(gray_to_bgr555, 5) OPENCV_GPU_IMPLEMENT_GRAY2RGB5x5_TRAITS(gray_to_bgr555, 5)
OPENCV_GPU_IMPLEMENT_GRAY2RGB5x5_TRAITS(gray_to_bgr565, 6) OPENCV_GPU_IMPLEMENT_GRAY2RGB5x5_TRAITS(gray_to_bgr565, 6)
#undef OPENCV_GPU_IMPLEMENT_GRAY2RGB5x5_TRAITS #undef OPENCV_GPU_IMPLEMENT_GRAY2RGB5x5_TRAITS
OPENCV_GPU_IMPLEMENT_RGB5x52GRAY_TRAITS(bgr555_to_gray, 5) OPENCV_GPU_IMPLEMENT_RGB5x52GRAY_TRAITS(bgr555_to_gray, 5)
OPENCV_GPU_IMPLEMENT_RGB5x52GRAY_TRAITS(bgr565_to_gray, 6) OPENCV_GPU_IMPLEMENT_RGB5x52GRAY_TRAITS(bgr565_to_gray, 6)
#undef OPENCV_GPU_IMPLEMENT_RGB5x52GRAY_TRAITS #undef OPENCV_GPU_IMPLEMENT_RGB5x52GRAY_TRAITS
OPENCV_GPU_IMPLEMENT_RGB2GRAY_TRAITS(rgb_to_gray, 3, 2) OPENCV_GPU_IMPLEMENT_RGB2GRAY_TRAITS(rgb_to_gray, 3, 2)
OPENCV_GPU_IMPLEMENT_RGB2GRAY_TRAITS(bgr_to_gray, 3, 0) OPENCV_GPU_IMPLEMENT_RGB2GRAY_TRAITS(bgr_to_gray, 3, 0)
OPENCV_GPU_IMPLEMENT_RGB2GRAY_TRAITS(rgba_to_gray, 4, 2) OPENCV_GPU_IMPLEMENT_RGB2GRAY_TRAITS(rgba_to_gray, 4, 2)
OPENCV_GPU_IMPLEMENT_RGB2GRAY_TRAITS(bgra_to_gray, 4, 0) OPENCV_GPU_IMPLEMENT_RGB2GRAY_TRAITS(bgra_to_gray, 4, 0)
#undef OPENCV_GPU_IMPLEMENT_RGB2GRAY_TRAITS #undef OPENCV_GPU_IMPLEMENT_RGB2GRAY_TRAITS
OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(rgb_to_yuv, 3, 3, 0) OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(rgb_to_yuv, 3, 3, 0)
OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(rgba_to_yuv, 4, 3, 0) OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(rgba_to_yuv, 4, 3, 0)
OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(rgb_to_yuv4, 3, 4, 0) OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(rgb_to_yuv4, 3, 4, 0)
OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(rgba_to_yuv4, 4, 4, 0) OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(rgba_to_yuv4, 4, 4, 0)
OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(bgr_to_yuv, 3, 3, 2) OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(bgr_to_yuv, 3, 3, 2)
OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(bgra_to_yuv, 4, 3, 2) OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(bgra_to_yuv, 4, 3, 2)
OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(bgr_to_yuv4, 3, 4, 2) OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(bgr_to_yuv4, 3, 4, 2)
OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(bgra_to_yuv4, 4, 4, 2) OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(bgra_to_yuv4, 4, 4, 2)
#undef OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS #undef OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS
OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv_to_rgb, 3, 3, 0) OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv_to_rgb, 3, 3, 0)
OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv_to_rgba, 3, 4, 0) OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv_to_rgba, 3, 4, 0)
OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv4_to_rgb, 4, 3, 0) OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv4_to_rgb, 4, 3, 0)
OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv4_to_rgba, 4, 4, 0) OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv4_to_rgba, 4, 4, 0)
OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv_to_bgr, 3, 3, 2) OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv_to_bgr, 3, 3, 2)
OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv_to_bgra, 3, 4, 2) OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv_to_bgra, 3, 4, 2)
OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv4_to_bgr, 4, 3, 2) OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv4_to_bgr, 4, 3, 2)
OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv4_to_bgra, 4, 4, 2) OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv4_to_bgra, 4, 4, 2)
#undef OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS #undef OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS
OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(rgb_to_YCrCb, 3, 3, 2) OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(rgb_to_YCrCb, 3, 3, 2)
OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(rgba_to_YCrCb, 4, 3, 2) OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(rgba_to_YCrCb, 4, 3, 2)
OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(rgb_to_YCrCb4, 3, 4, 2) OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(rgb_to_YCrCb4, 3, 4, 2)
OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(rgba_to_YCrCb4, 4, 4, 2) OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(rgba_to_YCrCb4, 4, 4, 2)
OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(bgr_to_YCrCb, 3, 3, 0) OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(bgr_to_YCrCb, 3, 3, 0)
OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(bgra_to_YCrCb, 4, 3, 0) OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(bgra_to_YCrCb, 4, 3, 0)
OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(bgr_to_YCrCb4, 3, 4, 0) OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(bgr_to_YCrCb4, 3, 4, 0)
OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(bgra_to_YCrCb4, 4, 4, 0) OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(bgra_to_YCrCb4, 4, 4, 0)
#undef OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS #undef OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS
OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb_to_rgb, 3, 3, 2) OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb_to_rgb, 3, 3, 2)
OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb_to_rgba, 3, 4, 2) OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb_to_rgba, 3, 4, 2)
OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb4_to_rgb, 4, 3, 2) OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb4_to_rgb, 4, 3, 2)
OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb4_to_rgba, 4, 4, 2) OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb4_to_rgba, 4, 4, 2)
OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb_to_bgr, 3, 3, 0) OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb_to_bgr, 3, 3, 0)
OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb_to_bgra, 3, 4, 0) OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb_to_bgra, 3, 4, 0)
OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb4_to_bgr, 4, 3, 0) OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb4_to_bgr, 4, 3, 0)
OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb4_to_bgra, 4, 4, 0) OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb4_to_bgra, 4, 4, 0)
#undef OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS #undef OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS
OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(rgb_to_xyz, 3, 3, 2) OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(rgb_to_xyz, 3, 3, 2)
OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(rgba_to_xyz, 4, 3, 2) OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(rgba_to_xyz, 4, 3, 2)
OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(rgb_to_xyz4, 3, 4, 2) OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(rgb_to_xyz4, 3, 4, 2)
OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(rgba_to_xyz4, 4, 4, 2) OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(rgba_to_xyz4, 4, 4, 2)
OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(bgr_to_xyz, 3, 3, 0) OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(bgr_to_xyz, 3, 3, 0)
OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(bgra_to_xyz, 4, 3, 0) OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(bgra_to_xyz, 4, 3, 0)
OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(bgr_to_xyz4, 3, 4, 0) OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(bgr_to_xyz4, 3, 4, 0)
OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(bgra_to_xyz4, 4, 4, 0) OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(bgra_to_xyz4, 4, 4, 0)
#undef OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS #undef OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS
OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz_to_rgb, 3, 3, 2) OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz_to_rgb, 3, 3, 2)
OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz4_to_rgb, 4, 3, 2) OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz4_to_rgb, 4, 3, 2)
OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz_to_rgba, 3, 4, 2) OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz_to_rgba, 3, 4, 2)
OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz4_to_rgba, 4, 4, 2) OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz4_to_rgba, 4, 4, 2)
OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz_to_bgr, 3, 3, 0) OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz_to_bgr, 3, 3, 0)
OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz4_to_bgr, 4, 3, 0) OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz4_to_bgr, 4, 3, 0)
OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz_to_bgra, 3, 4, 0) OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz_to_bgra, 3, 4, 0)
OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz4_to_bgra, 4, 4, 0) OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz4_to_bgra, 4, 4, 0)
#undef OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS #undef OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS
OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(rgb_to_hsv, 3, 3, 2) OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(rgb_to_hsv, 3, 3, 2)
OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(rgba_to_hsv, 4, 3, 2) OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(rgba_to_hsv, 4, 3, 2)
OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(rgb_to_hsv4, 3, 4, 2) OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(rgb_to_hsv4, 3, 4, 2)
OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(rgba_to_hsv4, 4, 4, 2) OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(rgba_to_hsv4, 4, 4, 2)
OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(bgr_to_hsv, 3, 3, 0) OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(bgr_to_hsv, 3, 3, 0)
OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(bgra_to_hsv, 4, 3, 0) OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(bgra_to_hsv, 4, 3, 0)
OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(bgr_to_hsv4, 3, 4, 0) OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(bgr_to_hsv4, 3, 4, 0)
OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(bgra_to_hsv4, 4, 4, 0) OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(bgra_to_hsv4, 4, 4, 0)
#undef OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS #undef OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS
OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv_to_rgb, 3, 3, 2) OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv_to_rgb, 3, 3, 2)
OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv_to_rgba, 3, 4, 2) OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv_to_rgba, 3, 4, 2)
OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv4_to_rgb, 4, 3, 2) OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv4_to_rgb, 4, 3, 2)
OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv4_to_rgba, 4, 4, 2) OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv4_to_rgba, 4, 4, 2)
OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv_to_bgr, 3, 3, 0) OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv_to_bgr, 3, 3, 0)
OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv_to_bgra, 3, 4, 0) OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv_to_bgra, 3, 4, 0)
OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv4_to_bgr, 4, 3, 0) OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv4_to_bgr, 4, 3, 0)
OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv4_to_bgra, 4, 4, 0) OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv4_to_bgra, 4, 4, 0)
#undef OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS #undef OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS
OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(rgb_to_hls, 3, 3, 2) OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(rgb_to_hls, 3, 3, 2)
OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(rgba_to_hls, 4, 3, 2) OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(rgba_to_hls, 4, 3, 2)
OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(rgb_to_hls4, 3, 4, 2) OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(rgb_to_hls4, 3, 4, 2)
OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(rgba_to_hls4, 4, 4, 2) OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(rgba_to_hls4, 4, 4, 2)
OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(bgr_to_hls, 3, 3, 0) OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(bgr_to_hls, 3, 3, 0)
OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(bgra_to_hls, 4, 3, 0) OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(bgra_to_hls, 4, 3, 0)
OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(bgr_to_hls4, 3, 4, 0) OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(bgr_to_hls4, 3, 4, 0)
OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(bgra_to_hls4, 4, 4, 0) OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(bgra_to_hls4, 4, 4, 0)
#undef OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS #undef OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS
OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls_to_rgb, 3, 3, 2) OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls_to_rgb, 3, 3, 2)
OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls_to_rgba, 3, 4, 2) OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls_to_rgba, 3, 4, 2)
OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls4_to_rgb, 4, 3, 2) OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls4_to_rgb, 4, 3, 2)
OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls4_to_rgba, 4, 4, 2) OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls4_to_rgba, 4, 4, 2)
OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls_to_bgr, 3, 3, 0) OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls_to_bgr, 3, 3, 0)
OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls_to_bgra, 3, 4, 0) OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls_to_bgra, 3, 4, 0)
OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls4_to_bgr, 4, 3, 0) OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls4_to_bgr, 4, 3, 0)
OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls4_to_bgra, 4, 4, 0) OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls4_to_bgra, 4, 4, 0)
#undef OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS #undef OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS
}}}
END_OPENCV_DEVICE_NAMESPACE
#endif // __OPENCV_GPU_COLOR_HPP__ #endif // __OPENCV_GPU_COLOR_HPP__
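Note: per the comment at the top of this header, each OPENCV_GPU_IMPLEMENT_*_TRAITS line expands to a small traits class with a nested functor_type and a create_functor() factory. A hedged device-side sketch of one generated instance (the kernel and variable names are illustrative, and the device namespace is assumed to be in scope):

    __global__ void convertOnePixel(uchar3 bgr, uchar4* out)
    {
        typedef bgr_to_rgba_traits<unsigned char>::functor_type Cvt;
        Cvt cvt = bgr_to_rgba_traits<unsigned char>::create_functor();
        *out = cvt(bgr);   // reorders B,G,R into R,G,B and fills the fourth channel
    }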
@ -45,6 +45,8 @@
#include "internal_shared.hpp" #include "internal_shared.hpp"
BEGIN_OPENCV_DEVICE_NAMESPACE
#if defined(_WIN64) || defined(__LP64__) #if defined(_WIN64) || defined(__LP64__)
// 64-bit register modifier for inlined asm // 64-bit register modifier for inlined asm
#define OPENCV_GPU_ASM_PTR "l" #define OPENCV_GPU_ASM_PTR "l"
@ -53,8 +55,6 @@
#define OPENCV_GPU_ASM_PTR "r" #define OPENCV_GPU_ASM_PTR "r"
#endif #endif
namespace cv { namespace gpu { namespace device
{
#if __CUDA_ARCH__ >= 200 #if __CUDA_ARCH__ >= 200
// for Fermi memory space is detected automatically // for Fermi memory space is detected automatically
@ -99,6 +99,7 @@ namespace cv { namespace gpu { namespace device
#undef OPENCV_GPU_DEFINE_FORCE_GLOB_B #undef OPENCV_GPU_DEFINE_FORCE_GLOB_B
#endif // __CUDA_ARCH__ >= 200 #endif // __CUDA_ARCH__ >= 200
}}}
END_OPENCV_DEVICE_NAMESPACE
#endif // __OPENCV_GPU_DATAMOV_UTILS_HPP__ #endif // __OPENCV_GPU_DATAMOV_UTILS_HPP__
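Note: OPENCV_GPU_ASM_PTR above selects the PTX constraint letter for pointer operands ("l" for 64-bit builds, "r" for 32-bit) so the same inline-asm load compiles in either address mode. Illustrative sketch only, not taken from this header:

    __device__ __forceinline__ void loadGlobal(const int* ptr, int& val)
    {
        asm volatile ("ld.global.s32 %0, [%1];" : "=r"(val) : OPENCV_GPU_ASM_PTR(ptr));
    }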
@ -43,19 +43,20 @@
#ifndef __OPENCV_GPU_COLOR_DETAIL_HPP__ #ifndef __OPENCV_GPU_COLOR_DETAIL_HPP__
#define __OPENCV_GPU_COLOR_DETAIL_HPP__ #define __OPENCV_GPU_COLOR_DETAIL_HPP__
#include "internal_shared.hpp"
#include "../vec_traits.hpp" #include "../vec_traits.hpp"
#include "../saturate_cast.hpp" #include "../saturate_cast.hpp"
#include "../limits.hpp" #include "../limits.hpp"
#include "../functional.hpp" #include "../functional.hpp"
BEGIN_OPENCV_DEVICE_NAMESPACE
#ifndef CV_DESCALE #ifndef CV_DESCALE
#define CV_DESCALE(x, n) (((x) + (1 << ((n)-1))) >> (n)) #define CV_DESCALE(x, n) (((x) + (1 << ((n)-1))) >> (n))
#endif #endif
namespace cv { namespace gpu { namespace device namespace detail
{ {
namespace detail
{
template<typename T> struct ColorChannel template<typename T> struct ColorChannel
{ {
typedef float worktype_f; typedef float worktype_f;
@ -94,12 +95,12 @@ namespace cv { namespace gpu { namespace device
B2Y = 1868, B2Y = 1868,
BLOCK_SIZE = 256 BLOCK_SIZE = 256
}; };
} }
////////////////// Various 3/4-channel to 3/4-channel RGB transformations ///////////////// ////////////////// Various 3/4-channel to 3/4-channel RGB transformations /////////////////
namespace detail namespace detail
{ {
template <typename T, int scn, int dcn, int bidx> struct RGB2RGB : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type> template <typename T, int scn, int dcn, int bidx> struct RGB2RGB : unary_function<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type>
{ {
__device__ typename TypeVec<T, dcn>::vec_type operator()(const typename TypeVec<T, scn>::vec_type& src) const __device__ typename TypeVec<T, dcn>::vec_type operator()(const typename TypeVec<T, scn>::vec_type& src) const
@ -129,7 +130,7 @@ namespace cv { namespace gpu { namespace device
return dst; return dst;
} }
}; };
} }
#define OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS(name, scn, dcn, bidx) \ #define OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS(name, scn, dcn, bidx) \
template <typename T> struct name ## _traits \ template <typename T> struct name ## _traits \
@ -143,8 +144,8 @@ namespace cv { namespace gpu { namespace device
/////////// Transforming 16-bit (565 or 555) RGB to/from 24/32-bit (888[8]) RGB ////////// /////////// Transforming 16-bit (565 or 555) RGB to/from 24/32-bit (888[8]) RGB //////////
namespace detail namespace detail
{ {
template <int green_bits, int bidx> struct RGB2RGB5x5Converter; template <int green_bits, int bidx> struct RGB2RGB5x5Converter;
template<int bidx> struct RGB2RGB5x5Converter<6, bidx> template<int bidx> struct RGB2RGB5x5Converter<6, bidx>
{ {
@ -191,7 +192,7 @@ namespace cv { namespace gpu { namespace device
return RGB2RGB5x5Converter<green_bits, bidx>::cvt(src); return RGB2RGB5x5Converter<green_bits, bidx>::cvt(src);
} }
}; };
} }
#define OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(name, scn, bidx, green_bits) \ #define OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(name, scn, bidx, green_bits) \
struct name ## _traits \ struct name ## _traits \
@ -203,8 +204,8 @@ namespace cv { namespace gpu { namespace device
} \ } \
}; };
namespace detail namespace detail
{ {
template <int green_bits, int bidx> struct RGB5x52RGBConverter; template <int green_bits, int bidx> struct RGB5x52RGBConverter;
template <int bidx> struct RGB5x52RGBConverter<5, bidx> template <int bidx> struct RGB5x52RGBConverter<5, bidx>
{ {
@ -261,7 +262,7 @@ namespace cv { namespace gpu { namespace device
return dst; return dst;
} }
}; };
} }
#define OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(name, dcn, bidx, green_bits) \ #define OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(name, dcn, bidx, green_bits) \
struct name ## _traits \ struct name ## _traits \
@ -275,8 +276,8 @@ namespace cv { namespace gpu { namespace device
///////////////////////////////// Grayscale to Color //////////////////////////////// ///////////////////////////////// Grayscale to Color ////////////////////////////////
namespace detail namespace detail
{ {
template <typename T, int dcn> struct Gray2RGB : unary_function<T, typename TypeVec<T, dcn>::vec_type> template <typename T, int dcn> struct Gray2RGB : unary_function<T, typename TypeVec<T, dcn>::vec_type>
{ {
__device__ __forceinline__ typename TypeVec<T, dcn>::vec_type operator()(T src) const __device__ __forceinline__ typename TypeVec<T, dcn>::vec_type operator()(T src) const
@ -302,7 +303,7 @@ namespace cv { namespace gpu { namespace device
return dst; return dst;
} }
}; };
} }
#define OPENCV_GPU_IMPLEMENT_GRAY2RGB_TRAITS(name, dcn) \ #define OPENCV_GPU_IMPLEMENT_GRAY2RGB_TRAITS(name, dcn) \
template <typename T> struct name ## _traits \ template <typename T> struct name ## _traits \
@ -314,8 +315,8 @@ namespace cv { namespace gpu { namespace device
} \ } \
}; };
namespace detail namespace detail
{ {
template <int green_bits> struct Gray2RGB5x5Converter; template <int green_bits> struct Gray2RGB5x5Converter;
template<> struct Gray2RGB5x5Converter<6> template<> struct Gray2RGB5x5Converter<6>
{ {
@ -340,7 +341,7 @@ namespace cv { namespace gpu { namespace device
return Gray2RGB5x5Converter<green_bits>::cvt(src); return Gray2RGB5x5Converter<green_bits>::cvt(src);
} }
}; };
} }
#define OPENCV_GPU_IMPLEMENT_GRAY2RGB5x5_TRAITS(name, green_bits) \ #define OPENCV_GPU_IMPLEMENT_GRAY2RGB5x5_TRAITS(name, green_bits) \
struct name ## _traits \ struct name ## _traits \
@ -354,8 +355,8 @@ namespace cv { namespace gpu { namespace device
///////////////////////////////// Color to Grayscale //////////////////////////////// ///////////////////////////////// Color to Grayscale ////////////////////////////////
namespace detail namespace detail
{ {
template <int green_bits> struct RGB5x52GrayConverter; template <int green_bits> struct RGB5x52GrayConverter;
template <> struct RGB5x52GrayConverter<6> template <> struct RGB5x52GrayConverter<6>
{ {
@ -379,7 +380,7 @@ namespace cv { namespace gpu { namespace device
return RGB5x52GrayConverter<green_bits>::cvt(src); return RGB5x52GrayConverter<green_bits>::cvt(src);
} }
}; };
} }
#define OPENCV_GPU_IMPLEMENT_RGB5x52GRAY_TRAITS(name, green_bits) \ #define OPENCV_GPU_IMPLEMENT_RGB5x52GRAY_TRAITS(name, green_bits) \
struct name ## _traits \ struct name ## _traits \
@ -391,8 +392,8 @@ namespace cv { namespace gpu { namespace device
} \ } \
}; };
namespace detail namespace detail
{ {
template <int bidx, typename T> static __device__ __forceinline__ T RGB2GrayConvert(const T* src) template <int bidx, typename T> static __device__ __forceinline__ T RGB2GrayConvert(const T* src)
{ {
return (T)CV_DESCALE((unsigned)(src[bidx] * B2Y + src[1] * G2Y + src[bidx^2] * R2Y), yuv_shift); return (T)CV_DESCALE((unsigned)(src[bidx] * B2Y + src[1] * G2Y + src[bidx^2] * R2Y), yuv_shift);
@ -423,7 +424,7 @@ namespace cv { namespace gpu { namespace device
return RGB2GrayConvert<bidx>(src); return RGB2GrayConvert<bidx>(src);
} }
}; };
} }
#define OPENCV_GPU_IMPLEMENT_RGB2GRAY_TRAITS(name, scn, bidx) \ #define OPENCV_GPU_IMPLEMENT_RGB2GRAY_TRAITS(name, scn, bidx) \
template <typename T> struct name ## _traits \ template <typename T> struct name ## _traits \
@ -437,8 +438,8 @@ namespace cv { namespace gpu { namespace device
///////////////////////////////////// RGB <-> YUV ////////////////////////////////////// ///////////////////////////////////// RGB <-> YUV //////////////////////////////////////
namespace detail namespace detail
{ {
__constant__ float c_RGB2YUVCoeffs_f[5] = { 0.114f, 0.587f, 0.299f, 0.492f, 0.877f }; __constant__ float c_RGB2YUVCoeffs_f[5] = { 0.114f, 0.587f, 0.299f, 0.492f, 0.877f };
__constant__ int c_RGB2YUVCoeffs_i[5] = { B2Y, G2Y, R2Y, 8061, 14369 }; __constant__ int c_RGB2YUVCoeffs_i[5] = { B2Y, G2Y, R2Y, 8061, 14369 };
@ -493,7 +494,7 @@ namespace cv { namespace gpu { namespace device
return RGB2YUVConvert<bidx>(src); return RGB2YUVConvert<bidx>(src);
} }
}; };
} }
#define OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(name, scn, dcn, bidx) \ #define OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(name, scn, dcn, bidx) \
template <typename T> struct name ## _traits \ template <typename T> struct name ## _traits \
@ -505,8 +506,8 @@ namespace cv { namespace gpu { namespace device
} \ } \
}; };
namespace detail namespace detail
{ {
__constant__ float c_YUV2RGBCoeffs_f[5] = { 2.032f, -0.395f, -0.581f, 1.140f }; __constant__ float c_YUV2RGBCoeffs_f[5] = { 2.032f, -0.395f, -0.581f, 1.140f };
__constant__ int c_YUV2RGBCoeffs_i[5] = { 33292, -6472, -9519, 18678 }; __constant__ int c_YUV2RGBCoeffs_i[5] = { 33292, -6472, -9519, 18678 };
@ -564,7 +565,7 @@ namespace cv { namespace gpu { namespace device
return YUV2RGBConvert<bidx>(src); return YUV2RGBConvert<bidx>(src);
} }
}; };
} }
#define OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(name, scn, dcn, bidx) \ #define OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(name, scn, dcn, bidx) \
template <typename T> struct name ## _traits \ template <typename T> struct name ## _traits \
@ -578,8 +579,8 @@ namespace cv { namespace gpu { namespace device
///////////////////////////////////// RGB <-> YCrCb ////////////////////////////////////// ///////////////////////////////////// RGB <-> YCrCb //////////////////////////////////////
namespace detail namespace detail
{ {
__constant__ float c_RGB2YCrCbCoeffs_f[5] = {0.299f, 0.587f, 0.114f, 0.713f, 0.564f}; __constant__ float c_RGB2YCrCbCoeffs_f[5] = {0.299f, 0.587f, 0.114f, 0.713f, 0.564f};
__constant__ int c_RGB2YCrCbCoeffs_i[5] = {R2Y, G2Y, B2Y, 11682, 9241}; __constant__ int c_RGB2YCrCbCoeffs_i[5] = {R2Y, G2Y, B2Y, 11682, 9241};
@ -634,7 +635,7 @@ namespace cv { namespace gpu { namespace device
return RGB2YCrCbConvert<bidx>(src); return RGB2YCrCbConvert<bidx>(src);
} }
}; };
} }
#define OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(name, scn, dcn, bidx) \ #define OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(name, scn, dcn, bidx) \
template <typename T> struct name ## _traits \ template <typename T> struct name ## _traits \
@ -646,8 +647,8 @@ namespace cv { namespace gpu { namespace device
} \ } \
}; };
namespace detail namespace detail
{ {
__constant__ float c_YCrCb2RGBCoeffs_f[5] = {1.403f, -0.714f, -0.344f, 1.773f}; __constant__ float c_YCrCb2RGBCoeffs_f[5] = {1.403f, -0.714f, -0.344f, 1.773f};
__constant__ int c_YCrCb2RGBCoeffs_i[5] = {22987, -11698, -5636, 29049}; __constant__ int c_YCrCb2RGBCoeffs_i[5] = {22987, -11698, -5636, 29049};
@ -705,7 +706,7 @@ namespace cv { namespace gpu { namespace device
return YCrCb2RGBConvert<bidx>(src); return YCrCb2RGBConvert<bidx>(src);
} }
}; };
} }
#define OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(name, scn, dcn, bidx) \ #define OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(name, scn, dcn, bidx) \
template <typename T> struct name ## _traits \ template <typename T> struct name ## _traits \
@ -719,8 +720,8 @@ namespace cv { namespace gpu { namespace device
////////////////////////////////////// RGB <-> XYZ /////////////////////////////////////// ////////////////////////////////////// RGB <-> XYZ ///////////////////////////////////////
namespace detail namespace detail
{ {
__constant__ float c_RGB2XYZ_D65f[9] = { 0.412453f, 0.357580f, 0.180423f, 0.212671f, 0.715160f, 0.072169f, 0.019334f, 0.119193f, 0.950227f }; __constant__ float c_RGB2XYZ_D65f[9] = { 0.412453f, 0.357580f, 0.180423f, 0.212671f, 0.715160f, 0.072169f, 0.019334f, 0.119193f, 0.950227f };
__constant__ int c_RGB2XYZ_D65i[9] = { 1689, 1465, 739, 871, 2929, 296, 79, 488, 3892 }; __constant__ int c_RGB2XYZ_D65i[9] = { 1689, 1465, 739, 871, 2929, 296, 79, 488, 3892 };
@ -773,7 +774,7 @@ namespace cv { namespace gpu { namespace device
return RGB2XYZConvert<bidx>(src); return RGB2XYZConvert<bidx>(src);
} }
}; };
} }
#define OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(name, scn, dcn, bidx) \ #define OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(name, scn, dcn, bidx) \
template <typename T> struct name ## _traits \ template <typename T> struct name ## _traits \
@ -785,8 +786,8 @@ namespace cv { namespace gpu { namespace device
} \ } \
}; };
namespace detail namespace detail
{ {
__constant__ float c_XYZ2sRGB_D65f[9] = { 3.240479f, -1.53715f, -0.498535f, -0.969256f, 1.875991f, 0.041556f, 0.055648f, -0.204043f, 1.057311f }; __constant__ float c_XYZ2sRGB_D65f[9] = { 3.240479f, -1.53715f, -0.498535f, -0.969256f, 1.875991f, 0.041556f, 0.055648f, -0.204043f, 1.057311f };
__constant__ int c_XYZ2sRGB_D65i[9] = { 13273, -6296, -2042, -3970, 7684, 170, 228, -836, 4331 }; __constant__ int c_XYZ2sRGB_D65i[9] = { 13273, -6296, -2042, -3970, 7684, 170, 228, -836, 4331 };
@ -840,7 +841,7 @@ namespace cv { namespace gpu { namespace device
return XYZ2RGBConvert<bidx>(src); return XYZ2RGBConvert<bidx>(src);
} }
}; };
} }
#define OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(name, scn, dcn, bidx) \ #define OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(name, scn, dcn, bidx) \
template <typename T> struct name ## _traits \ template <typename T> struct name ## _traits \
@ -854,8 +855,8 @@ namespace cv { namespace gpu { namespace device
////////////////////////////////////// RGB <-> HSV /////////////////////////////////////// ////////////////////////////////////// RGB <-> HSV ///////////////////////////////////////
namespace detail namespace detail
{ {
__constant__ int c_HsvDivTable [256] = {0, 1044480, 522240, 348160, 261120, 208896, 174080, 149211, 130560, 116053, 104448, 94953, 87040, 80345, 74606, 69632, 65280, 61440, 58027, 54973, 52224, 49737, 47476, 45412, 43520, 41779, 40172, 38684, 37303, 36017, 34816, 33693, 32640, 31651, 30720, 29842, 29013, 28229, 27486, 26782, 26112, 25475, 24869, 24290, 23738, 23211, 22706, 22223, 21760, 21316, 20890, 20480, 20086, 19707, 19342, 18991, 18651, 18324, 18008, 17703, 17408, 17123, 16846, 16579, 16320, 16069, 15825, 15589, 15360, 15137, 14921, 14711, 14507, 14308, 14115, 13926, 13743, 13565, 13391, 13221, 13056, 12895, 12738, 12584, 12434, 12288, 12145, 12006, 11869, 11736, 11605, 11478, 11353, 11231, 11111, 10995, 10880, 10768, 10658, 10550, 10445, 10341, 10240, 10141, 10043, 9947, 9854, 9761, 9671, 9582, 9495, 9410, 9326, 9243, 9162, 9082, 9004, 8927, 8852, 8777, 8704, 8632, 8561, 8492, 8423, 8356, 8290, 8224, 8160, 8097, 8034, 7973, 7913, 7853, 7795, 7737, 7680, 7624, 7569, 7514, 7461, 7408, 7355, 7304, 7253, 7203, 7154, 7105, 7057, 7010, 6963, 6917, 6872, 6827, 6782, 6739, 6695, 6653, 6611, 6569, 6528, 6487, 6447, 6408, 6369, 6330, 6292, 6254, 6217, 6180, 6144, 6108, 6073, 6037, 6003, 5968, 5935, 5901, 5868, 5835, 5803, 5771, 5739, 5708, 5677, 5646, 5615, 5585, 5556, 5526, 5497, 5468, 5440, 5412, 5384, 5356, 5329, 5302, 5275, 5249, 5222, 5196, 5171, 5145, 5120, 5095, 5070, 5046, 5022, 4998, 4974, 4950, 4927, 4904, 4881, 4858, 4836, 4813, 4791, 4769, 4748, 4726, 4705, 4684, 4663, 4642, 4622, 4601, 4581, 4561, 4541, 4522, 4502, 4483, 4464, 4445, 4426, 4407, 4389, 4370, 4352, 4334, 4316, 4298, 4281, 4263, 4246, 4229, 4212, 4195, 4178, 4161, 4145, 4128, 4112, 4096}; __constant__ int c_HsvDivTable [256] = {0, 1044480, 522240, 348160, 261120, 208896, 174080, 149211, 130560, 116053, 104448, 94953, 87040, 80345, 74606, 69632, 65280, 61440, 58027, 54973, 52224, 49737, 47476, 45412, 43520, 41779, 40172, 38684, 37303, 36017, 34816, 33693, 32640, 31651, 30720, 29842, 29013, 28229, 27486, 26782, 26112, 25475, 24869, 24290, 23738, 23211, 22706, 22223, 21760, 21316, 20890, 20480, 20086, 19707, 19342, 18991, 18651, 18324, 18008, 17703, 17408, 17123, 16846, 16579, 16320, 16069, 15825, 15589, 15360, 15137, 14921, 14711, 14507, 14308, 14115, 13926, 13743, 13565, 13391, 13221, 13056, 12895, 12738, 12584, 12434, 12288, 12145, 12006, 11869, 11736, 11605, 11478, 11353, 11231, 11111, 10995, 10880, 10768, 10658, 10550, 10445, 10341, 10240, 10141, 10043, 9947, 9854, 9761, 9671, 9582, 9495, 9410, 9326, 9243, 9162, 9082, 9004, 8927, 8852, 8777, 8704, 8632, 8561, 8492, 8423, 8356, 8290, 8224, 8160, 8097, 8034, 7973, 7913, 7853, 7795, 7737, 7680, 7624, 7569, 7514, 7461, 7408, 7355, 7304, 7253, 7203, 7154, 7105, 7057, 7010, 6963, 6917, 6872, 6827, 6782, 6739, 6695, 6653, 6611, 6569, 6528, 6487, 6447, 6408, 6369, 6330, 6292, 6254, 6217, 6180, 6144, 6108, 6073, 6037, 6003, 5968, 5935, 5901, 5868, 5835, 5803, 5771, 5739, 5708, 5677, 5646, 5615, 5585, 5556, 5526, 5497, 5468, 5440, 5412, 5384, 5356, 5329, 5302, 5275, 5249, 5222, 5196, 5171, 5145, 5120, 5095, 5070, 5046, 5022, 4998, 4974, 4950, 4927, 4904, 4881, 4858, 4836, 4813, 4791, 4769, 4748, 4726, 4705, 4684, 4663, 4642, 4622, 4601, 4581, 4561, 4541, 4522, 4502, 4483, 4464, 4445, 4426, 4407, 4389, 4370, 4352, 4334, 4316, 4298, 4281, 4263, 4246, 4229, 4212, 4195, 4178, 4161, 4145, 4128, 4112, 4096};
__constant__ int c_HsvDivTable180[256] = {0, 122880, 61440, 40960, 30720, 24576, 20480, 17554, 15360, 13653, 12288, 11171, 10240, 9452, 8777, 8192, 7680, 7228, 6827, 6467, 6144, 5851, 5585, 5343, 5120, 4915, 4726, 4551, 4389, 4237, 4096, 3964, 3840, 3724, 3614, 3511, 3413, 3321, 3234, 3151, 3072, 2997, 2926, 2858, 2793, 2731, 2671, 2614, 2560, 2508, 2458, 2409, 2363, 2318, 2276, 2234, 2194, 2156, 2119, 2083, 2048, 2014, 1982, 1950, 1920, 1890, 1862, 1834, 1807, 1781, 1755, 1731, 1707, 1683, 1661, 1638, 1617, 1596, 1575, 1555, 1536, 1517, 1499, 1480, 1463, 1446, 1429, 1412, 1396, 1381, 1365, 1350, 1336, 1321, 1307, 1293, 1280, 1267, 1254, 1241, 1229, 1217, 1205, 1193, 1182, 1170, 1159, 1148, 1138, 1127, 1117, 1107, 1097, 1087, 1078, 1069, 1059, 1050, 1041, 1033, 1024, 1016, 1007, 999, 991, 983, 975, 968, 960, 953, 945, 938, 931, 924, 917, 910, 904, 897, 890, 884, 878, 871, 865, 859, 853, 847, 842, 836, 830, 825, 819, 814, 808, 803, 798, 793, 788, 783, 778, 773, 768, 763, 759, 754, 749, 745, 740, 736, 731, 727, 723, 719, 714, 710, 706, 702, 698, 694, 690, 686, 683, 679, 675, 671, 668, 664, 661, 657, 654, 650, 647, 643, 640, 637, 633, 630, 627, 624, 621, 617, 614, 611, 608, 605, 602, 599, 597, 594, 591, 588, 585, 582, 580, 577, 574, 572, 569, 566, 564, 561, 559, 556, 554, 551, 549, 546, 544, 541, 539, 537, 534, 532, 530, 527, 525, 523, 521, 518, 516, 514, 512, 510, 508, 506, 504, 502, 500, 497, 495, 493, 492, 490, 488, 486, 484, 482}; __constant__ int c_HsvDivTable180[256] = {0, 122880, 61440, 40960, 30720, 24576, 20480, 17554, 15360, 13653, 12288, 11171, 10240, 9452, 8777, 8192, 7680, 7228, 6827, 6467, 6144, 5851, 5585, 5343, 5120, 4915, 4726, 4551, 4389, 4237, 4096, 3964, 3840, 3724, 3614, 3511, 3413, 3321, 3234, 3151, 3072, 2997, 2926, 2858, 2793, 2731, 2671, 2614, 2560, 2508, 2458, 2409, 2363, 2318, 2276, 2234, 2194, 2156, 2119, 2083, 2048, 2014, 1982, 1950, 1920, 1890, 1862, 1834, 1807, 1781, 1755, 1731, 1707, 1683, 1661, 1638, 1617, 1596, 1575, 1555, 1536, 1517, 1499, 1480, 1463, 1446, 1429, 1412, 1396, 1381, 1365, 1350, 1336, 1321, 1307, 1293, 1280, 1267, 1254, 1241, 1229, 1217, 1205, 1193, 1182, 1170, 1159, 1148, 1138, 1127, 1117, 1107, 1097, 1087, 1078, 1069, 1059, 1050, 1041, 1033, 1024, 1016, 1007, 999, 991, 983, 975, 968, 960, 953, 945, 938, 931, 924, 917, 910, 904, 897, 890, 884, 878, 871, 865, 859, 853, 847, 842, 836, 830, 825, 819, 814, 808, 803, 798, 793, 788, 783, 778, 773, 768, 763, 759, 754, 749, 745, 740, 736, 731, 727, 723, 719, 714, 710, 706, 702, 698, 694, 690, 686, 683, 679, 675, 671, 668, 664, 661, 657, 654, 650, 647, 643, 640, 637, 633, 630, 627, 624, 621, 617, 614, 611, 608, 605, 602, 599, 597, 594, 591, 588, 585, 582, 580, 577, 574, 572, 569, 566, 564, 561, 559, 556, 554, 551, 549, 546, 544, 541, 539, 537, 534, 532, 530, 527, 525, 523, 521, 518, 516, 514, 512, 510, 508, 506, 504, 502, 500, 497, 495, 493, 492, 490, 488, 486, 484, 482};
__constant__ int c_HsvDivTable256[256] = {0, 174763, 87381, 58254, 43691, 34953, 29127, 24966, 21845, 19418, 17476, 15888, 14564, 13443, 12483, 11651, 10923, 10280, 9709, 9198, 8738, 8322, 7944, 7598, 7282, 6991, 6722, 6473, 6242, 6026, 5825, 5638, 5461, 5296, 5140, 4993, 4855, 4723, 4599, 4481, 4369, 4263, 4161, 4064, 3972, 3884, 3799, 3718, 3641, 3567, 3495, 3427, 3361, 3297, 3236, 3178, 3121, 3066, 3013, 2962, 2913, 2865, 2819, 2774, 2731, 2689, 2648, 2608, 2570, 2533, 2497, 2461, 2427, 2394, 2362, 2330, 2300, 2270, 2241, 2212, 2185, 2158, 2131, 2106, 2081, 2056, 2032, 2009, 1986, 1964, 1942, 1920, 1900, 1879, 1859, 1840, 1820, 1802, 1783, 1765, 1748, 1730, 1713, 1697, 1680, 1664, 1649, 1633, 1618, 1603, 1589, 1574, 1560, 1547, 1533, 1520, 1507, 1494, 1481, 1469, 1456, 1444, 1432, 1421, 1409, 1398, 1387, 1376, 1365, 1355, 1344, 1334, 1324, 1314, 1304, 1295, 1285, 1276, 1266, 1257, 1248, 1239, 1231, 1222, 1214, 1205, 1197, 1189, 1181, 1173, 1165, 1157, 1150, 1142, 1135, 1128, 1120, 1113, 1106, 1099, 1092, 1085, 1079, 1072, 1066, 1059, 1053, 1046, 1040, 1034, 1028, 1022, 1016, 1010, 1004, 999, 993, 987, 982, 976, 971, 966, 960, 955, 950, 945, 940, 935, 930, 925, 920, 915, 910, 906, 901, 896, 892, 887, 883, 878, 874, 869, 865, 861, 857, 853, 848, 844, 840, 836, 832, 828, 824, 820, 817, 813, 809, 805, 802, 798, 794, 791, 787, 784, 780, 777, 773, 770, 767, 763, 760, 757, 753, 750, 747, 744, 741, 737, 734, 731, 728, 725, 722, 719, 716, 713, 710, 708, 705, 702, 699, 696, 694, 691, 688, 685}; __constant__ int c_HsvDivTable256[256] = {0, 174763, 87381, 58254, 43691, 34953, 29127, 24966, 21845, 19418, 17476, 15888, 14564, 13443, 12483, 11651, 10923, 10280, 9709, 9198, 8738, 8322, 7944, 7598, 7282, 6991, 6722, 6473, 6242, 6026, 5825, 5638, 5461, 5296, 5140, 4993, 4855, 4723, 4599, 4481, 4369, 4263, 4161, 4064, 3972, 3884, 3799, 3718, 3641, 3567, 3495, 3427, 3361, 3297, 3236, 3178, 3121, 3066, 3013, 2962, 2913, 2865, 2819, 2774, 2731, 2689, 2648, 2608, 2570, 2533, 2497, 2461, 2427, 2394, 2362, 2330, 2300, 2270, 2241, 2212, 2185, 2158, 2131, 2106, 2081, 2056, 2032, 2009, 1986, 1964, 1942, 1920, 1900, 1879, 1859, 1840, 1820, 1802, 1783, 1765, 1748, 1730, 1713, 1697, 1680, 1664, 1649, 1633, 1618, 1603, 1589, 1574, 1560, 1547, 1533, 1520, 1507, 1494, 1481, 1469, 1456, 1444, 1432, 1421, 1409, 1398, 1387, 1376, 1365, 1355, 1344, 1334, 1324, 1314, 1304, 1295, 1285, 1276, 1266, 1257, 1248, 1239, 1231, 1222, 1214, 1205, 1197, 1189, 1181, 1173, 1165, 1157, 1150, 1142, 1135, 1128, 1120, 1113, 1106, 1099, 1092, 1085, 1079, 1072, 1066, 1059, 1053, 1046, 1040, 1034, 1028, 1022, 1016, 1010, 1004, 999, 993, 987, 982, 976, 971, 966, 960, 955, 950, 945, 940, 935, 930, 925, 920, 915, 910, 906, 901, 896, 892, 887, 883, 878, 874, 869, 865, 861, 857, 853, 848, 844, 840, 836, 832, 828, 824, 820, 817, 813, 809, 805, 802, 798, 794, 791, 787, 784, 780, 777, 773, 770, 767, 763, 760, 757, 753, 750, 747, 744, 741, 737, 734, 731, 728, 725, 722, 719, 716, 713, 710, 708, 705, 702, 699, 696, 694, 691, 688, 685};
@ -970,7 +971,7 @@ namespace cv { namespace gpu { namespace device
return RGB2HSVConvert<bidx, hr>(src); return RGB2HSVConvert<bidx, hr>(src);
} }
}; };
} }
#define OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(name, scn, dcn, bidx) \ #define OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(name, scn, dcn, bidx) \
template <typename T> struct name ## _traits \ template <typename T> struct name ## _traits \
@ -1006,8 +1007,8 @@ namespace cv { namespace gpu { namespace device
} \ } \
}; };
namespace detail namespace detail
{ {
__constant__ int c_HsvSectorData[6][3] = { {1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0} }; __constant__ int c_HsvSectorData[6][3] = { {1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0} };
template <int bidx, int hr, typename T> static __device__ void HSV2RGBConvert(const T& src, float* dst) template <int bidx, int hr, typename T> static __device__ void HSV2RGBConvert(const T& src, float* dst)
@ -1096,7 +1097,7 @@ namespace cv { namespace gpu { namespace device
return HSV2RGBConvert<bidx, hr>(src); return HSV2RGBConvert<bidx, hr>(src);
} }
}; };
} }
#define OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(name, scn, dcn, bidx) \ #define OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(name, scn, dcn, bidx) \
template <typename T> struct name ## _traits \ template <typename T> struct name ## _traits \
@ -1134,8 +1135,8 @@ namespace cv { namespace gpu { namespace device
/////////////////////////////////////// RGB <-> HLS //////////////////////////////////////// /////////////////////////////////////// RGB <-> HLS ////////////////////////////////////////
namespace detail namespace detail
{ {
template <int bidx, int hr, typename D> static __device__ void RGB2HLSConvert(const float* src, D& dst) template <int bidx, int hr, typename D> static __device__ void RGB2HLSConvert(const float* src, D& dst)
{ {
const float hscale = hr * (1.f / 360.f); const float hscale = hr * (1.f / 360.f);
@ -1221,7 +1222,7 @@ namespace cv { namespace gpu { namespace device
return RGB2HLSConvert<bidx, hr>(src); return RGB2HLSConvert<bidx, hr>(src);
} }
}; };
} }
#define OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(name, scn, dcn, bidx) \ #define OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(name, scn, dcn, bidx) \
template <typename T> struct name ## _traits \ template <typename T> struct name ## _traits \
@ -1257,8 +1258,8 @@ namespace cv { namespace gpu { namespace device
} \ } \
}; };
namespace detail namespace detail
{ {
__constant__ int c_HlsSectorData[6][3] = { {1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0} }; __constant__ int c_HlsSectorData[6][3] = { {1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0} };
template <int bidx, int hr, typename T> static __device__ void HLS2RGBConvert(const T& src, float* dst) template <int bidx, int hr, typename T> static __device__ void HLS2RGBConvert(const T& src, float* dst)
@ -1353,7 +1354,7 @@ namespace cv { namespace gpu { namespace device
return HLS2RGBConvert<bidx, hr>(src); return HLS2RGBConvert<bidx, hr>(src);
} }
}; };
} }
#define OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(name, scn, dcn, bidx) \ #define OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(name, scn, dcn, bidx) \
template <typename T> struct name ## _traits \ template <typename T> struct name ## _traits \
@ -1388,6 +1389,7 @@ namespace cv { namespace gpu { namespace device
return functor_type(); \ return functor_type(); \
} \ } \
}; };
}}}
END_OPENCV_DEVICE_NAMESPACE
#endif // __OPENCV_GPU_COLOR_DETAIL_HPP__ #endif // __OPENCV_GPU_COLOR_DETAIL_HPP__
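Note: the 256-entry c_HsvDivTable256 constant above lets the RGB→HSV kernel replace a per-pixel integer division by a table lookup; the listed values appear to match the fixed-point reciprocal round((256 << 12) / (6*i)) with entry 0 forced to 0 (174763, 87381, 58254 for i = 1, 2, 3 agree with that formula). The host-side sketch below only illustrates how such a table could be regenerated; the names and the shift value are inferred, not part of the commit.

// Illustrative sketch: rebuild a fixed-point hue-division table like c_HsvDivTable256.
// Assumption: entry i ~= round((256 << 12) / (6.0 * i)), entry 0 stays 0.
#include <cmath>
#include <cstdio>

int main()
{
    const int hsv_shift = 12;      // assumed fixed-point shift
    int table[256] = {0};
    for (int i = 1; i < 256; ++i)
        table[i] = (int)std::floor((256 << hsv_shift) / (6.0 * i) + 0.5);
    std::printf("%d %d %d\n", table[1], table[2], table[3]); // expect 174763 87381 58254
    return 0;
}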

@ -47,10 +47,10 @@
#include "../vec_traits.hpp" #include "../vec_traits.hpp"
#include "../functional.hpp" #include "../functional.hpp"
namespace cv { namespace gpu { namespace device BEGIN_OPENCV_DEVICE_NAMESPACE
namespace detail
{ {
namespace detail
{
//! Mask accessor //! Mask accessor
struct MaskReader struct MaskReader
@ -404,7 +404,8 @@ namespace cv { namespace gpu { namespace device
typedef TransformFunctorTraits<BinOp> ft; typedef TransformFunctorTraits<BinOp> ft;
TransformDispatcher<VecTraits<T1>::cn == 1 && VecTraits<T2>::cn == 1 && VecTraits<D>::cn == 1 && ft::smart_shift != 1>::call(src1, src2, dst, op, mask, stream); TransformDispatcher<VecTraits<T1>::cn == 1 && VecTraits<T2>::cn == 1 && VecTraits<D>::cn == 1 && ft::smart_shift != 1>::call(src1, src2, dst, op, mask, stream);
} }
} }
}}}
END_OPENCV_DEVICE_NAMESPACE
#endif // __OPENCV_GPU_TRANSFORM_DETAIL_HPP__ #endif // __OPENCV_GPU_TRANSFORM_DETAIL_HPP__

@ -43,12 +43,13 @@
#ifndef __OPENCV_GPU_TYPE_TRAITS_DETAIL_HPP__ #ifndef __OPENCV_GPU_TYPE_TRAITS_DETAIL_HPP__
#define __OPENCV_GPU_TYPE_TRAITS_DETAIL_HPP__ #define __OPENCV_GPU_TYPE_TRAITS_DETAIL_HPP__
#include "internal_shared.hpp"
#include "../vec_traits.hpp" #include "../vec_traits.hpp"
namespace cv { namespace gpu { namespace device BEGIN_OPENCV_DEVICE_NAMESPACE
namespace detail
{ {
namespace detail
{
template <bool, typename T1, typename T2> struct Select { typedef T1 type; }; template <bool, typename T1, typename T2> struct Select { typedef T1 type; };
template <typename T1, typename T2> struct Select<false, T1, T2> { typedef T2 type; }; template <typename T1, typename T2> struct Select<false, T1, T2> { typedef T2 type; };
@ -180,7 +181,8 @@ namespace cv { namespace gpu { namespace device
typedef U& type; typedef U& type;
enum { value = 1 }; enum { value = 1 };
}; };
} }
}}}
END_OPENCV_DEVICE_NAMESPACE
#endif // __OPENCV_GPU_TYPE_TRAITS_DETAIL_HPP__ #endif // __OPENCV_GPU_TYPE_TRAITS_DETAIL_HPP__
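Note: detail::Select shown above is a compile-time type selector: Select<true, T1, T2>::type is T1 and Select<false, T1, T2>::type is T2. A minimal usage sketch follows; the accumulator policy is purely illustrative, and the cv::gpu::device qualification assumes the device namespace macro still expands to those namespaces.

// Hedged sketch: pick an accumulator type at compile time with detail::Select.
template <typename T> struct Accumulator
{
    // use int for small integer types, float otherwise (illustrative policy only)
    typedef typename cv::gpu::device::detail::Select<(sizeof(T) <= 2), int, float>::type type;
};
// Accumulator<unsigned char>::type is int, Accumulator<double>::type is float (illustration only)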

@ -43,10 +43,12 @@
#ifndef __OPENCV_GPU_UTILITY_DETAIL_HPP__ #ifndef __OPENCV_GPU_UTILITY_DETAIL_HPP__
#define __OPENCV_GPU_UTILITY_DETAIL_HPP__ #define __OPENCV_GPU_UTILITY_DETAIL_HPP__
namespace cv { namespace gpu { namespace device #include "internal_shared.hpp"
BEGIN_OPENCV_DEVICE_NAMESPACE
namespace detail
{ {
namespace detail
{
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
// Reduction // Reduction
@ -835,7 +837,8 @@ namespace cv { namespace gpu { namespace device
} }
} }
}; };
} }
}}}
END_OPENCV_DEVICE_NAMESPACE
#endif // __OPENCV_GPU_UTILITY_DETAIL_HPP__ #endif // __OPENCV_GPU_UTILITY_DETAIL_HPP__

@ -43,12 +43,13 @@
#ifndef __OPENCV_GPU_VEC_DISTANCE_DETAIL_HPP__ #ifndef __OPENCV_GPU_VEC_DISTANCE_DETAIL_HPP__
#define __OPENCV_GPU_VEC_DISTANCE_DETAIL_HPP__ #define __OPENCV_GPU_VEC_DISTANCE_DETAIL_HPP__
#include "internal_shared.hpp"
#include "../datamov_utils.hpp" #include "../datamov_utils.hpp"
namespace cv { namespace gpu { namespace device BEGIN_OPENCV_DEVICE_NAMESPACE
namespace detail
{ {
namespace detail
{
template <int THREAD_DIM, int N> struct UnrollVecDiffCached template <int THREAD_DIM, int N> struct UnrollVecDiffCached
{ {
template <typename Dist, typename T1, typename T2> template <typename Dist, typename T1, typename T2>
@ -111,7 +112,8 @@ namespace cv { namespace gpu { namespace device
UnrollVecDiffCached<THREAD_DIM, MAX_LEN / THREAD_DIM>::calcWithoutCheck(vecCached, vecGlob + tid, dist); UnrollVecDiffCached<THREAD_DIM, MAX_LEN / THREAD_DIM>::calcWithoutCheck(vecCached, vecGlob + tid, dist);
} }
}; };
} }
}}}
END_OPENCV_DEVICE_NAMESPACE
#endif // __OPENCV_GPU_VEC_DISTANCE_DETAIL_HPP__ #endif // __OPENCV_GPU_VEC_DISTANCE_DETAIL_HPP__

@ -43,10 +43,12 @@
#ifndef __OPENCV_GPU_DYNAMIC_SMEM_HPP__ #ifndef __OPENCV_GPU_DYNAMIC_SMEM_HPP__
#define __OPENCV_GPU_DYNAMIC_SMEM_HPP__ #define __OPENCV_GPU_DYNAMIC_SMEM_HPP__
namespace cv { namespace gpu { namespace device #include "internal_shared.hpp"
BEGIN_OPENCV_DEVICE_NAMESPACE
template<class T> struct DynamicSharedMem
{ {
template<class T> struct DynamicSharedMem
{
__device__ __forceinline__ operator T*() __device__ __forceinline__ operator T*()
{ {
extern __shared__ int __smem[]; extern __shared__ int __smem[];
@ -58,11 +60,11 @@ namespace cv { namespace gpu { namespace device
extern __shared__ int __smem[]; extern __shared__ int __smem[];
return (T*)__smem; return (T*)__smem;
} }
}; };
// specialize for double to avoid unaligned memory access compile errors // specialize for double to avoid unaligned memory access compile errors
template<> struct DynamicSharedMem<double> template<> struct DynamicSharedMem<double>
{ {
__device__ __forceinline__ operator double*() __device__ __forceinline__ operator double*()
{ {
extern __shared__ double __smem_d[]; extern __shared__ double __smem_d[];
@ -74,7 +76,8 @@ namespace cv { namespace gpu { namespace device
extern __shared__ double __smem_d[]; extern __shared__ double __smem_d[];
return (double*)__smem_d; return (double*)__smem_d;
} }
}; };
}}}
END_OPENCV_DEVICE_NAMESPACE
#endif // __OPENCV_GPU_DYNAMIC_SMEM_HPP__ #endif // __OPENCV_GPU_DYNAMIC_SMEM_HPP__
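Note: DynamicSharedMem<T> above is a thin wrapper that exposes the kernel's dynamically sized extern __shared__ buffer as a typed pointer, with a double specialization to avoid the unaligned-access compile error. A minimal usage sketch (kernel name and launch parameters are hypothetical; the namespace qualification assumes the usual macro expansion):

// Hedged usage sketch: view dynamic shared memory as float* inside a kernel.
using namespace cv::gpu::device;

__global__ void copy_via_smem(const float* src, float* dst, int n)
{
    DynamicSharedMem<float> shared;   // wraps the extern __shared__ buffer
    float* buf = shared;              // operator T*() returns a typed pointer
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
    {
        buf[threadIdx.x] = src[i];    // stage through shared memory (illustration only)
        dst[i] = buf[threadIdx.x];
    }
}
// launched with the dynamic size as the third launch parameter, e.g.:
// copy_via_smem<<<grid, block, block.x * sizeof(float)>>>(src, dst, n);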

View File

@ -43,14 +43,13 @@
#ifndef OPENCV_GPU_EMULATION_HPP_ #ifndef OPENCV_GPU_EMULATION_HPP_
#define OPENCV_GPU_EMULATION_HPP_ #define OPENCV_GPU_EMULATION_HPP_
#include "opencv2/gpu/device/warp_reduce.hpp" #include "internal_shared.hpp"
#include "warp_reduce.hpp"
namespace cv BEGIN_OPENCV_DEVICE_NAMESPACE
struct Emulation
{ {
namespace device
{
struct Emulation
{
static __forceinline__ __device__ int Ballot(int predicate, volatile int* cta_buffer) static __forceinline__ __device__ int Ballot(int predicate, volatile int* cta_buffer)
{ {
#if __CUDA_ARCH__ >= 200 #if __CUDA_ARCH__ >= 200
@ -62,8 +61,8 @@ namespace cv
return warp_reduce(cta_buffer); return warp_reduce(cta_buffer);
#endif #endif
} }
}; };
}
} END_OPENCV_DEVICE_NAMESPACE
#endif /* OPENCV_GPU_EMULATION_HPP_ */ #endif /* OPENCV_GPU_EMULATION_HPP_ */

@ -43,14 +43,15 @@
#ifndef __OPENCV_GPU_FILTERS_HPP__ #ifndef __OPENCV_GPU_FILTERS_HPP__
#define __OPENCV_GPU_FILTERS_HPP__ #define __OPENCV_GPU_FILTERS_HPP__
#include "internal_shared.hpp"
#include "saturate_cast.hpp" #include "saturate_cast.hpp"
#include "vec_traits.hpp" #include "vec_traits.hpp"
#include "vec_math.hpp" #include "vec_math.hpp"
namespace cv { namespace gpu { namespace device BEGIN_OPENCV_DEVICE_NAMESPACE
template <typename Ptr2D> struct PointFilter
{ {
template <typename Ptr2D> struct PointFilter
{
typedef typename Ptr2D::elem_type elem_type; typedef typename Ptr2D::elem_type elem_type;
typedef float index_type; typedef float index_type;
@ -62,10 +63,10 @@ namespace cv { namespace gpu { namespace device
} }
const Ptr2D src; const Ptr2D src;
}; };
template <typename Ptr2D> struct LinearFilter template <typename Ptr2D> struct LinearFilter
{ {
typedef typename Ptr2D::elem_type elem_type; typedef typename Ptr2D::elem_type elem_type;
typedef float index_type; typedef float index_type;
@ -98,10 +99,10 @@ namespace cv { namespace gpu { namespace device
} }
const Ptr2D src; const Ptr2D src;
}; };
template <typename Ptr2D> struct CubicFilter template <typename Ptr2D> struct CubicFilter
{ {
typedef typename Ptr2D::elem_type elem_type; typedef typename Ptr2D::elem_type elem_type;
typedef float index_type; typedef float index_type;
typedef typename TypeVec<float, VecTraits<elem_type>::cn>::vec_type work_type; typedef typename TypeVec<float, VecTraits<elem_type>::cn>::vec_type work_type;
@ -129,7 +130,8 @@ namespace cv { namespace gpu { namespace device
} }
const Ptr2D src; const Ptr2D src;
}; };
}}}
END_OPENCV_DEVICE_NAMESPACE
#endif // __OPENCV_GPU_FILTERS_HPP__ #endif // __OPENCV_GPU_FILTERS_HPP__
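Note: PointFilter, LinearFilter and CubicFilter wrap a Ptr2D source and implement nearest, bilinear and bicubic sampling. The sketch below shows how such a functor is typically consumed; the assumptions (each filter is constructed from a Ptr2D source and exposes operator()(float y, float x) returning elem_type) are not visible in this hunk and are stated here explicitly.

// Hedged sketch of using an interpolation functor in a resize-style kernel.
template <class Filter>
__global__ void resize_kernel(const Filter filter, cv::gpu::DevMem2D_<typename Filter::elem_type> dst, float fy, float fx)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x < dst.cols && y < dst.rows)
        dst.ptr(y)[x] = filter(y * fy, x * fx);   // sample the source at a fractional coordinate
}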

@ -44,17 +44,14 @@
#ifndef __OPENCV_GPU_DEVICE_FUNCATTRIB_HPP_ #ifndef __OPENCV_GPU_DEVICE_FUNCATTRIB_HPP_
#define __OPENCV_GPU_DEVICE_FUNCATTRIB_HPP_ #define __OPENCV_GPU_DEVICE_FUNCATTRIB_HPP_
#include<cstdio> #include <cstdio>
#include "internal_shared.hpp"
namespace cv BEGIN_OPENCV_DEVICE_NAMESPACE
template<class Func>
void printFuncAttrib(Func& func)
{ {
namespace gpu
{
namespace device
{
template<class Func>
void printFuncAttrib(Func& func)
{
cudaFuncAttributes attrs; cudaFuncAttributes attrs;
cudaFuncGetAttributes(&attrs, func); cudaFuncGetAttributes(&attrs, func);
@ -70,9 +67,8 @@ namespace cv
printf("binaryVersion = %d\n", attrs.binaryVersion); printf("binaryVersion = %d\n", attrs.binaryVersion);
printf("\n"); printf("\n");
fflush(stdout); fflush(stdout);
}
}
}
} }
END_OPENCV_DEVICE_NAMESPACE
#endif /* __OPENCV_GPU_DEVICE_FUNCATTRIB_HPP_ */ #endif /* __OPENCV_GPU_DEVICE_FUNCATTRIB_HPP_ */
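Note: printFuncAttrib is a host-side diagnostic that calls cudaFuncGetAttributes for a kernel and prints its shared/constant/local memory sizes, register count and PTX/binary versions. Usage is just passing the kernel symbol; the kernel below is hypothetical and the cv::gpu::device qualification assumes the usual namespace macro expansion.

// Hedged usage sketch for the diagnostic helper above.
__global__ void fill_kernel(int* data) { data[threadIdx.x] = threadIdx.x; }

void dump_fill_kernel_info()
{
    // prints sharedSizeBytes, numRegs, ptxVersion, binaryVersion, etc.
    cv::gpu::device::printFuncAttrib(fill_kernel);
}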

@ -49,182 +49,182 @@
#include "vec_traits.hpp" #include "vec_traits.hpp"
#include "type_traits.hpp" #include "type_traits.hpp"
namespace cv { namespace gpu { namespace device BEGIN_OPENCV_DEVICE_NAMESPACE
// Function Objects
using thrust::unary_function;
using thrust::binary_function;
// Arithmetic Operations
template <typename T> struct plus : binary_function<T, T, T>
{ {
// Function Objects
using thrust::unary_function;
using thrust::binary_function;
// Arithmetic Operations
template <typename T> struct plus : binary_function<T, T, T>
{
__device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
{ {
return a + b; return a + b;
} }
}; };
template <typename T> struct minus : binary_function<T, T, T> template <typename T> struct minus : binary_function<T, T, T>
{ {
__device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
{ {
return a - b; return a - b;
} }
}; };
template <typename T> struct multiplies : binary_function<T, T, T> template <typename T> struct multiplies : binary_function<T, T, T>
{ {
__device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
{ {
return a * b; return a * b;
} }
}; };
template <typename T> struct divides : binary_function<T, T, T> template <typename T> struct divides : binary_function<T, T, T>
{ {
__device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
{ {
return a / b; return a / b;
} }
}; };
template <typename T> struct modulus : binary_function<T, T, T> template <typename T> struct modulus : binary_function<T, T, T>
{ {
__device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
{ {
return a % b; return a % b;
} }
}; };
template <typename T> struct negate : unary_function<T, T> template <typename T> struct negate : unary_function<T, T>
{ {
__device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a) const __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a) const
{ {
return -a; return -a;
} }
}; };
// Comparison Operations // Comparison Operations
template <typename T> struct equal_to : binary_function<T, T, bool> template <typename T> struct equal_to : binary_function<T, T, bool>
{ {
__device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const __device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
{ {
return a == b; return a == b;
} }
}; };
template <typename T> struct not_equal_to : binary_function<T, T, bool> template <typename T> struct not_equal_to : binary_function<T, T, bool>
{ {
__device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const __device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
{ {
return a != b; return a != b;
} }
}; };
template <typename T> struct greater : binary_function<T, T, bool> template <typename T> struct greater : binary_function<T, T, bool>
{ {
__device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const __device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
{ {
return a > b; return a > b;
} }
}; };
template <typename T> struct less : binary_function<T, T, bool> template <typename T> struct less : binary_function<T, T, bool>
{ {
__device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const __device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
{ {
return a < b; return a < b;
} }
}; };
template <typename T> struct greater_equal : binary_function<T, T, bool> template <typename T> struct greater_equal : binary_function<T, T, bool>
{ {
__device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const __device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
{ {
return a >= b; return a >= b;
} }
}; };
template <typename T> struct less_equal : binary_function<T, T, bool> template <typename T> struct less_equal : binary_function<T, T, bool>
{ {
__device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const __device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
{ {
return a <= b; return a <= b;
} }
}; };
// Logical Operations // Logical Operations
template <typename T> struct logical_and : binary_function<T, T, bool> template <typename T> struct logical_and : binary_function<T, T, bool>
{ {
__device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const __device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
{ {
return a && b; return a && b;
} }
}; };
template <typename T> struct logical_or : binary_function<T, T, bool> template <typename T> struct logical_or : binary_function<T, T, bool>
{ {
__device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const __device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
{ {
return a || b; return a || b;
} }
}; };
template <typename T> struct logical_not : unary_function<T, bool> template <typename T> struct logical_not : unary_function<T, bool>
{ {
__device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a) const __device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a) const
{ {
return !a; return !a;
} }
}; };
// Bitwise Operations // Bitwise Operations
template <typename T> struct bit_and : binary_function<T, T, T> template <typename T> struct bit_and : binary_function<T, T, T>
{ {
__device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
{ {
return a & b; return a & b;
} }
}; };
template <typename T> struct bit_or : binary_function<T, T, T> template <typename T> struct bit_or : binary_function<T, T, T>
{ {
__device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
{ {
return a | b; return a | b;
} }
}; };
template <typename T> struct bit_xor : binary_function<T, T, T> template <typename T> struct bit_xor : binary_function<T, T, T>
{ {
__device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
{ {
return a ^ b; return a ^ b;
} }
}; };
template <typename T> struct bit_not : unary_function<T, T> template <typename T> struct bit_not : unary_function<T, T>
{ {
__device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType v) const __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType v) const
{ {
return ~v; return ~v;
} }
}; };
// Generalized Identity Operations // Generalized Identity Operations
template <typename T> struct identity : unary_function<T, T> template <typename T> struct identity : unary_function<T, T>
{ {
__device__ __forceinline__ typename TypeTraits<T>::ParameterType operator()(typename TypeTraits<T>::ParameterType x) const __device__ __forceinline__ typename TypeTraits<T>::ParameterType operator()(typename TypeTraits<T>::ParameterType x) const
{ {
return x; return x;
} }
}; };
template <typename T1, typename T2> struct project1st : binary_function<T1, T2, T1> template <typename T1, typename T2> struct project1st : binary_function<T1, T2, T1>
{ {
__device__ __forceinline__ typename TypeTraits<T1>::ParameterType operator()(typename TypeTraits<T1>::ParameterType lhs, typename TypeTraits<T2>::ParameterType rhs) const __device__ __forceinline__ typename TypeTraits<T1>::ParameterType operator()(typename TypeTraits<T1>::ParameterType lhs, typename TypeTraits<T2>::ParameterType rhs) const
{ {
return lhs; return lhs;
} }
}; };
template <typename T1, typename T2> struct project2nd : binary_function<T1, T2, T2> template <typename T1, typename T2> struct project2nd : binary_function<T1, T2, T2>
{ {
__device__ __forceinline__ typename TypeTraits<T2>::ParameterType operator()(typename TypeTraits<T1>::ParameterType lhs, typename TypeTraits<T2>::ParameterType rhs) const __device__ __forceinline__ typename TypeTraits<T2>::ParameterType operator()(typename TypeTraits<T1>::ParameterType lhs, typename TypeTraits<T2>::ParameterType rhs) const
{ {
return rhs; return rhs;
} }
}; };
// Min/Max Operations // Min/Max Operations
@ -234,39 +234,39 @@ namespace cv { namespace gpu { namespace device
__device__ __forceinline__ type operator()(type lhs, type rhs) const {return op(lhs, rhs);} \ __device__ __forceinline__ type operator()(type lhs, type rhs) const {return op(lhs, rhs);} \
}; };
template <typename T> struct maximum : binary_function<T, T, T> template <typename T> struct maximum : binary_function<T, T, T>
{ {
__device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType lhs, typename TypeTraits<T>::ParameterType rhs) const __device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType lhs, typename TypeTraits<T>::ParameterType rhs) const
{ {
return lhs < rhs ? rhs : lhs; return lhs < rhs ? rhs : lhs;
} }
}; };
OPENCV_GPU_IMPLEMENT_MINMAX(maximum, uchar, max) OPENCV_GPU_IMPLEMENT_MINMAX(maximum, uchar, ::max)
OPENCV_GPU_IMPLEMENT_MINMAX(maximum, schar, max) OPENCV_GPU_IMPLEMENT_MINMAX(maximum, schar, ::max)
OPENCV_GPU_IMPLEMENT_MINMAX(maximum, char, max) OPENCV_GPU_IMPLEMENT_MINMAX(maximum, char, ::max)
OPENCV_GPU_IMPLEMENT_MINMAX(maximum, ushort, max) OPENCV_GPU_IMPLEMENT_MINMAX(maximum, ushort, ::max)
OPENCV_GPU_IMPLEMENT_MINMAX(maximum, short, max) OPENCV_GPU_IMPLEMENT_MINMAX(maximum, short, ::max)
OPENCV_GPU_IMPLEMENT_MINMAX(maximum, int, max) OPENCV_GPU_IMPLEMENT_MINMAX(maximum, int, ::max)
OPENCV_GPU_IMPLEMENT_MINMAX(maximum, uint, max) OPENCV_GPU_IMPLEMENT_MINMAX(maximum, uint, ::max)
OPENCV_GPU_IMPLEMENT_MINMAX(maximum, float, fmax) OPENCV_GPU_IMPLEMENT_MINMAX(maximum, float, ::fmax)
OPENCV_GPU_IMPLEMENT_MINMAX(maximum, double, fmax) OPENCV_GPU_IMPLEMENT_MINMAX(maximum, double, ::fmax)
template <typename T> struct minimum : binary_function<T, T, T> template <typename T> struct minimum : binary_function<T, T, T>
{ {
__device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType lhs, typename TypeTraits<T>::ParameterType rhs) const __device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType lhs, typename TypeTraits<T>::ParameterType rhs) const
{ {
return lhs < rhs ? lhs : rhs; return lhs < rhs ? lhs : rhs;
} }
}; };
OPENCV_GPU_IMPLEMENT_MINMAX(minimum, uchar, min) OPENCV_GPU_IMPLEMENT_MINMAX(minimum, uchar, ::min)
OPENCV_GPU_IMPLEMENT_MINMAX(minimum, schar, min) OPENCV_GPU_IMPLEMENT_MINMAX(minimum, schar, ::min)
OPENCV_GPU_IMPLEMENT_MINMAX(minimum, char, min) OPENCV_GPU_IMPLEMENT_MINMAX(minimum, char, ::min)
OPENCV_GPU_IMPLEMENT_MINMAX(minimum, ushort, min) OPENCV_GPU_IMPLEMENT_MINMAX(minimum, ushort, ::min)
OPENCV_GPU_IMPLEMENT_MINMAX(minimum, short, min) OPENCV_GPU_IMPLEMENT_MINMAX(minimum, short, ::min)
OPENCV_GPU_IMPLEMENT_MINMAX(minimum, int, min) OPENCV_GPU_IMPLEMENT_MINMAX(minimum, int, ::min)
OPENCV_GPU_IMPLEMENT_MINMAX(minimum, uint, min) OPENCV_GPU_IMPLEMENT_MINMAX(minimum, uint, ::min)
OPENCV_GPU_IMPLEMENT_MINMAX(minimum, float, fmin) OPENCV_GPU_IMPLEMENT_MINMAX(minimum, float, ::fmin)
OPENCV_GPU_IMPLEMENT_MINMAX(minimum, double, fmin) OPENCV_GPU_IMPLEMENT_MINMAX(minimum, double, ::fmin)
#undef OPENCV_GPU_IMPLEMENT_MINMAX #undef OPENCV_GPU_IMPLEMENT_MINMAX
@ -277,14 +277,14 @@ namespace cv { namespace gpu { namespace device
{ \ { \
__device__ __forceinline__ float operator ()(typename TypeTraits<T>::ParameterType v) const \ __device__ __forceinline__ float operator ()(typename TypeTraits<T>::ParameterType v) const \
{ \ { \
return func ## f(v); \ return :: ## func ## f(v); \
} \ } \
}; \ }; \
template <> struct func ## _func<double> : unary_function<double, double> \ template <> struct func ## _func<double> : unary_function<double, double> \
{ \ { \
__device__ __forceinline__ double operator ()(double v) const \ __device__ __forceinline__ double operator ()(double v) const \
{ \ { \
return func(v); \ return :: ## func(v); \
} \ } \
}; };
#define OPENCV_GPU_IMPLEMENT_BIN_FUNCTOR(func) \ #define OPENCV_GPU_IMPLEMENT_BIN_FUNCTOR(func) \
@ -292,67 +292,67 @@ namespace cv { namespace gpu { namespace device
{ \ { \
__device__ __forceinline__ float operator ()(typename TypeTraits<T>::ParameterType v1, typename TypeTraits<T>::ParameterType v2) const \ __device__ __forceinline__ float operator ()(typename TypeTraits<T>::ParameterType v1, typename TypeTraits<T>::ParameterType v2) const \
{ \ { \
return func ## f(v1, v2); \ return :: ## func ## f(v1, v2); \
} \ } \
}; \ }; \
template <> struct func ## _func<double> : binary_function<double, double, double> \ template <> struct func ## _func<double> : binary_function<double, double, double> \
{ \ { \
__device__ __forceinline__ double operator ()(double v1, double v2) const \ __device__ __forceinline__ double operator ()(double v1, double v2) const \
{ \ { \
return func(v1, v2); \ return :: ## func(v1, v2); \
} \ } \
}; };
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(fabs) OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(fabs)
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(sqrt) OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(sqrt)
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(exp) OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(exp)
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(exp2) OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(exp2)
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(exp10) OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(exp10)
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(log) OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(log)
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(log2) OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(log2)
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(log10) OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(log10)
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(sin) OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(sin)
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(cos) OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(cos)
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(tan) OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(tan)
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(asin) OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(asin)
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(acos) OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(acos)
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(atan) OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(atan)
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(sinh) OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(sinh)
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(cosh) OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(cosh)
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(tanh) OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(tanh)
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(asinh) OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(asinh)
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(acosh) OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(acosh)
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(atanh) OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(atanh)
OPENCV_GPU_IMPLEMENT_BIN_FUNCTOR(hypot) OPENCV_GPU_IMPLEMENT_BIN_FUNCTOR(hypot)
OPENCV_GPU_IMPLEMENT_BIN_FUNCTOR(atan2) OPENCV_GPU_IMPLEMENT_BIN_FUNCTOR(atan2)
OPENCV_GPU_IMPLEMENT_BIN_FUNCTOR(pow) OPENCV_GPU_IMPLEMENT_BIN_FUNCTOR(pow)
#undef OPENCV_GPU_IMPLEMENT_UN_FUNCTOR #undef OPENCV_GPU_IMPLEMENT_UN_FUNCTOR
#undef OPENCV_GPU_IMPLEMENT_BIN_FUNCTOR #undef OPENCV_GPU_IMPLEMENT_BIN_FUNCTOR
template<typename T> struct hypot_sqr_func : binary_function<T, T, float> template<typename T> struct hypot_sqr_func : binary_function<T, T, float>
{ {
__device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType src1, typename TypeTraits<T>::ParameterType src2) const __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType src1, typename TypeTraits<T>::ParameterType src2) const
{ {
return src1 * src1 + src2 * src2; return src1 * src1 + src2 * src2;
} }
}; };
// Saturate Cast Functor // Saturate Cast Functor
template <typename T, typename D> struct saturate_cast_func : unary_function<T, D> template <typename T, typename D> struct saturate_cast_func : unary_function<T, D>
{ {
__device__ __forceinline__ D operator ()(typename TypeTraits<T>::ParameterType v) const __device__ __forceinline__ D operator ()(typename TypeTraits<T>::ParameterType v) const
{ {
return saturate_cast<D>(v); return saturate_cast<D>(v);
} }
}; };
// Threshold Functors // Threshold Functors
template <typename T> struct thresh_binary_func : unary_function<T, T> template <typename T> struct thresh_binary_func : unary_function<T, T>
{ {
__host__ __device__ __forceinline__ thresh_binary_func(T thresh_, T maxVal_) : thresh(thresh_), maxVal(maxVal_) {} __host__ __device__ __forceinline__ thresh_binary_func(T thresh_, T maxVal_) : thresh(thresh_), maxVal(maxVal_) {}
__device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType src) const __device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType src) const
@ -362,9 +362,9 @@ namespace cv { namespace gpu { namespace device
const T thresh; const T thresh;
const T maxVal; const T maxVal;
}; };
template <typename T> struct thresh_binary_inv_func : unary_function<T, T> template <typename T> struct thresh_binary_inv_func : unary_function<T, T>
{ {
__host__ __device__ __forceinline__ thresh_binary_inv_func(T thresh_, T maxVal_) : thresh(thresh_), maxVal(maxVal_) {} __host__ __device__ __forceinline__ thresh_binary_inv_func(T thresh_, T maxVal_) : thresh(thresh_), maxVal(maxVal_) {}
__device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType src) const __device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType src) const
@ -374,9 +374,9 @@ namespace cv { namespace gpu { namespace device
const T thresh; const T thresh;
const T maxVal; const T maxVal;
}; };
template <typename T> struct thresh_trunc_func : unary_function<T, T> template <typename T> struct thresh_trunc_func : unary_function<T, T>
{ {
explicit __host__ __device__ __forceinline__ thresh_trunc_func(T thresh_, T maxVal_ = 0) : thresh(thresh_) {} explicit __host__ __device__ __forceinline__ thresh_trunc_func(T thresh_, T maxVal_ = 0) : thresh(thresh_) {}
__device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType src) const __device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType src) const
@ -385,9 +385,9 @@ namespace cv { namespace gpu { namespace device
} }
const T thresh; const T thresh;
}; };
template <typename T> struct thresh_to_zero_func : unary_function<T, T> template <typename T> struct thresh_to_zero_func : unary_function<T, T>
{ {
explicit __host__ __device__ __forceinline__ thresh_to_zero_func(T thresh_, T maxVal_ = 0) : thresh(thresh_) {} explicit __host__ __device__ __forceinline__ thresh_to_zero_func(T thresh_, T maxVal_ = 0) : thresh(thresh_) {}
__device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType src) const __device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType src) const
@ -396,9 +396,9 @@ namespace cv { namespace gpu { namespace device
} }
const T thresh; const T thresh;
}; };
template <typename T> struct thresh_to_zero_inv_func : unary_function<T, T> template <typename T> struct thresh_to_zero_inv_func : unary_function<T, T>
{ {
explicit __host__ __device__ __forceinline__ thresh_to_zero_inv_func(T thresh_, T maxVal_ = 0) : thresh(thresh_) {} explicit __host__ __device__ __forceinline__ thresh_to_zero_inv_func(T thresh_, T maxVal_ = 0) : thresh(thresh_) {}
__device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType src) const __device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType src) const
@ -407,12 +407,12 @@ namespace cv { namespace gpu { namespace device
} }
const T thresh; const T thresh;
}; };
// Function Object Adaptors // Function Object Adaptors
template <typename Predicate> struct unary_negate : unary_function<typename Predicate::argument_type, bool> template <typename Predicate> struct unary_negate : unary_function<typename Predicate::argument_type, bool>
{ {
explicit __host__ __device__ __forceinline__ unary_negate(const Predicate& p) : pred(p) {} explicit __host__ __device__ __forceinline__ unary_negate(const Predicate& p) : pred(p) {}
__device__ __forceinline__ bool operator()(typename TypeTraits<typename Predicate::argument_type>::ParameterType x) const __device__ __forceinline__ bool operator()(typename TypeTraits<typename Predicate::argument_type>::ParameterType x) const
@ -421,14 +421,14 @@ namespace cv { namespace gpu { namespace device
} }
const Predicate pred; const Predicate pred;
}; };
template <typename Predicate> __host__ __device__ __forceinline__ unary_negate<Predicate> not1(const Predicate& pred) template <typename Predicate> __host__ __device__ __forceinline__ unary_negate<Predicate> not1(const Predicate& pred)
{ {
return unary_negate<Predicate>(pred); return unary_negate<Predicate>(pred);
} }
template <typename Predicate> struct binary_negate : binary_function<typename Predicate::first_argument_type, typename Predicate::second_argument_type, bool> template <typename Predicate> struct binary_negate : binary_function<typename Predicate::first_argument_type, typename Predicate::second_argument_type, bool>
{ {
explicit __host__ __device__ __forceinline__ binary_negate(const Predicate& p) : pred(p) {} explicit __host__ __device__ __forceinline__ binary_negate(const Predicate& p) : pred(p) {}
__device__ __forceinline__ bool operator()(typename TypeTraits<typename Predicate::first_argument_type>::ParameterType x, typename TypeTraits<typename Predicate::second_argument_type>::ParameterType y) const __device__ __forceinline__ bool operator()(typename TypeTraits<typename Predicate::first_argument_type>::ParameterType x, typename TypeTraits<typename Predicate::second_argument_type>::ParameterType y) const
@ -437,14 +437,14 @@ namespace cv { namespace gpu { namespace device
} }
const Predicate pred; const Predicate pred;
}; };
template <typename BinaryPredicate> __host__ __device__ __forceinline__ binary_negate<BinaryPredicate> not2(const BinaryPredicate& pred) template <typename BinaryPredicate> __host__ __device__ __forceinline__ binary_negate<BinaryPredicate> not2(const BinaryPredicate& pred)
{ {
return binary_negate<BinaryPredicate>(pred); return binary_negate<BinaryPredicate>(pred);
} }
template <typename Op> struct binder1st : unary_function<typename Op::second_argument_type, typename Op::result_type> template <typename Op> struct binder1st : unary_function<typename Op::second_argument_type, typename Op::result_type>
{ {
__host__ __device__ __forceinline__ binder1st(const Op& op_, const typename Op::first_argument_type& arg1_) : op(op_), arg1(arg1_) {} __host__ __device__ __forceinline__ binder1st(const Op& op_, const typename Op::first_argument_type& arg1_) : op(op_), arg1(arg1_) {}
__device__ __forceinline__ typename Op::result_type operator ()(typename TypeTraits<typename Op::second_argument_type>::ParameterType a) const __device__ __forceinline__ typename Op::result_type operator ()(typename TypeTraits<typename Op::second_argument_type>::ParameterType a) const
@ -454,14 +454,14 @@ namespace cv { namespace gpu { namespace device
const Op op; const Op op;
const typename Op::first_argument_type arg1; const typename Op::first_argument_type arg1;
}; };
template <typename Op, typename T> __host__ __device__ __forceinline__ binder1st<Op> bind1st(const Op& op, const T& x) template <typename Op, typename T> __host__ __device__ __forceinline__ binder1st<Op> bind1st(const Op& op, const T& x)
{ {
return binder1st<Op>(op, typename Op::first_argument_type(x)); return binder1st<Op>(op, typename Op::first_argument_type(x));
} }
template <typename Op> struct binder2nd : unary_function<typename Op::first_argument_type, typename Op::result_type> template <typename Op> struct binder2nd : unary_function<typename Op::first_argument_type, typename Op::result_type>
{ {
__host__ __device__ __forceinline__ binder2nd(const Op& op_, const typename Op::second_argument_type& arg2_) : op(op_), arg2(arg2_) {} __host__ __device__ __forceinline__ binder2nd(const Op& op_, const typename Op::second_argument_type& arg2_) : op(op_), arg2(arg2_) {}
__forceinline__ __device__ typename Op::result_type operator ()(typename TypeTraits<typename Op::first_argument_type>::ParameterType a) const __forceinline__ __device__ typename Op::result_type operator ()(typename TypeTraits<typename Op::first_argument_type>::ParameterType a) const
@ -471,16 +471,16 @@ namespace cv { namespace gpu { namespace device
const Op op; const Op op;
const typename Op::second_argument_type arg2; const typename Op::second_argument_type arg2;
}; };
template <typename Op, typename T> __host__ __device__ __forceinline__ binder2nd<Op> bind2nd(const Op& op, const T& x) template <typename Op, typename T> __host__ __device__ __forceinline__ binder2nd<Op> bind2nd(const Op& op, const T& x)
{ {
return binder2nd<Op>(op, typename Op::second_argument_type(x)); return binder2nd<Op>(op, typename Op::second_argument_type(x));
} }
// Functor Traits // Functor Traits
template <typename F> struct IsUnaryFunction template <typename F> struct IsUnaryFunction
{ {
typedef char Yes; typedef char Yes;
struct No {Yes a[2];}; struct No {Yes a[2];};
@ -490,10 +490,10 @@ namespace cv { namespace gpu { namespace device
static F makeF(); static F makeF();
enum { value = (sizeof(check(makeF())) == sizeof(Yes)) }; enum { value = (sizeof(check(makeF())) == sizeof(Yes)) };
}; };
template <typename F> struct IsBinaryFunction template <typename F> struct IsBinaryFunction
{ {
typedef char Yes; typedef char Yes;
struct No {Yes a[2];}; struct No {Yes a[2];};
@ -503,10 +503,10 @@ namespace cv { namespace gpu { namespace device
static F makeF(); static F makeF();
enum { value = (sizeof(check(makeF())) == sizeof(Yes)) }; enum { value = (sizeof(check(makeF())) == sizeof(Yes)) };
}; };
namespace detail namespace detail
{ {
template <size_t src_elem_size, size_t dst_elem_size> struct UnOpShift { enum { shift = 1 }; }; template <size_t src_elem_size, size_t dst_elem_size> struct UnOpShift { enum { shift = 1 }; };
template <size_t src_elem_size> struct UnOpShift<src_elem_size, 1> { enum { shift = 4 }; }; template <size_t src_elem_size> struct UnOpShift<src_elem_size, 1> { enum { shift = 4 }; };
template <size_t src_elem_size> struct UnOpShift<src_elem_size, 2> { enum { shift = 2 }; }; template <size_t src_elem_size> struct UnOpShift<src_elem_size, 2> { enum { shift = 2 }; };
@ -534,24 +534,28 @@ namespace cv { namespace gpu { namespace device
{ {
enum { shift = DefaultBinaryShift<typename Func::first_argument_type, typename Func::second_argument_type, typename Func::result_type>::shift }; enum { shift = DefaultBinaryShift<typename Func::first_argument_type, typename Func::second_argument_type, typename Func::result_type>::shift };
}; };
} }
template <typename Func> struct DefaultTransformShift template <typename Func> struct DefaultTransformShift
{ {
enum { shift = detail::ShiftDispatcher<Func>::shift }; enum { shift = detail::ShiftDispatcher<Func>::shift };
}; };
template <typename Func> struct DefaultTransformFunctorTraits template <typename Func> struct DefaultTransformFunctorTraits
{ {
enum { simple_block_dim_x = 16 }; enum { simple_block_dim_x = 16 };
enum { simple_block_dim_y = 16 }; enum { simple_block_dim_y = 16 };
enum { smart_block_dim_x = 16 }; enum { smart_block_dim_x = 16 };
enum { smart_block_dim_y = 16 }; enum { smart_block_dim_y = 16 };
enum { smart_shift = DefaultTransformShift<Func>::shift }; enum { smart_shift = DefaultTransformShift<Func>::shift };
}; };
template <typename Func> struct TransformFunctorTraits : DefaultTransformFunctorTraits<Func> {}; template <typename Func> struct TransformFunctorTraits : DefaultTransformFunctorTraits<Func> {};
}}}
#define DEFINE_TRANSFORM_FUNCTOR_TRAITS(type) \
template <> struct TransformFunctorTraits< type > : DefaultTransformFunctorTraits< type >
END_OPENCV_DEVICE_NAMESPACE
#endif // __OPENCV_GPU_FUNCTIONAL_HPP__ #endif // __OPENCV_GPU_FUNCTIONAL_HPP__
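Note: functional.hpp provides device-side counterparts of the standard arithmetic/comparison/logical functors, saturate_cast and threshold functors, the bind1st/bind2nd adaptors, and the TransformFunctorTraits mechanism; the newly added DEFINE_TRANSFORM_FUNCTOR_TRAITS macro shortens traits specializations. A hedged sketch combining these pieces (it assumes the device namespace macro expands to cv::gpu::device, and the smart_shift value is illustrative only):

// Hedged sketch: device functors, adaptors and the new traits macro.
namespace cv { namespace gpu { namespace device
{
    // bind2nd fixes the second argument of a binary functor, so add_five(x) == x + 5:
    __device__ __forceinline__ int add_five(int x)
    {
        return bind2nd(plus<int>(), 5)(x);
    }

    // The macro expands to a TransformFunctorTraits specialization deriving from
    // DefaultTransformFunctorTraits; only the members you list are overridden:
    DEFINE_TRANSFORM_FUNCTOR_TRAITS(thresh_binary_func<float>)
    {
        enum { smart_shift = 4 };   // illustrative value
    };
}}}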

@ -43,10 +43,12 @@
#ifndef __OPENCV_GPU_LIMITS_GPU_HPP__ #ifndef __OPENCV_GPU_LIMITS_GPU_HPP__
#define __OPENCV_GPU_LIMITS_GPU_HPP__ #define __OPENCV_GPU_LIMITS_GPU_HPP__
namespace cv { namespace gpu { namespace device #include "internal_shared.hpp"
BEGIN_OPENCV_DEVICE_NAMESPACE
template<class T> struct numeric_limits
{ {
template<class T> struct numeric_limits
{
typedef T type; typedef T type;
__device__ __forceinline__ static type min() { return type(); }; __device__ __forceinline__ static type min() { return type(); };
__device__ __forceinline__ static type max() { return type(); }; __device__ __forceinline__ static type max() { return type(); };
@ -57,10 +59,10 @@ namespace cv { namespace gpu { namespace device
__device__ __forceinline__ static type quiet_NaN() { return type(); } __device__ __forceinline__ static type quiet_NaN() { return type(); }
__device__ __forceinline__ static type signaling_NaN() { return T(); } __device__ __forceinline__ static type signaling_NaN() { return T(); }
static const bool is_signed; static const bool is_signed;
}; };
template<> struct numeric_limits<bool> template<> struct numeric_limits<bool>
{ {
typedef bool type; typedef bool type;
__device__ __forceinline__ static type min() { return false; }; __device__ __forceinline__ static type min() { return false; };
__device__ __forceinline__ static type max() { return true; }; __device__ __forceinline__ static type max() { return true; };
@ -71,10 +73,10 @@ namespace cv { namespace gpu { namespace device
__device__ __forceinline__ static type quiet_NaN(); __device__ __forceinline__ static type quiet_NaN();
__device__ __forceinline__ static type signaling_NaN(); __device__ __forceinline__ static type signaling_NaN();
static const bool is_signed = false; static const bool is_signed = false;
}; };
template<> struct numeric_limits<char> template<> struct numeric_limits<char>
{ {
typedef char type; typedef char type;
__device__ __forceinline__ static type min() { return CHAR_MIN; }; __device__ __forceinline__ static type min() { return CHAR_MIN; };
__device__ __forceinline__ static type max() { return CHAR_MAX; }; __device__ __forceinline__ static type max() { return CHAR_MAX; };
@ -85,10 +87,10 @@ namespace cv { namespace gpu { namespace device
__device__ __forceinline__ static type quiet_NaN(); __device__ __forceinline__ static type quiet_NaN();
__device__ __forceinline__ static type signaling_NaN(); __device__ __forceinline__ static type signaling_NaN();
static const bool is_signed = (char)-1 == -1; static const bool is_signed = (char)-1 == -1;
}; };
template<> struct numeric_limits<signed char> template<> struct numeric_limits<signed char>
{ {
typedef char type; typedef char type;
__device__ __forceinline__ static type min() { return CHAR_MIN; }; __device__ __forceinline__ static type min() { return CHAR_MIN; };
__device__ __forceinline__ static type max() { return CHAR_MAX; }; __device__ __forceinline__ static type max() { return CHAR_MAX; };
@ -99,10 +101,10 @@ namespace cv { namespace gpu { namespace device
__device__ __forceinline__ static type quiet_NaN(); __device__ __forceinline__ static type quiet_NaN();
__device__ __forceinline__ static type signaling_NaN(); __device__ __forceinline__ static type signaling_NaN();
static const bool is_signed = (signed char)-1 == -1; static const bool is_signed = (signed char)-1 == -1;
}; };
template<> struct numeric_limits<unsigned char> template<> struct numeric_limits<unsigned char>
{ {
typedef unsigned char type; typedef unsigned char type;
__device__ __forceinline__ static type min() { return 0; }; __device__ __forceinline__ static type min() { return 0; };
__device__ __forceinline__ static type max() { return UCHAR_MAX; }; __device__ __forceinline__ static type max() { return UCHAR_MAX; };
@ -113,10 +115,10 @@ namespace cv { namespace gpu { namespace device
__device__ __forceinline__ static type quiet_NaN(); __device__ __forceinline__ static type quiet_NaN();
__device__ __forceinline__ static type signaling_NaN(); __device__ __forceinline__ static type signaling_NaN();
static const bool is_signed = false; static const bool is_signed = false;
}; };
template<> struct numeric_limits<short> template<> struct numeric_limits<short>
{ {
typedef short type; typedef short type;
__device__ __forceinline__ static type min() { return SHRT_MIN; }; __device__ __forceinline__ static type min() { return SHRT_MIN; };
__device__ __forceinline__ static type max() { return SHRT_MAX; }; __device__ __forceinline__ static type max() { return SHRT_MAX; };
@ -127,10 +129,10 @@ namespace cv { namespace gpu { namespace device
__device__ __forceinline__ static type quiet_NaN(); __device__ __forceinline__ static type quiet_NaN();
__device__ __forceinline__ static type signaling_NaN(); __device__ __forceinline__ static type signaling_NaN();
static const bool is_signed = true; static const bool is_signed = true;
}; };
template<> struct numeric_limits<unsigned short> template<> struct numeric_limits<unsigned short>
{ {
typedef unsigned short type; typedef unsigned short type;
__device__ __forceinline__ static type min() { return 0; }; __device__ __forceinline__ static type min() { return 0; };
__device__ __forceinline__ static type max() { return USHRT_MAX; }; __device__ __forceinline__ static type max() { return USHRT_MAX; };
@ -141,10 +143,10 @@ namespace cv { namespace gpu { namespace device
__device__ __forceinline__ static type quiet_NaN(); __device__ __forceinline__ static type quiet_NaN();
__device__ __forceinline__ static type signaling_NaN(); __device__ __forceinline__ static type signaling_NaN();
static const bool is_signed = false; static const bool is_signed = false;
}; };
template<> struct numeric_limits<int> template<> struct numeric_limits<int>
{ {
typedef int type; typedef int type;
__device__ __forceinline__ static type min() { return INT_MIN; }; __device__ __forceinline__ static type min() { return INT_MIN; };
__device__ __forceinline__ static type max() { return INT_MAX; }; __device__ __forceinline__ static type max() { return INT_MAX; };
@ -155,11 +157,11 @@ namespace cv { namespace gpu { namespace device
__device__ __forceinline__ static type quiet_NaN(); __device__ __forceinline__ static type quiet_NaN();
__device__ __forceinline__ static type signaling_NaN(); __device__ __forceinline__ static type signaling_NaN();
static const bool is_signed = true; static const bool is_signed = true;
}; };
template<> struct numeric_limits<unsigned int> template<> struct numeric_limits<unsigned int>
{ {
typedef unsigned int type; typedef unsigned int type;
__device__ __forceinline__ static type min() { return 0; }; __device__ __forceinline__ static type min() { return 0; };
__device__ __forceinline__ static type max() { return UINT_MAX; }; __device__ __forceinline__ static type max() { return UINT_MAX; };
@ -170,10 +172,10 @@ namespace cv { namespace gpu { namespace device
__device__ __forceinline__ static type quiet_NaN(); __device__ __forceinline__ static type quiet_NaN();
__device__ __forceinline__ static type signaling_NaN(); __device__ __forceinline__ static type signaling_NaN();
static const bool is_signed = false; static const bool is_signed = false;
}; };
template<> struct numeric_limits<long> template<> struct numeric_limits<long>
{ {
typedef long type; typedef long type;
__device__ __forceinline__ static type min() { return LONG_MIN; }; __device__ __forceinline__ static type min() { return LONG_MIN; };
__device__ __forceinline__ static type max() { return LONG_MAX; }; __device__ __forceinline__ static type max() { return LONG_MAX; };
@ -184,10 +186,10 @@ namespace cv { namespace gpu { namespace device
__device__ __forceinline__ static type quiet_NaN(); __device__ __forceinline__ static type quiet_NaN();
__device__ __forceinline__ static type signaling_NaN(); __device__ __forceinline__ static type signaling_NaN();
static const bool is_signed = true; static const bool is_signed = true;
}; };
template<> struct numeric_limits<unsigned long> template<> struct numeric_limits<unsigned long>
{ {
typedef unsigned long type; typedef unsigned long type;
__device__ __forceinline__ static type min() { return 0; }; __device__ __forceinline__ static type min() { return 0; };
__device__ __forceinline__ static type max() { return ULONG_MAX; }; __device__ __forceinline__ static type max() { return ULONG_MAX; };
@ -198,10 +200,10 @@ namespace cv { namespace gpu { namespace device
__device__ __forceinline__ static type quiet_NaN(); __device__ __forceinline__ static type quiet_NaN();
__device__ __forceinline__ static type signaling_NaN(); __device__ __forceinline__ static type signaling_NaN();
static const bool is_signed = false; static const bool is_signed = false;
}; };
template<> struct numeric_limits<float> template<> struct numeric_limits<float>
{ {
typedef float type; typedef float type;
__device__ __forceinline__ static type min() { return 1.175494351e-38f/*FLT_MIN*/; }; __device__ __forceinline__ static type min() { return 1.175494351e-38f/*FLT_MIN*/; };
__device__ __forceinline__ static type max() { return 3.402823466e+38f/*FLT_MAX*/; }; __device__ __forceinline__ static type max() { return 3.402823466e+38f/*FLT_MAX*/; };
@ -212,10 +214,10 @@ namespace cv { namespace gpu { namespace device
__device__ __forceinline__ static type quiet_NaN(); __device__ __forceinline__ static type quiet_NaN();
__device__ __forceinline__ static type signaling_NaN(); __device__ __forceinline__ static type signaling_NaN();
static const bool is_signed = true; static const bool is_signed = true;
}; };
template<> struct numeric_limits<double> template<> struct numeric_limits<double>
{ {
typedef double type; typedef double type;
__device__ __forceinline__ static type min() { return 2.2250738585072014e-308/*DBL_MIN*/; }; __device__ __forceinline__ static type min() { return 2.2250738585072014e-308/*DBL_MIN*/; };
__device__ __forceinline__ static type max() { return 1.7976931348623158e+308/*DBL_MAX*/; }; __device__ __forceinline__ static type max() { return 1.7976931348623158e+308/*DBL_MAX*/; };
@ -226,7 +228,8 @@ namespace cv { namespace gpu { namespace device
__device__ __forceinline__ static type quiet_NaN(); __device__ __forceinline__ static type quiet_NaN();
__device__ __forceinline__ static type signaling_NaN(); __device__ __forceinline__ static type signaling_NaN();
static const bool is_signed = true; static const bool is_signed = true;
}; };
}}}
END_OPENCV_DEVICE_NAMESPACE
#endif // __OPENCV_GPU_LIMITS_GPU_HPP__ #endif // __OPENCV_GPU_LIMITS_GPU_HPP__
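Note: this device-side numeric_limits mirrors std::numeric_limits so kernels can query type bounds without host headers; min(), max() and epsilon() are the members actually defined for the specializations above. A minimal usage sketch (kernel name is hypothetical; the qualification assumes the usual namespace macro expansion):

// Hedged usage sketch: seed per-block min/max accumulators inside a kernel.
using namespace cv::gpu::device;

__global__ void init_minmax(float* block_min, float* block_max)
{
    if (threadIdx.x == 0)
    {
        block_min[blockIdx.x] =  numeric_limits<float>::max();  // start high for a min-reduction
        block_max[blockIdx.x] = -numeric_limits<float>::max();  // start low for a max-reduction
    }
}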

@ -45,122 +45,173 @@
#include "internal_shared.hpp" #include "internal_shared.hpp"
namespace cv { namespace gpu { namespace device BEGIN_OPENCV_DEVICE_NAMESPACE
template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(uchar v) { return _Tp(v); }
template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(schar v) { return _Tp(v); }
template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(ushort v) { return _Tp(v); }
template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(short v) { return _Tp(v); }
template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(uint v) { return _Tp(v); }
template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(int v) { return _Tp(v); }
template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(float v) { return _Tp(v); }
template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(double v) { return _Tp(v); }
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(schar v)
{ {
template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(uchar v) { return _Tp(v); } return (uchar) ::max((int)v, 0);
template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(schar v) { return _Tp(v); } }
template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(ushort v) { return _Tp(v); } template<> __device__ __forceinline__ uchar saturate_cast<uchar>(ushort v)
template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(short v) { return _Tp(v); } {
template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(uint v) { return _Tp(v); } return (uchar) ::min((uint)v, (uint)UCHAR_MAX);
template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(int v) { return _Tp(v); } }
template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(float v) { return _Tp(v); } template<> __device__ __forceinline__ uchar saturate_cast<uchar>(int v)
template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(double v) { return _Tp(v); } {
return (uchar)((uint)v <= UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0);
}
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(uint v)
{
return (uchar) ::min(v, (uint)UCHAR_MAX);
}
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(short v)
{
return saturate_cast<uchar>((uint)v);
}
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(float v)
{
    int iv = __float2int_rn(v);
    return saturate_cast<uchar>(iv);
}
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(double v)
{
#if __CUDA_ARCH__ >= 130
    int iv = __double2int_rn(v);
    return saturate_cast<uchar>(iv);
#else
    return saturate_cast<uchar>((float)v);
#endif
}

template<> __device__ __forceinline__ schar saturate_cast<schar>(uchar v)
{
    return (schar) ::min((int)v, SCHAR_MAX);
}
template<> __device__ __forceinline__ schar saturate_cast<schar>(ushort v)
{
    return (schar) ::min((uint)v, (uint)SCHAR_MAX);
}
template<> __device__ __forceinline__ schar saturate_cast<schar>(int v)
{
    return (schar)((uint)(v-SCHAR_MIN) <= (uint)UCHAR_MAX ? v : v > 0 ? SCHAR_MAX : SCHAR_MIN);
}
template<> __device__ __forceinline__ schar saturate_cast<schar>(short v)
{
    return saturate_cast<schar>((int)v);
}
template<> __device__ __forceinline__ schar saturate_cast<schar>(uint v)
{
    return (schar) ::min(v, (uint)SCHAR_MAX);
}

template<> __device__ __forceinline__ schar saturate_cast<schar>(float v)
{
    int iv = __float2int_rn(v);
    return saturate_cast<schar>(iv);
}
template<> __device__ __forceinline__ schar saturate_cast<schar>(double v)
{
#if __CUDA_ARCH__ >= 130
    int iv = __double2int_rn(v);
    return saturate_cast<schar>(iv);
#else
    return saturate_cast<schar>((float)v);
#endif
}

template<> __device__ __forceinline__ ushort saturate_cast<ushort>(schar v)
{
    return (ushort) ::max((int)v, 0);
}
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(short v)
{
    return (ushort) ::max((int)v, 0);
}
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(int v)
{
    return (ushort)((uint)v <= (uint)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0);
}
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(uint v)
{
    return (ushort) ::min(v, (uint)USHRT_MAX);
}
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(float v)
{
    int iv = __float2int_rn(v);
    return saturate_cast<ushort>(iv);
}
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(double v)
{
#if __CUDA_ARCH__ >= 130
    int iv = __double2int_rn(v);
    return saturate_cast<ushort>(iv);
#else
    return saturate_cast<ushort>((float)v);
#endif
}

template<> __device__ __forceinline__ short saturate_cast<short>(ushort v)
{
    return (short) ::min((int)v, SHRT_MAX);
}
template<> __device__ __forceinline__ short saturate_cast<short>(int v)
{
    return (short)((uint)(v - SHRT_MIN) <= (uint)USHRT_MAX ? v : v > 0 ? SHRT_MAX : SHRT_MIN);
}
template<> __device__ __forceinline__ short saturate_cast<short>(uint v)
{
    return (short) ::min(v, (uint)SHRT_MAX);
}
template<> __device__ __forceinline__ short saturate_cast<short>(float v)
{
    int iv = __float2int_rn(v);
    return saturate_cast<short>(iv);
}
template<> __device__ __forceinline__ short saturate_cast<short>(double v)
{
#if __CUDA_ARCH__ >= 130
    int iv = __double2int_rn(v);
    return saturate_cast<short>(iv);
#else
    return saturate_cast<short>((float)v);
#endif
}

template<> __device__ __forceinline__ int saturate_cast<int>(float v)
{
    return __float2int_rn(v);
}
template<> __device__ __forceinline__ int saturate_cast<int>(double v)
{
#if __CUDA_ARCH__ >= 130
    return __double2int_rn(v);
#else
    return saturate_cast<int>((float)v);
#endif
}

template<> __device__ __forceinline__ uint saturate_cast<uint>(float v)
{
    return __float2uint_rn(v);
}
template<> __device__ __forceinline__ uint saturate_cast<uint>(double v)
{
#if __CUDA_ARCH__ >= 130
    return __double2uint_rn(v);
#else
    return saturate_cast<uint>((float)v);
#endif
}

END_OPENCV_DEVICE_NAMESPACE

#endif /* __OPENCV_GPU_SATURATE_CAST_HPP__ */
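For orientation only (not part of this commit): a minimal sketch of how the device-side casts above are typically called from a kernel. The kernel name scaleToU8 is hypothetical, and the sketch assumes it is compiled with nvcc inside the gpu module so that the internal header resolves.

#include "opencv2/gpu/device/saturate_cast.hpp"

// toy kernel: scale a float buffer and clamp each result into the 8-bit range
__global__ void scaleToU8(const float* src, unsigned char* dst, int n, float scale)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        dst[i] = cv::gpu::device::saturate_cast<unsigned char>(src[i] * scale); // rounds, then clamps to [0, 255]
}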
@ -43,32 +43,34 @@
#ifndef __OPENCV_GPU_TRANSFORM_HPP__
#define __OPENCV_GPU_TRANSFORM_HPP__

#include "internal_shared.hpp"
#include "utility.hpp"
#include "detail/transform_detail.hpp"

BEGIN_OPENCV_DEVICE_NAMESPACE

template <typename T, typename D, typename UnOp>
void transform(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, const UnOp& op, cudaStream_t stream = 0)
{
    detail::transform_caller(src, dst, op, WithOutMask(), stream);
}

template <typename T, typename D, typename UnOp>
void transform(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, const PtrStepb& mask, const UnOp& op, cudaStream_t stream = 0)
{
    detail::transform_caller(src, dst, op, SingleMask(mask), stream);
}

template <typename T1, typename T2, typename D, typename BinOp>
void transform(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst, const BinOp& op, cudaStream_t stream = 0)
{
    detail::transform_caller(src1, src2, dst, op, WithOutMask(), stream);
}

template <typename T1, typename T2, typename D, typename BinOp>
void transform(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst, const PtrStepb& mask, const BinOp& op, cudaStream_t stream = 0)
{
    detail::transform_caller(src1, src2, dst, op, SingleMask(mask), stream);
}

END_OPENCV_DEVICE_NAMESPACE

#endif // __OPENCV_GPU_TRANSFORM_HPP__
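A hedged usage sketch for the wrappers above (not from the commit): the functor Invert and the launcher invert8u are hypothetical names, and the sketch assumes a .cu translation unit of the gpu module where these internal headers compile.

#include "opencv2/gpu/device/transform.hpp"

// per-pixel functor: invert an 8-bit value
struct Invert
{
    __device__ __forceinline__ uchar operator()(uchar v) const { return (uchar)(255 - v); }
};

// launches the unmasked unary transform kernel on the given stream
void invert8u(const cv::gpu::DevMem2Db& src, const cv::gpu::DevMem2Db& dst, cudaStream_t stream)
{
    cv::gpu::device::transform(src, dst, Invert(), stream);
}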
@ -43,17 +43,18 @@
#ifndef __OPENCV_GPU_TYPE_TRAITS_HPP__
#define __OPENCV_GPU_TYPE_TRAITS_HPP__

#include "internal_shared.hpp"
#include "detail/type_traits_detail.hpp"

BEGIN_OPENCV_DEVICE_NAMESPACE

template <typename T> struct IsSimpleParameter
{
    enum {value = detail::IsIntegral<T>::value || detail::IsFloat<T>::value || detail::PointerTraits<typename detail::ReferenceTraits<T>::type>::value};
};

template <typename T> struct TypeTraits
{
    typedef typename detail::UnConst<T>::type NonConstType;
    typedef typename detail::UnVolatile<T>::type NonVolatileType;
    typedef typename detail::UnVolatile<typename detail::UnConst<T>::type>::type UnqualifiedType;
@ -74,7 +75,8 @@ namespace cv { namespace gpu { namespace device
    enum { isVec = detail::IsVec<UnqualifiedType>::value };

    typedef typename detail::Select<IsSimpleParameter<UnqualifiedType>::value, T, typename detail::AddParameterType<T>::type>::type ParameterType;
};

END_OPENCV_DEVICE_NAMESPACE

#endif // __OPENCV_GPU_TYPE_TRAITS_HPP__
@ -48,28 +48,28 @@
#include "datamov_utils.hpp"
#include "detail/utility_detail.hpp"

BEGIN_OPENCV_DEVICE_NAMESPACE

#define OPENCV_GPU_LOG_WARP_SIZE (5)
#define OPENCV_GPU_WARP_SIZE (1 << OPENCV_GPU_LOG_WARP_SIZE)
#define OPENCV_GPU_LOG_MEM_BANKS ((__CUDA_ARCH__ >= 200) ? 5 : 4) // 32 banks on fermi, 16 on tesla
#define OPENCV_GPU_MEM_BANKS (1 << OPENCV_GPU_LOG_MEM_BANKS)

///////////////////////////////////////////////////////////////////////////////
// swap

template <typename T> void __device__ __host__ __forceinline__ swap(T& a, T& b)
{
    const T temp = a;
    a = b;
    b = temp;
}

///////////////////////////////////////////////////////////////////////////////
// Mask Reader

struct SingleMask
{
    explicit __host__ __device__ __forceinline__ SingleMask(const PtrStepb& mask_) : mask(mask_) {}

    __device__ __forceinline__ bool operator()(int y, int x) const
@ -78,10 +78,10 @@ namespace cv { namespace gpu { namespace device
    }

    const PtrStepb mask;
};

struct MaskCollection
{
    explicit __host__ __device__ __forceinline__ MaskCollection(PtrStepb* maskCollection_) : maskCollection(maskCollection_) {}

    __device__ __forceinline__ void next()
@ -101,10 +101,10 @@ namespace cv { namespace gpu { namespace device
    const PtrStepb* maskCollection;
    PtrStepb curMask;
};

struct WithOutMask
{
    __device__ __forceinline__ void next() const
    {
    }
@ -116,37 +116,52 @@ namespace cv { namespace gpu { namespace device
    {
        return true;
    }

    __device__ __forceinline__ bool operator()(int, int, int) const
    {
        return true;
    }

    static __device__ __forceinline__ bool check(int, int)
    {
        return true;
    }

    static __device__ __forceinline__ bool check(int, int, int)
    {
        return true;
    }
};

///////////////////////////////////////////////////////////////////////////////
// Reduction

template <int n, typename T, typename Op> __device__ __forceinline__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)
{
    StaticAssert<n >= 8 && n <= 512>::check();
    detail::ReductionDispatcher<n <= 64>::reduce<n>(data, partial_reduction, tid, op);
}

template <int n, typename T, typename V, typename Pred>
__device__ __forceinline__ void reducePredVal(volatile T* sdata, T& myData, V* sval, V& myVal, int tid, const Pred& pred)
{
    StaticAssert<n >= 8 && n <= 512>::check();
    detail::PredValReductionDispatcher<n <= 64>::reduce<n>(myData, myVal, sdata, sval, tid, pred);
}

template <int n, typename T, typename V1, typename V2, typename Pred>
__device__ __forceinline__ void reducePredVal2(volatile T* sdata, T& myData, V1* sval1, V1& myVal1, V2* sval2, V2& myVal2, int tid, const Pred& pred)
{
    StaticAssert<n >= 8 && n <= 512>::check();
    detail::PredVal2ReductionDispatcher<n <= 64>::reduce<n>(myData, myVal1, myVal2, sdata, sval1, sval2, tid, pred);
}

///////////////////////////////////////////////////////////////////////////////
// Solve linear system

// solve 2x2 linear system Ax=b
template <typename T> __device__ __forceinline__ bool solve2x2(const T A[2][2], const T b[2], T x[2])
{
    T det = A[0][0] * A[1][1] - A[1][0] * A[0][1];

    if (det != 0)
@ -161,11 +176,11 @@ namespace cv { namespace gpu { namespace device
    }

    return false;
}

// solve 3x3 linear system Ax=b
template <typename T> __device__ __forceinline__ bool solve3x3(const T A[3][3], const T b[3], T x[3])
{
    T det = A[0][0] * (A[1][1] * A[2][2] - A[1][2] * A[2][1])
          - A[0][1] * (A[1][0] * A[2][2] - A[1][2] * A[2][0])
          + A[0][2] * (A[1][0] * A[2][1] - A[1][1] * A[2][0]);
@ -193,7 +208,8 @@ namespace cv { namespace gpu { namespace device
    }

    return false;
}

END_OPENCV_DEVICE_NAMESPACE

#endif // __OPENCV_GPU_UTILITY_HPP__
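As a quick illustration of the linear-system helpers (not part of the commit), a sketch with a hypothetical function name solving one concrete 2x2 system:

#include "opencv2/gpu/device/utility.hpp"

// solves [2 1; 1 3] * x = [3; 5]; the determinant is 5, so x becomes (0.8, 1.4)
__device__ bool solve2x2Example(float x[2])
{
    const float A[2][2] = { {2.f, 1.f}, {1.f, 3.f} };
    const float b[2]    = { 3.f, 5.f };
    return cv::gpu::device::solve2x2(A, b, x);
}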
@ -48,11 +48,10 @@
#include "functional.hpp"
#include "detail/vec_distance_detail.hpp"

BEGIN_OPENCV_DEVICE_NAMESPACE

template <typename T> struct L1Dist
{
    typedef int value_type;
    typedef int result_type;
@ -74,9 +73,9 @@ namespace cv { namespace gpu { namespace device
    }

    int mySum;
};

template <> struct L1Dist<float>
{
    typedef float value_type;
    typedef float result_type;
@ -98,10 +97,10 @@ namespace cv { namespace gpu { namespace device
    }

    float mySum;
};

struct L2Dist
{
    typedef float value_type;
    typedef float result_type;
@ -124,10 +123,10 @@ namespace cv { namespace gpu { namespace device
    }

    float mySum;
};

struct HammingDist
{
    typedef int value_type;
    typedef int result_type;
@ -149,12 +148,12 @@ namespace cv { namespace gpu { namespace device
    }

    int mySum;
};

// calc distance between two vectors in global memory
template <int THREAD_DIM, typename Dist, typename T1, typename T2>
__device__ void calcVecDiffGlobal(const T1* vec1, const T2* vec2, int len, Dist& dist, typename Dist::result_type* smem, int tid)
{
    for (int i = tid; i < len; i += THREAD_DIM)
    {
        T1 val1;
@ -167,20 +166,20 @@ namespace cv { namespace gpu { namespace device
    }

    dist.reduceAll<THREAD_DIM>(smem, tid);
}

// calc distance between two vectors, first vector is cached in register or shared memory, second vector is in global memory
template <int THREAD_DIM, int MAX_LEN, bool LEN_EQ_MAX_LEN, typename Dist, typename T1, typename T2>
__device__ __forceinline__ void calcVecDiffCached(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, typename Dist::result_type* smem, int tid)
{
    detail::VecDiffCachedCalculator<THREAD_DIM, MAX_LEN, LEN_EQ_MAX_LEN>::calc(vecCached, vecGlob, len, dist, tid);

    dist.reduceAll<THREAD_DIM>(smem, tid);
}

// calc distance between two vectors in global memory
template <int THREAD_DIM, typename T1> struct VecDiffGlobal
{
    explicit __device__ __forceinline__ VecDiffGlobal(const T1* vec1_, int = 0, void* = 0, int = 0, int = 0)
    {
        vec1 = vec1_;
@ -193,11 +192,11 @@ namespace cv { namespace gpu { namespace device
    }

    const T1* vec1;
};

// calc distance between two vectors, first vector is cached in register memory, second vector is in global memory
template <int THREAD_DIM, int MAX_LEN, bool LEN_EQ_MAX_LEN, typename U> struct VecDiffCachedRegister
{
    template <typename T1> __device__ __forceinline__ VecDiffCachedRegister(const T1* vec1, int len, U* smem, int glob_tid, int tid)
    {
        if (glob_tid < len)
@ -220,7 +219,8 @@ namespace cv { namespace gpu { namespace device
    }

    U vec1Vals[MAX_LEN / THREAD_DIM];
};

END_OPENCV_DEVICE_NAMESPACE

#endif // __OPENCV_GPU_VEC_DISTANCE_HPP__
@ -48,10 +48,10 @@
#include "vec_traits.hpp"
#include "functional.hpp"

BEGIN_OPENCV_DEVICE_NAMESPACE

namespace detail
{
    template <int cn, typename VecD> struct SatCastHelper;
    template <typename VecD> struct SatCastHelper<1, VecD>
    {
@ -90,43 +90,43 @@ namespace cv { namespace gpu { namespace device
    {
        return SatCastHelper<VecTraits<VecD>::cn, VecD>::cast(v);
    }
}

template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uchar1& v) {return detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const char1& v) {return detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const ushort1& v) {return detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const short1& v) {return detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uint1& v) {return detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const int1& v) {return detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const float1& v) {return detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const double1& v) {return detail::saturate_cast_caller<_Tp>(v);}

template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uchar2& v) {return detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const char2& v) {return detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const ushort2& v) {return detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const short2& v) {return detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uint2& v) {return detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const int2& v) {return detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const float2& v) {return detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const double2& v) {return detail::saturate_cast_caller<_Tp>(v);}

template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uchar3& v) {return detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const char3& v) {return detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const ushort3& v) {return detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const short3& v) {return detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uint3& v) {return detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const int3& v) {return detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const float3& v) {return detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const double3& v) {return detail::saturate_cast_caller<_Tp>(v);}

template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uchar4& v) {return detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const char4& v) {return detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const ushort4& v) {return detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const short4& v) {return detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uint4& v) {return detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const int4& v) {return detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const float4& v) {return detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const double4& v) {return detail::saturate_cast_caller<_Tp>(v);}

#define OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, op, func) \
    __device__ __forceinline__ TypeVec<func<type>::result_type, 1>::vec_type op(const type ## 1 & a) \
@ -150,8 +150,8 @@ namespace cv { namespace gpu { namespace device
        return VecTraits<TypeVec<func<type>::result_type, 4>::vec_type>::make(f(a.x), f(a.y), f(a.z), f(a.w)); \
    }

namespace detail
{
    template <typename T1, typename T2> struct BinOpTraits
    {
        typedef int argument_type;
@ -192,7 +192,7 @@ namespace cv { namespace gpu { namespace device
    {
        typedef double argument_type;
    };
}

#define OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, op, func) \
    __device__ __forceinline__ TypeVec<func<type>::result_type, 1>::vec_type op(const type ## 1 & a, const type ## 1 & b) \
@ -313,19 +313,20 @@ namespace cv { namespace gpu { namespace device
    OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, operator ^, bit_xor) \
    OPENCV_GPU_IMPLEMENT_VEC_UNOP (type, operator ~, bit_not)

OPENCV_GPU_IMPLEMENT_VEC_INT_OP(uchar)
OPENCV_GPU_IMPLEMENT_VEC_INT_OP(char)
OPENCV_GPU_IMPLEMENT_VEC_INT_OP(ushort)
OPENCV_GPU_IMPLEMENT_VEC_INT_OP(short)
OPENCV_GPU_IMPLEMENT_VEC_INT_OP(int)
OPENCV_GPU_IMPLEMENT_VEC_INT_OP(uint)
OPENCV_GPU_IMPLEMENT_VEC_OP(float)
OPENCV_GPU_IMPLEMENT_VEC_OP(double)

#undef OPENCV_GPU_IMPLEMENT_VEC_UNOP
#undef OPENCV_GPU_IMPLEMENT_VEC_BINOP
#undef OPENCV_GPU_IMPLEMENT_VEC_OP
#undef OPENCV_GPU_IMPLEMENT_VEC_INT_OP

END_OPENCV_DEVICE_NAMESPACE

#endif // __OPENCV_GPU_VECMATH_HPP__
@ -45,82 +45,82 @@
#include "internal_shared.hpp"

BEGIN_OPENCV_DEVICE_NAMESPACE

template<typename T, int N> struct TypeVec;

struct __align__(8) uchar8
{
    uchar a0, a1, a2, a3, a4, a5, a6, a7;
};
static __host__ __device__ __forceinline__ uchar8 make_uchar8(uchar a0, uchar a1, uchar a2, uchar a3, uchar a4, uchar a5, uchar a6, uchar a7)
{
    uchar8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
    return val;
}
struct __align__(8) char8
{
    schar a0, a1, a2, a3, a4, a5, a6, a7;
};
static __host__ __device__ __forceinline__ char8 make_char8(schar a0, schar a1, schar a2, schar a3, schar a4, schar a5, schar a6, schar a7)
{
    char8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
    return val;
}
struct __align__(16) ushort8
{
    ushort a0, a1, a2, a3, a4, a5, a6, a7;
};
static __host__ __device__ __forceinline__ ushort8 make_ushort8(ushort a0, ushort a1, ushort a2, ushort a3, ushort a4, ushort a5, ushort a6, ushort a7)
{
    ushort8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
    return val;
}
struct __align__(16) short8
{
    short a0, a1, a2, a3, a4, a5, a6, a7;
};
static __host__ __device__ __forceinline__ short8 make_short8(short a0, short a1, short a2, short a3, short a4, short a5, short a6, short a7)
{
    short8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
    return val;
}
struct __align__(32) uint8
{
    uint a0, a1, a2, a3, a4, a5, a6, a7;
};
static __host__ __device__ __forceinline__ uint8 make_uint8(uint a0, uint a1, uint a2, uint a3, uint a4, uint a5, uint a6, uint a7)
{
    uint8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
    return val;
}
struct __align__(32) int8
{
    int a0, a1, a2, a3, a4, a5, a6, a7;
};
static __host__ __device__ __forceinline__ int8 make_int8(int a0, int a1, int a2, int a3, int a4, int a5, int a6, int a7)
{
    int8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
    return val;
}
struct __align__(32) float8
{
    float a0, a1, a2, a3, a4, a5, a6, a7;
};
static __host__ __device__ __forceinline__ float8 make_float8(float a0, float a1, float a2, float a3, float a4, float a5, float a6, float a7)
{
    float8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
    return val;
}
struct double8
{
    double a0, a1, a2, a3, a4, a5, a6, a7;
};
static __host__ __device__ __forceinline__ double8 make_double8(double a0, double a1, double a2, double a3, double a4, double a5, double a6, double a7)
{
    double8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
    return val;
}

#define OPENCV_GPU_IMPLEMENT_TYPE_VEC(type) \
    template<> struct TypeVec<type, 1> { typedef type vec_type; }; \
@ -134,28 +134,28 @@ namespace cv { namespace gpu { namespace device
    template<> struct TypeVec<type, 8> { typedef type ## 8 vec_type; }; \
    template<> struct TypeVec<type ## 8, 8> { typedef type ## 8 vec_type; };

OPENCV_GPU_IMPLEMENT_TYPE_VEC(uchar)
OPENCV_GPU_IMPLEMENT_TYPE_VEC(char)
OPENCV_GPU_IMPLEMENT_TYPE_VEC(ushort)
OPENCV_GPU_IMPLEMENT_TYPE_VEC(short)
OPENCV_GPU_IMPLEMENT_TYPE_VEC(int)
OPENCV_GPU_IMPLEMENT_TYPE_VEC(uint)
OPENCV_GPU_IMPLEMENT_TYPE_VEC(float)
OPENCV_GPU_IMPLEMENT_TYPE_VEC(double)

#undef OPENCV_GPU_IMPLEMENT_TYPE_VEC

template<> struct TypeVec<schar, 1> { typedef schar vec_type; };
template<> struct TypeVec<schar, 2> { typedef char2 vec_type; };
template<> struct TypeVec<schar, 3> { typedef char3 vec_type; };
template<> struct TypeVec<schar, 4> { typedef char4 vec_type; };
template<> struct TypeVec<schar, 8> { typedef char8 vec_type; };

template<> struct TypeVec<bool, 1> { typedef uchar vec_type; };
template<> struct TypeVec<bool, 2> { typedef uchar2 vec_type; };
template<> struct TypeVec<bool, 3> { typedef uchar3 vec_type; };
template<> struct TypeVec<bool, 4> { typedef uchar4 vec_type; };
template<> struct TypeVec<bool, 8> { typedef uchar8 vec_type; };

template<typename T> struct VecTraits;
@ -209,72 +209,73 @@ namespace cv { namespace gpu { namespace device
    static __device__ __host__ __forceinline__ type ## 8 make(const type* v) {return make_ ## type ## 8(v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);} \
};

OPENCV_GPU_IMPLEMENT_VEC_TRAITS(uchar)
OPENCV_GPU_IMPLEMENT_VEC_TRAITS(ushort)
OPENCV_GPU_IMPLEMENT_VEC_TRAITS(short)
OPENCV_GPU_IMPLEMENT_VEC_TRAITS(int)
OPENCV_GPU_IMPLEMENT_VEC_TRAITS(uint)
OPENCV_GPU_IMPLEMENT_VEC_TRAITS(float)
OPENCV_GPU_IMPLEMENT_VEC_TRAITS(double)

#undef OPENCV_GPU_IMPLEMENT_VEC_TRAITS

template<> struct VecTraits<char>
{
    typedef char elem_type;
    enum {cn=1};
    static __device__ __host__ __forceinline__ char all(char v) {return v;}
    static __device__ __host__ __forceinline__ char make(char x) {return x;}
    static __device__ __host__ __forceinline__ char make(const char* x) {return *x;}
};
template<> struct VecTraits<schar>
{
    typedef schar elem_type;
    enum {cn=1};
    static __device__ __host__ __forceinline__ schar all(schar v) {return v;}
    static __device__ __host__ __forceinline__ schar make(schar x) {return x;}
    static __device__ __host__ __forceinline__ schar make(const schar* x) {return *x;}
};
template<> struct VecTraits<char1>
{
    typedef schar elem_type;
    enum {cn=1};
    static __device__ __host__ __forceinline__ char1 all(schar v) {return make_char1(v);}
    static __device__ __host__ __forceinline__ char1 make(schar x) {return make_char1(x);}
    static __device__ __host__ __forceinline__ char1 make(const schar* v) {return make_char1(v[0]);}
};
template<> struct VecTraits<char2>
{
    typedef schar elem_type;
    enum {cn=2};
    static __device__ __host__ __forceinline__ char2 all(schar v) {return make_char2(v, v);}
    static __device__ __host__ __forceinline__ char2 make(schar x, schar y) {return make_char2(x, y);}
    static __device__ __host__ __forceinline__ char2 make(const schar* v) {return make_char2(v[0], v[1]);}
};
template<> struct VecTraits<char3>
{
    typedef schar elem_type;
    enum {cn=3};
    static __device__ __host__ __forceinline__ char3 all(schar v) {return make_char3(v, v, v);}
    static __device__ __host__ __forceinline__ char3 make(schar x, schar y, schar z) {return make_char3(x, y, z);}
    static __device__ __host__ __forceinline__ char3 make(const schar* v) {return make_char3(v[0], v[1], v[2]);}
};
template<> struct VecTraits<char4>
{
    typedef schar elem_type;
    enum {cn=4};
    static __device__ __host__ __forceinline__ char4 all(schar v) {return make_char4(v, v, v, v);}
    static __device__ __host__ __forceinline__ char4 make(schar x, schar y, schar z, schar w) {return make_char4(x, y, z, w);}
    static __device__ __host__ __forceinline__ char4 make(const schar* v) {return make_char4(v[0], v[1], v[2], v[3]);}
};
template<> struct VecTraits<char8>
{
    typedef schar elem_type;
    enum {cn=8};
    static __device__ __host__ __forceinline__ char8 all(schar v) {return make_char8(v, v, v, v, v, v, v, v);}
    static __device__ __host__ __forceinline__ char8 make(schar a0, schar a1, schar a2, schar a3, schar a4, schar a5, schar a6, schar a7) {return make_char8(a0, a1, a2, a3, a4, a5, a6, a7);}
    static __device__ __host__ __forceinline__ char8 make(const schar* v) {return make_char8(v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);}
};

END_OPENCV_DEVICE_NAMESPACE

#endif // __OPENCV_GPU_VEC_TRAITS_HPP__
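A small sketch (not from the commit) of the traits above used generically in device code; the helper name broadcast is hypothetical.

#include "opencv2/gpu/device/vec_traits.hpp"

// builds a vector with every channel set to v, e.g. broadcast<uchar4>(7) == make_uchar4(7, 7, 7, 7)
template <typename VecT>
__device__ __forceinline__ VecT broadcast(typename cv::gpu::device::VecTraits<VecT>::elem_type v)
{
    return cv::gpu::device::VecTraits<VecT>::all(v);
}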
@ -40,17 +40,15 @@
//
//M*/

#ifndef __OPENCV_GPU_DEVICE_WARP_HPP__
#define __OPENCV_GPU_DEVICE_WARP_HPP__

#include "internal_shared.hpp"

BEGIN_OPENCV_DEVICE_NAMESPACE

struct Warp
{
    enum
    {
        LOG_WARP_SIZE = 5,
@ -110,9 +108,8 @@ namespace cv
        for(OutIt t = beg + lane; t < end; t += STRIDE, value += STRIDE)
            *t = value;
    }
};

END_OPENCV_DEVICE_NAMESPACE

#endif /* __OPENCV_GPU_DEVICE_WARP_HPP__ */
@ -41,17 +41,16 @@
//M*/

#ifndef OPENCV_GPU_WARP_REDUCE_HPP__
#define OPENCV_GPU_WARP_REDUCE_HPP__

#include "internal_shared.hpp"

BEGIN_OPENCV_DEVICE_NAMESPACE

template <class T>
__device__ __forceinline__ T warp_reduce(volatile T* ptr, const unsigned int tid = threadIdx.x)
{
    const unsigned int lane = tid & 31; // index of thread in warp (0..31)

    if (lane < 16)
@ -64,10 +63,10 @@ namespace cv
        ptr[tid] = partial = partial + ptr[tid + 2];
        ptr[tid] = partial = partial + ptr[tid + 1];
    }

    return ptr[tid - lane];
}

END_OPENCV_DEVICE_NAMESPACE

#endif /* OPENCV_GPU_WARP_REDUCE_HPP__ */
@ -42,6 +42,8 @@
#include "precomp.hpp"

using namespace cv;
using namespace cv::gpu;
using namespace std;

#if !defined (HAVE_CUDA)
@ -53,25 +55,25 @@ void cv::gpu::split(const GpuMat& /*src*/, vector<GpuMat>& /*dst*/, Stream& /*st
#else /* !defined (HAVE_CUDA) */

BEGIN_OPENCV_DEVICE_NAMESPACE

namespace split_merge
{
    void merge_caller(const DevMem2Db* src, DevMem2Db& dst, int total_channels, size_t elem_size, const cudaStream_t& stream);
    void split_caller(const DevMem2Db& src, DevMem2Db* dst, int num_channels, size_t elem_size1, const cudaStream_t& stream);
}

END_OPENCV_DEVICE_NAMESPACE

namespace
{
    void merge(const GpuMat* src, size_t n, GpuMat& dst, const cudaStream_t& stream)
    {
        using namespace OPENCV_DEVICE_NAMESPACE_ split_merge;

        CV_Assert(src);
        CV_Assert(n > 0);

        int depth = src[0].depth();
        Size size = src[0].size();
@ -100,20 +102,15 @@ namespace cv { namespace gpu { namespace split_merge
            src_as_devmem[i] = src[i];

        DevMem2Db dst_as_devmem(dst);
        merge_caller(src_as_devmem, dst_as_devmem, total_channels, CV_ELEM_SIZE(depth), stream);
    }

    void split(const GpuMat& src, GpuMat* dst, const cudaStream_t& stream)
    {
        using namespace OPENCV_DEVICE_NAMESPACE_ split_merge;

        CV_Assert(dst);

        int depth = src.depth();
        int num_channels = src.channels();
@ -135,38 +132,31 @@ namespace cv { namespace gpu { namespace split_merge
            dst_as_devmem[i] = dst[i];

        DevMem2Db src_as_devmem(src);
        split_caller(src_as_devmem, dst_as_devmem, num_channels, src.elemSize1(), stream);
    }
}

void cv::gpu::merge(const GpuMat* src, size_t n, GpuMat& dst, Stream& stream)
{
    ::merge(src, n, dst, StreamAccessor::getStream(stream));
}

void cv::gpu::merge(const vector<GpuMat>& src, GpuMat& dst, Stream& stream)
{
    ::merge(&src[0], src.size(), dst, StreamAccessor::getStream(stream));
}

void cv::gpu::split(const GpuMat& src, GpuMat* dst, Stream& stream)
{
    ::split(src, dst, StreamAccessor::getStream(stream));
}

void cv::gpu::split(const GpuMat& src, vector<GpuMat>& dst, Stream& stream)
{
    dst.resize(src.channels());
    if(src.channels() > 0)
        ::split(src, &dst[0], StreamAccessor::getStream(stream));
}

#endif /* !defined (HAVE_CUDA) */
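For context, a host-side sketch of the public wrappers these callers back (not part of the commit); it assumes a CUDA-capable device and an input GpuMat uploaded beforehand, and the function name is hypothetical.

#include <vector>
#include "opencv2/gpu/gpu.hpp"

void splitAndMergeExample(const cv::gpu::GpuMat& bgr /* e.g. CV_8UC3 */)
{
    std::vector<cv::gpu::GpuMat> planes;
    cv::gpu::split(bgr, planes);      // planes.size() == bgr.channels() afterwards

    cv::gpu::GpuMat remerged;
    cv::gpu::merge(planes, remerged); // remerged has the same size and type as bgr
}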
@ -55,21 +55,23 @@ void cv::gpu::StereoBM_GPU::operator() ( const GpuMat&, const GpuMat&, GpuMat&,
#else /* !defined (HAVE_CUDA) */

BEGIN_OPENCV_DEVICE_NAMESPACE

namespace stereobm
{
    void stereoBM_GPU(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& disp, int ndisp, int winsz, const DevMem2D_<uint>& minSSD_buf, cudaStream_t & stream);
    void prefilter_xsobel(const DevMem2Db& input, const DevMem2Db& output, int prefilterCap /*= 31*/, cudaStream_t & stream);
    void postfilter_textureness(const DevMem2Db& input, int winsz, float avgTexturenessThreshold, const DevMem2Db& disp, cudaStream_t & stream);
}

END_OPENCV_DEVICE_NAMESPACE

const float defaultAvgTexThreshold = 3;

cv::gpu::StereoBM_GPU::StereoBM_GPU()
    : preset(BASIC_PRESET), ndisp(DEFAULT_NDISP), winSize(DEFAULT_WINSZ), avergeTexThreshold(defaultAvgTexThreshold)
{
}

cv::gpu::StereoBM_GPU::StereoBM_GPU(int preset_, int ndisparities_, int winSize_)
    : preset(preset_), ndisp(ndisparities_), winSize(winSize_), avergeTexThreshold(defaultAvgTexThreshold)
@ -93,8 +95,12 @@ bool cv::gpu::StereoBM_GPU::checkIfGpuCallReasonable()
    return false;
}

namespace
{
    void stereo_bm_gpu_operator( GpuMat& minSSD, GpuMat& leBuf, GpuMat& riBuf, int preset, int ndisp, int winSize, float avergeTexThreshold, const GpuMat& left, const GpuMat& right, GpuMat& disparity, cudaStream_t stream)
    {
        using namespace OPENCV_DEVICE_NAMESPACE_ stereobm;

        CV_DbgAssert(left.rows == right.rows && left.cols == right.cols);
        CV_DbgAssert(left.type() == CV_8UC1);
        CV_DbgAssert(right.type() == CV_8UC1);
@ -110,22 +116,23 @@ static void stereo_bm_gpu_operator ( GpuMat& minSSD, GpuMat& leBuf, GpuMat& ri
            leBuf.create( left.size(), left.type());
            riBuf.create(right.size(), right.type());

            prefilter_xsobel( left, leBuf, 31, stream);
            prefilter_xsobel(right, riBuf, 31, stream);

            le_for_bm = leBuf;
            ri_for_bm = riBuf;
        }

        stereoBM_GPU(le_for_bm, ri_for_bm, disparity, ndisp, winSize, minSSD, stream);

        if (avergeTexThreshold)
            postfilter_textureness(le_for_bm, winSize, avergeTexThreshold, disparity, stream);
    }
}

void cv::gpu::StereoBM_GPU::operator() ( const GpuMat& left, const GpuMat& right, GpuMat& disparity, Stream& stream)
{
    stereo_bm_gpu_operator(minSSD, leBuf, riBuf, preset, ndisp, winSize, avergeTexThreshold, left, right, disparity, StreamAccessor::getStream(stream));
}

#endif /* !defined (HAVE_CUDA) */
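An illustrative host-side call of the block-matching wrapper above (not part of the commit); the function name and parameter values are only an example.

#include "opencv2/gpu/gpu.hpp"

void disparityExample(const cv::gpu::GpuMat& left, const cv::gpu::GpuMat& right)
{
    // left and right must be CV_8UC1 images of the same size
    cv::gpu::StereoBM_GPU bm(cv::gpu::StereoBM_GPU::BASIC_PRESET, 64 /*ndisp*/, 19 /*winSize*/);
    cv::gpu::GpuMat disparity;
    bm(left, right, disparity);
}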
@ -59,7 +59,9 @@ void cv::gpu::StereoBeliefPropagation::operator()(const GpuMat&, GpuMat&, Stream
#else /* !defined (HAVE_CUDA) */

BEGIN_OPENCV_DEVICE_NAMESPACE

namespace stereobp
{
    void load_constants(int ndisp, float max_data_term, float data_weight, float max_disc_term, float disc_single_jump);
    template<typename T, typename D>
@ -74,7 +76,11 @@ namespace cv { namespace gpu { namespace bp
template <typename T> template <typename T>
void output_gpu(const DevMem2Db& u, const DevMem2Db& d, const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data, void output_gpu(const DevMem2Db& u, const DevMem2Db& d, const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data,
const DevMem2D_<short>& disp, cudaStream_t stream); const DevMem2D_<short>& disp, cudaStream_t stream);
}}} }
END_OPENCV_DEVICE_NAMESPACE
using namespace OPENCV_DEVICE_NAMESPACE_ stereobp;
namespace namespace
{ {
@ -84,7 +90,6 @@ namespace
const float DEFAULT_DISC_SINGLE_JUMP = 1.0f; const float DEFAULT_DISC_SINGLE_JUMP = 1.0f;
} }
void cv::gpu::StereoBeliefPropagation::estimateRecommendedParams(int width, int height, int& ndisp, int& iters, int& levels) void cv::gpu::StereoBeliefPropagation::estimateRecommendedParams(int width, int height, int& ndisp, int& iters, int& levels)
{ {
ndisp = width / 4; ndisp = width / 4;
@ -136,8 +141,8 @@ namespace
typedef void (*comp_data_t)(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream); typedef void (*comp_data_t)(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream);
static const comp_data_t comp_data_callers[2][5] = static const comp_data_t comp_data_callers[2][5] =
{ {
{0, bp::comp_data_gpu<unsigned char, short>, 0, bp::comp_data_gpu<uchar3, short>, bp::comp_data_gpu<uchar4, short>}, {0, comp_data_gpu<unsigned char, short>, 0, comp_data_gpu<uchar3, short>, comp_data_gpu<uchar4, short>},
{0, bp::comp_data_gpu<unsigned char, float>, 0, bp::comp_data_gpu<uchar3, float>, bp::comp_data_gpu<uchar4, float>} {0, comp_data_gpu<unsigned char, float>, 0, comp_data_gpu<uchar3, float>, comp_data_gpu<uchar4, float>}
}; };
CV_Assert(left.size() == right.size() && left.type() == right.type()); CV_Assert(left.size() == right.size() && left.type() == right.type());
@ -236,7 +241,7 @@ namespace
} }
} }
bp::load_constants(rthis.ndisp, rthis.max_data_term, scale * rthis.data_weight, scale * rthis.max_disc_term, scale * rthis.disc_single_jump); load_constants(rthis.ndisp, rthis.max_data_term, scale * rthis.data_weight, scale * rthis.max_disc_term, scale * rthis.disc_single_jump);
datas.resize(rthis.levels); datas.resize(rthis.levels);
@ -249,8 +254,6 @@ namespace
void calcBP(GpuMat& disp, Stream& stream) void calcBP(GpuMat& disp, Stream& stream)
{ {
using namespace cv::gpu::bp;
typedef void (*data_step_down_t)(int dst_cols, int dst_rows, int src_rows, const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream); typedef void (*data_step_down_t)(int dst_cols, int dst_rows, int src_rows, const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream);
static const data_step_down_t data_step_down_callers[2] = static const data_step_down_t data_step_down_callers[2] =
{ {
@ -354,13 +357,13 @@ namespace
void cv::gpu::StereoBeliefPropagation::operator()(const GpuMat& left, const GpuMat& right, GpuMat& disp, Stream& stream) void cv::gpu::StereoBeliefPropagation::operator()(const GpuMat& left, const GpuMat& right, GpuMat& disp, Stream& stream)
{ {
::StereoBeliefPropagationImpl impl(*this, u, d, l, r, u2, d2, l2, r2, datas, out); StereoBeliefPropagationImpl impl(*this, u, d, l, r, u2, d2, l2, r2, datas, out);
impl(left, right, disp, stream); impl(left, right, disp, stream);
} }
void cv::gpu::StereoBeliefPropagation::operator()(const GpuMat& data, GpuMat& disp, Stream& stream) void cv::gpu::StereoBeliefPropagation::operator()(const GpuMat& data, GpuMat& disp, Stream& stream)
{ {
::StereoBeliefPropagationImpl impl(*this, u, d, l, r, u2, d2, l2, r2, datas, out); StereoBeliefPropagationImpl impl(*this, u, d, l, r, u2, d2, l2, r2, datas, out);
impl(data, disp, stream); impl(data, disp, stream);
} }
@@ -57,7 +57,9 @@ void cv::gpu::StereoConstantSpaceBP::operator()(const GpuMat&, const GpuMat&, Gp
 #else /* !defined (HAVE_CUDA) */
-namespace cv { namespace gpu { namespace csbp
+BEGIN_OPENCV_DEVICE_NAMESPACE
+namespace stereocsbp
 {
     void load_constants(int ndisp, float max_data_term, float data_weight, float max_disc_term, float disc_single_jump, int min_disp_th,
         const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& temp);
@@ -84,8 +86,11 @@ namespace cv { namespace gpu { namespace csbp
     template<class T>
     void compute_disp(const T* u, const T* d, const T* l, const T* r, const T* data_cost_selected, const T* disp_selected, size_t msg_step,
         const DevMem2D_<short>& disp, int nr_plane, cudaStream_t stream);
-}}}
+}
+END_OPENCV_DEVICE_NAMESPACE
+using namespace OPENCV_DEVICE_NAMESPACE_ stereocsbp;
 namespace
 {
@@ -208,8 +213,7 @@ static void csbp_operator(StereoConstantSpaceBP& rthis, GpuMat u[2], GpuMat d[2]
     ////////////////////////////////////////////////////////////////////////////
     // Compute
-    csbp::load_constants(rthis.ndisp, rthis.max_data_term, rthis.data_weight,
-        rthis.max_disc_term, rthis.disc_single_jump, rthis.min_disp_th, left, right, temp);
+    load_constants(rthis.ndisp, rthis.max_data_term, rthis.data_weight, rthis.max_disc_term, rthis.disc_single_jump, rthis.min_disp_th, left, right, temp);
     if (stream)
     {
@@ -248,17 +252,17 @@ static void csbp_operator(StereoConstantSpaceBP& rthis, GpuMat u[2], GpuMat d[2]
     {
         if (i == levels - 1)
         {
-            csbp::init_data_cost(left.rows, left.cols, disp_selected_pyr[cur_idx].ptr<T>(), data_cost_selected.ptr<T>(),
+            init_data_cost(left.rows, left.cols, disp_selected_pyr[cur_idx].ptr<T>(), data_cost_selected.ptr<T>(),
                 step_pyr[i], rows_pyr[i], cols_pyr[i], i, nr_plane_pyr[i], rthis.ndisp, left.channels(), rthis.use_local_init_data_cost, cudaStream);
         }
         else
         {
-            csbp::compute_data_cost(disp_selected_pyr[cur_idx].ptr<T>(), data_cost.ptr<T>(), step_pyr[i], step_pyr[i+1],
+            compute_data_cost(disp_selected_pyr[cur_idx].ptr<T>(), data_cost.ptr<T>(), step_pyr[i], step_pyr[i+1],
                 left.rows, left.cols, rows_pyr[i], cols_pyr[i], rows_pyr[i+1], i, nr_plane_pyr[i+1], left.channels(), cudaStream);
             int new_idx = (cur_idx + 1) & 1;
-            csbp::init_message(u[new_idx].ptr<T>(), d[new_idx].ptr<T>(), l[new_idx].ptr<T>(), r[new_idx].ptr<T>(),
+            init_message(u[new_idx].ptr<T>(), d[new_idx].ptr<T>(), l[new_idx].ptr<T>(), r[new_idx].ptr<T>(),
                 u[cur_idx].ptr<T>(), d[cur_idx].ptr<T>(), l[cur_idx].ptr<T>(), r[cur_idx].ptr<T>(),
                 disp_selected_pyr[new_idx].ptr<T>(), disp_selected_pyr[cur_idx].ptr<T>(),
                 data_cost_selected.ptr<T>(), data_cost.ptr<T>(), step_pyr[i], step_pyr[i+1], rows_pyr[i],
@@ -267,7 +271,7 @@ static void csbp_operator(StereoConstantSpaceBP& rthis, GpuMat u[2], GpuMat d[2]
             cur_idx = new_idx;
         }
-        csbp::calc_all_iterations(u[cur_idx].ptr<T>(), d[cur_idx].ptr<T>(), l[cur_idx].ptr<T>(), r[cur_idx].ptr<T>(),
+        calc_all_iterations(u[cur_idx].ptr<T>(), d[cur_idx].ptr<T>(), l[cur_idx].ptr<T>(), r[cur_idx].ptr<T>(),
             data_cost_selected.ptr<T>(), disp_selected_pyr[cur_idx].ptr<T>(), step_pyr[i],
             rows_pyr[i], cols_pyr[i], nr_plane_pyr[i], rthis.iters, cudaStream);
     }
@@ -282,7 +286,7 @@ static void csbp_operator(StereoConstantSpaceBP& rthis, GpuMat u[2], GpuMat d[2]
     else
         out.setTo(zero);
-    csbp::compute_disp(u[cur_idx].ptr<T>(), d[cur_idx].ptr<T>(), l[cur_idx].ptr<T>(), r[cur_idx].ptr<T>(),
+    compute_disp(u[cur_idx].ptr<T>(), d[cur_idx].ptr<T>(), l[cur_idx].ptr<T>(), r[cur_idx].ptr<T>(),
         data_cost_selected.ptr<T>(), disp_selected_pyr[cur_idx].ptr<T>(), step_pyr[0], out, nr_plane_pyr[0], cudaStream);
     if (disp.type() != CV_16S)
@@ -63,8 +63,17 @@ void cv::gpu::SURF_GPU::releaseMemory() { throw_nogpu(); }
 #else /* !defined (HAVE_CUDA) */
-namespace cv { namespace gpu { namespace surf
+BEGIN_OPENCV_DEVICE_NAMESPACE
+namespace surf
 {
+    void loadGlobalConstants(int maxCandidates, int maxFeatures, int img_rows, int img_cols, int nOctaveLayers, float hessianThreshold);
+    void loadOctaveConstants(int octave, int layer_rows, int layer_cols);
+    void bindImgTex(DevMem2Db img);
+    void bindSumTex(DevMem2D_<uint> sum);
+    void bindMaskSumTex(DevMem2D_<uint> maskSum);
     void icvCalcLayerDetAndTrace_gpu(const PtrStepf& det, const PtrStepf& trace, int img_rows, int img_cols, int octave, int nOctaveLayers);
     void icvFindMaximaInLayer_gpu(const PtrStepf& det, const PtrStepf& trace, int4* maxPosBuffer, unsigned int* maxCounter,
@@ -78,9 +87,11 @@ namespace cv { namespace gpu { namespace surf
     void compute_descriptors_gpu(const DevMem2Df& descriptors,
         const float* featureX, const float* featureY, const float* featureSize, const float* featureDir, int nFeatures);
-}}}
-using namespace cv::gpu::surf;
+}
+END_OPENCV_DEVICE_NAMESPACE
+using namespace OPENCV_DEVICE_NAMESPACE_ surf;
 namespace
 {
@@ -136,24 +147,18 @@ namespace
         counters.create(1, nOctaves + 1, CV_32SC1);
         counters.setTo(Scalar::all(0));
-        uploadConstant("cv::gpu::surf::c_max_candidates", maxCandidates);
-        uploadConstant("cv::gpu::surf::c_max_features", maxFeatures);
-        uploadConstant("cv::gpu::surf::c_img_rows", img_rows);
-        uploadConstant("cv::gpu::surf::c_img_cols", img_cols);
-        uploadConstant("cv::gpu::surf::c_nOctaveLayers", nOctaveLayers);
-        uploadConstant("cv::gpu::surf::c_hessianThreshold", static_cast<float>(hessianThreshold));
+        loadGlobalConstants(maxCandidates, maxFeatures, img_rows, img_cols, nOctaveLayers, static_cast<float>(hessianThreshold));
-        imgTex.bind("cv::gpu::surf::imgTex", (DevMem2Db)img);
+        bindImgTex(img);
         integralBuffered(img, sum, intBuffer);
-        sumTex.bind("cv::gpu::surf::sumTex", (DevMem2D_<unsigned int>)sum);
+        bindSumTex(sum);
         if (use_mask)
         {
             min(mask, 1.0, mask1);
             integralBuffered(mask1, maskSum, intBuffer);
-            maskSumTex.bind("cv::gpu::surf::maskSumTex", (DevMem2D_<unsigned int>)maskSum);
+            bindMaskSumTex(maskSum);
         }
     }
@@ -171,9 +176,7 @@ namespace
         const int layer_rows = img_rows >> octave;
         const int layer_cols = img_cols >> octave;
-        uploadConstant("cv::gpu::surf::c_octave", octave);
-        uploadConstant("cv::gpu::surf::c_layer_rows", layer_rows);
-        uploadConstant("cv::gpu::surf::c_layer_cols", layer_cols);
+        loadOctaveConstants(octave, layer_rows, layer_cols);
         icvCalcLayerDetAndTrace_gpu(det, trace, img_rows, img_cols, octave, nOctaveLayers);
@@ -242,8 +245,6 @@ namespace
         int maxFeatures;
         GpuMat counters;
-        TextureBinder imgTex, sumTex, maskSumTex;
     };
 }
@@ -336,7 +337,7 @@ void cv::gpu::SURF_GPU::downloadKeypoints(const GpuMat& keypointsGPU, vector<Key
 {
     CV_Assert(keypointsGPU.type() == CV_32FC1 && keypointsGPU.rows == SF_FEATURE_STRIDE);
-    Mat keypointsCPU = keypointsGPU;
+    Mat keypointsCPU(keypointsGPU);
     keypoints.resize(nFeatures);
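In this file the string-based uploadConstant()/TextureBinder calls are folded into named device-side wrappers (loadGlobalConstants, bindImgTex, and friends). A hedged sketch of what such wrappers typically look like in the .cu file is below; the symbol names and signatures are assumptions for illustration, not code from this commit.

    // Plausible .cu-side implementation of constant/texture upload wrappers.
    // All names here (c_img_rows, imgTex, ...) are illustrative assumptions.
    #include <cuda_runtime.h>

    __constant__ int c_img_rows;
    __constant__ int c_img_cols;

    // 2D texture reference over the 8-bit input image
    texture<unsigned char, 2, cudaReadModeElementType> imgTex;

    void loadGlobalConstants(int img_rows, int img_cols)
    {
        // copy host values into __constant__ memory; the host code no longer
        // needs to know the symbol names as strings
        cudaMemcpyToSymbol(c_img_rows, &img_rows, sizeof(img_rows));
        cudaMemcpyToSymbol(c_img_cols, &img_cols, sizeof(img_cols));
    }

    void bindImgTex(const unsigned char* img, size_t step, int rows, int cols)
    {
        // bind the pitched device image to the texture reference
        cudaChannelFormatDesc desc = cudaCreateChannelDesc<unsigned char>();
        cudaBindTexture2D(0, imgTex, img, desc, cols, rows, step);
    }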
@@ -549,8 +549,8 @@ TEST_P(MorphEx, Accuracy)
     cv::gpu::GpuMat dev_dst_rgba;
     cv::gpu::GpuMat dev_dst_gray;
-    cv::gpu::morphologyEx(cv::gpu::GpuMat(img_rgba), dev_dst_rgba, morphOps[morphOpsIdx], cv::gpu::GpuMat(kernel));
-    cv::gpu::morphologyEx(cv::gpu::GpuMat(img_gray), dev_dst_gray, morphOps[morphOpsIdx], cv::gpu::GpuMat(kernel));
+    cv::gpu::morphologyEx(cv::gpu::GpuMat(img_rgba), dev_dst_rgba, morphOps[morphOpsIdx], kernel);
+    cv::gpu::morphologyEx(cv::gpu::GpuMat(img_gray), dev_dst_gray, morphOps[morphOpsIdx], kernel);
     dev_dst_rgba.download(dst_rgba);
     dev_dst_gray.download(dst_gray);
@@ -137,7 +137,7 @@ struct CV_GpuHogDetectTestRunner : cv::gpu::HOGDescriptor
 #ifdef DUMP
     dump(block_hists, locations);
 #else
-    compare(block_hists, locations);
+    compare(cv::Mat(block_hists), locations);
 #endif
     // Test detect on smaller image
@@ -148,7 +148,7 @@ struct CV_GpuHogDetectTestRunner : cv::gpu::HOGDescriptor
 #ifdef DUMP
     dump(block_hists, locations);
 #else
-    compare(block_hists, locations);
+    compare(cv::Mat(block_hists), locations);
 #endif
     // Test detect on greater image
@@ -158,7 +158,7 @@ struct CV_GpuHogDetectTestRunner : cv::gpu::HOGDescriptor
 #ifdef DUMP
     dump(block_hists, locations);
 #else
-    compare(block_hists, locations);
+    compare(cv::Mat(block_hists), locations);
 #endif
 }
@@ -254,31 +254,31 @@ struct CV_GpuHogGetDescriptorsTestRunner : cv::gpu::HOGDescriptor
     ASSERT_TRUE(!img_rgb.empty());
     cv::cvtColor(img_rgb, img, CV_BGR2BGRA);
     computeBlockHistograms(cv::gpu::GpuMat(img));
-    compare_inner_parts(block_hists, descriptors.rowRange(1, 2));
+    compare_inner_parts(cv::Mat(block_hists), cv::Mat(descriptors.rowRange(1, 2)));
     img_rgb = readImage("hog/negative1.png");
     ASSERT_TRUE(!img_rgb.empty());
     cv::cvtColor(img_rgb, img, CV_BGR2BGRA);
     computeBlockHistograms(cv::gpu::GpuMat(img));
-    compare_inner_parts(block_hists, descriptors.rowRange(2, 3));
+    compare_inner_parts(cv::Mat(block_hists), cv::Mat(descriptors.rowRange(2, 3)));
     img_rgb = readImage("hog/negative2.png");
     ASSERT_TRUE(!img_rgb.empty());
     cv::cvtColor(img_rgb, img, CV_BGR2BGRA);
     computeBlockHistograms(cv::gpu::GpuMat(img));
-    compare_inner_parts(block_hists, descriptors.rowRange(3, 4));
+    compare_inner_parts(cv::Mat(block_hists), cv::Mat(descriptors.rowRange(3, 4)));
     img_rgb = readImage("hog/positive3.png");
     ASSERT_TRUE(!img_rgb.empty());
     cv::cvtColor(img_rgb, img, CV_BGR2BGRA);
     computeBlockHistograms(cv::gpu::GpuMat(img));
-    compare_inner_parts(block_hists, descriptors.rowRange(4, 5));
+    compare_inner_parts(cv::Mat(block_hists), cv::Mat(descriptors.rowRange(4, 5)));
     img_rgb = readImage("hog/negative3.png");
     ASSERT_TRUE(!img_rgb.empty());
     cv::cvtColor(img_rgb, img, CV_BGR2BGRA);
     computeBlockHistograms(cv::gpu::GpuMat(img));
-    compare_inner_parts(block_hists, descriptors.rowRange(5, 6));
+    compare_inner_parts(cv::Mat(block_hists), cv::Mat(descriptors.rowRange(5, 6)));
 }
 // Does not compare border value, as interpolation leads to delta
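The test changes above all follow one pattern: wherever a GpuMat used to be passed where a host cv::Mat is expected, the download is now spelled out with an explicit cv::Mat(...) construction. A minimal usage sketch of that call-site pattern (assumes an OpenCV build with CUDA; the helper name is hypothetical):

    #include <opencv2/core/core.hpp>
    #include <opencv2/gpu/gpu.hpp>

    // hypothetical helper mirroring the updated test call sites
    void check_histograms(const cv::gpu::GpuMat& block_hists)
    {
        cv::Mat hists(block_hists);      // explicit device-to-host download
        // cv::Mat hists = block_hists;  // relies on an implicit conversion,
        //                               // which the updated call sites avoid
        CV_Assert(hists.rows == block_hists.rows && hists.cols == block_hists.cols);
    }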
@@ -3897,7 +3897,7 @@ static void testC2C(const std::string& hint, int cols, int rows, int flags, bool
     EXPECT_TRUE(!inplace || d_b.ptr() == d_b_data.ptr());
     ASSERT_EQ(CV_32F, d_b.depth());
     ASSERT_EQ(2, d_b.channels());
-    EXPECT_MAT_NEAR(b_gold, d_b, rows * cols * 1e-4);
+    EXPECT_MAT_NEAR(b_gold, cv::Mat(d_b), rows * cols * 1e-4);
 }
 TEST_P(Dft, C2C)
@@ -206,7 +206,7 @@ void App::run()
     workEnd();
     // Show results
-    disp = d_disp;
+    d_disp.download(disp);
     putText(disp, text(), Point(5, 25), FONT_HERSHEY_SIMPLEX, 1.0, Scalar::all(255));
     imshow("disparity", disp);
@@ -71,7 +71,7 @@ int main(int argc, char* argv[])
     // drawing the results
     Mat img_matches;
-    drawMatches(img1, keypoints1, img2, keypoints2, matches, img_matches);
+    drawMatches(Mat(img1), keypoints1, Mat(img2), keypoints2, matches, img_matches);
     namedWindow("matches", 0);
     imshow("matches", img_matches);