added registerPageLocked/unregisterPageLocked functions

added convert functions to BruteForceMatcher_GPU
other minor fixes
This commit is contained in:
Vladislav Vinogradov
2011-08-10 11:32:48 +00:00
parent 24279c2c11
commit 8bb9e4302e
18 changed files with 276 additions and 228 deletions

View File

@@ -56,9 +56,8 @@ namespace cv { namespace gpu { namespace bfmatcher
///////////////////////////////////////////////////////////////////////////////
// Mask strategy
class SingleMask
struct SingleMask
{
public:
explicit SingleMask(const PtrStep& mask_) : mask(mask_) {}
__device__ __forceinline__ bool operator()(int queryIdx, int trainIdx) const
@@ -66,13 +65,11 @@ namespace cv { namespace gpu { namespace bfmatcher
return mask.ptr(queryIdx)[trainIdx] != 0;
}
private:
PtrStep mask;
const PtrStep mask;
};
class MaskCollection
struct MaskCollection
{
public:
explicit MaskCollection(PtrStep* maskCollection_) : maskCollection(maskCollection_) {}
__device__ __forceinline__ void nextMask()
@@ -86,15 +83,14 @@ namespace cv { namespace gpu { namespace bfmatcher
return curMask.data == 0 || (ForceGlob<uchar>::Load(curMask.ptr(queryIdx), trainIdx, val), (val != 0));
}
private:
PtrStep* maskCollection;
const PtrStep* maskCollection;
PtrStep curMask;
};
class WithOutMask
{
public:
__device__ __forceinline__ void nextMask()
__device__ __forceinline__ void nextMask() const
{
}
__device__ __forceinline__ bool operator()(int queryIdx, int trainIdx) const
@@ -128,9 +124,8 @@ namespace cv { namespace gpu { namespace bfmatcher
///////////////////////////////////////////////////////////////////////////////
// Distance
template <typename T> class L1Dist
template <typename T> struct L1Dist
{
public:
typedef int ResultType;
typedef int ValueType;
@@ -151,12 +146,10 @@ namespace cv { namespace gpu { namespace bfmatcher
return mySum;
}
private:
int mySum;
};
template <> class L1Dist<float>
template <> struct L1Dist<float>
{
public:
typedef float ResultType;
typedef float ValueType;
@@ -177,13 +170,11 @@ namespace cv { namespace gpu { namespace bfmatcher
return mySum;
}
private:
float mySum;
};
class L2Dist
struct L2Dist
{
public:
typedef float ResultType;
typedef float ValueType;
@@ -205,13 +196,11 @@ namespace cv { namespace gpu { namespace bfmatcher
return sqrtf(mySum);
}
private:
float mySum;
};
class HammingDist
struct HammingDist
{
public:
typedef int ResultType;
typedef int ValueType;
@@ -232,7 +221,6 @@ namespace cv { namespace gpu { namespace bfmatcher
return mySum;
}
private:
int mySum;
};
@@ -425,10 +413,8 @@ namespace cv { namespace gpu { namespace bfmatcher
///////////////////////////////////////////////////////////////////////////////
// ReduceDescCalculator
template <int BLOCK_DIM_X, typename T>
class ReduceDescCalculatorSimple
template <int BLOCK_DIM_X, typename T> struct ReduceDescCalculatorSimple
{
public:
__device__ __forceinline__ void prepare(const T* queryDescs_, int, void*)
{
queryDescs = queryDescs_;
@@ -440,14 +426,12 @@ namespace cv { namespace gpu { namespace bfmatcher
reduceDescDiff<BLOCK_DIM_X>(queryDescs, trainDescs, desc_len, dist, sdiff_row);
}
private:
const T* queryDescs;
};
template <int BLOCK_DIM_X, int MAX_DESCRIPTORS_LEN, bool DESC_LEN_EQ_MAX_LEN, typename T, typename U>
class ReduceDescCalculatorCached
struct ReduceDescCalculatorCached
{
public:
__device__ __forceinline__ void prepare(const T* queryDescs, int desc_len, U* smem)
{
loadDescsVals<BLOCK_DIM_X, MAX_DESCRIPTORS_LEN>(queryDescs, desc_len, queryVals, smem);
@@ -459,7 +443,6 @@ namespace cv { namespace gpu { namespace bfmatcher
reduceDescDiffCached<BLOCK_DIM_X, MAX_DESCRIPTORS_LEN, DESC_LEN_EQ_MAX_LEN>(queryVals, trainDescs, desc_len, dist, sdiff_row);
}
private:
U queryVals[MAX_DESCRIPTORS_LEN / BLOCK_DIM_X];
};
@@ -497,10 +480,8 @@ namespace cv { namespace gpu { namespace bfmatcher
///////////////////////////////////////////////////////////////////////////////
// Train collection loop strategy
template <typename T>
class SingleTrain
template <typename T> struct SingleTrain
{
public:
explicit SingleTrain(const DevMem2D_<T>& trainDescs_) : trainDescs(trainDescs_)
{
}
@@ -517,14 +498,11 @@ namespace cv { namespace gpu { namespace bfmatcher
return trainDescs.cols;
}
private:
DevMem2D_<T> trainDescs;
const DevMem2D_<T> trainDescs;
};
template <typename T>
class TrainCollection
template <typename T> struct TrainCollection
{
public:
TrainCollection(const DevMem2D_<T>* trainCollection_, int nImg_, int desclen_) :
trainCollection(trainCollection_), nImg(nImg_), desclen(desclen_)
{
@@ -536,7 +514,7 @@ namespace cv { namespace gpu { namespace bfmatcher
{
for (int imgIdx = 0; imgIdx < nImg; ++imgIdx)
{
DevMem2D_<T> trainDescs = trainCollection[imgIdx];
const DevMem2D_<T> trainDescs = trainCollection[imgIdx];
m.nextMask();
matchDescs<Dist>(queryIdx, imgIdx, trainDescs, m, reduceDescCalc, myMin, myBestTrainIdx, myBestImgIdx, sdiff_row);
}
@@ -547,7 +525,6 @@ namespace cv { namespace gpu { namespace bfmatcher
return desclen;
}
private:
const DevMem2D_<T>* trainCollection;
int nImg;
int desclen;
@@ -806,7 +783,7 @@ namespace cv { namespace gpu { namespace bfmatcher
// Calc distance kernel
template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename T, typename Mask>
__global__ void calcDistance(PtrStep_<T> queryDescs_, DevMem2D_<T> trainDescs_, Mask mask, PtrStepf distance)
__global__ void calcDistance(const PtrStep_<T> queryDescs_, const DevMem2D_<T> trainDescs_, const Mask mask, PtrStepf distance)
{
__shared__ typename Dist::ResultType sdiff[BLOCK_DIM_X * BLOCK_DIM_Y];
@@ -989,8 +966,7 @@ namespace cv { namespace gpu { namespace bfmatcher
///////////////////////////////////////////////////////////////////////////////
// find knn match kernel
template <int BLOCK_SIZE>
__global__ void findBestMatch(DevMem2Df allDist_, int i, PtrStepi trainIdx_, PtrStepf distance_)
template <int BLOCK_SIZE> __global__ void findBestMatch(DevMem2Df allDist_, int i, PtrStepi trainIdx_, PtrStepf distance_)
{
const int SMEM_SIZE = BLOCK_SIZE > 64 ? BLOCK_SIZE : 64;
__shared__ float sdist[SMEM_SIZE];
@@ -1130,8 +1106,8 @@ namespace cv { namespace gpu { namespace bfmatcher
// Radius Match kernel
template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename T, typename Mask>
__global__ void radiusMatch(PtrStep_<T> queryDescs_, DevMem2D_<T> trainDescs_,
float maxDistance, Mask mask, DevMem2Di trainIdx_, unsigned int* nMatches, PtrStepf distance)
__global__ void radiusMatch(const PtrStep_<T> queryDescs_, const DevMem2D_<T> trainDescs_,
float maxDistance, const Mask mask, DevMem2Di trainIdx_, unsigned int* nMatches, PtrStepf distance)
{
#if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 110

View File

@@ -42,6 +42,7 @@
#include "internal_shared.hpp"
#include "opencv2/gpu/device/transform.hpp"
#include "opencv2/gpu/device/functional.hpp"
#define SOLVE_PNP_RANSAC_MAX_NUM_ITERS 200
@@ -56,9 +57,9 @@ namespace cv { namespace gpu
__constant__ float3 crot2;
__constant__ float3 ctransl;
struct TransformOp
struct TransformOp : unary_function<float3, float3>
{
__device__ __forceinline__ float3 operator()(float3 p) const
__device__ __forceinline__ float3 operator()(const float3& p) const
{
return make_float3(
crot0.x * p.x + crot0.y * p.y + crot0.z * p.z + ctransl.x,
@@ -89,9 +90,9 @@ namespace cv { namespace gpu
__constant__ float3 cproj0;
__constant__ float3 cproj1;
struct ProjectOp
struct ProjectOp : unary_function<float3, float3>
{
__device__ __forceinline__ float2 operator()(float3 p) const
__device__ __forceinline__ float2 operator()(const float3& p) const
{
// Rotate and translate in 3D
float3 t = make_float3(

View File

@@ -49,7 +49,7 @@ using namespace cv::gpu::device;
namespace cv { namespace gpu { namespace canny
{
__global__ void calcSobelRowPass(PtrStep src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols)
__global__ void calcSobelRowPass(const PtrStep src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols)
{
__shared__ int smem[16][18];
@@ -100,7 +100,8 @@ namespace cv { namespace gpu { namespace canny
}
};
template <typename Norm> __global__ void calcMagnitude(PtrStepi dx_buf, PtrStepi dy_buf, PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols)
template <typename Norm> __global__ void calcMagnitude(const PtrStepi dx_buf, const PtrStepi dy_buf,
PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols)
{
__shared__ int sdx[18][16];
__shared__ int sdy[18][16];
@@ -179,7 +180,7 @@ namespace cv { namespace gpu { namespace canny
#define CANNY_SHIFT 15
#define TG22 (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5)
__global__ void calcMap(PtrStepi dx, PtrStepi dy, PtrStepf mag, PtrStepi map, int rows, int cols, float low_thresh, float high_thresh)
__global__ void calcMap(const PtrStepi dx, const PtrStepi dy, const PtrStepf mag, PtrStepi map, int rows, int cols, float low_thresh, float high_thresh)
{
__shared__ float smem[18][18];

View File

@@ -56,10 +56,9 @@ namespace cv { namespace gpu { namespace mathfunc
//////////////////////////////////////////////////////////////////////////////////////
// Compare
template <typename T1, typename T2>
struct NotEqual
template <typename T1, typename T2> struct NotEqual : binary_function<T1, T2, uchar>
{
__device__ __forceinline__ uchar operator()(const T1& src1, const T2& src2)
__device__ __forceinline__ uchar operator()(const T1& src1, const T2& src2) const
{
return static_cast<uchar>(static_cast<int>(src1 != src2) * 255);
}
@@ -467,8 +466,7 @@ namespace cv { namespace gpu { namespace mathfunc
//////////////////////////////////////////////////////////////////////////
// pow
template<typename T, bool Signed = device::numeric_limits<T>::is_signed>
struct PowOp
template<typename T, bool Signed = device::numeric_limits<T>::is_signed> struct PowOp : unary_function<T, T>
{
float power;
PowOp(float power_) : power(power_) {}
@@ -479,13 +477,12 @@ namespace cv { namespace gpu { namespace mathfunc
}
};
template<typename T>
struct PowOp<T, true>
template<typename T> struct PowOp<T, true> : unary_function<T, T>
{
float power;
PowOp(float power_) : power(power_) {}
__device__ __forceinline__ float operator()(const T& e)
__device__ __forceinline__ float operator()(const T& e) const
{
T res = saturate_cast<T>(__powf((float)e, power));
@@ -495,13 +492,12 @@ namespace cv { namespace gpu { namespace mathfunc
}
};
template<>
struct PowOp<float>
template<> struct PowOp<float> : unary_function<float, float>
{
float power;
PowOp(float power_) : power(power_) {}
__device__ __forceinline__ float operator()(const float& e)
__device__ __forceinline__ float operator()(const float& e) const
{
return __powf(fabs(e), power);
}

View File

@@ -105,7 +105,7 @@ namespace cv { namespace gpu { namespace histograms
if (x + 3 < cols) addByte(s_WarpHist, (data >> 24) & 0xFFU, tag);
}
__global__ void histogram256(PtrStep_<uint> d_Data, uint* d_PartialHistograms, uint dataCount, uint cols)
__global__ void histogram256(const PtrStep_<uint> d_Data, uint* d_PartialHistograms, uint dataCount, uint cols)
{
//Per-warp subhistogram storage
__shared__ uint s_Hist[HISTOGRAM256_THREADBLOCK_MEMORY];
@@ -189,21 +189,18 @@ namespace cv { namespace gpu { namespace histograms
cudaSafeCall( cudaDeviceSynchronize() );
}
__global__ void equalizeHist(DevMem2D src, PtrStep dst, const int* lut)
__constant__ int c_lut[256];
__global__ void equalizeHist(const DevMem2D src, PtrStep dst)
{
__shared__ int s_lut[256];
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
s_lut[tid] = lut[tid];
__syncthreads();
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < src.cols && y < src.rows)
{
dst.ptr(y)[x] = __float2int_rn(255.0f * s_lut[src.ptr(y)[x]] / (src.cols * src.rows));
const uchar val = src.ptr(y)[x];
const int lut = c_lut[val];
dst.ptr(y)[x] = __float2int_rn(255.0f / (src.cols * src.rows) * lut);
}
}
@@ -212,7 +209,9 @@ namespace cv { namespace gpu { namespace histograms
dim3 block(16, 16);
dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
equalizeHist<<<grid, block, 0, stream>>>(src, dst, lut);
cudaSafeCall( cudaMemcpyToSymbol(cv::gpu::histograms::c_lut, lut, 256 * sizeof(int), 0, cudaMemcpyDeviceToDevice) );
equalizeHist<<<grid, block, 0, stream>>>(src, dst);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)

View File

@@ -49,7 +49,7 @@ using namespace cv::gpu::device;
/////////////////////////////////// Remap ///////////////////////////////////////////////
namespace cv { namespace gpu { namespace imgproc
{
texture<unsigned char, 2, cudaReadModeNormalizedFloat> tex_remap;
texture<unsigned char, 2, cudaReadModeNormalizedFloat> tex_remap(0, cudaFilterModeLinear, cudaAddressModeWrap);
__global__ void remap_1c(const float* mapx, const float* mapy, size_t map_step, uchar* out, size_t out_step, int width, int height)
{
@@ -131,16 +131,12 @@ namespace cv { namespace gpu { namespace imgproc
grid.x = divUp(dst.cols, threads.x);
grid.y = divUp(dst.rows, threads.y);
tex_remap.filterMode = cudaFilterModeLinear;
tex_remap.addressMode[0] = tex_remap.addressMode[1] = cudaAddressModeWrap;
cudaChannelFormatDesc desc = cudaCreateChannelDesc<unsigned char>();
cudaSafeCall( cudaBindTexture2D(0, tex_remap, src.data, desc, src.cols, src.rows, src.step) );
TextureBinder tex_remap(&tex_remap, src);
remap_1c<<<grid, threads>>>(xmap.data, ymap.data, xmap.step, dst.data, dst.step, dst.cols, dst.rows);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
cudaSafeCall( cudaUnbindTexture(tex_remap) );
}
void remap_gpu_3c(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, DevMem2D dst)
@@ -151,8 +147,8 @@ namespace cv { namespace gpu { namespace imgproc
grid.y = divUp(dst.rows, threads.y);
remap_3c<<<grid, threads>>>(src.data, src.step, xmap.data, ymap.data, xmap.step, dst.data, dst.step, dst.cols, dst.rows);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}

View File

@@ -77,7 +77,6 @@ namespace cv
// Returns true if the GPU analogue exists, false otherwise.
bool tryConvertToGpuBorderType(int cpuBorderType, int& gpuBorderType);
static inline int divUp(int total, int grain) { return (total + grain - 1) / grain; }
template<class T> static inline void uploadConstant(const char* name, const T& value)
@@ -117,6 +116,49 @@ namespace cv
cudaSafeCall( cudaUnbindTexture(tex) );
}
class TextureBinder
{
public:
TextureBinder() : tex_(0) {}
template <typename T> TextureBinder(const textureReference* tex, const DevMem2D_<T>& img) : tex_(0)
{
bind(tex, img);
}
template <typename T> TextureBinder(const char* tex_name, const DevMem2D_<T>& img) : tex_(0)
{
bind(tex_name, img);
}
~TextureBinder() { unbind(); }
template <typename T> void bind(const textureReference* tex, const DevMem2D_<T>& img)
{
unbind();
cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
cudaSafeCall( cudaBindTexture2D(0, tex, img.ptr(), &desc, img.cols, img.rows, img.step) );
tex_ = tex;
}
template <typename T> void bind(const char* tex_name, const DevMem2D_<T>& img)
{
const textureReference* tex;
cudaSafeCall( cudaGetTextureReference(&tex, tex_name) );
bind(tex, img);
}
void unbind()
{
if (tex_)
{
cudaUnbindTexture(tex_);
tex_ = 0;
}
}
private:
const textureReference* tex_;
};
class NppStreamHandler
{
public:

View File

@@ -43,6 +43,7 @@
#include "internal_shared.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/transform.hpp"
#include "opencv2/gpu/device/functional.hpp"
using namespace cv::gpu::device;
@@ -62,7 +63,7 @@ namespace cv { namespace gpu { namespace matrix_operations {
///////////////////////////////////////////////////////////////////////////
template<typename T>
__global__ void copy_to_with_mask(T * mat_src, T * mat_dst, const unsigned char * mask, int cols, int rows, size_t step_mat, size_t step_mask, int channels)
__global__ void copy_to_with_mask(const T* mat_src, T* mat_dst, const uchar* mask, int cols, int rows, size_t step_mat, size_t step_mask, int channels)
{
size_t x = blockIdx.x * blockDim.x + threadIdx.x;
size_t y = blockIdx.y * blockDim.y + threadIdx.y;
@@ -162,7 +163,7 @@ namespace cv { namespace gpu { namespace matrix_operations {
}
template<typename T>
__global__ void set_to_without_mask(T * mat, int cols, int rows, size_t step, int channels)
__global__ void set_to_without_mask(T* mat, int cols, int rows, size_t step, int channels)
{
size_t x = blockIdx.x * blockDim.x + threadIdx.x;
size_t y = blockIdx.y * blockDim.y + threadIdx.y;
@@ -175,7 +176,7 @@ namespace cv { namespace gpu { namespace matrix_operations {
}
template<typename T>
__global__ void set_to_with_mask(T * mat, const unsigned char * mask, int cols, int rows, size_t step, int channels, size_t step_mask)
__global__ void set_to_with_mask(T* mat, const uchar* mask, int cols, int rows, size_t step, int channels, size_t step_mask)
{
size_t x = blockIdx.x * blockDim.x + threadIdx.x;
size_t y = blockIdx.y * blockDim.y + threadIdx.y;
@@ -237,19 +238,16 @@ namespace cv { namespace gpu { namespace matrix_operations {
//////////////////////////////// ConvertTo ////////////////////////////////
///////////////////////////////////////////////////////////////////////////
template <typename T, typename D>
class Convertor
template <typename T, typename D> struct Convertor : unary_function<T, D>
{
public:
Convertor(double alpha_, double beta_) : alpha(alpha_), beta(beta_) {}
__device__ __forceinline__ D operator()(const T& src)
__device__ __forceinline__ D operator()(const T& src) const
{
return saturate_cast<D>(alpha * src + beta);
}
private:
double alpha, beta;
const double alpha, beta;
};
template<typename T, typename D>

View File

@@ -225,7 +225,7 @@ namespace cv { namespace gpu { namespace surf
};
template <typename Mask>
__global__ void icvFindMaximaInLayer(PtrStepf det, PtrStepf trace, int4* maxPosBuffer, unsigned int* maxCounter)
__global__ void icvFindMaximaInLayer(const PtrStepf det, const PtrStepf trace, int4* maxPosBuffer, unsigned int* maxCounter)
{
#if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
@@ -346,7 +346,7 @@ namespace cv { namespace gpu { namespace surf
////////////////////////////////////////////////////////////////////////
// INTERPOLATION
__global__ void icvInterpolateKeypoint(PtrStepf det, const int4* maxPosBuffer,
__global__ void icvInterpolateKeypoint(const PtrStepf det, const int4* maxPosBuffer,
float* featureX, float* featureY, int* featureLaplacian, float* featureSize, float* featureHessian,
unsigned int* featureCounter)
{