moved SURF_GPU and VIBE to gpunonfree module
@@ -1,137 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#include "precomp.hpp"

#if !defined HAVE_CUDA || defined(CUDA_DISABLER)

cv::gpu::VIBE_GPU::VIBE_GPU(unsigned long) { throw_nogpu(); }
void cv::gpu::VIBE_GPU::initialize(const GpuMat&, Stream&) { throw_nogpu(); }
void cv::gpu::VIBE_GPU::operator()(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
void cv::gpu::VIBE_GPU::release() {}

#else

namespace cv { namespace gpu { namespace device
{
    namespace vibe
    {
        void loadConstants(int nbSamples, int reqMatches, int radius, int subsamplingFactor);

        void init_gpu(PtrStepSzb frame, int cn, PtrStepSzb samples, PtrStepSz<unsigned int> randStates, cudaStream_t stream);

        void update_gpu(PtrStepSzb frame, int cn, PtrStepSzb fgmask, PtrStepSzb samples, PtrStepSz<unsigned int> randStates, cudaStream_t stream);
    }
}}}

namespace
{
    const int defaultNbSamples = 20;
    const int defaultReqMatches = 2;
    const int defaultRadius = 20;
    const int defaultSubsamplingFactor = 16;
}

cv::gpu::VIBE_GPU::VIBE_GPU(unsigned long rngSeed) :
    frameSize_(0, 0), rngSeed_(rngSeed)
{
    nbSamples = defaultNbSamples;
    reqMatches = defaultReqMatches;
    radius = defaultRadius;
    subsamplingFactor = defaultSubsamplingFactor;
}

void cv::gpu::VIBE_GPU::initialize(const GpuMat& firstFrame, Stream& s)
{
    using namespace cv::gpu::device::vibe;

    CV_Assert(firstFrame.type() == CV_8UC1 || firstFrame.type() == CV_8UC3 || firstFrame.type() == CV_8UC4);

    cudaStream_t stream = StreamAccessor::getStream(s);

    loadConstants(nbSamples, reqMatches, radius, subsamplingFactor);

    frameSize_ = firstFrame.size();

    if (randStates_.size() != frameSize_)
    {
        cv::RNG rng(rngSeed_);
        cv::Mat h_randStates(frameSize_, CV_8UC4);
        rng.fill(h_randStates, cv::RNG::UNIFORM, 0, 255);
        randStates_.upload(h_randStates);
    }

    int ch = firstFrame.channels();
    int sample_ch = ch == 1 ? 1 : 4;

    samples_.create(nbSamples * frameSize_.height, frameSize_.width, CV_8UC(sample_ch));

    init_gpu(firstFrame, ch, samples_, randStates_, stream);
}

void cv::gpu::VIBE_GPU::operator()(const GpuMat& frame, GpuMat& fgmask, Stream& s)
{
    using namespace cv::gpu::device::vibe;

    CV_Assert(frame.depth() == CV_8U);

    int ch = frame.channels();
    int sample_ch = ch == 1 ? 1 : 4;

    if (frame.size() != frameSize_ || sample_ch != samples_.channels())
        initialize(frame);

    fgmask.create(frameSize_, CV_8UC1);

    update_gpu(frame, ch, fgmask, samples_, randStates_, StreamAccessor::getStream(s));
}

void cv::gpu::VIBE_GPU::release()
{
    frameSize_ = Size(0, 0);

    randStates_.release();

    samples_.release();
}

#endif
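For reference, a minimal host-side sketch of how the VIBE_GPU interface deleted above is driven; it is illustrative only and not part of this commit. The header path is an assumption (after this commit the class is expected to live in the gpunonfree module); the call pattern follows the initialize()/operator()/release() signatures shown above.

#include "opencv2/gpunonfree/gpunonfree.hpp" // assumed new location after this commit

// Push one uploaded frame through the ViBE model. Call with firstFrame = true
// once so initialize() can build the per-pixel sample model.
void segmentFrame(cv::gpu::VIBE_GPU& vibe, const cv::gpu::GpuMat& d_frame,
                  cv::gpu::GpuMat& d_fgmask, bool firstFrame)
{
    if (firstFrame)
        vibe.initialize(d_frame);                      // seeds the sample and RNG state buffers

    vibe(d_frame, d_fgmask, cv::gpu::Stream::Null());  // d_fgmask: 255 = foreground, 0 = background
}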
@@ -1,258 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#if !defined CUDA_DISABLER

#include "opencv2/gpu/device/common.hpp"

namespace cv { namespace gpu { namespace device
{
    namespace vibe
    {
        __constant__ int c_nbSamples;
        __constant__ int c_reqMatches;
        __constant__ int c_radius;
        __constant__ int c_subsamplingFactor;

        void loadConstants(int nbSamples, int reqMatches, int radius, int subsamplingFactor)
        {
            cudaSafeCall( cudaMemcpyToSymbol(c_nbSamples, &nbSamples, sizeof(int)) );
            cudaSafeCall( cudaMemcpyToSymbol(c_reqMatches, &reqMatches, sizeof(int)) );
            cudaSafeCall( cudaMemcpyToSymbol(c_radius, &radius, sizeof(int)) );
            cudaSafeCall( cudaMemcpyToSymbol(c_subsamplingFactor, &subsamplingFactor, sizeof(int)) );
        }

        __device__ __forceinline__ uint nextRand(uint& state)
        {
            const unsigned int CV_RNG_COEFF = 4164903690U;
            state = state * CV_RNG_COEFF + (state >> 16);
            return state;
        }

        __constant__ int c_xoff[9] = {-1,  0,  1, -1, 1, -1, 0, 1, 0};
        __constant__ int c_yoff[9] = {-1, -1, -1,  0, 0,  1, 1, 1, 0};

        __device__ __forceinline__ int2 chooseRandomNeighbor(int x, int y, uint& randState, int count = 8)
        {
            int idx = nextRand(randState) % count;

            return make_int2(x + c_xoff[idx], y + c_yoff[idx]);
        }

        __device__ __forceinline__ uchar cvt(uchar val)
        {
            return val;
        }
        __device__ __forceinline__ uchar4 cvt(const uchar3& val)
        {
            return make_uchar4(val.x, val.y, val.z, 0);
        }
        __device__ __forceinline__ uchar4 cvt(const uchar4& val)
        {
            return val;
        }

        template <typename SrcT, typename SampleT>
        __global__ void init(const PtrStepSz<SrcT> frame, PtrStep<SampleT> samples, PtrStep<uint> randStates)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x >= frame.cols || y >= frame.rows)
                return;

            uint localState = randStates(y, x);

            for (int k = 0; k < c_nbSamples; ++k)
            {
                int2 np = chooseRandomNeighbor(x, y, localState, 9);

                np.x = ::max(0, ::min(np.x, frame.cols - 1));
                np.y = ::max(0, ::min(np.y, frame.rows - 1));

                SrcT pix = frame(np.y, np.x);

                samples(k * frame.rows + y, x) = cvt(pix);
            }

            randStates(y, x) = localState;
        }

        template <typename SrcT, typename SampleT>
        void init_caller(PtrStepSzb frame, PtrStepSzb samples, PtrStepSz<uint> randStates, cudaStream_t stream)
        {
            dim3 block(32, 8);
            dim3 grid(divUp(frame.cols, block.x), divUp(frame.rows, block.y));

            cudaSafeCall( cudaFuncSetCacheConfig(init<SrcT, SampleT>, cudaFuncCachePreferL1) );

            init<SrcT, SampleT><<<grid, block, 0, stream>>>((PtrStepSz<SrcT>) frame, (PtrStepSz<SampleT>) samples, randStates);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        void init_gpu(PtrStepSzb frame, int cn, PtrStepSzb samples, PtrStepSz<uint> randStates, cudaStream_t stream)
        {
            typedef void (*func_t)(PtrStepSzb frame, PtrStepSzb samples, PtrStepSz<uint> randStates, cudaStream_t stream);
            static const func_t funcs[] =
            {
                0, init_caller<uchar, uchar>, 0, init_caller<uchar3, uchar4>, init_caller<uchar4, uchar4>
            };

            funcs[cn](frame, samples, randStates, stream);
        }

        __device__ __forceinline__ int calcDist(uchar a, uchar b)
        {
            return ::abs(a - b);
        }
        __device__ __forceinline__ int calcDist(const uchar3& a, const uchar4& b)
        {
            return (::abs(a.x - b.x) + ::abs(a.y - b.y) + ::abs(a.z - b.z)) / 3;
        }
        __device__ __forceinline__ int calcDist(const uchar4& a, const uchar4& b)
        {
            return (::abs(a.x - b.x) + ::abs(a.y - b.y) + ::abs(a.z - b.z)) / 3;
        }

        template <typename SrcT, typename SampleT>
        __global__ void update(const PtrStepSz<SrcT> frame, PtrStepb fgmask, PtrStep<SampleT> samples, PtrStep<uint> randStates)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x >= frame.cols || y >= frame.rows)
                return;

            uint localState = randStates(y, x);

            SrcT imgPix = frame(y, x);

            // comparison with the model

            int count = 0;
            for (int k = 0; (count < c_reqMatches) && (k < c_nbSamples); ++k)
            {
                SampleT samplePix = samples(k * frame.rows + y, x);

                int distance = calcDist(imgPix, samplePix);

                if (distance < c_radius)
                    ++count;
            }

            // pixel classification according to reqMatches

            fgmask(y, x) = (uchar) (-(count < c_reqMatches));

            if (count >= c_reqMatches)
            {
                // the pixel belongs to the background

                // gets a random number between 0 and subsamplingFactor-1
                int randomNumber = nextRand(localState) % c_subsamplingFactor;

                // update of the current pixel model
                if (randomNumber == 0)
                {
                    // random subsampling

                    int k = nextRand(localState) % c_nbSamples;

                    samples(k * frame.rows + y, x) = cvt(imgPix);
                }

                // update of a neighboring pixel model
                randomNumber = nextRand(localState) % c_subsamplingFactor;

                if (randomNumber == 0)
                {
                    // random subsampling

                    // chooses a neighboring pixel randomly
                    int2 np = chooseRandomNeighbor(x, y, localState);

                    np.x = ::max(0, ::min(np.x, frame.cols - 1));
                    np.y = ::max(0, ::min(np.y, frame.rows - 1));

                    // chooses the value to be replaced randomly
                    int k = nextRand(localState) % c_nbSamples;

                    samples(k * frame.rows + np.y, np.x) = cvt(imgPix);
                }
            }

            randStates(y, x) = localState;
        }

        template <typename SrcT, typename SampleT>
        void update_caller(PtrStepSzb frame, PtrStepSzb fgmask, PtrStepSzb samples, PtrStepSz<uint> randStates, cudaStream_t stream)
        {
            dim3 block(32, 8);
            dim3 grid(divUp(frame.cols, block.x), divUp(frame.rows, block.y));

            cudaSafeCall( cudaFuncSetCacheConfig(update<SrcT, SampleT>, cudaFuncCachePreferL1) );

            update<SrcT, SampleT><<<grid, block, 0, stream>>>((PtrStepSz<SrcT>) frame, fgmask, (PtrStepSz<SampleT>) samples, randStates);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        void update_gpu(PtrStepSzb frame, int cn, PtrStepSzb fgmask, PtrStepSzb samples, PtrStepSz<uint> randStates, cudaStream_t stream)
        {
            typedef void (*func_t)(PtrStepSzb frame, PtrStepSzb fgmask, PtrStepSzb samples, PtrStepSz<uint> randStates, cudaStream_t stream);
            static const func_t funcs[] =
            {
                0, update_caller<uchar, uchar>, 0, update_caller<uchar3, uchar4>, update_caller<uchar4, uchar4>
            };

            funcs[cn](frame, fgmask, samples, randStates, stream);
        }
    }
}}}

#endif /* CUDA_DISABLER */
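The update kernel above classifies each pixel with a branch-free expression, fgmask(y, x) = (uchar)(-(count < c_reqMatches)). A small stand-alone sketch of that trick (plain C++, added for illustration and not part of the original file):

#include <cassert>

// -(count < reqMatches) is 0 when enough model samples matched and -1 otherwise;
// converting -1 to unsigned char wraps to 255, the foreground label.
unsigned char classify(int count, int reqMatches)
{
    return static_cast<unsigned char>(-(count < reqMatches));
}

int main()
{
    assert(classify(1, 2) == 255); // fewer than reqMatches matches -> foreground
    assert(classify(2, 2) == 0);   // enough matches -> background
    return 0;
}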
@@ -1,934 +0,0 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
// Copyright (c) 2010, Paul Furgale, Chi Hay Tong
|
||||
//
|
||||
// The original code was written by Paul Furgale and Chi Hay Tong
|
||||
// and later optimized and prepared for integration into OpenCV by Itseez.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#if !defined CUDA_DISABLER
|
||||
|
||||
#include "opencv2/gpu/device/common.hpp"
|
||||
#include "opencv2/gpu/device/limits.hpp"
|
||||
#include "opencv2/gpu/device/saturate_cast.hpp"
|
||||
#include "opencv2/gpu/device/reduce.hpp"
|
||||
#include "opencv2/gpu/device/utility.hpp"
|
||||
#include "opencv2/gpu/device/functional.hpp"
|
||||
#include "opencv2/gpu/device/filters.hpp"
|
||||
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
namespace surf
|
||||
{
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// Global parameters
|
||||
|
||||
// The maximum number of features (before subpixel interpolation) that memory is reserved for.
|
||||
__constant__ int c_max_candidates;
|
||||
// The maximum number of features that memory is reserved for.
|
||||
__constant__ int c_max_features;
|
||||
// The image size.
|
||||
__constant__ int c_img_rows;
|
||||
__constant__ int c_img_cols;
|
||||
// The number of layers.
|
||||
__constant__ int c_nOctaveLayers;
|
||||
// The hessian threshold.
|
||||
__constant__ float c_hessianThreshold;
|
||||
|
||||
// The current octave.
|
||||
__constant__ int c_octave;
|
||||
// The current layer size.
|
||||
__constant__ int c_layer_rows;
|
||||
__constant__ int c_layer_cols;
|
||||
|
||||
void loadGlobalConstants(int maxCandidates, int maxFeatures, int img_rows, int img_cols, int nOctaveLayers, float hessianThreshold)
|
||||
{
|
||||
cudaSafeCall( cudaMemcpyToSymbol(c_max_candidates, &maxCandidates, sizeof(maxCandidates)) );
|
||||
cudaSafeCall( cudaMemcpyToSymbol(c_max_features, &maxFeatures, sizeof(maxFeatures)) );
|
||||
cudaSafeCall( cudaMemcpyToSymbol(c_img_rows, &img_rows, sizeof(img_rows)) );
|
||||
cudaSafeCall( cudaMemcpyToSymbol(c_img_cols, &img_cols, sizeof(img_cols)) );
|
||||
cudaSafeCall( cudaMemcpyToSymbol(c_nOctaveLayers, &nOctaveLayers, sizeof(nOctaveLayers)) );
|
||||
cudaSafeCall( cudaMemcpyToSymbol(c_hessianThreshold, &hessianThreshold, sizeof(hessianThreshold)) );
|
||||
}
|
||||
|
||||
void loadOctaveConstants(int octave, int layer_rows, int layer_cols)
|
||||
{
|
||||
cudaSafeCall( cudaMemcpyToSymbol(c_octave, &octave, sizeof(octave)) );
|
||||
cudaSafeCall( cudaMemcpyToSymbol(c_layer_rows, &layer_rows, sizeof(layer_rows)) );
|
||||
cudaSafeCall( cudaMemcpyToSymbol(c_layer_cols, &layer_cols, sizeof(layer_cols)) );
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// Integral image texture
|
||||
|
||||
texture<unsigned char, 2, cudaReadModeElementType> imgTex(0, cudaFilterModePoint, cudaAddressModeClamp);
|
||||
texture<unsigned int, 2, cudaReadModeElementType> sumTex(0, cudaFilterModePoint, cudaAddressModeClamp);
|
||||
texture<unsigned int, 2, cudaReadModeElementType> maskSumTex(0, cudaFilterModePoint, cudaAddressModeClamp);
|
||||
|
||||
void bindImgTex(PtrStepSzb img)
|
||||
{
|
||||
bindTexture(&imgTex, img);
|
||||
}
|
||||
|
||||
size_t bindSumTex(PtrStepSz<uint> sum)
|
||||
{
|
||||
size_t offset;
|
||||
cudaChannelFormatDesc desc_sum = cudaCreateChannelDesc<uint>();
|
||||
cudaSafeCall( cudaBindTexture2D(&offset, sumTex, sum.data, desc_sum, sum.cols, sum.rows, sum.step));
|
||||
return offset / sizeof(uint);
|
||||
}
|
||||
size_t bindMaskSumTex(PtrStepSz<uint> maskSum)
|
||||
{
|
||||
size_t offset;
|
||||
cudaChannelFormatDesc desc_sum = cudaCreateChannelDesc<uint>();
|
||||
cudaSafeCall( cudaBindTexture2D(&offset, maskSumTex, maskSum.data, desc_sum, maskSum.cols, maskSum.rows, maskSum.step));
|
||||
return offset / sizeof(uint);
|
||||
}
|
||||
|
||||
template <int N> __device__ float icvCalcHaarPatternSum(const float src[][5], int oldSize, int newSize, int y, int x)
|
||||
{
|
||||
#if __CUDA_ARCH__ && __CUDA_ARCH__ >= 200
|
||||
typedef double real_t;
|
||||
#else
|
||||
typedef float real_t;
|
||||
#endif
|
||||
|
||||
float ratio = (float)newSize / oldSize;
|
||||
|
||||
real_t d = 0;
|
||||
|
||||
#pragma unroll
|
||||
for (int k = 0; k < N; ++k)
|
||||
{
|
||||
int dx1 = __float2int_rn(ratio * src[k][0]);
|
||||
int dy1 = __float2int_rn(ratio * src[k][1]);
|
||||
int dx2 = __float2int_rn(ratio * src[k][2]);
|
||||
int dy2 = __float2int_rn(ratio * src[k][3]);
|
||||
|
||||
real_t t = 0;
|
||||
t += tex2D(sumTex, x + dx1, y + dy1);
|
||||
t -= tex2D(sumTex, x + dx1, y + dy2);
|
||||
t -= tex2D(sumTex, x + dx2, y + dy1);
|
||||
t += tex2D(sumTex, x + dx2, y + dy2);
|
||||
|
||||
d += t * src[k][4] / ((dx2 - dx1) * (dy2 - dy1));
|
||||
}
|
||||
|
||||
return (float)d;
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// Hessian
|
||||
|
||||
__constant__ float c_DX [3][5] = { {0, 2, 3, 7, 1}, {3, 2, 6, 7, -2}, {6, 2, 9, 7, 1} };
|
||||
__constant__ float c_DY [3][5] = { {2, 0, 7, 3, 1}, {2, 3, 7, 6, -2}, {2, 6, 7, 9, 1} };
|
||||
__constant__ float c_DXY[4][5] = { {1, 1, 4, 4, 1}, {5, 1, 8, 4, -1}, {1, 5, 4, 8, -1}, {5, 5, 8, 8, 1} };
|
||||
|
||||
__host__ __device__ __forceinline__ int calcSize(int octave, int layer)
|
||||
{
|
||||
/* Wavelet size at first layer of first octave. */
|
||||
const int HAAR_SIZE0 = 9;
|
||||
|
||||
/* Wavelet size increment between layers. This should be an even number,
|
||||
such that the wavelet sizes in an octave are either all even or all odd.
|
||||
This ensures that when looking for the neighbours of a sample, the layers
|
||||
above and below are aligned correctly. */
|
||||
const int HAAR_SIZE_INC = 6;
|
||||
|
||||
return (HAAR_SIZE0 + HAAR_SIZE_INC * layer) << octave;
|
||||
}
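// Worked example (added for clarity; not in the original file): with
// HAAR_SIZE0 = 9 and HAAR_SIZE_INC = 6, calcSize() yields 9, 15, 21, 27 for
// layers 0..3 at octave 0 and 18, 30, 42, 54 at octave 1, so every layer in
// an octave keeps the same parity, as the comment above requires.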
|
||||
|
||||
__global__ void icvCalcLayerDetAndTrace(PtrStepf det, PtrStepf trace)
|
||||
{
|
||||
// Determine the indices
|
||||
const int gridDim_y = gridDim.y / (c_nOctaveLayers + 2);
|
||||
const int blockIdx_y = blockIdx.y % gridDim_y;
|
||||
const int blockIdx_z = blockIdx.y / gridDim_y;
|
||||
|
||||
const int j = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
const int i = threadIdx.y + blockIdx_y * blockDim.y;
|
||||
const int layer = blockIdx_z;
|
||||
|
||||
const int size = calcSize(c_octave, layer);
|
||||
|
||||
const int samples_i = 1 + ((c_img_rows - size) >> c_octave);
|
||||
const int samples_j = 1 + ((c_img_cols - size) >> c_octave);
|
||||
|
||||
// Ignore pixels where some of the kernel is outside the image
|
||||
const int margin = (size >> 1) >> c_octave;
|
||||
|
||||
if (size <= c_img_rows && size <= c_img_cols && i < samples_i && j < samples_j)
|
||||
{
|
||||
const float dx = icvCalcHaarPatternSum<3>(c_DX , 9, size, (i << c_octave), (j << c_octave));
|
||||
const float dy = icvCalcHaarPatternSum<3>(c_DY , 9, size, (i << c_octave), (j << c_octave));
|
||||
const float dxy = icvCalcHaarPatternSum<4>(c_DXY, 9, size, (i << c_octave), (j << c_octave));
|
||||
|
||||
det.ptr(layer * c_layer_rows + i + margin)[j + margin] = dx * dy - 0.81f * dxy * dxy;
|
||||
trace.ptr(layer * c_layer_rows + i + margin)[j + margin] = dx + dy;
|
||||
}
|
||||
}
|
||||
|
||||
void icvCalcLayerDetAndTrace_gpu(const PtrStepf& det, const PtrStepf& trace, int img_rows, int img_cols,
|
||||
int octave, int nOctaveLayers)
|
||||
{
|
||||
const int min_size = calcSize(octave, 0);
|
||||
const int max_samples_i = 1 + ((img_rows - min_size) >> octave);
|
||||
const int max_samples_j = 1 + ((img_cols - min_size) >> octave);
|
||||
|
||||
dim3 threads(16, 16);
|
||||
|
||||
dim3 grid;
|
||||
grid.x = divUp(max_samples_j, threads.x);
|
||||
grid.y = divUp(max_samples_i, threads.y) * (nOctaveLayers + 2);
|
||||
|
||||
icvCalcLayerDetAndTrace<<<grid, threads>>>(det, trace);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// NONMAX
|
||||
|
||||
__constant__ float c_DM[5] = {0, 0, 9, 9, 1};
|
||||
|
||||
struct WithMask
|
||||
{
|
||||
static __device__ bool check(int sum_i, int sum_j, int size)
|
||||
{
|
||||
float ratio = (float)size / 9.0f;
|
||||
|
||||
float d = 0;
|
||||
|
||||
int dx1 = __float2int_rn(ratio * c_DM[0]);
|
||||
int dy1 = __float2int_rn(ratio * c_DM[1]);
|
||||
int dx2 = __float2int_rn(ratio * c_DM[2]);
|
||||
int dy2 = __float2int_rn(ratio * c_DM[3]);
|
||||
|
||||
float t = 0;
|
||||
t += tex2D(maskSumTex, sum_j + dx1, sum_i + dy1);
|
||||
t -= tex2D(maskSumTex, sum_j + dx1, sum_i + dy2);
|
||||
t -= tex2D(maskSumTex, sum_j + dx2, sum_i + dy1);
|
||||
t += tex2D(maskSumTex, sum_j + dx2, sum_i + dy2);
|
||||
|
||||
d += t * c_DM[4] / ((dx2 - dx1) * (dy2 - dy1));
|
||||
|
||||
return (d >= 0.5f);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Mask>
|
||||
__global__ void icvFindMaximaInLayer(const PtrStepf det, const PtrStepf trace, int4* maxPosBuffer,
|
||||
unsigned int* maxCounter)
|
||||
{
|
||||
#if __CUDA_ARCH__ && __CUDA_ARCH__ >= 110
|
||||
|
||||
extern __shared__ float N9[];
|
||||
|
||||
// The hidx variables are the indices to the hessian buffer.
|
||||
const int gridDim_y = gridDim.y / c_nOctaveLayers;
|
||||
const int blockIdx_y = blockIdx.y % gridDim_y;
|
||||
const int blockIdx_z = blockIdx.y / gridDim_y;
|
||||
|
||||
const int layer = blockIdx_z + 1;
|
||||
|
||||
const int size = calcSize(c_octave, layer);
|
||||
|
||||
// Ignore pixels without a 3x3x3 neighbourhood in the layer above
|
||||
const int margin = ((calcSize(c_octave, layer + 1) >> 1) >> c_octave) + 1;
|
||||
|
||||
const int j = threadIdx.x + blockIdx.x * (blockDim.x - 2) + margin - 1;
|
||||
const int i = threadIdx.y + blockIdx_y * (blockDim.y - 2) + margin - 1;
|
||||
|
||||
// Is this thread within the hessian buffer?
|
||||
const int zoff = blockDim.x * blockDim.y;
|
||||
const int localLin = threadIdx.x + threadIdx.y * blockDim.x + zoff;
|
||||
N9[localLin - zoff] = det.ptr(c_layer_rows * (layer - 1) + ::min(::max(i, 0), c_img_rows - 1))[::min(::max(j, 0), c_img_cols - 1)];
|
||||
N9[localLin ] = det.ptr(c_layer_rows * (layer ) + ::min(::max(i, 0), c_img_rows - 1))[::min(::max(j, 0), c_img_cols - 1)];
|
||||
N9[localLin + zoff] = det.ptr(c_layer_rows * (layer + 1) + ::min(::max(i, 0), c_img_rows - 1))[::min(::max(j, 0), c_img_cols - 1)];
|
||||
__syncthreads();
|
||||
|
||||
if (i < c_layer_rows - margin && j < c_layer_cols - margin && threadIdx.x > 0 && threadIdx.x < blockDim.x - 1 && threadIdx.y > 0 && threadIdx.y < blockDim.y - 1)
|
||||
{
|
||||
float val0 = N9[localLin];
|
||||
|
||||
if (val0 > c_hessianThreshold)
|
||||
{
|
||||
// Coordinates for the start of the wavelet in the sum image. There
|
||||
// is some integer division involved, so don't try to simplify this
|
||||
// (cancel out sampleStep) without checking the result is the same
|
||||
const int sum_i = (i - ((size >> 1) >> c_octave)) << c_octave;
|
||||
const int sum_j = (j - ((size >> 1) >> c_octave)) << c_octave;
|
||||
|
||||
if (Mask::check(sum_i, sum_j, size))
|
||||
{
|
||||
// Check to see if we have a max (in its 26 neighbours)
|
||||
const bool condmax = val0 > N9[localLin - 1 - blockDim.x - zoff]
|
||||
&& val0 > N9[localLin - blockDim.x - zoff]
|
||||
&& val0 > N9[localLin + 1 - blockDim.x - zoff]
|
||||
&& val0 > N9[localLin - 1 - zoff]
|
||||
&& val0 > N9[localLin - zoff]
|
||||
&& val0 > N9[localLin + 1 - zoff]
|
||||
&& val0 > N9[localLin - 1 + blockDim.x - zoff]
|
||||
&& val0 > N9[localLin + blockDim.x - zoff]
|
||||
&& val0 > N9[localLin + 1 + blockDim.x - zoff]
|
||||
|
||||
&& val0 > N9[localLin - 1 - blockDim.x]
|
||||
&& val0 > N9[localLin - blockDim.x]
|
||||
&& val0 > N9[localLin + 1 - blockDim.x]
|
||||
&& val0 > N9[localLin - 1 ]
|
||||
&& val0 > N9[localLin + 1 ]
|
||||
&& val0 > N9[localLin - 1 + blockDim.x]
|
||||
&& val0 > N9[localLin + blockDim.x]
|
||||
&& val0 > N9[localLin + 1 + blockDim.x]
|
||||
|
||||
&& val0 > N9[localLin - 1 - blockDim.x + zoff]
|
||||
&& val0 > N9[localLin - blockDim.x + zoff]
|
||||
&& val0 > N9[localLin + 1 - blockDim.x + zoff]
|
||||
&& val0 > N9[localLin - 1 + zoff]
|
||||
&& val0 > N9[localLin + zoff]
|
||||
&& val0 > N9[localLin + 1 + zoff]
|
||||
&& val0 > N9[localLin - 1 + blockDim.x + zoff]
|
||||
&& val0 > N9[localLin + blockDim.x + zoff]
|
||||
&& val0 > N9[localLin + 1 + blockDim.x + zoff]
|
||||
;
|
||||
|
||||
if(condmax)
|
||||
{
|
||||
unsigned int ind = atomicInc(maxCounter,(unsigned int) -1);
|
||||
|
||||
if (ind < c_max_candidates)
|
||||
{
|
||||
const int laplacian = (int) copysignf(1.0f, trace.ptr(layer * c_layer_rows + i)[j]);
|
||||
|
||||
maxPosBuffer[ind] = make_int4(j, i, layer, laplacian);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
void icvFindMaximaInLayer_gpu(const PtrStepf& det, const PtrStepf& trace, int4* maxPosBuffer, unsigned int* maxCounter,
|
||||
int img_rows, int img_cols, int octave, bool use_mask, int nOctaveLayers)
|
||||
{
|
||||
const int layer_rows = img_rows >> octave;
|
||||
const int layer_cols = img_cols >> octave;
|
||||
|
||||
const int min_margin = ((calcSize(octave, 2) >> 1) >> octave) + 1;
|
||||
|
||||
dim3 threads(16, 16);
|
||||
|
||||
dim3 grid;
|
||||
grid.x = divUp(layer_cols - 2 * min_margin, threads.x - 2);
|
||||
grid.y = divUp(layer_rows - 2 * min_margin, threads.y - 2) * nOctaveLayers;
|
||||
|
||||
const size_t smem_size = threads.x * threads.y * 3 * sizeof(float);
|
||||
|
||||
if (use_mask)
|
||||
icvFindMaximaInLayer<WithMask><<<grid, threads, smem_size>>>(det, trace, maxPosBuffer, maxCounter);
|
||||
else
|
||||
icvFindMaximaInLayer<WithOutMask><<<grid, threads, smem_size>>>(det, trace, maxPosBuffer, maxCounter);
|
||||
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// INTERPOLATION
|
||||
|
||||
__global__ void icvInterpolateKeypoint(const PtrStepf det, const int4* maxPosBuffer,
|
||||
float* featureX, float* featureY, int* featureLaplacian, int* featureOctave, float* featureSize, float* featureHessian,
|
||||
unsigned int* featureCounter)
|
||||
{
|
||||
#if __CUDA_ARCH__ && __CUDA_ARCH__ >= 110
|
||||
|
||||
const int4 maxPos = maxPosBuffer[blockIdx.x];
|
||||
|
||||
const int j = maxPos.x - 1 + threadIdx.x;
|
||||
const int i = maxPos.y - 1 + threadIdx.y;
|
||||
const int layer = maxPos.z - 1 + threadIdx.z;
|
||||
|
||||
__shared__ float N9[3][3][3];
|
||||
|
||||
N9[threadIdx.z][threadIdx.y][threadIdx.x] = det.ptr(c_layer_rows * layer + i)[j];
|
||||
__syncthreads();
|
||||
|
||||
if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0)
|
||||
{
|
||||
__shared__ float dD[3];
|
||||
|
||||
//dx
|
||||
dD[0] = -0.5f * (N9[1][1][2] - N9[1][1][0]);
|
||||
//dy
|
||||
dD[1] = -0.5f * (N9[1][2][1] - N9[1][0][1]);
|
||||
//ds
|
||||
dD[2] = -0.5f * (N9[2][1][1] - N9[0][1][1]);
|
||||
|
||||
__shared__ float H[3][3];
|
||||
|
||||
//dxx
|
||||
H[0][0] = N9[1][1][0] - 2.0f * N9[1][1][1] + N9[1][1][2];
|
||||
//dxy
|
||||
H[0][1]= 0.25f * (N9[1][2][2] - N9[1][2][0] - N9[1][0][2] + N9[1][0][0]);
|
||||
//dxs
|
||||
H[0][2]= 0.25f * (N9[2][1][2] - N9[2][1][0] - N9[0][1][2] + N9[0][1][0]);
|
||||
//dyx = dxy
|
||||
H[1][0] = H[0][1];
|
||||
//dyy
|
||||
H[1][1] = N9[1][0][1] - 2.0f * N9[1][1][1] + N9[1][2][1];
|
||||
//dys
|
||||
H[1][2]= 0.25f * (N9[2][2][1] - N9[2][0][1] - N9[0][2][1] + N9[0][0][1]);
|
||||
//dsx = dxs
|
||||
H[2][0] = H[0][2];
|
||||
//dsy = dys
|
||||
H[2][1] = H[1][2];
|
||||
//dss
|
||||
H[2][2] = N9[0][1][1] - 2.0f * N9[1][1][1] + N9[2][1][1];
|
||||
|
||||
__shared__ float x[3];
|
||||
|
||||
if (solve3x3(H, dD, x))
|
||||
{
|
||||
if (::fabs(x[0]) <= 1.f && ::fabs(x[1]) <= 1.f && ::fabs(x[2]) <= 1.f)
|
||||
{
|
||||
// if the step is within the interpolation region, perform it
|
||||
|
||||
const int size = calcSize(c_octave, maxPos.z);
|
||||
|
||||
const int sum_i = (maxPos.y - ((size >> 1) >> c_octave)) << c_octave;
|
||||
const int sum_j = (maxPos.x - ((size >> 1) >> c_octave)) << c_octave;
|
||||
|
||||
const float center_i = sum_i + (float)(size - 1) / 2;
|
||||
const float center_j = sum_j + (float)(size - 1) / 2;
|
||||
|
||||
const float px = center_j + x[0] * (1 << c_octave);
|
||||
const float py = center_i + x[1] * (1 << c_octave);
|
||||
|
||||
const int ds = size - calcSize(c_octave, maxPos.z - 1);
|
||||
const float psize = roundf(size + x[2] * ds);
|
||||
|
||||
/* The sampling intervals and wavelet sized for selecting an orientation
|
||||
and building the keypoint descriptor are defined relative to 's' */
|
||||
const float s = psize * 1.2f / 9.0f;
|
||||
|
||||
/* To find the dominant orientation, the gradients in x and y are
|
||||
sampled in a circle of radius 6s using wavelets of size 4s.
|
||||
We ensure the gradient wavelet size is even to ensure the
|
||||
wavelet pattern is balanced and symmetric around its center */
|
||||
const int grad_wav_size = 2 * __float2int_rn(2.0f * s);
|
||||
|
||||
// check when grad_wav_size is too big
|
||||
if ((c_img_rows + 1) >= grad_wav_size && (c_img_cols + 1) >= grad_wav_size)
|
||||
{
|
||||
// Get a new feature index.
|
||||
unsigned int ind = atomicInc(featureCounter, (unsigned int)-1);
|
||||
|
||||
if (ind < c_max_features)
|
||||
{
|
||||
featureX[ind] = px;
|
||||
featureY[ind] = py;
|
||||
featureLaplacian[ind] = maxPos.w;
|
||||
featureOctave[ind] = c_octave;
|
||||
featureSize[ind] = psize;
|
||||
featureHessian[ind] = N9[1][1][1];
|
||||
}
|
||||
} // grad_wav_size check
|
||||
} // If the subpixel interpolation worked
|
||||
}
|
||||
} // If this is thread 0.
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
void icvInterpolateKeypoint_gpu(const PtrStepf& det, const int4* maxPosBuffer, unsigned int maxCounter,
|
||||
float* featureX, float* featureY, int* featureLaplacian, int* featureOctave, float* featureSize, float* featureHessian,
|
||||
unsigned int* featureCounter)
|
||||
{
|
||||
dim3 threads;
|
||||
threads.x = 3;
|
||||
threads.y = 3;
|
||||
threads.z = 3;
|
||||
|
||||
dim3 grid;
|
||||
grid.x = maxCounter;
|
||||
|
||||
icvInterpolateKeypoint<<<grid, threads>>>(det, maxPosBuffer, featureX, featureY, featureLaplacian, featureOctave, featureSize, featureHessian, featureCounter);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// Orientation
|
||||
|
||||
#define ORI_SEARCH_INC 5
|
||||
#define ORI_WIN 60
|
||||
#define ORI_SAMPLES 113
|
||||
|
||||
__constant__ float c_aptX[ORI_SAMPLES] = {-6, -5, -5, -5, -5, -5, -5, -5, -4, -4, -4, -4, -4, -4, -4, -4, -4, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 6};
|
||||
__constant__ float c_aptY[ORI_SAMPLES] = {0, -3, -2, -1, 0, 1, 2, 3, -4, -3, -2, -1, 0, 1, 2, 3, 4, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -4, -3, -2, -1, 0, 1, 2, 3, 4, -3, -2, -1, 0, 1, 2, 3, 0};
|
||||
__constant__ float c_aptW[ORI_SAMPLES] = {0.001455130288377404f, 0.001707611023448408f, 0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f, 0.003238451667129993f, 0.002547456417232752f, 0.001707611023448408f, 0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f, 0.00665318313986063f, 0.00720730796456337f, 0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f, 0.002003900473937392f, 0.001707611023448408f, 0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f, 0.01164754293859005f, 0.01261763460934162f, 0.01164754293859005f, 0.009162282571196556f, 0.006141661666333675f, 0.0035081731621176f, 0.001707611023448408f, 0.002547456417232752f, 0.005233579315245152f, 0.009162282571196556f, 0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f, 0.01366852037608624f, 0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.003238451667129993f, 0.00665318313986063f, 0.01164754293859005f, 0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f, 0.01737609319388866f, 0.01164754293859005f, 0.00665318313986063f, 0.003238451667129993f, 0.001455130288377404f, 0.0035081731621176f, 0.00720730796456337f, 0.01261763460934162f, 0.0188232995569706f, 0.02392910048365593f, 0.02592208795249462f, 0.02392910048365593f, 0.0188232995569706f, 0.01261763460934162f, 0.00720730796456337f, 0.0035081731621176f, 0.001455130288377404f, 0.003238451667129993f, 0.00665318313986063f, 0.01164754293859005f, 0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f, 0.01737609319388866f, 0.01164754293859005f, 0.00665318313986063f, 0.003238451667129993f, 0.002547456417232752f, 0.005233579315245152f, 0.009162282571196556f, 0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f, 0.01366852037608624f, 0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.001707611023448408f, 0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f, 0.01164754293859005f, 0.01261763460934162f, 0.01164754293859005f, 0.009162282571196556f, 0.006141661666333675f, 0.0035081731621176f, 0.001707611023448408f, 0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f, 0.00665318313986063f, 0.00720730796456337f, 0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f, 0.002003900473937392f, 0.001707611023448408f, 0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f, 0.003238451667129993f, 0.002547456417232752f, 0.001707611023448408f, 0.001455130288377404f};
|
||||
|
||||
__constant__ float c_NX[2][5] = {{0, 0, 2, 4, -1}, {2, 0, 4, 4, 1}};
|
||||
__constant__ float c_NY[2][5] = {{0, 0, 4, 2, 1}, {0, 2, 4, 4, -1}};
|
||||
|
||||
__global__ void icvCalcOrientation(const float* featureX, const float* featureY, const float* featureSize, float* featureDir)
|
||||
{
|
||||
__shared__ float s_X[128];
|
||||
__shared__ float s_Y[128];
|
||||
__shared__ float s_angle[128];
|
||||
|
||||
__shared__ float s_sumx[32 * 4];
|
||||
__shared__ float s_sumy[32 * 4];
|
||||
|
||||
/* The sampling intervals and wavelet sized for selecting an orientation
|
||||
and building the keypoint descriptor are defined relative to 's' */
|
||||
const float s = featureSize[blockIdx.x] * 1.2f / 9.0f;
|
||||
|
||||
/* To find the dominant orientation, the gradients in x and y are
|
||||
sampled in a circle of radius 6s using wavelets of size 4s.
|
||||
We ensure the gradient wavelet size is even to ensure the
|
||||
wavelet pattern is balanced and symmetric around its center */
|
||||
const int grad_wav_size = 2 * __float2int_rn(2.0f * s);
|
||||
|
||||
// check when grad_wav_size is too big
|
||||
if ((c_img_rows + 1) < grad_wav_size || (c_img_cols + 1) < grad_wav_size)
|
||||
return;
|
||||
|
||||
// Calc X, Y, angle and store it to shared memory
|
||||
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
|
||||
|
||||
float X = 0.0f, Y = 0.0f, angle = 0.0f;
|
||||
|
||||
if (tid < ORI_SAMPLES)
|
||||
{
|
||||
const float margin = (float)(grad_wav_size - 1) / 2.0f;
|
||||
const int x = __float2int_rn(featureX[blockIdx.x] + c_aptX[tid] * s - margin);
|
||||
const int y = __float2int_rn(featureY[blockIdx.x] + c_aptY[tid] * s - margin);
|
||||
|
||||
if (y >= 0 && y < (c_img_rows + 1) - grad_wav_size &&
|
||||
x >= 0 && x < (c_img_cols + 1) - grad_wav_size)
|
||||
{
|
||||
X = c_aptW[tid] * icvCalcHaarPatternSum<2>(c_NX, 4, grad_wav_size, y, x);
|
||||
Y = c_aptW[tid] * icvCalcHaarPatternSum<2>(c_NY, 4, grad_wav_size, y, x);
|
||||
|
||||
angle = atan2f(Y, X);
|
||||
if (angle < 0)
|
||||
angle += 2.0f * CV_PI_F;
|
||||
angle *= 180.0f / CV_PI_F;
|
||||
}
|
||||
}
|
||||
s_X[tid] = X;
|
||||
s_Y[tid] = Y;
|
||||
s_angle[tid] = angle;
|
||||
__syncthreads();
|
||||
|
||||
float bestx = 0, besty = 0, best_mod = 0;
|
||||
|
||||
#if __CUDA_ARCH__ >= 200
|
||||
#pragma unroll
|
||||
#endif
|
||||
for (int i = 0; i < 18; ++i)
|
||||
{
|
||||
const int dir = (i * 4 + threadIdx.y) * ORI_SEARCH_INC;
|
||||
|
||||
float sumx = 0.0f, sumy = 0.0f;
|
||||
int d = ::abs(__float2int_rn(s_angle[threadIdx.x]) - dir);
|
||||
if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
|
||||
{
|
||||
sumx = s_X[threadIdx.x];
|
||||
sumy = s_Y[threadIdx.x];
|
||||
}
|
||||
d = ::abs(__float2int_rn(s_angle[threadIdx.x + 32]) - dir);
|
||||
if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
|
||||
{
|
||||
sumx += s_X[threadIdx.x + 32];
|
||||
sumy += s_Y[threadIdx.x + 32];
|
||||
}
|
||||
d = ::abs(__float2int_rn(s_angle[threadIdx.x + 64]) - dir);
|
||||
if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
|
||||
{
|
||||
sumx += s_X[threadIdx.x + 64];
|
||||
sumy += s_Y[threadIdx.x + 64];
|
||||
}
|
||||
d = ::abs(__float2int_rn(s_angle[threadIdx.x + 96]) - dir);
|
||||
if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
|
||||
{
|
||||
sumx += s_X[threadIdx.x + 96];
|
||||
sumy += s_Y[threadIdx.x + 96];
|
||||
}
|
||||
|
||||
plus<float> op;
|
||||
device::reduce<32>(smem_tuple(s_sumx + threadIdx.y * 32, s_sumy + threadIdx.y * 32),
|
||||
thrust::tie(sumx, sumy), threadIdx.x, thrust::make_tuple(op, op));
|
||||
|
||||
const float temp_mod = sumx * sumx + sumy * sumy;
|
||||
if (temp_mod > best_mod)
|
||||
{
|
||||
best_mod = temp_mod;
|
||||
bestx = sumx;
|
||||
besty = sumy;
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
if (threadIdx.x == 0)
|
||||
{
|
||||
s_X[threadIdx.y] = bestx;
|
||||
s_Y[threadIdx.y] = besty;
|
||||
s_angle[threadIdx.y] = best_mod;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
if (threadIdx.x == 0 && threadIdx.y == 0)
|
||||
{
|
||||
int bestIdx = 0;
|
||||
|
||||
if (s_angle[1] > s_angle[bestIdx])
|
||||
bestIdx = 1;
|
||||
if (s_angle[2] > s_angle[bestIdx])
|
||||
bestIdx = 2;
|
||||
if (s_angle[3] > s_angle[bestIdx])
|
||||
bestIdx = 3;
|
||||
|
||||
float kp_dir = atan2f(s_Y[bestIdx], s_X[bestIdx]);
|
||||
if (kp_dir < 0)
|
||||
kp_dir += 2.0f * CV_PI_F;
|
||||
kp_dir *= 180.0f / CV_PI_F;
|
||||
|
||||
kp_dir = 360.0f - kp_dir;
|
||||
if (::fabsf(kp_dir - 360.f) < numeric_limits<float>::epsilon())
|
||||
kp_dir = 0.f;
|
||||
|
||||
featureDir[blockIdx.x] = kp_dir;
|
||||
}
|
||||
}
|
||||
|
||||
#undef ORI_SEARCH_INC
|
||||
#undef ORI_WIN
|
||||
#undef ORI_SAMPLES
|
||||
|
||||
void icvCalcOrientation_gpu(const float* featureX, const float* featureY, const float* featureSize, float* featureDir, int nFeatures)
|
||||
{
|
||||
dim3 threads;
|
||||
threads.x = 32;
|
||||
threads.y = 4;
|
||||
|
||||
dim3 grid;
|
||||
grid.x = nFeatures;
|
||||
|
||||
icvCalcOrientation<<<grid, threads>>>(featureX, featureY, featureSize, featureDir);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// Descriptors
|
||||
|
||||
#define PATCH_SZ 20
|
||||
|
||||
__constant__ float c_DW[PATCH_SZ * PATCH_SZ] =
|
||||
{
|
||||
3.695352233989979e-006f, 8.444558261544444e-006f, 1.760426494001877e-005f, 3.34794785885606e-005f, 5.808438800158911e-005f, 9.193058212986216e-005f, 0.0001327334757661447f, 0.0001748319627949968f, 0.0002100782439811155f, 0.0002302826324012131f, 0.0002302826324012131f, 0.0002100782439811155f, 0.0001748319627949968f, 0.0001327334757661447f, 9.193058212986216e-005f, 5.808438800158911e-005f, 3.34794785885606e-005f, 1.760426494001877e-005f, 8.444558261544444e-006f, 3.695352233989979e-006f,
|
||||
8.444558261544444e-006f, 1.929736572492402e-005f, 4.022897701361217e-005f, 7.650675252079964e-005f, 0.0001327334903180599f, 0.0002100782585330308f, 0.0003033203829545528f, 0.0003995231236331165f, 0.0004800673632416874f, 0.0005262381164357066f, 0.0005262381164357066f, 0.0004800673632416874f, 0.0003995231236331165f, 0.0003033203829545528f, 0.0002100782585330308f, 0.0001327334903180599f, 7.650675252079964e-005f, 4.022897701361217e-005f, 1.929736572492402e-005f, 8.444558261544444e-006f,
|
||||
1.760426494001877e-005f, 4.022897701361217e-005f, 8.386484114453197e-005f, 0.0001594926579855382f, 0.0002767078403849155f, 0.0004379475140012801f, 0.0006323281559161842f, 0.0008328808471560478f, 0.001000790391117334f, 0.001097041997127235f, 0.001097041997127235f, 0.001000790391117334f, 0.0008328808471560478f, 0.0006323281559161842f, 0.0004379475140012801f, 0.0002767078403849155f, 0.0001594926579855382f, 8.386484114453197e-005f, 4.022897701361217e-005f, 1.760426494001877e-005f,
|
||||
3.34794785885606e-005f, 7.650675252079964e-005f, 0.0001594926579855382f, 0.0003033203247468919f, 0.0005262380582280457f, 0.0008328807889483869f, 0.001202550483867526f, 0.001583957928232849f, 0.001903285388834775f, 0.002086334861814976f, 0.002086334861814976f, 0.001903285388834775f, 0.001583957928232849f, 0.001202550483867526f, 0.0008328807889483869f, 0.0005262380582280457f, 0.0003033203247468919f, 0.0001594926579855382f, 7.650675252079964e-005f, 3.34794785885606e-005f,
|
||||
5.808438800158911e-005f, 0.0001327334903180599f, 0.0002767078403849155f, 0.0005262380582280457f, 0.0009129836107604206f, 0.001444985857233405f, 0.002086335094645619f, 0.002748048631474376f, 0.00330205773934722f, 0.003619635012000799f, 0.003619635012000799f, 0.00330205773934722f, 0.002748048631474376f, 0.002086335094645619f, 0.001444985857233405f, 0.0009129836107604206f, 0.0005262380582280457f, 0.0002767078403849155f, 0.0001327334903180599f, 5.808438800158911e-005f,
|
||||
9.193058212986216e-005f, 0.0002100782585330308f, 0.0004379475140012801f, 0.0008328807889483869f, 0.001444985857233405f, 0.002286989474669099f, 0.00330205773934722f, 0.004349356517195702f, 0.00522619066759944f, 0.005728822201490402f, 0.005728822201490402f, 0.00522619066759944f, 0.004349356517195702f, 0.00330205773934722f, 0.002286989474669099f, 0.001444985857233405f, 0.0008328807889483869f, 0.0004379475140012801f, 0.0002100782585330308f, 9.193058212986216e-005f,
|
||||
0.0001327334757661447f, 0.0003033203829545528f, 0.0006323281559161842f, 0.001202550483867526f, 0.002086335094645619f, 0.00330205773934722f, 0.004767658654600382f, 0.006279794964939356f, 0.007545807864516974f, 0.008271530270576477f, 0.008271530270576477f, 0.007545807864516974f, 0.006279794964939356f, 0.004767658654600382f, 0.00330205773934722f, 0.002086335094645619f, 0.001202550483867526f, 0.0006323281559161842f, 0.0003033203829545528f, 0.0001327334757661447f,
|
||||
0.0001748319627949968f, 0.0003995231236331165f, 0.0008328808471560478f, 0.001583957928232849f, 0.002748048631474376f, 0.004349356517195702f, 0.006279794964939356f, 0.008271529339253902f, 0.009939077310264111f, 0.01089497376233339f, 0.01089497376233339f, 0.009939077310264111f, 0.008271529339253902f, 0.006279794964939356f, 0.004349356517195702f, 0.002748048631474376f, 0.001583957928232849f, 0.0008328808471560478f, 0.0003995231236331165f, 0.0001748319627949968f,
|
||||
0.0002100782439811155f, 0.0004800673632416874f, 0.001000790391117334f, 0.001903285388834775f, 0.00330205773934722f, 0.00522619066759944f, 0.007545807864516974f, 0.009939077310264111f, 0.01194280479103327f, 0.01309141051024199f, 0.01309141051024199f, 0.01194280479103327f, 0.009939077310264111f, 0.007545807864516974f, 0.00522619066759944f, 0.00330205773934722f, 0.001903285388834775f, 0.001000790391117334f, 0.0004800673632416874f, 0.0002100782439811155f,
|
||||
0.0002302826324012131f, 0.0005262381164357066f, 0.001097041997127235f, 0.002086334861814976f, 0.003619635012000799f, 0.005728822201490402f, 0.008271530270576477f, 0.01089497376233339f, 0.01309141051024199f, 0.01435048412531614f, 0.01435048412531614f, 0.01309141051024199f, 0.01089497376233339f, 0.008271530270576477f, 0.005728822201490402f, 0.003619635012000799f, 0.002086334861814976f, 0.001097041997127235f, 0.0005262381164357066f, 0.0002302826324012131f,
|
||||
0.0002302826324012131f, 0.0005262381164357066f, 0.001097041997127235f, 0.002086334861814976f, 0.003619635012000799f, 0.005728822201490402f, 0.008271530270576477f, 0.01089497376233339f, 0.01309141051024199f, 0.01435048412531614f, 0.01435048412531614f, 0.01309141051024199f, 0.01089497376233339f, 0.008271530270576477f, 0.005728822201490402f, 0.003619635012000799f, 0.002086334861814976f, 0.001097041997127235f, 0.0005262381164357066f, 0.0002302826324012131f,
|
||||
0.0002100782439811155f, 0.0004800673632416874f, 0.001000790391117334f, 0.001903285388834775f, 0.00330205773934722f, 0.00522619066759944f, 0.007545807864516974f, 0.009939077310264111f, 0.01194280479103327f, 0.01309141051024199f, 0.01309141051024199f, 0.01194280479103327f, 0.009939077310264111f, 0.007545807864516974f, 0.00522619066759944f, 0.00330205773934722f, 0.001903285388834775f, 0.001000790391117334f, 0.0004800673632416874f, 0.0002100782439811155f,
|
||||
0.0001748319627949968f, 0.0003995231236331165f, 0.0008328808471560478f, 0.001583957928232849f, 0.002748048631474376f, 0.004349356517195702f, 0.006279794964939356f, 0.008271529339253902f, 0.009939077310264111f, 0.01089497376233339f, 0.01089497376233339f, 0.009939077310264111f, 0.008271529339253902f, 0.006279794964939356f, 0.004349356517195702f, 0.002748048631474376f, 0.001583957928232849f, 0.0008328808471560478f, 0.0003995231236331165f, 0.0001748319627949968f,
|
||||
0.0001327334757661447f, 0.0003033203829545528f, 0.0006323281559161842f, 0.001202550483867526f, 0.002086335094645619f, 0.00330205773934722f, 0.004767658654600382f, 0.006279794964939356f, 0.007545807864516974f, 0.008271530270576477f, 0.008271530270576477f, 0.007545807864516974f, 0.006279794964939356f, 0.004767658654600382f, 0.00330205773934722f, 0.002086335094645619f, 0.001202550483867526f, 0.0006323281559161842f, 0.0003033203829545528f, 0.0001327334757661447f,
|
||||
9.193058212986216e-005f, 0.0002100782585330308f, 0.0004379475140012801f, 0.0008328807889483869f, 0.001444985857233405f, 0.002286989474669099f, 0.00330205773934722f, 0.004349356517195702f, 0.00522619066759944f, 0.005728822201490402f, 0.005728822201490402f, 0.00522619066759944f, 0.004349356517195702f, 0.00330205773934722f, 0.002286989474669099f, 0.001444985857233405f, 0.0008328807889483869f, 0.0004379475140012801f, 0.0002100782585330308f, 9.193058212986216e-005f,
|
||||
5.808438800158911e-005f, 0.0001327334903180599f, 0.0002767078403849155f, 0.0005262380582280457f, 0.0009129836107604206f, 0.001444985857233405f, 0.002086335094645619f, 0.002748048631474376f, 0.00330205773934722f, 0.003619635012000799f, 0.003619635012000799f, 0.00330205773934722f, 0.002748048631474376f, 0.002086335094645619f, 0.001444985857233405f, 0.0009129836107604206f, 0.0005262380582280457f, 0.0002767078403849155f, 0.0001327334903180599f, 5.808438800158911e-005f,
|
||||
3.34794785885606e-005f, 7.650675252079964e-005f, 0.0001594926579855382f, 0.0003033203247468919f, 0.0005262380582280457f, 0.0008328807889483869f, 0.001202550483867526f, 0.001583957928232849f, 0.001903285388834775f, 0.002086334861814976f, 0.002086334861814976f, 0.001903285388834775f, 0.001583957928232849f, 0.001202550483867526f, 0.0008328807889483869f, 0.0005262380582280457f, 0.0003033203247468919f, 0.0001594926579855382f, 7.650675252079964e-005f, 3.34794785885606e-005f,
|
||||
1.760426494001877e-005f, 4.022897701361217e-005f, 8.386484114453197e-005f, 0.0001594926579855382f, 0.0002767078403849155f, 0.0004379475140012801f, 0.0006323281559161842f, 0.0008328808471560478f, 0.001000790391117334f, 0.001097041997127235f, 0.001097041997127235f, 0.001000790391117334f, 0.0008328808471560478f, 0.0006323281559161842f, 0.0004379475140012801f, 0.0002767078403849155f, 0.0001594926579855382f, 8.386484114453197e-005f, 4.022897701361217e-005f, 1.760426494001877e-005f,
|
||||
8.444558261544444e-006f, 1.929736572492402e-005f, 4.022897701361217e-005f, 7.650675252079964e-005f, 0.0001327334903180599f, 0.0002100782585330308f, 0.0003033203829545528f, 0.0003995231236331165f, 0.0004800673632416874f, 0.0005262381164357066f, 0.0005262381164357066f, 0.0004800673632416874f, 0.0003995231236331165f, 0.0003033203829545528f, 0.0002100782585330308f, 0.0001327334903180599f, 7.650675252079964e-005f, 4.022897701361217e-005f, 1.929736572492402e-005f, 8.444558261544444e-006f,
|
||||
3.695352233989979e-006f, 8.444558261544444e-006f, 1.760426494001877e-005f, 3.34794785885606e-005f, 5.808438800158911e-005f, 9.193058212986216e-005f, 0.0001327334757661447f, 0.0001748319627949968f, 0.0002100782439811155f, 0.0002302826324012131f, 0.0002302826324012131f, 0.0002100782439811155f, 0.0001748319627949968f, 0.0001327334757661447f, 9.193058212986216e-005f, 5.808438800158911e-005f, 3.34794785885606e-005f, 1.760426494001877e-005f, 8.444558261544444e-006f, 3.695352233989979e-006f
|
||||
};
|
||||
|
||||
struct WinReader
|
||||
{
|
||||
typedef uchar elem_type;
|
||||
|
||||
__device__ __forceinline__ uchar operator ()(int i, int j) const
|
||||
{
|
||||
float pixel_x = centerX + (win_offset + j) * cos_dir + (win_offset + i) * sin_dir;
|
||||
float pixel_y = centerY - (win_offset + j) * sin_dir + (win_offset + i) * cos_dir;
|
||||
|
||||
return tex2D(imgTex, pixel_x, pixel_y);
|
||||
}
|
||||
|
||||
float centerX;
|
||||
float centerY;
|
||||
float win_offset;
|
||||
float cos_dir;
|
||||
float sin_dir;
|
||||
int width;
|
||||
int height;
|
||||
};
    __device__ void calc_dx_dy(const float* featureX, const float* featureY, const float* featureSize, const float* featureDir,
                               float& dx, float& dy)
    {
        __shared__ float s_PATCH[PATCH_SZ + 1][PATCH_SZ + 1];

        dx = dy = 0.0f;

        WinReader win;

        win.centerX = featureX[blockIdx.x];
        win.centerY = featureY[blockIdx.x];

        // The sampling intervals and wavelet sizes for selecting an orientation
        // and building the keypoint descriptor are defined relative to 's'
        const float s = featureSize[blockIdx.x] * 1.2f / 9.0f;

        // Extract a window of pixels around the keypoint of size 20s
        const int win_size = (int)((PATCH_SZ + 1) * s);

        win.width = win.height = win_size;

        // Nearest neighbour version (faster)
        win.win_offset = -(win_size - 1.0f) / 2.0f;

        float descriptor_dir = 360.0f - featureDir[blockIdx.x];
        if (::fabsf(descriptor_dir - 360.f) < numeric_limits<float>::epsilon())
            descriptor_dir = 0.f;
        descriptor_dir *= CV_PI_F / 180.0f;
        sincosf(descriptor_dir, &win.sin_dir, &win.cos_dir);

        const int tid = threadIdx.y * blockDim.x + threadIdx.x;

        const int xLoadInd = tid % (PATCH_SZ + 1);
        const int yLoadInd = tid / (PATCH_SZ + 1);

        if (yLoadInd < (PATCH_SZ + 1))
        {
            if (s > 1)
            {
                AreaFilter<WinReader> filter(win, s, s);
                s_PATCH[yLoadInd][xLoadInd] = filter(yLoadInd, xLoadInd);
            }
            else
            {
                LinearFilter<WinReader> filter(win);
                s_PATCH[yLoadInd][xLoadInd] = filter(yLoadInd * s, xLoadInd * s);
            }
        }

        __syncthreads();

        const int xPatchInd = threadIdx.x % 5;
        const int yPatchInd = threadIdx.x / 5;

        if (yPatchInd < 5)
        {
            const int xBlockInd = threadIdx.y % 4;
            const int yBlockInd = threadIdx.y / 4;

            const int xInd = xBlockInd * 5 + xPatchInd;
            const int yInd = yBlockInd * 5 + yPatchInd;

            const float dw = c_DW[yInd * PATCH_SZ + xInd];

            dx = (s_PATCH[yInd    ][xInd + 1] - s_PATCH[yInd][xInd] + s_PATCH[yInd + 1][xInd + 1] - s_PATCH[yInd + 1][xInd    ]) * dw;
            dy = (s_PATCH[yInd + 1][xInd    ] - s_PATCH[yInd][xInd] + s_PATCH[yInd + 1][xInd + 1] - s_PATCH[yInd    ][xInd + 1]) * dw;
        }
    }
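Each thread of calc_dx_dy handles one sample of a 5x5 subregion; dx and dy are 2x2 Haar-like responses on the interpolated patch, weighted by the Gaussian entry dw. Written out for a single cell (illustration only, not part of the original file):

    // Corners of one 2x2 cell of s_PATCH: a = top-left, b = top-right, c = bottom-left,
    // d = bottom-right; w is the Gaussian weight c_DW[...] for that sample.
    inline void haarResponse2x2(float a, float b, float c, float d, float w, float& dx, float& dy)
    {
        dx = ((b - a) + (d - c)) * w;   // right column minus left column
        dy = ((c - a) + (d - b)) * w;   // bottom row minus top row
    }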
    __global__ void compute_descriptors_64(PtrStep<float4> descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir)
    {
        __shared__ float smem[32 * 16];

        float* sRow = smem + threadIdx.y * 32;

        float dx, dy;
        calc_dx_dy(featureX, featureY, featureSize, featureDir, dx, dy);

        float dxabs = ::fabsf(dx);
        float dyabs = ::fabsf(dy);

        plus<float> op;

        reduce<32>(sRow, dx, threadIdx.x, op);
        reduce<32>(sRow, dy, threadIdx.x, op);
        reduce<32>(sRow, dxabs, threadIdx.x, op);
        reduce<32>(sRow, dyabs, threadIdx.x, op);

        float4* descriptors_block = descriptors.ptr(blockIdx.x) + threadIdx.y;

        // write dx, dy, |dx|, |dy|
        if (threadIdx.x == 0)
            *descriptors_block = make_float4(dx, dy, dxabs, dyabs);
    }
    __global__ void compute_descriptors_128(PtrStep<float4> descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir)
    {
        __shared__ float smem[32 * 16];

        float* sRow = smem + threadIdx.y * 32;

        float dx, dy;
        calc_dx_dy(featureX, featureY, featureSize, featureDir, dx, dy);

        float4* descriptors_block = descriptors.ptr(blockIdx.x) + threadIdx.y * 2;

        plus<float> op;

        float d1 = 0.0f;
        float d2 = 0.0f;
        float abs1 = 0.0f;
        float abs2 = 0.0f;

        if (dy >= 0)
        {
            d1 = dx;
            abs1 = ::fabsf(dx);
        }
        else
        {
            d2 = dx;
            abs2 = ::fabsf(dx);
        }

        reduce<32>(sRow, d1, threadIdx.x, op);
        reduce<32>(sRow, d2, threadIdx.x, op);
        reduce<32>(sRow, abs1, threadIdx.x, op);
        reduce<32>(sRow, abs2, threadIdx.x, op);

        // write dx (dy >= 0), |dx| (dy >= 0), dx (dy < 0), |dx| (dy < 0)
        if (threadIdx.x == 0)
            descriptors_block[0] = make_float4(d1, abs1, d2, abs2);

        if (dx >= 0)
        {
            d1 = dy;
            abs1 = ::fabsf(dy);
            d2 = 0.0f;
            abs2 = 0.0f;
        }
        else
        {
            d1 = 0.0f;
            abs1 = 0.0f;
            d2 = dy;
            abs2 = ::fabsf(dy);
        }

        reduce<32>(sRow, d1, threadIdx.x, op);
        reduce<32>(sRow, d2, threadIdx.x, op);
        reduce<32>(sRow, abs1, threadIdx.x, op);
        reduce<32>(sRow, abs2, threadIdx.x, op);

        // write dy (dx >= 0), |dy| (dx >= 0), dy (dx < 0), |dy| (dx < 0)
        if (threadIdx.x == 0)
            descriptors_block[1] = make_float4(d1, abs1, d2, abs2);
    }
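Together the two kernels accumulate per-subregion sums: compute_descriptors_64 stores (sum dx, sum dy, sum |dx|, sum |dy|) for each of the 4x4 subregions (64 values), while compute_descriptors_128 additionally splits each sum by the sign of the other component (128 values). A host-side sketch of the extended accumulation for a single subregion (illustration only; names are assumptions, requires <cmath>):

    // bins.v[0..3] collect dx and |dx| split by the sign of dy, bins.v[4..7] collect
    // dy and |dy| split by the sign of dx, matching the write order in the kernel above.
    struct SubregionBins { float v[8]; };

    inline void accumulateExtended(SubregionBins& bins, float dx, float dy)
    {
        if (dy >= 0) { bins.v[0] += dx; bins.v[1] += std::fabs(dx); }
        else         { bins.v[2] += dx; bins.v[3] += std::fabs(dx); }
        if (dx >= 0) { bins.v[4] += dy; bins.v[5] += std::fabs(dy); }
        else         { bins.v[6] += dy; bins.v[7] += std::fabs(dy); }
    }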
    template <int BLOCK_DIM_X> __global__ void normalize_descriptors(PtrStepf descriptors)
    {
        __shared__ float smem[BLOCK_DIM_X];
        __shared__ float s_len;

        // no need for thread ID
        float* descriptor_base = descriptors.ptr(blockIdx.x);

        // read in the unnormalized descriptor values (squared)
        const float val = descriptor_base[threadIdx.x];

        float len = val * val;
        reduce<BLOCK_DIM_X>(smem, len, threadIdx.x, plus<float>());

        if (threadIdx.x == 0)
            s_len = ::sqrtf(len);

        __syncthreads();

        // normalize and store in output
        descriptor_base[threadIdx.x] = val / s_len;
    }
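normalize_descriptors rescales every descriptor to unit Euclidean length using a block-wide reduction; the per-row post-processing it performs is, on the host (illustration only, requires <cmath>):

    inline void normalizeDescriptor(float* desc, int len)
    {
        double sq = 0.0;
        for (int i = 0; i < len; ++i)
            sq += static_cast<double>(desc[i]) * desc[i];
        const float scale = 1.0f / std::sqrt(static_cast<float>(sq));
        for (int i = 0; i < len; ++i)
            desc[i] *= scale;                                   // unit L2 norm, as in the kernel
    }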
    void compute_descriptors_gpu(PtrStepSz<float4> descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir, int nFeatures)
    {
        // compute unnormalized descriptors, then normalize them - odd indexing since grid must be 2D

        if (descriptors.cols == 64)
        {
            compute_descriptors_64<<<nFeatures, dim3(32, 16)>>>(descriptors, featureX, featureY, featureSize, featureDir);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );

            normalize_descriptors<64><<<nFeatures, 64>>>((PtrStepSzf) descriptors);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );
        }
        else
        {
            compute_descriptors_128<<<nFeatures, dim3(32, 16)>>>(descriptors, featureX, featureY, featureSize, featureDir);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );

            normalize_descriptors<128><<<nFeatures, 128>>>((PtrStepSzf) descriptors);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );
        }
    }
    } // namespace surf
}}} // namespace cv { namespace gpu { namespace device


#endif /* CUDA_DISABLER */
@@ -1,419 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"

using namespace cv;
using namespace cv::gpu;
using namespace std;

#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)

cv::gpu::SURF_GPU::SURF_GPU() { throw_nogpu(); }
cv::gpu::SURF_GPU::SURF_GPU(double, int, int, bool, float, bool) { throw_nogpu(); }
int cv::gpu::SURF_GPU::descriptorSize() const { throw_nogpu(); return 0; }
void cv::gpu::SURF_GPU::uploadKeypoints(const vector<KeyPoint>&, GpuMat&) { throw_nogpu(); }
void cv::gpu::SURF_GPU::downloadKeypoints(const GpuMat&, vector<KeyPoint>&) { throw_nogpu(); }
void cv::gpu::SURF_GPU::downloadDescriptors(const GpuMat&, vector<float>&) { throw_nogpu(); }
void cv::gpu::SURF_GPU::operator()(const GpuMat&, const GpuMat&, GpuMat&) { throw_nogpu(); }
void cv::gpu::SURF_GPU::operator()(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, bool) { throw_nogpu(); }
void cv::gpu::SURF_GPU::operator()(const GpuMat&, const GpuMat&, vector<KeyPoint>&) { throw_nogpu(); }
void cv::gpu::SURF_GPU::operator()(const GpuMat&, const GpuMat&, vector<KeyPoint>&, GpuMat&, bool) { throw_nogpu(); }
void cv::gpu::SURF_GPU::operator()(const GpuMat&, const GpuMat&, vector<KeyPoint>&, vector<float>&, bool) { throw_nogpu(); }
void cv::gpu::SURF_GPU::releaseMemory() { throw_nogpu(); }

#else /* !defined (HAVE_CUDA) */
namespace cv { namespace gpu { namespace device
{
    namespace surf
    {
        void loadGlobalConstants(int maxCandidates, int maxFeatures, int img_rows, int img_cols, int nOctaveLayers, float hessianThreshold);
        void loadOctaveConstants(int octave, int layer_rows, int layer_cols);

        void bindImgTex(PtrStepSzb img);
        size_t bindSumTex(PtrStepSz<unsigned int> sum);
        size_t bindMaskSumTex(PtrStepSz<unsigned int> maskSum);

        void icvCalcLayerDetAndTrace_gpu(const PtrStepf& det, const PtrStepf& trace, int img_rows, int img_cols,
            int octave, int nOctaveLayer);

        void icvFindMaximaInLayer_gpu(const PtrStepf& det, const PtrStepf& trace, int4* maxPosBuffer, unsigned int* maxCounter,
            int img_rows, int img_cols, int octave, bool use_mask, int nLayers);

        void icvInterpolateKeypoint_gpu(const PtrStepf& det, const int4* maxPosBuffer, unsigned int maxCounter,
            float* featureX, float* featureY, int* featureLaplacian, int* featureOctave, float* featureSize, float* featureHessian,
            unsigned int* featureCounter);

        void icvCalcOrientation_gpu(const float* featureX, const float* featureY, const float* featureSize, float* featureDir, int nFeatures);

        void compute_descriptors_gpu(PtrStepSz<float4> descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir, int nFeatures);
    }
}}}
using namespace ::cv::gpu::device::surf;

namespace
{
    int calcSize(int octave, int layer)
    {
        /* Wavelet size at first layer of first octave. */
        const int HAAR_SIZE0 = 9;

        /* Wavelet size increment between layers. This should be an even number,
           such that the wavelet sizes in an octave are either all even or all odd.
           This ensures that when looking for the neighbours of a sample, the layers
           above and below are aligned correctly. */
        const int HAAR_SIZE_INC = 6;

        return (HAAR_SIZE0 + HAAR_SIZE_INC * layer) << octave;
    }
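For reference (not part of the original source), the formula above gives the following Haar filter sizes for the default four octaves, layers 0 through 3:

    // octave 0:  9,  15,  21,  27
    // octave 1: 18,  30,  42,  54
    // octave 2: 36,  60,  84, 108
    // octave 3: 72, 120, 168, 216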
    class SURF_GPU_Invoker
    {
    public:
        SURF_GPU_Invoker(SURF_GPU& surf, const GpuMat& img, const GpuMat& mask) :
            surf_(surf),
            img_cols(img.cols), img_rows(img.rows),
            use_mask(!mask.empty())
        {
            CV_Assert(!img.empty() && img.type() == CV_8UC1);
            CV_Assert(mask.empty() || (mask.size() == img.size() && mask.type() == CV_8UC1));
            CV_Assert(surf_.nOctaves > 0 && surf_.nOctaveLayers > 0);

            const int min_size = calcSize(surf_.nOctaves - 1, 0);
            CV_Assert(img_rows - min_size >= 0);
            CV_Assert(img_cols - min_size >= 0);

            const int layer_rows = img_rows >> (surf_.nOctaves - 1);
            const int layer_cols = img_cols >> (surf_.nOctaves - 1);
            const int min_margin = ((calcSize((surf_.nOctaves - 1), 2) >> 1) >> (surf_.nOctaves - 1)) + 1;
            CV_Assert(layer_rows - 2 * min_margin > 0);
            CV_Assert(layer_cols - 2 * min_margin > 0);

            maxFeatures = min(static_cast<int>(img.size().area() * surf.keypointsRatio), 65535);
            maxCandidates = min(static_cast<int>(1.5 * maxFeatures), 65535);

            CV_Assert(maxFeatures > 0);

            counters.create(1, surf_.nOctaves + 1, CV_32SC1);
            counters.setTo(Scalar::all(0));

            loadGlobalConstants(maxCandidates, maxFeatures, img_rows, img_cols, surf_.nOctaveLayers, static_cast<float>(surf_.hessianThreshold));

            bindImgTex(img);

            integralBuffered(img, surf_.sum, surf_.intBuffer);
            sumOffset = bindSumTex(surf_.sum);

            if (use_mask)
            {
                min(mask, 1.0, surf_.mask1);
                integralBuffered(surf_.mask1, surf_.maskSum, surf_.intBuffer);
                maskOffset = bindMaskSumTex(surf_.maskSum);
            }
        }
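The asserts above bound the smallest image the invoker accepts. A sketch, not part of the original code, that derives the minimum dimensions implied by a given nOctaves (function name is an assumption; requires <algorithm>):

        // For the default nOctaves = 4 this evaluates to 184 x 184: the top-octave filter
        // alone needs 72 pixels, and the stricter bound comes from keeping a positive
        // working area after the per-side margin in the coarsest layer.
        static void surfMinImageSize(int nOctaves, int& minRows, int& minCols)
        {
            const int min_size   = calcSize(nOctaves - 1, 0);
            const int min_margin = ((calcSize(nOctaves - 1, 2) >> 1) >> (nOctaves - 1)) + 1;
            const int min_layer  = 2 * min_margin + 1;                 // layer_rows/cols must exceed 2 * min_margin
            minRows = std::max(min_size, min_layer << (nOctaves - 1));
            minCols = minRows;
        }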
        void detectKeypoints(GpuMat& keypoints)
        {
            ensureSizeIsEnough(img_rows * (surf_.nOctaveLayers + 2), img_cols, CV_32FC1, surf_.det);
            ensureSizeIsEnough(img_rows * (surf_.nOctaveLayers + 2), img_cols, CV_32FC1, surf_.trace);

            ensureSizeIsEnough(1, maxCandidates, CV_32SC4, surf_.maxPosBuffer);
            ensureSizeIsEnough(SURF_GPU::ROWS_COUNT, maxFeatures, CV_32FC1, keypoints);
            keypoints.setTo(Scalar::all(0));

            for (int octave = 0; octave < surf_.nOctaves; ++octave)
            {
                const int layer_rows = img_rows >> octave;
                const int layer_cols = img_cols >> octave;

                loadOctaveConstants(octave, layer_rows, layer_cols);

                icvCalcLayerDetAndTrace_gpu(surf_.det, surf_.trace, img_rows, img_cols, octave, surf_.nOctaveLayers);

                icvFindMaximaInLayer_gpu(surf_.det, surf_.trace, surf_.maxPosBuffer.ptr<int4>(), counters.ptr<unsigned int>() + 1 + octave,
                    img_rows, img_cols, octave, use_mask, surf_.nOctaveLayers);

                unsigned int maxCounter;
                cudaSafeCall( cudaMemcpy(&maxCounter, counters.ptr<unsigned int>() + 1 + octave, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
                maxCounter = std::min(maxCounter, static_cast<unsigned int>(maxCandidates));

                if (maxCounter > 0)
                {
                    icvInterpolateKeypoint_gpu(surf_.det, surf_.maxPosBuffer.ptr<int4>(), maxCounter,
                        keypoints.ptr<float>(SURF_GPU::X_ROW), keypoints.ptr<float>(SURF_GPU::Y_ROW),
                        keypoints.ptr<int>(SURF_GPU::LAPLACIAN_ROW), keypoints.ptr<int>(SURF_GPU::OCTAVE_ROW),
                        keypoints.ptr<float>(SURF_GPU::SIZE_ROW), keypoints.ptr<float>(SURF_GPU::HESSIAN_ROW),
                        counters.ptr<unsigned int>());
                }
            }

            unsigned int featureCounter;
            cudaSafeCall( cudaMemcpy(&featureCounter, counters.ptr<unsigned int>(), sizeof(unsigned int), cudaMemcpyDeviceToHost) );
            featureCounter = std::min(featureCounter, static_cast<unsigned int>(maxFeatures));

            keypoints.cols = featureCounter;

            if (surf_.upright)
                keypoints.row(SURF_GPU::ANGLE_ROW).setTo(Scalar::all(360.0 - 90.0));
            else
                findOrientation(keypoints);
        }

        void findOrientation(GpuMat& keypoints)
        {
            const int nFeatures = keypoints.cols;
            if (nFeatures > 0)
            {
                icvCalcOrientation_gpu(keypoints.ptr<float>(SURF_GPU::X_ROW), keypoints.ptr<float>(SURF_GPU::Y_ROW),
                    keypoints.ptr<float>(SURF_GPU::SIZE_ROW), keypoints.ptr<float>(SURF_GPU::ANGLE_ROW), nFeatures);
            }
        }

        void computeDescriptors(const GpuMat& keypoints, GpuMat& descriptors, int descriptorSize)
        {
            const int nFeatures = keypoints.cols;
            if (nFeatures > 0)
            {
                ensureSizeIsEnough(nFeatures, descriptorSize, CV_32F, descriptors);
                compute_descriptors_gpu(descriptors, keypoints.ptr<float>(SURF_GPU::X_ROW), keypoints.ptr<float>(SURF_GPU::Y_ROW),
                    keypoints.ptr<float>(SURF_GPU::SIZE_ROW), keypoints.ptr<float>(SURF_GPU::ANGLE_ROW), nFeatures);
            }
        }

    private:
        SURF_GPU& surf_;

        int img_cols, img_rows;

        bool use_mask;

        int maxCandidates;
        int maxFeatures;

        size_t maskOffset;
        size_t sumOffset;

        GpuMat counters;
    };
}
cv::gpu::SURF_GPU::SURF_GPU()
{
    hessianThreshold = 100;
    extended = true;
    nOctaves = 4;
    nOctaveLayers = 2;
    keypointsRatio = 0.01f;
    upright = false;
}

cv::gpu::SURF_GPU::SURF_GPU(double _threshold, int _nOctaves, int _nOctaveLayers, bool _extended, float _keypointsRatio, bool _upright)
{
    hessianThreshold = _threshold;
    extended = _extended;
    nOctaves = _nOctaves;
    nOctaveLayers = _nOctaveLayers;
    keypointsRatio = _keypointsRatio;
    upright = _upright;
}

int cv::gpu::SURF_GPU::descriptorSize() const
{
    return extended ? 128 : 64;
}
void cv::gpu::SURF_GPU::uploadKeypoints(const vector<KeyPoint>& keypoints, GpuMat& keypointsGPU)
{
    if (keypoints.empty())
        keypointsGPU.release();
    else
    {
        Mat keypointsCPU(SURF_GPU::ROWS_COUNT, static_cast<int>(keypoints.size()), CV_32FC1);

        float* kp_x = keypointsCPU.ptr<float>(SURF_GPU::X_ROW);
        float* kp_y = keypointsCPU.ptr<float>(SURF_GPU::Y_ROW);
        int* kp_laplacian = keypointsCPU.ptr<int>(SURF_GPU::LAPLACIAN_ROW);
        int* kp_octave = keypointsCPU.ptr<int>(SURF_GPU::OCTAVE_ROW);
        float* kp_size = keypointsCPU.ptr<float>(SURF_GPU::SIZE_ROW);
        float* kp_dir = keypointsCPU.ptr<float>(SURF_GPU::ANGLE_ROW);
        float* kp_hessian = keypointsCPU.ptr<float>(SURF_GPU::HESSIAN_ROW);

        for (size_t i = 0, size = keypoints.size(); i < size; ++i)
        {
            const KeyPoint& kp = keypoints[i];
            kp_x[i] = kp.pt.x;
            kp_y[i] = kp.pt.y;
            kp_octave[i] = kp.octave;
            kp_size[i] = kp.size;
            kp_dir[i] = kp.angle;
            kp_hessian[i] = kp.response;
            kp_laplacian[i] = 1;
        }

        keypointsGPU.upload(keypointsCPU);
    }
}
void cv::gpu::SURF_GPU::downloadKeypoints(const GpuMat& keypointsGPU, vector<KeyPoint>& keypoints)
{
    const int nFeatures = keypointsGPU.cols;

    if (nFeatures == 0)
        keypoints.clear();
    else
    {
        CV_Assert(keypointsGPU.type() == CV_32FC1 && keypointsGPU.rows == ROWS_COUNT);

        Mat keypointsCPU(keypointsGPU);

        keypoints.resize(nFeatures);

        float* kp_x = keypointsCPU.ptr<float>(SURF_GPU::X_ROW);
        float* kp_y = keypointsCPU.ptr<float>(SURF_GPU::Y_ROW);
        int* kp_laplacian = keypointsCPU.ptr<int>(SURF_GPU::LAPLACIAN_ROW);
        int* kp_octave = keypointsCPU.ptr<int>(SURF_GPU::OCTAVE_ROW);
        float* kp_size = keypointsCPU.ptr<float>(SURF_GPU::SIZE_ROW);
        float* kp_dir = keypointsCPU.ptr<float>(SURF_GPU::ANGLE_ROW);
        float* kp_hessian = keypointsCPU.ptr<float>(SURF_GPU::HESSIAN_ROW);

        for (int i = 0; i < nFeatures; ++i)
        {
            KeyPoint& kp = keypoints[i];
            kp.pt.x = kp_x[i];
            kp.pt.y = kp_y[i];
            kp.class_id = kp_laplacian[i];
            kp.octave = kp_octave[i];
            kp.size = kp_size[i];
            kp.angle = kp_dir[i];
            kp.response = kp_hessian[i];
        }
    }
}
void cv::gpu::SURF_GPU::downloadDescriptors(const GpuMat& descriptorsGPU, vector<float>& descriptors)
{
    if (descriptorsGPU.empty())
        descriptors.clear();
    else
    {
        CV_Assert(descriptorsGPU.type() == CV_32F);

        descriptors.resize(descriptorsGPU.rows * descriptorsGPU.cols);
        Mat descriptorsCPU(descriptorsGPU.size(), CV_32F, &descriptors[0]);
        descriptorsGPU.download(descriptorsCPU);
    }
}
void cv::gpu::SURF_GPU::operator()(const GpuMat& img, const GpuMat& mask, GpuMat& keypoints)
{
    if (!img.empty())
    {
        SURF_GPU_Invoker surf(*this, img, mask);

        surf.detectKeypoints(keypoints);
    }
}

void cv::gpu::SURF_GPU::operator()(const GpuMat& img, const GpuMat& mask, GpuMat& keypoints, GpuMat& descriptors,
                                   bool useProvidedKeypoints)
{
    if (!img.empty())
    {
        SURF_GPU_Invoker surf(*this, img, mask);

        if (!useProvidedKeypoints)
            surf.detectKeypoints(keypoints);
        else if (!upright)
        {
            surf.findOrientation(keypoints);
        }

        surf.computeDescriptors(keypoints, descriptors, descriptorSize());
    }
}
void cv::gpu::SURF_GPU::operator()(const GpuMat& img, const GpuMat& mask, vector<KeyPoint>& keypoints)
{
    GpuMat keypointsGPU;

    (*this)(img, mask, keypointsGPU);

    downloadKeypoints(keypointsGPU, keypoints);
}

void cv::gpu::SURF_GPU::operator()(const GpuMat& img, const GpuMat& mask, vector<KeyPoint>& keypoints,
                                   GpuMat& descriptors, bool useProvidedKeypoints)
{
    GpuMat keypointsGPU;

    if (useProvidedKeypoints)
        uploadKeypoints(keypoints, keypointsGPU);

    (*this)(img, mask, keypointsGPU, descriptors, useProvidedKeypoints);

    downloadKeypoints(keypointsGPU, keypoints);
}

void cv::gpu::SURF_GPU::operator()(const GpuMat& img, const GpuMat& mask, vector<KeyPoint>& keypoints,
                                   vector<float>& descriptors, bool useProvidedKeypoints)
{
    GpuMat descriptorsGPU;

    (*this)(img, mask, keypoints, descriptorsGPU, useProvidedKeypoints);

    downloadDescriptors(descriptorsGPU, descriptors);
}
void cv::gpu::SURF_GPU::releaseMemory()
{
    sum.release();
    mask1.release();
    maskSum.release();
    intBuffer.release();
    det.release();
    trace.release();
    maxPosBuffer.release();
}

#endif /* !defined (HAVE_CUDA) */
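For completeness, a minimal host-side usage sketch of the class removed here (not part of this commit; the include path and image name are assumptions and depend on where the gpunonfree module installs its headers):

#include <iostream>
#include <vector>
#include <opencv2/opencv.hpp>
#include <opencv2/gpu/gpu.hpp>      // SURF_GPU now ships with the gpunonfree module

int main()
{
    cv::Mat img = cv::imread("scene.png", 0);          // 0 = load as 8-bit grayscale (CV_8UC1)
    if (img.empty())
        return 1;

    cv::gpu::GpuMat d_img(img);                                // upload the image to the device
    cv::gpu::SURF_GPU surf(400.0, 4, 2, false, 0.01f, false);  // hessianThreshold, nOctaves, nOctaveLayers, extended, keypointsRatio, upright

    std::vector<cv::KeyPoint> keypoints;
    std::vector<float> descriptors;
    surf(d_img, cv::gpu::GpuMat(), keypoints, descriptors, false);   // empty mask; detect and describe

    std::cout << keypoints.size() << " keypoints, "
              << surf.descriptorSize() << "-dimensional descriptors" << std::endl;
    return 0;
}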