refactored gpu::BruteForceMatcher (moved some utility functions to device layer)
This commit is contained in:
parent
50b72197ab
commit
5f9e47a9cd
@ -78,60 +78,53 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat&, std::vector
|
||||
|
||||
namespace cv { namespace gpu { namespace bfmatcher
|
||||
{
|
||||
template <typename T>
|
||||
void matchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs,
|
||||
const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
|
||||
bool cc_12, cudaStream_t stream);
|
||||
template <typename T>
|
||||
void matchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs,
|
||||
const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
|
||||
bool cc_12, cudaStream_t stream);
|
||||
template <typename T>
|
||||
void matchSingleHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs,
|
||||
const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
|
||||
bool cc_12, cudaStream_t stream);
|
||||
template <typename T>
|
||||
void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection,
|
||||
const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
|
||||
bool cc_12, cudaStream_t stream);
|
||||
template <typename T>
|
||||
void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection,
|
||||
const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
|
||||
bool cc_12, cudaStream_t stream);
|
||||
template <typename T>
|
||||
void matchCollectionHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection,
|
||||
const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
|
||||
bool cc_12, cudaStream_t stream);
|
||||
template <typename T> void matchSingleL1_gpu(const DevMem2D& query, const DevMem2D& train, const DevMem2D& mask,
|
||||
const DevMem2D& trainIdx, const DevMem2D& distance,
|
||||
int cc, cudaStream_t stream);
|
||||
template <typename T> void matchSingleL2_gpu(const DevMem2D& query, const DevMem2D& train, const DevMem2D& mask,
|
||||
const DevMem2D& trainIdx, const DevMem2D& distance,
|
||||
int cc, cudaStream_t stream);
|
||||
template <typename T> void matchSingleHamming_gpu(const DevMem2D& query, const DevMem2D& train, const DevMem2D& mask,
|
||||
const DevMem2D& trainIdx, const DevMem2D& distance,
|
||||
int cc, cudaStream_t stream);
|
||||
|
||||
template <typename T>
|
||||
void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn,
|
||||
const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream);
|
||||
template <typename T>
|
||||
void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn,
|
||||
const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream);
|
||||
template <typename T>
|
||||
void knnMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn,
|
||||
const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream);
|
||||
template <typename T> void matchCollectionL1_gpu(const DevMem2D& query, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection,
|
||||
const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance,
|
||||
int cc, cudaStream_t stream);
|
||||
template <typename T> void matchCollectionL2_gpu(const DevMem2D& query, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection,
|
||||
const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance,
|
||||
int cc, cudaStream_t stream);
|
||||
template <typename T> void matchCollectionHamming_gpu(const DevMem2D& query, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection,
|
||||
const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance,
|
||||
int cc, cudaStream_t stream);
|
||||
|
||||
template <typename T>
|
||||
void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance,
|
||||
const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream);
|
||||
template <typename T>
|
||||
void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance,
|
||||
const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream);
|
||||
template <typename T>
|
||||
void radiusMatchHamming_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance,
|
||||
const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream);
|
||||
template <typename T> void knnMatchL1_gpu(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask,
|
||||
const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist,
|
||||
int cc, cudaStream_t stream);
|
||||
template <typename T> void knnMatchL2_gpu(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask,
|
||||
const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist,
|
||||
int cc, cudaStream_t stream);
|
||||
template <typename T> void knnMatchHamming_gpu(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask,
|
||||
const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist,
|
||||
int cc, cudaStream_t stream);
|
||||
|
||||
template <typename T> void radiusMatchL1_gpu(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask,
|
||||
const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance,
|
||||
cudaStream_t stream);
|
||||
template <typename T> void radiusMatchL2_gpu(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask,
|
||||
const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance,
|
||||
cudaStream_t stream);
|
||||
template <typename T> void radiusMatchHamming_gpu(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask,
|
||||
const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance,
|
||||
cudaStream_t stream);
|
||||
}}}
|
||||
|
||||
namespace
|
||||
{
|
||||
class ImgIdxSetter
|
||||
struct ImgIdxSetter
|
||||
{
|
||||
public:
|
||||
ImgIdxSetter(int imgIdx_) : imgIdx(imgIdx_) {}
|
||||
void operator()(DMatch& m) const {m.imgIdx = imgIdx;}
|
||||
private:
|
||||
explicit inline ImgIdxSetter(int imgIdx_) : imgIdx(imgIdx_) {}
|
||||
inline void operator()(DMatch& m) const {m.imgIdx = imgIdx;}
|
||||
int imgIdx;
|
||||
};
|
||||
}
|
||||
@ -179,9 +172,9 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat& queryDescs,
|
||||
|
||||
using namespace cv::gpu::bfmatcher;
|
||||
|
||||
typedef void (*match_caller_t)(const DevMem2D& queryDescs, const DevMem2D& trainDescs,
|
||||
const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
|
||||
bool cc_12, cudaStream_t stream);
|
||||
typedef void (*match_caller_t)(const DevMem2D& query, const DevMem2D& train, const DevMem2D& mask,
|
||||
const DevMem2D& trainIdx, const DevMem2D& distance,
|
||||
int cc, cudaStream_t stream);
|
||||
|
||||
static const match_caller_t match_callers[3][8] =
|
||||
{
|
||||
@ -213,11 +206,10 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat& queryDescs,
|
||||
match_caller_t func = match_callers[distType][queryDescs.depth()];
|
||||
CV_Assert(func != 0);
|
||||
|
||||
bool cc_12 = TargetArchs::builtWith(FEATURE_SET_COMPUTE_12) && DeviceInfo().supports(FEATURE_SET_COMPUTE_12);
|
||||
DeviceInfo info;
|
||||
int cc = info.majorVersion() * 10 + info.minorVersion();
|
||||
|
||||
// For single train there is no need to save imgIdx, so we just save imgIdx to trainIdx.
|
||||
// trainIdx store after imgIdx, so we doesn't lose it value.
|
||||
func(queryDescs, trainDescs, mask, trainIdx, trainIdx, distance, cc_12, StreamAccessor::getStream(stream));
|
||||
func(queryDescs, trainDescs, mask, trainIdx, distance, cc, StreamAccessor::getStream(stream));
|
||||
}
|
||||
|
||||
void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx, const GpuMat& distance, vector<DMatch>& matches)
|
||||
@ -319,9 +311,9 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat& queryDes
|
||||
|
||||
using namespace cv::gpu::bfmatcher;
|
||||
|
||||
typedef void (*match_caller_t)(const DevMem2D& queryDescs, const DevMem2D& trainCollection,
|
||||
const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx,
|
||||
const DevMem2Df& distance, bool cc_12, cudaStream_t stream);
|
||||
typedef void (*match_caller_t)(const DevMem2D& query, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection,
|
||||
const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance,
|
||||
int cc, cudaStream_t stream);
|
||||
|
||||
static const match_caller_t match_callers[3][8] =
|
||||
{
|
||||
@ -353,9 +345,10 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat& queryDes
|
||||
match_caller_t func = match_callers[distType][queryDescs.depth()];
|
||||
CV_Assert(func != 0);
|
||||
|
||||
bool cc_12 = TargetArchs::builtWith(FEATURE_SET_COMPUTE_12) && DeviceInfo().supports(FEATURE_SET_COMPUTE_12);
|
||||
DeviceInfo info;
|
||||
int cc = info.majorVersion() * 10 + info.minorVersion();
|
||||
|
||||
func(queryDescs, trainCollection, maskCollection, trainIdx, imgIdx, distance, cc_12, StreamAccessor::getStream(stream));
|
||||
func(queryDescs, trainCollection, maskCollection, trainIdx, imgIdx, distance, cc, StreamAccessor::getStream(stream));
|
||||
}
|
||||
|
||||
void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance, vector<DMatch>& matches)
|
||||
@ -427,8 +420,9 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat& queryDescs, con
|
||||
|
||||
using namespace cv::gpu::bfmatcher;
|
||||
|
||||
typedef void (*match_caller_t)(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn,
|
||||
const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, bool cc_12, cudaStream_t stream);
|
||||
typedef void (*match_caller_t)(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask,
|
||||
const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist,
|
||||
int cc, cudaStream_t stream);
|
||||
|
||||
static const match_caller_t match_callers[3][8] =
|
||||
{
|
||||
@ -473,9 +467,10 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat& queryDescs, con
|
||||
match_caller_t func = match_callers[distType][queryDescs.depth()];
|
||||
CV_Assert(func != 0);
|
||||
|
||||
bool cc_12 = TargetArchs::builtWith(FEATURE_SET_COMPUTE_12) && DeviceInfo().supports(FEATURE_SET_COMPUTE_12);
|
||||
DeviceInfo info;
|
||||
int cc = info.majorVersion() * 10 + info.minorVersion();
|
||||
|
||||
func(queryDescs, trainDescs, k, mask, trainIdx, distance, allDist, cc_12, StreamAccessor::getStream(stream));
|
||||
func(queryDescs, trainDescs, k, mask, trainIdx, distance, allDist, cc, StreamAccessor::getStream(stream));
|
||||
}
|
||||
|
||||
void cv::gpu::BruteForceMatcher_GPU_base::knnMatchDownload(const GpuMat& trainIdx, const GpuMat& distance,
|
||||
@ -563,7 +558,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat& queryDescs,
|
||||
vector<DMatch>& localMatch = curMatches[queryIdx];
|
||||
vector<DMatch>& globalMatch = matches[queryIdx];
|
||||
|
||||
for_each(localMatch.begin(), localMatch.end(), ImgIdxSetter(imgIdx));
|
||||
for_each(localMatch.begin(), localMatch.end(), ImgIdxSetter(static_cast<int>(imgIdx)));
|
||||
|
||||
temp.clear();
|
||||
merge(globalMatch.begin(), globalMatch.end(), localMatch.begin(), localMatch.end(), back_inserter(temp));
|
||||
@ -593,8 +588,9 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat& queryDescs,
|
||||
|
||||
using namespace cv::gpu::bfmatcher;
|
||||
|
||||
typedef void (*radiusMatch_caller_t)(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance,
|
||||
const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance, cudaStream_t stream);
|
||||
typedef void (*radiusMatch_caller_t)(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask,
|
||||
const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance,
|
||||
cudaStream_t stream);
|
||||
|
||||
static const radiusMatch_caller_t radiusMatch_callers[3][8] =
|
||||
{
|
||||
@ -636,7 +632,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat& queryDescs,
|
||||
radiusMatch_caller_t func = radiusMatch_callers[distType][queryDescs.depth()];
|
||||
CV_Assert(func != 0);
|
||||
|
||||
func(queryDescs, trainDescs, maxDistance, mask, trainIdx, nMatches.ptr<unsigned int>(), distance, StreamAccessor::getStream(stream));
|
||||
func(queryDescs, trainDescs, maxDistance, mask, trainIdx, nMatches, distance, StreamAccessor::getStream(stream));
|
||||
}
|
||||
|
||||
void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchDownload(const GpuMat& trainIdx, const GpuMat& nMatches,
|
||||
@ -728,7 +724,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat& queryDescs,
|
||||
vector<DMatch>& localMatch = curMatches[queryIdx];
|
||||
vector<DMatch>& globalMatch = matches[queryIdx];
|
||||
|
||||
for_each(localMatch.begin(), localMatch.end(), ImgIdxSetter(imgIdx));
|
||||
for_each(localMatch.begin(), localMatch.end(), ImgIdxSetter(static_cast<int>(imgIdx)));
|
||||
|
||||
const size_t oldSize = globalMatch.size();
|
||||
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -566,8 +566,8 @@ namespace cv { namespace gpu { namespace surf
|
||||
|
||||
float* s_sum_row = s_sum + threadIdx.y * 32;
|
||||
|
||||
warpReduce32(s_sum_row, sumx, threadIdx.x, plus<volatile float>());
|
||||
warpReduce32(s_sum_row, sumy, threadIdx.x, plus<volatile float>());
|
||||
reduce<32>(s_sum_row, sumx, threadIdx.x, plus<volatile float>());
|
||||
reduce<32>(s_sum_row, sumy, threadIdx.x, plus<volatile float>());
|
||||
|
||||
const float temp_mod = sumx * sumx + sumy * sumy;
|
||||
if (temp_mod > best_mod)
|
||||
|
@ -43,7 +43,7 @@
|
||||
#ifndef __OPENCV_GPU_COLOR_HPP__
|
||||
#define __OPENCV_GPU_COLOR_HPP__
|
||||
|
||||
#include "detail/color.hpp"
|
||||
#include "detail/color_detail.hpp"
|
||||
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
|
@ -44,7 +44,14 @@
|
||||
#define __OPENCV_GPU_DATAMOV_UTILS_HPP__
|
||||
|
||||
#include "internal_shared.hpp"
|
||||
#include "utility.hpp"
|
||||
|
||||
#if defined(_WIN64) || defined(__LP64__)
|
||||
// 64-bit register modifier for inlined asm
|
||||
#define OPENCV_GPU_ASM_PTR "l"
|
||||
#else
|
||||
// 32-bit register modifier for inlined asm
|
||||
#define OPENCV_GPU_ASM_PTR "r"
|
||||
#endif
|
||||
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
|
186
modules/gpu/src/opencv2/gpu/device/detail/type_traits_detail.hpp
Normal file
186
modules/gpu/src/opencv2/gpu/device/detail/type_traits_detail.hpp
Normal file
@ -0,0 +1,186 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#ifndef __OPENCV_GPU_TYPE_TRAITS_DETAIL_HPP__
|
||||
#define __OPENCV_GPU_TYPE_TRAITS_DETAIL_HPP__
|
||||
|
||||
#include "../vec_traits.hpp"
|
||||
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
namespace detail
|
||||
{
|
||||
template <bool, typename T1, typename T2> struct Select { typedef T1 type; };
|
||||
template <typename T1, typename T2> struct Select<false, T1, T2> { typedef T2 type; };
|
||||
|
||||
template <typename T> struct IsSignedIntergral { enum {value = 0}; };
|
||||
template <> struct IsSignedIntergral<schar> { enum {value = 1}; };
|
||||
template <> struct IsSignedIntergral<char1> { enum {value = 1}; };
|
||||
template <> struct IsSignedIntergral<short> { enum {value = 1}; };
|
||||
template <> struct IsSignedIntergral<short1> { enum {value = 1}; };
|
||||
template <> struct IsSignedIntergral<int> { enum {value = 1}; };
|
||||
template <> struct IsSignedIntergral<int1> { enum {value = 1}; };
|
||||
|
||||
template <typename T> struct IsUnsignedIntegral { enum {value = 0}; };
|
||||
template <> struct IsUnsignedIntegral<uchar> { enum {value = 1}; };
|
||||
template <> struct IsUnsignedIntegral<uchar1> { enum {value = 1}; };
|
||||
template <> struct IsUnsignedIntegral<ushort> { enum {value = 1}; };
|
||||
template <> struct IsUnsignedIntegral<ushort1> { enum {value = 1}; };
|
||||
template <> struct IsUnsignedIntegral<uint> { enum {value = 1}; };
|
||||
template <> struct IsUnsignedIntegral<uint1> { enum {value = 1}; };
|
||||
|
||||
template <typename T> struct IsIntegral { enum {value = IsSignedIntergral<T>::value || IsUnsignedIntegral<T>::value}; };
|
||||
template <> struct IsIntegral<char> { enum {value = 1}; };
|
||||
template <> struct IsIntegral<bool> { enum {value = 1}; };
|
||||
|
||||
template <typename T> struct IsFloat { enum {value = 0}; };
|
||||
template <> struct IsFloat<float> { enum {value = 1}; };
|
||||
template <> struct IsFloat<double> { enum {value = 1}; };
|
||||
|
||||
template <typename T> struct IsVec { enum {value = 0}; };
|
||||
template <> struct IsVec<uchar1> { enum {value = 1}; };
|
||||
template <> struct IsVec<uchar2> { enum {value = 1}; };
|
||||
template <> struct IsVec<uchar3> { enum {value = 1}; };
|
||||
template <> struct IsVec<uchar4> { enum {value = 1}; };
|
||||
template <> struct IsVec<uchar8> { enum {value = 1}; };
|
||||
template <> struct IsVec<char1> { enum {value = 1}; };
|
||||
template <> struct IsVec<char2> { enum {value = 1}; };
|
||||
template <> struct IsVec<char3> { enum {value = 1}; };
|
||||
template <> struct IsVec<char4> { enum {value = 1}; };
|
||||
template <> struct IsVec<char8> { enum {value = 1}; };
|
||||
template <> struct IsVec<ushort1> { enum {value = 1}; };
|
||||
template <> struct IsVec<ushort2> { enum {value = 1}; };
|
||||
template <> struct IsVec<ushort3> { enum {value = 1}; };
|
||||
template <> struct IsVec<ushort4> { enum {value = 1}; };
|
||||
template <> struct IsVec<ushort8> { enum {value = 1}; };
|
||||
template <> struct IsVec<short1> { enum {value = 1}; };
|
||||
template <> struct IsVec<short2> { enum {value = 1}; };
|
||||
template <> struct IsVec<short3> { enum {value = 1}; };
|
||||
template <> struct IsVec<short4> { enum {value = 1}; };
|
||||
template <> struct IsVec<short8> { enum {value = 1}; };
|
||||
template <> struct IsVec<uint1> { enum {value = 1}; };
|
||||
template <> struct IsVec<uint2> { enum {value = 1}; };
|
||||
template <> struct IsVec<uint3> { enum {value = 1}; };
|
||||
template <> struct IsVec<uint4> { enum {value = 1}; };
|
||||
template <> struct IsVec<uint8> { enum {value = 1}; };
|
||||
template <> struct IsVec<int1> { enum {value = 1}; };
|
||||
template <> struct IsVec<int2> { enum {value = 1}; };
|
||||
template <> struct IsVec<int3> { enum {value = 1}; };
|
||||
template <> struct IsVec<int4> { enum {value = 1}; };
|
||||
template <> struct IsVec<int8> { enum {value = 1}; };
|
||||
template <> struct IsVec<float1> { enum {value = 1}; };
|
||||
template <> struct IsVec<float2> { enum {value = 1}; };
|
||||
template <> struct IsVec<float3> { enum {value = 1}; };
|
||||
template <> struct IsVec<float4> { enum {value = 1}; };
|
||||
template <> struct IsVec<float8> { enum {value = 1}; };
|
||||
template <> struct IsVec<double1> { enum {value = 1}; };
|
||||
template <> struct IsVec<double2> { enum {value = 1}; };
|
||||
template <> struct IsVec<double3> { enum {value = 1}; };
|
||||
template <> struct IsVec<double4> { enum {value = 1}; };
|
||||
template <> struct IsVec<double8> { enum {value = 1}; };
|
||||
|
||||
template <class U> struct AddParameterType { typedef const U& type; };
|
||||
template <class U> struct AddParameterType<U&> { typedef U& type; };
|
||||
template <> struct AddParameterType<void> { typedef void type; };
|
||||
|
||||
template <class U> struct ReferenceTraits
|
||||
{
|
||||
enum { value = false };
|
||||
typedef U type;
|
||||
};
|
||||
template <class U> struct ReferenceTraits<U&>
|
||||
{
|
||||
enum { value = true };
|
||||
typedef U type;
|
||||
};
|
||||
|
||||
template <class U> struct PointerTraits
|
||||
{
|
||||
enum { value = false };
|
||||
typedef void type;
|
||||
};
|
||||
template <class U> struct PointerTraits<U*>
|
||||
{
|
||||
enum { value = true };
|
||||
typedef U type;
|
||||
};
|
||||
template <class U> struct PointerTraits<U*&>
|
||||
{
|
||||
enum { value = true };
|
||||
typedef U type;
|
||||
};
|
||||
|
||||
template <class U> struct UnConst
|
||||
{
|
||||
typedef U type;
|
||||
enum { value = 0 };
|
||||
};
|
||||
template <class U> struct UnConst<const U>
|
||||
{
|
||||
typedef U type;
|
||||
enum { value = 1 };
|
||||
};
|
||||
template <class U> struct UnConst<const U&>
|
||||
{
|
||||
typedef U& type;
|
||||
enum { value = 1 };
|
||||
};
|
||||
|
||||
template <class U> struct UnVolatile
|
||||
{
|
||||
typedef U type;
|
||||
enum { value = 0 };
|
||||
};
|
||||
template <class U> struct UnVolatile<volatile U>
|
||||
{
|
||||
typedef U type;
|
||||
enum { value = 1 };
|
||||
};
|
||||
template <class U> struct UnVolatile<volatile U&>
|
||||
{
|
||||
typedef U& type;
|
||||
enum { value = 1 };
|
||||
};
|
||||
}
|
||||
}}}
|
||||
|
||||
#endif // __OPENCV_GPU_TYPE_TRAITS_DETAIL_HPP__
|
576
modules/gpu/src/opencv2/gpu/device/detail/utility_detail.hpp
Normal file
576
modules/gpu/src/opencv2/gpu/device/detail/utility_detail.hpp
Normal file
@ -0,0 +1,576 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#ifndef __OPENCV_GPU_UTILITY_DETAIL_HPP__
|
||||
#define __OPENCV_GPU_UTILITY_DETAIL_HPP__
|
||||
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
namespace detail
|
||||
{
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// Reduction
|
||||
|
||||
template <int n> struct WarpReductor
|
||||
{
|
||||
template <typename T, typename Op> static __device__ __forceinline__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)
|
||||
{
|
||||
if (tid < n)
|
||||
data[tid] = partial_reduction;
|
||||
if (n > 32) __syncthreads();
|
||||
|
||||
if (n > 32)
|
||||
{
|
||||
if (tid < n - 32)
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 32]);
|
||||
if (tid < 16)
|
||||
{
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 8]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 4]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]);
|
||||
}
|
||||
}
|
||||
else if (n > 16)
|
||||
{
|
||||
if (tid < n - 16)
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]);
|
||||
if (tid < 8)
|
||||
{
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 8]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 4]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]);
|
||||
}
|
||||
}
|
||||
else if (n > 8)
|
||||
{
|
||||
if (tid < n - 8)
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 8]);
|
||||
if (tid < 4)
|
||||
{
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 4]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]);
|
||||
}
|
||||
}
|
||||
else if (n > 4)
|
||||
{
|
||||
if (tid < n - 4)
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 4]);
|
||||
if (tid < 2)
|
||||
{
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]);
|
||||
}
|
||||
}
|
||||
else if (n > 2)
|
||||
{
|
||||
if (tid < n - 2)
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]);
|
||||
if (tid < 2)
|
||||
{
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]);
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
template <> struct WarpReductor<64>
|
||||
{
|
||||
template <typename T, typename Op> static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)
|
||||
{
|
||||
data[tid] = partial_reduction;
|
||||
__syncthreads();
|
||||
|
||||
if (tid < 32)
|
||||
{
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 32]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 8 ]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 4 ]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 2 ]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]);
|
||||
}
|
||||
}
|
||||
};
|
||||
template <> struct WarpReductor<32>
|
||||
{
|
||||
template <typename T, typename Op> static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)
|
||||
{
|
||||
data[tid] = partial_reduction;
|
||||
|
||||
if (tid < 16)
|
||||
{
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 8 ]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 4 ]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 2 ]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]);
|
||||
}
|
||||
}
|
||||
};
|
||||
template <> struct WarpReductor<16>
|
||||
{
|
||||
template <typename T, typename Op> static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)
|
||||
{
|
||||
data[tid] = partial_reduction;
|
||||
|
||||
if (tid < 8)
|
||||
{
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 8 ]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 4 ]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 2 ]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]);
|
||||
}
|
||||
}
|
||||
};
|
||||
template <> struct WarpReductor<8>
|
||||
{
|
||||
template <typename T, typename Op> static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)
|
||||
{
|
||||
data[tid] = partial_reduction;
|
||||
|
||||
if (tid < 4)
|
||||
{
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 4 ]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 2 ]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <bool warp> struct ReductionDispatcher;
|
||||
template <> struct ReductionDispatcher<true>
|
||||
{
|
||||
template <int n, typename T, typename Op> static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)
|
||||
{
|
||||
WarpReductor<n>::reduce(data, partial_reduction, tid, op);
|
||||
}
|
||||
};
|
||||
template <> struct ReductionDispatcher<false>
|
||||
{
|
||||
template <int n, typename T, typename Op> static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)
|
||||
{
|
||||
if (tid < n)
|
||||
data[tid] = partial_reduction;
|
||||
__syncthreads();
|
||||
|
||||
|
||||
if (n == 512) { if (tid < 256) { data[tid] = partial_reduction = op(partial_reduction, data[tid + 256]); } __syncthreads(); }
|
||||
if (n >= 256) { if (tid < 128) { data[tid] = partial_reduction = op(partial_reduction, data[tid + 128]); } __syncthreads(); }
|
||||
if (n >= 128) { if (tid < 64) { data[tid] = partial_reduction = op(partial_reduction, data[tid + 64]); } __syncthreads(); }
|
||||
|
||||
if (tid < 32)
|
||||
{
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 32]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 8]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 4]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
template <int n> struct PredValWarpReductor;
|
||||
template <> struct PredValWarpReductor<64>
|
||||
{
|
||||
template <typename T, typename V, typename Pred>
|
||||
static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred)
|
||||
{
|
||||
if (tid < 32)
|
||||
{
|
||||
myData = sdata[tid];
|
||||
myVal = sval[tid];
|
||||
|
||||
T reg = sdata[tid + 32];
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval[tid] = myVal = sval[tid + 32];
|
||||
}
|
||||
|
||||
reg = sdata[tid + 16];
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval[tid] = myVal = sval[tid + 16];
|
||||
}
|
||||
|
||||
reg = sdata[tid + 8];
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval[tid] = myVal = sval[tid + 8];
|
||||
}
|
||||
|
||||
reg = sdata[tid + 4];
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval[tid] = myVal = sval[tid + 4];
|
||||
}
|
||||
|
||||
reg = sdata[tid + 2];
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval[tid] = myVal = sval[tid + 2];
|
||||
}
|
||||
|
||||
reg = sdata[tid + 1];
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval[tid] = myVal = sval[tid + 1];
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
template <> struct PredValWarpReductor<32>
|
||||
{
|
||||
template <typename T, typename V, typename Pred>
|
||||
static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred)
|
||||
{
|
||||
if (tid < 16)
|
||||
{
|
||||
myData = sdata[tid];
|
||||
myVal = sval[tid];
|
||||
|
||||
T reg = sdata[tid + 16];
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval[tid] = myVal = sval[tid + 16];
|
||||
}
|
||||
|
||||
reg = sdata[tid + 8];
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval[tid] = myVal = sval[tid + 8];
|
||||
}
|
||||
|
||||
reg = sdata[tid + 4];
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval[tid] = myVal = sval[tid + 4];
|
||||
}
|
||||
|
||||
reg = sdata[tid + 2];
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval[tid] = myVal = sval[tid + 2];
|
||||
}
|
||||
|
||||
reg = sdata[tid + 1];
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval[tid] = myVal = sval[tid + 1];
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <> struct PredValWarpReductor<16>
|
||||
{
|
||||
template <typename T, typename V, typename Pred>
|
||||
static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred)
|
||||
{
|
||||
if (tid < 8)
|
||||
{
|
||||
myData = sdata[tid];
|
||||
myVal = sval[tid];
|
||||
|
||||
T reg = reg = sdata[tid + 8];
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval[tid] = myVal = sval[tid + 8];
|
||||
}
|
||||
|
||||
reg = sdata[tid + 4];
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval[tid] = myVal = sval[tid + 4];
|
||||
}
|
||||
|
||||
reg = sdata[tid + 2];
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval[tid] = myVal = sval[tid + 2];
|
||||
}
|
||||
|
||||
reg = sdata[tid + 1];
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval[tid] = myVal = sval[tid + 1];
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
template <> struct PredValWarpReductor<8>
|
||||
{
|
||||
template <typename T, typename V, typename Pred>
|
||||
static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred)
|
||||
{
|
||||
if (tid < 4)
|
||||
{
|
||||
myData = sdata[tid];
|
||||
myVal = sval[tid];
|
||||
|
||||
T reg = reg = sdata[tid + 4];
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval[tid] = myVal = sval[tid + 4];
|
||||
}
|
||||
|
||||
reg = sdata[tid + 2];
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval[tid] = myVal = sval[tid + 2];
|
||||
}
|
||||
|
||||
reg = sdata[tid + 1];
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval[tid] = myVal = sval[tid + 1];
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <bool warp> struct PredValReductionDispatcher;
|
||||
template <> struct PredValReductionDispatcher<true>
|
||||
{
|
||||
template <int n, typename T, typename V, typename Pred> static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred)
|
||||
{
|
||||
PredValWarpReductor<n>::reduce(myData, myVal, sdata, sval, tid, pred);
|
||||
}
|
||||
};
|
||||
template <> struct PredValReductionDispatcher<false>
|
||||
{
|
||||
template <int n, typename T, typename V, typename Pred> static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred)
|
||||
{
|
||||
myData = sdata[tid];
|
||||
myVal = sval[tid];
|
||||
|
||||
if (n >= 512 && tid < 256)
|
||||
{
|
||||
T reg = sdata[tid + 256];
|
||||
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval[tid] = myVal = sval[tid + 256];
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
if (n >= 256 && tid < 128)
|
||||
{
|
||||
T reg = sdata[tid + 128];
|
||||
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval[tid] = myVal = sval[tid + 128];
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
if (n >= 128 && tid < 64)
|
||||
{
|
||||
T reg = sdata[tid + 64];
|
||||
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval[tid] = myVal = sval[tid + 64];
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
if (tid < 32)
|
||||
{
|
||||
if (n >= 64)
|
||||
{
|
||||
T reg = sdata[tid + 32];
|
||||
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval[tid] = myVal = sval[tid + 32];
|
||||
}
|
||||
}
|
||||
if (n >= 32)
|
||||
{
|
||||
T reg = sdata[tid + 16];
|
||||
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval[tid] = myVal = sval[tid + 16];
|
||||
}
|
||||
}
|
||||
if (n >= 16)
|
||||
{
|
||||
T reg = sdata[tid + 8];
|
||||
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval[tid] = myVal = sval[tid + 8];
|
||||
}
|
||||
}
|
||||
if (n >= 8)
|
||||
{
|
||||
T reg = sdata[tid + 4];
|
||||
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval[tid] = myVal = sval[tid + 4];
|
||||
}
|
||||
}
|
||||
if (n >= 4)
|
||||
{
|
||||
T reg = sdata[tid + 2];
|
||||
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval[tid] = myVal = sval[tid + 2];
|
||||
}
|
||||
}
|
||||
if (n >= 2)
|
||||
{
|
||||
T reg = sdata[tid + 1];
|
||||
|
||||
if (pred(reg, myData))
|
||||
{
|
||||
sdata[tid] = myData = reg;
|
||||
sval[tid] = myVal = sval[tid + 1];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// Vector Distance
|
||||
|
||||
template <int THREAD_DIM, int N> struct UnrollVecDiffCached
|
||||
{
|
||||
template <typename Dist, typename T1, typename T2>
|
||||
static __device__ void calcCheck(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, int ind)
|
||||
{
|
||||
if (ind < len)
|
||||
{
|
||||
T1 val1 = *vecCached++;
|
||||
|
||||
T2 val2;
|
||||
ForceGlob<T2>::Load(vecGlob, ind, val2);
|
||||
|
||||
dist.reduceIter(val1, val2);
|
||||
|
||||
UnrollVecDiffCached<THREAD_DIM, N - 1>::calcCheck(vecCached, vecGlob, len, dist, ind + THREAD_DIM);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Dist, typename T1, typename T2>
|
||||
static __device__ void calcWithoutCheck(const T1* vecCached, const T2* vecGlob, Dist& dist)
|
||||
{
|
||||
T1 val1 = *vecCached++;
|
||||
|
||||
T2 val2;
|
||||
ForceGlob<T2>::Load(vecGlob, 0, val2);
|
||||
vecGlob += THREAD_DIM;
|
||||
|
||||
dist.reduceIter(val1, val2);
|
||||
|
||||
UnrollVecDiffCached<THREAD_DIM, N - 1>::calcWithoutCheck(vecCached, vecGlob, dist);
|
||||
}
|
||||
};
|
||||
template <int THREAD_DIM> struct UnrollVecDiffCached<THREAD_DIM, 0>
|
||||
{
|
||||
template <typename Dist, typename T1, typename T2>
|
||||
static __device__ __forceinline__ void calcCheck(const T1*, const T2*, int, Dist&, int)
|
||||
{
|
||||
}
|
||||
|
||||
template <typename Dist, typename T1, typename T2>
|
||||
static __device__ __forceinline__ void calcWithoutCheck(const T1*, const T2*, Dist&)
|
||||
{
|
||||
}
|
||||
};
|
||||
|
||||
template <int THREAD_DIM, int MAX_LEN, bool LEN_EQ_MAX_LEN> struct VecDiffCachedCalculator;
|
||||
template <int THREAD_DIM, int MAX_LEN> struct VecDiffCachedCalculator<THREAD_DIM, MAX_LEN, false>
|
||||
{
|
||||
template <typename Dist, typename T1, typename T2>
|
||||
static __device__ __forceinline__ void calc(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, int tid)
|
||||
{
|
||||
UnrollVecDiffCached<THREAD_DIM, MAX_LEN / THREAD_DIM>::calcCheck(vecCached, vecGlob, len, dist, tid);
|
||||
}
|
||||
};
|
||||
template <int THREAD_DIM, int MAX_LEN> struct VecDiffCachedCalculator<THREAD_DIM, MAX_LEN, true>
|
||||
{
|
||||
template <typename Dist, typename T1, typename T2>
|
||||
static __device__ __forceinline__ void calc(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, int tid)
|
||||
{
|
||||
UnrollVecDiffCached<THREAD_DIM, MAX_LEN / THREAD_DIM>::calcWithoutCheck(vecCached, vecGlob + tid, dist);
|
||||
}
|
||||
};
|
||||
}
|
||||
}}}
|
||||
|
||||
#endif // __OPENCV_GPU_UTILITY_DETAIL_HPP__
|
@ -55,7 +55,7 @@ namespace cv
|
||||
{
|
||||
#if __CUDA_ARCH__ >= 200
|
||||
(void)cta_buffer;
|
||||
return __ballot(predicat);
|
||||
return __ballot(predicate);
|
||||
#else
|
||||
int tid = threadIdx.x;
|
||||
cta_buffer[tid] = predicate ? (1 << (tid & 31)) : 0;
|
||||
|
@ -47,6 +47,7 @@
|
||||
#include "internal_shared.hpp"
|
||||
#include "saturate_cast.hpp"
|
||||
#include "vec_traits.hpp"
|
||||
#include "type_traits.hpp"
|
||||
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
@ -57,55 +58,188 @@ namespace cv { namespace gpu { namespace device
|
||||
|
||||
// Arithmetic Operations
|
||||
|
||||
using thrust::plus;
|
||||
using thrust::minus;
|
||||
using thrust::multiplies;
|
||||
using thrust::divides;
|
||||
using thrust::modulus;
|
||||
using thrust::negate;
|
||||
template <typename T> struct plus : binary_function<T, T, T>
|
||||
{
|
||||
__device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
|
||||
{
|
||||
return a + b;
|
||||
}
|
||||
};
|
||||
template <typename T> struct minus : binary_function<T, T, T>
|
||||
{
|
||||
__device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
|
||||
{
|
||||
return a - b;
|
||||
}
|
||||
};
|
||||
template <typename T> struct multiplies : binary_function<T, T, T>
|
||||
{
|
||||
__device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
|
||||
{
|
||||
return a * b;
|
||||
}
|
||||
};
|
||||
template <typename T> struct divides : binary_function<T, T, T>
|
||||
{
|
||||
__device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
|
||||
{
|
||||
return a / b;
|
||||
}
|
||||
};
|
||||
template <typename T> struct modulus : binary_function<T, T, T>
|
||||
{
|
||||
__device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
|
||||
{
|
||||
return a % b;
|
||||
}
|
||||
};
|
||||
template <typename T> struct negate : unary_function<T, T>
|
||||
{
|
||||
__device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a) const
|
||||
{
|
||||
return -a;
|
||||
}
|
||||
};
|
||||
|
||||
// Comparison Operations
|
||||
|
||||
using thrust::equal_to;
|
||||
using thrust::not_equal_to;
|
||||
using thrust::greater;
|
||||
using thrust::less;
|
||||
using thrust::greater_equal;
|
||||
using thrust::less_equal;
|
||||
template <typename T> struct equal_to : binary_function<T, T, bool>
|
||||
{
|
||||
__device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
|
||||
{
|
||||
return a == b;
|
||||
}
|
||||
};
|
||||
template <typename T> struct not_equal_to : binary_function<T, T, bool>
|
||||
{
|
||||
__device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
|
||||
{
|
||||
return a != b;
|
||||
}
|
||||
};
|
||||
template <typename T> struct greater : binary_function<T, T, bool>
|
||||
{
|
||||
__device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
|
||||
{
|
||||
return a > b;
|
||||
}
|
||||
};
|
||||
template <typename T> struct less : binary_function<T, T, bool>
|
||||
{
|
||||
__device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
|
||||
{
|
||||
return a < b;
|
||||
}
|
||||
};
|
||||
template <typename T> struct greater_equal : binary_function<T, T, bool>
|
||||
{
|
||||
__device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
|
||||
{
|
||||
return a >= b;
|
||||
}
|
||||
};
|
||||
template <typename T> struct less_equal : binary_function<T, T, bool>
|
||||
{
|
||||
__device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
|
||||
{
|
||||
return a <= b;
|
||||
}
|
||||
};
|
||||
|
||||
// Logical Operations
|
||||
|
||||
using thrust::logical_and;
|
||||
using thrust::logical_or;
|
||||
using thrust::logical_not;
|
||||
template <typename T> struct logical_and : binary_function<T, T, bool>
|
||||
{
|
||||
__device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
|
||||
{
|
||||
return a && b;
|
||||
}
|
||||
};
|
||||
template <typename T> struct logical_or : binary_function<T, T, bool>
|
||||
{
|
||||
__device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
|
||||
{
|
||||
return a || b;
|
||||
}
|
||||
};
|
||||
template <typename T> struct logical_not : unary_function<T, bool>
|
||||
{
|
||||
__device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a) const
|
||||
{
|
||||
return !a;
|
||||
}
|
||||
};
|
||||
|
||||
// Bitwise Operations
|
||||
|
||||
using thrust::bit_and;
|
||||
using thrust::bit_or;
|
||||
using thrust::bit_xor;
|
||||
template <typename T> struct bit_and : binary_function<T, T, T>
|
||||
{
|
||||
__device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
|
||||
{
|
||||
return a & b;
|
||||
}
|
||||
};
|
||||
template <typename T> struct bit_or : binary_function<T, T, T>
|
||||
{
|
||||
__device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
|
||||
{
|
||||
return a | b;
|
||||
}
|
||||
};
|
||||
template <typename T> struct bit_xor : binary_function<T, T, T>
|
||||
{
|
||||
__device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
|
||||
{
|
||||
return a ^ b;
|
||||
}
|
||||
};
|
||||
template <typename T> struct bit_not : unary_function<T, T>
|
||||
{
|
||||
__forceinline__ __device__ T operator ()(const T& v) const {return ~v;}
|
||||
__device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType v) const
|
||||
{
|
||||
return ~v;
|
||||
}
|
||||
};
|
||||
|
||||
// Generalized Identity Operations
|
||||
|
||||
using thrust::identity;
|
||||
using thrust::project1st;
|
||||
using thrust::project2nd;
|
||||
template <typename T> struct identity : unary_function<T, T>
|
||||
{
|
||||
__device__ __forceinline__ typename TypeTraits<T>::ParameterType operator()(typename TypeTraits<T>::ParameterType x) const
|
||||
{
|
||||
return x;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T1, typename T2> struct project1st : binary_function<T1, T2, T1>
|
||||
{
|
||||
__device__ __forceinline__ typename TypeTraits<T1>::ParameterType operator()(typename TypeTraits<T1>::ParameterType lhs, typename TypeTraits<T2>::ParameterType rhs) const
|
||||
{
|
||||
return lhs;
|
||||
}
|
||||
};
|
||||
template <typename T1, typename T2> struct project2nd : binary_function<T1, T2, T2>
|
||||
{
|
||||
__device__ __forceinline__ typename TypeTraits<T2>::ParameterType operator()(typename TypeTraits<T1>::ParameterType lhs, typename TypeTraits<T2>::ParameterType rhs) const
|
||||
{
|
||||
return rhs;
|
||||
}
|
||||
};
|
||||
|
||||
// Min/Max Operations
|
||||
|
||||
#define OPENCV_GPU_IMPLEMENT_MINMAX(name, type, op) \
|
||||
template <> struct name<type> : binary_function<type, type, type> \
|
||||
{ \
|
||||
__forceinline__ __device__ type operator()(type lhs, type rhs) const {return op(lhs, rhs);} \
|
||||
__device__ __forceinline__ type operator()(type lhs, type rhs) const {return op(lhs, rhs);} \
|
||||
};
|
||||
|
||||
template <typename T> struct maximum : binary_function<T, T, T>
|
||||
{
|
||||
__forceinline__ __device__ T operator()(const T& lhs, const T& rhs) const {return lhs < rhs ? rhs : lhs;}
|
||||
__device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType lhs, typename TypeTraits<T>::ParameterType rhs) const
|
||||
{
|
||||
return lhs < rhs ? rhs : lhs;
|
||||
}
|
||||
};
|
||||
OPENCV_GPU_IMPLEMENT_MINMAX(maximum, uchar, max)
|
||||
OPENCV_GPU_IMPLEMENT_MINMAX(maximum, schar, max)
|
||||
@ -119,7 +253,10 @@ namespace cv { namespace gpu { namespace device
|
||||
|
||||
template <typename T> struct minimum : binary_function<T, T, T>
|
||||
{
|
||||
__forceinline__ __device__ T operator()(const T &lhs, const T &rhs) const {return lhs < rhs ? lhs : rhs;}
|
||||
__device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType lhs, typename TypeTraits<T>::ParameterType rhs) const
|
||||
{
|
||||
return lhs < rhs ? lhs : rhs;
|
||||
}
|
||||
};
|
||||
OPENCV_GPU_IMPLEMENT_MINMAX(minimum, uchar, min)
|
||||
OPENCV_GPU_IMPLEMENT_MINMAX(minimum, schar, min)
|
||||
@ -138,14 +275,14 @@ namespace cv { namespace gpu { namespace device
|
||||
#define OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(func) \
|
||||
template <typename T> struct func ## _func : unary_function<T, float> \
|
||||
{ \
|
||||
__forceinline__ __device__ float operator ()(const T& v) const \
|
||||
__device__ __forceinline__ float operator ()(typename TypeTraits<T>::ParameterType v) const \
|
||||
{ \
|
||||
return func ## f(v); \
|
||||
} \
|
||||
}; \
|
||||
template <> struct func ## _func<double> : unary_function<double, double> \
|
||||
{ \
|
||||
__forceinline__ __device__ double operator ()(double v) const \
|
||||
__device__ __forceinline__ double operator ()(double v) const \
|
||||
{ \
|
||||
return func(v); \
|
||||
} \
|
||||
@ -153,14 +290,14 @@ namespace cv { namespace gpu { namespace device
|
||||
#define OPENCV_GPU_IMPLEMENT_BIN_FUNCTOR(func) \
|
||||
template <typename T> struct func ## _func : binary_function<T, T, float> \
|
||||
{ \
|
||||
__forceinline__ __device__ float operator ()(const T& v1, const T& v2) const \
|
||||
__device__ __forceinline__ float operator ()(typename TypeTraits<T>::ParameterType v1, typename TypeTraits<T>::ParameterType v2) const \
|
||||
{ \
|
||||
return func ## f(v1, v2); \
|
||||
} \
|
||||
}; \
|
||||
template <> struct func ## _func<double> : binary_function<double, double, double> \
|
||||
{ \
|
||||
__forceinline__ __device__ double operator ()(double v1, double v2) const \
|
||||
__device__ __forceinline__ double operator ()(double v1, double v2) const \
|
||||
{ \
|
||||
return func(v1, v2); \
|
||||
} \
|
||||
@ -196,7 +333,7 @@ namespace cv { namespace gpu { namespace device
|
||||
|
||||
template<typename T> struct hypot_sqr_func : binary_function<T, T, float>
|
||||
{
|
||||
__forceinline__ __device__ T operator ()(T src1, T src2) const
|
||||
__device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType src1, typename TypeTraits<T>::ParameterType src2) const
|
||||
{
|
||||
return src1 * src1 + src2 * src2;
|
||||
}
|
||||
@ -206,7 +343,7 @@ namespace cv { namespace gpu { namespace device
|
||||
|
||||
template <typename T, typename D> struct saturate_cast_func : unary_function<T, D>
|
||||
{
|
||||
__forceinline__ __device__ D operator ()(const T& v) const
|
||||
__device__ __forceinline__ D operator ()(typename TypeTraits<T>::ParameterType v) const
|
||||
{
|
||||
return saturate_cast<D>(v);
|
||||
}
|
||||
@ -216,11 +353,11 @@ namespace cv { namespace gpu { namespace device
|
||||
|
||||
template <typename T> struct thresh_binary_func : unary_function<T, T>
|
||||
{
|
||||
__forceinline__ __host__ __device__ thresh_binary_func(T thresh_, T maxVal_) : thresh(thresh_), maxVal(maxVal_) {}
|
||||
__host__ __device__ __forceinline__ thresh_binary_func(T thresh_, T maxVal_) : thresh(thresh_), maxVal(maxVal_) {}
|
||||
|
||||
__forceinline__ __device__ T operator()(const T& src) const
|
||||
__device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType src) const
|
||||
{
|
||||
return src > thresh ? maxVal : 0;
|
||||
return (src > thresh) * maxVal;
|
||||
}
|
||||
|
||||
const T thresh;
|
||||
@ -228,11 +365,11 @@ namespace cv { namespace gpu { namespace device
|
||||
};
|
||||
template <typename T> struct thresh_binary_inv_func : unary_function<T, T>
|
||||
{
|
||||
__forceinline__ __host__ __device__ thresh_binary_inv_func(T thresh_, T maxVal_) : thresh(thresh_), maxVal(maxVal_) {}
|
||||
__host__ __device__ __forceinline__ thresh_binary_inv_func(T thresh_, T maxVal_) : thresh(thresh_), maxVal(maxVal_) {}
|
||||
|
||||
__forceinline__ __device__ T operator()(const T& src) const
|
||||
__device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType src) const
|
||||
{
|
||||
return src > thresh ? 0 : maxVal;
|
||||
return (src <= thresh) * maxVal;
|
||||
}
|
||||
|
||||
const T thresh;
|
||||
@ -240,9 +377,9 @@ namespace cv { namespace gpu { namespace device
|
||||
};
|
||||
template <typename T> struct thresh_trunc_func : unary_function<T, T>
|
||||
{
|
||||
explicit __forceinline__ __host__ __device__ thresh_trunc_func(T thresh_, T maxVal_ = 0) : thresh(thresh_) {}
|
||||
explicit __host__ __device__ __forceinline__ thresh_trunc_func(T thresh_, T maxVal_ = 0) : thresh(thresh_) {}
|
||||
|
||||
__forceinline__ __device__ T operator()(const T& src) const
|
||||
__device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType src) const
|
||||
{
|
||||
return minimum<T>()(src, thresh);
|
||||
}
|
||||
@ -251,22 +388,22 @@ namespace cv { namespace gpu { namespace device
|
||||
};
|
||||
template <typename T> struct thresh_to_zero_func : unary_function<T, T>
|
||||
{
|
||||
explicit __forceinline__ __host__ __device__ thresh_to_zero_func(T thresh_, T maxVal_ = 0) : thresh(thresh_) {}
|
||||
explicit __host__ __device__ __forceinline__ thresh_to_zero_func(T thresh_, T maxVal_ = 0) : thresh(thresh_) {}
|
||||
|
||||
__forceinline__ __device__ T operator()(const T& src) const
|
||||
__device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType src) const
|
||||
{
|
||||
return src > thresh ? src : 0;
|
||||
return (src > thresh) * src;
|
||||
}
|
||||
|
||||
const T thresh;
|
||||
};
|
||||
template <typename T> struct thresh_to_zero_inv_func : unary_function<T, T>
|
||||
{
|
||||
explicit __forceinline__ __host__ __device__ thresh_to_zero_inv_func(T thresh_, T maxVal_ = 0) : thresh(thresh_) {}
|
||||
explicit __host__ __device__ __forceinline__ thresh_to_zero_inv_func(T thresh_, T maxVal_ = 0) : thresh(thresh_) {}
|
||||
|
||||
__forceinline__ __device__ T operator()(const T& src) const
|
||||
__device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType src) const
|
||||
{
|
||||
return src > thresh ? 0 : src;
|
||||
return (src <= thresh) * src;
|
||||
}
|
||||
|
||||
const T thresh;
|
||||
@ -274,17 +411,43 @@ namespace cv { namespace gpu { namespace device
|
||||
|
||||
// Function Object Adaptors
|
||||
|
||||
using thrust::unary_negate;
|
||||
using thrust::not1;
|
||||
template <typename Predicate> struct unary_negate : unary_function<typename Predicate::argument_type, bool>
|
||||
{
|
||||
explicit __host__ __device__ __forceinline__ unary_negate(const Predicate& p) : pred(p) {}
|
||||
|
||||
using thrust::binary_negate;
|
||||
using thrust::not2;
|
||||
__device__ __forceinline__ bool operator()(typename TypeTraits<typename Predicate::argument_type>::ParameterType x) const
|
||||
{
|
||||
return !pred(x);
|
||||
}
|
||||
|
||||
const Predicate pred;
|
||||
};
|
||||
template <typename Predicate> __host__ __device__ __forceinline__ unary_negate<Predicate> not1(const Predicate& pred)
|
||||
{
|
||||
return unary_negate<Predicate>(pred);
|
||||
}
|
||||
|
||||
template <typename Predicate> struct binary_negate : binary_function<typename Predicate::first_argument_type, typename Predicate::second_argument_type, bool>
|
||||
{
|
||||
explicit __host__ __device__ __forceinline__ binary_negate(const Predicate& p) : pred(p) {}
|
||||
|
||||
__device__ __forceinline__ bool operator()(typename TypeTraits<typename Predicate::first_argument_type>::ParameterType x, typename TypeTraits<typename Predicate::second_argument_type>::ParameterType y) const
|
||||
{
|
||||
return !pred(x,y);
|
||||
}
|
||||
|
||||
const Predicate pred;
|
||||
};
|
||||
template <typename BinaryPredicate> __host__ __device__ __forceinline__ binary_negate<BinaryPredicate> not2(const BinaryPredicate& pred)
|
||||
{
|
||||
return binary_negate<BinaryPredicate>(pred);
|
||||
}
|
||||
|
||||
template <typename Op> struct binder1st : unary_function<typename Op::second_argument_type, typename Op::result_type>
|
||||
{
|
||||
__forceinline__ __host__ __device__ binder1st(const Op& op_, const typename Op::first_argument_type& arg1_) : op(op_), arg1(arg1_) {}
|
||||
__host__ __device__ __forceinline__ binder1st(const Op& op_, const typename Op::first_argument_type& arg1_) : op(op_), arg1(arg1_) {}
|
||||
|
||||
__forceinline__ __device__ typename Op::result_type operator ()(const typename Op::second_argument_type& a) const
|
||||
__device__ __forceinline__ typename Op::result_type operator ()(typename TypeTraits<typename Op::second_argument_type>::ParameterType a) const
|
||||
{
|
||||
return op(arg1, a);
|
||||
}
|
||||
@ -292,15 +455,16 @@ namespace cv { namespace gpu { namespace device
|
||||
const Op op;
|
||||
const typename Op::first_argument_type arg1;
|
||||
};
|
||||
template <typename Op, typename T> static __forceinline__ __host__ __device__ binder1st<Op> bind1st(const Op& op, const T& x)
|
||||
template <typename Op, typename T> __host__ __device__ __forceinline__ binder1st<Op> bind1st(const Op& op, const T& x)
|
||||
{
|
||||
return binder1st<Op>(op, typename Op::first_argument_type(x));
|
||||
}
|
||||
|
||||
template <typename Op> struct binder2nd : unary_function<typename Op::first_argument_type, typename Op::result_type>
|
||||
{
|
||||
__forceinline__ __host__ __device__ binder2nd(const Op& op_, const typename Op::second_argument_type& arg2_) : op(op_), arg2(arg2_) {}
|
||||
__host__ __device__ __forceinline__ binder2nd(const Op& op_, const typename Op::second_argument_type& arg2_) : op(op_), arg2(arg2_) {}
|
||||
|
||||
__forceinline__ __device__ typename Op::result_type operator ()(const typename Op::first_argument_type& a) const
|
||||
__forceinline__ __device__ typename Op::result_type operator ()(typename TypeTraits<typename Op::first_argument_type>::ParameterType a) const
|
||||
{
|
||||
return op(a, arg2);
|
||||
}
|
||||
@ -308,7 +472,7 @@ namespace cv { namespace gpu { namespace device
|
||||
const Op op;
|
||||
const typename Op::second_argument_type arg2;
|
||||
};
|
||||
template <typename Op, typename T> static __forceinline__ __host__ __device__ binder2nd<Op> bind2nd(const Op& op, const T& x)
|
||||
template <typename Op, typename T> __host__ __device__ __forceinline__ binder2nd<Op> bind2nd(const Op& op, const T& x)
|
||||
{
|
||||
return binder2nd<Op>(op, typename Op::second_argument_type(x));
|
||||
}
|
||||
@ -317,24 +481,28 @@ namespace cv { namespace gpu { namespace device
|
||||
|
||||
template <typename F> struct IsUnaryFunction
|
||||
{
|
||||
struct Yes {};
|
||||
typedef char Yes;
|
||||
struct No {Yes a[2];};
|
||||
|
||||
template <typename T, typename D> static Yes check(unary_function<T, D>*);
|
||||
template <typename T, typename D> static Yes check(unary_function<T, D>);
|
||||
static No check(...);
|
||||
|
||||
enum { value = (sizeof(check((F*)0)) == sizeof(Yes)) };
|
||||
static F makeF();
|
||||
|
||||
enum { value = (sizeof(check(makeF())) == sizeof(Yes)) };
|
||||
};
|
||||
|
||||
template <typename F> struct IsBinaryFunction
|
||||
{
|
||||
struct Yes {};
|
||||
typedef char Yes;
|
||||
struct No {Yes a[2];};
|
||||
|
||||
template <typename T1, typename T2, typename D> static Yes check(binary_function<T1, T2, D>*);
|
||||
template <typename T1, typename T2, typename D> static Yes check(binary_function<T1, T2, D>);
|
||||
static No check(...);
|
||||
|
||||
enum { value = (sizeof(check((F*)0)) == sizeof(Yes)) };
|
||||
static F makeF();
|
||||
|
||||
enum { value = (sizeof(check(makeF())) == sizeof(Yes)) };
|
||||
};
|
||||
|
||||
namespace detail
|
||||
|
@ -47,29 +47,29 @@
|
||||
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(uchar v) { return _Tp(v); }
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(schar v) { return _Tp(v); }
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(ushort v) { return _Tp(v); }
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(short v) { return _Tp(v); }
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(uint v) { return _Tp(v); }
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(int v) { return _Tp(v); }
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(float v) { return _Tp(v); }
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(double v) { return _Tp(v); }
|
||||
template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(uchar v) { return _Tp(v); }
|
||||
template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(schar v) { return _Tp(v); }
|
||||
template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(ushort v) { return _Tp(v); }
|
||||
template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(short v) { return _Tp(v); }
|
||||
template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(uint v) { return _Tp(v); }
|
||||
template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(int v) { return _Tp(v); }
|
||||
template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(float v) { return _Tp(v); }
|
||||
template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(double v) { return _Tp(v); }
|
||||
|
||||
template<> static __device__ __forceinline__ uchar saturate_cast<uchar>(schar v)
|
||||
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(schar v)
|
||||
{ return (uchar)max((int)v, 0); }
|
||||
template<> static __device__ __forceinline__ uchar saturate_cast<uchar>(ushort v)
|
||||
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(ushort v)
|
||||
{ return (uchar)min((uint)v, (uint)UCHAR_MAX); }
|
||||
template<> static __device__ __forceinline__ uchar saturate_cast<uchar>(int v)
|
||||
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(int v)
|
||||
{ return (uchar)((uint)v <= UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0); }
|
||||
template<> static __device__ __forceinline__ uchar saturate_cast<uchar>(uint v)
|
||||
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(uint v)
|
||||
{ return (uchar)min(v, (uint)UCHAR_MAX); }
|
||||
template<> static __device__ __forceinline__ uchar saturate_cast<uchar>(short v)
|
||||
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(short v)
|
||||
{ return saturate_cast<uchar>((uint)v); }
|
||||
|
||||
template<> static __device__ __forceinline__ uchar saturate_cast<uchar>(float v)
|
||||
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(float v)
|
||||
{ int iv = __float2int_rn(v); return saturate_cast<uchar>(iv); }
|
||||
template<> static __device__ __forceinline__ uchar saturate_cast<uchar>(double v)
|
||||
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(double v)
|
||||
{
|
||||
#if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130
|
||||
int iv = __double2int_rn(v); return saturate_cast<uchar>(iv);
|
||||
@ -78,23 +78,23 @@ namespace cv { namespace gpu { namespace device
|
||||
#endif
|
||||
}
|
||||
|
||||
template<> static __device__ __forceinline__ schar saturate_cast<schar>(uchar v)
|
||||
template<> __device__ __forceinline__ schar saturate_cast<schar>(uchar v)
|
||||
{ return (schar)min((int)v, SCHAR_MAX); }
|
||||
template<> static __device__ __forceinline__ schar saturate_cast<schar>(ushort v)
|
||||
template<> __device__ __forceinline__ schar saturate_cast<schar>(ushort v)
|
||||
{ return (schar)min((uint)v, (uint)SCHAR_MAX); }
|
||||
template<> static __device__ __forceinline__ schar saturate_cast<schar>(int v)
|
||||
template<> __device__ __forceinline__ schar saturate_cast<schar>(int v)
|
||||
{
|
||||
return (schar)((uint)(v-SCHAR_MIN) <= (uint)UCHAR_MAX ?
|
||||
v : v > 0 ? SCHAR_MAX : SCHAR_MIN);
|
||||
}
|
||||
template<> static __device__ __forceinline__ schar saturate_cast<schar>(short v)
|
||||
template<> __device__ __forceinline__ schar saturate_cast<schar>(short v)
|
||||
{ return saturate_cast<schar>((int)v); }
|
||||
template<> static __device__ __forceinline__ schar saturate_cast<schar>(uint v)
|
||||
template<> __device__ __forceinline__ schar saturate_cast<schar>(uint v)
|
||||
{ return (schar)min(v, (uint)SCHAR_MAX); }
|
||||
|
||||
template<> static __device__ __forceinline__ schar saturate_cast<schar>(float v)
|
||||
template<> __device__ __forceinline__ schar saturate_cast<schar>(float v)
|
||||
{ int iv = __float2int_rn(v); return saturate_cast<schar>(iv); }
|
||||
template<> static __device__ __forceinline__ schar saturate_cast<schar>(double v)
|
||||
template<> __device__ __forceinline__ schar saturate_cast<schar>(double v)
|
||||
{
|
||||
#if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130
|
||||
int iv = __double2int_rn(v); return saturate_cast<schar>(iv);
|
||||
@ -103,17 +103,17 @@ namespace cv { namespace gpu { namespace device
|
||||
#endif
|
||||
}
|
||||
|
||||
template<> static __device__ __forceinline__ ushort saturate_cast<ushort>(schar v)
|
||||
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(schar v)
|
||||
{ return (ushort)max((int)v, 0); }
|
||||
template<> static __device__ __forceinline__ ushort saturate_cast<ushort>(short v)
|
||||
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(short v)
|
||||
{ return (ushort)max((int)v, 0); }
|
||||
template<> static __device__ __forceinline__ ushort saturate_cast<ushort>(int v)
|
||||
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(int v)
|
||||
{ return (ushort)((uint)v <= (uint)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0); }
|
||||
template<> static __device__ __forceinline__ ushort saturate_cast<ushort>(uint v)
|
||||
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(uint v)
|
||||
{ return (ushort)min(v, (uint)USHRT_MAX); }
|
||||
template<> static __device__ __forceinline__ ushort saturate_cast<ushort>(float v)
|
||||
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(float v)
|
||||
{ int iv = __float2int_rn(v); return saturate_cast<ushort>(iv); }
|
||||
template<> static __device__ __forceinline__ ushort saturate_cast<ushort>(double v)
|
||||
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(double v)
|
||||
{
|
||||
#if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130
|
||||
int iv = __double2int_rn(v); return saturate_cast<ushort>(iv);
|
||||
@ -122,18 +122,18 @@ namespace cv { namespace gpu { namespace device
|
||||
#endif
|
||||
}
|
||||
|
||||
template<> static __device__ __forceinline__ short saturate_cast<short>(ushort v)
|
||||
template<> __device__ __forceinline__ short saturate_cast<short>(ushort v)
|
||||
{ return (short)min((int)v, SHRT_MAX); }
|
||||
template<> static __device__ __forceinline__ short saturate_cast<short>(int v)
|
||||
template<> __device__ __forceinline__ short saturate_cast<short>(int v)
|
||||
{
|
||||
return (short)((uint)(v - SHRT_MIN) <= (uint)USHRT_MAX ?
|
||||
v : v > 0 ? SHRT_MAX : SHRT_MIN);
|
||||
}
|
||||
template<> static __device__ __forceinline__ short saturate_cast<short>(uint v)
|
||||
template<> __device__ __forceinline__ short saturate_cast<short>(uint v)
|
||||
{ return (short)min(v, (uint)SHRT_MAX); }
|
||||
template<> static __device__ __forceinline__ short saturate_cast<short>(float v)
|
||||
template<> __device__ __forceinline__ short saturate_cast<short>(float v)
|
||||
{ int iv = __float2int_rn(v); return saturate_cast<short>(iv); }
|
||||
template<> static __device__ __forceinline__ short saturate_cast<short>(double v)
|
||||
template<> __device__ __forceinline__ short saturate_cast<short>(double v)
|
||||
{
|
||||
#if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130
|
||||
int iv = __double2int_rn(v); return saturate_cast<short>(iv);
|
||||
@ -142,8 +142,8 @@ namespace cv { namespace gpu { namespace device
|
||||
#endif
|
||||
}
|
||||
|
||||
template<> static __device__ __forceinline__ int saturate_cast<int>(float v) { return __float2int_rn(v); }
|
||||
template<> static __device__ __forceinline__ int saturate_cast<int>(double v)
|
||||
template<> __device__ __forceinline__ int saturate_cast<int>(float v) { return __float2int_rn(v); }
|
||||
template<> __device__ __forceinline__ int saturate_cast<int>(double v)
|
||||
{
|
||||
#if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130
|
||||
return __double2int_rn(v);
|
||||
@ -152,8 +152,8 @@ namespace cv { namespace gpu { namespace device
|
||||
#endif
|
||||
}
|
||||
|
||||
template<> static __device__ __forceinline__ uint saturate_cast<uint>(float v){ return __float2uint_rn(v); }
|
||||
template<> static __device__ __forceinline__ uint saturate_cast<uint>(double v)
|
||||
template<> __device__ __forceinline__ uint saturate_cast<uint>(float v){ return __float2uint_rn(v); }
|
||||
template<> __device__ __forceinline__ uint saturate_cast<uint>(double v)
|
||||
{
|
||||
#if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130
|
||||
return __double2uint_rn(v);
|
||||
|
@ -43,33 +43,31 @@
|
||||
#ifndef __OPENCV_GPU_TRANSFORM_HPP__
|
||||
#define __OPENCV_GPU_TRANSFORM_HPP__
|
||||
|
||||
#include "detail/transform.hpp"
|
||||
#include "detail/transform_detail.hpp"
|
||||
#include "utility.hpp"
|
||||
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
template <typename T, typename D, typename UnOp>
|
||||
static void transform(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, const UnOp& op, cudaStream_t stream = 0)
|
||||
void transform(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, const UnOp& op, cudaStream_t stream = 0)
|
||||
{
|
||||
detail::transform_caller(src, dst, op, detail::NoMask(), stream);
|
||||
detail::transform_caller(src, dst, op, WithOutMask(), stream);
|
||||
}
|
||||
template <typename T, typename D, typename UnOp>
|
||||
static void transform(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, const PtrStep& mask, const UnOp& op,
|
||||
cudaStream_t stream = 0)
|
||||
void transform(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, const PtrStep& mask, const UnOp& op, cudaStream_t stream = 0)
|
||||
{
|
||||
detail::transform_caller(src, dst, op, detail::MaskReader(mask), stream);
|
||||
detail::transform_caller(src, dst, op, SingleMask(mask), stream);
|
||||
}
|
||||
|
||||
template <typename T1, typename T2, typename D, typename BinOp>
|
||||
static void transform(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst,
|
||||
const BinOp& op, cudaStream_t stream = 0)
|
||||
void transform(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst, const BinOp& op, cudaStream_t stream = 0)
|
||||
{
|
||||
detail::transform_caller(src1, src2, dst, op, detail::NoMask(), stream);
|
||||
detail::transform_caller(src1, src2, dst, op, WithOutMask(), stream);
|
||||
}
|
||||
template <typename T1, typename T2, typename D, typename BinOp>
|
||||
static void transform(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst,
|
||||
const PtrStep& mask, const BinOp& op, cudaStream_t stream = 0)
|
||||
void transform(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst, const PtrStep& mask, const BinOp& op, cudaStream_t stream = 0)
|
||||
{
|
||||
detail::transform_caller(src1, src2, dst, op, detail::MaskReader(mask), stream);
|
||||
detail::transform_caller(src1, src2, dst, op, SingleMask(mask), stream);
|
||||
}
|
||||
}}}
|
||||
|
||||
|
80
modules/gpu/src/opencv2/gpu/device/type_traits.hpp
Normal file
80
modules/gpu/src/opencv2/gpu/device/type_traits.hpp
Normal file
@ -0,0 +1,80 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#ifndef __OPENCV_GPU_TYPE_TRAITS_HPP__
|
||||
#define __OPENCV_GPU_TYPE_TRAITS_HPP__
|
||||
|
||||
#include "detail/type_traits_detail.hpp"
|
||||
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
template <typename T> struct IsSimpleParameter
|
||||
{
|
||||
enum {value = detail::IsIntegral<T>::value || detail::IsFloat<T>::value || detail::PointerTraits<typename detail::ReferenceTraits<T>::type>::value};
|
||||
};
|
||||
|
||||
template <typename T> struct TypeTraits
|
||||
{
|
||||
typedef typename detail::UnConst<T>::type NonConstType;
|
||||
typedef typename detail::UnVolatile<T>::type NonVolatileType;
|
||||
typedef typename detail::UnVolatile<typename detail::UnConst<T>::type>::type UnqualifiedType;
|
||||
typedef typename detail::PointerTraits<UnqualifiedType>::type PointeeType;
|
||||
typedef typename detail::ReferenceTraits<T>::type ReferredType;
|
||||
|
||||
enum { isConst = detail::UnConst<T>::value };
|
||||
enum { isVolatile = detail::UnVolatile<T>::value };
|
||||
|
||||
enum { isReference = detail::ReferenceTraits<UnqualifiedType>::value };
|
||||
enum { isPointer = detail::PointerTraits<typename detail::ReferenceTraits<UnqualifiedType>::type>::value };
|
||||
|
||||
enum { isUnsignedInt = detail::IsUnsignedIntegral<UnqualifiedType>::value };
|
||||
enum { isSignedInt = detail::IsSignedIntergral<UnqualifiedType>::value };
|
||||
enum { isIntegral = detail::IsIntegral<UnqualifiedType>::value };
|
||||
enum { isFloat = detail::IsFloat<UnqualifiedType>::value };
|
||||
enum { isArith = isIntegral || isFloat };
|
||||
enum { isVec = detail::IsVec<UnqualifiedType>::value };
|
||||
|
||||
typedef typename detail::Select<IsSimpleParameter<UnqualifiedType>::value, T, typename detail::AddParameterType<T>::type>::type ParameterType;
|
||||
};
|
||||
}}}
|
||||
|
||||
#endif // __OPENCV_GPU_TYPE_TRAITS_HPP__
|
@ -45,112 +45,275 @@
|
||||
|
||||
#include "internal_shared.hpp"
|
||||
#include "saturate_cast.hpp"
|
||||
|
||||
#ifndef __CUDA_ARCH__
|
||||
#define __CUDA_ARCH__ 0
|
||||
#endif
|
||||
#include "datamov_utils.hpp"
|
||||
#include "functional.hpp"
|
||||
#include "detail/utility_detail.hpp"
|
||||
|
||||
#define OPENCV_GPU_LOG_WARP_SIZE (5)
|
||||
#define OPENCV_GPU_WARP_SIZE (1 << OPENCV_GPU_LOG_WARP_SIZE)
|
||||
#define OPENCV_GPU_LOG_MEM_BANKS ((__CUDA_ARCH__ >= 200) ? 5 : 4) // 32 banks on fermi, 16 on tesla
|
||||
#define OPENCV_GPU_MEM_BANKS (1 << OPENCV_GPU_LOG_MEM_BANKS)
|
||||
|
||||
#if defined(_WIN64) || defined(__LP64__)
|
||||
// 64-bit register modifier for inlined asm
|
||||
#define OPENCV_GPU_ASM_PTR "l"
|
||||
#else
|
||||
// 32-bit register modifier for inlined asm
|
||||
#define OPENCV_GPU_ASM_PTR "r"
|
||||
#endif
|
||||
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
template <typename T> void __host__ __device__ __forceinline__ swap(T& a, T& b)
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// swap
|
||||
|
||||
template <typename T> void __device__ __forceinline__ swap(T& a, T& b)
|
||||
{
|
||||
const T temp = a;
|
||||
a = b;
|
||||
b = temp;
|
||||
}
|
||||
|
||||
// warp-synchronous 32 elements reduction
|
||||
template <typename T, typename Op> __device__ __forceinline__ void warpReduce32(volatile T* data, T& partial_reduction, int tid, const Op& op)
|
||||
{
|
||||
data[tid] = partial_reduction;
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// Mask Reader
|
||||
|
||||
if (tid < 16)
|
||||
struct SingleMask
|
||||
{
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 8 ]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 4 ]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 2 ]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]);
|
||||
}
|
||||
explicit __host__ __device__ __forceinline__ SingleMask(const PtrStep& mask_) : mask(mask_) {}
|
||||
|
||||
__device__ __forceinline__ bool operator()(int y, int x) const
|
||||
{
|
||||
return mask.ptr(y)[x] != 0;
|
||||
}
|
||||
|
||||
// warp-synchronous 16 elements reduction
|
||||
template <typename T, typename Op> __device__ __forceinline__ void warpReduce16(volatile T* data, T& partial_reduction, int tid, const Op& op)
|
||||
{
|
||||
data[tid] = partial_reduction;
|
||||
const PtrStep mask;
|
||||
};
|
||||
|
||||
if (tid < 8)
|
||||
struct MaskCollection
|
||||
{
|
||||
data[tid] = partial_reduction = op(partial_reduction, (T)data[tid + 8 ]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, (T)data[tid + 4 ]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, (T)data[tid + 2 ]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, (T)data[tid + 1 ]);
|
||||
explicit __host__ __device__ __forceinline__ MaskCollection(PtrStep* maskCollection_) : maskCollection(maskCollection_) {}
|
||||
|
||||
__device__ __forceinline__ void next()
|
||||
{
|
||||
curMask = *maskCollection++;
|
||||
}
|
||||
__device__ __forceinline__ void setMask(int z)
|
||||
{
|
||||
curMask = maskCollection[z];
|
||||
}
|
||||
|
||||
// warp-synchronous reduction
|
||||
template <int n, typename T, typename Op> __device__ __forceinline__ void warpReduce(volatile T* data, T& partial_reduction, int tid, const Op& op)
|
||||
__device__ __forceinline__ bool operator()(int y, int x) const
|
||||
{
|
||||
if (tid < n)
|
||||
data[tid] = partial_reduction;
|
||||
uchar val;
|
||||
return curMask.data == 0 || (ForceGlob<uchar>::Load(curMask.ptr(y), x, val), (val != 0));
|
||||
}
|
||||
|
||||
if (n > 16)
|
||||
const PtrStep* maskCollection;
|
||||
PtrStep curMask;
|
||||
};
|
||||
|
||||
struct WithOutMask
|
||||
{
|
||||
if (tid < n - 16)
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]);
|
||||
if (tid < 8)
|
||||
__device__ __forceinline__ void next() const
|
||||
{
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 8]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 4]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]);
|
||||
}
|
||||
}
|
||||
else if (n > 8)
|
||||
__device__ __forceinline__ void setMask(int) const
|
||||
{
|
||||
if (tid < n - 8)
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 8]);
|
||||
if (tid < 4)
|
||||
}
|
||||
|
||||
__device__ __forceinline__ bool operator()(int, int) const
|
||||
{
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 4]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
else if (n > 4)
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// Reduction
|
||||
|
||||
// reduction
|
||||
template <int n, typename T, typename Op> __device__ __forceinline__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)
|
||||
{
|
||||
if (tid < n - 4)
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 4]);
|
||||
if (tid < 2)
|
||||
StaticAssert<n >= 8 && n <= 512>::check();
|
||||
detail::ReductionDispatcher<n <= 64>::reduce<n>(data, partial_reduction, tid, op);
|
||||
}
|
||||
|
||||
template <int n, typename T, typename V, typename Pred>
|
||||
__device__ __forceinline__ void reducePredVal(volatile T* sdata, T& myData, V* sval, V& myVal, int tid, const Pred& pred)
|
||||
{
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]);
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]);
|
||||
StaticAssert<n >= 8 && n <= 512>::check();
|
||||
detail::PredValReductionDispatcher<n <= 64>::reduce<n>(myData, myVal, sdata, sval, tid, pred);
|
||||
}
|
||||
}
|
||||
else if (n > 2)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// Vector Distance
|
||||
|
||||
template <typename T> struct L1Dist
|
||||
{
|
||||
if (tid < n - 2)
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]);
|
||||
if (tid < 2)
|
||||
typedef int value_type;
|
||||
typedef int result_type;
|
||||
|
||||
__device__ __forceinline__ L1Dist() : mySum(0) {}
|
||||
|
||||
__device__ __forceinline__ void reduceIter(int val1, int val2)
|
||||
{
|
||||
data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]);
|
||||
mySum = __sad(val1, val2, mySum);
|
||||
}
|
||||
|
||||
template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(int* smem, int tid)
|
||||
{
|
||||
reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile int>());
|
||||
}
|
||||
|
||||
__device__ __forceinline__ operator int() const
|
||||
{
|
||||
return mySum;
|
||||
}
|
||||
|
||||
int mySum;
|
||||
};
|
||||
template <> struct L1Dist<float>
|
||||
{
|
||||
typedef float value_type;
|
||||
typedef float result_type;
|
||||
|
||||
__device__ __forceinline__ L1Dist() : mySum(0.0f) {}
|
||||
|
||||
__device__ __forceinline__ void reduceIter(float val1, float val2)
|
||||
{
|
||||
mySum += ::fabs(val1 - val2);
|
||||
}
|
||||
|
||||
template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(float* smem, int tid)
|
||||
{
|
||||
reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile float>());
|
||||
}
|
||||
|
||||
__device__ __forceinline__ operator float() const
|
||||
{
|
||||
return mySum;
|
||||
}
|
||||
|
||||
float mySum;
|
||||
};
|
||||
|
||||
struct L2Dist
|
||||
{
|
||||
typedef float value_type;
|
||||
typedef float result_type;
|
||||
|
||||
__device__ __forceinline__ L2Dist() : mySum(0.0f) {}
|
||||
|
||||
__device__ __forceinline__ void reduceIter(float val1, float val2)
|
||||
{
|
||||
float reg = val1 - val2;
|
||||
mySum += reg * reg;
|
||||
}
|
||||
|
||||
template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(float* smem, int tid)
|
||||
{
|
||||
reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile float>());
|
||||
}
|
||||
|
||||
__device__ __forceinline__ operator float() const
|
||||
{
|
||||
return sqrtf(mySum);
|
||||
}
|
||||
|
||||
float mySum;
|
||||
};
|
||||
|
||||
struct HammingDist
|
||||
{
|
||||
typedef int value_type;
|
||||
typedef int result_type;
|
||||
|
||||
__device__ __forceinline__ HammingDist() : mySum(0) {}
|
||||
|
||||
__device__ __forceinline__ void reduceIter(int val1, int val2)
|
||||
{
|
||||
mySum += __popc(val1 ^ val2);
|
||||
}
|
||||
|
||||
template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(int* smem, int tid)
|
||||
{
|
||||
reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile int>());
|
||||
}
|
||||
|
||||
__device__ __forceinline__ operator int() const
|
||||
{
|
||||
return mySum;
|
||||
}
|
||||
|
||||
int mySum;
|
||||
};
|
||||
|
||||
// calc distance between two vectors in global memory
|
||||
template <int THREAD_DIM, typename Dist, typename T1, typename T2>
|
||||
__device__ void calcVecDiffGlobal(const T1* vec1, const T2* vec2, int len, Dist& dist, typename Dist::result_type* smem, int tid)
|
||||
{
|
||||
for (int i = tid; i < len; i += THREAD_DIM)
|
||||
{
|
||||
T1 val1;
|
||||
ForceGlob<T1>::Load(vec1, i, val1);
|
||||
|
||||
T2 val2;
|
||||
ForceGlob<T2>::Load(vec2, i, val2);
|
||||
|
||||
dist.reduceIter(val1, val2);
|
||||
}
|
||||
|
||||
dist.reduceAll<THREAD_DIM>(smem, tid);
|
||||
}
|
||||
|
||||
// calc distance between two vectors, first vector is cached in register or shared memory, second vector is in global memory
|
||||
template <int THREAD_DIM, int MAX_LEN, bool LEN_EQ_MAX_LEN, typename Dist, typename T1, typename T2>
|
||||
__device__ __forceinline__ void calcVecDiffCached(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, typename Dist::result_type* smem, int tid)
|
||||
{
|
||||
detail::VecDiffCachedCalculator<THREAD_DIM, MAX_LEN, LEN_EQ_MAX_LEN>::calc(vecCached, vecGlob, len, dist, tid);
|
||||
|
||||
dist.reduceAll<THREAD_DIM>(smem, tid);
|
||||
}
|
||||
|
||||
// calc distance between two vectors in global memory
|
||||
template <int THREAD_DIM, typename T1> struct VecDiffGlobal
|
||||
{
|
||||
explicit __device__ __forceinline__ VecDiffGlobal(const T1* vec1_, int = 0, void* = 0, int = 0, int = 0)
|
||||
{
|
||||
vec1 = vec1_;
|
||||
}
|
||||
|
||||
template <typename T2, typename Dist>
|
||||
__device__ __forceinline__ void calc(const T2* vec2, int len, Dist& dist, typename Dist::result_type* smem, int tid) const
|
||||
{
|
||||
calcVecDiffGlobal<THREAD_DIM>(vec1, vec2, len, dist, smem, tid);
|
||||
}
|
||||
|
||||
const T1* vec1;
|
||||
};
|
||||
|
||||
// calc distance between two vectors, first vector is cached in register memory, second vector is in global memory
|
||||
template <int THREAD_DIM, int MAX_LEN, bool LEN_EQ_MAX_LEN, typename U> struct VecDiffCachedRegister
|
||||
{
|
||||
template <typename T1> __device__ __forceinline__ VecDiffCachedRegister(const T1* vec1, int len, U* smem, int glob_tid, int tid)
|
||||
{
|
||||
if (glob_tid < len)
|
||||
smem[glob_tid] = vec1[glob_tid];
|
||||
__syncthreads();
|
||||
|
||||
U* vec1ValsPtr = vec1Vals;
|
||||
|
||||
#pragma unroll
|
||||
for (int i = tid; i < MAX_LEN; i += THREAD_DIM)
|
||||
*vec1ValsPtr++ = smem[i];
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
template <typename T2, typename Dist>
|
||||
__device__ __forceinline__ void calc(const T2* vec2, int len, Dist& dist, typename Dist::result_type* smem, int tid) const
|
||||
{
|
||||
calcVecDiffCached<THREAD_DIM, MAX_LEN, LEN_EQ_MAX_LEN>(vec1Vals, vec2, len, dist, smem, tid);
|
||||
}
|
||||
|
||||
U vec1Vals[MAX_LEN / THREAD_DIM];
|
||||
};
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// Solve linear system
|
||||
|
||||
// solve 2x2 linear system Ax=b
|
||||
template <typename T> __device__ __forceinline__ bool solve2x2(const T A[2][2], const T b[2], T x[2])
|
||||
{
|
||||
|
@ -55,7 +55,7 @@ namespace cv { namespace gpu { namespace device
|
||||
template <int cn, typename VecD> struct SatCastHelper;
|
||||
template <typename VecD> struct SatCastHelper<1, VecD>
|
||||
{
|
||||
template <typename VecS> static __device__ VecD cast(const VecS& v)
|
||||
template <typename VecS> static __device__ __forceinline__ VecD cast(const VecS& v)
|
||||
{
|
||||
typedef typename VecTraits<VecD>::elem_type D;
|
||||
return VecTraits<VecD>::make(saturate_cast<D>(v.x));
|
||||
@ -63,7 +63,7 @@ namespace cv { namespace gpu { namespace device
|
||||
};
|
||||
template <typename VecD> struct SatCastHelper<2, VecD>
|
||||
{
|
||||
template <typename VecS> static __device__ VecD cast(const VecS& v)
|
||||
template <typename VecS> static __device__ __forceinline__ VecD cast(const VecS& v)
|
||||
{
|
||||
typedef typename VecTraits<VecD>::elem_type D;
|
||||
return VecTraits<VecD>::make(saturate_cast<D>(v.x), saturate_cast<D>(v.y));
|
||||
@ -71,7 +71,7 @@ namespace cv { namespace gpu { namespace device
|
||||
};
|
||||
template <typename VecD> struct SatCastHelper<3, VecD>
|
||||
{
|
||||
template <typename VecS> static __device__ VecD cast(const VecS& v)
|
||||
template <typename VecS> static __device__ __forceinline__ VecD cast(const VecS& v)
|
||||
{
|
||||
typedef typename VecTraits<VecD>::elem_type D;
|
||||
return VecTraits<VecD>::make(saturate_cast<D>(v.x), saturate_cast<D>(v.y), saturate_cast<D>(v.z));
|
||||
@ -79,72 +79,72 @@ namespace cv { namespace gpu { namespace device
|
||||
};
|
||||
template <typename VecD> struct SatCastHelper<4, VecD>
|
||||
{
|
||||
template <typename VecS> static __device__ VecD cast(const VecS& v)
|
||||
template <typename VecS> static __device__ __forceinline__ VecD cast(const VecS& v)
|
||||
{
|
||||
typedef typename VecTraits<VecD>::elem_type D;
|
||||
return VecTraits<VecD>::make(saturate_cast<D>(v.x), saturate_cast<D>(v.y), saturate_cast<D>(v.z), saturate_cast<D>(v.w));
|
||||
}
|
||||
};
|
||||
|
||||
template <typename VecD, typename VecS> static __device__ VecD saturate_cast_caller(const VecS& v)
|
||||
template <typename VecD, typename VecS> static __device__ __forceinline__ VecD saturate_cast_caller(const VecS& v)
|
||||
{
|
||||
return SatCastHelper<VecTraits<VecD>::cn, VecD>::cast(v);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename _Tp> static __device__ _Tp saturate_cast(const uchar1& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ _Tp saturate_cast(const char1& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ _Tp saturate_cast(const ushort1& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ _Tp saturate_cast(const short1& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ _Tp saturate_cast(const uint1& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ _Tp saturate_cast(const int1& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ _Tp saturate_cast(const float1& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ _Tp saturate_cast(const double1& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uchar1& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const char1& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const ushort1& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const short1& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uint1& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const int1& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const float1& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const double1& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
|
||||
template<typename _Tp> static __device__ _Tp saturate_cast(const uchar2& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ _Tp saturate_cast(const char2& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ _Tp saturate_cast(const ushort2& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ _Tp saturate_cast(const short2& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ _Tp saturate_cast(const uint2& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ _Tp saturate_cast(const int2& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ _Tp saturate_cast(const float2& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ _Tp saturate_cast(const double2& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uchar2& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const char2& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const ushort2& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const short2& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uint2& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const int2& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const float2& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const double2& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
|
||||
template<typename _Tp> static __device__ _Tp saturate_cast(const uchar3& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ _Tp saturate_cast(const char3& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ _Tp saturate_cast(const ushort3& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ _Tp saturate_cast(const short3& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ _Tp saturate_cast(const uint3& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ _Tp saturate_cast(const int3& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ _Tp saturate_cast(const float3& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ _Tp saturate_cast(const double3& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uchar3& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const char3& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const ushort3& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const short3& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uint3& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const int3& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const float3& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const double3& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
|
||||
template<typename _Tp> static __device__ _Tp saturate_cast(const uchar4& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ _Tp saturate_cast(const char4& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ _Tp saturate_cast(const ushort4& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ _Tp saturate_cast(const short4& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ _Tp saturate_cast(const uint4& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ _Tp saturate_cast(const int4& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ _Tp saturate_cast(const float4& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ _Tp saturate_cast(const double4& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uchar4& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const char4& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const ushort4& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const short4& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uint4& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const int4& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const float4& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const double4& v) {return detail::saturate_cast_caller<_Tp>(v);}
|
||||
|
||||
#define OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, op, func) \
|
||||
static __device__ TypeVec<func<type>::result_type, 1>::vec_type op(const type ## 1 & a) \
|
||||
__device__ __forceinline__ TypeVec<func<type>::result_type, 1>::vec_type op(const type ## 1 & a) \
|
||||
{ \
|
||||
func<type> f; \
|
||||
return VecTraits<TypeVec<func<type>::result_type, 1>::vec_type>::make(f(a.x)); \
|
||||
} \
|
||||
static __device__ TypeVec<func<type>::result_type, 2>::vec_type op(const type ## 2 & a) \
|
||||
__device__ __forceinline__ TypeVec<func<type>::result_type, 2>::vec_type op(const type ## 2 & a) \
|
||||
{ \
|
||||
func<type> f; \
|
||||
return VecTraits<TypeVec<func<type>::result_type, 2>::vec_type>::make(f(a.x), f(a.y)); \
|
||||
} \
|
||||
static __device__ TypeVec<func<type>::result_type, 3>::vec_type op(const type ## 3 & a) \
|
||||
__device__ __forceinline__ TypeVec<func<type>::result_type, 3>::vec_type op(const type ## 3 & a) \
|
||||
{ \
|
||||
func<type> f; \
|
||||
return VecTraits<TypeVec<func<type>::result_type, 3>::vec_type>::make(f(a.x), f(a.y), f(a.z)); \
|
||||
} \
|
||||
static __device__ TypeVec<func<type>::result_type, 4>::vec_type op(const type ## 4 & a) \
|
||||
__device__ __forceinline__ TypeVec<func<type>::result_type, 4>::vec_type op(const type ## 4 & a) \
|
||||
{ \
|
||||
func<type> f; \
|
||||
return VecTraits<TypeVec<func<type>::result_type, 4>::vec_type>::make(f(a.x), f(a.y), f(a.z), f(a.w)); \
|
||||
@ -195,70 +195,70 @@ namespace cv { namespace gpu { namespace device
|
||||
}
|
||||
|
||||
#define OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, op, func) \
|
||||
static __device__ TypeVec<func<type>::result_type, 1>::vec_type op(const type ## 1 & a, const type ## 1 & b) \
|
||||
__device__ __forceinline__ TypeVec<func<type>::result_type, 1>::vec_type op(const type ## 1 & a, const type ## 1 & b) \
|
||||
{ \
|
||||
func<type> f; \
|
||||
return VecTraits<TypeVec<func<type>::result_type, 1>::vec_type>::make(f(a.x, b.x)); \
|
||||
} \
|
||||
template <typename T> \
|
||||
static __device__ typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 1>::vec_type op(const type ## 1 & v, T s) \
|
||||
__device__ __forceinline__ typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 1>::vec_type op(const type ## 1 & v, T s) \
|
||||
{ \
|
||||
func<typename detail::BinOpTraits<type, T>::argument_type> f; \
|
||||
return VecTraits<typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 1>::vec_type>::make(f(v.x, s)); \
|
||||
} \
|
||||
template <typename T> \
|
||||
static __device__ typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 1>::vec_type op(T s, const type ## 1 & v) \
|
||||
__device__ __forceinline__ typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 1>::vec_type op(T s, const type ## 1 & v) \
|
||||
{ \
|
||||
func<typename detail::BinOpTraits<type, T>::argument_type> f; \
|
||||
return VecTraits<typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 1>::vec_type>::make(f(s, v.x)); \
|
||||
} \
|
||||
static __device__ TypeVec<func<type>::result_type, 2>::vec_type op(const type ## 2 & a, const type ## 2 & b) \
|
||||
__device__ __forceinline__ TypeVec<func<type>::result_type, 2>::vec_type op(const type ## 2 & a, const type ## 2 & b) \
|
||||
{ \
|
||||
func<type> f; \
|
||||
return VecTraits<TypeVec<func<type>::result_type, 2>::vec_type>::make(f(a.x, b.x), f(a.y, b.y)); \
|
||||
} \
|
||||
template <typename T> \
|
||||
static __device__ typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 2>::vec_type op(const type ## 2 & v, T s) \
|
||||
__device__ __forceinline__ typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 2>::vec_type op(const type ## 2 & v, T s) \
|
||||
{ \
|
||||
func<typename detail::BinOpTraits<type, T>::argument_type> f; \
|
||||
return VecTraits<typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 2>::vec_type>::make(f(v.x, s), f(v.y, s)); \
|
||||
} \
|
||||
template <typename T> \
|
||||
static __device__ typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 2>::vec_type op(T s, const type ## 2 & v) \
|
||||
__device__ __forceinline__ typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 2>::vec_type op(T s, const type ## 2 & v) \
|
||||
{ \
|
||||
func<typename detail::BinOpTraits<type, T>::argument_type> f; \
|
||||
return VecTraits<typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 2>::vec_type>::make(f(s, v.x), f(s, v.y)); \
|
||||
} \
|
||||
static __device__ TypeVec<func<type>::result_type, 3>::vec_type op(const type ## 3 & a, const type ## 3 & b) \
|
||||
__device__ __forceinline__ TypeVec<func<type>::result_type, 3>::vec_type op(const type ## 3 & a, const type ## 3 & b) \
|
||||
{ \
|
||||
func<type> f; \
|
||||
return VecTraits<TypeVec<func<type>::result_type, 3>::vec_type>::make(f(a.x, b.x), f(a.y, b.y), f(a.z, b.z)); \
|
||||
} \
|
||||
template <typename T> \
|
||||
static __device__ typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 3>::vec_type op(const type ## 3 & v, T s) \
|
||||
__device__ __forceinline__ typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 3>::vec_type op(const type ## 3 & v, T s) \
|
||||
{ \
|
||||
func<typename detail::BinOpTraits<type, T>::argument_type> f; \
|
||||
return VecTraits<typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 3>::vec_type>::make(f(v.x, s), f(v.y, s), f(v.z, s)); \
|
||||
} \
|
||||
template <typename T> \
|
||||
static __device__ typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 3>::vec_type op(T s, const type ## 3 & v) \
|
||||
__device__ __forceinline__ typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 3>::vec_type op(T s, const type ## 3 & v) \
|
||||
{ \
|
||||
func<typename detail::BinOpTraits<type, T>::argument_type> f; \
|
||||
return VecTraits<typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 3>::vec_type>::make(f(s, v.x), f(s, v.y), f(s, v.z)); \
|
||||
} \
|
||||
static __device__ TypeVec<func<type>::result_type, 4>::vec_type op(const type ## 4 & a, const type ## 4 & b) \
|
||||
__device__ __forceinline__ TypeVec<func<type>::result_type, 4>::vec_type op(const type ## 4 & a, const type ## 4 & b) \
|
||||
{ \
|
||||
func<type> f; \
|
||||
return VecTraits<TypeVec<func<type>::result_type, 4>::vec_type>::make(f(a.x, b.x), f(a.y, b.y), f(a.z, b.z), f(a.w, b.w)); \
|
||||
} \
|
||||
template <typename T> \
|
||||
static __device__ typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 4>::vec_type op(const type ## 4 & v, T s) \
|
||||
__device__ __forceinline__ typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 4>::vec_type op(const type ## 4 & v, T s) \
|
||||
{ \
|
||||
func<typename detail::BinOpTraits<type, T>::argument_type> f; \
|
||||
return VecTraits<typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 4>::vec_type>::make(f(v.x, s), f(v.y, s), f(v.z, s), f(v.w, s)); \
|
||||
} \
|
||||
template <typename T> \
|
||||
static __device__ typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 4>::vec_type op(T s, const type ## 4 & v) \
|
||||
__device__ __forceinline__ typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 4>::vec_type op(T s, const type ## 4 & v) \
|
||||
{ \
|
||||
func<typename detail::BinOpTraits<T, type>::argument_type> f; \
|
||||
return VecTraits<typename TypeVec<typename func<typename detail::BinOpTraits<type, T>::argument_type>::result_type, 4>::vec_type>::make(f(s, v.x), f(s, v.y), f(s, v.z), f(s, v.w)); \
|
||||
|
@ -3642,19 +3642,24 @@ INSTANTIATE_TEST_CASE_P(ImgProc, MatchTemplateBlackSource, testing::Combine(
|
||||
testing::Values((int)CV_TM_CCOEFF_NORMED, (int)CV_TM_CCORR_NORMED)));
|
||||
|
||||
|
||||
struct MatchTemplate_CCOEF_NORMED : testing::TestWithParam< std::tr1::tuple<cv::gpu::DeviceInfo, std::tr1::tuple<const char*, const char*> > >
|
||||
struct MatchTemplate_CCOEF_NORMED : testing::TestWithParam< std::tr1::tuple<cv::gpu::DeviceInfo, std::pair<std::string, std::string> > >
|
||||
{
|
||||
cv::gpu::DeviceInfo devInfo;
|
||||
std::string imageName;
|
||||
std::string patternName;
|
||||
|
||||
cv::Mat image, pattern;
|
||||
|
||||
virtual void SetUp()
|
||||
{
|
||||
devInfo = std::tr1::get<0>(GetParam());
|
||||
imageName = std::tr1::get<1>(GetParam()).first;
|
||||
patternName = std::tr1::get<1>(GetParam()).second;
|
||||
|
||||
image = readImage(std::tr1::get<0>(std::tr1::get<1>(GetParam())));
|
||||
image = readImage(imageName);
|
||||
ASSERT_FALSE(image.empty());
|
||||
|
||||
pattern = readImage(std::tr1::get<1>(std::tr1::get<1>(GetParam())));
|
||||
pattern = readImage(patternName);
|
||||
ASSERT_FALSE(pattern.empty());
|
||||
}
|
||||
};
|
||||
@ -3662,6 +3667,8 @@ struct MatchTemplate_CCOEF_NORMED : testing::TestWithParam< std::tr1::tuple<cv::
|
||||
TEST_P(MatchTemplate_CCOEF_NORMED, Accuracy)
|
||||
{
|
||||
PRINT_PARAM(devInfo);
|
||||
PRINT_PARAM(imageName);
|
||||
PRINT_PARAM(patternName);
|
||||
|
||||
cv::Mat dstGold;
|
||||
cv::matchTemplate(image, pattern, dstGold, CV_TM_CCOEFF_NORMED);
|
||||
@ -3688,8 +3695,8 @@ TEST_P(MatchTemplate_CCOEF_NORMED, Accuracy)
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(ImgProc, MatchTemplate_CCOEF_NORMED, testing::Combine(
|
||||
testing::ValuesIn(devices()),
|
||||
testing::Values(std::tr1::make_tuple("matchtemplate/source-0.png", "matchtemplate/target-0.png"),
|
||||
std::tr1::make_tuple("matchtemplate/source-1.png", "matchtemplate/target-1.png"))));
|
||||
testing::Values(std::make_pair(std::string("matchtemplate/source-0.png"), std::string("matchtemplate/target-0.png")),
|
||||
std::make_pair(std::string("matchtemplate/source-1.png"), std::string("matchtemplate/target-1.png")))));
|
||||
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
|
@ -286,7 +286,7 @@ TEST(BruteForceMatcher)
|
||||
{
|
||||
// Init CPU matcher
|
||||
|
||||
int desc_len = 128;
|
||||
int desc_len = 64;
|
||||
|
||||
BruteForceMatcher< L2<float> > matcher;
|
||||
|
||||
@ -329,7 +329,7 @@ TEST(BruteForceMatcher)
|
||||
GPU_OFF;
|
||||
|
||||
SUBTEST << "radiusMatch";
|
||||
float max_distance = 3.8f;
|
||||
float max_distance = 2.0f;
|
||||
|
||||
CPU_ON;
|
||||
matcher.radiusMatch(query, train, matches, max_distance);
|
||||
|
Loading…
x
Reference in New Issue
Block a user