merged GPU scan
parent 6cca6a4548
commit 2777ebb8a0
@@ -90,6 +90,40 @@ INSTANTIATE_TEST_CASE_P(ImgProc, Resize, testing::Combine(
     Interpolation(cv::INTER_CUBIC), Interpolation(cv::INTER_AREA)),
     testing::Values(Scale(0.5), Scale(0.3), Scale(2.0))));
 
+GPU_PERF_TEST(ResizeArea, cv::gpu::DeviceInfo, cv::Size, MatType, Scale)
+{
+    cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
+    cv::gpu::setDevice(devInfo.deviceID());
+
+    cv::Size size = GET_PARAM(1);
+    int type = GET_PARAM(2);
+    int interpolation = cv::INTER_AREA;
+    double f = GET_PARAM(3);
+
+    cv::Mat src_host(size, type);
+    fill(src_host, 0, 255);
+
+    cv::gpu::GpuMat src(src_host);
+    cv::gpu::GpuMat dst;
+
+    cv::gpu::resize(src, dst, cv::Size(), f, f, interpolation);
+
+    declare.time(1.0);
+
+    TEST_CYCLE()
+    {
+        cv::gpu::resize(src, dst, cv::Size(), f, f, interpolation);
+    }
+}
+
+INSTANTIATE_TEST_CASE_P(ImgProc, ResizeArea, testing::Combine(
+    ALL_DEVICES,
+    testing::Values(perf::sz1080p/*, cv::Size(4096, 2048)*/),
+    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4),
+                    MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4),
+                    MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
+    testing::Values(Scale(0.2),Scale(0.1),Scale(0.05))));
+
 //////////////////////////////////////////////////////////////////////
 // WarpAffine
 
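Note: the new ResizeArea test is the usual GPU_PERF_TEST boilerplate, a warm-up call plus a TEST_CYCLE loop over cv::gpu::resize with INTER_AREA. Outside the perf framework, a minimal standalone sketch of the same measurement would look like the following (sizes, iteration count, and the hand-rolled tick timing are illustrative, not the framework's method):

    #include <cstdio>
    #include <opencv2/opencv.hpp>
    #include <opencv2/gpu/gpu.hpp>

    int main()
    {
        cv::Mat src_host(1080, 1920, CV_8UC4);
        cv::randu(src_host, cv::Scalar::all(0), cv::Scalar::all(255));

        cv::gpu::GpuMat src(src_host), dst;

        // Warm-up call, so the timed loop excludes one-time CUDA setup costs.
        cv::gpu::resize(src, dst, cv::Size(), 0.1, 0.1, cv::INTER_AREA);

        const int iters = 100;
        int64 t0 = cv::getTickCount();
        for (int i = 0; i < iters; ++i)
            cv::gpu::resize(src, dst, cv::Size(), 0.1, 0.1, cv::INTER_AREA);
        double ms = (cv::getTickCount() - t0) * 1000.0 / cv::getTickFrequency() / iters;

        std::printf("INTER_AREA 0.1x on 1080p: %.3f ms/iter\n", ms);
        return 0;
    }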
@@ -72,7 +72,7 @@ namespace cv { namespace gpu { namespace device
 
     struct Mask8U
     {
-        explicit Mask8U(PtrStepb mask): mask(mask) {}
+        explicit Mask8U(PtrStepb mask_): mask(mask_) {}
 
         __device__ __forceinline__ bool operator()(int y, int x) const
         {
@@ -46,7 +46,8 @@
 #include "opencv2/gpu/device/vec_math.hpp"
 #include "opencv2/gpu/device/saturate_cast.hpp"
 #include "opencv2/gpu/device/filters.hpp"
-# include <cfloat>
+#include <cfloat>
+#include <opencv2/gpu/device/scan.hpp>
 
 namespace cv { namespace gpu { namespace device
 {
@@ -228,9 +228,9 @@ namespace cv { namespace gpu { namespace device
         template <typename T>
         static void mergeC2_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream)
        {
-            dim3 blockDim(32, 8);
-            dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));
-            mergeC2_<T><<<gridDim, blockDim, 0, stream>>>(
+            dim3 block(32, 8);
+            dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
+            mergeC2_<T><<<grid, block, 0, stream>>>(
                     src[0].data, src[0].step,
                     src[1].data, src[1].step,
                     dst.rows, dst.cols, dst.data, dst.step);
@@ -244,9 +244,9 @@ namespace cv { namespace gpu { namespace device
         template <typename T>
         static void mergeC3_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream)
        {
-            dim3 blockDim(32, 8);
-            dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));
-            mergeC3_<T><<<gridDim, blockDim, 0, stream>>>(
+            dim3 block(32, 8);
+            dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
+            mergeC3_<T><<<grid, block, 0, stream>>>(
                     src[0].data, src[0].step,
                     src[1].data, src[1].step,
                     src[2].data, src[2].step,
@@ -261,9 +261,9 @@ namespace cv { namespace gpu { namespace device
         template <typename T>
         static void mergeC4_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream)
        {
-            dim3 blockDim(32, 8);
-            dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));
-            mergeC4_<T><<<gridDim, blockDim, 0, stream>>>(
+            dim3 block(32, 8);
+            dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
+            mergeC4_<T><<<grid, block, 0, stream>>>(
                     src[0].data, src[0].step,
                     src[1].data, src[1].step,
                     src[2].data, src[2].step,
@@ -437,9 +437,9 @@ namespace cv { namespace gpu { namespace device
         template <typename T>
         static void splitC2_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream)
        {
-            dim3 blockDim(32, 8);
-            dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));
-            splitC2_<T><<<gridDim, blockDim, 0, stream>>>(
+            dim3 block(32, 8);
+            dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
+            splitC2_<T><<<grid, block, 0, stream>>>(
                     src.data, src.step, src.rows, src.cols,
                     dst[0].data, dst[0].step,
                     dst[1].data, dst[1].step);
@@ -453,9 +453,9 @@ namespace cv { namespace gpu { namespace device
         template <typename T>
         static void splitC3_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream)
        {
-            dim3 blockDim(32, 8);
-            dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));
-            splitC3_<T><<<gridDim, blockDim, 0, stream>>>(
+            dim3 block(32, 8);
+            dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
+            splitC3_<T><<<grid, block, 0, stream>>>(
                     src.data, src.step, src.rows, src.cols,
                     dst[0].data, dst[0].step,
                     dst[1].data, dst[1].step,
@@ -470,9 +470,9 @@ namespace cv { namespace gpu { namespace device
         template <typename T>
         static void splitC4_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream)
        {
-            dim3 blockDim(32, 8);
-            dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));
-            splitC4_<T><<<gridDim, blockDim, 0, stream>>>(
+            dim3 block(32, 8);
+            dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
+            splitC4_<T><<<grid, block, 0, stream>>>(
                     src.data, src.step, src.rows, src.cols,
                     dst[0].data, dst[0].step,
                     dst[1].data, dst[1].step,
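Note: all six merge/split hunks are one mechanical rename. The host-side launch-configuration locals blockDim/gridDim become block/grid, presumably to stop shadowing the CUDA built-in variables of the same names. The sizing idiom itself is unchanged; for reference, divUp is plain ceiling division, sketched here with an assumed free-function signature:

    // Ceiling division: how many blocks of `grain` items cover `total` items.
    int divUp(int total, int grain) { return (total + grain - 1) / grain; }

    // e.g. a 1920x1080 image with 32x8 thread blocks:
    //   divUp(1920, 32) = 60 and divUp(1080, 8) = 135, so a 60x135 grid,
    //   with the kernel bounds-checking the threads that fall past the edges.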
@@ -1121,18 +1121,18 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
             dim3 p_blocks(iDivUp(nw, 32), iDivUp(nh, 8));
             dim3 p_threads(32, 8);
 
-            NcvSize32u srcSize (kLevelWidth, kLevelHeight);
+            NcvSize32u inner_srcSize (kLevelWidth, kLevelHeight);
             NcvSize32u dstSize (nw, nh);
             NcvRect32u srcROI (0, 0, kLevelWidth, kLevelHeight);
             NcvRect32u dstROI (0, 0, nw, nh);
 
-            ncvAssertReturnNcvStat( nppiStResize_32f_C1R (ptrU->ptr(), srcSize, kLevelStride * sizeof (float), srcROI,
+            ncvAssertReturnNcvStat( nppiStResize_32f_C1R (ptrU->ptr(), inner_srcSize, kLevelStride * sizeof (float), srcROI,
                 ptrUNew->ptr(), dstSize, ns * sizeof (float), dstROI, 1.0f/scale_factor, 1.0f/scale_factor, nppStBicubic) );
 
             ScaleVector(ptrUNew->ptr(), ptrUNew->ptr(), 1.0f/scale_factor, ns * nh, stream);
             ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR);
 
-            ncvAssertReturnNcvStat( nppiStResize_32f_C1R (ptrV->ptr(), srcSize, kLevelStride * sizeof (float), srcROI,
+            ncvAssertReturnNcvStat( nppiStResize_32f_C1R (ptrV->ptr(), inner_srcSize, kLevelStride * sizeof (float), srcROI,
                 ptrVNew->ptr(), dstSize, ns * sizeof (float), dstROI, 1.0f/scale_factor, 1.0f/scale_factor, nppStBicubic) );
 
             ScaleVector(ptrVNew->ptr(), ptrVNew->ptr(), 1.0f/scale_factor, ns * nh, stream);
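Note: inner_srcSize merely un-shadows an outer srcSize in NCVBroxOpticalFlow; the logic is untouched. The surrounding code is the pyramid prolongation step: the flow components U and V are spatially resampled to the finer level, then ScaleVector multiplies every element by 1/scale_factor, because a displacement measured in coarse-level pixels spans 1/scale_factor times as many fine-level pixels. As a worked formula (with scale_factor < 1 between levels):

    // u_fine(x, y) = (1 / scale_factor) * u_coarse(x * scale_factor, y * scale_factor)
    //
    // nppiStResize_32f_C1R does the spatial resampling (bicubic here);
    // ScaleVector applies the 1/scale_factor magnitude correction.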
@@ -252,7 +252,7 @@ NCVStatus memSegCopyHelper2D(void *dst, Ncv32u dstPitch, NCVMemoryType dstType,
 //===================================================================
 
 
-NCVMemStackAllocator::NCVMemStackAllocator(Ncv32u alignment)
+NCVMemStackAllocator::NCVMemStackAllocator(Ncv32u alignment_)
     :
     currentSize(0),
     _maxSize(0),
@@ -260,23 +260,23 @@ NCVMemStackAllocator::NCVMemStackAllocator(Ncv32u alignment)
     begin(NULL),
     end(NULL),
     _memType(NCVMemoryTypeNone),
-    _alignment(alignment),
+    _alignment(alignment_),
     bReusesMemory(false)
 {
-    NcvBool bProperAlignment = (alignment & (alignment-1)) == 0;
+    NcvBool bProperAlignment = (alignment_ & (alignment_ - 1)) == 0;
     ncvAssertPrintCheck(bProperAlignment, "NCVMemStackAllocator ctor:: alignment not power of 2");
 }
 
 
-NCVMemStackAllocator::NCVMemStackAllocator(NCVMemoryType memT, size_t capacity, Ncv32u alignment, void *reusePtr)
+NCVMemStackAllocator::NCVMemStackAllocator(NCVMemoryType memT, size_t capacity, Ncv32u alignment_, void *reusePtr)
     :
     currentSize(0),
     _maxSize(0),
     allocBegin(NULL),
     _memType(memT),
-    _alignment(alignment)
+    _alignment(alignment_)
 {
-    NcvBool bProperAlignment = (alignment & (alignment-1)) == 0;
+    NcvBool bProperAlignment = (alignment_ & (alignment_ - 1)) == 0;
     ncvAssertPrintCheck(bProperAlignment, "NCVMemStackAllocator ctor:: _alignment not power of 2");
     ncvAssertPrintCheck(memT != NCVMemoryTypeNone, "NCVMemStackAllocator ctor:: Incorrect allocator type");
 
@@ -425,12 +425,12 @@ size_t NCVMemStackAllocator::maxSize(void) const
 //===================================================================
 
 
-NCVMemNativeAllocator::NCVMemNativeAllocator(NCVMemoryType memT, Ncv32u alignment)
+NCVMemNativeAllocator::NCVMemNativeAllocator(NCVMemoryType memT, Ncv32u alignment_)
     :
     currentSize(0),
     _maxSize(0),
     _memType(memT),
-    _alignment(alignment)
+    _alignment(alignment_)
 {
     ncvAssertPrintReturn(memT != NCVMemoryTypeNone, "NCVMemNativeAllocator ctor:: counting not permitted for this allocator type", );
 }
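Note: the alignment -> alignment_ renames remove shadowing against the _alignment member; the assertion logic is unchanged. Since this check guards every allocator, the bit trick is worth a worked example:

    // x & (x - 1) clears the lowest set bit of x, so the result is zero
    // exactly when x has at most one bit set.
    //   16 & 15 = 0b10000 & 0b01111 = 0        -> power of two
    //   24 & 23 = 0b11000 & 0b10111 = 0b10000  -> not a power of two
    // Note the test also accepts 0; callers must pass a nonzero alignment.
    bool isPow2(unsigned x) { return (x & (x - 1)) == 0; }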
@@ -49,17 +49,17 @@ namespace cv { namespace gpu { namespace device
 {
     struct Emulation
     {
-        static __forceinline__ __device__ int Ballot(int predicate, volatile int* cta_buffer)
-        {
+        static __forceinline__ __device__ int Ballot(int predicate, volatile int* cta_buffer)
+        {
 #if __CUDA_ARCH__ >= 200
-            (void)cta_buffer;
-            return __ballot(predicate);
+            (void)cta_buffer;
+            return __ballot(predicate);
 #else
-            int tid = threadIdx.x;
-            cta_buffer[tid] = predicate ? (1 << (tid & 31)) : 0;
-            return warp_reduce(cta_buffer);
+            int tid = threadIdx.x;
+            cta_buffer[tid] = predicate ? (1 << (tid & 31)) : 0;
+            return warp_reduce(cta_buffer);
 #endif
-        }
+        }
     };
 }}} // namespace cv { namespace gpu { namespace device
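Note: this hunk is essentially a whitespace re-indent of Emulation::Ballot (the old/new pairs above look identical because the extraction dropped indentation), but its contract deserves a restatement. On sm_20+ it forwards to the hardware __ballot intrinsic; below that, each lane publishes its own bit into cta_buffer and warp_reduce combines the disjoint bits into the same per-warp mask. A hedged usage sketch, assuming one shared slot per thread and that warp_reduce operates on each warp's 32 slots (the kernel and names below are illustrative):

    // Launch with 256 threads per block to match cta_buffer.
    __global__ void positiveMask(const float* data, int* masks, int n)
    {
        // One int slot per thread, as the pre-Fermi emulation path expects.
        __shared__ volatile int cta_buffer[256];

        int i = blockIdx.x * blockDim.x + threadIdx.x;
        int pred = (i < n) && (data[i] > 0.f);

        // Bit k of the result is set iff lane k's predicate held.
        int mask = cv::gpu::device::Emulation::Ballot(pred, cta_buffer);

        if ((threadIdx.x & 31) == 0 && i < n)
            masks[i >> 5] = mask;
    }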
@@ -416,6 +416,8 @@ namespace cv { namespace gpu { namespace device
         {
             return src1 * src1 + src2 * src2;
         }
+        __device__ __forceinline__ hypot_sqr_func(const hypot_sqr_func& other) : binary_function<T, T, float>(){}
+        __device__ __forceinline__ hypot_sqr_func() : binary_function<T, T, float>(){}
     };
 
     // Saturate Cast Functor
@@ -438,6 +440,7 @@ namespace cv { namespace gpu { namespace device
         {
             return (src > thresh) * maxVal;
         }
 
+        __device__ __forceinline__ thresh_binary_func(const thresh_binary_func& other)
+            : unary_function<T, T>(), thresh(other.thresh), maxVal(other.maxVal){}
 
@@ -455,6 +458,7 @@ namespace cv { namespace gpu { namespace device
         {
             return (src <= thresh) * maxVal;
         }
 
+        __device__ __forceinline__ thresh_binary_inv_func(const thresh_binary_inv_func& other)
+            : unary_function<T, T>(), thresh(other.thresh), maxVal(other.maxVal){}
 
@@ -523,8 +527,12 @@ namespace cv { namespace gpu { namespace device
             return !pred(x);
         }
 
+        __device__ __forceinline__ unary_negate(const unary_negate& other) : unary_function<typename Predicate::argument_type, bool>(){}
+        __device__ __forceinline__ unary_negate() : unary_function<typename Predicate::argument_type, bool>(){}
+
         const Predicate pred;
     };
 
     template <typename Predicate> __host__ __device__ __forceinline__ unary_negate<Predicate> not1(const Predicate& pred)
     {
         return unary_negate<Predicate>(pred);
@@ -534,13 +542,20 @@ namespace cv { namespace gpu { namespace device
     {
         explicit __host__ __device__ __forceinline__ binary_negate(const Predicate& p) : pred(p) {}
 
-        __device__ __forceinline__ bool operator()(typename TypeTraits<typename Predicate::first_argument_type>::ParameterType x, typename TypeTraits<typename Predicate::second_argument_type>::ParameterType y) const
+        __device__ __forceinline__ bool operator()(typename TypeTraits<typename Predicate::first_argument_type>::ParameterType x,
+                                                   typename TypeTraits<typename Predicate::second_argument_type>::ParameterType y) const
         {
             return !pred(x,y);
         }
+        __device__ __forceinline__ binary_negate(const binary_negate& other)
+            : binary_function<typename Predicate::first_argument_type, typename Predicate::second_argument_type, bool>(){}
+
+        __device__ __forceinline__ binary_negate() :
+            binary_function<typename Predicate::first_argument_type, typename Predicate::second_argument_type, bool>(){}
 
         const Predicate pred;
     };
 
     template <typename BinaryPredicate> __host__ __device__ __forceinline__ binary_negate<BinaryPredicate> not2(const BinaryPredicate& pred)
     {
         return binary_negate<BinaryPredicate>(pred);
@@ -555,9 +570,13 @@ namespace cv { namespace gpu { namespace device
             return op(arg1, a);
         }
 
+        __device__ __forceinline__ binder1st(const binder1st& other) :
+            unary_function<typename Op::second_argument_type, typename Op::result_type>(){}
+
         const Op op;
         const typename Op::first_argument_type arg1;
     };
 
     template <typename Op, typename T> __host__ __device__ __forceinline__ binder1st<Op> bind1st(const Op& op, const T& x)
     {
         return binder1st<Op>(op, typename Op::first_argument_type(x));
@@ -572,16 +591,19 @@ namespace cv { namespace gpu { namespace device
             return op(a, arg2);
         }
 
+        __device__ __forceinline__ binder2nd(const binder2nd& other) :
+            unary_function<typename Op::first_argument_type, typename Op::result_type>(), op(other.op), arg2(other.arg2){}
+
         const Op op;
         const typename Op::second_argument_type arg2;
     };
 
     template <typename Op, typename T> __host__ __device__ __forceinline__ binder2nd<Op> bind2nd(const Op& op, const T& x)
     {
         return binder2nd<Op>(op, typename Op::second_argument_type(x));
     }
 
     // Functor Traits
 
     template <typename F> struct IsUnaryFunction
     {
         typedef char Yes;
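Note: every functional.hpp hunk follows one pattern, giving the functor an explicitly __device__-qualified copy constructor (and, where needed, a default constructor) so that constructing or copying these functors inside kernels does not depend on implicitly generated host constructors. (Several of the added copy constructors ignore `other` and leave members default-initialized; binder2nd and the threshold functors are the ones that actually copy state.) A hedged sketch of the pattern being enabled; the kernel and everything beyond the functional.hpp names visible in this diff are assumptions:

    #include "opencv2/gpu/device/functional.hpp"

    using namespace cv::gpu::device;

    __global__ void countAbove(const float* data, int* result, int n, float t)
    {
        // Copy-initializing the binder in device code is exactly what the
        // new __device__ copy constructors permit.
        binder2nd< greater<float> > isAbove = bind2nd(greater<float>(), t);

        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n && isAbove(data[i]))
            atomicAdd(result, 1);
    }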
modules/gpu/src/opencv2/gpu/device/scan.hpp (new file, 166 lines)
@@ -0,0 +1,166 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef __OPENCV_GPU_SCAN_HPP__
+#define __OPENCV_GPU_SCAN_HPP__
+
+enum ScanKind { EXCLUSIVE = 0, INCLUSIVE = 1 };
+
+template <ScanKind Kind, typename T, typename F> struct WarpScan
+{
+    __device__ __forceinline__ WarpScan() {}
+    __device__ __forceinline__ WarpScan(const WarpScan& other) { (void)other; }
+
+    __device__ __forceinline__ T operator()( volatile T *ptr , const unsigned int idx)
+    {
+        const unsigned int lane = idx & 31;
+        F op;
+
+        if ( lane >=  1) ptr [idx ] = op(ptr [idx -  1], ptr [idx]);
+        if ( lane >=  2) ptr [idx ] = op(ptr [idx -  2], ptr [idx]);
+        if ( lane >=  4) ptr [idx ] = op(ptr [idx -  4], ptr [idx]);
+        if ( lane >=  8) ptr [idx ] = op(ptr [idx -  8], ptr [idx]);
+        if ( lane >= 16) ptr [idx ] = op(ptr [idx - 16], ptr [idx]);
+
+        if( Kind == INCLUSIVE )
+            return ptr [idx];
+        else
+            return (lane > 0) ? ptr [idx - 1] : 0;
+    }
+
+    __device__ __forceinline__ unsigned int index(const unsigned int tid)
+    {
+        return tid;
+    }
+
+    __device__ __forceinline__ void init(volatile T *ptr){}
+
+    static const int warp_offset = 0;
+
+    typedef WarpScan<INCLUSIVE, T, F> merge;
+};
+
+template <ScanKind Kind , typename T, typename F> struct WarpScanNoComp
+{
+    __device__ __forceinline__ WarpScanNoComp() {}
+    __device__ __forceinline__ WarpScanNoComp(const WarpScanNoComp& other) { (void)other; }
+
+    __device__ __forceinline__ T operator()( volatile T *ptr , const unsigned int idx)
+    {
+        const unsigned int lane = threadIdx.x & 31;
+        F op;
+
+        ptr [idx ] = op(ptr [idx -  1], ptr [idx]);
+        ptr [idx ] = op(ptr [idx -  2], ptr [idx]);
+        ptr [idx ] = op(ptr [idx -  4], ptr [idx]);
+        ptr [idx ] = op(ptr [idx -  8], ptr [idx]);
+        ptr [idx ] = op(ptr [idx - 16], ptr [idx]);
+
+        if( Kind == INCLUSIVE )
+            return ptr [idx];
+        else
+            return (lane > 0) ? ptr [idx - 1] : 0;
+    }
+
+    __device__ __forceinline__ unsigned int index(const unsigned int tid)
+    {
+        return (tid >> warp_log) * warp_smem_stride + 16 + (tid & warp_mask);
+    }
+
+    __device__ __forceinline__ void init(volatile T *ptr)
+    {
+        ptr[threadIdx.x] = 0;
+    }
+
+    static const int warp_smem_stride = 32 + 16 + 1;
+    static const int warp_offset      = 16;
+    static const int warp_log         = 5;
+    static const int warp_mask        = 31;
+
+    typedef WarpScanNoComp<INCLUSIVE, T, F> merge;
+};
+
+template <ScanKind Kind , typename T, typename Sc, typename F> struct BlockScan
+{
+    __device__ __forceinline__ BlockScan() {}
+    __device__ __forceinline__ BlockScan(const BlockScan& other) { (void)other; }
+
+    __device__ __forceinline__ T operator()(volatile T *ptr)
+    {
+        const unsigned int tid  = threadIdx.x;
+        const unsigned int lane = tid & warp_mask;
+        const unsigned int warp = tid >> warp_log;
+
+        Sc scan;
+        typename Sc::merge merge_scan;
+        const unsigned int idx = scan.index(tid);
+
+        T val = scan(ptr, idx);
+        __syncthreads ();
+
+        if( warp == 0)
+            scan.init(ptr);
+        __syncthreads ();
+
+        if( lane == 31 )
+            ptr [scan.warp_offset + warp ] = (Kind == INCLUSIVE) ? val : ptr [idx];
+        __syncthreads ();
+
+        if( warp == 0 )
+            merge_scan(ptr, idx);
+        __syncthreads();
+
+        if ( warp > 0)
+            val = ptr [scan.warp_offset + warp - 1] + val;
+        __syncthreads ();
+
+        ptr[idx] = val;
+        __syncthreads ();
+
+        return val ;
+    }
+
+    static const int warp_log  = 5;
+    static const int warp_mask = 31;
+};
+
+#endif
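Note: the new scan.hpp is header-only device code. WarpScan is a 32-lane Hillis-Steele inclusive scan whose `lane >= k` guards keep each step from reading below the warp's segment; WarpScanNoComp removes those guards by padding each warp's shared-memory row with 16 zeroed slots (init()) at a 49-element stride, trading shared memory for fewer predicated instructions; BlockScan runs the per-warp scan, has lane 31 deposit each warp's total into the warp_offset slots, re-scans those totals with the `merge` scan, and adds the resulting prefix back to every thread. As committed, everything lives at global scope (no cv::gpu::device namespace), and note that BlockScan's cross-warp fix-up uses `+` directly, so it effectively assumes a sum scan. A hedged usage sketch for one 256-thread block; the functor and kernel names are illustrative:

    #include "opencv2/gpu/device/scan.hpp"

    struct PlusF
    {
        __device__ float operator()(float a, float b) const { return a + b; }
    };

    // Inclusive prefix sum across one block of 256 threads (8 warps),
    // using the guarded WarpScan, whose index() is the identity.
    __global__ void blockPrefixSum(const float* in, float* out, int n)
    {
        __shared__ float smem[256];

        int gid = blockIdx.x * 256 + threadIdx.x;
        smem[threadIdx.x] = (gid < n) ? in[gid] : 0.f;  // zero-pad the tail
        __syncthreads();

        BlockScan<INCLUSIVE, float, WarpScan<INCLUSIVE, float, PlusF>, PlusF> scan;
        float val = scan(smem);   // __syncthreads() happens inside

        if (gid < n)
            out[gid] = val;
    }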
@@ -67,12 +67,12 @@ namespace cv { namespace gpu { namespace device
         enum { isReference = type_traits_detail::ReferenceTraits<UnqualifiedType>::value };
         enum { isPointer = type_traits_detail::PointerTraits<typename type_traits_detail::ReferenceTraits<UnqualifiedType>::type>::value };
 
-        enum { isUnsignedInt = type_traits_detail::IsUnsignedIntegral<UnqualifiedType>::value };
-        enum { isSignedInt = type_traits_detail::IsSignedIntergral<UnqualifiedType>::value };
-        enum { isIntegral = type_traits_detail::IsIntegral<UnqualifiedType>::value };
-        enum { isFloat = type_traits_detail::IsFloat<UnqualifiedType>::value };
-        enum { isArith = isIntegral || isFloat };
-        enum { isVec = type_traits_detail::IsVec<UnqualifiedType>::value };
+        enum { isUnsignedInt = type_traits_detail::IsUnsignedIntegral<UnqualifiedType>::value };
+        enum { isSignedInt   = type_traits_detail::IsSignedIntergral<UnqualifiedType>::value };
+        enum { isIntegral    = type_traits_detail::IsIntegral<UnqualifiedType>::value };
+        enum { isFloat       = type_traits_detail::IsFloat<UnqualifiedType>::value };
+        enum { isArith       = isIntegral || isFloat };
+        enum { isVec         = type_traits_detail::IsVec<UnqualifiedType>::value };
 
         typedef typename type_traits_detail::Select<IsSimpleParameter<UnqualifiedType>::value,
             T, typename type_traits_detail::AddParameterType<T>::type>::type ParameterType;
@@ -49,8 +49,8 @@
 
 namespace cv { namespace gpu { namespace device
 {
-    #define OPENCV_GPU_LOG_WARP_SIZE (5)
-    #define OPENCV_GPU_WARP_SIZE (1 << OPENCV_GPU_LOG_WARP_SIZE)
+    #define OPENCV_GPU_LOG_WARP_SIZE (5)
+    #define OPENCV_GPU_WARP_SIZE     (1 << OPENCV_GPU_LOG_WARP_SIZE)
     #define OPENCV_GPU_LOG_MEM_BANKS ((__CUDA_ARCH__ >= 200) ? 5 : 4) // 32 banks on fermi, 16 on tesla
     #define OPENCV_GPU_MEM_BANKS (1 << OPENCV_GPU_LOG_MEM_BANKS)
 
@@ -44,7 +44,32 @@
 
 #ifndef HAVE_CUDA
 
-void cv::gpu::resize(const GpuMat&, GpuMat&, Size, double, double, int, Stream&) { throw_nogpu(); }
+void cv::gpu::resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx, double fy, int interpolation, Stream& s)
+{
+    (void)src;
+    (void)dst;
+    (void)dsize;
+    (void)fx;
+    (void)fy;
+    (void)interpolation;
+    (void)s;
+
+    throw_nogpu();
+}
+
+void cv::gpu::resize(const GpuMat& src, GpuMat& dst,GpuMat& buffer, Size dsize,
+                     double fx, double fy, int interpolation, Stream& s)
+{
+    (void)src;
+    (void)dst;
+    (void)dsize;
+    (void)fx;
+    (void)fy;
+    (void)interpolation;
+    (void)buffer;
+    (void)s;
+
+    throw_nogpu();
+}
 
 #else // HAVE_CUDA
 
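Note: the expanded no-CUDA stubs trade anonymous parameters for named ones plus (void) casts. Both forms suppress unused-parameter warnings; the new form keeps the signatures self-documenting in the nogpu build. The idiom in isolation:

    void f(int)   { }             // anonymous parameter: nothing to warn about
    void g(int x) { (void)x; }    // named, explicitly discarded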